pax_global_header00006660000000000000000000000064145110700650014511gustar00rootroot0000000000000052 comment=d75656b168df190bb443308b70edf85ab9b9c54c pg_fact_loader-2.0.1/000077500000000000000000000000001451107006500144425ustar00rootroot00000000000000pg_fact_loader-2.0.1/LICENSE000066400000000000000000000020511451107006500154450ustar00rootroot00000000000000Copyright 2018 Enova International, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
pg_fact_loader-2.0.1/Makefile000066400000000000000000000017071451107006500161070ustar00rootroot00000000000000EXTENSION = pg_fact_loader DATA = pg_fact_loader--1.4.sql pg_fact_loader--1.4--1.5.sql \ pg_fact_loader--1.5.sql pg_fact_loader--1.5--1.6.sql \ pg_fact_loader--1.6.sql pg_fact_loader--1.6--1.7.sql \ pg_fact_loader--1.7.sql pg_fact_loader--1.7--2.0.sql \ pg_fact_loader--2.0.sql REGRESS := 01_create_ext 02_schema 03_audit \ 04_seeds 05_pgl_setup 06_basic_workers \ 07_launch_worker 08_fact_table_deps \ 09_purge 10_delete 11_more_data \ 12_no_proid 13_cutoff_no_dep_on_filter \ 14_null_key 15_source_change_date \ 16_1_2_features 17_1_3_features PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) # Prevent unintentional inheritance of PGSERVICE while running regression suite # with make installcheck. We typically use PGSERVICE in our shell environment but # not for dev. Require instead explicit PGPORT= or PGSERVICE= to do installcheck unexport PGSERVICE pg_fact_loader-2.0.1/README.md000066400000000000000000001157041451107006500157310ustar00rootroot00000000000000# pg_fact_loader Build fact tables with Postgres using replicated tables and a queue [Overview](#overview) - [High Level Description](#high_level) - [Features](#features) - [A Full Example](#full_example) - [Installation](#installation) [Setup and Deployment](#setup) - [Configuration](#config) - [Function Performance Considerations](#performance) - [Deployment](#deployment) - [Backfills](#backfills) [Administration](#admin) - [Manually Executing Jobs](#manual) - [Troubleshooting Errors and Issues](#troubleshoot) [Technical Documentation](#tech) - [Workflow](#workflow) # Overview ## High Level Description This extension is for building fact tables using queues that contain all write events (inserts, updates, and deletes) as the driver. 
By default, we assume that fact tables are built in a logical replica, not an OLTP master, which is why we have logic within the codebase that checks for replication stream delay (but it is possible to run this whole system locally without any deps on logical replication). There are several essential steps to having a working setup where fact tables will be automatically built for you as new data comes in that is queued for processing: 1. Replicate all source tables that will be used in the definition of how a fact table is built 2. Ensure audit star is installed on the OLTP system for all replicated source tables. This change log will be the basis for building the fact tables. (see below for *later* adding these to replication) 3. Create a fact table structure where the data will go 4. Create a Postgres function that takes a single key id field (like customer_id) as an argument, and returns 1 row of the fact table as a result. 5. Figure out which source tables are used in your fact table function. 6. Add the audit star table *structures* that you need to your reporting system. Only create these columns based on the OLTP structure: `$audit_id_field, changed_at, operation, change, primary_key, before_change` 7. Build your configuration that tells `pg_fact_loader` both when and how to use the audit tables to update your fact tables. Leave the configuration disabled (this is the default). 8. Add the audit star tables to replication to start populating audit changes on the reporting system. 9. As soon as possible, backfill the entire fact table by running your Postgres function across every row of the table i.e. where `customers_fact_merge(int)` is your fact function which populates `customers_fact`, and the `customers` table contains the full set of customers, and hence will populate fact table data for every customer: `SELECT customers_fact_merge(customer_id) FROM customers;`. 10. Enable the configuration for your fact table. 11. 
Schedule the fact_loader.worker() function to run to start continuously processing changes ## A Full Example ```sql --Optional - if using this system on a logical replica CREATE EXTENSION pglogical; CREATE EXTENSION pglogical_ticker; --Required CREATE EXTENSION pg_fact_loader; ``` For now, please refer to the test suite in the `./sql` folder contain abundant examples of how this configuration can be setup. ## Installation The functionality of this requires postgres version 9.5+ and a working install of pglogical and pglogical_ticker (or it can be used locally only without pglogical). DEB available on official PGDG repository as postgresql-${PGSQL_VERSION}-pg-fact-loader see installation instruction on https://wiki.postgresql.org/wiki/Apt Or to build from source: ``` make make install make installcheck # run regression suite ``` Assuming you have pglogical and pglogical_ticker, then the extension can be deployed as any postgres extension: ```sql CREATE EXTENSION pg_fact_loader; ``` # Setup and Deployment ## Configuration ### General Configuration Workflow The general workflow of seeding the configuration to drive the refresh of a fact table is as follows: 1. Seed the fact table itself in `fact_tables` 2. Seed any queue tables which have not already been seeded in `queue_tables` 3. Run the function `fact_loader.add_batch_id_fields()` to add `fact_loader_batch_id` to each queue table. **NOTE** that this function sets DEFAULT intentionally upon addition. Users are assumed to only use this function in a scenario when we are not adding a queue table which is massive. In any case this will take a long lock with lots of data. 4. Tie together fact tables and queue tables in `queue_table_deps`, along with functions to execute on each DML event 5. Explain how to get the key field values to pass into functions, which may include joining to other tables, in `key_retrieval_sequences` There are two additional steps if your fact table depends on *other* fact tables: 1. 
Tie together parent-child fact table dependencies in `fact_table_deps`, along with "default" functions to execute on each DML event 2. Specify any unique function execution requirements in `fact_table_dep_queue_table_deps` **You can see a full example of all cases of configs in the regression suite, which lies in the ** `./sql` directory, specifically the `schema` and `seeds` tests. Once things are configured, the following queries can help to get a bird's eye view: Less Detail: ```sql SELECT fact_table_id, fact_table_relid, queue_table_relid, queue_of_base_table_relid, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid, level, return_columns, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key FROM fact_loader.queue_deps_all_with_retrieval WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS ORDER BY queue_table_relid::TEXT, queue_table_dep_id, level; ``` More Detail: ```sql SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS ORDER BY queue_table_relid::TEXT, queue_table_dep_id, level; ``` **Note that each run of a job is logged in fact_loader_refresh_logs, which are pruned after 90 days. **Note that if using temp tables, it is recommended that you use ON COMMIT DROP option, even though the worker itself drops TEMP schema after each run. ### Configuring a Daily Scheduled Job Although it is assumed most tables are driven by queues, which is encouraged, we provide the ability to run a daily scheduled script instead. This is much, much simpler to configure, but that is because you lose many of the enormous performance benefits of a queue-based table. 
You simply must configure `fact_tables` ONLY, including the provided fields for `daily_schedule`: - `use_daily_schedule` - must be marked `true` - `daily_scheduled_time` - the time of day *after which* to run the job (the system will attempt to run until midnight) - `daily_scheduled_tz` - the timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date. - `daily_scheduled_proid` - the function to execute. Currently it takes no arguments. It is assumed to contain all the logic necessary to add any new daily entries. See the unit tests in `sql/16_1_2_features.sql` for an example. We support a simple set of chained jobs here as well. That is, the first job is scheduled, and another job can kick off after the first one finishes, and so on (chains of dependencies are supported). The fields relevant are: - `use_daily_schedule` - must be marked `true` for dependent jobs - `depends_on_base_daily_job_id` - **first** job in chain which is actually the only one with a scheduled time - `depends_on_parent_daily_job_id` - Immediate parent which must complete before this job will run Note that if a scheduled job fails and you re-enable it, it will try to run it again if it is still within the proper time range and has not yet succeeded the same day. ### Detailed Configuration Explanations (Generated from table/column comments) There are a number of config tables that drive pg_fact_loader loads: `fact_tables`: Each fact table to be built via pg_fact_loader, which also drives the worker. These are also referred to as "jobs". - `fact_table_id`: Unique identifier for the fact table or job - also referred to as job_id - `fact_table_relid`: The oid of the fact table itself regclass type to accept only valid relations. - `fact_table_agg_proid`: NOT REQUIRED. The aggregate function definition for the fact table. 
This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now. - `enabled`: Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE. - `priority`: Determines the order in which the job runs (in combination with other sorting factors) - `force_worker_priority`: If marked TRUE, this fact table will be prioritized in execution order above all other factors. - `last_refresh_source_cutoff`: The data cutoff time of the last refresh - only records older than this have been updated. - `last_refresh_attempted_at`: The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority. - `last_refresh_succeeded`: Whether or not the last run of the job succeeded. NULL if it has never been run. - `row_created_at`: Timestamp of when this row was first created. - `row_updated_at`: Timestamp of when this row was last updated (this is updated via trigger). - `use_daily_schedule`: If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs. - `daily_scheduled_time`: The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in. - `daily_scheduled_tz`: The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date. - `daily_scheduled_proid`: The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples. 
- `depends_on_base_daily_job_id`: For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time. - `depends_on_parent_daily_job_id`: For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run. - `daily_scheduled_deps`: OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it. - `daily_scheduled_dep_delay_tolerance`: OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it. - `pre_execute_hook_sql`: OPTIONAL - custom sql to execute within the `load.sql` function, after the `process_queue` has been loaded, but prior to the actual load of the fact table using the `process_queue`. This feature was originally written due to the need to index the process_queue in certain unique circumstances, prior to actual execution over the `process_queue`. 
`queue_tables`: Each queue table along with the base table to which it belongs. - `queue_table_id`: Unique identifier for queue tables. - `queue_table_relid`: The oid of the queue table itself regclass type to accept only valid relations. - `queue_of_base_table_relid`: The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations. - `pglogical_node_if_id`: Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table. - `queue_table_tz`: **NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ. If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz. - `row_created_at`: Timestamp of when this row was first created. - `row_updated_at`: Timestamp of when this row was last updated (this is updated via trigger). - `purge`: Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table. `queue_table_deps`: Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. **NOTE** that anything that exists in queue_table_dep is assumed to be require its queue data not to be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. 
The regression suite in ./sql and ./expected has abundant examples of different configurations. - `queue_table_dep_id`: Unique identifier. - `fact_table_id`: Fact table to tie together with a queue table it depends on. - `queue_table_id`: Queue table to tie together with a fact table that needs its changes. - `relevant_change_columns`: Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data. - `last_cutoff_id`: The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered. - `last_cutoff_source_time`: The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. - `insert_merge_proid`: Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events. - `update_merge_proid`: Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. 
NULL to ignore update events. - `delete_merge_proid`: Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore delete events. - `row_created_at`: Timestamp of when this row was first created. - `row_updated_at`: Timestamp of when this row was last updated (this is updated via trigger). `key_retrieval_sequences`: How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations. - `key_retrieval_sequence_id`: Unique identifier. - `queue_table_dep_id`: Which fact table - queue table record this is for (queue_table_deps) - `filter_scope`: NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this. - `level`: Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. 
It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on. - `return_columns`: What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level. - `is_fact_key`: Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. Otherwise, FALSE if you need to join to get more information. - `join_to_relation`: Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join. - `join_to_column`: Join to this column of join_to_relation. - `return_columns_from_join`: Return these columns from join_to_relation. - `join_return_is_fact_key`: If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key. - `pass_queue_table_change_date_at_tz`: If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. `fact_table_deps`: For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here. - `fact_table_dep_id`: Unique identifier. - `parent_id`: The parent fact_table_id that the child depends on. - `child_id`: The child fact_table_id that will run only after the parent is updated. 
- `default_insert_merge_proid`: Default function to use for insert events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples. - `default_update_merge_proid`: Default function to use for update events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples. - `default_delete_merge_proid`: Default function to use for delete events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples. - `row_created_at`: Timestamp of when this row was first created. - `row_updated_at`: Timestamp of when this row was last updated (this is updated via trigger). `fact_table_dep_queue_table_deps`: Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables that depend on other fact table changes. Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table. Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps. The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples of this. - `fact_table_dep_queue_table_dep_id`: Unique identifier - `fact_table_dep_id`: fact_table_dep for this specific dependency. - `queue_table_dep_id`: Inherited queue_table_dep that this dependent fact table depends on. 
- `last_cutoff_id`: This is unique and maintained separately from last_cutoff_id in queue_table_deps, as it refers to the last_cutoff_id for this dependent fact table. It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered. - `last_cutoff_source_time`: This is unique and maintained separately from last_cutoff_source_time in queue_table_deps, as it refers to the last_cutoff_source_time for this dependent fact table. It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. It will also never go past its parent(s) in time. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. - `insert_merge_proid`: Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this. - `update_merge_proid`: Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required. 
This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this. - `delete_merge_proid`: Initially populated by default_delete_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on DELETE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this. - `row_created_at`: Timestamp of when this row was first created. - `row_updated_at`: Timestamp of when this row was last updated (this is updated via trigger). `process_queue`: Populated from gathering all unique queued changes for each fact table, then used to update fact tables. Redundant changes are already aggregated out before insert to this table. This table will never have data at the end of a transaction so can be unlogged. - `process_queue_id`: Unique identifier for each row to be executed. The order of this field is essential as it is used to execute the proids over the keys in the exact order of changes from queue tables. - `fact_table_id`: Identifies which fact table these records belong to. Strictly speaking this may not even be required because no other transaction is ever visible to the pg_fact_loader session loading this table, so you could only ever see your own records. But it is sensible to keep this unique separation of data in the same table. 
- `proid`: The function proid that will be executed with key_value passed as the argument, and (if applicable), source_change_date as the second argument. See fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz for more information on source_change_date. - `key_value`: The key that changed which will be passed to the function proid in order to update the fact table. This is text data type because this key could be essentially any data type. Casting is handled in the logic of execute_queue() which executes from this table. - `row_created_at`: Timestamp of when this row was first created. - `row_updated_at`: Timestamp of when this row was last updated (this is updated via trigger). - `source_change_date`: Only used with key_retrieval_sequences.pass_queue_table_change_date_at_tz. Will be populated by the changed_at timestamp of a queue table cast to date according to that configuration if it exists. `debug_process_queue`: A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG. `queue_deps_all`: A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies. `queue_deps_all_with_retrieval`: The master view which builds on queue_deps_all to include key_retrieval_sequences. This is the main view used by sql_builder(int) to gather all queued changes. `fact_table_refresh_logs`: Used to log both job run times and exceptions. - `fact_table_refresh_log_id`: Unique identifier, - `fact_table_id`: Fact table that created the log. - `refresh_attempted_at`: The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures). - `messages`: Only for failures - Error message content in JSON format - including message, message detail, context, and hint. 
- `refresh_finished_at`: The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time. `unresolved_failures`: Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring. **NOTE** - to generate this markdown from the database, use: ```sql SELECT CASE WHEN d.objsubid = 0 THEN format(E'\n`%s`: %s', c.relname, description) ELSE format(' - `%s`: %s', a.attname, d.description) END AS markdown FROM pg_description d INNER JOIN pg_class c ON c.oid = d.objoid AND d.classoid = (SELECT oid FROM pg_class WHERE relname = 'pg_class') INNER JOIN pg_namespace n ON n.oid = c.relnamespace LEFT JOIN pg_attribute a ON a.attrelid = c.oid AND a.attnum = d.objsubid WHERE n.nspname = 'fact_loader' ORDER BY CASE WHEN c.relname = 'fact_tables' THEN 1 WHEN c.relname = 'queue_tables' THEN 2 WHEN c.relname = 'queue_table_deps' THEN 3 WHEN c.relname = 'key_retrieval_sequences' THEN 4 WHEN c.relname = 'fact_table_deps' THEN 5 WHEN c.relname = 'fact_table_dep_queue_table_deps' THEN 6 WHEN c.relname = 'process_queue' THEN 7 WHEN c.relname = 'debug_process_queue' THEN 8 WHEN c.relname = 'queue_deps_all' THEN 9 WHEN c.relname = 'queue_deps_all_with_retrieval' THEN 10 WHEN c.relname = 'fact_table_refresh_logs' THEN 11 WHEN c.relname = 'unresolved_failures' THEN 12 END, d.objsubid; ``` ## Function Performance Considerations You will notice the configuration APIs expect you to provide functions for `INSERT`, `UPDATE`, and `DELETE` events which will execute on a PER-ID basis. There are several reasons for this and if you consider performacne up front, you will have a very optimized fact_loader system. - Having a single id-based function allows for query plan caching if the functions are written in `plpgsql`. That means even a complex function with 20 joins may take 5 seconds to plan, but execute in 5ms. 
After several executions, the plan will be cached and you will effectively have a consistent 5ms execution time for every execution of your function. - It allows for simplicity of design and optimization. It is very easy to see holes in your query plan and missing indexes in a query that is executing an aggregation on a single id. In general, you should try to target a **sub-10ms execution time** for your functions. Such a demand may be relaxed for keys that are much more infrequently updated, or made more stringent for extremely high frequency of changes. ## Backfilling and Enabling a Fact Table Job Once you enable your fact table to be maintained by pg_fact_loader, all changes moving forward will be maintained by the system according to your configuration and the new queue data coming in. However, you will most likely need to initially populate the fact table as a starting point. Here is the typical process then to enable a job, once your configuration is in place: 1. Ensure the fact_loader job is disabled (this is the default) 2. Truncate the fact table 3. Backfill in batches by running your configured `merge` function over the entire set of data. For example: `SELECT customers_fact_merge(customer_id) FROM customers;` 4. Enable the fact_loader job. 5. Run worker function in whatever scheduled way desired (i.e. crontab). If you need to at any point in the future do another backfill on the table, this is the same set of steps to follow. **However**, it will be better in production to not `TRUNCATE` the fact table, but rather to use small batches to refresh the whole table while still allowing concurrent access. This will also avoid overloading any replication stream going out of your system. To **enable** a fact_table in the `fact_tables` for it to be considered by the worker for refresh, simply run an update, i.e. 
```sql UPDATE fact_loader.fact_tables SET enabled = TRUE WHERE fact_table_relid = 'test_fact.customers_fact'; ``` Concurrency is handled by locking fact_tables rows for update, which can be seen in the wrapping `worker()` function. Adding more workers means you will have smaller deltas, and more up to date fact tables. For example you can schedule 5 calls to `worker()` to kick off from cron every minute. # Administration ## Manually Executing Jobs If for some reason you need to manually execute a job in a concurrency-safe way that is integrated into `pg_fact_loader`, you can run this function: ```sql SELECT fact_loader.try_load(fact_table_id); ``` The function will return `TRUE` if it ran successfully. It will return `FALSE` either if the job errored out (see below [Troubleshooting](#troubleshoot)), or if the job is already being run and has a lock on it. ## Troubleshooting Errors and Issues If a job fails, it will be automatically disabled, and you can view the errors by running: ```sql SELECT * FROM fact_loader.unresolved_failures ``` The server logs may also have more details. By default, only `DEBUG` level messages are printed to the server logs with the SQL generated by `sql_builder`. This can be very useful for debugging should a question arise. You can peek at queued changes for any fact table by querying `SELECT * FROM gathered_queued_changes(fact_table_id);`. This can also be used for data quality checks - you can verify that all records for a fact table match expected output of your function definitions by comparing while excluding any gathered_queued_changes with this function which should not be compared. Furthermore, be aware that enabling `DEBUG` level logging will add process queue records to the unlogged table `debug_process_queue` to allow peeking at changes that are incoming. You can also do something similar to this by viewing the `gathered_queued_changes` function. 
For even further debugging, try running the sql generated by the `fact_loader.sql_builder(fact_table_id)` function by hand, passing it the id of the failed job. You can attempt to run this manually in a transaction. If that still does not give you enough information, you can attempt to run `fact_loader.execute_queue(fact_table_id)`, again still in a transaction that you may roll back. Once you have fixed whatever issues a job may have, you will need to re-enable it to get it running again. # Technical Documentation ## New Releases There are some helper scripts to assist in adding a new version of pg_fact_loader, mainly `pg_fact_loader-sql-maker.sh`. 1. To add a new version, open this file, change the `last_version` and `new_version` to the correct new values. 2. Remove everything after `create_update_file_with_header` in the script. The next few lines are custom files that were changed with a particular release, which are added to the new version's SQL script. Whatever functions or views you modify, or if you have a schema change in the `schema/` directory, you will want to add these files using the provided function, i.e. `add_file views/prioritized_jobs.sql $update_file` will add the SQL for views/prioritized_jobs.sql to the new extension script. You only need to add files that you modify with a release. 3. When all is prepared, run the script. It should create new files for you for the new extension version, including an update script from the previous version to the new version. 4. Update the Makefile to include these new SQL files. 5. Update the first script in both `sql/` and `expected/` directories, which refer to the most recent version as a default. Update it to the new version. 6. Update the pg_fact_loader.control file with the latest version. To test your extension for all postgres versions, including testing extension upgrade paths, see and run the script `test_all_versions.sh`. 
## Workflow The function `fact_loader.worker()` drives everything in the fact table loads. It selects the next fact table based on several conditions, puts a lock on it, then goes to refresh the fact table. Here is the basic workflow and explanation of involved functions: **High Level:** `fact_loader.worker()` (chooses which fact table to work on, locks it and proceeds with load) For a single fact_table_id, the following is the workflow: - `fact_loader.load(fact_table_id)` - `fact_loader.sql_builder(p_fact_table_id)` returns `insert_to_process_queue_sql` and `metadata_update_sql` - The SQL to load the process_queue (`insert_to_process_queue_sql`) is executed. If it is NULL, `SELECT 'No queue data' AS result` is executed instead. - `fact_loader.execute_queue(p_fact_table_id)` builds SQL `v_execute_sql` which executes the load across the process_queue in the correct order, again based on `(insert|update|delete)_merge_proid` in configs - Execute `metadata_update_sql` to update `last_cutoff_id` and `last_cutoff_source_time` for all relevant queues - `fact_loader.purge_queues();` - purge any queue data no longer needed across all configs, whether disabled or enabled **Detail Level:** `fact_loader.worker()` (chooses which fact table to work on, locks it and proceeds with load) For a single fact_table_id, the following is the workflow: - `fact_loader.load(fact_table_id)` - `fact_loader.sql_builder(p_fact_table_id)` returns `insert_to_process_queue_sql` and `metadata_update_sql` - Retrieve all configuration information from `fact_loader.queue_deps_all_with_retrieval` for `I`, `U`, and `D` events - Recursively build the SQL to join from all queue tables in configuration to base tables (The recursive portion ONLY applies to cases requiring more than one `level` of joins in `key_retrieval_sequences`) - The `DELETE` case is unique in that joins have to factor in the likely possibility that the data no longer exists to join to, and thus one may have to join instead to 
the audit tables. - We `UNION ALL` together every event - We also `UNION ALL` together every query generated for every queue table of a given fact table - The SQL to load the process_queue (`insert_to_process_queue_sql`) is executed. If it is NULL, `SELECT 'No queue data' AS result` is executed instead. - `fact_loader.execute_queue(p_fact_table_id)` builds SQL `v_execute_sql` which executes the load across the process_queue in the correct order, again based on `(insert|update|delete)_merge_proid` in configs - Execute `metadata_update_sql` to update `last_cutoff_id` and `last_cutoff_source_time` for all relevant queues - `fact_loader.purge_queues();` - purge any queue data no longer needed across all configs, whether disabled or enabled pg_fact_loader-2.0.1/audit.sh000077500000000000000000000113021451107006500161040ustar00rootroot00000000000000#!/usr/bin/env bash set -eu #TO BUILD THE REST OF THE AUDIT SQL: # #SELECT format('./audit.sh %s %s %s >> sql/03_audit.sql', schemaname, relname, pkey) AS script #FROM pg_stat_user_tables st #INNER JOIN LATERAL # (SELECT a.attname AS pkey # FROM (SELECT # i.indrelid # , unnest(indkey) AS ik # , row_number() # OVER () AS rn # FROM pg_index i # WHERE i.indrelid = st.relid AND i.indisprimary) pk # INNER JOIN pg_attribute a # ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aft ON TRUE #WHERE st.schemaname = 'test'; # #./audit.sh test customers customer_id >> sql/03_audit.sql #./audit.sh test orders order_id >> sql/03_audit.sql #./audit.sh test emails email_id >> sql/03_audit.sql #./audit.sh test email_promos email_promo_id >> sql/03_audit.sql #./audit.sh test products product_id >> sql/03_audit.sql #./audit.sh test order_products order_product_id >> sql/03_audit.sql #./audit.sh test reorders reorder_id >> sql/03_audit.sql sql() { schema=$1 audit_schema="${schema}_audit_raw" 
table=$2 primary_key=$3 sequence_name="${table}_audit_${table}_audit_id_seq" client_query="NULL" json_type="jsonb" cat << EOM CREATE SCHEMA IF NOT EXISTS ${audit_schema}; CREATE TABLE ${audit_schema}.${table}_audit ( ${table}_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "${schema}_audit_raw"."audit_${schema}_${table}"() RETURNS TRIGGER AS \$\$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('${audit_schema}.${sequence_name}') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? TG_ARGV[0] THEN INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_${json_type}(value_row), hstore_to_${json_type}(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_${json_type}(value_row), hstore_to_${json_type}(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? TG_ARGV[0] THEN INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_${json_type}(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_${json_type}(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "${schema}_audit_raw"."${table}_audit"("${table}_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; \$\$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON ${schema}.$table FOR EACH ROW EXECUTE PROCEDURE "${schema}_audit_raw"."audit_${schema}_${table}" ('${primary_key}'); EOM } sql $1 $2 $3 pg_fact_loader-2.0.1/audit_all.sh000077500000000000000000000010701451107006500167350ustar00rootroot00000000000000#!/usr/bin/env bash set -eu ./audit.sh test customers customer_id >> sql/03_audit.sql ./audit.sh test email_promos email_promo_id >> sql/03_audit.sql ./audit.sh test emails email_id >> sql/03_audit.sql ./audit.sh test 
order_product_promos order_product_promo_id >> sql/03_audit.sql ./audit.sh test order_products order_product_id >> sql/03_audit.sql ./audit.sh test orders order_id >> sql/03_audit.sql ./audit.sh test products product_id >> sql/03_audit.sql ./audit.sh test promos promo_id >> sql/03_audit.sql ./audit.sh test reorders reorder_id >> sql/03_audit.sql pg_fact_loader-2.0.1/comments/000077500000000000000000000000001451107006500162675ustar00rootroot00000000000000pg_fact_loader-2.0.1/comments/debug_process_queue.sql000066400000000000000000000002441451107006500230400ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.debug_process_queue IS 'A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG.'; pg_fact_loader-2.0.1/comments/fact_table_dep_queue_table_deps.sql000066400000000000000000000113051451107006500253120ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.fact_table_dep_queue_table_deps IS $$Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables that depend on other fact table changes. Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table. Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps. The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples of this. 
$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_queue_table_dep_id IS 'Unique identifier'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_id IS 'fact_table_dep for this specific dependency.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.queue_table_dep_id IS 'Inherited queue_table_dep that this dependent fact table depends on.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_id IS $$This is unique and maintained separately from last_cutoff_id in queue_table_deps, as it refers to the last_cutoff_id for this dependent fact table. It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_source_time IS $$This is unique and maintained separately from last_cutoff_source_time in queue_table_deps, as it refers to the last_cutoff_source_time for this dependent fact table. It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. It will also never go past its parent(s) in time. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.insert_merge_proid IS $$Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required. 
This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.update_merge_proid IS $$Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.delete_merge_proid IS $$Initially populated by default_delete_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on DELETE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. 
See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; pg_fact_loader-2.0.1/comments/fact_table_deps.sql000066400000000000000000000034071451107006500221130ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.fact_table_deps IS 'For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here.'; COMMENT ON COLUMN fact_loader.fact_table_deps.fact_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.fact_table_deps.parent_id IS 'The parent fact_table_id that the child depends on.'; COMMENT ON COLUMN fact_loader.fact_table_deps.child_id IS 'The child fact_table_id that will run only after the parent is updated.'; COMMENT ON COLUMN fact_loader.fact_table_deps.default_insert_merge_proid IS $$Default function to use for insert events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_update_merge_proid IS $$Default function to use for update events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_delete_merge_proid IS $$Default function to use for delete events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. 
See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; pg_fact_loader-2.0.1/comments/fact_table_refresh_logs.sql000066400000000000000000000016411451107006500236400ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.fact_table_refresh_logs IS 'Used to log both job run times and exceptions.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_refresh_log_id IS 'Unique identifier,'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_id IS 'Fact table that created the log.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_attempted_at IS 'The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures).'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_finished_at IS 'The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.messages IS 'Only for failures - Error message content in JSON format - including message, message detail, context, and hint.'; pg_fact_loader-2.0.1/comments/fact_tables.sql000066400000000000000000000114261451107006500212630ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.fact_tables IS 'Each fact table to be built via pg_fact_loader, which also drives the worker. 
These are also referred to as "jobs".'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_id IS 'Unique identifier for the fact table or job - also referred to as job_id'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_relid IS 'The oid of the fact table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_agg_proid IS $$NOT REQUIRED. The aggregate function definition for the fact table. This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now.$$; COMMENT ON COLUMN fact_loader.fact_tables.enabled IS 'Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE.'; COMMENT ON COLUMN fact_loader.fact_tables.priority IS 'Determines the order in which the job runs (in combination with other sorting factors)'; COMMENT ON COLUMN fact_loader.fact_tables.force_worker_priority IS 'If marked TRUE, this fact table will be prioritized in execution order above all other factors.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_source_cutoff IS 'The data cutoff time of the last refresh - only records older than this have been updated.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_attempted_at IS 'The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_succeeded IS 'Whether or not the last run of the job succeeded. 
NULL if it has never been run.'; COMMENT ON COLUMN fact_loader.fact_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.fact_tables.use_daily_schedule IS 'If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_time IS 'The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_tz IS 'The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_proid IS $$The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples.$$; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_base_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time.'; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_parent_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_deps IS 'OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. 
This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_dep_delay_tolerance IS 'OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; pg_fact_loader-2.0.1/comments/key_retrieval_sequences.sql000066400000000000000000000071411451107006500237330ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.key_retrieval_sequences IS $$How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. 
NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.key_retrieval_sequence_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.queue_table_dep_id IS 'Which fact table - queue table record this is for (queue_table_deps)'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.filter_scope IS $$NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.level IS $$Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns IS $$What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.is_fact_key IS 'Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. 
Otherwise, FALSE if you need to join to get more information.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_relation IS 'Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_column IS 'Join to this column of join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns_from_join IS 'Return these columns from join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_return_is_fact_key IS 'If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; pg_fact_loader-2.0.1/comments/queue_deps_all.sql000066400000000000000000000002531451107006500217770ustar00rootroot00000000000000COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; pg_fact_loader-2.0.1/comments/queue_deps_all_with_retrieval.sql000066400000000000000000000003411451107006500251050ustar00rootroot00000000000000COMMENT ON VIEW fact_loader.queue_deps_all_with_retrieval IS 'The master view which builds on queue_deps_all to include key_retrieval_sequences. 
This is the main view used by sql_builder(int) to gather all queued changes.'; pg_fact_loader-2.0.1/comments/queue_table_deps.sql000066400000000000000000000070621451107006500223230ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.queue_table_deps IS $$Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. **NOTE** that anything that exists in queue_table_dep is assumed to be require its queue data not to be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.queue_table_deps.fact_table_id IS 'Fact table to tie together with a queue table it depends on.'; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_id IS 'Queue table to tie together with a fact table that needs its changes.'; COMMENT ON COLUMN fact_loader.queue_table_deps.relevant_change_columns IS $$Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_id IS $$The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. 
The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_source_time IS $$The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.queue_table_deps.insert_merge_proid IS $$Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.update_merge_proid IS $$Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore update events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.delete_merge_proid IS $$Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. 
NULL to ignore delete events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; pg_fact_loader-2.0.1/comments/queue_tables.sql000066400000000000000000000034621451107006500214730ustar00rootroot00000000000000COMMENT ON TABLE fact_loader.queue_tables IS 'Each queue table along with the base table to which it belongs.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_id IS 'Unique identifier for queue tables.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_relid IS 'The oid of the queue table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_of_base_table_relid IS 'The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table.$$; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_tz IS $$**NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ. 
If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz.$$; COMMENT ON COLUMN fact_loader.queue_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.queue_tables.purge IS 'Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table.'; pg_fact_loader-2.0.1/comments/unresolved_failures.sql000066400000000000000000000003061451107006500230670ustar00rootroot00000000000000COMMENT ON VIEW fact_loader.unresolved_failures IS 'Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring.'; pg_fact_loader-2.0.1/debian/000077500000000000000000000000001451107006500156645ustar00rootroot00000000000000pg_fact_loader-2.0.1/debian/README.md000066400000000000000000000010031451107006500171350ustar00rootroot00000000000000# Debian/Ubuntu packaging This directory contains the Debian control section for pg_fact_loader packages. ## How to Use This 1. Edit the `debian/changelog` file. 2. Run the following command in the top level source directory to build all source and binary packages. ``` debuild -us -uc ``` ## New major version of PostgreSQL? Install the appropriate development packages. The debian/control file needs to be updated. 
Use the following command in the top level source directory: ``` pg_buildext updatecontrol ``` pg_fact_loader-2.0.1/debian/changelog000066400000000000000000000041071451107006500175400ustar00rootroot00000000000000pg-fact-loader (2.0.1-1) unstable; urgency=medium * Bug fix: Remove C functions from 2.0 extension SQL script -- Jeremy Finzel Mon, 09 Oct 2023 16:19:49 -0500 pg-fact-loader (2.0.0-1) unstable; urgency=medium * Add support for native logical replication * Remove support for background worker -- Jeremy Finzel Wed, 12 Jul 2023 14:59:48 -0500 pg-fact-loader (1.7.0-3) unstable; urgency=medium * Disable unstable parts of test 17. (Closes: #1023226) -- Christoph Berg Tue, 01 Nov 2022 11:01:38 +0100 pg-fact-loader (1.7.0-2) unstable; urgency=medium * Upload for PostgreSQL 15. * Convert to dh --with pgxs. * R³: no. * debian/tests: Use 'make' instead of postgresql-server-dev-all. -- Christoph Berg Mon, 24 Oct 2022 20:30:11 +0200 pg-fact-loader (1.7.0-1) unstable; urgency=medium * Add pre-execute hook -- Jeremy Finzel Tue, 23 Aug 2022 15:21:07 -0500 pg-fact-loader (1.6.0-2) UNRELEASED; urgency=medium * Trim trailing whitespace. * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository, Repository-Browse. -- Debian Janitor Sat, 05 Feb 2022 09:00:36 -0000 pg-fact-loader (1.6.0-1) unstable; urgency=medium * Change regproc to text to allow pgupgrade * Ensure pg12 readiness -- Jeremy Finzel Mon, 31 Aug 2020 10:23:10 -0500 pg-fact-loader (1.5.2-1) unstable; urgency=medium * Fix time zone bug and behavior of SIGTERM -- Jeremy Finzel Thu, 29 Nov 2018 10:36:36 -0600 pg-fact-loader (1.5.1-2) UNRELEASED; urgency=medium * Test-Depend on postgresql-contrib-PGVERSION. 
-- Christoph Berg Wed, 28 Nov 2018 15:06:02 +0100 pg-fact-loader (1.5.1-1) unstable; urgency=medium * Fixes for old version tests -- Jeremy Finzel Tue, 27 Nov 2018 09:23:57 -0600 pg-fact-loader (1.5.0-1) unstable; urgency=medium * Initial public version of pg_fact_loader -- Jeremy Finzel Fri, 09 Nov 2018 11:54:11 -0600 pg_fact_loader-2.0.1/debian/control000066400000000000000000000034701451107006500172730ustar00rootroot00000000000000Source: pg-fact-loader Section: database Priority: optional Maintainer: Jeremy Finzel Build-Depends: debhelper-compat (= 12), libpq-dev, postgresql-common, postgresql-server-dev-all Standards-Version: 4.6.1 Rules-Requires-Root: no Homepage: https://github.com/enova/pg_fact_loader Vcs-Git: https://github.com/enova/pg_fact_loader.git Package: postgresql-10-pg-fact-loader Architecture: any Depends: postgresql-10, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL 10. Package: postgresql-11-pg-fact-loader Architecture: any Depends: postgresql-11, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL 11. Package: postgresql-12-pg-fact-loader Architecture: any Depends: postgresql-12, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL 12. Package: postgresql-13-pg-fact-loader Architecture: any Depends: postgresql-13, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL 13. 
Package: postgresql-14-pg-fact-loader Architecture: any Depends: postgresql-14, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL 14. Package: postgresql-15-pg-fact-loader Architecture: any Depends: postgresql-15, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL 15. pg_fact_loader-2.0.1/debian/control.in000066400000000000000000000011601451107006500176720ustar00rootroot00000000000000Source: pg-fact-loader Section: database Priority: optional Maintainer: Jeremy Finzel Build-Depends: debhelper-compat (= 12), libpq-dev, postgresql-common, postgresql-server-dev-all Standards-Version: 4.6.1 Rules-Requires-Root: no Homepage: https://github.com/enova/pg_fact_loader Vcs-Git: https://github.com/enova/pg_fact_loader.git Package: postgresql-PGVERSION-pg-fact-loader Architecture: any Depends: postgresql-PGVERSION, ${shlibs:Depends}, ${misc:Depends} Description: Build fact tables asynchronously with Postgres Use queue tables to build fact tables asynchronously for PostgreSQL PGVERSION. pg_fact_loader-2.0.1/debian/copyright000066400000000000000000000042021451107006500176150ustar00rootroot00000000000000Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: pg-fact-loader Source: https://github.com/enova/pg_fact_loader Files: * Copyright: 2018 Enova International, Inc. 
2018 Jeremy Finzel License: MIT Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: . The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. . THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Files: debian/* Copyright: 2018 Jeremy Finzel License: GPL-2+ This package is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. . This package is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. . You should have received a copy of the GNU General Public License along with this program. If not, see . On Debian systems, the complete text of the GNU General Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". 
pg_fact_loader-2.0.1/debian/docs000066400000000000000000000000121451107006500165300ustar00rootroot00000000000000README.md pg_fact_loader-2.0.1/debian/pgversions000066400000000000000000000000041451107006500200000ustar00rootroot0000000000000010+ pg_fact_loader-2.0.1/debian/rules000077500000000000000000000002321451107006500167410ustar00rootroot00000000000000#!/usr/bin/make -f override_dh_installdocs: dh_installdocs --all README.* # defer testing to autopkgtest override_dh_pgxs_test: %: dh $@ --with pgxs pg_fact_loader-2.0.1/debian/source/000077500000000000000000000000001451107006500171645ustar00rootroot00000000000000pg_fact_loader-2.0.1/debian/source/format000066400000000000000000000000141451107006500203720ustar00rootroot000000000000003.0 (quilt) pg_fact_loader-2.0.1/debian/tests/000077500000000000000000000000001451107006500170265ustar00rootroot00000000000000pg_fact_loader-2.0.1/debian/tests/control000066400000000000000000000002201451107006500204230ustar00rootroot00000000000000Depends: @, make, postgresql-contrib-15, postgresql-15-pglogical, postgresql-15-pglogical-ticker Tests: installcheck Restrictions: allow-stderr pg_fact_loader-2.0.1/debian/tests/control.in000066400000000000000000000002451451107006500210370ustar00rootroot00000000000000Depends: @, make, postgresql-contrib-PGVERSION, postgresql-PGVERSION-pglogical, postgresql-PGVERSION-pglogical-ticker Tests: installcheck Restrictions: allow-stderr pg_fact_loader-2.0.1/debian/tests/installcheck000077500000000000000000000001361451107006500214200ustar00rootroot00000000000000#!/bin/sh pg_buildext -o shared_preload_libraries=pglogical -o wal_level=logical installcheck pg_fact_loader-2.0.1/debian/upstream/000077500000000000000000000000001451107006500175245ustar00rootroot00000000000000pg_fact_loader-2.0.1/debian/upstream/metadata000066400000000000000000000003631451107006500212310ustar00rootroot00000000000000--- Bug-Database: https://github.com/enova/pg_fact_loader/issues Bug-Submit: 
https://github.com/enova/pg_fact_loader/issues/new Repository: https://github.com/enova/pg_fact_loader.git Repository-Browse: https://github.com/enova/pg_fact_loader pg_fact_loader-2.0.1/debian/watch000066400000000000000000000001071451107006500167130ustar00rootroot00000000000000version=4 https://github.com/enova/pg_fact_loader/tags .*/v(.*).tar.gz pg_fact_loader-2.0.1/expected/000077500000000000000000000000001451107006500162435ustar00rootroot00000000000000pg_fact_loader-2.0.1/expected/01_create_ext.out000066400000000000000000000003501451107006500214150ustar00rootroot00000000000000-- Allow running regression suite with upgrade paths \set v `echo ${FROMVERSION:-2.0}` SET client_min_messages TO warning; CREATE EXTENSION pglogical; CREATE EXTENSION pglogical_ticker; CREATE EXTENSION pg_fact_loader VERSION :'v'; pg_fact_loader-2.0.1/expected/02_schema.out000066400000000000000000000315151451107006500205420ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; DROP SCHEMA IF EXISTS test, test_fact, audit, test_audit_raw CASCADE ; TRUNCATE fact_loader.fact_tables CASCADE; TRUNCATE fact_loader.queue_tables CASCADE; --We use no serial/identity types here purely to be able to have consistency across multiple re-testing CREATE SCHEMA test; CREATE TABLE test.customers (customer_id INT PRIMARY KEY, customer_number text, phone TEXT, age INT); CREATE TABLE test.orders (order_id INT PRIMARY KEY, customer_id INT REFERENCES test.customers (customer_id) ON DELETE CASCADE, order_date DATE, total NUMERIC(10,2), row_updated_at TIMESTAMPTZ); CREATE TABLE test.emails (email_id INT PRIMARY KEY, customer_id INT REFERENCES test.customers (customer_id) ON DELETE CASCADE, read BOOLEAN); CREATE TABLE test.promos (promo_id INT PRIMARY KEY, description TEXT); CREATE TABLE test.email_promos (email_promo_id INT PRIMARY KEY, email_id INT REFERENCES test.emails (email_id) ON DELETE CASCADE, 
promo_id INT REFERENCES test.promos (promo_id) ON DELETE CASCADE); CREATE TABLE test.products (product_id INT PRIMARY KEY, product_name NAME); CREATE TABLE test.order_products (order_product_id INT PRIMARY KEY, order_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE, product_id INT REFERENCES test.products (product_id) ON DELETE CASCADE); --This table will test having to do multiple joins from changes to a table - join to orders, join to customers, in order to update customers_fact CREATE TABLE test.order_product_promos (order_product_promo_id INT PRIMARY KEY, order_product_id INT NOT NULL REFERENCES test.order_products (order_product_id) ON DELETE CASCADE, promo_id INT NOT NULL REFERENCES test.promos (promo_id) ON DELETE CASCADE); --This table will test multiple columns referring to a key of a fact table (orders.order_id) CREATE TABLE test.reorders (reorder_id INT PRIMARY KEY, base_order_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE, reorder_from_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE, reorder_to_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE); CREATE SCHEMA test_fact; CREATE TABLE test_fact.customers_fact (customer_id INT PRIMARY KEY, phone TEXT, age INT, last_order_id INT, order_product_count INT, order_product_promo_ids INT[], row_updated_at TIMESTAMPTZ); CREATE TABLE test_fact.orders_fact (order_id INT PRIMARY KEY, customer_id INT, order_date DATE, total NUMERIC(10,2), is_reorder BOOLEAN, row_updated_at TIMESTAMPTZ); --This is a silly dependent fact table definition, but will test correct updating of a fact table that depends on other fact tables CREATE TABLE test_fact.customersorders_fact (order_id INT PRIMARY KEY, customer_id INT, phone TEXT, age INT, max_order_date DATE, min_total NUMERIC(10,2), row_updated_at TIMESTAMPTZ); --This fact table def is an example of both a fact and base table dependency CREATE TABLE test_fact.order_emails_fact (order_id INT PRIMARY KEY, customer_id INT, order_date DATE, 
total NUMERIC(10,2), is_reorder BOOLEAN, num_emails INT, num_read INT, row_updated_at TIMESTAMPTZ); --This fact table tests nested fact table deps CREATE TABLE test_fact.customersorders_summary_fact (customer_id INT PRIMARY KEY, rows_in_customersorders_fact INT); --This fact table depends only on customers, which other fact tables depend on, and also emails, which the customers and test_fact.orders_fact do not depend on CREATE TABLE test_fact.emails_fact (email_id INT PRIMARY KEY, read BOOLEAN, promo_count INT); --This is to test range value tables CREATE TABLE test_fact.customer_order_history_fact (as_of_date daterange, customer_id INT, total_orders INT, last_order_date DATE, row_updated_at TIMESTAMPTZ, PRIMARY KEY (customer_id, as_of_date)); CREATE OR REPLACE FUNCTION test_fact.customers_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.customers_fact AS $BODY$ BEGIN RETURN QUERY SELECT customer_id, phone, age, os.last_order_id, ops.order_product_count::INT, oppi.order_product_promo_ids, now() AS row_updated_at FROM test.customers c LEFT JOIN LATERAL (SELECT MAX(order_id) AS last_order_id FROM test.orders o WHERE o.customer_id = c.customer_id) os ON TRUE LEFT JOIN LATERAL (SELECT COUNT(1) AS order_product_count FROM test.orders o INNER JOIN test.order_products op ON op.order_id = o.order_id WHERE o.customer_id = c.customer_id ) ops ON TRUE LEFT JOIN LATERAL (SELECT array_agg(opp.promo_id ORDER BY opp.promo_id) AS order_product_promo_ids FROM test.order_product_promos opp INNER JOIN test.order_products op ON opp.order_product_id = op.order_product_id INNER JOIN test.orders o ON op.order_id = o.order_id WHERE o.customer_id = c.customer_id) oppi ON TRUE WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customers_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customers_fact c WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION 
test_fact.orders_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.orders_fact AS $BODY$ BEGIN RETURN QUERY SELECT order_id, customer_id, order_date, total, is_reorder, now() AS row_updated_at FROM test.orders o LEFT JOIN LATERAL (SELECT EXISTS (SELECT 1 FROM test.reorders ro WHERE ro.reorder_to_id = o.order_id) AS is_reorder) ros ON TRUE WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.orders_fact_delete(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.orders_fact c WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION test_fact.customersorders_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.customersorders_fact AS $BODY$ BEGIN RETURN QUERY SELECT order_id, customer_id, phone, age, MAX(order_date), MIN(total)::NUMERIC(10,2), now() AS row_updated_at FROM test_fact.customers_fact ff INNER JOIN test_fact.orders_fact bf USING (customer_id) WHERE ff.customer_id = p_customer_id GROUP BY order_id, customer_id, phone, age; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customersorders_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customersorders_fact c WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION test_fact.customersorders_summary_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.customersorders_summary_fact AS $BODY$ BEGIN RETURN QUERY SELECT customer_id, COUNT(1)::INT AS rows_in_customersorders_fact FROM test_fact.customersorders_fact WHERE customer_id = p_customer_id GROUP BY customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customersorders_summary_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customersorders_summary_fact c WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; /*** This fact table def is an example of both a fact and base table dependency */ CREATE OR REPLACE FUNCTION 
test_fact.order_emails_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.order_emails_fact AS $BODY$ BEGIN RETURN QUERY SELECT order_id, customer_id, order_date, total, is_reorder, es.num_emails::INT, es.num_read::INT, now() AS row_updated_at FROM test_fact.orders_fact of LEFT JOIN LATERAL (SELECT COUNT(1) AS num_emails, SUM(CASE WHEN read THEN 1 ELSE 0 END) AS num_read FROM test.emails e WHERE e.customer_id = of.customer_id) es ON TRUE WHERE of.customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.order_emails_fact_delete(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.order_emails_fact c WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION test_fact.emails_fact_aggregator(p_email_id INT) RETURNS SETOF test_fact.emails_fact AS $BODY$ BEGIN RETURN QUERY SELECT email_id, read, promo_count::INT FROM test.emails e LEFT JOIN LATERAL (SELECT COUNT(1) AS promo_count FROM test.email_promos ep WHERE ep.email_id = e.email_id) eps ON TRUE WHERE email_id = p_email_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.emails_fact_delete(p_email_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.emails_fact c WHERE email_id = p_email_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customer_order_history_fact_merge(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN PERFORM test_fact.customer_order_history_fact_record_merge(o.*) FROM test.orders o WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; --TODO - this assumes inserts always have a greater or equal order_date - but is that just implementation? 
CREATE FUNCTION test_fact.customer_order_history_fact_record_merge(p_order test.orders) RETURNS VOID AS $BODY$ DECLARE v_add_to_total_orders integer = 1; BEGIN WITH ended_last_fact AS (UPDATE test_fact.customer_order_history_fact SET as_of_date = daterange(lower(as_of_date), p_order.order_date) , row_updated_at = p_order.row_updated_at WHERE customer_id = p_order.customer_id AND lower(as_of_date) <> p_order.order_date AND upper(as_of_date) = 'infinity' RETURNING *) INSERT INTO test_fact.customer_order_history_fact AS f (as_of_date, customer_id, total_orders, last_order_date, row_updated_at) SELECT daterange(p_order.order_date, 'infinity'), p_order.customer_id, COALESCE(ended_last_fact.total_orders, 0) + v_add_to_total_orders AS total_orders, p_order.order_date, now() FROM (SELECT p_order.customer_id) nes LEFT JOIN ended_last_fact ON nes.customer_id = ended_last_fact.customer_id ON CONFLICT (customer_id, as_of_date) DO UPDATE SET total_orders = f.total_orders + v_add_to_total_orders , last_order_date = p_order.order_date , row_updated_at = now(); END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customer_order_history_fact_update(p_order_id INT) RETURNS VOID AS $BODY$ DECLARE v_customer_id INT = (SELECT customer_id FROM test.orders WHERE order_id = p_order_id); BEGIN --For simplicities sake for this unusual event, just drop and rebuild history DELETE FROM test_fact.customer_order_history_fact cohf WHERE customer_id = v_customer_id; PERFORM test_fact.customer_order_history_fact_record_merge(o_ordered.*) FROM (SELECT * FROM test.orders WHERE customer_id = v_customer_id ORDER BY order_id) o_ordered; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customer_order_history_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN --For simplicities sake for this unusual event, just drop and rebuild history DELETE FROM test_fact.customer_order_history_fact cohf WHERE customer_id = p_customer_id; PERFORM 
test_fact.customer_order_history_fact_record_merge(o_ordered.*) FROM (SELECT * FROM test.orders WHERE customer_id = p_customer_id ORDER BY order_id) o_ordered; END; $BODY$ LANGUAGE plpgsql; SELECT fact_loader.create_table_loader_function((schemaname||'.'||relname||'_aggregator')::REGPROC,relid,'{row_updated_at}') FROM pg_stat_user_tables WHERE relname IN('customers_fact','orders_fact','customersorders_fact','emails_fact','order_emails_fact','customersorders_summary_fact') ORDER BY schemaname, relname; create_table_loader_function ---------------------------------------------- test_fact.customers_fact_merge test_fact.customersorders_fact_merge test_fact.customersorders_summary_fact_merge test_fact.emails_fact_merge test_fact.order_emails_fact_merge test_fact.orders_fact_merge (6 rows) INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customers_fact'::REGCLASS, 'test_fact.customers_fact_aggregator'::REGPROC, 1); INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.orders_fact'::REGCLASS, 'test_fact.orders_fact_aggregator'::REGPROC, 2); --TODO feature INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customersorders_fact'::REGCLASS, 'test_fact.customersorders_fact_aggregator'::REGPROC, 3); INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.emails_fact' ::REGCLASS, 'test_fact.emails_fact_aggregator'::REGPROC, 4); --TODO feature INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.order_emails_fact' ::REGCLASS, 'test_fact.order_emails_fact_aggregator'::REGPROC, 5); --TODO feature INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customer_order_history_fact' ::REGCLASS, NULL, 6); --Nested fact table deps INSERT INTO fact_loader.fact_tables 
(fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customersorders_summary_fact' ::REGCLASS, 'test_fact.customersorders_summary_fact_aggregator'::REGPROC, 7); pg_fact_loader-2.0.1/expected/03_audit.out000066400000000000000000001002421451107006500204030ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; CREATE EXTENSION IF NOT EXISTS hstore; CREATE SCHEMA IF NOT EXISTS audit; CREATE OR REPLACE FUNCTION audit.no_dml_on_audit_table() RETURNS TRIGGER AS $$ BEGIN RAISE EXCEPTION 'No common-case updates/deletes/truncates allowed on audit table'; RETURN NULL; END; $$ LANGUAGE plpgsql; /*** TO BUILD THE REST OF THE AUDIT SQL: SELECT format('./audit.sh %s %s %s >> sql/03_audit.sql', schemaname, relname, pkey) AS script FROM pg_stat_user_tables st INNER JOIN LATERAL (SELECT a.attname AS pkey FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = st.relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aft ON TRUE WHERE st.schemaname = 'test' ORDER BY schemaname, relname; */ CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.customers_audit ( customers_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_customers"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.customers_audit_customers_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS 
value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.customers FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_customers" ('customer_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.email_promos_audit ( email_promos_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_email_promos"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.email_promos_audit_email_promos_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.email_promos FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_email_promos" ('email_promo_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.emails_audit ( emails_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_emails"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.emails_audit_emails_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.emails FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_emails" ('email_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.order_product_promos_audit ( order_product_promos_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_order_product_promos"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.order_product_promos_audit_order_product_promos_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.order_product_promos FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_order_product_promos" ('order_product_promo_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.order_products_audit ( order_products_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_order_products"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.order_products_audit_order_products_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.order_products FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_order_products" ('order_product_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.orders_audit ( orders_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_orders"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.orders_audit_orders_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.orders FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_orders" ('order_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.products_audit ( products_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_products"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.products_audit_products_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.products FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_products" ('product_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.promos_audit ( promos_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_promos"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.promos_audit_promos_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.promos FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_promos" ('promo_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.reorders_audit ( reorders_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_reorders"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.reorders_audit_reorders_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.reorders FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_reorders" ('reorder_id'); pg_fact_loader-2.0.1/expected/04_seeds.out000066400000000000000000000514701451107006500204110ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; --Test at least a column as timestamptz ALTER TABLE test_audit_raw.emails_audit ALTER COLUMN changed_at TYPE TIMESTAMPTZ; INSERT INTO test.customers (customer_id, customer_number, phone, age) SELECT generate_series, 'cust2018'||generate_series, '000123456'||RIGHT(generate_series::TEXT,1), 35 FROM generate_series(1,10); INSERT INTO test.products (product_id, product_name) VALUES (1,'Sour Candy'), (2,'Pool Table'), (3,'One Pocket Book'), (4,'Fast Turbo Car'), (5,'Black Cassock'), (6,'Pacifier'), (7,'Book Light'), (8,'A Dozen Roses'); INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (1, 1, '2018-04-10', 100.00), (2, 3, '2018-04-11', 200.00), (3, 5, '2018-04-12', 2000.00); INSERT INTO test.order_products (order_product_id, order_id, product_id) VALUES (1, 1, 1), (2, 1, 3), (3, 
1, 5), (4, 2, 7), (5, 2, 8), (6, 3, 2); INSERT INTO test.promos (promo_id, description) VALUES (1, '50% off 9 foot pool table with real Italian slate'); INSERT INTO test.emails (email_id, customer_id, read) VALUES (1, 5, true); INSERT INTO test.email_promos (email_promo_id, email_id, promo_id) VALUES (1, 1, 1); INSERT INTO test.order_product_promos (order_product_promo_id, order_product_id, promo_id) VALUES (1, 6, 1); INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (4, 1, '2018-04-13', 100.00); INSERT INTO test.reorders (reorder_id, base_order_id, reorder_from_id, reorder_to_id) VALUES (1, 1, 1, 4); INSERT INTO fact_loader.queue_tables (queue_table_relid, queue_of_base_table_relid, pglogical_node_if_id, queue_table_tz) SELECT st.relid::REGCLASS, sts.relid::REGCLASS, 0, CASE WHEN st.relname = 'emails_audit' THEN NULL ELSE 'America/Chicago' END FROM (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) st INNER JOIN (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) sts ON sts.schemaname||'_audit_raw' = st.schemaname AND sts.relname||'_audit' = st.relname WHERE st.schemaname = 'test_audit_raw'; SELECT fact_loader.add_batch_id_fields(); add_batch_id_fields --------------------- (1 row) /**** Configuration for customers_fact */ --Queue tables WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.customers_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.customers_fact_merge'::REGPROC AS update_merge_proid, CASE WHEN queue_of_base_table_relid = 'test.customers'::REGCLASS THEN 'test_fact.customers_fact_delete'::REGPROC ELSE 'test_fact.customers_fact_merge'::REGPROC END AS delete_merge_proid, CASE WHEN queue_of_base_table_relid = 'test.customers'::REGCLASS THEN '{phone, age}'::TEXT[] WHEN queue_of_base_table_relid = 'test.orders'::REGCLASS --This update may be implausible, but would affect 
the fact table THEN '{customer_id}'::TEXT[] --Let's just consider that any update to the other tables should cause concern and we want to be safe and refresh all ELSE NULL END AS relevant_change_columns FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test.customers'::REGCLASS, 'test.orders'::REGCLASS, 'test.order_products'::REGCLASS, 'test.order_product_promos'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, level, return_columns, is_fact_key, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key) SELECT queue_table_dep_id, 1, '{customer_id}'::name[], true, null, null, null::name[], null FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.customers'::REGCLASS, 'test.orders'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 1, '{order_id}', false, 'test.orders'::REGCLASS, 'order_id', '{customer_id}', true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND 
queue_of_base_table_relid IN( 'test.order_products'::REGCLASS) UNION ALL /**** These 2 are an example of a dependency requiring multiple joins to get the customer_id key needed to update the customers_fact table */ SELECT queue_table_dep_id, 1, '{order_product_id}', false, 'test.order_products'::REGCLASS, 'order_product_id', '{order_id}', false FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.order_product_promos'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 2, '{order_id}', false, 'test.orders'::REGCLASS, 'order_id', '{customer_id}', true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.order_product_promos'::REGCLASS); /**** Configuration for orders_fact */ --Queue tables INSERT INTO fact_loader.queue_table_deps ( fact_table_id, queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid ) SELECT fact_table_id ,(SELECT queue_table_id FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN ('test.orders'::REGCLASS)) , '{order_date, total}'::TEXT[] , 'test_fact.orders_fact_merge'::REGPROC AS insert_merge_proid , 'test_fact.orders_fact_merge'::REGPROC AS update_merge_proid , 'test_fact.orders_fact_delete'::REGPROC AS delete_merge_proid FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; INSERT INTO fact_loader.queue_table_deps ( fact_table_id, queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid ) SELECT fact_table_id ,(SELECT queue_table_id FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN 
('test.reorders'::REGCLASS)) , NULL , 'test_fact.orders_fact_merge'::REGPROC AS insert_merge_proid , 'test_fact.orders_fact_merge'::REGPROC AS update_merge_proid , 'test_fact.orders_fact_merge'::REGPROC AS delete_merge_proid FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; --Key retrieval INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key) SELECT queue_table_dep_id, evts.evt, 1, evts.return_columns, true, null, null, null::name[], null::boolean FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) CROSS JOIN (VALUES ('I','{customer_id}'::name[]), ('U','{customer_id}'::name[]), ('D','{order_id}'::name[])) evts (evt, return_columns) WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.orders'::REGCLASS) UNION ALL SELECT queue_table_dep_id, NULL, 1, '{base_order_id,reorder_from_id,reorder_to_id}', false, 'test.orders', 'order_id', '{customer_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.reorders'::REGCLASS); /**** Configuration for customersorders_fact_aggregator */ --Only deps in fact_table_deps for this fact table because it depends on no queue tables directly --TODO - revisit and add delete functions as appropriate INSERT INTO fact_loader.fact_table_deps (parent_id, child_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid) VALUES ((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS),(SELECT fact_table_id 
FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customersorders_fact'::REGCLASS),'test_fact.customersorders_fact_merge','test_fact.customersorders_fact_merge','test_fact.customersorders_fact_delete'), ((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS),(SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customersorders_fact'::REGCLASS),'test_fact.customersorders_fact_merge','test_fact.customersorders_fact_merge','test_fact.customersorders_fact_delete'); /**** Configuration for order_emails_fact */ --Queue tables WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.order_emails_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.order_emails_fact_merge'::REGPROC AS update_merge_proid, 'test_fact.order_emails_fact_merge'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN('test.emails'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, level, return_columns, is_fact_key) SELECT queue_table_dep_id, 1, '{customer_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS AND queue_of_base_table_relid IN('test.emails'::REGCLASS); --Fact table deps INSERT INTO fact_loader.fact_table_deps (parent_id, child_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid) 
VALUES ((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS),(SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS),'test_fact.order_emails_fact_merge','test_fact.order_emails_fact_merge','test_fact.order_emails_fact_delete'); /**** Configuration for emails_fact */ --Queue tables WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.emails_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.emails_fact_merge'::REGPROC AS update_merge_proid, CASE WHEN queue_of_base_table_relid = 'test.emails'::REGCLASS THEN 'test_fact.emails_fact_delete'::REGPROC ELSE 'test_fact.emails_fact_merge'::REGPROC END AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test.emails'::REGCLASS, 'test.email_promos'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.emails_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, level, return_columns, is_fact_key) SELECT queue_table_dep_id, 1, '{email_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.emails_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.emails'::REGCLASS, 'test.email_promos'::REGCLASS); /**** Configuration for 
customer_order_history_fact */ --Queue tables WITH qt AS ( SELECT *, 'test_fact.customer_order_history_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.customer_order_history_fact_update'::REGPROC AS update_merge_proid, 'test_fact.customer_order_history_fact_delete'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN ('test.orders'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, qt.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN qt WHERE fact_table_relid = 'test_fact.customer_order_history_fact'::REGCLASS; /**** For this fact table, we need a different key_retrieval for deletes, so we enter all 3 separately */ INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key) SELECT queue_table_dep_id, evts.evt, 1, '{order_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) CROSS JOIN (VALUES ('I'),('U')) evts (evt) WHERE fact_table_relid = 'test_fact.customer_order_history_fact'::REGCLASS AND queue_of_base_table_relid IN('test.orders'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 'D', 1, '{customer_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customer_order_history_fact'::REGCLASS AND queue_of_base_table_relid IN('test.orders'::REGCLASS); /**** Configuration for test_fact.customersorders_summary_fact */ INSERT INTO fact_loader.fact_table_deps (parent_id, child_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid) VALUES ((SELECT fact_table_id FROM 
fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customersorders_fact' :: REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customersorders_summary_fact' :: REGCLASS), 'test_fact.customersorders_summary_fact_merge', 'test_fact.customersorders_summary_fact_merge', 'test_fact.customersorders_summary_fact_delete'); /**** Because we need to manually adjust the dependent fact table config for at least one table, we do this manually 1. Now that configs are all in place, run fact_loader.refresh_fact_table_dep_queue_cutoffs(); to build the deps table 2. Query based on fact_table_relid and queue_table_relid to find the correct fact_table_dep_queue_table_dep_id to update 3. Set this dep to have a different delete function for this queue table */ SELECT fact_loader.refresh_fact_table_dep_queue_table_deps(); refresh_fact_table_dep_queue_table_deps ----------------------------------------- (1 row) WITH to_update AS ( SELECT ftdqtd.fact_table_dep_queue_table_dep_id , qtd.queue_table_id , qt.queue_table_relid , ft.fact_table_id FROM fact_loader.fact_table_deps ftd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqtd ON ftdqtd.fact_table_dep_id = ftd.fact_table_dep_id INNER JOIN fact_loader.queue_table_deps qtd ON qtd.queue_table_dep_id = ftdqtd.queue_table_dep_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS AND qt.queue_table_relid = 'test_audit_raw.reorders_audit'::REGCLASS ) UPDATE fact_loader.fact_table_dep_queue_table_deps SET delete_merge_proid = 'test_fact.order_emails_fact_merge' WHERE fact_table_dep_queue_table_dep_id = (SELECT fact_table_dep_queue_table_dep_id FROM to_update); /**** Both of these next 2 are the same situation because one depends on the other ****/ WITH to_update AS ( SELECT ftdqtd.fact_table_dep_queue_table_dep_id , 
qtd.queue_table_id , qt.queue_table_relid , ft.fact_table_id , ft.fact_table_relid FROM fact_loader.fact_table_deps ftd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqtd ON ftdqtd.fact_table_dep_id = ftd.fact_table_dep_id INNER JOIN fact_loader.queue_table_deps qtd ON qtd.queue_table_dep_id = ftdqtd.queue_table_dep_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE fact_table_relid = 'test_fact.customersorders_fact'::REGCLASS AND qt.queue_table_relid IN('test_audit_raw.reorders_audit'::REGCLASS,'test_audit_raw.order_product_promos_audit'::REGCLASS,'test_audit_raw.order_products_audit'::REGCLASS) ) UPDATE fact_loader.fact_table_dep_queue_table_deps SET delete_merge_proid = 'test_fact.customersorders_fact_merge' WHERE fact_table_dep_queue_table_dep_id IN (SELECT fact_table_dep_queue_table_dep_id FROM to_update); WITH to_update AS ( SELECT ftdqtd.fact_table_dep_queue_table_dep_id , qtd.queue_table_id , qt.queue_table_relid , ft.fact_table_id , ft.fact_table_relid FROM fact_loader.fact_table_deps ftd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqtd ON ftdqtd.fact_table_dep_id = ftd.fact_table_dep_id INNER JOIN fact_loader.queue_table_deps qtd ON qtd.queue_table_dep_id = ftdqtd.queue_table_dep_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE fact_table_relid = 'test_fact.customersorders_summary_fact'::REGCLASS AND qt.queue_table_relid IN('test_audit_raw.reorders_audit'::REGCLASS,'test_audit_raw.order_product_promos_audit'::REGCLASS,'test_audit_raw.order_products_audit'::REGCLASS) ) UPDATE fact_loader.fact_table_dep_queue_table_deps SET delete_merge_proid = 'test_fact.customersorders_summary_fact_merge' WHERE fact_table_dep_queue_table_dep_id IN (SELECT fact_table_dep_queue_table_dep_id FROM to_update); /**** DEMO SELECT * 
FROM fact_loader.fact_tables ORDER BY priority; SELECT * FROM fact_loader.queue_tables ORDER BY queue_table_relid::REGCLASS::TEXT; SELECT ft.fact_table_relid, qt.queue_table_relid, krs.* FROM fact_loader.key_retrieval_sequences krs INNER JOIN fact_loader.queue_table_deps qtd USING (queue_table_dep_id) INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) ORDER BY qtd.queue_table_dep_id, krs.filter_scope, krs.level; SELECT qtd.queue_table_dep_id, ft.fact_table_relid, qt.queue_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, qtd.last_cutoff_source_time, qtd.insert_merge_proid, qtd.update_merge_proid, qtd.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) ORDER BY ft.fact_table_relid::TEXT, qt.queue_table_relid::TEXT; */ pg_fact_loader-2.0.1/expected/05_pgl_setup.out000066400000000000000000000135201451107006500213030ustar00rootroot00000000000000\set d `echo ${TESTDRIVER:-pglogical}` \set x `echo ${TESTDROPEXT:-false}` SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; SELECT pglogical.create_node('test','host=localhost') INTO TEMP foonode; DROP TABLE foonode; WITH sets AS ( SELECT 'test'||generate_series AS set_name FROM generate_series(1,1) ) SELECT pglogical.create_replication_set (set_name:=s.set_name ,replicate_insert:=TRUE ,replicate_update:=TRUE ,replicate_delete:=TRUE ,replicate_truncate:=TRUE) AS result INTO TEMP repsets FROM sets s WHERE NOT EXISTS ( SELECT 1 FROM pglogical.replication_set WHERE set_name = s.set_name); DROP TABLE repsets; -- native equivalent CREATE PUBLICATION test1 WITH (publish = 'insert,update,delete'); SELECT pglogical_ticker.deploy_ticker_tables(); deploy_ticker_tables ---------------------- 4 (1 row) -- native equivalent CREATE SCHEMA IF NOT 
EXISTS logical_ticker; CREATE TABLE IF NOT EXISTS logical_ticker.tick ( db text DEFAULT current_database() NOT NULL PRIMARY KEY, tick_time TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, tier SMALLINT DEFAULT 1 NULL ); --As of pglogical_ticker 1.2, we don't tick tables not in replication uselessly, but this --would break our tests which did exactly that. So we can fix the test breakage by just adding these tables --to replication as they would be on an actual provider SELECT pglogical_ticker.add_ticker_tables_to_replication(); add_ticker_tables_to_replication ---------------------------------- 4 (1 row) --The tests will manually run tick() before new data is needed -- native equivalent ALTER PUBLICATION test1 ADD TABLE logical_ticker.tick; CREATE TEMP TABLE vars AS SELECT :'d'::text as driver, :'x'::boolean as drop_ext; DO $$ DECLARE v_record RECORD; BEGIN IF (SELECT driver FROM vars) = 'native' THEN FOR v_record IN SELECT schemaname, tablename FROM pg_tables WHERE schemaname IN('test', 'test_audit_raw') LOOP EXECUTE format('ALTER PUBLICATION test1 ADD TABLE %s.%s', v_record.schemaname, v_record.tablename); END LOOP; CREATE OR REPLACE FUNCTION test.tick() RETURNS VOID AS $BODY$ BEGIN INSERT INTO logical_ticker.tick (tick_time) VALUES (now()) ON CONFLICT (db) DO UPDATE SET tick_time = now(); END;$BODY$ LANGUAGE plpgsql; CREATE TABLE public.mock_pg_subscription ( oid oid NOT NULL, subdbid oid NOT NULL, subname name NOT NULL, subowner oid NOT NULL, subenabled boolean NOT NULL, subconninfo text NOT NULL, subslotname name NOT NULL, subsynccommit text NOT NULL, subpublications text[] NOT NULL ); INSERT INTO mock_pg_subscription (oid, subdbid, subname, subowner, subenabled, subconninfo, subslotname, subsynccommit, subpublications) VALUES (10000, (SELECT oid FROM pg_database WHERE datname = current_database()), 'test', 16384, true, 'host=example.com dbname=contrib_regression', 'test', 'off', '{test1}'); CREATE OR REPLACE FUNCTION fact_loader.subscription() RETURNS TABLE 
(oid OID, subpublications text[], subconninfo text) AS $BODY$ BEGIN RETURN QUERY SELECT s.oid, s.subpublications, s.subconninfo FROM mock_pg_subscription s; END; $BODY$ LANGUAGE plpgsql; CREATE TABLE public.mock_pg_subscription_rel ( srsubid oid NOT NULL, srrelid oid NOT NULL, srsubstate "char" NOT NULL, srsublsn pg_lsn NOT NULL ); INSERT INTO mock_pg_subscription_rel (srsubid, srrelid, srsubstate, srsublsn) SELECT (SELECT oid FROM mock_pg_subscription LIMIT 1), c.oid, 'r', '0/0' FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname IN('test', 'test_audit_raw') AND c.relkind = 'r'; CREATE OR REPLACE FUNCTION fact_loader.subscription_rel() RETURNS TABLE (srsubid OID, srrelid OID) AS $BODY$ BEGIN RETURN QUERY SELECT sr.srsubid, sr.srrelid FROM mock_pg_subscription_rel sr; END; $BODY$ LANGUAGE plpgsql; IF (SELECT drop_ext FROM vars) THEN DROP EXTENSION pglogical CASCADE; END IF; ELSE UPDATE fact_loader.queue_tables SET pglogical_node_if_id = (SELECT if_id FROM pglogical.node_interface); CREATE OR REPLACE FUNCTION test.tick() RETURNS VOID AS $BODY$ BEGIN PERFORM pglogical_ticker.tick(); END;$BODY$ LANGUAGE plpgsql; END IF; END$$; /*** Mock this function so that we find results locally */ CREATE OR REPLACE FUNCTION pglogical_ticker.all_subscription_tickers() RETURNS TABLE (provider_name NAME, set_name NAME, source_time TIMESTAMPTZ) AS $BODY$ BEGIN RETURN QUERY SELECT t.provider_name, 'test1'::NAME AS set_name, t.source_time FROM pglogical_ticker.test1 t; END; $BODY$ LANGUAGE plpgsql; /*** Mock so we get what we want here also */ CREATE OR REPLACE FUNCTION fact_loader.logical_subscription() RETURNS TABLE (subid OID, subpublications text[], subconninfo text, dbname text, driver fact_loader.driver) AS $BODY$ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN RETURN QUERY EXECUTE $$ SELECT if_id AS subid, '{test1}'::text[] as subpublications, null::text AS subconninfo, null::text AS dbname, 'pglogical'::fact_loader.driver 
AS driver FROM pglogical.node_interface UNION ALL SELECT s.oid, s.subpublications, s.subconninfo, (regexp_matches(s.subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription() s; $$; ELSE RETURN QUERY SELECT s.oid, s.subpublications, s.subconninfo, (regexp_matches(s.subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription() s; END IF; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/expected/06_basic_workers.out000066400000000000000000000204011451107006500221330ustar00rootroot00000000000000SET client_min_messages TO warning; -- Client time zone should not change functionality of worker - use a different one here SET TIMEZONE TO 'UTC'; --Enable all except dep tables for now UPDATE fact_loader.fact_tables ft SET enabled = TRUE WHERE NOT EXISTS (SELECT 1 FROM fact_loader.fact_table_deps d WHERE d.child_id = ft.fact_table_id); --Move the mock replication stream forward to now SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------+------------+-----+---------------+---------------------+------------------------- 1 | 0001234561 | 35 | 4 | 3 | 2 | 0001234562 | 35 | | 0 | 3 | 0001234563 | 35 | 2 | 2 | 4 | 0001234564 | 35 | | 0 | 5 | 0001234565 | 35 | 3 | 1 | {1} 6 | 0001234566 | 35 | | 0 | 7 | 0001234567 | 35 | | 0 | 8 | 0001234568 | 35 | | 0 | 9 | 0001234569 | 35 | | 0 | 10 | 0001234560 | 35 | | 0 | (10 rows) --test debugging feature on this table SET log_min_messages TO debug3; SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; order_id | customer_id | 
order_date | total | is_reorder ----------+-------------+------------+---------+------------ 1 | 1 | 04-10-2018 | 100.00 | f 2 | 3 | 04-11-2018 | 200.00 | f 3 | 5 | 04-12-2018 | 2000.00 | f 4 | 1 | 04-13-2018 | 100.00 | t (4 rows) RESET log_min_messages; DO $$ BEGIN IF NOT (SELECT COUNT(1) FROM fact_loader.debug_process_queue) = 3 THEN RAISE EXCEPTION '%', 'No worky'; END IF; END$$; SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; order_id | customer_id | phone | age | max_order_date | min_total ----------+-------------+-------+-----+----------------+----------- (0 rows) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; email_id | read | promo_count ----------+------+------------- 1 | t | 1 (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder | num_emails | num_read ----------+-------------+------------+-------+------------+------------+---------- (0 rows) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, as_of_date, total_orders, last_order_date FROM test_fact.customer_order_history_fact ORDER BY customer_id, as_of_date; customer_id | as_of_date | total_orders | last_order_date -------------+-------------------------+--------------+----------------- 1 | [04-10-2018,04-13-2018) | 1 | 04-10-2018 1 | [04-13-2018,infinity) | 2 | 04-13-2018 3 | [04-11-2018,infinity) | 1 | 04-11-2018 5 | [04-12-2018,infinity) | 1 | 04-12-2018 (4 rows) -- Set time zone back to America/Chicago because the audit data is being logged at that time zone -- (another great reason NEVER to use timestamp, but functionality we need at any rate) SET TIMEZONE TO 
'America/Chicago'; UPDATE test.customers SET age = 40 WHERE customer_id = 2; -- We need to make deletes handled with recursive joins as well first before testing this -- DELETE FROM test.customers WHERE customer_id = 3; /**** This should not update the fact table, because the replication stream is behind these last 2 updates */ SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------+------------+-----+---------------+---------------------+------------------------- 1 | 0001234561 | 35 | 4 | 3 | 2 | 0001234562 | 35 | | 0 | 3 | 0001234563 | 35 | 2 | 2 | 4 | 0001234564 | 35 | | 0 | 5 | 0001234565 | 35 | 3 | 1 | {1} 6 | 0001234566 | 35 | | 0 | 7 | 0001234567 | 35 | | 0 | 8 | 0001234568 | 35 | | 0 | 9 | 0001234569 | 35 | | 0 | 10 | 0001234560 | 35 | | 0 | (10 rows) UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE, -- Test 1.7 pre-hook feature pre_execute_hook_sql = 'CREATE TABLE cool_pre_execute_hook_sql (id int);' WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------+------------+-----+---------------+---------------------+------------------------- 1 | 0001234561 | 35 | 4 | 3 | 2 | 0001234562 | 40 | | 0 | 3 | 0001234563 | 35 | 2 | 2 | 4 | 0001234564 | 35 | | 0 | 5 | 0001234565 | 35 | 3 | 1 | {1} 6 | 0001234566 | 35 | | 0 | 7 | 0001234567 | 35 | | 0 | 8 | 0001234568 | 35 | | 0 | 9 | 0001234569 | 35 | | 0 | 10 | 0001234560 | 35 | | 0 | (10 rows) SELECT * FROM 
cool_pre_execute_hook_sql; id ---- (0 rows) UPDATE fact_loader.fact_tables SET pre_execute_hook_sql = NULL WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; --This would simulate an application's changes being out of order now UPDATE test.customers SET age = 41 WHERE customer_id = 2; SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) --Pretend the transaction for this began before the update above - by lowering the actual audit_id and tx time UPDATE test.customers SET age = 42 WHERE customer_id = 2; UPDATE test_audit_raw.customers_audit SET customers_audit_id = customers_audit_id - 1000, changed_at = changed_at - interval '1 minute' WHERE customers_audit_id = (SELECT MAX(customers_audit_id) FROM test_audit_raw.customers_audit); --However, we assume fact_loader_batch_id is still in order because we have a single-threaded --predicatable order with pglogical or a local queue table fed by pg_fact_loader --This will be missed by version 1.2, but not 1.3 SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT (age = 42) AS age_is_updated FROM test_fact.customers_fact WHERE customer_id = 2 ORDER BY customer_id; age_is_updated ---------------- t (1 row) ALTER EXTENSION pg_fact_loader UPDATE; UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; pg_fact_loader-2.0.1/expected/07_launch_worker.out000066400000000000000000000037141451107006500221520ustar00rootroot00000000000000-- NOTE: Original functionality of background worker has been removed. 
Retaining this test for consistency, -- replacing calls to launch the worker with instead direct calls to SELECT fact_loader.worker(); SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; --Make one change to guarantee we want to see a fact table change, ensure rep stream is up to date UPDATE test.customers SET phone = '0001234577' WHERE customer_id = 10; SELECT test.tick(); tick ------ (1 row) --Ensure this one table is prioritized UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------+------------+-----+---------------+---------------------+------------------------- 1 | 0001234561 | 35 | 4 | 3 | 2 | 0001234562 | 42 | | 0 | 3 | 0001234563 | 35 | 2 | 2 | 4 | 0001234564 | 35 | | 0 | 5 | 0001234565 | 35 | 3 | 1 | {1} 6 | 0001234566 | 35 | | 0 | 7 | 0001234567 | 35 | | 0 | 8 | 0001234568 | 35 | | 0 | 9 | 0001234569 | 35 | | 0 | 10 | 0001234577 | 35 | | 0 | (10 rows) UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; pg_fact_loader-2.0.1/expected/08_fact_table_deps.out000066400000000000000000000041261451107006500224050ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; --Enable the dep tables - which thereby should be placed first in line! 
UPDATE fact_loader.fact_tables ft SET enabled = TRUE WHERE EXISTS (SELECT 1 FROM fact_loader.fact_table_deps d WHERE d.child_id = ft.fact_table_id); SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; order_id | customer_id | phone | age | max_order_date | min_total ----------+-------------+------------+-----+----------------+----------- 1 | 1 | 0001234561 | 35 | 04-10-2018 | 100.00 2 | 3 | 0001234563 | 35 | 04-11-2018 | 200.00 3 | 5 | 0001234565 | 35 | 04-12-2018 | 2000.00 4 | 1 | 0001234561 | 35 | 04-13-2018 | 100.00 (4 rows) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder | num_emails | num_read ----------+-------------+------------+---------+------------+------------+---------- 1 | 1 | 04-10-2018 | 100.00 | f | 0 | 2 | 3 | 04-11-2018 | 200.00 | f | 0 | 3 | 5 | 04-12-2018 | 2000.00 | f | 1 | 1 4 | 1 | 04-13-2018 | 100.00 | t | 0 | (4 rows) /**** Nested fact table deps */ SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, rows_in_customersorders_fact FROM test_fact.customersorders_summary_fact ORDER BY customer_id; customer_id | rows_in_customersorders_fact -------------+------------------------------ 1 | 2 3 | 1 5 | 1 (3 rows) pg_fact_loader-2.0.1/expected/09_purge.out000066400000000000000000000010511451107006500204230ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; SELECT COUNT(1) FROM test_audit_raw.customers_audit; count ------- 14 (1 row) --We call this explicitly, because the worker will take the default add_interval of 1 hour, thus --won't see any actual purging in the test suite. 
SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); purge_queues -------------- (1 row) SELECT COUNT(1) FROM test_audit_raw.customers_audit; count ------- 4 (1 row) pg_fact_loader-2.0.1/expected/10_delete.out000066400000000000000000000135521451107006500205440ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; DELETE FROM test.customers WHERE customer_id = 3; SELECT pg_sleep(1); pg_sleep ---------- (1 row) SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------+------------+-----+---------------+---------------------+------------------------- 1 | 0001234561 | 35 | 4 | 3 | 2 | 0001234562 | 42 | | 0 | 4 | 0001234564 | 35 | | 0 | 5 | 0001234565 | 35 | 3 | 1 | {1} 6 | 0001234566 | 35 | | 0 | 7 | 0001234567 | 35 | | 0 | 8 | 0001234568 | 35 | | 0 | 9 | 0001234569 | 35 | | 0 | 10 | 0001234577 | 35 | | 0 | (9 rows) SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+---------+------------ 1 | 1 | 04-10-2018 | 100.00 | f 3 | 5 | 04-12-2018 | 2000.00 | f 4 | 1 | 04-13-2018 | 100.00 | t (3 rows) SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY 
order_id; order_id | customer_id | phone | age | max_order_date | min_total ----------+-------------+------------+-----+----------------+----------- 1 | 1 | 0001234561 | 35 | 04-10-2018 | 100.00 3 | 5 | 0001234565 | 35 | 04-12-2018 | 2000.00 4 | 1 | 0001234561 | 35 | 04-13-2018 | 100.00 (3 rows) SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; email_id | read | promo_count ----------+------+------------- 1 | t | 1 (1 row) SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder | num_emails | num_read ----------+-------------+------------+---------+------------+------------+---------- 1 | 1 | 04-10-2018 | 100.00 | f | 0 | 3 | 5 | 04-12-2018 | 2000.00 | f | 1 | 1 4 | 1 | 04-13-2018 | 100.00 | t | 0 | (3 rows) SELECT customer_id, as_of_date, total_orders, last_order_date FROM test_fact.customer_order_history_fact ORDER BY customer_id, as_of_date; customer_id | as_of_date | total_orders | last_order_date -------------+-------------------------+--------------+----------------- 1 | [04-10-2018,04-13-2018) | 1 | 04-10-2018 1 | [04-13-2018,infinity) | 2 | 04-13-2018 5 | [04-12-2018,infinity) | 1 | 04-12-2018 (3 rows) SELECT customer_id, rows_in_customersorders_fact FROM test_fact.customersorders_summary_fact ORDER BY customer_id; customer_id | rows_in_customersorders_fact -------------+------------------------------ 1 | 2 5 | 1 (2 rows) SELECT COUNT(1) FROM test_audit_raw.customers_audit; count ------- 5 (1 row) --We call this explicitly, because the worker will take the default add_interval of 1 hour, thus --won't see any actual purging in the test suite. 
SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); purge_queues -------------- (1 row) SELECT COUNT(1) FROM test_audit_raw.customers_audit; count ------- 0 (1 row) DELETE FROM test.reorders; SELECT pg_sleep(1); pg_sleep ---------- (1 row) SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+---------+------------ 1 | 1 | 04-10-2018 | 100.00 | f 3 | 5 | 04-12-2018 | 2000.00 | f 4 | 1 | 04-13-2018 | 100.00 | f (3 rows) SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder | num_emails | num_read ----------+-------------+------------+---------+------------+------------+---------- 1 | 1 | 04-10-2018 | 100.00 | f | 0 | 3 | 5 | 04-12-2018 | 2000.00 | f | 1 | 1 4 | 1 | 04-13-2018 | 100.00 | f | 0 | (3 rows) pg_fact_loader-2.0.1/expected/11_more_data.out000066400000000000000000000104311451107006500212270ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /*** Try this odd case to be sure we process all events in order correctly ***/ UPDATE test.orders SET total = 1000.00 WHERE order_id = 3; DELETE FROM test.orders WHERE order_id = 3; INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (3, 5, '2018-04-12', 2000.00); --Move the mock replication stream forward to now SELECT test.tick(); tick 
------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------+------------+-----+---------------+---------------------+------------------------- 1 | 0001234561 | 35 | 4 | 3 | 2 | 0001234562 | 42 | | 0 | 4 | 0001234564 | 35 | | 0 | 5 | 0001234565 | 35 | 3 | 0 | 6 | 0001234566 | 35 | | 0 | 7 | 0001234567 | 35 | | 0 | 8 | 0001234568 | 35 | | 0 | 9 | 0001234569 | 35 | | 0 | 10 | 0001234577 | 35 | | 0 | (9 rows) SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+---------+------------ 1 | 1 | 04-10-2018 | 100.00 | f 3 | 5 | 04-12-2018 | 2000.00 | f 4 | 1 | 04-13-2018 | 100.00 | f (3 rows) SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; order_id | customer_id | phone | age | max_order_date | min_total ----------+-------------+------------+-----+----------------+----------- 1 | 1 | 0001234561 | 35 | 04-10-2018 | 100.00 3 | 5 | 0001234565 | 35 | 04-12-2018 | 2000.00 4 | 1 | 0001234561 | 35 | 04-13-2018 | 100.00 (3 rows) SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; email_id | read | promo_count ----------+------+------------- 1 | t | 1 (1 row) SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM 
test_fact.order_emails_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder | num_emails | num_read ----------+-------------+------------+---------+------------+------------+---------- 1 | 1 | 04-10-2018 | 100.00 | f | 0 | 3 | 5 | 04-12-2018 | 2000.00 | f | 1 | 1 4 | 1 | 04-13-2018 | 100.00 | f | 0 | (3 rows) SELECT customer_id, as_of_date, total_orders, last_order_date FROM test_fact.customer_order_history_fact ORDER BY customer_id, as_of_date; customer_id | as_of_date | total_orders | last_order_date -------------+-------------------------+--------------+----------------- 1 | [04-10-2018,04-13-2018) | 1 | 04-10-2018 1 | [04-13-2018,infinity) | 2 | 04-13-2018 5 | [04-12-2018,infinity) | 1 | 04-12-2018 (3 rows) SELECT customer_id, rows_in_customersorders_fact FROM test_fact.customersorders_summary_fact ORDER BY customer_id; customer_id | rows_in_customersorders_fact -------------+------------------------------ 1 | 2 5 | 1 (2 rows) pg_fact_loader-2.0.1/expected/12_no_proid.out000066400000000000000000000071331451107006500211130ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /**** This makes no sense in reality for the fact table, but we are trying to simulate the potential issue */ WITH to_update AS (SELECT qtd.queue_table_dep_id FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE ft.fact_table_relid = 'test_fact.emails_fact'::REGCLASS AND qt.queue_table_relid = 'test_audit_raw.emails_audit'::REGCLASS) UPDATE fact_loader.queue_table_deps qtd SET insert_merge_proid = NULL FROM to_update tu WHERE tu.queue_table_dep_id = qtd.queue_table_dep_id; --We have configured for this NOT to show up as a change to the fact table INSERT INTO test.emails (email_id, customer_id, read) 
VALUES (2, 6, true), (3, 7, false); --The bug would have caused this to be missed UPDATE test.emails SET read = FALSE WHERE email_id = 1; --We have configured for this NOT to show up as a change to the fact table INSERT INTO test.emails (email_id, customer_id, read) VALUES (4, 8, true), (5, 9, false); SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker() FROM generate_series(1,6); worker -------- t t t t t t (6 rows) SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; email_id | read | promo_count ----------+------+------------- 1 | f | 1 (1 row) SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker() FROM generate_series(1,6); worker -------- t t t t t t (6 rows) SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); purge_queues -------------- (1 row) SELECT COUNT(1) FROM test_audit_raw.emails_audit; count ------- 0 (1 row) /**** Now fix what we broke */ WITH to_update AS (SELECT qtd.queue_table_dep_id FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE ft.fact_table_relid = 'test_fact.emails_fact'::REGCLASS AND qt.queue_table_relid = 'test_audit_raw.emails_audit'::REGCLASS) UPDATE fact_loader.queue_table_deps qtd SET insert_merge_proid = 'test_fact.emails_fact_merge'::REGPROC FROM to_update tu WHERE tu.queue_table_dep_id = qtd.queue_table_dep_id; SELECT test_fact.emails_fact_merge(email_id) FROM test.emails; emails_fact_merge ------------------- (5 rows) SELECT test_fact.order_emails_fact_merge(customer_id) FROM test.customers c WHERE EXISTS (SELECT 1 FROM test.emails e WHERE e.customer_id = c.customer_id); order_emails_fact_merge ------------------------- (5 rows) SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; email_id | read | promo_count ----------+------+------------- 1 | f | 1 2 | t | 0 3 | f | 0 4 | t | 0 5 | f | 0 (5 
rows) SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder | num_emails | num_read ----------+-------------+------------+---------+------------+------------+---------- 1 | 1 | 04-10-2018 | 100.00 | f | 0 | 3 | 5 | 04-12-2018 | 2000.00 | f | 1 | 0 4 | 1 | 04-13-2018 | 100.00 | f | 0 | (3 rows) pg_fact_loader-2.0.1/expected/13_cutoff_no_dep_on_filter.out000066400000000000000000000020671451107006500241570ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /*** Based on our config, no actual changes will be processed based on these updates. But we still want the queue to be cleared. */ UPDATE test.customers SET customer_number = customer_number||'1'; SELECT COUNT(1) FROM test_audit_raw.customers_audit; count ------- 9 (1 row) SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker() FROM generate_series(1,6); worker -------- t t t t t t (6 rows) --Should now handle dep fact tables SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker() FROM generate_series(1,6); worker -------- t t t t t t (6 rows) SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker() FROM generate_series(1,6); worker -------- t t t t t t (6 rows) SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); purge_queues -------------- (1 row) SELECT COUNT(1) FROM test_audit_raw.customers_audit; count ------- 0 (1 row) pg_fact_loader-2.0.1/expected/14_null_key.out000066400000000000000000000031371451107006500211260ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /*** Based on our config, this should not create an ERROR but should not do anything. 
*/ INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (5, NULL, '2018-04-10', 100.00); SELECT COUNT(1) FROM test_audit_raw.orders_audit; count ------- 1 (1 row) /**** We limit everything to this 1 table because the above grossly violates our schema and will create errors on other tables. We just want to verify that this actually runs without error when processed. */ UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker() FROM generate_series(1,6); worker -------- t t t t t t (6 rows) SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+---------+------------ 1 | 1 | 04-10-2018 | 100.00 | f 3 | 5 | 04-12-2018 | 2000.00 | f 4 | 1 | 04-13-2018 | 100.00 | f (3 rows) SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); purge_queues -------------- (1 row) SELECT COUNT(1) FROM test_audit_raw.orders_audit; count ------- 1 (1 row) TRUNCATE test_audit_raw.orders_audit; UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; pg_fact_loader-2.0.1/expected/15_source_change_date.out000066400000000000000000000233271451107006500231120ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /**** This example tests not only using the queue_table timestamp to build a date-based table, but doing that in a different time zone than we might expect, just to show the functionality. 
So we are going to show a history table of customers from the perspective of the UK */ CREATE TABLE test_fact.customers_history_uktime_fact (customer_id INT, as_of_date DATERANGE, customer_number text, phone TEXT, age INT, PRIMARY KEY (customer_id, as_of_date)); CREATE FUNCTION test_fact.customers_history_uktime_fact_merge(p_customer_id INT, p_as_of_date DATE) RETURNS VOID AS $BODY$ BEGIN WITH it_really_changed AS ( SELECT customer_id, daterange(p_as_of_date, 'infinity') AS as_of_date, customer_number, phone, age FROM test.customers WHERE customer_id = p_customer_id EXCEPT SELECT customer_id, as_of_date, customer_number, phone, age FROM test_fact.customers_history_uktime_fact WHERE customer_id = p_customer_id AND upper(as_of_date) = 'infinity' ) , ended_last_fact AS (UPDATE test_fact.customers_history_uktime_fact f SET as_of_date = daterange(lower(f.as_of_date), lower(irc.as_of_date)) FROM it_really_changed irc WHERE f.customer_id = irc.customer_id AND lower(f.as_of_date) <> lower(irc.as_of_date) AND upper(f.as_of_date) = 'infinity' RETURNING *) INSERT INTO test_fact.customers_history_uktime_fact AS f (customer_id, as_of_date, customer_number, phone, age) SELECT customer_id, as_of_date, customer_number, phone, age FROM it_really_changed nes ON CONFLICT (customer_id, as_of_date) DO UPDATE SET customer_number = EXCLUDED.customer_number , phone = EXCLUDED.phone , age = EXCLUDED.age; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customers_history_uktime_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customers_history_uktime_fact WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; INSERT INTO fact_loader.fact_tables (fact_table_relid, priority) VALUES ('test_fact.customers_history_uktime_fact'::REGCLASS, 8); WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.customers_history_uktime_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.customers_history_uktime_fact_merge'::REGPROC AS update_merge_proid, 
'test_fact.customers_history_uktime_fact_delete'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test.customers'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key, pass_queue_table_change_date_at_tz) SELECT queue_table_dep_id, evts.evt, 1, '{customer_id}'::name[], true, null, null, null::name[], null::boolean, --THIS is the key of which time zone the date is seen from 'Europe/London'::TEXT FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) CROSS JOIN (VALUES ('I'),('U')) evts (evt) WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS AND queue_of_base_table_relid IN('test.customers'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 'D', 1, '{customer_id}'::name[], true, null, null, null::name[], null::boolean, null::TEXT FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS AND queue_of_base_table_relid 
IN('test.customers'::REGCLASS); SELECT test_fact.customers_history_uktime_fact_merge(customer_id, '2018-04-22'::DATE) FROM test.customers; customers_history_uktime_fact_merge ------------------------------------- (9 rows) UPDATE test.customers SET customer_number = customer_number||'a' WHERE customer_id BETWEEN 1 AND 5; UPDATE test.customers SET customer_number = customer_number||'b' WHERE customer_id BETWEEN 1 AND 5; UPDATE test.customers SET customer_number = customer_number||'c' WHERE customer_id BETWEEN 6 AND 10; UPDATE test.customers SET customer_number = customer_number||'d' WHERE customer_id BETWEEN 6 AND 10; UPDATE test.customers SET customer_number = customer_number||'e' WHERE customer_id BETWEEN 1 AND 5; /**** Now we have to mock that this actually happened on different days. */ UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-24'::DATE WHERE change ->> 'customer_number' ~ '1a$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-24'::DATE WHERE change ->> 'customer_number' ~ '1ab$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-25'::DATE WHERE change ->> 'customer_number' ~ '1c$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-26'::DATE WHERE change ->> 'customer_number' ~ '1cd$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-27'::DATE WHERE change ->> 'customer_number' ~ '1abe$'; --Ensure this one table is prioritized UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE, enabled = TRUE WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS; SELECT test.tick(); tick ------ (1 row) DO $$ BEGIN IF NOT (SELECT COUNT(1) FROM fact_loader.gathered_queued_changes((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS))) = 18 THEN RAISE EXCEPTION '%', 'No worky'; END IF; END$$; SELECT fact_loader.worker(); worker -------- t (1 row) SELECT * FROM 
test_fact.customers_history_uktime_fact ORDER BY upper(as_of_date), customer_id; customer_id | as_of_date | customer_number | phone | age -------------+-------------------------+-----------------+------------+----- 1 | [04-22-2018,04-24-2018) | cust201811 | 0001234561 | 35 2 | [04-22-2018,04-24-2018) | cust201821 | 0001234562 | 42 4 | [04-22-2018,04-24-2018) | cust201841 | 0001234564 | 35 5 | [04-22-2018,04-24-2018) | cust201851 | 0001234565 | 35 6 | [04-22-2018,04-25-2018) | cust201861 | 0001234566 | 35 7 | [04-22-2018,04-25-2018) | cust201871 | 0001234567 | 35 8 | [04-22-2018,04-25-2018) | cust201881 | 0001234568 | 35 9 | [04-22-2018,04-25-2018) | cust201891 | 0001234569 | 35 10 | [04-22-2018,04-25-2018) | cust2018101 | 0001234577 | 35 6 | [04-25-2018,04-26-2018) | cust201861cd | 0001234566 | 35 7 | [04-25-2018,04-26-2018) | cust201871cd | 0001234567 | 35 8 | [04-25-2018,04-26-2018) | cust201881cd | 0001234568 | 35 9 | [04-25-2018,04-26-2018) | cust201891cd | 0001234569 | 35 10 | [04-25-2018,04-26-2018) | cust2018101cd | 0001234577 | 35 1 | [04-24-2018,04-27-2018) | cust201811abe | 0001234561 | 35 2 | [04-24-2018,04-27-2018) | cust201821abe | 0001234562 | 42 4 | [04-24-2018,04-27-2018) | cust201841abe | 0001234564 | 35 5 | [04-24-2018,04-27-2018) | cust201851abe | 0001234565 | 35 1 | [04-27-2018,infinity) | cust201811abe | 0001234561 | 35 2 | [04-27-2018,infinity) | cust201821abe | 0001234562 | 42 4 | [04-27-2018,infinity) | cust201841abe | 0001234564 | 35 5 | [04-27-2018,infinity) | cust201851abe | 0001234565 | 35 6 | [04-26-2018,infinity) | cust201861cd | 0001234566 | 35 7 | [04-26-2018,infinity) | cust201871cd | 0001234567 | 35 8 | [04-26-2018,infinity) | cust201881cd | 0001234568 | 35 9 | [04-26-2018,infinity) | cust201891cd | 0001234569 | 35 10 | [04-26-2018,infinity) | cust2018101cd | 0001234577 | 35 (27 rows) --Let's verify the current records are the same as the actual table SELECT customer_id, customer_number, phone, age FROM test.customers INTERSECT 
SELECT customer_id, customer_number, phone, age FROM test_fact.customers_history_uktime_fact WHERE upper(as_of_date) = 'infinity' ORDER BY customer_id; customer_id | customer_number | phone | age -------------+-----------------+------------+----- 1 | cust201811abe | 0001234561 | 35 2 | cust201821abe | 0001234562 | 42 4 | cust201841abe | 0001234564 | 35 5 | cust201851abe | 0001234565 | 35 6 | cust201861cd | 0001234566 | 35 7 | cust201871cd | 0001234567 | 35 8 | cust201881cd | 0001234568 | 35 9 | cust201891cd | 0001234569 | 35 10 | cust2018101cd | 0001234577 | 35 (9 rows) UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS; pg_fact_loader-2.0.1/expected/16_1_2_features.out000066400000000000000000000627641451107006500216000ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; CREATE TABLE test_fact.orders_fact_chain (LIKE test_fact.orders_fact); /**** First make a bad function def to test exception handling */ CREATE FUNCTION test_fact.orders_fact_chain_merge(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN INSERT INTO test_fact.orders_fact_chain SELECT * FROM test_fact.orders_fact WHERE p_order_id = NULL::JSONB; END; $BODY$ LANGUAGE plpgsql; INSERT INTO fact_loader.fact_tables (fact_table_relid, priority) VALUES ('test_fact.orders_fact_chain'::REGCLASS, 9); /**** This example will use a local fact table as a queue table */ CREATE SCHEMA IF NOT EXISTS test_fact_audit_raw; CREATE TABLE test_fact_audit_raw.orders_fact_audit ( orders_fact_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp with time zone NOT NULL, --NOTE THE TIMESTAMPTZ operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_fact_audit_raw"."audit_test_fact_orders_fact"() RETURNS TRIGGER AS $$ DECLARE value_row 
HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_fact_audit_raw.orders_fact_audit_orders_fact_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? TG_ARGV[0] THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test_fact.orders_fact FOR EACH ROW EXECUTE PROCEDURE "test_fact_audit_raw"."audit_test_fact_orders_fact" ('order_id'); --Note that we DO NOT insert a pglogical_node_if_id - because this queue table is local INSERT INTO fact_loader.queue_tables (queue_table_relid, queue_of_base_table_relid, queue_table_tz) SELECT st.relid::REGCLASS, sts.relid::REGCLASS, NULL FROM (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) st INNER JOIN (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) sts ON sts.schemaname||'_audit_raw' = st.schemaname AND sts.relname||'_audit' = st.relname WHERE st.schemaname = 'test_fact_audit_raw'; SELECT fact_loader.add_batch_id_fields(); add_batch_id_fields --------------------- (1 row) WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.orders_fact_chain_merge'::REGPROC AS insert_merge_proid, 'test_fact.orders_fact_chain_merge'::REGPROC AS update_merge_proid, 'test_fact.orders_fact_chain_merge'::REGPROC AS delete_merge_proid FROM 
fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test_fact.orders_fact'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key) SELECT queue_table_dep_id, NULL, 1, '{order_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS AND queue_of_base_table_relid IN('test_fact.orders_fact'::REGCLASS); --Force orders_fact update UPDATE test.orders SET total = 2010.00 WHERE order_id = 3; UPDATE fact_loader.fact_tables SET enabled = (fact_table_relid = 'test_fact.orders_fact'::REGCLASS); SELECT test.tick(); tick ------ (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+---------+------------ 1 | 1 | 04-10-2018 | 100.00 | f 3 | 5 | 04-12-2018 | 2010.00 | f 4 | 1 | 04-13-2018 | 100.00 | f (3 rows) --Don't tick - because this table is LOCAL and should update regardless of ticker. 
UPDATE fact_loader.fact_tables SET enabled = FALSE; UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE, enabled = TRUE WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS; --This should also return false in case of error SELECT fact_loader.worker(); worker -------- f (1 row) --We should see an error now SELECT fact_table_id, fact_table_relid, CASE WHEN current_setting('server_version_num')::INT >= 110000 THEN REPLACE(messages::TEXT, 'types', 'type(s)')::JSONB ELSE messages END FROM fact_loader.unresolved_failures; fact_table_id | fact_table_relid | messages ---------------+-----------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 9 | test_fact.orders_fact_chain | {"Hint": "No operator matches the given name and argument type(s). 
You might need to add explicit type casts.", "Context": "PL/pgSQL function test_fact.orders_fact_chain_merge(integer) line 4 at SQL statement\nSQL statement \"\nSELECT process_queue_id, test_fact.orders_fact_chain_merge(key_value::integer)\nFROM (\n/****\nMust wrap this to execute in order of ids\n***/\nSELECT *\nFROM process_queue\nWHERE process_queue_id BETWEEN 1 AND 1\n AND fact_table_id = 9\n AND proid = 'test_fact.orders_fact_chain_merge'::REGPROC\nORDER BY process_queue_id) q;\n\"\nPL/pgSQL function fact_loader.load(integer) line 56 at EXECUTE\nSQL statement \"SELECT fact_loader.load(p_fact_table_id)\"\nPL/pgSQL function fact_loader.try_load(integer) line 40 at PERFORM\nPL/pgSQL function fact_loader.worker() line 16 at IF", "Message": "operator does not exist: integer = jsonb"} (1 row) --No data SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact_chain ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+-------+------------ (0 rows) --Let's fix the function def and re-run CREATE OR REPLACE FUNCTION test_fact.orders_fact_chain_merge(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN INSERT INTO test_fact.orders_fact_chain SELECT * FROM test_fact.orders_fact WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; --Now re-enable and re-run UPDATE fact_loader.fact_tables SET enabled = TRUE WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS; SELECT fact_loader.worker(); worker -------- t (1 row) --We should see nothing here now SELECT fact_table_id, fact_table_relid, messages FROM fact_loader.unresolved_failures; fact_table_id | fact_table_relid | messages ---------------+------------------+---------- (0 rows) --1 row SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact_chain ORDER BY order_id; order_id | customer_id | order_date | total | is_reorder ----------+-------------+------------+---------+------------ 3 | 5 | 
04-12-2018 | 2010.00 | f (1 row) --This is NOT a new feature but a new test coverage - testing concurrency. \! psql contrib_regression -c 'BEGIN; SELECT fact_loader.worker() INTO try1; SELECT pg_sleep(2); COMMIT;' & SELECT pg_sleep(1); NOTICE: table "process_queue" does not exist, skipping pg_sleep ---------- (1 row) \! psql contrib_regression -c ' SELECT fact_loader.worker() INTO try2;' SELECT 1 SELECT pg_sleep(4); COMMIT pg_sleep ---------- (1 row) SELECT * FROM try1; worker -------- t (1 row) SELECT * FROM try2; worker -------- f (1 row) --Daily schedule test - with range format suggestions included!!! --This kind of table should have a gist exclusion index for the daterange but we won't do it in the test CREATE TABLE test_fact.daily_customers_fact (LIKE test_fact.customers_fact); ALTER TABLE test_fact.daily_customers_fact ADD COLUMN as_of_date daterange; ALTER TABLE test_fact.daily_customers_fact ADD PRIMARY KEY (customer_id, as_of_date); CREATE FUNCTION test_fact.daily_customers_fact_merge() RETURNS VOID AS $BODY$ BEGIN DROP TABLE IF EXISTS changes; CREATE TEMP TABLE changes AS SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact EXCEPT SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.daily_customers_fact WHERE upper(as_of_date) = 'infinity'; UPDATE test_fact.daily_customers_fact SET as_of_date = daterange(lower(as_of_date), current_date) WHERE customer_id IN (SELECT customer_id FROM changes) AND upper(as_of_date) = 'infinity'; INSERT INTO test_fact.daily_customers_fact (as_of_date, customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids) SELECT daterange(current_date,'infinity') AS as_of_date, customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM changes ON CONFLICT (customer_id, as_of_date) DO UPDATE SET phone = EXCLUDED.phone, age = EXCLUDED.age, last_order_id = 
EXCLUDED.last_order_id, order_product_count = EXCLUDED.order_product_count, order_product_promo_ids = EXCLUDED.order_product_promo_ids; END; $BODY$ LANGUAGE plpgsql; UPDATE fact_loader.fact_tables SET enabled = FALSE; BEGIN; --Keep the same transaction time to make these tests possible INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid) VALUES ('test_fact.daily_customers_fact', TRUE, 10, TRUE, now() + interval '1 second', 'America/Chicago', 'test_fact.daily_customers_fact_merge'::REGPROC); UPDATE fact_loader.fact_tables SET enabled = TRUE WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; /***** Dependent scheduled job - as of 1.3 ****/ CREATE TABLE silly (id int); CREATE FUNCTION itran() RETURNS VOID AS $BODY$ BEGIN INSERT INTO silly VALUES (1); END; $BODY$ LANGUAGE plpgsql; CREATE TABLE willy (id int); CREATE FUNCTION itrantoo() RETURNS VOID AS $BODY$ BEGIN INSERT INTO willy VALUES (1); END; $BODY$ LANGUAGE plpgsql; CREATE TABLE nilly (id int); CREATE FUNCTION itrantootoo() RETURNS VOID AS $BODY$ BEGIN INSERT INTO nilly VALUES (1); END; $BODY$ LANGUAGE plpgsql; INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid, depends_on_base_daily_job_id, depends_on_parent_daily_job_id) VALUES ('silly', TRUE, 11, TRUE, NULL, NULL, 'itran'::REGPROC, (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS)); INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid, depends_on_base_daily_job_id, depends_on_parent_daily_job_id) VALUES ('willy', TRUE, 12, TRUE, NULL, NULL, 
'itrantoo'::REGPROC, (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'silly'::REGCLASS)); INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid, depends_on_base_daily_job_id, depends_on_parent_daily_job_id) VALUES ('nilly', TRUE, 13, TRUE, NULL, NULL, 'itrantootoo'::REGPROC, (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'willy'::REGCLASS)); --BELOW we will try to run it only after our first one did successfully. --Should not show the daily job because we set the daily schedule ahead in time SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- (0 rows) UPDATE fact_loader.fact_tables SET daily_scheduled_time = now() - interval '1 second' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Now it should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 10 (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) --We have to mock out the date so it appears the same any day we run this test SELECT daterange('2018-04-15'::DATE + (lower(as_of_date) - current_date),upper(as_of_date)), customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.daily_customers_fact ORDER BY customer_id, as_of_date; daterange | customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -----------------------+-------------+------------+-----+---------------+---------------------+------------------------- [04-15-2018,infinity) | 1 | 0001234561 | 35 | 4 | 3 | [04-15-2018,infinity) | 2 | 0001234562 | 42 | | 0 | 
[04-15-2018,infinity) | 4 | 0001234564 | 35 | | 0 | [04-15-2018,infinity) | 5 | 0001234565 | 35 | 3 | 0 | [04-15-2018,infinity) | 6 | 0001234566 | 35 | | 0 | [04-15-2018,infinity) | 7 | 0001234567 | 35 | | 0 | [04-15-2018,infinity) | 8 | 0001234568 | 35 | | 0 | [04-15-2018,infinity) | 9 | 0001234569 | 35 | | 0 | [04-15-2018,infinity) | 10 | 0001234577 | 35 | | 0 | (9 rows) --Pretend we ran this yesterday UPDATE test_fact.daily_customers_fact SET as_of_date = daterange(lower(as_of_date) - 1,'infinity'); --Job should not show because it just ran - but if it has dependent job it should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 11 (1 row) --Pretend it ran yesterday UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = last_refresh_attempted_at - interval '1 day' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Job should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 10 (1 row) --Change something silly UPDATE test_fact.customers_fact SET phone = NULL WHERE customer_id = 10; SELECT fact_loader.worker(); worker -------- t (1 row) --Job should not show because it just ran - but if it has dependent job it should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 11 (1 row) --This should run the dependent job SELECT fact_loader.worker(); worker -------- t (1 row) TABLE silly; id ---- 1 (1 row) TABLE willy; id ---- (0 rows) --Now 2nd level dep should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 12 (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) TABLE willy; id ---- 1 (1 row) TABLE nilly; id ---- (0 rows) --Now 3rd level dep should show --Just check if enabling regular jobs is ok UPDATE fact_loader.fact_tables SET enabled = true WHERE fact_table_id IN(1,2); SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 13 1 2 (3 
rows) SELECT fact_loader.worker(); worker -------- t (1 row) TABLE nilly; id ---- 1 (1 row) UPDATE fact_loader.fact_tables SET enabled = false WHERE fact_table_id IN(1,2); -- Need to test the next day's run when last_refresh_attempted_at is not null UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = last_refresh_attempted_at - interval '1 day' WHERE use_daily_schedule; SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 11 (1 row) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 12 (1 row) TABLE silly; id ---- 1 1 (2 rows) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 13 (1 row) TABLE willy; id ---- 1 1 (2 rows) SELECT fact_loader.worker(); worker -------- t (1 row) SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- (0 rows) TABLE nilly; id ---- 1 1 (2 rows) --We should see one changed range --We have to mock out the date so it appears the same any day we run this test SELECT daterange('2018-04-15'::DATE + (lower(as_of_date) - current_date), CASE WHEN upper(as_of_date) = 'infinity' THEN 'infinity' ELSE '2018-04-15'::DATE + (upper(as_of_date) - current_date) END), customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.daily_customers_fact ORDER BY customer_id, as_of_date; daterange | customer_id | phone | age | last_order_id | order_product_count | order_product_promo_ids -------------------------+-------------+------------+-----+---------------+---------------------+------------------------- [04-14-2018,infinity) | 1 | 0001234561 | 35 | 4 | 3 | [04-14-2018,infinity) | 2 | 0001234562 | 42 | | 0 | [04-14-2018,infinity) | 4 | 0001234564 | 35 | | 0 | [04-14-2018,infinity) | 5 | 0001234565 | 35 | 3 | 0 | 
[04-14-2018,infinity) | 6 | 0001234566 | 35 | | 0 | [04-14-2018,infinity) | 7 | 0001234567 | 35 | | 0 | [04-14-2018,infinity) | 8 | 0001234568 | 35 | | 0 | [04-14-2018,infinity) | 9 | 0001234569 | 35 | | 0 | [04-14-2018,04-15-2018) | 10 | 0001234577 | 35 | | 0 | [04-15-2018,infinity) | 10 | | 35 | | 0 | (10 rows) --Verify it still shows if we simulate a job failure UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = now(), last_refresh_succeeded = FALSE WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 10 (1 row) --Here it should not show - if we mark that it did succeed UPDATE fact_loader.fact_tables SET last_refresh_succeeded = TRUE WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- (0 rows) /*** TEST ADDING DEPS TO SCHEDULED JOBS ***/ --AGAIN Pretend it ran yesterday UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = last_refresh_attempted_at - interval '1 day' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Job should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 10 (1 row) --Change something silly UPDATE test_fact.customers_fact SET phone = NULL WHERE customer_id = 10; --Now add deps that are not met UPDATE fact_loader.fact_tables SET daily_scheduled_deps = ARRAY['test.customers'::REGCLASS,'test.orders'::REGCLASS, 'test_fact.customers_fact'::REGCLASS], daily_scheduled_dep_delay_tolerance = '1 millisecond' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Should fail because no way they have been updated 1 millisecond ago SELECT fact_loader.worker(); worker -------- f (1 row) --We fail jobs that don't meet deps because as configured, it should be an exceptional occurrence and we want to raise an alarm. 
Should show an error message containing "Delayed" lingo SELECT fact_table_id FROM fact_loader.unresolved_failures WHERE messages ->> 'Message' LIKE '%Delayed%'; fact_table_id --------------- 10 (1 row) --Now make the tolerance such that we know the deps are met UPDATE fact_loader.fact_tables SET enabled = TRUE, daily_scheduled_deps = ARRAY['test.customers'::REGCLASS,'test.orders'::REGCLASS, 'test_fact.customers_fact'::REGCLASS], daily_scheduled_dep_delay_tolerance = '1 minute' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Shows up again SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- 10 (1 row) --Succeeds SELECT fact_loader.worker(); worker -------- t (1 row) --Does not show now SELECT fact_table_id FROM fact_loader.prioritized_jobs; fact_table_id --------------- (0 rows) ROLLBACK; pg_fact_loader-2.0.1/expected/17_1_3_features.out000066400000000000000000000034001451107006500215600ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; -- These 2 calls replace legacy tests for background worker launches (functionality now removed) SELECT fact_loader.worker(); worker -------- f (1 row) SELECT fact_loader.worker(); worker -------- f (1 row) INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES ((SELECT MAX(order_id)+1 FROM test.orders) ,5, '2018-07-27', 2500.00); SELECT test.tick(); tick ------ (1 row) -- For some reason queue_table_id seems indeterminate so don't show it DO $$ BEGIN IF NOT (SELECT COUNT(1) FROM fact_loader.raw_queued_changes(1)) = 24 OR NOT (SELECT COUNT(1) FROM fact_loader.gathered_queued_changes(1)) = 1 THEN RAISE EXCEPTION '%', 'No worky'; END IF; END$$; --Count could be different if we are doing FROMVERSION=1.2 or lower but should be at least 50 (actually should be 66 for 1.2 and 76 for 1.3) SELECT COUNT(1) > 50 AS got_enough_logs FROM 
fact_loader.fact_table_refresh_logs; got_enough_logs ----------------- t (1 row) --Test the auto-pruning BEGIN; UPDATE fact_loader.fact_table_refresh_logs SET refresh_attempted_at = refresh_attempted_at - interval '1 year' WHERE messages IS NULL; INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_refresh_log_id) VALUES (1000); SELECT COUNT(1) FROM fact_loader.fact_table_refresh_logs; count ------- 2 (1 row) ROLLBACK; --Test support for extension without deps (older tests for version 1.2 are removed as no longer relevant) BEGIN; DROP EXTENSION pg_fact_loader CASCADE; DROP EXTENSION IF EXISTS pglogical_ticker CASCADE; DROP EXTENSION IF EXISTS pglogical CASCADE; CREATE EXTENSION pg_fact_loader; DROP EXTENSION pg_fact_loader; ROLLBACK; pg_fact_loader-2.0.1/functions/000077500000000000000000000000001451107006500164525ustar00rootroot00000000000000pg_fact_loader-2.0.1/functions/add_batch_id_fields.sql000066400000000000000000000015421451107006500230700ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.add_batch_id_fields() RETURNS VOID AS $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT queue_table_relid FROM fact_loader.queue_tables qt INNER JOIN pg_class c ON c.oid = qt.queue_table_relid INNER JOIN pg_namespace n ON n.oid = c.relnamespace WHERE NOT EXISTS (SELECT 1 FROM information_schema.columns col WHERE col.column_name = 'fact_loader_batch_id' AND col.table_schema = n.nspname AND col.table_name = c.relname) LOOP v_sql = format($F$ ALTER TABLE %s ADD COLUMN fact_loader_batch_id BIGINT DEFAULT nextval('fact_loader.batch_id'); $F$, v_rec.queue_table_relid::text, v_rec.queue_table_relid::text); RAISE LOG 'ADDING fact_loader_batch_id COLUMN TO queue table %: %', v_rec.queue_table_relid::text, v_sql; EXECUTE v_sql; END LOOP; END $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/create_table_loader_function.sql000066400000000000000000000014551451107006500250450ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION 
fact_loader.create_table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS REGPROC AS $BODY$ DECLARE v_new_proc TEXT; v_sql TEXT; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT function_name, function_sql INTO v_new_proc, v_sql FROM fact_loader.table_loader_function(p_source_proc, p_destination_relation, p_ignore_diff_for_columns); EXECUTE v_sql; RETURN v_new_proc::REGPROC; END; $BODY$ LANGUAGE plpgsql;pg_fact_loader-2.0.1/functions/daily_scheduled_load.sql000066400000000000000000000032771451107006500233250ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; v_deps regclass[]; v_dep_delay_tolerance interval; v_delayed_msg text; BEGIN /*** There are 3 basic steps to this load: 1. If dependencies are listed, verify they are up to date enough 2. Execute the single daily-refresh function 3. 
Update the metadata indicating the last attempt time */ SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()', daily_scheduled_deps, daily_scheduled_dep_delay_tolerance INTO v_execute_sql, v_deps, v_dep_delay_tolerance FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id AND use_daily_schedule; IF v_execute_sql IS NULL THEN RETURN FALSE; END IF; IF v_deps IS NOT NULL THEN WITH deps AS (SELECT unnest(v_deps) AS dep) , delays AS ( SELECT dep, now() - source_time as delay_interval FROM fact_loader.queue_table_delay_info() qtd INNER JOIN deps d ON d.dep = qtd.queue_of_base_table_relid UNION ALL SELECT dep, now() - last_refresh_source_cutoff as delay_interval FROM fact_loader.fact_tables ft INNER JOIN deps d ON d.dep = ft.fact_table_relid ) SELECT string_agg(dep::text||': Delayed '||delay_interval::text, ', ') INTO v_delayed_msg FROM delays WHERE delay_interval > v_dep_delay_tolerance; IF v_delayed_msg IS NOT NULL THEN RAISE EXCEPTION '%', v_delayed_msg; END IF; END IF; EXECUTE v_execute_sql; UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = p_fact_table_id; RETURN TRUE; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/execute_queue.sql000066400000000000000000000032431451107006500220430ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second. 
*/ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/execute_table_loader.sql000066400000000000000000000025751451107006500233430ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.execute_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is the actual load to the destination table, and assumes that 'prepare' phase has already been run, which is supposed to have gathered the actual minimal delta and determine what to do here. 
*/ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT execute_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql;pg_fact_loader-2.0.1/functions/fact_table_refresh_logs_pruner.sql000066400000000000000000000022241451107006500254140ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.fact_table_refresh_logs_pruner() RETURNS trigger LANGUAGE plpgsql AS $$ declare step int := 1000; -- step should equal the firing frequency in trigger definition overdrive int := 2; -- overdrive times step = max rows (see below) max_rows int := step * overdrive; rows int; begin delete from fact_loader.fact_table_refresh_logs where fact_table_refresh_log_id in ( select fact_table_refresh_log_id from fact_loader.fact_table_refresh_logs where refresh_attempted_at < now() - '90 days'::interval -- do not do the literal interval value above as a declare parameter order by fact_table_refresh_log_id limit max_rows for update skip locked ); get diagnostics rows = row_count; return null; end $$; CREATE TRIGGER fact_table_refresh_logs_pruner AFTER INSERT ON fact_loader.fact_table_refresh_logs FOR EACH ROW WHEN ((new.fact_table_refresh_log_id % 1000::bigint) = 0) EXECUTE PROCEDURE fact_loader.fact_table_refresh_logs_pruner(); pg_fact_loader-2.0.1/functions/gathered_queued_changes.sql000066400000000000000000000006261451107006500240220ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.gathered_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, proid REGPROC, 
key_value TEXT, source_change_date DATE) AS $BODY$ DECLARE v_gather_sql text; BEGIN SELECT gathered_queued_changes_sql INTO v_gather_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_gather_sql; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/load.sql000066400000000000000000000057071451107006500201230ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; v_pre_execute_hook_sql text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** Pre-execute hook */ SELECT pre_execute_hook_sql INTO v_pre_execute_hook_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id; EXECUTE COALESCE(v_pre_execute_hook_sql, $$SELECT 'No pre-execute hook.' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date) -- the row timestamps are not populated, so we set them here SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. */ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/logical_subscription.sql000066400000000000000000000017461451107006500234210ustar00rootroot00000000000000/*** This function exists mostly to easily mock out for testing purposes. 
*/ CREATE FUNCTION fact_loader.logical_subscription() RETURNS TABLE (subid OID, subpublications text[], subconninfo text, dbname text, driver fact_loader.driver) AS $BODY$ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN RETURN QUERY EXECUTE $$ SELECT sub_origin_if AS subid, sub_replication_sets AS subpublications, null::text AS subconninfo, null::text AS dbname, 'pglogical'::fact_loader.driver AS driver FROM pglogical.subscription UNION ALL SELECT oid, subpublications, subconninfo, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription(); $$; ELSE RETURN QUERY SELECT oid, subpublications, subconninfo, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription(); END IF; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/prepare_table_loader.sql000066400000000000000000000026101451107006500233250ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.prepare_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is not going to lock any of the destination table for writing, which is precisely why it is separated from the 'execute' phase which actually writes to the table in the shortest transaction possible. 
*/ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT prepare_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql;pg_fact_loader-2.0.1/functions/purge_queues.sql000066400000000000000000000046331451107006500217120ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. */ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. 
*/ qt.queue_table_relid , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL AND qt.purge /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s IN (SELECT %s FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s) FOR UPDATE SKIP LOCKED ); $$, queue_table_relid, 'fact_loader_batch_id', 'fact_loader_batch_id', queue_table_relid, 'fact_loader_batch_id', min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE DEBUG 'Purging Queue: %', v_sql; BEGIN EXECUTE v_sql; EXCEPTION WHEN serialization_failure THEN RAISE LOG 'Serialization failure in queue purging for transaction % - skipping.', txid_current()::text; WHEN OTHERS THEN RAISE; END; END IF; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/queue_table_delay_info.sql000066400000000000000000000057461451107006500236730ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("publication_name" text, "queue_of_base_table_relid" regclass, "publisher" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. 
If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ -- pglogical SELECT unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , n.if_name AS publisher , t.source_time FROM fact_loader.queue_tables qt JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.subid AND s.driver = 'pglogical' JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name UNION ALL -- native logical SELECT unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , t.db AS publisher , t.tick_time AS source_time FROM fact_loader.queue_tables qt JOIN fact_loader.subscription_rel() psr ON psr.srrelid = qt.queue_table_relid JOIN fact_loader.logical_subscription() s ON psr.srsubid = s.subid JOIN logical_ticker.tick t ON t.db = s.dbname UNION ALL -- local SELECT NULL::text AS publication_name , qt.queue_of_base_table_relid , NULL::name AS publisher , now() AS source_time FROM fact_loader.queue_tables qt WHERE qt.pglogical_node_if_id IS NULL AND NOT EXISTS ( SELECT 1 FROM fact_loader.subscription_rel() psr WHERE psr.srrelid = qt.queue_table_relid );$$; ELSE RETURN QUERY -- local SELECT NULL::TEXT AS publication_name , qt.queue_of_base_table_relid , NULL::NAME AS publisher --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt WHERE NOT EXISTS (SELECT 1 FROM fact_loader.subscription_rel() psr WHERE psr.srrelid = qt.queue_table_relid) UNION ALL -- native logical (WITH logical_subscription_with_db AS ( SELECT *, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS db FROM fact_loader.logical_subscription() ) SELECT 
unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , t.db AS publisher , t.tick_time AS source_time FROM fact_loader.queue_tables qt JOIN fact_loader.subscription_rel() psr ON psr.srrelid = qt.queue_table_relid JOIN logical_subscription_with_db s ON psr.srsubid = s.subid JOIN logical_ticker.tick t ON t.db = s.db); END IF; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/raw_queued_changes.sql000066400000000000000000000011171451107006500230240ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, queue_table_dep_id INT, fact_table_dep_id INT, fact_table_dep_queue_table_dep_id INT, queue_table_id_field BIGINT, fact_loader_batch_id BIGINT, maximum_cutoff_time TIMESTAMPTZ, min_missed_id BIGINT, queue_table_id INT ) AS $BODY$ DECLARE v_raw_sql text; BEGIN SELECT raw_queued_changes_sql INTO v_raw_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_raw_sql; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/refresh_fact_table_dep_queue_table_deps.sql000066400000000000000000000067751451107006500272320ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. 
*/ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist Add not exists because we think allowing all records to insert and conflict could be cause of serialization errors in repeatable read isolation. 
*/ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps new WHERE NOT EXISTS (SELECT 1 FROM fact_loader.fact_table_dep_queue_table_deps existing WHERE existing.fact_table_dep_id = new.fact_table_dep_id AND existing.queue_table_dep_id = new.queue_table_dep_id) ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/sql_builder.sql000066400000000000000000000574551451107006500215200ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. */ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. 
The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. */ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END 
AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN 
queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM 
queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS 
queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER ( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, 
%s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. 
-- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ -- changed_at is guaranteed now to be in timestamptz - any time zone casting is only in subquery changed_at < %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_literal(c.maximum_cutoff_time) ) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, 
nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, 
RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables 
for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE 
uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; pg_fact_loader-2.0.1/functions/subscription.sql000066400000000000000000000004771451107006500217270ustar00rootroot00000000000000/*** This function exists mostly to easily mock out for testing purposes. */ CREATE FUNCTION fact_loader.subscription() RETURNS TABLE (oid OID, subpublications text[], subconninfo text) AS $BODY$ BEGIN RETURN QUERY SELECT s.oid, s.subpublications, s.subconninfo FROM pg_subscription s; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/subscription_rel.sql000066400000000000000000000004371451107006500225650ustar00rootroot00000000000000/*** This function exists mostly to easily mock out for testing purposes. */ CREATE FUNCTION fact_loader.subscription_rel() RETURNS TABLE (srsubid OID, srrelid OID) AS $BODY$ BEGIN RETURN QUERY SELECT sr.srsubid, sr.srrelid FROM pg_subscription_rel sr; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/table_loader.sql000066400000000000000000000226471451107006500216230ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type) RETURNS TABLE (prepare_sql text, execute_sql text, unmapped_src_columns text[], unmapped_dest_columns text[]) AS $BODY$ DECLARE v_pkey_fields TEXT[]; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. 
*/ SELECT array_agg(a.attname ORDER BY pk.rn) INTO v_pkey_fields FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik; RETURN QUERY WITH source_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_source_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , destination_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , unmapped_source_columns AS ( SELECT array_agg(s.column_name::text) AS unmapped_columns_src FROM source_columns s WHERE NOT EXISTS (SELECT 1 FROM destination_columns d WHERE d.column_name = s.column_name) ) , unmapped_dest_columns AS ( SELECT array_agg(d.column_name::text) AS unmapped_columns_dest FROM destination_columns d WHERE NOT EXISTS (SELECT 1 FROM source_columns s WHERE d.column_name = s.column_name) ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT unnest AS pkey_field FROM unnest(v_pkey_fields)) pk ) , info AS ( SELECT string_agg( CASE WHEN sc.column_name IS NOT NULL THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN sc.column_name IS NOT NULL AND (p_ignore_diff_for_columns IS NULL OR 
sc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN sc.column_name IS NOT NULL AND NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_join FROM destination_columns dc CROSS JOIN pkeys LEFT JOIN source_columns sc ON dc.column_name = sc.column_name GROUP BY pkeys.pkey_fields, pkeys.pkey_join ) , sql_snippets AS ( SELECT $$ DROP TABLE IF EXISTS count_tracker; CREATE TEMP TABLE count_tracker (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)); INSERT INTO count_tracker VALUES (NULL, NULL, FALSE, NULL); $$::TEXT AS count_tracker_sql , $$ DROP TABLE IF EXISTS actual_delta; CREATE TEMP TABLE actual_delta AS WITH final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_source_relation::TEXT||$$ EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ DROP TABLE IF EXISTS removed_keys; CREATE TEMP TABLE removed_keys AS SELECT $$||pkey_fields||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE NOT EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$); $$ AS removed_keys_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM $$||p_source_relation::TEXT||$$ s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ); $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. 
***/ WHERE EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql , $$ /*** We add a primary key to the actual_delta table to ensure there are no duplicate keys. ***/ ALTER TABLE actual_delta ADD PRIMARY KEY ($$||pkey_fields||$$); $$ AS add_delta_pkey_sql , $$ /**** This part is not implemented yet, but partially complete. If we decide we want to figure out that >50% of the table will be updated, we could decide to truncate. But then we have to balance the desire for that with more read queries to figure it out. To implement, add the type full_refresh_truncate to fact_loader.table_load_type, and uncomment code. We would also have to add the logic to find actual keys added, then subtract it from actual_delta to get the net updates expected. If this is over 50%, we should truncate and re-insert all data. ***/ DROP TABLE IF EXISTS percent_of_destination; CREATE TEMP TABLE percent_of_destination AS SELECT (((SELECT COUNT(1) FROM actual_delta) - (SELECT COUNT(1) FROM added_keys))::NUMERIC / (SELECT COUNT(1) FROM $$||p_destination_relation::TEXT||$$)::NUMERIC)::NUMERIC(8,2) AS pct; UPDATE count_tracker SET pct_dest = (SELECT pct FROM percent_of_destination); $$ AS percent_change_sql ,$$ DO $LOCK_SAFE_DDL$ BEGIN SET lock_timeout TO '10ms'; IF (SELECT pct FROM percent_of_destination) >= 0.5 THEN LOOP BEGIN TRUNCATE $$||p_destination_relation::TEXT||$$; UPDATE count_tracker SET truncated = true; EXIT; EXCEPTION WHEN lock_not_available THEN RAISE WARNING 'Could not obtain immediate lock for SQL %, retrying', p_sql; PERFORM pg_sleep(3); WHEN OTHERS THEN RAISE; END; END LOOP; END IF; RESET lock_timeout; END $LOCK_SAFE_DDL$ ; $$ AS lock_safe_truncate_sql ,$$ --Delete keys that are no longer in your new version DELETE FROM $$||p_destination_relation::TEXT||$$ d WHERE EXISTS (SELECT 1 FROM removed_keys s WHERE $$||pkey_join||$$); GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET deleted = v_row_count; $$ AS 
delete_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET upserted = v_row_count; $$ AS upsert_sql FROM info ) SELECT count_tracker_sql|| CASE /*** not implemented truncate pattern WHEN p_load_type IN('full_refresh','full_refresh_truncate') THEN ***/ WHEN p_load_type = 'full_refresh' THEN removed_keys_sql||actual_delta_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ WHEN p_load_type = 'delta' THEN actual_delta_sql||key_join_exists_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ END||$$ $$|| /*** not implemented truncate pattern CASE WHEN p_load_type = 'full_refresh_truncate' THEN percent_change_sql ELSE '' END ***/ '' AS prepare_sql , $$ --THIS SHOULD BE RUN IN A TRANSACTION DO $SCRIPT$ DECLARE v_row_count INT; v_results RECORD; BEGIN $$|| CASE /*** not implemented truncate pattern WHEN p_load_type = 'full_refresh_truncate' THEN lock_safe_truncate_sql||delete_sql||upsert_sql ***/ WHEN p_load_type = 'full_refresh' THEN delete_sql||upsert_sql WHEN p_load_type = 'delta' THEN upsert_sql END||$$ FOR v_results IN SELECT * FROM count_tracker LOOP RAISE LOG 'upserted: %, deleted: %, truncated: %, pct_dest: %', v_results.upserted, v_results.deleted, v_results.truncated, v_results.pct_dest; END LOOP; END $SCRIPT$; $$ AS execute_sql , (SELECT unmapped_columns_src FROM unmapped_source_columns) AS unmapped_src_columns , (SELECT unmapped_columns_dest FROM unmapped_dest_columns) AS unmapped_dest_columns FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql;pg_fact_loader-2.0.1/functions/table_loader_function.sql000066400000000000000000000131161451107006500235170ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns 
TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join, string_agg($$d.$$||quote_ident(pkey_field)||$$ = $$||(SELECT arg_params FROM function_schema),E'\nAND ') AS pkey_join_to_arg FROM (SELECT attname AS 
pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs , pkey_join_to_arg FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs, pkey_join_to_arg ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE $$||pkey_join_to_arg AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only 
looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql;pg_fact_loader-2.0.1/functions/table_loader_validator.sql000066400000000000000000000027371451107006500236660ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.table_loader_validator (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_unmapped_src_columns TEXT[], p_unmapped_dest_columns TEXT[], p_ignore_unmapped_columns BOOLEAN) RETURNS VOID AS $BODY$ DECLARE v_messages TEXT = ''; BEGIN IF NOT p_ignore_unmapped_columns AND p_unmapped_src_columns IS NOT NULL THEN v_messages = format($$You have unmapped columns (%s) in the source table %s. All source columns must be named identically to destination in order to map. If you are certain you want to ignore these columns, meaning they will not update anything in destination table %s, add the final argument to this function as TRUE. $$ , array_to_string(p_unmapped_src_columns,', ') , p_source_relation::TEXT , p_destination_relation::TEXT); END IF; IF NOT p_ignore_unmapped_columns AND p_unmapped_dest_columns IS NOT NULL THEN v_messages = v_messages||format($$ You have unmapped columns (%s) in the destination table %s. All destination columns must be named identically to source in order to map. 
If you are certain you want to ignore these columns, meaning the source table %s does not contain all columns in destination table, add the final argument to this function as TRUE.$$ , array_to_string(p_unmapped_dest_columns,', ') , p_destination_relation::TEXT , p_source_relation::TEXT); END IF; IF v_messages <> '' THEN RAISE EXCEPTION '%', v_messages; END IF; END; $BODY$ LANGUAGE plpgsql;pg_fact_loader-2.0.1/functions/try_load.sql000066400000000000000000000055141451107006500210150ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; v_err JSONB; v_errmsg TEXT; v_errdetail TEXT; v_errhint TEXT; v_errcontext TEXT; BEGIN -- We except rare serialization failures here which we will ignore and move to the next record -- Anything else should be raised BEGIN IF EXISTS (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id FOR UPDATE SKIP LOCKED) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; --Load fact table and handle exceptions to auto-disable job and log errors in case of error BEGIN --Scheduled daily job IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id); --Queue-based job ELSE PERFORM fact_loader.load(p_fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. 
It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN GET STACKED DIAGNOSTICS v_errmsg = MESSAGE_TEXT, v_errdetail = PG_EXCEPTION_DETAIL, v_errhint = PG_EXCEPTION_HINT, v_errcontext = PG_EXCEPTION_CONTEXT; UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id; v_err = jsonb_strip_nulls( jsonb_build_object( 'Message', v_errmsg, 'Detail', case when v_errdetail = '' then null else v_errdetail end, 'Hint', case when v_errhint = '' then null else v_errhint end, 'Context', case when v_errcontext = '' then null else v_errcontext end) ); INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at, messages) VALUES (p_fact_table_id, now(), clock_timestamp(), v_err); RETURN FALSE; END; ELSE RETURN FALSE; END IF; EXCEPTION WHEN serialization_failure THEN RAISE LOG 'Serialization failure on transaction % attempting to lock % - skipping.', txid_current()::text, p_fact_table_id::text; RETURN FALSE; WHEN OTHERS THEN RAISE; END; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/functions/worker.sql000066400000000000000000000020211451107006500204770ustar00rootroot00000000000000CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. 
*/ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN --If any configured functions use temp tables, --must discard to avoid them hanging around in the idle background worker session DISCARD TEMP; --Log job times INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at) VALUES (v_fact_record.fact_table_id, now(), clock_timestamp()); --Return true meaning the fact table was refreshed (this applies even if there was no new data) RETURN TRUE; END IF; END LOOP; --If no jobs returned true, then return false RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/pg_fact_loader--1.4--1.5.sql000066400000000000000000000577441451107006500210700ustar00rootroot00000000000000/* pg_fact_loader--1.4--1.5.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. 
*/ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. */ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', 
unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN 
'(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), 
'(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER ( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , 
non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. 
-- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ -- changed_at is guaranteed now to be in timestamptz - any time zone casting is only in subquery changed_at < %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_literal(c.maximum_cutoff_time) ) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, 
nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, 
RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables 
for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE 
uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; pg_fact_loader-2.0.1/pg_fact_loader--1.4.sql000066400000000000000000005236121451107006500205020ustar00rootroot00000000000000/* pg_fact_loader--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE FUNCTION fact_loader._launch_worker(oid) RETURNS pg_catalog.INT4 STRICT AS 'MODULE_PATHNAME', 'pg_fact_loader_worker' LANGUAGE C; CREATE FUNCTION fact_loader.launch_worker() RETURNS pg_catalog.INT4 STRICT AS 'SELECT fact_loader._launch_worker(oid) FROM pg_database WHERE datname = current_database();' LANGUAGE SQL; CREATE TABLE fact_loader.fact_tables ( fact_table_id SERIAL PRIMARY KEY, fact_table_relid REGCLASS NOT NULL, fact_table_agg_proid REGPROC NULL, --This may only be used to generate a merge function but is not used in automation enabled BOOLEAN NOT NULL DEFAULT FALSE, priority INT, attempt_number INT, retries_allowed INT DEFAULT 0, force_worker_priority BOOLEAN NOT NULL DEFAULT FALSE, last_refresh_source_cutoff TIMESTAMPTZ, last_refresh_attempted_at TIMESTAMPTZ, --TODO - answer if we want the worker to bail or record messages on ERROR (or both) last_refresh_succeeded BOOLEAN, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_tables UNIQUE (fact_table_relid) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables', ''); CREATE TABLE fact_loader.fact_table_deps ( fact_table_dep_id SERIAL PRIMARY KEY, parent_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), child_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), /***** In very many cases, you will 
use the same procs for insert, update, and delete even with multiple dependencies. This is why you must give defaults here which will be used to auto-populate fact_loader.fact_table_dep_queue_table_deps which can be overridden if necessary for each queue table. After you configure all of your fact tables and queue tables, run the function refresh_fact_table_dep_queue_table_deps manually to populate fact_table_dep_queue_table_deps, then make any changes as necessary. You can see an example of this in the test suite "seeds" file. You can also see an override example with order_emails_fact having a different proc for orders and reorders delete cases. */ default_insert_merge_proid REGPROC NOT NULL, default_update_merge_proid REGPROC NOT NULL, default_delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_deps UNIQUE (parent_id, child_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps', ''); CREATE TABLE fact_loader.queue_tables ( queue_table_id SERIAL PRIMARY KEY, queue_table_relid REGCLASS NOT NULL, queue_of_base_table_relid REGCLASS NOT NULL, /**** NOTE - the reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information. 
*/ pglogical_node_if_id INT NOT NULL, --This is the timezone for the changed_at column - if null, we assume it is timestamptz (we could check that actually) queue_table_tz TEXT, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_table UNIQUE (queue_table_relid), CONSTRAINT unique_base_table UNIQUE (queue_of_base_table_relid) ); COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$The reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information.$$; SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables', ''); CREATE TABLE fact_loader.queue_table_deps ( queue_table_dep_id SERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), queue_table_id INT NOT NULL REFERENCES fact_loader.queue_tables (queue_table_id), relevant_change_columns NAME[], last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_deps UNIQUE (fact_table_id, queue_table_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps', ''); CREATE TABLE fact_loader.key_retrieval_sequences ( key_retrieval_sequence_id SERIAL PRIMARY KEY, queue_table_dep_id INT NOT NULL REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), /**** In almost all cases, we only need to write one way to retrieve keys. The only exception is, for example, when in a delete case, you need to pass a different field (customer_id instead of order_id) to the delete_merge_proid function. You then need a different key_retrieval_sequence to handle a different field name for this delete case. 
By default this is NULL, meaning there is no filter, meaning the sequence applies to all events I, U, D. Otherwise, you can add scopes in which case you must have one for each of 'I','U','D'. */ filter_scope CHAR(1) NULL, level INT NOT NULL, return_columns NAME[] NOT NULL, is_fact_key BOOLEAN NOT NULL, join_to_relation REGCLASS NULL, join_to_column NAME NULL, return_columns_from_join NAME[] NULL, join_return_is_fact_key BOOLEAN NULL, CONSTRAINT unique_retrievals UNIQUE (queue_table_dep_id, filter_scope, level), CONSTRAINT valid_scopes CHECK (filter_scope IN ('I','U','D')) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences', ''); CREATE TABLE fact_loader.fact_table_dep_queue_table_deps ( fact_table_dep_queue_table_dep_id SERIAL PRIMARY KEY, fact_table_dep_id INT REFERENCES fact_loader.fact_table_deps (fact_table_dep_id), queue_table_dep_id INT REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_cutoffs UNIQUE (fact_table_dep_id, queue_table_dep_id) ); CREATE OR REPLACE FUNCTION fact_loader.unique_scopes() RETURNS TRIGGER AS $BODY$ BEGIN IF (NEW.filter_scope IS NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NOT NULL )) OR (NEW.filter_scope IS NOT NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NULL )) THEN RAISE EXCEPTION $$You must either use a NULL filter_scope to cover all 3 events I, U, D or you must specify all 3 events separately I, U, D (For queue_table_dep_id %). 
$$, NEW.queue_table_dep_id; END IF; RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER unique_scopes BEFORE INSERT OR UPDATE ON fact_loader.key_retrieval_sequences FOR EACH ROW EXECUTE PROCEDURE fact_loader.unique_scopes(); /*** This table is unlogged because it only has data mid-transaction and should always be empty */ CREATE UNLOGGED TABLE fact_loader.process_queue ( process_queue_id BIGSERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), proid REGPROC NOT NULL, key_value TEXT NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ ); CREATE OR REPLACE FUNCTION fact_loader.set_row_updated_at_to_now() RETURNS TRIGGER AS $BODY$ BEGIN NEW.row_updated_at = now(); RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_dep_queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.process_queue FOR EACH ROW WHEN 
(NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TYPE fact_loader.table_load_type AS ENUM('delta','full_refresh'); CREATE OR REPLACE FUNCTION fact_loader.create_table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS REGPROC AS $BODY$ DECLARE v_new_proc TEXT; v_sql TEXT; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT function_name, function_sql INTO v_new_proc, v_sql FROM fact_loader.table_loader_function(p_source_proc, p_destination_relation, p_ignore_diff_for_columns); EXECUTE v_sql; RETURN v_new_proc::REGPROC; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id, proid, key_value, --TODO - either infer the data type of the function args, which is not super easy with postgres, --or add configuration fields for the name and data type of these. 
This will suffice for now --because we only have integer args for all functions 'integer' AS queue_of_base_table_key_type FROM fact_loader.process_queue pq WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT format('%s(%s::%s)', proid::TEXT, 'key_value', queue_of_base_table_key_type) AS function_call, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id) AS execute_sql FROM with_rank GROUP BY execution_group, function_call ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is the actual load to the destination table, and assumes that 'prepare' phase has already been run, which is supposed to have gathered the actual minimal delta and determine what to do here. 
*/ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT execute_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_insert_to_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Using the process_queue data, execute the delta load of the fact table 3. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT insert_to_process_queue_sql, metadata_update_sql INTO v_insert_to_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue */ RAISE LOG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_insert_to_process_queue_sql; EXECUTE COALESCE(v_insert_to_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') LIKE 'debug%' THEN FOR v_debug_rec IN SELECT * FROM fact_loader.process_queue LOOP v_debug_text = v_debug_text||E'\n'||format('%s', v_debug_rec.process_queue_id||chr(9)||v_debug_rec.fact_table_id||chr(9)||v_debug_rec.proid||chr(9)||v_debug_rec.key_value); END LOOP; IF v_debug_text <> '' THEN v_debug_text = E'\n'||format('%s', (SELECT string_agg(column_name,chr(9)) FROM information_schema.columns WHERE table_name = 'process_queue' AND table_schema = 'fact_loader' AND column_name NOT LIKE 'row_%_at')) ||v_debug_text; RAISE DEBUG '%', v_debug_text; END IF; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE LOG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE LOG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.prepare_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is not going to lock any of the destination table for writing, which is precisely why it is separated from the 'execute' phase which actually writes to the table in the shortest transaction possible. */ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT prepare_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. 
*/ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s); $$, queue_table_relid, queue_table_id_field, min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE LOG 'Purging Queue: %', v_sql; EXECUTE v_sql; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. 
*/ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist */ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, 
queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type) RETURNS TABLE (prepare_sql text, execute_sql text, unmapped_src_columns text[], unmapped_dest_columns text[]) AS $BODY$ DECLARE v_pkey_fields TEXT[]; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT array_agg(a.attname ORDER BY pk.rn) INTO v_pkey_fields FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik; RETURN QUERY WITH source_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_source_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , destination_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , unmapped_source_columns AS ( SELECT 
array_agg(s.column_name::text) AS unmapped_columns_src FROM source_columns s WHERE NOT EXISTS (SELECT 1 FROM destination_columns d WHERE d.column_name = s.column_name) ) , unmapped_dest_columns AS ( SELECT array_agg(d.column_name::text) AS unmapped_columns_dest FROM destination_columns d WHERE NOT EXISTS (SELECT 1 FROM source_columns s WHERE d.column_name = s.column_name) ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT unnest AS pkey_field FROM unnest(v_pkey_fields)) pk ) , info AS ( SELECT string_agg( CASE WHEN sc.column_name IS NOT NULL THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN sc.column_name IS NOT NULL AND (p_ignore_diff_for_columns IS NULL OR sc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN sc.column_name IS NOT NULL AND NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' 
|| dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_join FROM destination_columns dc CROSS JOIN pkeys LEFT JOIN source_columns sc ON dc.column_name = sc.column_name GROUP BY pkeys.pkey_fields, pkeys.pkey_join ) , sql_snippets AS ( SELECT $$ DROP TABLE IF EXISTS count_tracker; CREATE TEMP TABLE count_tracker (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)); INSERT INTO count_tracker VALUES (NULL, NULL, FALSE, NULL); $$::TEXT AS count_tracker_sql , $$ DROP TABLE IF EXISTS actual_delta; CREATE TEMP TABLE actual_delta AS WITH final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_source_relation::TEXT||$$ EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ DROP TABLE IF EXISTS removed_keys; CREATE TEMP TABLE removed_keys AS SELECT $$||pkey_fields||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE NOT EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$); $$ AS removed_keys_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM $$||p_source_relation::TEXT||$$ s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ); $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql , $$ /*** We add a primary key to the actual_delta table to ensure there are no duplicate keys. 
***/ ALTER TABLE actual_delta ADD PRIMARY KEY ($$||pkey_fields||$$); $$ AS add_delta_pkey_sql , $$ /**** This part is not implemented yet, but partially complete. If we decide we want to figure out that >50% of the table will be updated, we could decide to truncate. But then we have to balance the desire for that with more read queries to figure it out. To implement, add the type full_refresh_truncate to fact_loader.table_load_type, and uncomment code. We would also have to add the logic to find actual keys added, then subtract it from actual_delta to get the net updates expected. If this is over 50%, we should truncate and re-insert all data. ***/ DROP TABLE IF EXISTS percent_of_destination; CREATE TEMP TABLE percent_of_destination AS SELECT (((SELECT COUNT(1) FROM actual_delta) - (SELECT COUNT(1) FROM added_keys))::NUMERIC / (SELECT COUNT(1) FROM $$||p_destination_relation::TEXT||$$)::NUMERIC)::NUMERIC(8,2) AS pct; UPDATE count_tracker SET pct_dest = (SELECT pct FROM percent_of_destination); $$ AS percent_change_sql ,$$ DO $LOCK_SAFE_DDL$ BEGIN SET lock_timeout TO '10ms'; IF (SELECT pct FROM percent_of_destination) >= 0.5 THEN LOOP BEGIN TRUNCATE $$||p_destination_relation::TEXT||$$; UPDATE count_tracker SET truncated = true; EXIT; EXCEPTION WHEN lock_not_available THEN RAISE WARNING 'Could not obtain immediate lock for SQL %, retrying', p_sql; PERFORM pg_sleep(3); WHEN OTHERS THEN RAISE; END; END LOOP; END IF; RESET lock_timeout; END $LOCK_SAFE_DDL$ ; $$ AS lock_safe_truncate_sql ,$$ --Delete keys that are no longer in your new version DELETE FROM $$||p_destination_relation::TEXT||$$ d WHERE EXISTS (SELECT 1 FROM removed_keys s WHERE $$||pkey_join||$$); GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET deleted = v_row_count; $$ AS delete_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET 
$$||upsert_list||$$ ; GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET upserted = v_row_count; $$ AS upsert_sql FROM info ) SELECT count_tracker_sql|| CASE /*** not implemented truncate pattern WHEN p_load_type IN('full_refresh','full_refresh_truncate') THEN ***/ WHEN p_load_type = 'full_refresh' THEN removed_keys_sql||actual_delta_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ WHEN p_load_type = 'delta' THEN actual_delta_sql||key_join_exists_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ END||$$ $$|| /*** not implemented truncate pattern CASE WHEN p_load_type = 'full_refresh_truncate' THEN percent_change_sql ELSE '' END ***/ '' AS prepare_sql , $$ --THIS SHOULD BE RUN IN A TRANSACTION DO $SCRIPT$ DECLARE v_row_count INT; v_results RECORD; BEGIN $$|| CASE /*** not implemented truncate pattern WHEN p_load_type = 'full_refresh_truncate' THEN lock_safe_truncate_sql||delete_sql||upsert_sql ***/ WHEN p_load_type = 'full_refresh' THEN delete_sql||upsert_sql WHEN p_load_type = 'delta' THEN upsert_sql END||$$ FOR v_results IN SELECT * FROM count_tracker LOOP RAISE LOG 'upserted: %, deleted: %, truncated: %, pct_dest: %', v_results.upserted, v_results.deleted, v_results.truncated, v_results.pct_dest; END LOOP; END $SCRIPT$; $$ AS execute_sql , (SELECT unmapped_columns_src FROM unmapped_source_columns) AS unmapped_src_columns , (SELECT unmapped_columns_dest FROM unmapped_dest_columns) AS unmapped_dest_columns FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. 
This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER 
BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. 
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_validator (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_unmapped_src_columns TEXT[], p_unmapped_dest_columns TEXT[], p_ignore_unmapped_columns BOOLEAN) RETURNS VOID AS $BODY$ DECLARE v_messages TEXT = ''; BEGIN IF NOT p_ignore_unmapped_columns AND p_unmapped_src_columns IS NOT NULL THEN v_messages = format($$You have unmapped columns (%s) in the source table %s. All source columns must be named identically to destination in order to map. If you are certain you want to ignore these columns, meaning they will not update anything in destination table %s, add the final argument to this function as TRUE. $$ , array_to_string(p_unmapped_src_columns,', ') , p_source_relation::TEXT , p_destination_relation::TEXT); END IF; IF NOT p_ignore_unmapped_columns AND p_unmapped_dest_columns IS NOT NULL THEN v_messages = v_messages||format($$ You have unmapped columns (%s) in the destination table %s. All destination columns must be named identically to source in order to map. 
If you are certain you want to ignore these columns, meaning the source table %s does not contain all columns in destination table, add the final argument to this function as TRUE.$$ , array_to_string(p_unmapped_dest_columns,', ') , p_destination_relation::TEXT , p_source_relation::TEXT); END IF; IF v_messages <> '' THEN RAISE EXCEPTION '%', v_messages; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; c_lock_cutoff_refresh INT = 99995; BEGIN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.fact_tables WHERE enabled ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority LOOP IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = v_fact_record.fact_table_id) THEN --Load fact table PERFORM fact_loader.load(v_fact_record.fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); RETURN TRUE; END IF; END LOOP; RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.0--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. 
\quit ALTER TABLE fact_loader.key_retrieval_sequences ADD COLUMN pass_queue_table_change_date_at_tz TEXT NULL; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; ALTER TABLE fact_loader.key_retrieval_sequences ADD CONSTRAINT verify_valid_tz CHECK (pass_queue_table_change_date_at_tz IS NULL OR (now() AT TIME ZONE pass_queue_table_change_date_at_tz IS NOT NULL)); --This check constraint could have been added in v. 1.0 ALTER TABLE fact_loader.queue_tables ADD CONSTRAINT verify_valid_tz CHECK (queue_table_tz IS NULL OR (now() AT TIME ZONE queue_table_tz IS NOT NULL)); ALTER TABLE fact_loader.process_queue ADD COLUMN source_change_date DATE NULL; COMMENT ON COLUMN fact_loader.process_queue.source_change_date IS 'Corresponds to fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz. If this field is populated, a function will be expected that has args (key_value, source_change_date) based on this process_queue table.'; --This should have already been added in v. 
1.0 SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_deps', ''); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL; ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid DROP NOT NULL, ALTER COLUMN default_update_merge_proid DROP NOT NULL, ALTER COLUMN default_delete_merge_proid DROP NOT NULL; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM fact_loader.process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second. 
*/ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. 
*/ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join, string_agg($$d.$$||quote_ident(pkey_field)||$$ = $$||(SELECT arg_params FROM function_schema),E'\nAND ') AS pkey_join_to_arg FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS 
matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs , pkey_join_to_arg FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs, pkey_join_to_arg ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE $$||pkey_join_to_arg AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. 
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.1--1.2.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit --To support non-replicated queue tables ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id DROP NOT NULL; CREATE TABLE fact_loader.fact_table_refresh_logs (fact_table_refresh_log_id SERIAL PRIMARY KEY, fact_table_id INT REFERENCES fact_loader.fact_tables (fact_table_id), refresh_attempted_at TIMESTAMPTZ, messages TEXT); ALTER TABLE fact_loader.fact_tables ADD COLUMN use_daily_schedule BOOLEAN NOT NULL DEFAULT FALSE, ADD COLUMN daily_scheduled_time TIME NULL, ADD COLUMN daily_scheduled_tz TEXT NULL, ADD COLUMN daily_scheduled_proid REGPROC, ADD CONSTRAINT verify_valid_daily_tz CHECK (daily_scheduled_tz IS NULL OR (now() AT TIME ZONE daily_scheduled_tz IS NOT NULL)), ADD CONSTRAINT daily_schedule_configured_correctly CHECK ((NOT use_daily_schedule) OR (use_daily_schedule AND daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL)); CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT 
last_refresh_succeeded; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS SELECT * FROM fact_loader.fact_tables WHERE enabled AND (NOT use_daily_schedule OR --Only run this job according to the same day of the daily_scheduled_time --according to configured timezone ( (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME ) ) ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN (use_daily_schedule AND (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME) THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; BEGIN /*** There are 2 basic steps to this load: 1. Execute the single daily-refresh function 2. 
Update the metadata indicating the last attempt time */ SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()' INTO v_execute_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id AND use_daily_schedule; IF v_execute_sql IS NULL THEN RETURN FALSE; END IF; EXECUTE v_execute_sql; UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = p_fact_table_id; RETURN TRUE; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; BEGIN IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; --Load fact table and handle exceptions to auto-disable job and log errors in case of error BEGIN --Scheduled daily job IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id); --Queue-based job ELSE PERFORM fact_loader.load(p_fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. 
It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id; INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, messages) VALUES (p_fact_table_id, now(), SQLERRM); RETURN FALSE; END; ELSE RETURN FALSE; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN RETURN TRUE; END IF; END LOOP; --If no jobs returned true, then return false RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.2--1.3.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. 
\quit DROP VIEW IF EXISTS fact_loader.queue_deps_all_with_retrieval; DROP VIEW IF EXISTS fact_loader.queue_deps_all; DROP VIEW IF EXISTS fact_loader.logical_subscription; DROP VIEW IF EXISTS fact_loader.prioritized_jobs; DROP VIEW IF EXISTS fact_loader.unresolved_failures; DROP FUNCTION IF EXISTS fact_loader.sql_builder(int); CREATE OR REPLACE FUNCTION fact_loader.add_batch_id_fields() RETURNS VOID AS $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT queue_table_relid FROM fact_loader.queue_tables qt INNER JOIN pg_class c ON c.oid = qt.queue_table_relid INNER JOIN pg_namespace n ON n.oid = c.relnamespace WHERE NOT EXISTS (SELECT 1 FROM information_schema.columns col WHERE col.column_name = 'fact_loader_batch_id' AND col.table_schema = n.nspname AND col.table_name = c.relname) LOOP v_sql = format($F$ ALTER TABLE %s ADD COLUMN fact_loader_batch_id BIGINT DEFAULT nextval('fact_loader.batch_id'); $F$, v_rec.queue_table_relid::text, v_rec.queue_table_relid::text); RAISE LOG 'ADDING fact_loader_batch_id COLUMN TO queue table %: %', v_rec.queue_table_relid::text, v_sql; EXECUTE v_sql; END LOOP; END $BODY$ LANGUAGE plpgsql; ALTER TABLE fact_loader.queue_tables ADD COLUMN purge BOOLEAN NOT NULL DEFAULT TRUE; UPDATE fact_loader.fact_table_refresh_logs SET messages = jsonb_build_object('Message', messages) WHERE messages IS NOT NULL; --Will be re-added via \i in sql file ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN messages TYPE jsonb USING messages::jsonb; --This was a problem from the start ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id TYPE OID; --This should have been done from the start SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_de_fact_table_dep_queue_table_de_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps_fact_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables_fact_table_id_seq', ''); SELECT 
pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences_key_retrieval_sequence_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps_queue_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables_queue_table_id_seq', ''); --No indexes or anything but allow debugging CREATE UNLOGGED TABLE fact_loader.debug_process_queue (LIKE fact_loader.process_queue); ALTER TABLE fact_loader.debug_process_queue ADD PRIMARY KEY (process_queue_id); -- Now a temp table to avoid serialization contention DROP TABLE fact_loader.process_queue; --Make this a trigger to check dep fact tables ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_base_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_parent_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables DROP CONSTRAINT daily_schedule_configured_correctly; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_schedule_configured_correctly CHECK (NOT use_daily_schedule OR (use_daily_schedule AND ((daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL) OR (depends_on_base_daily_job_id IS NOT NULL AND depends_on_parent_daily_job_id IS NOT NULL)))); --These columns have never been used ALTER TABLE fact_loader.fact_tables DROP COLUMN attempt_number, DROP COLUMN retries_allowed; --This is the usual case and makes sense ALTER TABLE fact_loader.key_retrieval_sequences ALTER COLUMN level SET DEFAULT 1; --Need to have a more reliable dependency knowledge for scheduled jobs ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_deps REGCLASS[]; ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_dep_delay_tolerance INTERVAL; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_deps_correctly_configured CHECK ((daily_scheduled_deps IS NULL AND 
daily_scheduled_dep_delay_tolerance IS NULL) OR (daily_scheduled_deps IS NOT NULL AND daily_scheduled_dep_delay_tolerance IS NOT NULL)); --Log all events and add pruning ALTER TABLE fact_loader.fact_table_refresh_logs ADD COLUMN refresh_finished_at TIMESTAMPTZ; ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN fact_table_refresh_log_id TYPE BIGINT; -- Handle race conditions by changing to batch usage CREATE SEQUENCE fact_loader.batch_id; SELECT fact_loader.add_batch_id_fields(); CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("replication_set_name" text, "queue_of_base_table_relid" regclass, "if_id" oid, "if_name" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name , qt.queue_of_base_table_relid , n.if_id , n.if_name --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise , CASE WHEN qt.pglogical_node_if_id IS NULL THEN now() ELSE t.source_time END AS source_time FROM fact_loader.queue_tables qt LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$; ELSE RETURN QUERY SELECT NULL::TEXT AS replication_set_name , qt.queue_of_base_table_relid , NULL::OID AS if_id , NULL::NAME AS if_name --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt; END IF; END; 
$BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. 
*/ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. */ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL AND qt.purge /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s IN (SELECT %s FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s) FOR UPDATE SKIP LOCKED ); $$, queue_table_relid, 'fact_loader_batch_id', 'fact_loader_batch_id', queue_table_relid, 'fact_loader_batch_id', min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE DEBUG 'Purging Queue: %', v_sql; BEGIN EXECUTE v_sql; EXCEPTION WHEN 
serialization_failure THEN RAISE LOG 'Serialization failure in queue purging for transaction % - skipping.', txid_current()::text; WHEN OTHERS THEN RAISE; END; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN --If any configured functions use temp tables, --must discard to avoid them hanging around in the idle background worker session DISCARD TEMP; --Log job times INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at) VALUES (v_fact_record.fact_table_id, now(), clock_timestamp()); --Return true meaning the fact table was refreshed (this applies even if there was no new data) RETURN TRUE; END IF; END LOOP; --If no jobs returned true, then return false RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; v_err JSONB; v_errmsg TEXT; v_errdetail TEXT; v_errhint TEXT; v_errcontext TEXT; BEGIN -- We except rare serialization failures here which we will ignore and move to the next record -- Anything else should be raised BEGIN IF EXISTS (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id FOR UPDATE SKIP LOCKED) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. 
*/ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; --Load fact table and handle exceptions to auto-disable job and log errors in case of error BEGIN --Scheduled daily job IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id); --Queue-based job ELSE PERFORM fact_loader.load(p_fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN GET STACKED DIAGNOSTICS v_errmsg = MESSAGE_TEXT, v_errdetail = PG_EXCEPTION_DETAIL, v_errhint = PG_EXCEPTION_HINT, v_errcontext = PG_EXCEPTION_CONTEXT; UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id; v_err = jsonb_strip_nulls( jsonb_build_object( 'Message', v_errmsg, 'Detail', case when v_errdetail = '' then null else v_errdetail end, 'Hint', case when v_errhint = '' then null else v_errhint end, 'Context', case when v_errcontext = '' then null else v_errcontext end) ); INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at, messages) VALUES (p_fact_table_id, now(), clock_timestamp(), v_err); RETURN FALSE; END; ELSE RETURN FALSE; END IF; EXCEPTION WHEN serialization_failure THEN RAISE LOG 'Serialization failure on transaction % attempting to lock % - skipping.', txid_current()::text, p_fact_table_id::text; RETURN FALSE; WHEN OTHERS THEN RAISE; END; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; 
v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. */ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue SELECT * FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT last_refresh_succeeded; CREATE OR REPLACE FUNCTION fact_loader.safely_terminate_workers() RETURNS TABLE (number_terminated INT, number_still_live INT, pids_still_live INT[]) AS $BODY$ /**** It is not a requirement to use this function to terminate workers. Because workers are transactional, you can simply terminate them and no data loss will result in pg_fact_loader. Likewise, a hard crash of any system using pg_fact_loader will recover just fine upon re-launching workers. Still, it is ideal to avoid bloat to cleanly terminate workers and restart them using this function to kill them, and launch_workers(int) to re-launch them. 
*/ BEGIN RETURN QUERY WITH try_term_pids AS ( SELECT pid, CASE WHEN state = 'idle' AND state_change BETWEEN SYMMETRIC now() - interval '5 seconds' AND now() - interval '55 seconds' THEN pg_terminate_backend(pid) ELSE FALSE END AS terminated FROM pg_stat_activity WHERE usename = 'postgres' AND query = 'SELECT fact_loader.worker();') SELECT SUM(CASE WHEN terminated THEN 1 ELSE 0 END)::INT AS number_terminated_out, SUM(CASE WHEN NOT terminated THEN 1 ELSE 0 END)::INT AS number_still_live_out, (SELECT array_agg(pid) FROM try_term_pids WHERE NOT terminated) AS pids_still_live_out FROM try_term_pids; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.launch_workers(number_to_launch int) RETURNS INT[] AS $BODY$ DECLARE v_pids INT[]; BEGIN FOR i IN 1..number_to_launch LOOP v_pids = array_append(v_pids, fact_loader.launch_worker()); /* It's not strictly required to not launch all workers simultaneously, but it's also a little more invasive to do that, probably requiring more advisory lock skips. Better to just sleep 1 second between launches. */ PERFORM pg_sleep(1); END LOOP; RETURN v_pids; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!! 
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. 
*/ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. */ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', 
unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN 
'(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), 
'(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, format($$ %s AS fact_table_id, %s AS queue_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time)) AS metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 
'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END, quote_literal(maximum_cutoff_time)) AS global_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table E'\nINNER JOIN '||queue_of_base_table_relid::TEXT||' b'|| E'\n ON q.'||quote_ident(queue_table_key)||'::'||queue_of_base_table_key_type||' = b.'||quote_ident(queue_of_base_table_key) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. 
format($$ %s AND q.%s < %s %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END) AS global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_insert_sql) AS queue_insert_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_update_sql) AS queue_update_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.global_where_sql||nrs.where_for_delete_sql) AS 
queue_delete_sql, format($$ SELECT %s FROM %s WHERE %s $$, nrs.metadata_select_columns, nrs.queue_table_aliased, nrs.global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to 
follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. 
--Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = 
ft.fact_table_id;
$$ AS metadata_update_sql_out
FROM final_queue_sql)

SELECT raw_queued_changes_sql_out,
  gathered_queued_changes_sql_out,
  format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out,
  metadata_update_sql_out
FROM final_outputs;
$BODY$
LANGUAGE SQL;


/***
Build the ordered set of SQL statements that execute every queued merge
function call for one fact table.  Calls are grouped by proid and executed
in process_queue_id order so that changes replay in the order they arrived.
*/
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT)
RETURNS TABLE (sql TEXT) AS
$BODY$
BEGIN

RETURN QUERY
WITH ordered_process_queue AS (
  SELECT
    process_queue_id,
    proid,
    key_value,
    source_change_date,
    -- First declared argument type of the merge function; key_value is cast to it below
    (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg
  FROM process_queue pq
  LEFT JOIN pg_proc pp ON pp.oid = proid
  WHERE pq.fact_table_id = p_fact_table_id
  ORDER BY process_queue_id
)

, with_rank AS (
  SELECT
    /****
    If source_change_date is NULL, we assume the proid has one arg and pass it.
    If not, we assume the proid has two args and pass source_change_date as the second.
    */
    format('%s(%s::%s%s)',
           proid::TEXT,
           'key_value',
           proid_first_arg,
           CASE
             WHEN source_change_date IS NOT NULL
               THEN format(', %s::DATE', quote_literal(source_change_date))
             ELSE ''
           END) AS function_call,
    proid,
    process_queue_id,
    -- NOTE(review): no ORDER BY in this window, so every row of a proid
    -- partition ranks 1; grouping below is effectively per function_call/proid.
    RANK() OVER (PARTITION BY proid) AS execution_group
  FROM ordered_process_queue
)

, execute_sql_groups AS (
  SELECT
    execution_group,
    format($$
SELECT process_queue_id, %s
FROM (
/****
Must wrap this to execute in order of ids
***/
SELECT *
FROM process_queue
WHERE process_queue_id BETWEEN %s AND %s
  AND fact_table_id = %s
  AND proid = %s::REGPROC
ORDER BY process_queue_id) q;
$$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql
  FROM with_rank
  GROUP BY execution_group, function_call, proid
  ORDER BY execution_group
)

-- Concatenate all group statements; fall back to a harmless no-op SELECT
SELECT COALESCE(string_agg(execute_sql, ''), 'SELECT NULL') AS final_execute_sql
FROM execute_sql_groups;

END;
$BODY$
LANGUAGE plpgsql;


/***
Debugging helper: run the raw (pre-aggregation) queued-changes SQL generated
by sql_builder for one fact table and return its rows.
*/
CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT,
               queue_table_id INT,
               queue_table_dep_id INT,
               fact_table_dep_id INT,
               fact_table_dep_queue_table_dep_id INT,
               queue_table_id_field BIGINT,
               fact_loader_batch_id BIGINT,
               maximum_cutoff_time TIMESTAMPTZ) AS
$BODY$
DECLARE
  v_sql text;  -- raw_queued_changes_sql emitted by sql_builder
BEGIN
  SELECT raw_queued_changes_sql
  INTO v_sql
  FROM fact_loader.sql_builder(p_fact_table_id);

  RETURN QUERY EXECUTE v_sql;
END;
$BODY$
LANGUAGE plpgsql;


/***
Debugging helper: run the gathered (deduplicated, ordered) queued-changes SQL
generated by sql_builder for one fact table and return its rows.
*/
CREATE OR REPLACE FUNCTION fact_loader.gathered_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT,
               proid REGPROC,
               key_value TEXT,
               source_change_date DATE) AS
$BODY$
DECLARE
  v_sql text;  -- gathered_queued_changes_sql emitted by sql_builder
BEGIN
  SELECT gathered_queued_changes_sql
  INTO v_sql
  FROM fact_loader.sql_builder(p_fact_table_id);

  RETURN QUERY EXECUTE v_sql;
END;
$BODY$
LANGUAGE plpgsql;


/***
Run a daily-scheduled job.  Returns FALSE when the fact table is not
configured for a daily schedule; raises when listed dependencies are more
delayed than the configured tolerance.
*/
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_execute_sql         text;        -- 'SELECT <proid>()' call to run
  v_deps                regclass[];  -- optional dependency relations
  v_dep_delay_tolerance interval;    -- max allowed staleness of deps
  v_delayed_msg         text;        -- aggregated message for late deps
BEGIN
/***
There are 3 basic steps to this load:
  1. If dependencies are listed, verify they are up to date enough
  2. Execute the single daily-refresh function
  3. Update the metadata indicating the last attempt time
*/
SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()',
       daily_scheduled_deps,
       daily_scheduled_dep_delay_tolerance
INTO v_execute_sql,
     v_deps,
     v_dep_delay_tolerance
FROM fact_loader.fact_tables
WHERE fact_table_id = p_fact_table_id
  AND use_daily_schedule;

-- Not a daily-scheduled job: nothing to do
IF v_execute_sql IS NULL THEN
  RETURN FALSE;
END IF;

IF v_deps IS NOT NULL THEN
  -- A dependency may be either a queue table (replication delay applies)
  -- or another fact table (last_refresh_source_cutoff applies)
  WITH deps AS
  (SELECT unnest(v_deps) AS dep)

  , delays AS (
    SELECT dep, now() - source_time AS delay_interval
    FROM fact_loader.queue_table_delay_info() qtd
    INNER JOIN deps d ON d.dep = qtd.queue_of_base_table_relid
    UNION ALL
    SELECT dep, now() - last_refresh_source_cutoff AS delay_interval
    FROM fact_loader.fact_tables ft
    INNER JOIN deps d ON d.dep = ft.fact_table_relid
  )

  SELECT string_agg(dep::text||': Delayed '||delay_interval::text, ', ')
  INTO v_delayed_msg
  FROM delays
  WHERE delay_interval > v_dep_delay_tolerance;

  -- Fail loudly: out-of-tolerance deps are intended to raise an alarm
  IF v_delayed_msg IS NOT NULL THEN
    RAISE EXCEPTION '%', v_delayed_msg;
  END IF;
END IF;

EXECUTE v_execute_sql;

UPDATE fact_loader.fact_tables ft
SET last_refresh_attempted_at = now(),
    last_refresh_succeeded = TRUE
WHERE fact_table_id = p_fact_table_id;

RETURN TRUE;
END;
$BODY$
LANGUAGE plpgsql;


/***
Trigger function: batched pruning of fact_table_refresh_logs rows older than
90 days.  Fired only every `step` inserts (see trigger WHEN clause), and each
firing deletes at most step * overdrive rows so pruning can catch up without
ever doing unbounded work in one transaction.
*/
CREATE OR REPLACE FUNCTION fact_loader.fact_table_refresh_logs_pruner() RETURNS trigger
    LANGUAGE plpgsql
    AS $$
declare
    step int := 1000;
    -- step should equal the firing frequency in trigger definition
    overdrive int := 2;
    -- overdrive times step = max rows (see below)
    max_rows int := step * overdrive;
    rows int;
begin
    delete from fact_loader.fact_table_refresh_logs
    where fact_table_refresh_log_id in (
        select fact_table_refresh_log_id
        from fact_loader.fact_table_refresh_logs
        where refresh_attempted_at < now() - '90 days'::interval
        -- do not do the literal interval value above as a declare parameter
        order by fact_table_refresh_log_id
        limit max_rows
        for update skip locked
    );
    get diagnostics rows = row_count;
    return null;
end
$$;

-- Fire the pruner once per `step` (1000) inserted log rows
CREATE TRIGGER fact_table_refresh_logs_pruner
AFTER INSERT ON fact_loader.fact_table_refresh_logs
FOR EACH ROW
WHEN ((new.fact_table_refresh_log_id % 1000::bigint) = 0)
EXECUTE PROCEDURE fact_loader.fact_table_refresh_logs_pruner();


/***
This function exists mostly to easily mock out for testing purposes.
Returns pglogical subscription info when pglogical is installed, otherwise a
single all-NULL row matching the same structure.
*/
CREATE FUNCTION fact_loader.logical_subscription()
RETURNS TABLE (sub_origin_if OID, sub_replication_sets text[])
AS
$BODY$
BEGIN

IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN

  -- Dynamic SQL so this function parses even when pglogical is absent
  RETURN QUERY EXECUTE $$
    SELECT sub_origin_if, sub_replication_sets
    FROM pglogical.subscription;
  $$;

ELSE
  RETURN QUERY
  SELECT NULL::OID, NULL::TEXT[];

END IF;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info()
RETURNS TABLE("replication_set_name" text,
              "queue_of_base_table_relid" regclass,
              "if_id" oid,
              "if_name" name,
              "source_time" timestamp with time zone)
AS
$BODY$
/***
This function exists to allow no necessary dependency
to exist on pglogical_ticker.  If the extension is used,
it will return data from its native functions, if not,
it will return a null data set matching the structure
***/
BEGIN

IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN
  RETURN QUERY EXECUTE $$
  SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name
    , qt.queue_of_base_table_relid
    , n.if_id
    , n.if_name
    --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise
    , CASE
        WHEN qt.pglogical_node_if_id IS NULL
          THEN now()
        ELSE t.source_time
      END AS source_time
  FROM fact_loader.queue_tables qt
    LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if
    LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id
    LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$;
ELSE
  RETURN QUERY
  SELECT NULL::TEXT AS replication_set_name
    , qt.queue_of_base_table_relid
    , NULL::OID AS if_id
    , NULL::NAME AS if_name
    --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker
    , now() AS source_time
  FROM fact_loader.queue_tables qt;
END IF;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps()
RETURNS VOID AS
$BODY$
BEGIN
/****
This function will be used to refresh the fact_table_dep_queue_table_deps table.
The purpose of this table is to easily figure out queue data for fact tables
that depend on other fact tables.
This will be run with every call of load().
This may not be the most efficient method, but it is certainly reliable and fast.
*/

/****
Recursively find all fact table deps including nested ones (fact tables that
depend on other fact tables) to build the fact_table_dep_queue_table_deps table.
*/
WITH RECURSIVE all_fact_table_deps AS (
  SELECT
    qtd.queue_table_dep_id
    , ftd.fact_table_dep_id
    , parent_id                   AS parent_fact_table_id
    , child_id                    AS fact_table_id
    , qtd.queue_table_id
    , qt.queue_table_relid
    , ftp.fact_table_relid        AS parent_fact_table
    , ftc.fact_table_relid        AS child_fact_table
    , ftd.default_insert_merge_proid
    , ftd.default_update_merge_proid
    , ftd.default_delete_merge_proid
  FROM fact_loader.queue_table_deps qtd
  INNER JOIN fact_loader.queue_tables qt   ON qtd.queue_table_id = qt.queue_table_id
  INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id
  INNER JOIN fact_loader.fact_tables ftp   USING (fact_table_id)
  INNER JOIN fact_loader.fact_tables ftc   ON ftc.fact_table_id = ftd.child_id
  UNION ALL
  -- Recursive term: walk from each discovered child to its own children
  SELECT
    qtd.queue_table_dep_id
    , ftd.fact_table_dep_id
    , parent_id                   AS parent_fact_table_id
    , child_id                    AS fact_table_id
    , qtd.queue_table_id
    , qt.queue_table_relid
    , ftp.fact_table_relid        AS parent_fact_table
    , ft.fact_table_relid         AS child_fact_table
    , ftd.default_insert_merge_proid
    , ftd.default_update_merge_proid
    , ftd.default_delete_merge_proid
  FROM all_fact_table_deps qtd
  INNER JOIN fact_loader.queue_tables qt   ON qtd.queue_table_id = qt.queue_table_id
  INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id
  INNER JOIN fact_loader.fact_tables ftp   ON ftp.fact_table_id = ftd.parent_id
  INNER JOIN fact_loader.fact_tables ft    ON ft.fact_table_id = ftd.child_id
)

/****
Remove fact_table_dep_queue_table_deps that no longer exist if applicable
*/
, removed AS (
  DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc
  WHERE NOT EXISTS (SELECT 1
                    FROM all_fact_table_deps aftd
                    WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id
                      AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id)
)

/****
Add any new keys or ignore if they already exist
Add not exists because we think allowing all records to insert and conflict
could be cause of serialization errors in repeatable read isolation.
*/
INSERT INTO fact_loader.fact_table_dep_queue_table_deps
  (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid)
SELECT fact_table_dep_id,
       queue_table_dep_id,
       default_insert_merge_proid,
       default_update_merge_proid,
       default_delete_merge_proid
FROM all_fact_table_deps new
WHERE NOT EXISTS
  (SELECT 1
   FROM fact_loader.fact_table_dep_queue_table_deps existing
   WHERE existing.fact_table_dep_id = new.fact_table_dep_id
     AND existing.queue_table_dep_id = new.queue_table_dep_id)
ON CONFLICT (fact_table_dep_id, queue_table_dep_id)
DO NOTHING;

END;
$BODY$
LANGUAGE plpgsql;


-- Migration: these cutoff fields now become based on batch, not on queue_table_id_field.
-- Translate each existing last_cutoff_id to the fact_loader_batch_id of the same row.
DO $BODY$
DECLARE
  v_rec RECORD;
  v_sql TEXT;
BEGIN

FOR v_rec IN
  SELECT format($$
  UPDATE fact_loader.%s
  SET last_cutoff_id =
    (SELECT fact_loader_batch_id
     FROM %s
     WHERE %s = %s)
  WHERE %s = %s;
  $$,
  CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_deps' ELSE 'fact_table_dep_queue_table_deps' END,
  queue_table_relid::text,
  queue_table_id_field::text,
  last_cutoff_id::text,
  CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_dep_id' ELSE 'fact_table_dep_queue_table_dep_id' END,
  CASE WHEN fact_table_dep_id IS NULL THEN queue_table_dep_id ELSE fact_table_dep_queue_table_dep_id END
  ) AS sql
FROM fact_loader.queue_deps_all
WHERE last_cutoff_id IS NOT NULL
LOOP

v_sql = v_rec.sql;
RAISE LOG 'Updating Extension pg_fact_loader Executed: %', v_sql;
EXECUTE v_sql;

END LOOP;

END$BODY$;

-- ======================================================================
-- Catalog documentation (COMMENT ON) for all fact_loader objects
-- ======================================================================

-- NOTE(review): load() gates this population on log_min_messages, not
-- log_min_duration as stated below - confirm wording against load().
COMMENT ON TABLE fact_loader.debug_process_queue IS 'A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG.';

-- ---- fact_table_dep_queue_table_deps ----
COMMENT ON TABLE fact_loader.fact_table_dep_queue_table_deps IS
$$Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables that depend on other fact table changes. Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table. Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps. The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples of this. $$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_queue_table_dep_id IS 'Unique identifier';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_id IS 'fact_table_dep for this specific dependency.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.queue_table_dep_id IS 'Inherited queue_table_dep that this dependent fact table depends on.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_id IS
$$This is unique and maintained separately from last_cutoff_id in queue_table_deps, as it refers to the last_cutoff_id for this dependent fact table. It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_source_time IS
$$This is unique and maintained separately from last_cutoff_source_time in queue_table_deps, as it refers to the last_cutoff_source_time for this dependent fact table. It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. It will also never go past its parent(s) in time. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.insert_merge_proid IS
$$Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$;
-- NOTE(review): the next two comments end with "NULL to ignore insert events" -
-- presumably a copy/paste slip for "update"/"delete" events; verify upstream.
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.update_merge_proid IS
$$Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.delete_merge_proid IS
$$Initially populated by default_delete_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on DELETE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';

-- ---- fact_table_deps ----
COMMENT ON TABLE fact_loader.fact_table_deps IS 'For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here.';
COMMENT ON COLUMN fact_loader.fact_table_deps.fact_table_dep_id IS 'Unique identifier.';
COMMENT ON COLUMN fact_loader.fact_table_deps.parent_id IS 'The parent fact_table_id that the child depends on.';
COMMENT ON COLUMN fact_loader.fact_table_deps.child_id IS 'The child fact_table_id that will run only after the parent is updated.';
COMMENT ON COLUMN fact_loader.fact_table_deps.default_insert_merge_proid IS
$$Default function to use for insert events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$;
COMMENT ON COLUMN fact_loader.fact_table_deps.default_update_merge_proid IS
$$Default function to use for update events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$;
COMMENT ON COLUMN fact_loader.fact_table_deps.default_delete_merge_proid IS
$$Default function to use for delete events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$;
COMMENT ON COLUMN fact_loader.fact_table_deps.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.fact_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';

-- ---- fact_table_refresh_logs ----
COMMENT ON TABLE fact_loader.fact_table_refresh_logs IS 'Used to log both job run times and exceptions.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_refresh_log_id IS 'Unique identifier,';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_id IS 'Fact table that created the log.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_attempted_at IS 'The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures).';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_finished_at IS 'The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.messages IS 'Only for failures - Error message content in JSON format - including message, message detail, context, and hint.';

-- ---- fact_tables (the "jobs" table) ----
COMMENT ON TABLE fact_loader.fact_tables IS 'Each fact table to be built via pg_fact_loader, which also drives the worker. These are also referred to as "jobs".';
COMMENT ON COLUMN fact_loader.fact_tables.fact_table_id IS 'Unique identifier for the fact table or job - also referred to as job_id';
COMMENT ON COLUMN fact_loader.fact_tables.fact_table_relid IS 'The oid of the fact table itself regclass type to accept only valid relations.';
COMMENT ON COLUMN fact_loader.fact_tables.fact_table_agg_proid IS
$$NOT REQUIRED. The aggregate function definition for the fact table. This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now.$$;
COMMENT ON COLUMN fact_loader.fact_tables.enabled IS 'Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE.';
COMMENT ON COLUMN fact_loader.fact_tables.priority IS 'Determines the order in which the job runs (in combination with other sorting factors)';
COMMENT ON COLUMN fact_loader.fact_tables.force_worker_priority IS 'If marked TRUE, this fact table will be prioritized in execution order above all other factors.';
COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_source_cutoff IS 'The data cutoff time of the last refresh - only records older than this have been updated.';
COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_attempted_at IS 'The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority.';
COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_succeeded IS 'Whether or not the last run of the job succeeded. NULL if it has never been run.';
COMMENT ON COLUMN fact_loader.fact_tables.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.fact_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';
COMMENT ON COLUMN fact_loader.fact_tables.use_daily_schedule IS 'If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs.';
COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_time IS 'The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in.';
COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_tz IS 'The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date.';
COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_proid IS
$$The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples.$$;
COMMENT ON COLUMN fact_loader.fact_tables.depends_on_base_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time.';
COMMENT ON COLUMN fact_loader.fact_tables.depends_on_parent_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run.';
COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_deps IS 'OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.';
COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_dep_delay_tolerance IS 'OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.';

-- ---- key_retrieval_sequences ----
COMMENT ON TABLE fact_loader.key_retrieval_sequences IS
$$How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations.$$;
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.key_retrieval_sequence_id IS 'Unique identifier.';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.queue_table_dep_id IS 'Which fact table - queue table record this is for (queue_table_deps)';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.filter_scope IS
$$NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this.$$;
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.level IS
$$Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on.$$;
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns IS
$$What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level.$$;
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.is_fact_key IS 'Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. Otherwise, FALSE if you need to join to get more information.';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_relation IS 'Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join.';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_column IS 'Join to this column of join_to_relation.';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns_from_join IS 'Return these columns from join_to_relation.';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_return_is_fact_key IS 'If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key.';
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS
$$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$;

-- ---- views ----
COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.';
COMMENT ON VIEW fact_loader.queue_deps_all_with_retrieval IS 'The master view which builds on queue_deps_all to include key_retrieval_sequences. This is the main view used by sql_builder(int) to gather all queued changes.';

-- ---- queue_table_deps ----
COMMENT ON TABLE fact_loader.queue_table_deps IS
$$Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. **NOTE** that anything that exists in queue_table_dep is assumed to be require its queue data not to be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. The regression suite in ./sql and ./expected has abundant examples of different configurations.$$;
COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_dep_id IS 'Unique identifier.';
COMMENT ON COLUMN fact_loader.queue_table_deps.fact_table_id IS 'Fact table to tie together with a queue table it depends on.';
COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_id IS 'Queue table to tie together with a fact table that needs its changes.';
COMMENT ON COLUMN fact_loader.queue_table_deps.relevant_change_columns IS
$$Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data.$$;
COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_id IS
$$The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$;
COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_source_time IS
$$The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$;
COMMENT ON COLUMN fact_loader.queue_table_deps.insert_merge_proid IS
$$Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events.$$;
COMMENT ON COLUMN fact_loader.queue_table_deps.update_merge_proid IS
$$Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore update events.$$;
COMMENT ON COLUMN fact_loader.queue_table_deps.delete_merge_proid IS
$$Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore delete events.$$;
COMMENT ON COLUMN fact_loader.queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';

-- ---- queue_tables ----
COMMENT ON TABLE fact_loader.queue_tables IS 'Each queue table along with the base table to which it belongs.';
COMMENT ON COLUMN fact_loader.queue_tables.queue_table_id IS 'Unique identifier for queue tables.';
COMMENT ON COLUMN fact_loader.queue_tables.queue_table_relid IS 'The oid of the queue table itself regclass type to accept only valid relations.';
COMMENT ON COLUMN fact_loader.queue_tables.queue_of_base_table_relid IS 'The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations.';
COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS
$$Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table.$$;
COMMENT ON COLUMN fact_loader.queue_tables.queue_table_tz IS
$$**NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ.
If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz.$$;
COMMENT ON COLUMN fact_loader.queue_tables.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.queue_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';
COMMENT ON COLUMN fact_loader.queue_tables.purge IS 'Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table.';
COMMENT ON VIEW fact_loader.unresolved_failures IS 'Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring.';

/* pg_fact_loader--1.3--1.4.sql */
-- Upgrade script 1.3 -> 1.4: redefines fact_loader.load() and fact_loader.sql_builder(),
-- and re-creates raw_queued_changes() with a new signature (dropped here first).
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

DROP FUNCTION fact_loader.raw_queued_changes(int);
ALTER TABLE fact_loader.debug_process_queue DROP CONSTRAINT debug_process_queue_pkey;

-- Run a single load cycle for one fact table: build the process_queue from the
-- queue tables, execute the resulting merge calls, then advance cutoff metadata.
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
    v_process_queue_sql text;
    v_execute_sql text;
    v_metadata_update_sql text;
    v_debug_rec record;
    v_debug_text text = '';
BEGIN
/***
There are 3 basic steps to this load:
    1. Gather all queue table changes and insert them into a consolidated process_queue
    2. Build and execute the SQL from the process_queue (via fact_loader.execute_queue)
    3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
*/
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
-- COALESCE to a harmless no-op SELECT when there is nothing queued
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG.
*/
IF current_setting('log_min_messages') = 'debug3' THEN
    INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date)
    -- the row timestamps are not populated, so we set them here
    SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date
    FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.
last_refresh_succeeded will be marked true always for now.
It could in the future be used to indicate a failure in case of a caught error.
*/
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
    format(
    $$UPDATE fact_loader.fact_tables ft
        SET last_refresh_attempted_at = now(),
        last_refresh_succeeded = TRUE
     WHERE fact_table_id = %s;
    $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

-- Build (as text, never executing it here) all of the SQL needed to process queued
-- changes for one fact table: the raw queue-id query, the gathered-key query, the
-- process_queue population statement, and the cutoff-metadata update statement.
CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
              gathered_queued_changes_sql text,
              process_queue_sql text,
              metadata_update_sql text) AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.

In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null.

Otherwise, they must be specified separately.
*/
WITH RECURSIVE queue_deps_with_insert_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'I' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_update_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'U' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_delete_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'D' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

/****
Recursively build the SQL for any INSERT events found in the queues.

The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data,
in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id.
For an example of this, see the test cases involving the test.order_product_promos table.
Each extra recursion level appends one more "INNER JOIN ... j<level>" hop and the
corresponding key column to the SQL fragment being accumulated as text.
*/
, insert_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    -- Fragment selecting the fact key: either directly from the base table (b.) or
    -- from the joined-to relation alias j<level>; multi-column keys are unnested.
    CASE
      WHEN is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', b.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
        END
      WHEN join_return_is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
        END
      ELSE ''
    END AS key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
           E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_insert_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE
      WHEN c.is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
          -- NOTE(review): this branch mixes the 'b.' prefix with a ',j<level>' separator
          -- ("unnest(b.col1,j1col2)"); looks inconsistent with the level-1 branch above --
          -- confirm against upstream pg_fact_loader before relying on multi-column keys here.
          ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
        END
      WHEN join_return_is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
        END
      ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
           E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM insert_sql_builder r
  INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

-- Identical construction to insert_sql_builder but driven by the 'U'-scoped retrievals.
, update_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE
      WHEN is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', b.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
        END
      WHEN join_return_is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
        END
      ELSE ''
    END AS key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
           E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_update_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE
      WHEN c.is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
        END
      WHEN join_return_is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
        END
      ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
           E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM update_sql_builder r
  INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

, delete_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead
    CASE
      WHEN is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
          ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key'
        END
      WHEN join_return_is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
          ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key'
        END
      ELSE ''
    END AS delete_key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE format($$
      --Join to either the base table, or the audit table, one of which
      --will be missing the key in a delete case
        INNER JOIN LATERAL (
         SELECT %s FROM %s jb WHERE %s = %s
         UNION ALL
         SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
        $$, quote_ident(return_columns_from_join[1]),
            join_to_relation::TEXT,
            (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
            'jb.'||quote_ident(join_to_column),
            '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
            join_to_relation_queue::TEXT,
            (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
            '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
    /****
    We use the higher level here just to be consistent with aliases from insert/update key retrieval
     */
            'j'||level
            )
    END AS delete_key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_delete_retrieval
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    delete_key_select_column||CASE
      WHEN c.is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
          ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key'
        END
      WHEN join_return_is_fact_key
        THEN
        CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
          -- NOTE(review): this literal opens "unnest(array[" but the closing fragment
          -- is ')' with no ']' -- the generated SQL would be unbalanced. Verify against
          -- upstream; may only matter for multi-column join keys on nested deletes.
          ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
        END
      ELSE ''
    END AS delete_key_select_column,
    delete_key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE format($$
      --Join to either the base table, or the audit table, one of which
      --will be missing the key in a delete case
        INNER JOIN LATERAL (
         SELECT %s FROM %s jb WHERE %s = %s
         UNION ALL
         SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
        $$, quote_ident(return_columns_from_join[1]),
            join_to_relation::TEXT,
            'j'||r.level||'.'||quote_ident(return_columns[1]),
            'jb.'||quote_ident(join_to_column),
            '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
            join_to_relation_queue::TEXT,
            'j'||r.level||'.'||quote_ident(return_columns[1]),
            '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
    /****
    We use the higher level here just to be consistent with aliases from insert/update key retrieval
     */
            'j'||c.level
            )
    END AS delete_key_retrieval_sql,
    r.source_change_date_select
  FROM delete_sql_builder r
  INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

-- Per-queue-table-dep shared SQL fragments (select lists, cutoff window, tz correction).
, field_vars AS (
  SELECT *,
    format($$
    %s AS fact_table_id,
    %s AS queue_table_dep_id,
    %s::INT AS fact_table_dep_id,
    %s::INT AS fact_table_dep_queue_table_dep_id,
    %s AS queue_table_id_field,
    q.fact_loader_batch_id,
    %s::TIMESTAMPTZ AS maximum_cutoff_time,
    -- We must not ignore ids which are above maximum_cutoff_time
    -- but below the highest id which is below maximum_cutoff_time
    MIN(q.fact_loader_batch_id) FILTER
    ( WHERE %s %s > %s::TIMESTAMPTZ)
    OVER() AS min_missed_id
    $$,
    fact_table_id,
    queue_table_dep_id,
    (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END),
    (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END),
    'q.'||quote_ident(queue_table_id_field),
    quote_literal(maximum_cutoff_time),
    'q.'||quote_ident(queue_table_timestamp),
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
    quote_literal(maximum_cutoff_time)
    )
    AS inner_shared_select_columns,
    $$
    fact_table_id,
    queue_table_dep_id,
    fact_table_dep_id,
    fact_table_dep_queue_table_dep_id,
    queue_table_id_field,
    fact_loader_batch_id,
    maximum_cutoff_time,
    min_missed_id
    $$
    AS outer_shared_select_columns,
    -- Applied whenever changed_at is a naive TIMESTAMP stored in queue_table_tz
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END
    AS changed_at_tz_correction
  FROM fact_loader.queue_deps_all c
  WHERE c.fact_table_id = p_fact_table_id
)

, non_recursive_sql AS (
  SELECT
  /****
  Separate select list for:
    - raw queue_ids from queue tables
    - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables
   */
  -- gathering all queue_ids from queue tables
    queue_table_dep_id,
    outer_shared_select_columns,
    format($$
    %s,
    %s %s AS changed_at,
    %s AS queue_table_id
    $$,
    inner_shared_select_columns,
    'q.'||quote_ident(queue_table_timestamp),
    changed_at_tz_correction,
    queue_table_id
    )
    AS inner_metadata_select_columns,
    format($$
    %s,
    queue_table_id
    $$,
    outer_shared_select_columns
    )
    AS outer_metadata_select_columns,
  -- gathering actual keys to update in fact tables by joining from queue_ids to source tables
    format($$
    %s,
    %s AS operation,
    %s %s AS changed_at,
    %s::REGPROC AS insert_merge_proid,
    %s::REGPROC AS update_merge_proid,
    %s::REGPROC AS delete_merge_proid
    $$,
    inner_shared_select_columns,
    'q.'||quote_ident(queue_table_op),
    'q.'||quote_ident(queue_table_timestamp),
    changed_at_tz_correction,
    CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END,
    CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END,
    CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END
    )
    AS inner_data_select_columns,
    format($$
    %s,
    operation,
    changed_at,
    insert_merge_proid,
    update_merge_proid,
    delete_merge_proid,
    key,
    source_change_date
    $$,
    outer_shared_select_columns
    )
    AS outer_data_select_columns,
  -- This is simply the queue table aliased as q
    format('%s q', queue_table_relid::TEXT) AS queue_table_aliased,
  -- This is the SQL to join from the queue table to the base table
    format($$
    INNER JOIN %s b
      ON q.%s::%s = b.%s
    $$,
    queue_of_base_table_relid::TEXT,
    quote_ident(queue_table_key),
    queue_of_base_table_key_type,
    quote_ident(queue_of_base_table_key))
    AS base_join_sql,
  -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process.
  -- There is a further filter based on the window min_missed_id after this subquery
    format($$
    %s
    $$,
    CASE
      WHEN last_cutoff_id IS NOT NULL
        THEN 'q.fact_loader_batch_id > '||last_cutoff_id
      ELSE
        'TRUE'
    END)
    AS inner_global_where_sql,
    format($$
    %s < %s %s
    AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id))
    $$,
    quote_ident(c.queue_table_timestamp),
    quote_literal(c.maximum_cutoff_time),
    changed_at_tz_correction)
    AS outer_global_where_sql,
    format($$
    AND q.%s = 'I'
    $$,
    queue_table_op)
    AS where_for_insert_sql,
    format($$
    AND (q.%s = 'U' AND %s)
    $$,
    queue_table_op,
    CASE
      WHEN relevant_change_columns IS NULL
        THEN 'TRUE'
      ELSE
        format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,','))
    END)
    AS where_for_update_sql,
    format($$
    AND q.%s = 'D'
    $$,
    queue_table_op)
    AS where_for_delete_sql
  FROM field_vars c
)

-- Only the deepest recursion level per queue_table_dep carries the fully-built SQL.
, insert_sql_builder_final AS
(SELECT DISTINCT ON (queue_table_dep_id) *
  FROM insert_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

, update_sql_builder_final AS
(SELECT DISTINCT ON (queue_table_dep_id) *
  FROM update_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

, delete_sql_builder_final AS
(SELECT DISTINCT ON (queue_table_dep_id) *
  FROM delete_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

-- Assemble one I, one U, and one D query per queue table, plus the metadata-only query.
, all_queues_sql AS (
SELECT
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    %s
    WHERE %s ) sub
  WHERE %s
  $$,
  nrs.outer_data_select_columns,
  nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select,
  nrs.queue_table_aliased||nrs.base_join_sql,
  isbf.key_retrieval_sql,
  nrs.inner_global_where_sql||nrs.where_for_insert_sql,
  nrs.outer_global_where_sql) AS queue_insert_sql,
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    %s
    WHERE %s ) sub
  WHERE %s
  $$,
  nrs.outer_data_select_columns,
  nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select,
  nrs.queue_table_aliased||nrs.base_join_sql,
  usbf.key_retrieval_sql,
  nrs.inner_global_where_sql||nrs.where_for_update_sql,
  nrs.outer_global_where_sql) AS queue_update_sql,
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    %s
    WHERE %s ) sub
  WHERE %s
  $$,
  nrs.outer_data_select_columns,
  -- NOTE(review): uses usbf.source_change_date_select (the UPDATE builder's fragment)
  -- alongside dsbf's delete columns; confirm against upstream whether dsbf was intended.
  -- Deletes also deliberately omit base_join_sql: the base row is already gone.
  nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select,
  nrs.queue_table_aliased,
  dsbf.delete_key_retrieval_sql,
  nrs.inner_global_where_sql||nrs.where_for_delete_sql,
  nrs.outer_global_where_sql) AS queue_delete_sql,
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    WHERE %s ) sub
  WHERE %s
  $$,
  nrs.outer_metadata_select_columns,
  nrs.inner_metadata_select_columns,
  nrs.queue_table_aliased,
  nrs.inner_global_where_sql,
  nrs.outer_global_where_sql) AS queue_ids_sql
FROM non_recursive_sql nrs
INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id
INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id
INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id
)

, final_queue_sql AS
(SELECT string_agg(
  /****
  This first UNION is to union together INSERT, UPDATE,
  and DELETE events for a single queue table
   */
  format($$
  %s
  UNION ALL
  %s
  UNION ALL
  %s
  $$,
  queue_insert_sql,
  queue_update_sql,
  queue_delete_sql)
  /****
  This second UNION as the second arg of string_agg
  is the union together ALL queue tables for this fact table
   */
  , E'\nUNION ALL\n') AS event_sql,
  string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out
FROM all_queues_sql)

-- Wrap the unioned per-queue SQL into the four output statements returned to the caller.
, final_outputs AS (
SELECT raw_queued_changes_sql_out,
$$
WITH all_changes AS (
($$||event_sql||$$)
ORDER BY changed_at)

, base_execution_groups AS
(SELECT fact_table_id,
  queue_table_dep_id,
  queue_table_id_field,
  operation,
  changed_at,
  source_change_date,
  insert_merge_proid,
  update_merge_proid,
  delete_merge_proid,
  maximum_cutoff_time,
  key,
  CASE WHEN operation = 'I' THEN insert_merge_proid
  WHEN operation = 'U' THEN update_merge_proid
  WHEN operation = 'D' THEN delete_merge_proid
  END AS proid,
  RANK() OVER (
    PARTITION BY
    CASE
      WHEN operation = 'I' THEN insert_merge_proid
      WHEN operation = 'U' THEN update_merge_proid
      WHEN operation = 'D' THEN delete_merge_proid
    END
  ) AS execution_group
FROM all_changes
WHERE key IS NOT NULL)

SELECT fact_table_id, proid, key, source_change_date
FROM base_execution_groups beg
WHERE proid IS NOT NULL
GROUP BY execution_group, fact_table_id, proid, key, source_change_date
/****
This ordering is particularly important for date-range history tables
where order of inserts is critical and usually expected to follow a pattern
***/
ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field);
$$ AS gathered_queued_changes_sql_out
,
$$
DROP TABLE IF EXISTS process_queue;
CREATE TEMP TABLE process_queue
(process_queue_id serial,
 fact_table_id int,
 proid regproc,
 key_value text,
 source_change_date date);
INSERT INTO process_queue
(fact_table_id, proid, key_value, source_change_date)
$$ AS process_queue_snippet,
$$
WITH all_ids AS
($$||raw_queued_changes_sql_out||$$)

, new_metadata AS
(SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
  maximum_cutoff_time,
  queue_table_dep_id
FROM all_ids
--Exclude dependent fact tables from updates directly to queue_table_deps
WHERE fact_table_dep_id IS NULL
GROUP BY queue_table_dep_id, maximum_cutoff_time)

/****
The dependent fact table uses the same queue_table_id_field as last_cutoff
We are going to update fact_table_deps metadata instead of queue_table_deps
****/
, new_metadata_fact_dep AS
(SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
  maximum_cutoff_time,
  fact_table_dep_queue_table_dep_id
FROM all_ids
--Include dependent fact tables only
WHERE fact_table_dep_id IS NOT NULL
GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time)

, update_key AS (
SELECT qdwr.queue_table_dep_id,
  --Cutoff the id to that newly found, otherwise default to last value
  COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
  --This cutoff time must always be the same for all queue tables for given fact table.
  --Even if there are no new records, we move this forward to wherever the stream is at
  qdwr.maximum_cutoff_time AS last_cutoff_source_time
FROM fact_loader.queue_deps_all qdwr
LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id
WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
  --Exclude dependent fact tables from updates directly to queue_table_deps
  AND qdwr.fact_table_dep_id IS NULL
)

/****
This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up
****/
, update_key_fact_dep AS (
SELECT qdwr.fact_table_dep_queue_table_dep_id,
  qdwr.fact_table_id,
  COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
  qdwr.maximum_cutoff_time AS last_cutoff_source_time
FROM fact_loader.queue_deps_all qdwr
LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id
WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
  --Include dependent fact tables only
  AND qdwr.fact_table_dep_id IS NOT NULL
)

, updated_queue_table_deps AS (
UPDATE fact_loader.queue_table_deps qtd
SET last_cutoff_id = uk.last_cutoff_id,
  last_cutoff_source_time = uk.last_cutoff_source_time
FROM update_key uk
WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id
RETURNING qtd.*)

, updated_fact_table_deps AS (
UPDATE fact_loader.fact_table_dep_queue_table_deps ftd
SET last_cutoff_id = uk.last_cutoff_id,
  last_cutoff_source_time = uk.last_cutoff_source_time
FROM update_key_fact_dep uk
WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id
RETURNING uk.*)

UPDATE fact_loader.fact_tables ft
SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time,
  last_refresh_attempted_at = now(),
  last_refresh_succeeded = TRUE
FROM
(SELECT fact_table_id, last_cutoff_source_time
FROM updated_queue_table_deps
--Must use UNION to get only distinct values
UNION
SELECT fact_table_id, last_cutoff_source_time
FROM updated_fact_table_deps) uqtd
WHERE uqtd.fact_table_id = ft.fact_table_id;
$$ AS metadata_update_sql_out
FROM final_queue_sql)

SELECT raw_queued_changes_sql_out,
  gathered_queued_changes_sql_out
  ,
  format($$
  %s
  %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out,
  metadata_update_sql_out
FROM final_outputs;

$BODY$
LANGUAGE SQL;

-- Thin executor: runs the raw_queued_changes_sql produced by sql_builder() and
-- streams the queue-id rows back; no data is modified here.
CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT,
  queue_table_dep_id INT,
  fact_table_dep_id INT,
  fact_table_dep_queue_table_dep_id INT,
  queue_table_id_field BIGINT,
  fact_loader_batch_id BIGINT,
  maximum_cutoff_time TIMESTAMPTZ,
  min_missed_id BIGINT,
  queue_table_id INT
) AS
$BODY$
DECLARE
  v_raw_sql text;
BEGIN

SELECT raw_queued_changes_sql
INTO v_raw_sql
FROM fact_loader.sql_builder(p_fact_table_id);

RETURN QUERY EXECUTE v_raw_sql;

END;
$BODY$
LANGUAGE plpgsql;

COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.';
pg_fact_loader-2.0.1/pg_fact_loader--1.5--1.6.sql000066400000000000000000000414521451107006500210570ustar00rootroot00000000000000/* pg_fact_loader--1.5--1.6.sql */
-- Upgrade script 1.5 -> 1.6: converts every REGPROC column to TEXT (guarded by
-- CHECK constraints that the text still casts to a valid regproc), then
-- re-creates the views that depended on those columns.
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

-- These views reference the columns being retyped below, so drop them first;
-- they are re-created at the end of this script.
DROP VIEW fact_loader.queue_deps_all_with_retrieval;
DROP VIEW fact_loader.queue_deps_all;
DROP VIEW fact_loader.prioritized_jobs;

-- Must ensure we have the fully schema-qualified regproc before converting to text
SET search_path TO '';

-- Each pair below: retype the column to TEXT, then add a CHECK that the stored
-- text casts back to REGPROC ('boolin' is just a known-good fallback so the
-- COALESCE expression is non-null for NULL column values).
ALTER TABLE fact_loader.debug_process_queue ALTER COLUMN proid TYPE TEXT;
ALTER TABLE fact_loader.debug_process_queue
  ADD CONSTRAINT check_proid
  CHECK (COALESCE(proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps
  ADD CONSTRAINT check_delete_merge_proid
  CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps
  ADD CONSTRAINT check_insert_merge_proid
  CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps
  ADD CONSTRAINT check_update_merge_proid
  CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_delete_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_deps
  ADD CONSTRAINT check_default_delete_merge_proid
  CHECK (COALESCE(default_delete_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_deps
  ADD CONSTRAINT check_default_insert_merge_proid
  CHECK (COALESCE(default_insert_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_update_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_deps
  ADD CONSTRAINT check_default_update_merge_proid
  CHECK (COALESCE(default_update_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_tables ALTER COLUMN daily_scheduled_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_tables
  ADD CONSTRAINT check_daily_scheduled_proid
  CHECK (COALESCE(daily_scheduled_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_tables ALTER COLUMN fact_table_agg_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_tables
  ADD CONSTRAINT check_fact_table_agg_proid
  CHECK (COALESCE(fact_table_agg_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.queue_table_deps
  ADD CONSTRAINT check_delete_merge_proid
  CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.queue_table_deps
  ADD CONSTRAINT check_insert_merge_proid
  CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.queue_table_deps
  ADD CONSTRAINT check_update_merge_proid
  CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL);

RESET search_path;

-- Which enabled jobs are currently runnable, most-urgent first.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
WITH jobs_with_daily_variables AS (
SELECT
   ft.*,
  /***
  Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
  */
  (--If this is the first run of a scheduled job, it is eligible
      ft.last_refresh_attempted_at IS NULL
    OR (
       --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible
        (
         ft.last_refresh_succeeded AND
         ft.last_refresh_attempted_at::DATE <
          -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent
          (now() AT TIME ZONE COALESCE(
                                ft.daily_scheduled_tz,
                                base.daily_scheduled_tz
                                )
          )::DATE
        )
        OR
       --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time
        NOT ft.last_refresh_succeeded
      )
  ) AS daily_not_attempted_today,

  (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME
    BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed,

  base.use_daily_schedule
    AND base.last_refresh_succeeded
    AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE
    AS daily_base_job_finished,

  ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent,

  -- This should only be used in combination with daily_has_only_one_parent
  parent.use_daily_schedule
    AND parent.last_refresh_succeeded
    AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE
    AS parent_job_finished
FROM fact_loader.fact_tables ft
LEFT JOIN LATERAL
  (SELECT ftb.use_daily_schedule,
    ftb.last_refresh_succeeded,
    ftb.last_refresh_attempted_at,
    ftb.daily_scheduled_tz
  FROM fact_loader.fact_tables ftb
  WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE
LEFT JOIN LATERAL
  (SELECT ftp.use_daily_schedule,
    ftp.last_refresh_succeeded,
    ftp.last_refresh_attempted_at,
    ftp.daily_scheduled_tz
  FROM fact_loader.fact_tables ftp
  WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE
WHERE enabled
)

, jobs_with_daily_schedule_eligibility AS (
SELECT
   *,
   --Only run this job according to the same day of the daily_scheduled_time
   --according to configured timezone
   (use_daily_schedule AND daily_not_attempted_today
     AND
    (
      daily_scheduled_time_passed
      OR
     (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished))
    )
   ) AS daily_schedule_eligible
FROM jobs_with_daily_variables)

SELECT *
FROM jobs_with_daily_schedule_eligibility
WHERE NOT use_daily_schedule OR daily_schedule_eligible
ORDER BY
  CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
  --If a job has a daily schedule, once the time has come for the next refresh,
  --prioritize it first
  CASE
    WHEN daily_schedule_eligible
      THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME
    ELSE NULL
  END NULLS LAST,
  --This may be improved in the future but is a good start
  last_refresh_attempted_at NULLS FIRST,
  priority
;

-- All queue-table dependency metadata for loading, including nested (fact-on-fact)
-- dependencies; the recursive CTE walks dependency chains to find cutoff windows.
CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS
WITH RECURSIVE fact_table_dep_cutoffs AS
(SELECT
    1 AS level
  , qtd.queue_table_dep_id
  , ftdqc.fact_table_dep_id
  , ftdqc.fact_table_dep_queue_table_dep_id
  --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
  --fact table has been updated
  , qtd.last_cutoff_id AS dep_maximum_cutoff_id
  , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time
  , ftd.parent_id AS parent_fact_table_id
  , ftd.child_id AS child_fact_table_id
  , ftd.child_id AS base_fact_table_id
  , queue_table_id
  , relevant_change_columns
  , ftdqc.last_cutoff_id
  , ftdqc.last_cutoff_source_time
  , ftdqc.insert_merge_proid
  , ftdqc.update_merge_proid
  , ftdqc.delete_merge_proid
  FROM fact_loader.queue_table_deps qtd
  INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
  INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id
  UNION ALL
  /****
  In this recursive part, we walk UP the chain to the base level in order to get the
  last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those.
The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; pg_fact_loader-2.0.1/pg_fact_loader--1.5.sql000066400000000000000000006035561451107006500205110ustar00rootroot00000000000000/* pg_fact_loader--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE FUNCTION fact_loader._launch_worker(oid) RETURNS pg_catalog.INT4 STRICT AS 'MODULE_PATHNAME', 'pg_fact_loader_worker' LANGUAGE C; CREATE FUNCTION fact_loader.launch_worker() RETURNS pg_catalog.INT4 STRICT AS 'SELECT fact_loader._launch_worker(oid) FROM pg_database WHERE datname = current_database();' LANGUAGE SQL; CREATE TABLE fact_loader.fact_tables ( fact_table_id SERIAL PRIMARY KEY, fact_table_relid REGCLASS NOT NULL, fact_table_agg_proid REGPROC NULL, --This may only be used to generate a merge function but is not used in automation enabled BOOLEAN NOT NULL DEFAULT FALSE, priority INT, attempt_number INT, retries_allowed INT DEFAULT 0, force_worker_priority BOOLEAN NOT NULL DEFAULT FALSE, last_refresh_source_cutoff TIMESTAMPTZ, last_refresh_attempted_at TIMESTAMPTZ, --TODO - answer if we want the worker to bail or record messages on ERROR (or both) last_refresh_succeeded BOOLEAN, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_tables UNIQUE (fact_table_relid) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables', ''); CREATE TABLE fact_loader.fact_table_deps ( fact_table_dep_id SERIAL PRIMARY KEY, parent_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), child_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), /***** In very many cases, you will use the same procs for insert, update, and delete even with multiple dependencies. 
This is why you must give defaults here which will be used to auto-populate fact_loader.fact_table_dep_queue_table_deps which can be overridden if necessary for each queue table. After you configure all of your fact tables and queue tables, run the function refresh_fact_table_dep_queue_table_deps manually to populate fact_table_dep_queue_table_deps, then make any changes as necessary. You can see an example of this in the test suite "seeds" file. You can also see an override example with order_emails_fact having a different proc for orders and reorders delete cases. */ default_insert_merge_proid REGPROC NOT NULL, default_update_merge_proid REGPROC NOT NULL, default_delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_deps UNIQUE (parent_id, child_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps', ''); CREATE TABLE fact_loader.queue_tables ( queue_table_id SERIAL PRIMARY KEY, queue_table_relid REGCLASS NOT NULL, queue_of_base_table_relid REGCLASS NOT NULL, /**** NOTE - the reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information. */ pglogical_node_if_id INT NOT NULL, --This is the timezone for the changed_at column - if null, we assume it is timestamptz (we could check that actually) queue_table_tz TEXT, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_table UNIQUE (queue_table_relid), CONSTRAINT unique_base_table UNIQUE (queue_of_base_table_relid) ); COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$The reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. 
We hope that some time down the road, this will change, and we can derive this information.$$; SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables', ''); CREATE TABLE fact_loader.queue_table_deps ( queue_table_dep_id SERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), queue_table_id INT NOT NULL REFERENCES fact_loader.queue_tables (queue_table_id), relevant_change_columns NAME[], last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_deps UNIQUE (fact_table_id, queue_table_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps', ''); CREATE TABLE fact_loader.key_retrieval_sequences ( key_retrieval_sequence_id SERIAL PRIMARY KEY, queue_table_dep_id INT NOT NULL REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), /**** In almost all cases, we only need to write one way to retrieve keys. The only exception is, for example, when in a delete case, you need to pass a different field (customer_id instead of order_id) to the delete_merge_proid function. You then need a different key_retrieval_sequence to handle a different field name for this delete case. By default this is NULL, meaning there is no filter, meaning the sequence applies to all events I, U, D. Otherwise, you can add scopes in which case you must have one for each of 'I','U','D'. 
*/ filter_scope CHAR(1) NULL, level INT NOT NULL, return_columns NAME[] NOT NULL, is_fact_key BOOLEAN NOT NULL, join_to_relation REGCLASS NULL, join_to_column NAME NULL, return_columns_from_join NAME[] NULL, join_return_is_fact_key BOOLEAN NULL, CONSTRAINT unique_retrievals UNIQUE (queue_table_dep_id, filter_scope, level), CONSTRAINT valid_scopes CHECK (filter_scope IN ('I','U','D')) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences', ''); CREATE TABLE fact_loader.fact_table_dep_queue_table_deps ( fact_table_dep_queue_table_dep_id SERIAL PRIMARY KEY, fact_table_dep_id INT REFERENCES fact_loader.fact_table_deps (fact_table_dep_id), queue_table_dep_id INT REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_cutoffs UNIQUE (fact_table_dep_id, queue_table_dep_id) ); CREATE OR REPLACE FUNCTION fact_loader.unique_scopes() RETURNS TRIGGER AS $BODY$ BEGIN IF (NEW.filter_scope IS NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NOT NULL )) OR (NEW.filter_scope IS NOT NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NULL )) THEN RAISE EXCEPTION $$You must either use a NULL filter_scope to cover all 3 events I, U, D or you must specify all 3 events separately I, U, D (For queue_table_dep_id %). 
$$, NEW.queue_table_dep_id; END IF; RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER unique_scopes BEFORE INSERT OR UPDATE ON fact_loader.key_retrieval_sequences FOR EACH ROW EXECUTE PROCEDURE fact_loader.unique_scopes(); /*** This table is unlogged because it only has data mid-transaction and should always be empty */ CREATE UNLOGGED TABLE fact_loader.process_queue ( process_queue_id BIGSERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), proid REGPROC NOT NULL, key_value TEXT NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ ); CREATE OR REPLACE FUNCTION fact_loader.set_row_updated_at_to_now() RETURNS TRIGGER AS $BODY$ BEGIN NEW.row_updated_at = now(); RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_dep_queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.process_queue FOR EACH ROW WHEN 
(NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TYPE fact_loader.table_load_type AS ENUM('delta','full_refresh'); CREATE OR REPLACE FUNCTION fact_loader.create_table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS REGPROC AS $BODY$ DECLARE v_new_proc TEXT; v_sql TEXT; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT function_name, function_sql INTO v_new_proc, v_sql FROM fact_loader.table_loader_function(p_source_proc, p_destination_relation, p_ignore_diff_for_columns); EXECUTE v_sql; RETURN v_new_proc::REGPROC; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id, proid, key_value, --TODO - either infer the data type of the function args, which is not super easy with postgres, --or add configuration fields for the name and data type of these. 
This will suffice for now --because we only have integer args for all functions 'integer' AS queue_of_base_table_key_type FROM fact_loader.process_queue pq WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT format('%s(%s::%s)', proid::TEXT, 'key_value', queue_of_base_table_key_type) AS function_call, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id) AS execute_sql FROM with_rank GROUP BY execution_group, function_call ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is the actual load to the destination table, and assumes that 'prepare' phase has already been run, which is supposed to have gathered the actual minimal delta and determine what to do here. 
*/ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT execute_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_insert_to_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Using the process_queue data, execute the delta load of the fact table 3. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT insert_to_process_queue_sql, metadata_update_sql INTO v_insert_to_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue */ RAISE LOG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_insert_to_process_queue_sql; EXECUTE COALESCE(v_insert_to_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') LIKE 'debug%' THEN FOR v_debug_rec IN SELECT * FROM fact_loader.process_queue LOOP v_debug_text = v_debug_text||E'\n'||format('%s', v_debug_rec.process_queue_id||chr(9)||v_debug_rec.fact_table_id||chr(9)||v_debug_rec.proid||chr(9)||v_debug_rec.key_value); END LOOP; IF v_debug_text <> '' THEN v_debug_text = E'\n'||format('%s', (SELECT string_agg(column_name,chr(9)) FROM information_schema.columns WHERE table_name = 'process_queue' AND table_schema = 'fact_loader' AND column_name NOT LIKE 'row_%_at')) ||v_debug_text; RAISE DEBUG '%', v_debug_text; END IF; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE LOG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE LOG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.prepare_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is not going to lock any of the destination table for writing, which is precisely why it is separated from the 'execute' phase which actually writes to the table in the shortest transaction possible. */ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT prepare_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. 
*/ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s); $$, queue_table_relid, queue_table_id_field, min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE LOG 'Purging Queue: %', v_sql; EXECUTE v_sql; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. 
*/ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist */ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, 
queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.table_loader: generates (but does not execute) the SQL needed to merge
-- p_source_relation into p_destination_relation, matching rows on the destination's
-- primary key (which is required).  p_load_type 'delta' upserts changed keys only;
-- 'full_refresh' also deletes keys no longer present in source.  Columns listed in
-- p_ignore_diff_for_columns are excluded from change detection (e.g. a load_dttm
-- column).  Returns the generated prepare/execute SQL plus arrays of columns that
-- could not be mapped by identical name in either direction.
CREATE OR REPLACE FUNCTION fact_loader.table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type) RETURNS TABLE (prepare_sql text, execute_sql text, unmapped_src_columns text[], unmapped_dest_columns text[]) AS $BODY$ DECLARE v_pkey_fields TEXT[]; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT array_agg(a.attname ORDER BY pk.rn) INTO v_pkey_fields FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik; RETURN QUERY WITH source_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_source_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , destination_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , unmapped_source_columns AS ( SELECT
array_agg(s.column_name::text) AS unmapped_columns_src FROM source_columns s WHERE NOT EXISTS (SELECT 1 FROM destination_columns d WHERE d.column_name = s.column_name) ) , unmapped_dest_columns AS ( SELECT array_agg(d.column_name::text) AS unmapped_columns_dest FROM destination_columns d WHERE NOT EXISTS (SELECT 1 FROM source_columns s WHERE d.column_name = s.column_name) ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT unnest AS pkey_field FROM unnest(v_pkey_fields)) pk ) , info AS ( SELECT string_agg( CASE WHEN sc.column_name IS NOT NULL THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN sc.column_name IS NOT NULL AND (p_ignore_diff_for_columns IS NULL OR sc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN sc.column_name IS NOT NULL AND NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.'
|| dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_join FROM destination_columns dc CROSS JOIN pkeys LEFT JOIN source_columns sc ON dc.column_name = sc.column_name GROUP BY pkeys.pkey_fields, pkeys.pkey_join ) , sql_snippets AS ( SELECT $$ DROP TABLE IF EXISTS count_tracker; CREATE TEMP TABLE count_tracker (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)); INSERT INTO count_tracker VALUES (NULL, NULL, FALSE, NULL); $$::TEXT AS count_tracker_sql , $$ DROP TABLE IF EXISTS actual_delta; CREATE TEMP TABLE actual_delta AS WITH final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_source_relation::TEXT||$$ EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ DROP TABLE IF EXISTS removed_keys; CREATE TEMP TABLE removed_keys AS SELECT $$||pkey_fields||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE NOT EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$); $$ AS removed_keys_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm
SELECT * FROM $$||p_source_relation::TEXT||$$ s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ); $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql , $$ /*** We add a primary key to the actual_delta table to ensure there are no duplicate keys.
***/ ALTER TABLE actual_delta ADD PRIMARY KEY ($$||pkey_fields||$$); $$ AS add_delta_pkey_sql , $$ /**** This part is not implemented yet, but partially complete. If we decide we want to figure out that >50% of the table will be updated, we could decide to truncate. But then we have to balance the desire for that with more read queries to figure it out. To implement, add the type full_refresh_truncate to fact_loader.table_load_type, and uncomment code. We would also have to add the logic to find actual keys added, then subtract it from actual_delta to get the net updates expected. If this is over 50%, we should truncate and re-insert all data. ***/ DROP TABLE IF EXISTS percent_of_destination; CREATE TEMP TABLE percent_of_destination AS SELECT (((SELECT COUNT(1) FROM actual_delta) - (SELECT COUNT(1) FROM added_keys))::NUMERIC / (SELECT COUNT(1) FROM $$||p_destination_relation::TEXT||$$)::NUMERIC)::NUMERIC(8,2) AS pct; UPDATE count_tracker SET pct_dest = (SELECT pct FROM percent_of_destination); $$ AS percent_change_sql ,$$ DO $LOCK_SAFE_DDL$ BEGIN SET lock_timeout TO '10ms'; IF (SELECT pct FROM percent_of_destination) >= 0.5 THEN LOOP BEGIN TRUNCATE $$||p_destination_relation::TEXT||$$; UPDATE count_tracker SET truncated = true; EXIT; EXCEPTION WHEN lock_not_available THEN RAISE WARNING 'Could not obtain immediate lock for SQL %, retrying', p_sql; PERFORM pg_sleep(3); WHEN OTHERS THEN RAISE; END; END LOOP; END IF; RESET lock_timeout; END $LOCK_SAFE_DDL$ ; $$ AS lock_safe_truncate_sql ,$$ --Delete keys that are no longer in your new version
DELETE FROM $$||p_destination_relation::TEXT||$$ d WHERE EXISTS (SELECT 1 FROM removed_keys s WHERE $$||pkey_join||$$); GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET deleted = v_row_count; $$ AS delete_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET
$$||upsert_list||$$ ; GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET upserted = v_row_count; $$ AS upsert_sql FROM info ) SELECT count_tracker_sql|| CASE /*** not implemented truncate pattern WHEN p_load_type IN('full_refresh','full_refresh_truncate') THEN ***/ WHEN p_load_type = 'full_refresh' THEN removed_keys_sql||actual_delta_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ WHEN p_load_type = 'delta' THEN actual_delta_sql||key_join_exists_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ END||$$ $$|| /*** not implemented truncate pattern CASE WHEN p_load_type = 'full_refresh_truncate' THEN percent_change_sql ELSE '' END ***/ '' AS prepare_sql , $$ --THIS SHOULD BE RUN IN A TRANSACTION
DO $SCRIPT$ DECLARE v_row_count INT; v_results RECORD; BEGIN $$|| CASE /*** not implemented truncate pattern WHEN p_load_type = 'full_refresh_truncate' THEN lock_safe_truncate_sql||delete_sql||upsert_sql ***/ WHEN p_load_type = 'full_refresh' THEN delete_sql||upsert_sql WHEN p_load_type = 'delta' THEN upsert_sql END||$$ FOR v_results IN SELECT * FROM count_tracker LOOP RAISE LOG 'upserted: %, deleted: %, truncated: %, pct_dest: %', v_results.upserted, v_results.deleted, v_results.truncated, v_results.pct_dest; END LOOP; END $SCRIPT$; $$ AS execute_sql , (SELECT unmapped_columns_src FROM unmapped_source_columns) AS unmapped_src_columns , (SELECT unmapped_columns_dest FROM unmapped_dest_columns) AS unmapped_dest_columns FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.table_loader_function: generates the DDL for a <table>_merge function
-- that refreshes p_destination_relation from the rowset returned by p_source_proc,
-- upserting on the destination's primary key.  Source proc argument names are parsed
-- from pg_get_function_identity_arguments and passed through to the generated
-- function's signature.  p_ignore_diff_for_columns works as in table_loader above.
CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should.
This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER
BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm
SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily.
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.table_loader_validator: raises an exception listing any columns that
-- exist only in source or only in destination, unless p_ignore_unmapped_columns is
-- TRUE.  Used as a guard before deploying a generated merge function.
CREATE OR REPLACE FUNCTION fact_loader.table_loader_validator (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_unmapped_src_columns TEXT[], p_unmapped_dest_columns TEXT[], p_ignore_unmapped_columns BOOLEAN) RETURNS VOID AS $BODY$ DECLARE v_messages TEXT = ''; BEGIN IF NOT p_ignore_unmapped_columns AND p_unmapped_src_columns IS NOT NULL THEN v_messages = format($$You have unmapped columns (%s) in the source table %s. All source columns must be named identically to destination in order to map. If you are certain you want to ignore these columns, meaning they will not update anything in destination table %s, add the final argument to this function as TRUE. $$ , array_to_string(p_unmapped_src_columns,', ') , p_source_relation::TEXT , p_destination_relation::TEXT); END IF; IF NOT p_ignore_unmapped_columns AND p_unmapped_dest_columns IS NOT NULL THEN v_messages = v_messages||format($$ You have unmapped columns (%s) in the destination table %s. All destination columns must be named identically to source in order to map.
If you are certain you want to ignore these columns, meaning the source table %s does not contain all columns in destination table, add the final argument to this function as TRUE.$$ , array_to_string(p_unmapped_dest_columns,', ') , p_destination_relation::TEXT , p_source_relation::TEXT); END IF; IF v_messages <> '' THEN RAISE EXCEPTION '%', v_messages; END IF; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.worker (v1.0): refresh dependency metadata (under advisory lock
-- c_lock_cutoff_refresh), then loop over enabled fact tables in priority order and
-- load the first one whose per-table advisory lock can be acquired, purging queues
-- afterward.  Returns TRUE if a table was loaded, FALSE if none could be.
CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; c_lock_cutoff_refresh INT = 99995; BEGIN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.fact_tables WHERE enabled ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --This may be improved in the future but is a good start
last_refresh_attempted_at NULLS FIRST, priority LOOP IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = v_fact_record.fact_table_id) THEN --Load fact table
PERFORM fact_loader.load(v_fact_record.fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); RETURN TRUE; END IF; END LOOP; RETURN FALSE; END; $BODY$ LANGUAGE plpgsql;
/* pg_fact_loader--1.0--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit
-- Version 1.1 schema changes: support date-based fact tables via
-- pass_queue_table_change_date_at_tz, validate configured time zone names, and
-- relax NOT NULL on merge proc columns so a queue event type can be ignored.
ALTER TABLE fact_loader.key_retrieval_sequences ADD COLUMN pass_queue_table_change_date_at_tz TEXT NULL; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; ALTER TABLE fact_loader.key_retrieval_sequences ADD CONSTRAINT verify_valid_tz CHECK (pass_queue_table_change_date_at_tz IS NULL OR (now() AT TIME ZONE pass_queue_table_change_date_at_tz IS NOT NULL));
--This check constraint could have been added in v. 1.0
ALTER TABLE fact_loader.queue_tables ADD CONSTRAINT verify_valid_tz CHECK (queue_table_tz IS NULL OR (now() AT TIME ZONE queue_table_tz IS NOT NULL)); ALTER TABLE fact_loader.process_queue ADD COLUMN source_change_date DATE NULL; COMMENT ON COLUMN fact_loader.process_queue.source_change_date IS 'Corresponds to fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz. If this field is populated, a function will be expected that has args (key_value, source_change_date) based on this process_queue table.';
--This should have already been added in v. 1.0
SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_deps', ''); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL; ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid DROP NOT NULL, ALTER COLUMN default_update_merge_proid DROP NOT NULL, ALTER COLUMN default_delete_merge_proid DROP NOT NULL;
-- fact_loader.execute_queue: builds the SQL that executes every queued merge proc
-- for one fact table in process_queue_id order, grouping consecutive rows sharing
-- the same proid into one set-based statement that also deletes the processed rows
-- from fact_loader.process_queue.  If source_change_date is populated it is passed
-- as a second DATE argument to the merge proc.
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM fact_loader.process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second.
*/ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.table_loader_function (v1.1 revision): same generator as in 1.0 but
-- the generated merge function now restricts the destination side of the EXCEPT
-- diff to the key(s) passed as function arguments (pkey_join_to_arg) instead of
-- scanning the whole destination table.
CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101.
*/ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join, string_agg($$d.$$||quote_ident(pkey_field)||$$ = $$||(SELECT arg_params FROM function_schema),E'\nAND ') AS pkey_join_to_arg FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS
matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs , pkey_join_to_arg FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs, pkey_join_to_arg ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE $$||pkey_join_to_arg AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm
SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily.
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql;
/* pg_fact_loader--1.1--1.2.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit
--To support non-replicated queue tables
ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id DROP NOT NULL;
-- Version 1.2 additions: per-refresh logging, daily-scheduled (non-queue) jobs,
-- and the try_load wrapper that disables a job and logs on failure.
CREATE TABLE fact_loader.fact_table_refresh_logs (fact_table_refresh_log_id SERIAL PRIMARY KEY, fact_table_id INT REFERENCES fact_loader.fact_tables (fact_table_id), refresh_attempted_at TIMESTAMPTZ, messages TEXT); ALTER TABLE fact_loader.fact_tables ADD COLUMN use_daily_schedule BOOLEAN NOT NULL DEFAULT FALSE, ADD COLUMN daily_scheduled_time TIME NULL, ADD COLUMN daily_scheduled_tz TEXT NULL, ADD COLUMN daily_scheduled_proid REGPROC, ADD CONSTRAINT verify_valid_daily_tz CHECK (daily_scheduled_tz IS NULL OR (now() AT TIME ZONE daily_scheduled_tz IS NOT NULL)), ADD CONSTRAINT daily_schedule_configured_correctly CHECK ((NOT use_daily_schedule) OR (use_daily_schedule AND daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL));
-- unresolved_failures: jobs that are disabled because their most recent refresh
-- attempt failed, joined to that attempt's log entry.
CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT last_refresh_succeeded;
-- prioritized_jobs: enabled jobs in the order the worker should attempt them;
-- daily-scheduled jobs appear only once their scheduled time has passed for the
-- current day in their configured time zone.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS SELECT * FROM fact_loader.fact_tables WHERE enabled AND (NOT use_daily_schedule OR
--Only run this job according to the same day of the daily_scheduled_time
--according to configured timezone
( (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME ) ) ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
--If a job has a daily schedule, once the time has come for the next refresh,
--prioritize it first
CASE WHEN (use_daily_schedule AND (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME) THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST,
--This may be improved in the future but is a good start
last_refresh_attempted_at NULLS FIRST, priority ;
-- fact_loader.daily_scheduled_load: run the configured daily_scheduled_proid for a
-- daily-scheduled job, then mark the attempt as succeeded.  Returns FALSE if the
-- job is not configured for a daily schedule.
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; BEGIN /*** There are 2 basic steps to this load: 1. Execute the single daily-refresh function 2. Update the metadata indicating the last attempt time */ SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()' INTO v_execute_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id AND use_daily_schedule; IF v_execute_sql IS NULL THEN RETURN FALSE; END IF; EXECUTE v_execute_sql; UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = p_fact_table_id; RETURN TRUE; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.try_load: attempt one job under its advisory lock; on any error the
-- job is disabled, the failure is logged to fact_table_refresh_logs, and FALSE is
-- returned.  Safe for manual DBA use as well as the worker.
CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; BEGIN IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF;
--Load fact table and handle exceptions to auto-disable job and log errors in case of error
BEGIN
--Scheduled daily job
IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id);
--Queue-based job
ELSE PERFORM fact_loader.load(p_fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id; INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, messages) VALUES (p_fact_table_id, now(), SQLERRM); RETURN FALSE; END; ELSE RETURN FALSE; END IF; END; $BODY$ LANGUAGE plpgsql;
-- fact_loader.worker (v1.2 revision): walk prioritized_jobs and return after the
-- first successful try_load; FALSE if no job could be loaded.
CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN RETURN TRUE; END IF; END LOOP;
--If no jobs returned true, then return false
RETURN FALSE; END; $BODY$ LANGUAGE plpgsql;
/* pg_fact_loader--1.2--1.3.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit DROP VIEW IF EXISTS fact_loader.queue_deps_all_with_retrieval; DROP VIEW IF EXISTS fact_loader.queue_deps_all; DROP VIEW IF EXISTS fact_loader.logical_subscription; DROP VIEW IF EXISTS fact_loader.prioritized_jobs; DROP VIEW IF EXISTS fact_loader.unresolved_failures; DROP FUNCTION IF EXISTS fact_loader.sql_builder(int); CREATE OR REPLACE FUNCTION fact_loader.add_batch_id_fields() RETURNS VOID AS $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT queue_table_relid FROM fact_loader.queue_tables qt INNER JOIN pg_class c ON c.oid = qt.queue_table_relid INNER JOIN pg_namespace n ON n.oid = c.relnamespace WHERE NOT EXISTS (SELECT 1 FROM information_schema.columns col WHERE col.column_name = 'fact_loader_batch_id' AND col.table_schema = n.nspname AND col.table_name = c.relname) LOOP v_sql = format($F$ ALTER TABLE %s ADD COLUMN fact_loader_batch_id BIGINT DEFAULT nextval('fact_loader.batch_id'); $F$, v_rec.queue_table_relid::text, v_rec.queue_table_relid::text); RAISE LOG 'ADDING fact_loader_batch_id COLUMN TO queue table %: %', v_rec.queue_table_relid::text, v_sql; EXECUTE v_sql; END LOOP; END $BODY$ LANGUAGE plpgsql; ALTER TABLE fact_loader.queue_tables ADD COLUMN purge BOOLEAN NOT NULL DEFAULT TRUE; UPDATE fact_loader.fact_table_refresh_logs SET messages = jsonb_build_object('Message', messages) WHERE messages IS NOT NULL; --Will be re-added via \i in sql file ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN messages TYPE jsonb USING messages::jsonb; --This was a problem from the start ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id TYPE OID; --This should have been done from the start SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_de_fact_table_dep_queue_table_de_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps_fact_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables_fact_table_id_seq', ''); SELECT 
pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences_key_retrieval_sequence_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps_queue_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables_queue_table_id_seq', ''); --No indexes or anything but allow debugging CREATE UNLOGGED TABLE fact_loader.debug_process_queue (LIKE fact_loader.process_queue); ALTER TABLE fact_loader.debug_process_queue ADD PRIMARY KEY (process_queue_id); -- Now a temp table to avoid serialization contention DROP TABLE fact_loader.process_queue; --Make this a trigger to check dep fact tables ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_base_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_parent_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables DROP CONSTRAINT daily_schedule_configured_correctly; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_schedule_configured_correctly CHECK (NOT use_daily_schedule OR (use_daily_schedule AND ((daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL) OR (depends_on_base_daily_job_id IS NOT NULL AND depends_on_parent_daily_job_id IS NOT NULL)))); --These columns have never been used ALTER TABLE fact_loader.fact_tables DROP COLUMN attempt_number, DROP COLUMN retries_allowed; --This is the usual case and makes sense ALTER TABLE fact_loader.key_retrieval_sequences ALTER COLUMN level SET DEFAULT 1; --Need to have a more reliable dependency knowledge for scheduled jobs ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_deps REGCLASS[]; ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_dep_delay_tolerance INTERVAL; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_deps_correctly_configured CHECK ((daily_scheduled_deps IS NULL AND 
daily_scheduled_dep_delay_tolerance IS NULL) OR (daily_scheduled_deps IS NOT NULL AND daily_scheduled_dep_delay_tolerance IS NOT NULL)); --Log all events and add pruning ALTER TABLE fact_loader.fact_table_refresh_logs ADD COLUMN refresh_finished_at TIMESTAMPTZ; ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN fact_table_refresh_log_id TYPE BIGINT; -- Handle race conditions by changing to batch usage CREATE SEQUENCE fact_loader.batch_id; SELECT fact_loader.add_batch_id_fields(); CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("replication_set_name" text, "queue_of_base_table_relid" regclass, "if_id" oid, "if_name" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name , qt.queue_of_base_table_relid , n.if_id , n.if_name --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise , CASE WHEN qt.pglogical_node_if_id IS NULL THEN now() ELSE t.source_time END AS source_time FROM fact_loader.queue_tables qt LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$; ELSE RETURN QUERY SELECT NULL::TEXT AS replication_set_name , qt.queue_of_base_table_relid , NULL::OID AS if_id , NULL::NAME AS if_name --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt; END IF; END; 
$BODY$
LANGUAGE plpgsql;

-- One row per (fact table, queue table dep), with the most conservative cutoff window
-- (minimum of queue-table source times and any parent fact table cutoffs).
CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS
WITH RECURSIVE fact_table_dep_cutoffs AS
(SELECT
   1 AS level
   , qtd.queue_table_dep_id
   , ftdqc.fact_table_dep_id
   , ftdqc.fact_table_dep_queue_table_dep_id
   --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
   --fact table has been updated
   , qtd.last_cutoff_id AS dep_maximum_cutoff_id
   , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time
   , ftd.parent_id AS parent_fact_table_id
   , ftd.child_id AS child_fact_table_id
   , ftd.child_id AS base_fact_table_id
   , queue_table_id
   , relevant_change_columns
   , ftdqc.last_cutoff_id
   , ftdqc.last_cutoff_source_time
   , ftdqc.insert_merge_proid
   , ftdqc.update_merge_proid
   , ftdqc.delete_merge_proid
 FROM fact_loader.queue_table_deps qtd
   INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
   INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id
 UNION ALL
 /****
 In this recursive part, we walk UP the chain to the base level in order to get the
 last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those.

 The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs.
 That means we can get our resultant data below by simply selecting distinct ON the right fields and order
 by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst
 the queue tables and any PARENT fact table cutoffs.

 That means if, for example,
  - IF a queue table has been cutoff up until 11:00:00
  - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00
  - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed.
 */
 SELECT
   ftdc.level + 1 AS level
   , ftdc.queue_table_dep_id
   , ftdc.fact_table_dep_id
   , ftdc.fact_table_dep_queue_table_dep_id
   --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
   --fact table has been updated
   , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id
   , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time
   , ftd.parent_id AS parent_fact_table_id
   , ftd.child_id AS child_fact_table_id
   , ftdc.base_fact_table_id
   , ftdc.queue_table_id
   , ftdc.relevant_change_columns
   , ftdc.last_cutoff_id
   , ftdc.last_cutoff_source_time
   , ftdc.insert_merge_proid
   , ftdc.update_merge_proid
   , ftdc.delete_merge_proid
 FROM fact_loader.queue_table_deps qtd
   INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
   INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id
   INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id
)

, adjusted_fact_table_deps AS (
/****
The reason we look at distinct queue_table_dep_id and not simply queue_table_id
is because two parent fact tables could have differing logic for retrieving changes
for the same base queue_tables.
 */
SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id)
  *
FROM fact_table_dep_cutoffs
ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time
)

, queue_table_info AS (
SELECT * FROM fact_loader.queue_table_delay_info()
)

/****
For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent
fact table, and just reuse this exactly, with these distinctions:
 - From the fact_table_dep table, we do use the proids, and the last_cutoff_id
 - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent
 - We pass the information of which table for which to update metadata in the end
 */
, queue_table_deps_with_nested AS (
/****
This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables
 */
SELECT
  queue_table_dep_id
  , NULL :: INT AS fact_table_dep_id
  , NULL :: INT AS fact_table_dep_queue_table_dep_id
  , NULL :: BIGINT AS dep_maximum_cutoff_id
  , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time
  , fact_table_id
  , queue_table_id
  , relevant_change_columns
  , last_cutoff_id
  , last_cutoff_source_time
  , insert_merge_proid
  , update_merge_proid
  , delete_merge_proid
FROM fact_loader.queue_table_deps
UNION ALL
/****
This part of the union is for fact tables with other dependent fact tables
 */
SELECT
  queue_table_dep_id
  , fact_table_dep_id
  , fact_table_dep_queue_table_dep_id
  , aftd.dep_maximum_cutoff_id
  , aftd.dep_maximum_cutoff_time
  , base_fact_table_id AS fact_table_id
  , queue_table_id
  , relevant_change_columns
  , aftd.last_cutoff_id
  , aftd.last_cutoff_source_time
  , aftd.insert_merge_proid
  , aftd.update_merge_proid
  , aftd.delete_merge_proid
FROM adjusted_fact_table_deps aftd
)

SELECT
  ft.fact_table_id,
  ft.fact_table_relid,
  ft.fact_table_agg_proid,
  qt.queue_table_id,
  qt.queue_table_relid,
  qt.queue_of_base_table_relid,
  qtd.relevant_change_columns,
  qtd.last_cutoff_id,
  qtd.last_cutoff_source_time,
  rt.if_name AS provider_name,
  rt.replication_set_name,
  qtd.dep_maximum_cutoff_id,  --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter
  LEAST(
      MIN(qtd.dep_maximum_cutoff_time)
      OVER (
        PARTITION BY qtd.fact_table_id ),
      MIN(rt.source_time)
      OVER (
        PARTITION BY qtd.fact_table_id )
  ) AS maximum_cutoff_time,
  aqt.queue_table_id_field,
  'primary_key'::name AS queue_table_key,
  'operation'::name AS queue_table_op,
  'change'::name AS queue_table_change,
  'changed_at'::name AS queue_table_timestamp,
  qt.queue_table_tz,
  aqbt.queue_of_base_table_key,
  aqbt.queue_of_base_table_key_type,
  queue_table_dep_id,
  fact_table_dep_id,
  fact_table_dep_queue_table_dep_id,
  insert_merge_proid,
  update_merge_proid,
  delete_merge_proid,
  qt.purge
FROM queue_table_deps_with_nested qtd
  INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id
  INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id
  INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid
  -- Look up the base table's primary key column name and type from the catalogs.
  INNER JOIN LATERAL
    (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type
     FROM (SELECT
             i.indrelid
             , unnest(indkey) AS ik
             , row_number()
               OVER ()        AS rn
           FROM pg_index i
           WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk
       INNER JOIN pg_attribute a
         ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE
  -- Look up the queue table's own primary key column name from the catalogs.
  INNER JOIN LATERAL
    (SELECT a.attname AS queue_table_id_field
     FROM (SELECT
             i.indrelid
             , unnest(indkey) AS ik
             , row_number()
               OVER ()        AS rn
           FROM pg_index i
           WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk
       INNER JOIN pg_attribute a
         ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE
ORDER BY ft.fact_table_relid;

-- queue_deps_all joined to key_retrieval_sequences, plus pre-built SQL snippets
-- (source_change_date_select) shared by the insert/update/delete branches of sql_builder.
CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS
SELECT
  qtd.*,
  krs.filter_scope,
  krs.level,
  krs.return_columns, --we need not get the type separately. It must match queue_of_base_table_key_type
  krs.is_fact_key,
  krs.join_to_relation,
  qtk.queue_table_relid AS join_to_relation_queue,
  krs.join_to_column,
  ctypes.join_column_type,
  krs.return_columns_from_join,
  ctypes.return_columns_from_join_type,
  krs.join_return_is_fact_key,
  /***
  We include this in this view def to be easily shared by all events (I, U, D) in sql_builder,
  as those may be different in terms of passing source_change_date.
   */
  format(', %s::DATE AS source_change_date',
    CASE
      WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL
      /***
      For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time).
      Then, we cast it to the timezone of interest on which the date should be based.
      For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time.
      Thus, any date-based fact table must decide in what time zone to consider the date.
       */
        THEN format('(%s %s AT TIME ZONE %s)',
                    'q.'||quote_ident(qtd.queue_table_timestamp),
                    CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END,
                    quote_literal(krs.pass_queue_table_change_date_at_tz))
      ELSE 'NULL'
    END) AS source_change_date_select
FROM fact_loader.queue_deps_all qtd
  INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id
  LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation
  LEFT JOIN LATERAL
    (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type,
            MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type
     FROM pg_attribute a
     WHERE a.attrelid IN(krs.join_to_relation)
       /****
       We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type.
       Undue complexity would ensue if we did away with that rule.
        */
       AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE;

-- Deletes fully-processed rows from all purge-enabled queue tables, keeping an extra
-- p_add_interval of history (default 1 hour) beyond each queue's minimum processed cutoff.
CREATE OR REPLACE FUNCTION fact_loader.purge_queues
  (p_add_interval INTERVAL = '1 hour')
RETURNS VOID AS
$BODY$
/*****
The interval overlap is only important for delete cases in which you may need to join
to another audit table in order to get a deleted row's data.  1 hour is somewhat arbitrary,
but in the delete case, any related deleted rows would seem to normally appear very close to
another relation's deleted rows.  1 hour is probably generous but also safe.
 */
DECLARE
  v_sql TEXT;
BEGIN

WITH eligible_queue_tables_for_purge AS
(SELECT
   /****
   This logic should handle dependent fact tables as well,
   because they share the same queue tables but they have separately
   logged last_cutoffs.
    */
   qt.queue_table_relid
   , queue_table_timestamp
   , queue_table_tz
   , MIN(last_cutoff_id)          AS min_cutoff_id
   , MIN(last_cutoff_source_time) AS min_source_time
 FROM fact_loader.queue_deps_all qt
 WHERE qt.last_cutoff_id IS NOT NULL AND qt.purge
 /***
 There must be no other fact tables using the same queue
 which have not yet been processed at all
  */
   AND NOT EXISTS
   (SELECT 1
    FROM fact_loader.queue_deps_all qtdx
    WHERE qtdx.queue_table_id = qt.queue_table_id
      AND qtdx.last_cutoff_id IS NULL)
 GROUP BY qt.queue_table_relid
   , queue_table_timestamp
   , queue_table_tz)

SELECT
  string_agg(
    format($$
    DELETE FROM %s
    WHERE %s IN
    (SELECT %s
    FROM %s
    WHERE %s <= %s
    AND %s %s < (%s::TIMESTAMPTZ - interval %s)
    FOR UPDATE SKIP LOCKED
    );
    $$,
      queue_table_relid,
      'fact_loader_batch_id',
      'fact_loader_batch_id',
      queue_table_relid,
      'fact_loader_batch_id',
      min_cutoff_id,
      quote_ident(queue_table_timestamp),
      CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
      quote_literal(min_source_time),
      quote_literal(p_add_interval::TEXT)
      )
    , E'\n\n')
  INTO v_sql
FROM eligible_queue_tables_for_purge;

IF v_sql IS NOT NULL THEN
  RAISE DEBUG 'Purging Queue: %', v_sql;
  BEGIN
    EXECUTE v_sql;
  EXCEPTION
  WHEN
  -- Purging competes with concurrent loads; a serialization failure here is benign
  -- and simply means this purge pass is skipped.
  serialization_failure THEN
    RAISE LOG 'Serialization failure in queue purging for transaction % - skipping.', txid_current()::text;
  WHEN OTHERS THEN
    RAISE;
  END;
END IF;

END;
$BODY$
LANGUAGE plpgsql;

-- Attempts exactly one fact table refresh per call, walking prioritized_jobs in order.
-- Returns TRUE if some job was run (even if it found no new data), FALSE if none could be.
CREATE OR REPLACE FUNCTION fact_loader.worker()
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_fact_record RECORD;
BEGIN

/****
Acquire an advisory lock on the row indicating this job, which will cause the function
to simply return false if another session is running it concurrently.
It will be released upon transaction commit or rollback.
(NOTE(review): in this version the per-job locking actually lives in try_load, which uses
SELECT ... FOR UPDATE SKIP LOCKED on fact_loader.fact_tables plus an advisory xact lock
for the deps refresh - confirm and update this comment upstream.)
 */
FOR v_fact_record IN
  SELECT fact_table_id
  FROM fact_loader.prioritized_jobs
LOOP

IF fact_loader.try_load(v_fact_record.fact_table_id) THEN
  --If any configured functions use temp tables,
  --must discard to avoid them hanging around in the idle background worker session
  DISCARD TEMP;

  --Log job times
  INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at)
  VALUES (v_fact_record.fact_table_id, now(), clock_timestamp());

  --Return true meaning the fact table was refreshed (this applies even if there was no new data)
  RETURN TRUE;
END IF;

END LOOP;

--If no jobs returned true, then return false
RETURN FALSE;

END;
$BODY$
LANGUAGE plpgsql;

-- Locks and refreshes a single fact table job.  Returns TRUE on success; on error it
-- disables the job, logs the error details as JSONB, and returns FALSE.
CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
/***
This will be used by the worker, but can also be used safely if a DBA
wants to run a job manually.
 */
DECLARE
  -- Arbitrary app-wide advisory lock key guarding refresh_fact_table_dep_queue_table_deps.
  c_lock_cutoff_refresh INT = 99995;
  v_err JSONB;
  v_errmsg TEXT;
  v_errdetail TEXT;
  v_errhint TEXT;
  v_errcontext TEXT;
BEGIN

-- We except rare serialization failures here which we will ignore and move to the next record
-- Anything else should be raised
BEGIN
IF EXISTS (SELECT fact_table_id
           FROM fact_loader.fact_tables
           WHERE fact_table_id = p_fact_table_id
           FOR UPDATE SKIP LOCKED) THEN
  /****
  Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress.
   */
  IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN
    PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps();
  END IF;

  --Load fact table and handle exceptions to auto-disable job and log errors in case of error
  BEGIN
    --Scheduled daily job
    IF (SELECT use_daily_schedule
        FROM fact_loader.fact_tables
        WHERE fact_table_id = p_fact_table_id) THEN
      PERFORM fact_loader.daily_scheduled_load(p_fact_table_id);

    --Queue-based job
    ELSE
      PERFORM fact_loader.load(p_fact_table_id);

      /***
      Run purge process.  This need not run every launch of worker but it should not hurt.
      It is better for it to run after the fact table load is successful so as to avoid a
      rollback and more dead bloat
       */
      PERFORM fact_loader.purge_queues();
    END IF;

    RETURN TRUE;

  EXCEPTION
    WHEN OTHERS THEN
      GET STACKED DIAGNOSTICS
        v_errmsg = MESSAGE_TEXT,
        v_errdetail = PG_EXCEPTION_DETAIL,
        v_errhint = PG_EXCEPTION_HINT,
        v_errcontext = PG_EXCEPTION_CONTEXT;

      -- Auto-disable the failed job; a human must re-enable after investigating
      -- (see the unresolved_failures view).
      UPDATE fact_loader.fact_tables
      SET last_refresh_succeeded = FALSE,
        last_refresh_attempted_at = now(),
        enabled = FALSE
      WHERE fact_table_id = p_fact_table_id;

      v_err = jsonb_strip_nulls(
        jsonb_build_object(
          'Message', v_errmsg,
          'Detail', case when v_errdetail = '' then null else v_errdetail end,
          'Hint', case when v_errhint = '' then null else v_errhint end,
          'Context', case when v_errcontext = '' then null else v_errcontext end)
        );

      INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at, messages)
      VALUES (p_fact_table_id, now(), clock_timestamp(), v_err);

      RETURN FALSE;
  END;
ELSE
  RETURN FALSE;
END IF;

EXCEPTION
  WHEN serialization_failure THEN
    RAISE LOG 'Serialization failure on transaction % attempting to lock % - skipping.', txid_current()::text, p_fact_table_id::text;
    RETURN FALSE;
  WHEN OTHERS THEN
    RAISE;
END;

END;
$BODY$
LANGUAGE plpgsql;

-- Core queue-based load for one fact table: build the process queue, execute it,
-- then advance the cutoff metadata.
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
  v_process_queue_sql text;
  v_execute_sql text;
  v_metadata_update_sql text;
  v_debug_rec record;
  v_debug_text text = '';
BEGIN
/***
There are 3 basic steps to this load:
  1. Gather all queue table changes and insert them into a consolidated process_queue
  2. Execute the process_queue against the fact table (via fact_loader.execute_queue)
  3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
 */
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
 */
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
-- NULL process_queue_sql means there is nothing to gather; run a harmless no-op instead.
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue.  Requires setting log_min_messages to DEBUG.
 */
IF current_setting('log_min_messages') = 'debug3' THEN
  INSERT INTO fact_loader.debug_process_queue
  SELECT * FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
 */
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.

last_refresh_succeeded will be marked true always for now.  It could in the future
be used to indicate a failure in case of a caught error.
 */
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
  format(
    $$UPDATE fact_loader.fact_tables ft
    SET last_refresh_attempted_at = now(),
      last_refresh_succeeded = TRUE
    WHERE fact_table_id = %s;
    $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

-- Jobs that were auto-disabled by try_load, joined to the log entry of the failing attempt.
CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS
SELECT ft.fact_table_id,
  fact_table_relid,
  refresh_attempted_at,
  messages
FROM fact_loader.fact_tables ft
INNER JOIN fact_loader.fact_table_refresh_logs ftrl
  ON ft.fact_table_id = ftrl.fact_table_id
 AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at
WHERE NOT enabled
  AND NOT last_refresh_succeeded;

-- Terminates idle pg_fact_loader worker sessions so they can be cleanly relaunched.
-- Returns counts of terminated/surviving workers and the surviving pids.
CREATE OR REPLACE FUNCTION fact_loader.safely_terminate_workers()
RETURNS TABLE (number_terminated INT, number_still_live INT, pids_still_live INT[]) AS
$BODY$
/****
It is not a requirement to use this function to terminate workers.  Because workers are transactional,
you can simply terminate them and no data loss will result in pg_fact_loader.

Likewise, a hard crash of any system using pg_fact_loader will recover just fine upon re-launching workers.

Still, it is ideal to avoid bloat to cleanly terminate workers and restart them using this function to kill
them, and launch_workers(int) to re-launch them.
 */
BEGIN

RETURN QUERY
WITH try_term_pids AS (
SELECT
  pid,
  -- Only terminate sessions that are idle and have been so for an "in-between" window
  -- (BETWEEN SYMMETRIC tolerates the reversed bounds), i.e. not mid-job and not stale.
  CASE WHEN
    state = 'idle' AND
    state_change BETWEEN SYMMETRIC
      now() - interval '5 seconds' AND
      now() - interval '55 seconds'
  THEN
    pg_terminate_backend(pid)
  ELSE
    FALSE
  END AS terminated
FROM pg_stat_activity
-- NOTE(review): assumes workers always connect as role 'postgres'; workers launched
-- under any other role will never be matched here - confirm against deployment.
WHERE usename = 'postgres'
  AND query = 'SELECT fact_loader.worker();')

SELECT
  SUM(CASE WHEN terminated THEN 1 ELSE 0 END)::INT AS number_terminated_out,
  SUM(CASE WHEN NOT terminated THEN 1 ELSE 0 END)::INT AS number_still_live_out,
  (SELECT array_agg(pid) FROM try_term_pids WHERE NOT terminated) AS pids_still_live_out
FROM try_term_pids;

END;
$BODY$
LANGUAGE plpgsql;

-- Launches number_to_launch background workers (one second apart) and returns their pids.
CREATE OR REPLACE FUNCTION fact_loader.launch_workers(number_to_launch int)
RETURNS INT[] AS
$BODY$
DECLARE
  v_pids INT[];
BEGIN

FOR i IN 1..number_to_launch
LOOP
  v_pids = array_append(v_pids, fact_loader.launch_worker());
  /*
  It's not strictly required to not launch all workers simultaneously, but it's
  also a little more invasive to do that, probably requiring more advisory lock skips.
  Better to just sleep 1 second between launches.
   */
  PERFORM pg_sleep(1);
END LOOP;

RETURN v_pids;

END;
$BODY$
LANGUAGE plpgsql;

-- Enabled jobs in the order the worker should attempt them; daily-scheduled jobs
-- only appear once they are eligible for today's run.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
WITH jobs_with_daily_variables AS (
SELECT
   ft.*,
  /***
  Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
   */
    (--If this is the first run of a scheduled job, it is eligible
            ft.last_refresh_attempted_at IS NULL
          OR (
               --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible
                (
                  ft.last_refresh_succeeded AND
                  ft.last_refresh_attempted_at::DATE <
                  -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent
                    (now() AT TIME ZONE COALESCE(
                                          ft.daily_scheduled_tz,
                                          base.daily_scheduled_tz
                                          )
                    )::DATE
                )
              OR
               --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time
                 NOT ft.last_refresh_succeeded
             )
    ) AS daily_not_attempted_today,

    (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME
      BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed,

    base.use_daily_schedule
      AND base.last_refresh_succeeded
      AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE
      AS daily_base_job_finished,

    ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent,

    -- This should only be used in combination with daily_has_only_one_parent
    parent.use_daily_schedule
      AND parent.last_refresh_succeeded
      AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE
      AS parent_job_finished
FROM fact_loader.fact_tables ft
LEFT JOIN LATERAL
  (SELECT ftb.use_daily_schedule,
          ftb.last_refresh_succeeded,
          ftb.last_refresh_attempted_at,
          ftb.daily_scheduled_tz
   FROM fact_loader.fact_tables ftb
   WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE
LEFT JOIN LATERAL
  (SELECT ftp.use_daily_schedule,
          ftp.last_refresh_succeeded,
          ftp.last_refresh_attempted_at,
          ftp.daily_scheduled_tz
   FROM fact_loader.fact_tables ftp
   WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE
WHERE enabled
)

, jobs_with_daily_schedule_eligibility AS (
SELECT
   *,
   --Only run this job according to the same day of the daily_scheduled_time
   --according to configured timezone
   (use_daily_schedule AND daily_not_attempted_today
     AND
    (
      daily_scheduled_time_passed
      OR
     (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished))
    )
   ) AS daily_schedule_eligible
FROM jobs_with_daily_variables)

SELECT *
FROM jobs_with_daily_schedule_eligibility
WHERE NOT use_daily_schedule OR daily_schedule_eligible
ORDER BY
  CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
  --If a job has a daily schedule, once the time has come for the next refresh,
  --prioritize it first
  CASE
    WHEN daily_schedule_eligible
    THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME
    ELSE NULL
  END NULLS LAST,
  --This may be improved in the future but is a good start
  last_refresh_attempted_at NULLS FIRST,
  priority
;

-- Builds, for one fact table, the dynamic SQL that gathers queued changes and the SQL
-- that advances cutoff metadata.  (Definition continues below.)
CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
              gathered_queued_changes_sql text,
              process_queue_sql text,
              metadata_update_sql text) AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.
In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null
Otherwise, they must be specified separately.
*/ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. */ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', 
unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN 
'(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), 
'(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, format($$ %s AS fact_table_id, %s AS queue_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time)) AS metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 
'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END, quote_literal(maximum_cutoff_time)) AS global_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table E'\nINNER JOIN '||queue_of_base_table_relid::TEXT||' b'|| E'\n ON q.'||quote_ident(queue_table_key)||'::'||queue_of_base_table_key_type||' = b.'||quote_ident(queue_of_base_table_key) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. 
format($$ %s AND q.%s < %s %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END) AS global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_insert_sql) AS queue_insert_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_update_sql) AS queue_update_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.global_where_sql||nrs.where_for_delete_sql) AS 
queue_delete_sql, format($$ SELECT %s FROM %s WHERE %s $$, nrs.metadata_select_columns, nrs.queue_table_aliased, nrs.global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to 
follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. 
--Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = 
ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second. */ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, queue_table_id INT, 
queue_table_dep_id INT, fact_table_dep_id INT, fact_table_dep_queue_table_dep_id INT, queue_table_id_field BIGINT, fact_loader_batch_id BIGINT, maximum_cutoff_time TIMESTAMPTZ) AS $BODY$ DECLARE v_raw_sql text; BEGIN SELECT raw_queued_changes_sql INTO v_raw_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_raw_sql; END; $BODY$ LANGUAGE plpgsql;
/* fact_loader.gathered_queued_changes(p_fact_table_id):
   Inspection wrapper around fact_loader.sql_builder(): executes the
   gathered_queued_changes_sql text it returns and yields the resulting work items
   (fact_table_id, merge function oid, key value, optional source_change_date)
   currently queued for this fact table. */
CREATE OR REPLACE FUNCTION fact_loader.gathered_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, proid REGPROC, key_value TEXT, source_change_date DATE) AS $BODY$ DECLARE v_gather_sql text; BEGIN SELECT gathered_queued_changes_sql INTO v_gather_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_gather_sql; END; $BODY$ LANGUAGE plpgsql;
/* fact_loader.daily_scheduled_load(p_fact_table_id) RETURNS BOOLEAN:
   Runs one daily-scheduled job. Per the inline notes below: (1) when
   daily_scheduled_deps is set, RAISE EXCEPTION if any dependency lags beyond
   daily_scheduled_dep_delay_tolerance (queue tables measured via
   queue_table_delay_info(), fact tables via last_refresh_source_cutoff);
   (2) EXECUTE the configured daily_scheduled_proid function; (3) record the
   refresh attempt as succeeded. Returns FALSE when the fact table is not
   configured with use_daily_schedule. */
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; v_deps regclass[]; v_dep_delay_tolerance interval; v_delayed_msg text; BEGIN /*** There are 3 basic steps to this load: 1. If dependencies are listed, verify they are up to date enough 2. Execute the single daily-refresh function 3.
Update the metadata indicating the last attempt time */ SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()', daily_scheduled_deps, daily_scheduled_dep_delay_tolerance INTO v_execute_sql, v_deps, v_dep_delay_tolerance FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id AND use_daily_schedule; IF v_execute_sql IS NULL THEN RETURN FALSE; END IF; IF v_deps IS NOT NULL THEN WITH deps AS (SELECT unnest(v_deps) AS dep) , delays AS ( SELECT dep, now() - source_time as delay_interval FROM fact_loader.queue_table_delay_info() qtd INNER JOIN deps d ON d.dep = qtd.queue_of_base_table_relid UNION ALL SELECT dep, now() - last_refresh_source_cutoff as delay_interval FROM fact_loader.fact_tables ft INNER JOIN deps d ON d.dep = ft.fact_table_relid ) SELECT string_agg(dep::text||': Delayed '||delay_interval::text, ', ') INTO v_delayed_msg FROM delays WHERE delay_interval > v_dep_delay_tolerance; IF v_delayed_msg IS NOT NULL THEN RAISE EXCEPTION '%', v_delayed_msg; END IF; END IF; EXECUTE v_execute_sql; UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = p_fact_table_id; RETURN TRUE; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.fact_table_refresh_logs_pruner() RETURNS trigger LANGUAGE plpgsql AS $$ declare step int := 1000; -- step should equal the firing frequency in trigger definition overdrive int := 2; -- overdrive times step = max rows (see below) max_rows int := step * overdrive; rows int; begin delete from fact_loader.fact_table_refresh_logs where fact_table_refresh_log_id in ( select fact_table_refresh_log_id from fact_loader.fact_table_refresh_logs where refresh_attempted_at < now() - '90 days'::interval -- do not do the literal interval value above as a declare parameter order by fact_table_refresh_log_id limit max_rows for update skip locked ); get diagnostics rows = row_count; return null; end $$; CREATE TRIGGER fact_table_refresh_logs_pruner AFTER INSERT ON 
fact_loader.fact_table_refresh_logs FOR EACH ROW WHEN ((new.fact_table_refresh_log_id % 1000::bigint) = 0) EXECUTE PROCEDURE fact_loader.fact_table_refresh_logs_pruner(); /*** This function exists mostly to easily mock out for testing purposes. */ CREATE FUNCTION fact_loader.logical_subscription() RETURNS TABLE (sub_origin_if OID, sub_replication_sets text[]) AS $BODY$ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN RETURN QUERY EXECUTE $$ SELECT sub_origin_if, sub_replication_sets FROM pglogical.subscription; $$; ELSE RETURN QUERY SELECT NULL::OID, NULL::TEXT[]; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("replication_set_name" text, "queue_of_base_table_relid" regclass, "if_id" oid, "if_name" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name , qt.queue_of_base_table_relid , n.if_id , n.if_name --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise , CASE WHEN qt.pglogical_node_if_id IS NULL THEN now() ELSE t.source_time END AS source_time FROM fact_loader.queue_tables qt LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$; ELSE RETURN QUERY SELECT NULL::TEXT AS replication_set_name , qt.queue_of_base_table_relid , NULL::OID AS if_id , NULL::NAME AS if_name --source_time is now() if queue 
tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. */ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd 
ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist Add not exists because we think allowing all records to insert and conflict could be cause of serialization errors in repeatable read isolation. */ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps new WHERE NOT EXISTS (SELECT 1 FROM fact_loader.fact_table_dep_queue_table_deps existing WHERE existing.fact_table_dep_id = new.fact_table_dep_id AND existing.queue_table_dep_id = new.queue_table_dep_id) ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql; -- These fields now becomes based on batch, not based on queue_table_id_field DO $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT format($$ UPDATE fact_loader.%s SET last_cutoff_id = (SELECT fact_loader_batch_id FROM %s WHERE %s = %s) WHERE %s = %s; $$, CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_deps' ELSE 'fact_table_dep_queue_table_deps' END, queue_table_relid::text, queue_table_id_field::text, last_cutoff_id::text, CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_dep_id' ELSE 'fact_table_dep_queue_table_dep_id' END, CASE WHEN fact_table_dep_id IS NULL THEN queue_table_dep_id ELSE fact_table_dep_queue_table_dep_id END ) AS sql 
FROM fact_loader.queue_deps_all WHERE last_cutoff_id IS NOT NULL LOOP v_sql = v_rec.sql; RAISE LOG 'Updating Extension pg_fact_loader Executed: %', v_sql; EXECUTE v_sql; END LOOP; END$BODY$; COMMENT ON TABLE fact_loader.debug_process_queue IS 'A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG.'; COMMENT ON TABLE fact_loader.fact_table_dep_queue_table_deps IS $$Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables that depend on other fact table changes. Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table. Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps. The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples of this. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_queue_table_dep_id IS 'Unique identifier'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_id IS 'fact_table_dep for this specific dependency.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.queue_table_dep_id IS 'Inherited queue_table_dep that this dependent fact table depends on.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_id IS $$This is unique and maintained separately from last_cutoff_id in queue_table_deps, as it refers to the last_cutoff_id for this dependent fact table. It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. 
The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_source_time IS $$This is unique and maintained separately from last_cutoff_source_time in queue_table_deps, as it refers to the last_cutoff_source_time for this dependent fact table. It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. It will also never go past its parent(s) in time. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.insert_merge_proid IS $$Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.update_merge_proid IS $$Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. 
The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore update events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.delete_merge_proid IS $$Initially populated by default_delete_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on DELETE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore delete events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.fact_table_deps IS 'For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here.'; COMMENT ON COLUMN fact_loader.fact_table_deps.fact_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.fact_table_deps.parent_id IS 'The parent fact_table_id that the child depends on.'; COMMENT ON COLUMN fact_loader.fact_table_deps.child_id IS 'The child fact_table_id that will run only after the parent is updated.'; COMMENT ON COLUMN fact_loader.fact_table_deps.default_insert_merge_proid IS $$Default function to use for insert events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_update_merge_proid IS $$Default function to use for update events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_delete_merge_proid IS $$Default function to use for delete events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.fact_table_refresh_logs IS 'Used to log both job run times and exceptions.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_refresh_log_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_id IS 'Fact table that created the log.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_attempted_at IS 'The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures).'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_finished_at IS 'The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.messages IS 'Only for failures - Error message content in JSON format - including message, message detail, context, and hint.'; COMMENT ON TABLE fact_loader.fact_tables IS 'Each fact table to be built via
pg_fact_loader, which also drives the worker. These are also referred to as "jobs".'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_id IS 'Unique identifier for the fact table or job - also referred to as job_id'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_relid IS 'The oid of the fact table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_agg_proid IS $$NOT REQUIRED. The aggregate function definition for the fact table. This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now.$$; COMMENT ON COLUMN fact_loader.fact_tables.enabled IS 'Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE.'; COMMENT ON COLUMN fact_loader.fact_tables.priority IS 'Determines the order in which the job runs (in combination with other sorting factors)'; COMMENT ON COLUMN fact_loader.fact_tables.force_worker_priority IS 'If marked TRUE, this fact table will be prioritized in execution order above all other factors.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_source_cutoff IS 'The data cutoff time of the last refresh - only records older than this have been updated.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_attempted_at IS 'The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_succeeded IS 'Whether or not the last run of the job succeeded. 
NULL if it has never been run.'; COMMENT ON COLUMN fact_loader.fact_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.fact_tables.use_daily_schedule IS 'If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_time IS 'The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_tz IS 'The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_proid IS $$The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples.$$; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_base_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time.'; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_parent_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_deps IS 'OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. 
This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_dep_delay_tolerance IS 'OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON TABLE fact_loader.key_retrieval_sequences IS $$How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. 
NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.key_retrieval_sequence_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.queue_table_dep_id IS 'Which fact table - queue table record this is for (queue_table_deps)'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.filter_scope IS $$NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.level IS $$Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns IS $$What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.is_fact_key IS 'Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. 
Otherwise, FALSE if you need to join to get more information.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_relation IS 'Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_column IS 'Join to this column of join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns_from_join IS 'Return these columns from join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_return_is_fact_key IS 'If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; COMMENT ON VIEW fact_loader.queue_deps_all_with_retrieval IS 'The master view which builds on queue_deps_all to include key_retrieval_sequences. This is the main view used by sql_builder(int) to gather all queued changes.'; COMMENT ON TABLE fact_loader.queue_table_deps IS $$Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. 
**NOTE** that anything that exists in queue_table_dep is assumed to be require its queue data not to be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.queue_table_deps.fact_table_id IS 'Fact table to tie together with a queue table it depends on.'; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_id IS 'Queue table to tie together with a fact table that needs its changes.'; COMMENT ON COLUMN fact_loader.queue_table_deps.relevant_change_columns IS $$Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_id IS $$The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_source_time IS $$The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. 
THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.queue_table_deps.insert_merge_proid IS $$Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.update_merge_proid IS $$Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore update events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.delete_merge_proid IS $$Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. 
NULL to ignore delete events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.queue_tables IS 'Each queue table along with the base table to which it belongs.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_id IS 'Unique identifier for queue tables.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_relid IS 'The oid of the queue table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_of_base_table_relid IS 'The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table.$$; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_tz IS $$**NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ. 
If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz.$$; COMMENT ON COLUMN fact_loader.queue_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.queue_tables.purge IS 'Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table.'; COMMENT ON VIEW fact_loader.unresolved_failures IS 'Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring.'; /* pg_fact_loader--1.3--1.4.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit DROP FUNCTION fact_loader.raw_queued_changes(int); ALTER TABLE fact_loader.debug_process_queue DROP CONSTRAINT debug_process_queue_pkey; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. 
*/ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. */ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date) -- the row timestamps are not populated, so we set them here SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. */ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. 
*/ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER 
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN 
insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. -- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ %s < %s %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), changed_at_tz_correction) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON 
(queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, 
and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, 
queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE 
fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, queue_table_dep_id INT, fact_table_dep_id INT, fact_table_dep_queue_table_dep_id INT, queue_table_id_field BIGINT, fact_loader_batch_id BIGINT, maximum_cutoff_time TIMESTAMPTZ, min_missed_id BIGINT, queue_table_id INT ) AS $BODY$ DECLARE v_raw_sql text; BEGIN SELECT raw_queued_changes_sql INTO v_raw_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_raw_sql; END; $BODY$ LANGUAGE plpgsql; COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; /* pg_fact_loader--1.4--1.5.sql */ -- complain if script is sourced in 
psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. */ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. 
*/ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1
THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key'
END
WHEN join_return_is_fact_key THEN
CASE WHEN array_length(return_columns, 1) = 1
THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key'
END
ELSE ''
END AS delete_key_select_column,
CASE
WHEN is_fact_key
THEN ''
ELSE format($$
--Join to either the base table, or the audit table, one of which
--will be missing the key in a delete case
INNER JOIN LATERAL (
 SELECT %s FROM %s jb WHERE %s = %s
 UNION ALL
 SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
 $$,
  quote_ident(return_columns_from_join[1]),
  join_to_relation::TEXT,
  (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
  'jb.'||quote_ident(join_to_column),
  '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
  join_to_relation_queue::TEXT,
  (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
  '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
/****
We use the higher level here just to be consistent with aliases from insert/update key retrieval
 */
  'j'||level
  )
END
  AS delete_key_retrieval_sql,
source_change_date_select
FROM queue_deps_with_delete_retrieval
WHERE level = 1
AND fact_table_id = p_fact_table_id
UNION ALL
SELECT
c.queue_table_dep_id,
c.level,
delete_key_select_column||CASE
WHEN c.is_fact_key THEN
CASE WHEN array_length(return_columns, 1) = 1
THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key'
END
WHEN join_return_is_fact_key THEN
CASE WHEN array_length(return_columns, 1) = 1
THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
-- BUGFIX (review): this branch previously generated e.g. unnest(array[j2.a,j2b)
-- for multi-column joins at level > 1: the array[ constructor was never closed
-- with ']' and the '.' between the alias and column name was missing.  It now
-- mirrors the level-1 branch above, generating unnest(array[j2.a,j2.b]).
ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level||'.')||'])::TEXT AS key'
END
ELSE ''
END AS delete_key_select_column,
delete_key_retrieval_sql||CASE
WHEN is_fact_key
THEN ''
ELSE format($$
--Join to either the base table, or the audit table, one of which
--will be missing the key in a delete case
INNER JOIN LATERAL (
 SELECT %s FROM %s jb WHERE %s = %s
 UNION ALL
 SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
 $$,
  quote_ident(return_columns_from_join[1]),
  join_to_relation::TEXT,
  'j'||r.level||'.'||quote_ident(return_columns[1]),
  'jb.'||quote_ident(join_to_column),
  '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
  join_to_relation_queue::TEXT,
  'j'||r.level||'.'||quote_ident(return_columns[1]),
  '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
/****
We use the higher level here just to be consistent with aliases from insert/update key retrieval
 */
  'j'||c.level
  )
END
  AS delete_key_retrieval_sql,
r.source_change_date_select
FROM delete_sql_builder r
INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id)
WHERE c.level = r.level + 1
)

, field_vars AS (
SELECT
*,
format($$
 %s AS fact_table_id,
 %s AS queue_table_dep_id,
 %s::INT AS fact_table_dep_id,
 %s::INT AS fact_table_dep_queue_table_dep_id,
 %s AS queue_table_id_field,
 q.fact_loader_batch_id,
 %s::TIMESTAMPTZ AS maximum_cutoff_time,
 -- We must not ignore ids which are above maximum_cutoff_time
 -- but below the highest id which is below maximum_cutoff_time
 MIN(q.fact_loader_batch_id) FILTER
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN 
insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. -- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ -- changed_at is guaranteed now to be in timestamptz - any time zone casting is only in subquery changed_at < %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_literal(c.maximum_cutoff_time) ) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , 
update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This 
first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT 
MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL 
) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; pg_fact_loader-2.0.1/pg_fact_loader--1.6--1.7.sql000066400000000000000000000415211451107006500210560ustar00rootroot00000000000000/* pg_fact_loader--1.6--1.7.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. 
\quit

DROP VIEW fact_loader.queue_deps_all_with_retrieval;
DROP VIEW fact_loader.queue_deps_all;
DROP VIEW fact_loader.prioritized_jobs;

-- New in 1.7: optional SQL snippet to run after the process_queue is populated
-- but before it is executed (e.g. to prepare scratch state for the merge procs).
ALTER TABLE fact_loader.fact_tables ADD COLUMN pre_execute_hook_sql TEXT;

/***
Refresh one fact table, identified by p_fact_table_id: gather all queued
changes into a temp process_queue, run the optional pre-execute hook,
execute the queued merge calls, then advance the cutoff metadata.
Emits RAISE DEBUG messages at each phase for troubleshooting.
*/
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
v_process_queue_sql text;
v_execute_sql text;
v_metadata_update_sql text;
v_debug_rec record;       -- NOTE(review): declared but never referenced in this body
v_debug_text text = '';   -- NOTE(review): declared but never referenced in this body
v_pre_execute_hook_sql text = '';
BEGIN

/***
There are 3 basic steps to this load:
    1. Gather all queue table changes and insert them into a consolidated process_queue
    2. Execute the process_queue (after the optional pre-execute hook)
    3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
(a NULL builder result means there is nothing queued - execute a no-op instead)
*/
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
Pre-execute hook
Runs the user-supplied SQL stored in fact_tables.pre_execute_hook_sql;
a NULL hook falls through to a no-op SELECT.
*/
SELECT pre_execute_hook_sql INTO v_pre_execute_hook_sql
FROM fact_loader.fact_tables
WHERE fact_table_id = p_fact_table_id;

EXECUTE COALESCE(v_pre_execute_hook_sql, $$SELECT 'No pre-execute hook.' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue.  Requires setting log_min_messages to DEBUG.
*/
IF current_setting('log_min_messages') = 'debug3' THEN
INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date)
-- the row timestamps are not populated, so we set them here
SELECT process_queue_id,
fact_table_id,
proid,
key_value,
now(),
now(),
source_change_date
FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);

RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.

last_refresh_succeeded will be marked true always for now.  It could in the future
be used to indicate a failure in case of a caught error.
*/
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
format(
$$UPDATE fact_loader.fact_tables ft
SET last_refresh_attempted_at = now(),
last_refresh_succeeded = TRUE
WHERE fact_table_id = %s;
$$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

-- Jobs eligible for worker pickup, in priority order; daily-scheduled jobs are
-- gated on their configured timezone and on the state of base/parent daily jobs.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
WITH jobs_with_daily_variables AS (
SELECT
ft.*,
/***
Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. 
The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; pg_fact_loader-2.0.1/pg_fact_loader--1.6.sql000066400000000000000000006452301451107006500205050ustar00rootroot00000000000000/* pg_fact_loader--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE FUNCTION fact_loader._launch_worker(oid) RETURNS pg_catalog.INT4 STRICT AS 'MODULE_PATHNAME', 'pg_fact_loader_worker' LANGUAGE C; CREATE FUNCTION fact_loader.launch_worker() RETURNS pg_catalog.INT4 STRICT AS 'SELECT fact_loader._launch_worker(oid) FROM pg_database WHERE datname = current_database();' LANGUAGE SQL; CREATE TABLE fact_loader.fact_tables ( fact_table_id SERIAL PRIMARY KEY, fact_table_relid REGCLASS NOT NULL, fact_table_agg_proid REGPROC NULL, --This may only be used to generate a merge function but is not used in automation enabled BOOLEAN NOT NULL DEFAULT FALSE, priority INT, attempt_number INT, retries_allowed INT DEFAULT 0, force_worker_priority BOOLEAN NOT NULL DEFAULT FALSE, last_refresh_source_cutoff TIMESTAMPTZ, last_refresh_attempted_at TIMESTAMPTZ, --TODO - answer if we want the worker to bail or record messages on ERROR (or both) last_refresh_succeeded BOOLEAN, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_tables UNIQUE (fact_table_relid) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables', ''); CREATE TABLE fact_loader.fact_table_deps ( fact_table_dep_id SERIAL PRIMARY KEY, parent_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), child_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), /***** In very many cases, you will use the same procs for insert, update, and delete even with multiple dependencies. 
This is why you must give defaults here which will be used to auto-populate fact_loader.fact_table_dep_queue_table_deps which can be overridden if necessary for each queue table. After you configure all of your fact tables and queue tables, run the function refresh_fact_table_dep_queue_table_deps manually to populate fact_table_dep_queue_table_deps, then make any changes as necessary. You can see an example of this in the test suite "seeds" file. You can also see an override example with order_emails_fact having a different proc for orders and reorders delete cases. */ default_insert_merge_proid REGPROC NOT NULL, default_update_merge_proid REGPROC NOT NULL, default_delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_deps UNIQUE (parent_id, child_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps', ''); CREATE TABLE fact_loader.queue_tables ( queue_table_id SERIAL PRIMARY KEY, queue_table_relid REGCLASS NOT NULL, queue_of_base_table_relid REGCLASS NOT NULL, /**** NOTE - the reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information. */ pglogical_node_if_id INT NOT NULL, --This is the timezone for the changed_at column - if null, we assume it is timestamptz (we could check that actually) queue_table_tz TEXT, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_table UNIQUE (queue_table_relid), CONSTRAINT unique_base_table UNIQUE (queue_of_base_table_relid) ); COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$The reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. 
We hope that some time down the road, this will change, and we can derive this information.$$; SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables', ''); CREATE TABLE fact_loader.queue_table_deps ( queue_table_dep_id SERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), queue_table_id INT NOT NULL REFERENCES fact_loader.queue_tables (queue_table_id), relevant_change_columns NAME[], last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_deps UNIQUE (fact_table_id, queue_table_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps', ''); CREATE TABLE fact_loader.key_retrieval_sequences ( key_retrieval_sequence_id SERIAL PRIMARY KEY, queue_table_dep_id INT NOT NULL REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), /**** In almost all cases, we only need to write one way to retrieve keys. The only exception is, for example, when in a delete case, you need to pass a different field (customer_id instead of order_id) to the delete_merge_proid function. You then need a different key_retrieval_sequence to handle a different field name for this delete case. By default this is NULL, meaning there is no filter, meaning the sequence applies to all events I, U, D. Otherwise, you can add scopes in which case you must have one for each of 'I','U','D'. 
*/
filter_scope CHAR(1) NULL,
level INT NOT NULL,
return_columns NAME[] NOT NULL,
is_fact_key BOOLEAN NOT NULL,
join_to_relation REGCLASS NULL,
join_to_column NAME NULL,
return_columns_from_join NAME[] NULL,
join_return_is_fact_key BOOLEAN NULL,
CONSTRAINT unique_retrievals UNIQUE (queue_table_dep_id, filter_scope, level),
CONSTRAINT valid_scopes CHECK (filter_scope IN ('I','U','D'))
);
SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences', '');

-- Per-(fact_table_dep, queue_table_dep) cutoff bookkeeping and merge-proc overrides for
-- fact tables that depend on other fact tables.  Auto-populated by
-- refresh_fact_table_dep_queue_table_deps() from the defaults on fact_table_deps.
CREATE TABLE fact_loader.fact_table_dep_queue_table_deps (
  fact_table_dep_queue_table_dep_id SERIAL PRIMARY KEY,
  fact_table_dep_id INT REFERENCES fact_loader.fact_table_deps (fact_table_dep_id),
  queue_table_dep_id INT REFERENCES fact_loader.queue_table_deps (queue_table_dep_id),
  last_cutoff_id BIGINT,
  last_cutoff_source_time TIMESTAMPTZ,
  insert_merge_proid REGPROC NOT NULL,
  update_merge_proid REGPROC NOT NULL,
  delete_merge_proid REGPROC NOT NULL,
  row_created_at TIMESTAMPTZ DEFAULT NOW(),
  row_updated_at TIMESTAMPTZ,
  CONSTRAINT unique_cutoffs UNIQUE (fact_table_dep_id, queue_table_dep_id)
);

/***
Trigger function enforcing that, for a given queue_table_dep_id, the
key_retrieval_sequences rows either all have a NULL filter_scope (one sequence
covering all 3 events I, U, D) or all have a non-NULL filter_scope (separate
sequences per event) - never a mix of the two.

BUGFIX: the original predicates re-tested NEW.filter_scope inside the EXISTS
subquery (contradicting the outer test) and compared
queue_table_dep_id <> NEW.queue_table_dep_id, so the EXISTS could never be true
and the trigger silently never fired.  We now look for SIBLING rows of the SAME
queue_table_dep_id whose filter_scope nullability conflicts with the incoming row,
which is the rule the exception message describes.
*/
CREATE OR REPLACE FUNCTION fact_loader.unique_scopes()
RETURNS TRIGGER AS
$BODY$
BEGIN
  IF (NEW.filter_scope IS NULL AND EXISTS (
    SELECT 1
    FROM fact_loader.key_retrieval_sequences
    WHERE queue_table_dep_id = NEW.queue_table_dep_id
      AND filter_scope IS NOT NULL
    ))
    OR
     (NEW.filter_scope IS NOT NULL AND EXISTS (
    SELECT 1
    FROM fact_loader.key_retrieval_sequences
    WHERE queue_table_dep_id = NEW.queue_table_dep_id
      AND filter_scope IS NULL
    ))
    THEN
    RAISE EXCEPTION $$You must either use a NULL filter_scope to cover all 3 events I, U, D or you must specify all 3 events separately I, U, D (For queue_table_dep_id %).
$$, NEW.queue_table_dep_id;
  END IF;
  RETURN NEW;
END;
$BODY$
LANGUAGE plpgsql;

CREATE TRIGGER unique_scopes
BEFORE INSERT OR UPDATE ON fact_loader.key_retrieval_sequences
FOR EACH ROW
EXECUTE PROCEDURE fact_loader.unique_scopes();

/***
This table is unlogged because it only has data mid-transaction and should always be empty
*/
CREATE UNLOGGED TABLE fact_loader.process_queue (
  process_queue_id BIGSERIAL PRIMARY KEY,
  fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id),
  proid REGPROC NOT NULL,
  key_value TEXT NOT NULL,
  row_created_at TIMESTAMPTZ DEFAULT NOW(),
  row_updated_at TIMESTAMPTZ
);

-- Generic audit trigger function: stamp row_updated_at on every insert/update.
CREATE OR REPLACE FUNCTION fact_loader.set_row_updated_at_to_now()
RETURNS TRIGGER AS
$BODY$
BEGIN
  NEW.row_updated_at = now();
  RETURN NEW;
END;
$BODY$
LANGUAGE plpgsql;

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.fact_tables
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.fact_table_deps
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.queue_tables
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.queue_table_deps
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.fact_table_dep_queue_table_deps
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.process_queue
FOR EACH ROW
WHEN
(NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

-- Load strategies: 'delta' upserts only changed keys; 'full_refresh' also deletes
-- keys no longer present in the source.
CREATE TYPE fact_loader.table_load_type AS ENUM('delta','full_refresh');

-- Generate AND install a merge (loader) function for p_destination_relation based on the
-- function p_source_proc, ignoring diffs on p_ignore_diff_for_columns.  Delegates the SQL
-- generation to fact_loader.table_loader_function, EXECUTEs the returned DDL, and returns
-- the regproc of the newly created function.
CREATE OR REPLACE FUNCTION fact_loader.create_table_loader_function
(p_source_proc REGPROC,
 p_destination_relation REGCLASS,
 p_ignore_diff_for_columns TEXT[])
RETURNS REGPROC AS
$BODY$
DECLARE
  v_new_proc TEXT;
  v_sql TEXT;
BEGIN

/****
Find the primary key for the destination table.  This is required.
If the destination table does not have a primary key, it should.

This is partly for simplicity, and partly to encourage good practice
that we build and refresh tables based on chosen primary key to match
records 1 for 1, which is basic DB design 101.
*/
SELECT function_name, function_sql
INTO v_new_proc, v_sql
FROM fact_loader.table_loader_function(p_source_proc, p_destination_relation, p_ignore_diff_for_columns);

EXECUTE v_sql;

RETURN v_new_proc::REGPROC;

END;
$BODY$
LANGUAGE plpgsql;

-- Build (but do not run) the SQL that drains fact_loader.process_queue for one fact table.
-- Rows are grouped per merge proc (function_call); each group becomes one
-- DELETE ... USING statement that invokes the proc for every key in process_queue_id order
-- over the group's [MIN(process_queue_id), MAX(process_queue_id)] window, and the statements
-- are concatenated in execution_group order.  Returns a single no-op SELECT when the queue
-- is empty so callers can EXECUTE unconditionally.
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT)
RETURNS TABLE (sql TEXT) AS
$BODY$
BEGIN

RETURN QUERY
WITH ordered_process_queue AS
(SELECT process_queue_id, proid, key_value,
    --TODO - either infer the data type of the function args, which is not super easy with postgres,
    --or add configuration fields for the name and data type of these. This will suffice for now
    --because we only have integer args for all functions
    'integer' AS queue_of_base_table_key_type
  FROM fact_loader.process_queue pq
  WHERE pq.fact_table_id = p_fact_table_id
  ORDER BY process_queue_id)

, with_rank AS
-- RANK() has no ORDER BY here, so every row of a proid partition shares the same rank;
-- the effective grouping below is therefore per function_call.
(SELECT format('%s(%s::%s)', proid::TEXT, 'key_value', queue_of_base_table_key_type) AS function_call,
  process_queue_id,
  RANK() OVER (PARTITION BY proid) AS execution_group
  FROM ordered_process_queue
)

, execute_sql_groups AS
(
SELECT execution_group,
format($$
WITH newly_processed AS (
SELECT process_queue_id, %s
FROM (
/****
Must wrap this to execute in order of ids
***/
SELECT *
FROM fact_loader.process_queue
WHERE process_queue_id BETWEEN %s AND %s
  AND fact_table_id = %s
ORDER BY process_queue_id) q
)
DELETE FROM fact_loader.process_queue pq USING newly_processed np
WHERE np.process_queue_id = pq.process_queue_id;
$$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id) AS execute_sql
FROM with_rank
GROUP BY execution_group, function_call
ORDER BY execution_group
)

SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql
FROM execute_sql_groups;

END;
$BODY$
LANGUAGE plpgsql;

-- Run the write phase produced by table_loader() for a prepared load.  Returns the
-- counts recorded in the temp table count_tracker (created during the 'prepare' phase).
CREATE OR REPLACE FUNCTION fact_loader.execute_table_loader
(p_source_relation REGCLASS,
 p_destination_relation REGCLASS,
 p_ignore_diff_for_columns TEXT[],
 p_load_type fact_loader.table_load_type,
 p_ignore_unmapped_columns BOOLEAN = FALSE)
RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS
$BODY$
/***
The SQL executed within this container is the actual
load to the destination table, and assumes that 'prepare'
phase has already been run, which is supposed to have gathered
the actual minimal delta and determine what to do here.
*/
DECLARE
  v_sql TEXT;
  v_unmapped_src_columns TEXT[];
  v_unmapped_dest_columns TEXT[];
BEGIN

SELECT execute_sql, unmapped_src_columns, unmapped_dest_columns
INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns
FROM fact_loader.table_loader(
    p_source_relation,
    p_destination_relation,
    p_ignore_diff_for_columns,
    p_load_type);
-- Check source/destination columns that do not map to each other; exact behavior
-- (raise vs. ignore per p_ignore_unmapped_columns) is defined in table_loader_validator.
PERFORM fact_loader.table_loader_validator(p_source_relation,
                                            p_destination_relation,
                                            v_unmapped_src_columns,
                                            v_unmapped_dest_columns,
                                            p_ignore_unmapped_columns);

RAISE LOG 'Executing SQL: %', v_sql;
EXECUTE v_sql;

RETURN QUERY
SELECT * FROM count_tracker;

END;
$BODY$
LANGUAGE plpgsql;

-- Top-level delta load for one fact table: build the process_queue from queue-table
-- changes (sql_builder), execute the consolidated work (execute_queue), then advance the
-- cutoff metadata.  Each EXECUTE falls back to a no-op SELECT when there is nothing to do.
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
  v_insert_to_process_queue_sql text;
  v_execute_sql text;
  v_metadata_update_sql text;
  v_debug_rec record;
  v_debug_text text = '';
BEGIN
/***
There are 3 basic steps to this load:
  1. Gather all queue table changes and insert them into a consolidated process_queue
  2. Using the process_queue data, execute the delta load of the fact table
  3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT insert_to_process_queue_sql, metadata_update_sql
INTO v_insert_to_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
*/
RAISE LOG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_insert_to_process_queue_sql;
EXECUTE COALESCE(v_insert_to_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue.  Requires setting
log_min_messages to DEBUG.
*/
IF current_setting('log_min_messages') LIKE 'debug%' THEN
  -- Dump every queued row as a tab-separated line...
  FOR v_debug_rec IN
    SELECT * FROM fact_loader.process_queue
  LOOP
    v_debug_text = v_debug_text||E'\n'||format('%s', v_debug_rec.process_queue_id||chr(9)||v_debug_rec.fact_table_id||chr(9)||v_debug_rec.proid||chr(9)||v_debug_rec.key_value);
  END LOOP;
  IF v_debug_text <> '' THEN
    -- ...prefixed by a header row of column names (excluding the row_*_at audit columns).
    v_debug_text = E'\n'||format('%s',
      (SELECT string_agg(column_name,chr(9))
      FROM information_schema.columns
      WHERE table_name = 'process_queue'
        AND table_schema = 'fact_loader'
        AND column_name NOT LIKE 'row_%_at'))
      ||v_debug_text;
    RAISE DEBUG '%', v_debug_text;
  END IF;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE LOG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.

last_refresh_succeeded will be marked true always for now.  It could in the future
be used to indicate a failure in case of a caught error.
*/
RAISE LOG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
  format(
  $$UPDATE fact_loader.fact_tables ft
  SET last_refresh_attempted_at = now(),
    last_refresh_succeeded = TRUE
  WHERE fact_table_id = %s;
  $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

-- Read-only phase of a two-phase table load: run the 'prepare' SQL from table_loader(),
-- which computes the minimal delta into temp tables without write-locking the destination.
-- The companion execute_table_loader applies the delta in the shortest transaction possible.
CREATE OR REPLACE FUNCTION fact_loader.prepare_table_loader
(p_source_relation REGCLASS,
 p_destination_relation REGCLASS,
 p_ignore_diff_for_columns TEXT[],
 p_load_type fact_loader.table_load_type,
 p_ignore_unmapped_columns BOOLEAN = FALSE)
RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS
$BODY$
/***
The SQL executed within this container is not going to lock
any of the destination table for writing, which is precisely why it is
separated from the 'execute' phase which actually writes to the table
in the shortest transaction possible.
*/
DECLARE
  v_sql TEXT;
  v_unmapped_src_columns TEXT[];
  v_unmapped_dest_columns TEXT[];
BEGIN

SELECT prepare_sql, unmapped_src_columns, unmapped_dest_columns
INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns
FROM fact_loader.table_loader(
    p_source_relation,
    p_destination_relation,
    p_ignore_diff_for_columns,
    p_load_type);
PERFORM fact_loader.table_loader_validator(p_source_relation,
                                            p_destination_relation,
                                            v_unmapped_src_columns,
                                            v_unmapped_dest_columns,
                                            p_ignore_unmapped_columns);

RAISE LOG 'Executing SQL: %', v_sql;
EXECUTE v_sql;

RETURN QUERY
SELECT * FROM count_tracker;

END;
$BODY$
LANGUAGE plpgsql;

-- Delete fully-processed rows from every queue table: rows at or below the minimum
-- last_cutoff_id across ALL fact tables using that queue, and older than the minimum
-- cutoff source time minus p_add_interval (kept as an overlap window for delete cases).
CREATE OR REPLACE FUNCTION fact_loader.purge_queues
(p_add_interval INTERVAL = '1 hour')
RETURNS VOID AS
$BODY$
/*****
The interval overlap is only important for delete cases in which you may need to join
to another audit table in order to get a deleted row's data.  1 hour is somewhat arbitrary,
but in the delete case, any related deleted rows would seem to normally appear very close to
another relation's deleted rows.  1 hour is probably generous but also safe.
*/
DECLARE
  v_sql TEXT;
BEGIN

WITH eligible_queue_tables_for_purge AS
(SELECT
  /****
  This logic should handle dependent fact tables as well,
  because they share the same queue tables but they have separately
  logged last_cutoffs.
  */
  qt.queue_table_relid
  , qt.queue_table_id_field
  , queue_table_timestamp
  , queue_table_tz
  , MIN(last_cutoff_id) AS min_cutoff_id
  , MIN(last_cutoff_source_time) AS min_source_time
FROM fact_loader.queue_deps_all qt
WHERE qt.last_cutoff_id IS NOT NULL
/***
There must be no other fact tables using the same queue
which have not yet been processed at all
*/
  AND NOT EXISTS
  (SELECT 1
  FROM fact_loader.queue_deps_all qtdx
  WHERE qtdx.queue_table_id = qt.queue_table_id
    AND qtdx.last_cutoff_id IS NULL)
GROUP BY qt.queue_table_relid
  , qt.queue_table_id_field
  , queue_table_timestamp
  , queue_table_tz)

SELECT string_agg(
  format($$
  DELETE FROM %s
  WHERE %s <= %s
  AND %s %s < (%s::TIMESTAMPTZ - interval %s);
  $$,
    queue_table_relid,
    queue_table_id_field,
    min_cutoff_id,
    quote_ident(queue_table_timestamp),
    -- Queue timestamps stored in a local zone are normalized to timestamptz before comparing.
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
    quote_literal(min_source_time),
    quote_literal(p_add_interval::TEXT)
    )
  , E'\n\n')
INTO v_sql
FROM eligible_queue_tables_for_purge;

IF v_sql IS NOT NULL THEN
  RAISE LOG 'Purging Queue: %', v_sql;
  EXECUTE v_sql;
END IF;

END;
$BODY$
LANGUAGE plpgsql;

CREATE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps()
RETURNS VOID AS
$BODY$
BEGIN
/****
This function will be used to refresh the fact_table_dep_queue_table_deps table.
The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables.
This will be run with every call of load().
This may not be the most efficient method, but it is certainly reliable and fast.
*/

/****
Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables)
to build the fact_table_dep_queue_table_deps table.
*/ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist */ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, 
queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type) RETURNS TABLE (prepare_sql text, execute_sql text, unmapped_src_columns text[], unmapped_dest_columns text[]) AS $BODY$ DECLARE v_pkey_fields TEXT[]; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT array_agg(a.attname ORDER BY pk.rn) INTO v_pkey_fields FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik; RETURN QUERY WITH source_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_source_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , destination_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , unmapped_source_columns AS ( SELECT 
array_agg(s.column_name::text) AS unmapped_columns_src FROM source_columns s WHERE NOT EXISTS (SELECT 1 FROM destination_columns d WHERE d.column_name = s.column_name) ) , unmapped_dest_columns AS ( SELECT array_agg(d.column_name::text) AS unmapped_columns_dest FROM destination_columns d WHERE NOT EXISTS (SELECT 1 FROM source_columns s WHERE d.column_name = s.column_name) ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT unnest AS pkey_field FROM unnest(v_pkey_fields)) pk ) , info AS ( SELECT string_agg( CASE WHEN sc.column_name IS NOT NULL THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN sc.column_name IS NOT NULL AND (p_ignore_diff_for_columns IS NULL OR sc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN sc.column_name IS NOT NULL AND NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' 
|| dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_join FROM destination_columns dc CROSS JOIN pkeys LEFT JOIN source_columns sc ON dc.column_name = sc.column_name GROUP BY pkeys.pkey_fields, pkeys.pkey_join ) , sql_snippets AS ( SELECT $$ DROP TABLE IF EXISTS count_tracker; CREATE TEMP TABLE count_tracker (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)); INSERT INTO count_tracker VALUES (NULL, NULL, FALSE, NULL); $$::TEXT AS count_tracker_sql , $$ DROP TABLE IF EXISTS actual_delta; CREATE TEMP TABLE actual_delta AS WITH final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_source_relation::TEXT||$$ EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ DROP TABLE IF EXISTS removed_keys; CREATE TEMP TABLE removed_keys AS SELECT $$||pkey_fields||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE NOT EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$); $$ AS removed_keys_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM $$||p_source_relation::TEXT||$$ s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ); $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql , $$ /*** We add a primary key to the actual_delta table to ensure there are no duplicate keys. 
***/ ALTER TABLE actual_delta ADD PRIMARY KEY ($$||pkey_fields||$$); $$ AS add_delta_pkey_sql , $$ /**** This part is not implemented yet, but partially complete. If we decide we want to figure out that >50% of the table will be updated, we could decide to truncate. But then we have to balance the desire for that with more read queries to figure it out. To implement, add the type full_refresh_truncate to fact_loader.table_load_type, and uncomment code. We would also have to add the logic to find actual keys added, then subtract it from actual_delta to get the net updates expected. If this is over 50%, we should truncate and re-insert all data. ***/ DROP TABLE IF EXISTS percent_of_destination; CREATE TEMP TABLE percent_of_destination AS SELECT (((SELECT COUNT(1) FROM actual_delta) - (SELECT COUNT(1) FROM added_keys))::NUMERIC / (SELECT COUNT(1) FROM $$||p_destination_relation::TEXT||$$)::NUMERIC)::NUMERIC(8,2) AS pct; UPDATE count_tracker SET pct_dest = (SELECT pct FROM percent_of_destination); $$ AS percent_change_sql ,$$ DO $LOCK_SAFE_DDL$ BEGIN SET lock_timeout TO '10ms'; IF (SELECT pct FROM percent_of_destination) >= 0.5 THEN LOOP BEGIN TRUNCATE $$||p_destination_relation::TEXT||$$; UPDATE count_tracker SET truncated = true; EXIT; EXCEPTION WHEN lock_not_available THEN RAISE WARNING 'Could not obtain immediate lock for SQL %, retrying', p_sql; PERFORM pg_sleep(3); WHEN OTHERS THEN RAISE; END; END LOOP; END IF; RESET lock_timeout; END $LOCK_SAFE_DDL$ ; $$ AS lock_safe_truncate_sql ,$$ --Delete keys that are no longer in your new version DELETE FROM $$||p_destination_relation::TEXT||$$ d WHERE EXISTS (SELECT 1 FROM removed_keys s WHERE $$||pkey_join||$$); GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET deleted = v_row_count; $$ AS delete_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET 
$$||upsert_list||$$ ; GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET upserted = v_row_count; $$ AS upsert_sql FROM info ) SELECT count_tracker_sql|| CASE /*** not implemented truncate pattern WHEN p_load_type IN('full_refresh','full_refresh_truncate') THEN ***/ WHEN p_load_type = 'full_refresh' THEN removed_keys_sql||actual_delta_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ WHEN p_load_type = 'delta' THEN actual_delta_sql||key_join_exists_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ END||$$ $$|| /*** not implemented truncate pattern CASE WHEN p_load_type = 'full_refresh_truncate' THEN percent_change_sql ELSE '' END ***/ '' AS prepare_sql , $$ --THIS SHOULD BE RUN IN A TRANSACTION DO $SCRIPT$ DECLARE v_row_count INT; v_results RECORD; BEGIN $$|| CASE /*** not implemented truncate pattern WHEN p_load_type = 'full_refresh_truncate' THEN lock_safe_truncate_sql||delete_sql||upsert_sql ***/ WHEN p_load_type = 'full_refresh' THEN delete_sql||upsert_sql WHEN p_load_type = 'delta' THEN upsert_sql END||$$ FOR v_results IN SELECT * FROM count_tracker LOOP RAISE LOG 'upserted: %, deleted: %, truncated: %, pct_dest: %', v_results.upserted, v_results.deleted, v_results.truncated, v_results.pct_dest; END LOOP; END $SCRIPT$; $$ AS execute_sql , (SELECT unmapped_columns_src FROM unmapped_source_columns) AS unmapped_src_columns , (SELECT unmapped_columns_dest FROM unmapped_dest_columns) AS unmapped_dest_columns FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. 
This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER 
BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. 
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_validator (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_unmapped_src_columns TEXT[], p_unmapped_dest_columns TEXT[], p_ignore_unmapped_columns BOOLEAN) RETURNS VOID AS $BODY$ DECLARE v_messages TEXT = ''; BEGIN IF NOT p_ignore_unmapped_columns AND p_unmapped_src_columns IS NOT NULL THEN v_messages = format($$You have unmapped columns (%s) in the source table %s. All source columns must be named identically to destination in order to map. If you are certain you want to ignore these columns, meaning they will not update anything in destination table %s, add the final argument to this function as TRUE. $$ , array_to_string(p_unmapped_src_columns,', ') , p_source_relation::TEXT , p_destination_relation::TEXT); END IF; IF NOT p_ignore_unmapped_columns AND p_unmapped_dest_columns IS NOT NULL THEN v_messages = v_messages||format($$ You have unmapped columns (%s) in the destination table %s. All destination columns must be named identically to source in order to map. 
If you are certain you want to ignore these columns, meaning the source table %s does not contain all columns in destination table, add the final argument to this function as TRUE.$$ , array_to_string(p_unmapped_dest_columns,', ') , p_destination_relation::TEXT , p_source_relation::TEXT); END IF; IF v_messages <> '' THEN RAISE EXCEPTION '%', v_messages; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; c_lock_cutoff_refresh INT = 99995; BEGIN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.fact_tables WHERE enabled ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority LOOP IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = v_fact_record.fact_table_id) THEN --Load fact table PERFORM fact_loader.load(v_fact_record.fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); RETURN TRUE; END IF; END LOOP; RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.0--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. 
\quit ALTER TABLE fact_loader.key_retrieval_sequences ADD COLUMN pass_queue_table_change_date_at_tz TEXT NULL; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; ALTER TABLE fact_loader.key_retrieval_sequences ADD CONSTRAINT verify_valid_tz CHECK (pass_queue_table_change_date_at_tz IS NULL OR (now() AT TIME ZONE pass_queue_table_change_date_at_tz IS NOT NULL)); --This check constraint could have been added in v. 1.0 ALTER TABLE fact_loader.queue_tables ADD CONSTRAINT verify_valid_tz CHECK (queue_table_tz IS NULL OR (now() AT TIME ZONE queue_table_tz IS NOT NULL)); ALTER TABLE fact_loader.process_queue ADD COLUMN source_change_date DATE NULL; COMMENT ON COLUMN fact_loader.process_queue.source_change_date IS 'Corresponds to fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz. If this field is populated, a function will be expected that has args (key_value, source_change_date) based on this process_queue table.'; --This should have already been added in v. 
1.0 SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_deps', ''); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL; ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid DROP NOT NULL, ALTER COLUMN default_update_merge_proid DROP NOT NULL, ALTER COLUMN default_delete_merge_proid DROP NOT NULL; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM fact_loader.process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second. 
*/ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. 
*/ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join, string_agg($$d.$$||quote_ident(pkey_field)||$$ = $$||(SELECT arg_params FROM function_schema),E'\nAND ') AS pkey_join_to_arg FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS 
matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs , pkey_join_to_arg FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs, pkey_join_to_arg ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE $$||pkey_join_to_arg AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. 
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.1--1.2.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit --To support non-replicated queue tables ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id DROP NOT NULL; CREATE TABLE fact_loader.fact_table_refresh_logs (fact_table_refresh_log_id SERIAL PRIMARY KEY, fact_table_id INT REFERENCES fact_loader.fact_tables (fact_table_id), refresh_attempted_at TIMESTAMPTZ, messages TEXT); ALTER TABLE fact_loader.fact_tables ADD COLUMN use_daily_schedule BOOLEAN NOT NULL DEFAULT FALSE, ADD COLUMN daily_scheduled_time TIME NULL, ADD COLUMN daily_scheduled_tz TEXT NULL, ADD COLUMN daily_scheduled_proid REGPROC, ADD CONSTRAINT verify_valid_daily_tz CHECK (daily_scheduled_tz IS NULL OR (now() AT TIME ZONE daily_scheduled_tz IS NOT NULL)), ADD CONSTRAINT daily_schedule_configured_correctly CHECK ((NOT use_daily_schedule) OR (use_daily_schedule AND daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL)); CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT 
last_refresh_succeeded; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS SELECT * FROM fact_loader.fact_tables WHERE enabled AND (NOT use_daily_schedule OR --Only run this job according to the same day of the daily_scheduled_time --according to configured timezone ( (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME ) ) ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN (use_daily_schedule AND (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME) THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; BEGIN /*** There are 2 basic steps to this load: 1. Execute the single daily-refresh function 2. 
Update the metadata indicating the last attempt time */ SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()' INTO v_execute_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id AND use_daily_schedule; IF v_execute_sql IS NULL THEN RETURN FALSE; END IF; EXECUTE v_execute_sql; UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = p_fact_table_id; RETURN TRUE; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; BEGIN IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; --Load fact table and handle exceptions to auto-disable job and log errors in case of error BEGIN --Scheduled daily job IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id); --Queue-based job ELSE PERFORM fact_loader.load(p_fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. 
It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id; INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, messages) VALUES (p_fact_table_id, now(), SQLERRM); RETURN FALSE; END; ELSE RETURN FALSE; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN RETURN TRUE; END IF; END LOOP; --If no jobs returned true, then return false RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.2--1.3.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. 
\quit DROP VIEW IF EXISTS fact_loader.queue_deps_all_with_retrieval; DROP VIEW IF EXISTS fact_loader.queue_deps_all; DROP VIEW IF EXISTS fact_loader.logical_subscription; DROP VIEW IF EXISTS fact_loader.prioritized_jobs; DROP VIEW IF EXISTS fact_loader.unresolved_failures; DROP FUNCTION IF EXISTS fact_loader.sql_builder(int); CREATE OR REPLACE FUNCTION fact_loader.add_batch_id_fields() RETURNS VOID AS $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT queue_table_relid FROM fact_loader.queue_tables qt INNER JOIN pg_class c ON c.oid = qt.queue_table_relid INNER JOIN pg_namespace n ON n.oid = c.relnamespace WHERE NOT EXISTS (SELECT 1 FROM information_schema.columns col WHERE col.column_name = 'fact_loader_batch_id' AND col.table_schema = n.nspname AND col.table_name = c.relname) LOOP v_sql = format($F$ ALTER TABLE %s ADD COLUMN fact_loader_batch_id BIGINT DEFAULT nextval('fact_loader.batch_id'); $F$, v_rec.queue_table_relid::text, v_rec.queue_table_relid::text); RAISE LOG 'ADDING fact_loader_batch_id COLUMN TO queue table %: %', v_rec.queue_table_relid::text, v_sql; EXECUTE v_sql; END LOOP; END $BODY$ LANGUAGE plpgsql; ALTER TABLE fact_loader.queue_tables ADD COLUMN purge BOOLEAN NOT NULL DEFAULT TRUE; UPDATE fact_loader.fact_table_refresh_logs SET messages = jsonb_build_object('Message', messages) WHERE messages IS NOT NULL; --Will be re-added via \i in sql file ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN messages TYPE jsonb USING messages::jsonb; --This was a problem from the start ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id TYPE OID; --This should have been done from the start SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_de_fact_table_dep_queue_table_de_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps_fact_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables_fact_table_id_seq', ''); SELECT 
pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences_key_retrieval_sequence_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps_queue_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables_queue_table_id_seq', ''); --No indexes or anything but allow debugging CREATE UNLOGGED TABLE fact_loader.debug_process_queue (LIKE fact_loader.process_queue); ALTER TABLE fact_loader.debug_process_queue ADD PRIMARY KEY (process_queue_id); -- Now a temp table to avoid serialization contention DROP TABLE fact_loader.process_queue; --Make this a trigger to check dep fact tables ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_base_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_parent_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables DROP CONSTRAINT daily_schedule_configured_correctly; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_schedule_configured_correctly CHECK (NOT use_daily_schedule OR (use_daily_schedule AND ((daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL) OR (depends_on_base_daily_job_id IS NOT NULL AND depends_on_parent_daily_job_id IS NOT NULL)))); --These columns have never been used ALTER TABLE fact_loader.fact_tables DROP COLUMN attempt_number, DROP COLUMN retries_allowed; --This is the usual case and makes sense ALTER TABLE fact_loader.key_retrieval_sequences ALTER COLUMN level SET DEFAULT 1; --Need to have a more reliable dependency knowledge for scheduled jobs ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_deps REGCLASS[]; ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_dep_delay_tolerance INTERVAL; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_deps_correctly_configured CHECK ((daily_scheduled_deps IS NULL AND 
daily_scheduled_dep_delay_tolerance IS NULL) OR (daily_scheduled_deps IS NOT NULL AND daily_scheduled_dep_delay_tolerance IS NOT NULL)); --Log all events and add pruning ALTER TABLE fact_loader.fact_table_refresh_logs ADD COLUMN refresh_finished_at TIMESTAMPTZ; ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN fact_table_refresh_log_id TYPE BIGINT; -- Handle race conditions by changing to batch usage CREATE SEQUENCE fact_loader.batch_id; SELECT fact_loader.add_batch_id_fields(); CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("replication_set_name" text, "queue_of_base_table_relid" regclass, "if_id" oid, "if_name" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name , qt.queue_of_base_table_relid , n.if_id , n.if_name --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise , CASE WHEN qt.pglogical_node_if_id IS NULL THEN now() ELSE t.source_time END AS source_time FROM fact_loader.queue_tables qt LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$; ELSE RETURN QUERY SELECT NULL::TEXT AS replication_set_name , qt.queue_of_base_table_relid , NULL::OID AS if_id , NULL::NAME AS if_name --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt; END IF; END; 
$BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. 
*/ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. */ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL AND qt.purge /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s IN (SELECT %s FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s) FOR UPDATE SKIP LOCKED ); $$, queue_table_relid, 'fact_loader_batch_id', 'fact_loader_batch_id', queue_table_relid, 'fact_loader_batch_id', min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE DEBUG 'Purging Queue: %', v_sql; BEGIN EXECUTE v_sql; EXCEPTION WHEN 
serialization_failure THEN RAISE LOG 'Serialization failure in queue purging for transaction % - skipping.', txid_current()::text; WHEN OTHERS THEN RAISE; END; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN --If any configured functions use temp tables, --must discard to avoid them hanging around in the idle background worker session DISCARD TEMP; --Log job times INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at) VALUES (v_fact_record.fact_table_id, now(), clock_timestamp()); --Return true meaning the fact table was refreshed (this applies even if there was no new data) RETURN TRUE; END IF; END LOOP; --If no jobs returned true, then return false RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; v_err JSONB; v_errmsg TEXT; v_errdetail TEXT; v_errhint TEXT; v_errcontext TEXT; BEGIN -- We except rare serialization failures here which we will ignore and move to the next record -- Anything else should be raised BEGIN IF EXISTS (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id FOR UPDATE SKIP LOCKED) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. 
*/ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; --Load fact table and handle exceptions to auto-disable job and log errors in case of error BEGIN --Scheduled daily job IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id); --Queue-based job ELSE PERFORM fact_loader.load(p_fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); END IF; RETURN TRUE; EXCEPTION WHEN OTHERS THEN GET STACKED DIAGNOSTICS v_errmsg = MESSAGE_TEXT, v_errdetail = PG_EXCEPTION_DETAIL, v_errhint = PG_EXCEPTION_HINT, v_errcontext = PG_EXCEPTION_CONTEXT; UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id; v_err = jsonb_strip_nulls( jsonb_build_object( 'Message', v_errmsg, 'Detail', case when v_errdetail = '' then null else v_errdetail end, 'Hint', case when v_errhint = '' then null else v_errhint end, 'Context', case when v_errcontext = '' then null else v_errcontext end) ); INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at, messages) VALUES (p_fact_table_id, now(), clock_timestamp(), v_err); RETURN FALSE; END; ELSE RETURN FALSE; END IF; EXCEPTION WHEN serialization_failure THEN RAISE LOG 'Serialization failure on transaction % attempting to lock % - skipping.', txid_current()::text, p_fact_table_id::text; RETURN FALSE; WHEN OTHERS THEN RAISE; END; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; 
v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. */ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue SELECT * FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT last_refresh_succeeded; CREATE OR REPLACE FUNCTION fact_loader.safely_terminate_workers() RETURNS TABLE (number_terminated INT, number_still_live INT, pids_still_live INT[]) AS $BODY$ /**** It is not a requirement to use this function to terminate workers. Because workers are transactional, you can simply terminate them and no data loss will result in pg_fact_loader. Likewise, a hard crash of any system using pg_fact_loader will recover just fine upon re-launching workers. Still, it is ideal to avoid bloat to cleanly terminate workers and restart them using this function to kill them, and launch_workers(int) to re-launch them. 
*/ BEGIN RETURN QUERY WITH try_term_pids AS ( SELECT pid, CASE WHEN state = 'idle' AND state_change BETWEEN SYMMETRIC now() - interval '5 seconds' AND now() - interval '55 seconds' THEN pg_terminate_backend(pid) ELSE FALSE END AS terminated FROM pg_stat_activity WHERE usename = 'postgres' AND query = 'SELECT fact_loader.worker();') SELECT SUM(CASE WHEN terminated THEN 1 ELSE 0 END)::INT AS number_terminated_out, SUM(CASE WHEN NOT terminated THEN 1 ELSE 0 END)::INT AS number_still_live_out, (SELECT array_agg(pid) FROM try_term_pids WHERE NOT terminated) AS pids_still_live_out FROM try_term_pids; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.launch_workers(number_to_launch int) RETURNS INT[] AS $BODY$ DECLARE v_pids INT[]; BEGIN FOR i IN 1..number_to_launch LOOP v_pids = array_append(v_pids, fact_loader.launch_worker()); /* It's not strictly required to not launch all workers simultaneously, but it's also a little more invasive to do that, probably requiring more advisory lock skips. Better to just sleep 1 second between launches. */ PERFORM pg_sleep(1); END LOOP; RETURN v_pids; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!! 
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. 
*/ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. */ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', 
unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN 
'(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), 
'(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, format($$ %s AS fact_table_id, %s AS queue_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time)) AS metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 
'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END, quote_literal(maximum_cutoff_time)) AS global_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table E'\nINNER JOIN '||queue_of_base_table_relid::TEXT||' b'|| E'\n ON q.'||quote_ident(queue_table_key)||'::'||queue_of_base_table_key_type||' = b.'||quote_ident(queue_of_base_table_key) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. 
format($$ %s AND q.%s < %s %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END) AS global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_insert_sql) AS queue_insert_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_update_sql) AS queue_update_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.global_where_sql||nrs.where_for_delete_sql) AS 
queue_delete_sql, format($$ SELECT %s FROM %s WHERE %s $$, nrs.metadata_select_columns, nrs.queue_table_aliased, nrs.global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to 
follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. 
--Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = 
ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second. */ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, queue_table_id INT, 
queue_table_dep_id INT, fact_table_dep_id INT, fact_table_dep_queue_table_dep_id INT, queue_table_id_field BIGINT, fact_loader_batch_id BIGINT, maximum_cutoff_time TIMESTAMPTZ) AS $BODY$ DECLARE v_raw_sql text; BEGIN SELECT raw_queued_changes_sql INTO v_raw_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_raw_sql; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.gathered_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, proid REGPROC, key_value TEXT, source_change_date DATE) AS $BODY$ DECLARE v_gather_sql text; BEGIN SELECT gathered_queued_changes_sql INTO v_gather_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_gather_sql; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; v_deps regclass[]; v_dep_delay_tolerance interval; v_delayed_msg text; BEGIN /*** There are 3 basic steps to this load: 1. If dependencies are listed, verify they are up to date enough 2. Execute the single daily-refresh function 3. 
Update the metadata indicating the last attempt time */

    -- Look up the scheduled proid and optional dependency config for this job.
    -- v_execute_sql stays NULL when the job is not a daily-scheduled one.
    SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()',
        daily_scheduled_deps,
        daily_scheduled_dep_delay_tolerance
    INTO
        v_execute_sql,
        v_deps,
        v_dep_delay_tolerance
    FROM fact_loader.fact_tables
    WHERE fact_table_id = p_fact_table_id
        AND use_daily_schedule;

    -- Not a daily-scheduled job (or no proid configured): nothing to do
    IF v_execute_sql IS NULL THEN
        RETURN FALSE;
    END IF;

    -- Step 1: if dependencies are configured, verify none of them is delayed
    -- beyond the configured tolerance before running the load.
    IF v_deps IS NOT NULL THEN
        WITH deps AS
        (SELECT unnest(v_deps) AS dep)

        -- A dep may be either a queue table (use replication source_time) or
        -- another fact table (use last_refresh_source_cutoff); check both.
        , delays AS (
        SELECT dep, now() - source_time as delay_interval
        FROM fact_loader.queue_table_delay_info() qtd
        INNER JOIN deps d ON d.dep = qtd.queue_of_base_table_relid
        UNION ALL
        SELECT dep, now() - last_refresh_source_cutoff as delay_interval
        FROM fact_loader.fact_tables ft
        INNER JOIN deps d ON d.dep = ft.fact_table_relid
        )

        SELECT string_agg(dep::text||': Delayed '||delay_interval::text, ', ')
        INTO v_delayed_msg
        FROM delays
        WHERE delay_interval > v_dep_delay_tolerance;

        -- Deliberately loud failure: a stale dependency is treated as an
        -- alarm-worthy event rather than a silent skip.
        IF v_delayed_msg IS NOT NULL THEN
            RAISE EXCEPTION '%', v_delayed_msg;
        END IF;
    END IF;

    -- Step 2: execute the single daily-refresh function
    EXECUTE v_execute_sql;

    -- Step 3: record the attempt time and success on the job metadata
    UPDATE fact_loader.fact_tables ft
    SET last_refresh_attempted_at = now(),
        last_refresh_succeeded = TRUE
    WHERE fact_table_id = p_fact_table_id;

    RETURN TRUE;

END;
$BODY$
LANGUAGE plpgsql;

-- Trigger function: opportunistically prunes fact_table_refresh_logs rows older
-- than 90 days. It only fires on every step-th inserted log row (see the WHEN
-- clause of the trigger below), and deletes at most max_rows (= step * overdrive)
-- rows per firing, so a single insert never pays for an unbounded delete.
-- FOR UPDATE SKIP LOCKED lets concurrent firings skip rows already being pruned.
CREATE OR REPLACE FUNCTION fact_loader.fact_table_refresh_logs_pruner() RETURNS trigger
LANGUAGE plpgsql
AS $$
declare
    step int := 1000; -- step should equal the firing frequency in trigger definition
    overdrive int := 2; -- overdrive times step = max rows (see below)
    max_rows int := step * overdrive;
    rows int;
begin
    delete from fact_loader.fact_table_refresh_logs
    where fact_table_refresh_log_id in (
        select fact_table_refresh_log_id
        from fact_loader.fact_table_refresh_logs
        where refresh_attempted_at < now() - '90 days'::interval
        -- do not do the literal interval value above as a declare parameter
        order by fact_table_refresh_log_id
        limit max_rows
        for update skip locked
    );
    -- NOTE(review): 'rows' is populated but not read afterward in this body;
    -- presumably kept for debugging — confirm before removing.
    get diagnostics rows = row_count;
    return null;
end
$$;

-- Statement continues on the next line: AFTER INSERT ON fact_loader.fact_table_refresh_logs
CREATE TRIGGER fact_table_refresh_logs_pruner
AFTER INSERT ON
fact_loader.fact_table_refresh_logs FOR EACH ROW WHEN ((new.fact_table_refresh_log_id % 1000::bigint) = 0) EXECUTE PROCEDURE fact_loader.fact_table_refresh_logs_pruner(); /*** This function exists mostly to easily mock out for testing purposes. */ CREATE FUNCTION fact_loader.logical_subscription() RETURNS TABLE (sub_origin_if OID, sub_replication_sets text[]) AS $BODY$ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN RETURN QUERY EXECUTE $$ SELECT sub_origin_if, sub_replication_sets FROM pglogical.subscription; $$; ELSE RETURN QUERY SELECT NULL::OID, NULL::TEXT[]; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("replication_set_name" text, "queue_of_base_table_relid" regclass, "if_id" oid, "if_name" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name , qt.queue_of_base_table_relid , n.if_id , n.if_name --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise , CASE WHEN qt.pglogical_node_if_id IS NULL THEN now() ELSE t.source_time END AS source_time FROM fact_loader.queue_tables qt LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$; ELSE RETURN QUERY SELECT NULL::TEXT AS replication_set_name , qt.queue_of_base_table_relid , NULL::OID AS if_id , NULL::NAME AS if_name --source_time is now() if queue 
tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. */ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd 
ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist Add not exists because we think allowing all records to insert and conflict could be cause of serialization errors in repeatable read isolation. */ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps new WHERE NOT EXISTS (SELECT 1 FROM fact_loader.fact_table_dep_queue_table_deps existing WHERE existing.fact_table_dep_id = new.fact_table_dep_id AND existing.queue_table_dep_id = new.queue_table_dep_id) ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql; -- These fields now becomes based on batch, not based on queue_table_id_field DO $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT format($$ UPDATE fact_loader.%s SET last_cutoff_id = (SELECT fact_loader_batch_id FROM %s WHERE %s = %s) WHERE %s = %s; $$, CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_deps' ELSE 'fact_table_dep_queue_table_deps' END, queue_table_relid::text, queue_table_id_field::text, last_cutoff_id::text, CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_dep_id' ELSE 'fact_table_dep_queue_table_dep_id' END, CASE WHEN fact_table_dep_id IS NULL THEN queue_table_dep_id ELSE fact_table_dep_queue_table_dep_id END ) AS sql 
FROM fact_loader.queue_deps_all WHERE last_cutoff_id IS NOT NULL LOOP v_sql = v_rec.sql; RAISE LOG 'Updating Extension pg_fact_loader Executed: %', v_sql; EXECUTE v_sql; END LOOP; END$BODY$; COMMENT ON TABLE fact_loader.debug_process_queue IS 'A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG.'; COMMENT ON TABLE fact_loader.fact_table_dep_queue_table_deps IS $$Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables that depend on other fact table changes. Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table. Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps. The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples of this. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_queue_table_dep_id IS 'Unique identifier'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_id IS 'fact_table_dep for this specific dependency.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.queue_table_dep_id IS 'Inherited queue_table_dep that this dependent fact table depends on.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_id IS $$This is unique and maintained separately from last_cutoff_id in queue_table_deps, as it refers to the last_cutoff_id for this dependent fact table. It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. 
The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_source_time IS $$This is unique and maintained separately from last_cutoff_source_time in queue_table_deps, as it refers to the last_cutoff_source_time for this dependent fact table. It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. It will also never go past its parent(s) in time. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.insert_merge_proid IS $$Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.update_merge_proid IS $$Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. 
The way to retrieve this key for this queue table - dependent fact table pair
is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore update events.

See the regression suite in ./sql and ./expected for examples of this.$$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.delete_merge_proid IS
$$Initially populated by default_delete_merge_proid from fact_table_deps, but can be
overridden if a different proid is required. This is the function oid to execute on
DELETE events *for this dependent fact table* - it accepts a single value as its arg
which is typically the key that has changed and needs to be updated.

The way to retrieve this key for this queue table - dependent fact table pair
is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore delete events.

See the regression suite in ./sql and ./expected for examples of this.$$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';
COMMENT ON TABLE fact_loader.fact_table_deps IS 'For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here.';
COMMENT ON COLUMN fact_loader.fact_table_deps.fact_table_dep_id IS 'Unique identifier.';
COMMENT ON COLUMN fact_loader.fact_table_deps.parent_id IS 'The parent fact_table_id that the child depends on.';
COMMENT ON COLUMN fact_loader.fact_table_deps.child_id IS 'The child fact_table_id that will run only after the parent is updated.';
COMMENT ON COLUMN fact_loader.fact_table_deps.default_insert_merge_proid IS
$$Default function to use for insert events to update child tables.
This may need to be modified for each individual inherited fact_table_dep_queue_table_deps
if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_update_merge_proid IS $$Default function to use for update events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_delete_merge_proid IS $$Default function to use for delete events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.fact_table_refresh_logs IS 'Used to log both job run times and exceptions.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_refresh_log_id IS 'Unique identifier,'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_id IS 'Fact table that created the log.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_attempted_at IS 'The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures).'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_finished_at IS 'The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.messages IS 'Only for failures - Error message content in JSON format - including message, message detail, context, and hint.'; COMMENT ON TABLE fact_loader.fact_tables IS 'Each fact table to be built via 
pg_fact_loader, which also drives the worker. These are also referred to as "jobs".'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_id IS 'Unique identifier for the fact table or job - also referred to as job_id'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_relid IS 'The oid of the fact table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_agg_proid IS $$NOT REQUIRED. The aggregate function definition for the fact table. This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now.$$; COMMENT ON COLUMN fact_loader.fact_tables.enabled IS 'Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE.'; COMMENT ON COLUMN fact_loader.fact_tables.priority IS 'Determines the order in which the job runs (in combination with other sorting factors)'; COMMENT ON COLUMN fact_loader.fact_tables.force_worker_priority IS 'If marked TRUE, this fact table will be prioritized in execution order above all other factors.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_source_cutoff IS 'The data cutoff time of the last refresh - only records older than this have been updated.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_attempted_at IS 'The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_succeeded IS 'Whether or not the last run of the job succeeded. 
NULL if it has never been run.'; COMMENT ON COLUMN fact_loader.fact_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.fact_tables.use_daily_schedule IS 'If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_time IS 'The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_tz IS 'The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_proid IS $$The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples.$$; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_base_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time.'; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_parent_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_deps IS 'OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. 
This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_dep_delay_tolerance IS 'OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON TABLE fact_loader.key_retrieval_sequences IS $$How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. 
NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.key_retrieval_sequence_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.queue_table_dep_id IS 'Which fact table - queue table record this is for (queue_table_deps)'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.filter_scope IS $$NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.level IS $$Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns IS $$What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.is_fact_key IS 'Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. 
Otherwise, FALSE if you need to join to get more information.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_relation IS 'Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_column IS 'Join to this column of join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns_from_join IS 'Return these columns from join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_return_is_fact_key IS 'If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; COMMENT ON VIEW fact_loader.queue_deps_all_with_retrieval IS 'The master view which builds on queue_deps_all to include key_retrieval_sequences. This is the main view used by sql_builder(int) to gather all queued changes.'; COMMENT ON TABLE fact_loader.queue_table_deps IS $$Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. 
**NOTE** that anything that exists in queue_table_dep is assumed to require that its queue data not be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.queue_table_deps.fact_table_id IS 'Fact table to tie together with a queue table it depends on.'; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_id IS 'Queue table to tie together with a fact table that needs its changes.'; COMMENT ON COLUMN fact_loader.queue_table_deps.relevant_change_columns IS $$Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_id IS $$The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_source_time IS $$The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times.
THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.queue_table_deps.insert_merge_proid IS $$Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.update_merge_proid IS $$Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore update events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.delete_merge_proid IS $$Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. 
NULL to ignore delete events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.queue_tables IS 'Each queue table along with the base table to which it belongs.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_id IS 'Unique identifier for queue tables.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_relid IS 'The oid of the queue table itself. regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_of_base_table_relid IS 'The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table.$$; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_tz IS $$**NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ.
If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz.$$;
COMMENT ON COLUMN fact_loader.queue_tables.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.queue_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';
COMMENT ON COLUMN fact_loader.queue_tables.purge IS 'Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table.';
COMMENT ON VIEW fact_loader.unresolved_failures IS 'Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring.';

/* pg_fact_loader--1.3--1.4.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

-- Dropped here because it is re-created later in this script with a new definition.
DROP FUNCTION fact_loader.raw_queued_changes(int);
-- NOTE(review): presumably dropped because debug rows are appended across loads and
-- process_queue_id values can repeat - confirm against the 1.4 release notes.
ALTER TABLE fact_loader.debug_process_queue DROP CONSTRAINT debug_process_queue_pkey;

-- load: run one full load cycle for the given fact table id - build the dynamic SQL,
-- populate the process_queue temp table, execute it, then advance cutoff metadata.
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
    v_process_queue_sql text;     -- SQL that fills the process_queue temp table
    v_execute_sql text;           -- SQL that applies queued changes to the fact table
    v_metadata_update_sql text;   -- SQL that advances last_cutoff metadata
    v_debug_rec record;
    v_debug_text text = '';
BEGIN

/***
There are 3 basic steps to this load:
    1. Gather all queue table changes and insert them into a consolidated process_queue
    2. Execute the process_queue to apply those changes to the fact table
    3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
*/
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG.
*/
IF current_setting('log_min_messages') = 'debug3' THEN
    INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date)
    -- the row timestamps are not populated, so we set them here
    SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date
    FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.
last_refresh_succeeded will be marked true always for now.
It could in the future be used to indicate a failure in case of a caught error.
*/
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
    format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

-- sql_builder: build, for one fact table, all of the dynamic SQL a load cycle needs:
--   raw_queued_changes_sql      - raw queued batch ids per queue table
--   gathered_queued_changes_sql - the keys/procs to process, grouped and ordered
--   process_queue_sql           - creates and fills the process_queue temp table
--   metadata_update_sql         - advances last_cutoff metadata after a load
CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
              gathered_queued_changes_sql text,
              process_queue_sql text,
              metadata_update_sql text)
AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.
In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null
Otherwise, they must be specified separately.
 */
WITH RECURSIVE queue_deps_with_insert_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'I' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_update_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'U' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_delete_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'D' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

/****
Recursively build the SQL for any INSERT events found in the queues.

The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data,
in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an
example of this, see the test cases involving the test.order_product_promos table.
*/
-- insert_sql_builder: anchor (level 1) emits the key select column and first join;
-- each recursion appends the next level's join taken from key_retrieval_sequences.
, insert_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE
      WHEN is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', b.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
             E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_insert_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    -- NOTE(review): the multi-column ELSE branches in this recursive member look
    -- inconsistent with the anchor member (no array[...] wrapper around the unnest
    -- argument); they appear unexercised for multi-column keys at level > 1 - confirm
    -- against the regression suite before relying on them.
    key_select_column||CASE
      WHEN c.is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
             E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM insert_sql_builder r
    INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

-- update_sql_builder: identical construction to insert_sql_builder, but driven by
-- the UPDATE-scoped retrieval rows.
, update_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE
      WHEN is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', b.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
             E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_update_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE
      WHEN c.is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
          ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
          ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
             E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM update_sql_builder r
    INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

-- delete_sql_builder: same shape again for DELETE events, except keys come from the
-- audit payload (before_change) because the base-table row may already be gone.
, delete_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead
    CASE
      WHEN is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
          ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
          ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key'
          END
      ELSE ''
    END AS delete_key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE format($$
      --Join to either the base table, or the audit table, one of which
      --will be missing the key in a delete case
        INNER JOIN LATERAL (
          SELECT %s FROM %s jb WHERE %s = %s
          UNION ALL
          SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
        $$,
        quote_ident(return_columns_from_join[1]),
        join_to_relation::TEXT,
        (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
        'jb.'||quote_ident(join_to_column),
        '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
        join_to_relation_queue::TEXT,
        (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
        '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
        /****
        We use the higher level here just to be consistent with aliases from insert/update key retrieval
         */
        'j'||level
        )
    END AS delete_key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_delete_retrieval
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    delete_key_select_column||CASE
      WHEN c.is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
          ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
          WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
          -- NOTE(review): 'array[' below is closed with ')' rather than '])', so this
          -- multi-column branch would generate invalid SQL; it appears unexercised by
          -- the regression suite - confirm before fixing.
          ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
          END
      ELSE ''
    END AS delete_key_select_column,
    delete_key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE format($$
      --Join to either the base table, or the audit table, one of which
      --will be missing the key in a delete case
        INNER JOIN LATERAL (
          SELECT %s FROM %s jb WHERE %s = %s
          UNION ALL
          SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
        $$,
        quote_ident(return_columns_from_join[1]),
        join_to_relation::TEXT,
        'j'||r.level||'.'||quote_ident(return_columns[1]),
        'jb.'||quote_ident(join_to_column),
        '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
        join_to_relation_queue::TEXT,
        'j'||r.level||'.'||quote_ident(return_columns[1]),
        '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
        /****
        We use the higher level here just to be consistent with aliases from insert/update key retrieval
         */
        'j'||c.level
        )
    END AS delete_key_retrieval_sql,
    r.source_change_date_select
  FROM delete_sql_builder r
    INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

-- field_vars: per queue-table-dependency snippets (shared select lists and the
-- time zone correction) that frame all of the generated SQL below.
, field_vars AS (
  SELECT *,
    format($$
      %s AS fact_table_id,
      %s AS queue_table_dep_id,
      %s::INT AS fact_table_dep_id,
      %s::INT AS fact_table_dep_queue_table_dep_id,
      %s AS queue_table_id_field,
      q.fact_loader_batch_id,
      %s::TIMESTAMPTZ AS maximum_cutoff_time,
      -- We must not ignore ids which are above maximum_cutoff_time
      -- but below the highest id which is below maximum_cutoff_time
      MIN(q.fact_loader_batch_id) FILTER
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id
    $$,
      fact_table_id,
      queue_table_dep_id,
      (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END),
      (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END),
      'q.'||quote_ident(queue_table_id_field),
      quote_literal(maximum_cutoff_time),
      'q.'||quote_ident(queue_table_timestamp),
      -- shift naive changed_at values into an objective time when queue_table_tz is set
      CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
      quote_literal(maximum_cutoff_time)
      )
      AS inner_shared_select_columns,
    $$
      fact_table_id,
      queue_table_dep_id,
      fact_table_dep_id,
      fact_table_dep_queue_table_dep_id,
      queue_table_id_field,
      fact_loader_batch_id,
      maximum_cutoff_time,
      min_missed_id
    $$
      AS outer_shared_select_columns,
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END
      AS changed_at_tz_correction
  FROM fact_loader.queue_deps_all c
  WHERE c.fact_table_id = p_fact_table_id
)

-- non_recursive_sql: assembles the per-dependency select lists, the queue-to-base
-- join snippet, and the WHERE clauses that bound each gathering query.
, non_recursive_sql AS (
  SELECT
  /****
  Separate select list for:
    - raw queue_ids from queue tables
    - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables
  */
  -- gathering all queue_ids from queue tables
    queue_table_dep_id,
    outer_shared_select_columns,
    format($$
      %s,
      %s %s AS changed_at,
      %s AS queue_table_id
    $$,
      inner_shared_select_columns,
      'q.'||quote_ident(queue_table_timestamp),
      changed_at_tz_correction,
      queue_table_id
      )
      AS inner_metadata_select_columns,
    format($$
      %s,
      queue_table_id
    $$,
      outer_shared_select_columns
      )
      AS outer_metadata_select_columns,
  -- gathering actual keys to update in fact tables by joining from queue_ids to source tables
    format($$
      %s,
      %s AS operation,
      %s %s AS changed_at,
      %s::REGPROC AS insert_merge_proid,
      %s::REGPROC AS update_merge_proid,
      %s::REGPROC AS delete_merge_proid
    $$,
      inner_shared_select_columns,
      'q.'||quote_ident(queue_table_op),
      'q.'||quote_ident(queue_table_timestamp),
      changed_at_tz_correction,
      CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END,
      CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END,
      CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END
      )
      AS inner_data_select_columns,
    format($$
      %s,
      operation,
      changed_at,
      insert_merge_proid,
      update_merge_proid,
      delete_merge_proid,
      key,
      source_change_date
    $$,
      outer_shared_select_columns
      )
      AS outer_data_select_columns,
    -- This is simply the queue table aliased as q
    format('%s q', queue_table_relid::TEXT) AS queue_table_aliased,
    -- This is the SQL to join from the queue table to the base table
    format($$
      INNER JOIN %s b
        ON q.%s::%s = b.%s
    $$,
      queue_of_base_table_relid::TEXT,
      quote_ident(queue_table_key),
      queue_of_base_table_key_type,
      quote_ident(queue_of_base_table_key))
      AS base_join_sql,
    -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process.
    -- There is a further filter based on the window min_missed_id after this subquery
    format($$ %s
    $$,
      CASE
        WHEN last_cutoff_id IS NOT NULL
        THEN 'q.fact_loader_batch_id > '||last_cutoff_id
        ELSE
          'TRUE'
      END)
      AS inner_global_where_sql,
    format($$
        %s < %s %s
        AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id))
    $$,
      quote_ident(c.queue_table_timestamp),
      quote_literal(c.maximum_cutoff_time),
      changed_at_tz_correction)
      AS outer_global_where_sql,
    format($$
      AND q.%s = 'I'
    $$,
      queue_table_op)
      AS where_for_insert_sql,
    format($$
      AND (q.%s = 'U' AND %s)
    $$,
      queue_table_op,
      CASE
        WHEN relevant_change_columns IS NULL
          THEN 'TRUE'
        ELSE
          format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,','))
      END)
      AS where_for_update_sql,
    format($$
      AND q.%s = 'D'
    $$,
      queue_table_op)
      AS where_for_delete_sql
  FROM field_vars c
)

-- Keep only the deepest level per queue_table_dep_id: the recursive builders above
-- accumulate the complete select/join SQL as level increases.
, insert_sql_builder_final AS
  (SELECT DISTINCT ON (queue_table_dep_id) *
  FROM insert_sql_builder
  ORDER BY queue_table_dep_id, level DESC
  )

, update_sql_builder_final AS
  (SELECT DISTINCT ON (queue_table_dep_id) *
  FROM update_sql_builder
  ORDER BY queue_table_dep_id, level DESC
  )

, delete_sql_builder_final AS
  (SELECT DISTINCT ON (queue_table_dep_id) *
  FROM delete_sql_builder
  ORDER BY queue_table_dep_id, level DESC
  )

-- all_queues_sql: one row per queue-table dependency with the fully assembled
-- INSERT / UPDATE / DELETE gathering SQL plus the raw queue-id SQL.
, all_queues_sql AS (
SELECT
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    %s
    WHERE %s ) sub
  WHERE %s
  $$,
    nrs.outer_data_select_columns,
    nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select,
    nrs.queue_table_aliased||nrs.base_join_sql,
    isbf.key_retrieval_sql,
    nrs.inner_global_where_sql||nrs.where_for_insert_sql,
    nrs.outer_global_where_sql) AS queue_insert_sql,
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    %s
    WHERE %s ) sub
  WHERE %s
  $$,
    nrs.outer_data_select_columns,
    nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select,
    nrs.queue_table_aliased||nrs.base_join_sql,
    usbf.key_retrieval_sql,
    nrs.inner_global_where_sql||nrs.where_for_update_sql,
    nrs.outer_global_where_sql) AS queue_update_sql,
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    %s
    WHERE %s ) sub
  WHERE %s
  $$,
    nrs.outer_data_select_columns,
    -- NOTE(review): source_change_date_select here comes from the update builder
    -- (usbf), not dsbf - confirm whether that is intentional (they may be identical).
    nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select,
    nrs.queue_table_aliased,
    dsbf.delete_key_retrieval_sql,
    nrs.inner_global_where_sql||nrs.where_for_delete_sql,
    nrs.outer_global_where_sql) AS queue_delete_sql,
  format($$
  SELECT %s
  FROM (
    SELECT %s
    FROM %s
    WHERE %s ) sub
  WHERE %s
  $$,
    nrs.outer_metadata_select_columns,
    nrs.inner_metadata_select_columns,
    nrs.queue_table_aliased,
    nrs.inner_global_where_sql,
    nrs.outer_global_where_sql) AS queue_ids_sql
FROM non_recursive_sql nrs
  INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id
  INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id
  INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id
)

, final_queue_sql AS
(SELECT string_agg(
  /****
  This first UNION is to union together INSERT, UPDATE,
and DELETE events for a single queue table
  */
  format($$
  %s
  UNION ALL
  %s
  UNION ALL
  %s
  $$,
    queue_insert_sql,
    queue_update_sql,
    queue_delete_sql)
  /****
  This second UNION as the second arg of string_agg
  is the union together ALL queue tables for this fact table
  */
  , E'\nUNION ALL\n') AS event_sql,
  string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out
FROM all_queues_sql)

-- final_outputs: wrap the aggregated SQL into the four statements returned by
-- sql_builder (raw ids, gathered changes, process_queue population, metadata update).
, final_outputs AS (
SELECT raw_queued_changes_sql_out,
$$
WITH all_changes AS (
($$||event_sql||$$)
ORDER BY changed_at)

, base_execution_groups AS
(SELECT fact_table_id,
  queue_table_dep_id,
  queue_table_id_field,
  operation,
  changed_at,
  source_change_date,
  insert_merge_proid,
  update_merge_proid,
  delete_merge_proid,
  maximum_cutoff_time,
  key,
  CASE WHEN operation = 'I' THEN insert_merge_proid
  WHEN operation = 'U' THEN update_merge_proid
  WHEN operation = 'D' THEN delete_merge_proid
  END AS proid,
  RANK() OVER (
    PARTITION BY
    CASE
      WHEN operation = 'I' THEN insert_merge_proid
      WHEN operation = 'U' THEN update_merge_proid
      WHEN operation = 'D' THEN delete_merge_proid
    END
  ) AS execution_group
FROM all_changes
WHERE key IS NOT NULL)

SELECT fact_table_id, proid, key, source_change_date
FROM base_execution_groups beg
WHERE proid IS NOT NULL
GROUP BY execution_group, fact_table_id, proid, key, source_change_date
/****
This ordering is particularly important for date-range history tables
where order of inserts is critical and usually expected to follow a pattern
***/
ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field);
$$ AS gathered_queued_changes_sql_out
,
$$
DROP TABLE IF EXISTS process_queue;
CREATE TEMP TABLE process_queue
(process_queue_id serial,
 fact_table_id int,
 proid regproc,
 key_value text,
 source_change_date date);
INSERT INTO process_queue
(fact_table_id, proid, key_value, source_change_date)
$$ AS process_queue_snippet,
$$
WITH all_ids AS
($$||raw_queued_changes_sql_out||$$)

, new_metadata AS
(SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
  maximum_cutoff_time,
  queue_table_dep_id
FROM all_ids
--Exclude dependent fact tables from updates directly to queue_table_deps
WHERE fact_table_dep_id IS NULL
GROUP BY queue_table_dep_id, maximum_cutoff_time)

/****
The dependent fact table uses the same queue_table_id_field as last_cutoff
We are going to update fact_table_deps metadata instead of queue_table_deps
****/
, new_metadata_fact_dep AS
(SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
  maximum_cutoff_time,
  fact_table_dep_queue_table_dep_id
FROM all_ids
--Include dependent fact tables only
WHERE fact_table_dep_id IS NOT NULL
GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time)

, update_key AS (
SELECT qdwr.queue_table_dep_id,
  --Cutoff the id to that newly found, otherwise default to last value
  COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
  --This cutoff time must always be the same for all queue tables for given fact table.
  --Even if there are no new records, we move this forward to wherever the stream is at
  qdwr.maximum_cutoff_time AS last_cutoff_source_time
FROM fact_loader.queue_deps_all qdwr
LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id
WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
  --Exclude dependent fact tables from updates directly to queue_table_deps
  AND qdwr.fact_table_dep_id IS NULL
)

/****
This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up
****/
, update_key_fact_dep AS (
SELECT qdwr.fact_table_dep_queue_table_dep_id,
  qdwr.fact_table_id,
  COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
  qdwr.maximum_cutoff_time AS last_cutoff_source_time
FROM fact_loader.queue_deps_all qdwr
LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id
WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
  --Include dependent fact tables only
  AND qdwr.fact_table_dep_id IS NOT NULL
)

, updated_queue_table_deps AS (
UPDATE fact_loader.queue_table_deps qtd
SET last_cutoff_id = uk.last_cutoff_id,
  last_cutoff_source_time = uk.last_cutoff_source_time
FROM update_key uk
WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id
RETURNING qtd.*)

, updated_fact_table_deps AS (
UPDATE fact_loader.fact_table_dep_queue_table_deps ftd
SET last_cutoff_id = uk.last_cutoff_id,
  last_cutoff_source_time = uk.last_cutoff_source_time
FROM update_key_fact_dep uk
WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id
RETURNING uk.*)

UPDATE fact_loader.fact_tables ft
SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time,
  last_refresh_attempted_at = now(),
  last_refresh_succeeded = TRUE
FROM
(SELECT fact_table_id, last_cutoff_source_time
FROM updated_queue_table_deps
--Must use UNION to get only distinct values
UNION
SELECT fact_table_id, last_cutoff_source_time
FROM updated_fact_table_deps) uqtd
WHERE uqtd.fact_table_id = ft.fact_table_id;
$$ AS metadata_update_sql_out
FROM final_queue_sql)

SELECT raw_queued_changes_sql_out,
 gathered_queued_changes_sql_out
 ,
 format($$
 %s
 %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out,
 metadata_update_sql_out
FROM final_outputs;

$BODY$
LANGUAGE SQL;

-- raw_queued_changes: execute just the raw queued-change id SQL from sql_builder
-- for one fact table; useful for inspecting pending batches.
CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT,
    queue_table_dep_id INT,
    fact_table_dep_id INT,
    fact_table_dep_queue_table_dep_id INT,
    queue_table_id_field BIGINT,
    fact_loader_batch_id BIGINT,
    maximum_cutoff_time TIMESTAMPTZ,
    min_missed_id BIGINT,
    queue_table_id INT
) AS
$BODY$
DECLARE
    v_raw_sql text;   -- generated SQL to be executed dynamically
BEGIN

SELECT raw_queued_changes_sql
INTO v_raw_sql
FROM fact_loader.sql_builder(p_fact_table_id);

RETURN QUERY EXECUTE v_raw_sql;

END;
$BODY$
LANGUAGE plpgsql;

COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.';

/* pg_fact_loader--1.4--1.5.sql */
-- complain if script is sourced in
psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

/****
fact_loader.sql_builder(p_fact_table_id INT)

Builds — as text, without executing anything — all of the SQL needed to process
one fact table's queued changes:
  - raw_queued_changes_sql:      selects raw queue rows (batch ids + cutoff metadata)
  - gathered_queued_changes_sql: joins queue rows to source tables to gather the
                                 fact keys (and optional source_change_date) to refresh
  - process_queue_sql:           the gathered-changes SQL wrapped in a temp
                                 process_queue table load
  - metadata_update_sql:         advances last_cutoff_id / last_cutoff_source_time
                                 bookkeeping in queue_table_deps / fact_table_dep_queue_table_deps
The caller is responsible for executing the returned SQL strings.
 */
CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
              gathered_queued_changes_sql text,
              process_queue_sql text,
              metadata_update_sql text) AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.
In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null
Otherwise, they must be specified separately.
 */
WITH RECURSIVE queue_deps_with_insert_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'I' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_update_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'U' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_delete_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'D' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

/****
Recursively build the SQL for any INSERT events found in the queues.

The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data,
in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id.
For an example of this, see the test cases involving the test.order_product_promos table.
 */
, insert_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE WHEN is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', b.'||return_columns[1]||'::TEXT AS key'
        ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
        END
    WHEN join_return_is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
        ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
        END
    ELSE ''
    END AS key_select_column,
    CASE WHEN is_fact_key
      THEN ''
    ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
         E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_insert_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE WHEN c.is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
        ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
        END
    WHEN join_return_is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
        ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
        END
    ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE WHEN is_fact_key
      THEN ''
    ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
         E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM insert_sql_builder r
  INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

, update_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE WHEN is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', b.'||return_columns[1]||'::TEXT AS key'
        ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
        END
    WHEN join_return_is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
        ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
        END
    ELSE ''
    END AS key_select_column,
    CASE WHEN is_fact_key
      THEN ''
    ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
         E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_update_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE WHEN c.is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
        ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
        END
    WHEN join_return_is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
        ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
        END
    ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE WHEN is_fact_key
      THEN ''
    ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
         E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM update_sql_builder r
  INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

, delete_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead
    CASE WHEN is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
        ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key'
        END
    WHEN join_return_is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
        ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key'
        END
    ELSE ''
    END AS delete_key_select_column,
    CASE WHEN is_fact_key
      THEN ''
    ELSE format($$
    --Join to either the base table, or the audit table, one of which
    --will be missing the key in a delete case
      INNER JOIN LATERAL (
      SELECT %s FROM %s jb WHERE %s = %s
      UNION ALL
      SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
      $$,
      quote_ident(return_columns_from_join[1]),
      join_to_relation::TEXT,
      (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
      'jb.'||quote_ident(join_to_column),
      '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
      join_to_relation_queue::TEXT,
      (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
      '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
      /****
      We use the higher level here just to be consistent with aliases from insert/update key retrieval
       */
      'j'||level
      )
    END AS delete_key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_delete_retrieval
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    delete_key_select_column||CASE WHEN c.is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
        ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key'
        END
    WHEN join_return_is_fact_key
      THEN
        CASE WHEN array_length(return_columns, 1) = 1
          THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
        ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
        END
    ELSE ''
    END AS delete_key_select_column,
    delete_key_retrieval_sql||CASE WHEN is_fact_key
      THEN ''
    ELSE format($$
    --Join to either the base table, or the audit table, one of which
    --will be missing the key in a delete case
      INNER JOIN LATERAL (
      SELECT %s FROM %s jb WHERE %s = %s
      UNION ALL
      SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
      $$,
      quote_ident(return_columns_from_join[1]),
      join_to_relation::TEXT,
      'j'||r.level||'.'||quote_ident(return_columns[1]),
      'jb.'||quote_ident(join_to_column),
      '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
      join_to_relation_queue::TEXT,
      'j'||r.level||'.'||quote_ident(return_columns[1]),
      '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
      /****
      We use the higher level here just to be consistent with aliases from insert/update key retrieval
       */
      'j'||c.level
      )
    END AS delete_key_retrieval_sql,
    r.source_change_date_select
  FROM delete_sql_builder r
  INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

, field_vars AS (
  SELECT *,
    format($$
      %s AS fact_table_id,
      %s AS queue_table_dep_id,
      %s::INT AS fact_table_dep_id,
      %s::INT AS fact_table_dep_queue_table_dep_id,
      %s AS queue_table_id_field,
      q.fact_loader_batch_id,
      %s::TIMESTAMPTZ AS maximum_cutoff_time,
      -- We must not ignore ids which are above maximum_cutoff_time
      -- but below the highest id which is below maximum_cutoff_time
      MIN(q.fact_loader_batch_id) FILTER (
      WHERE %s %s > %s::TIMESTAMPTZ)
      OVER() AS min_missed_id
    $$,
    fact_table_id,
    queue_table_dep_id,
    (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END),
    (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END),
    'q.'||quote_ident(queue_table_id_field),
    quote_literal(maximum_cutoff_time),
    'q.'||quote_ident(queue_table_timestamp),
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
    quote_literal(maximum_cutoff_time)
    )
      AS inner_shared_select_columns,
    $$
      fact_table_id,
      queue_table_dep_id,
      fact_table_dep_id,
      fact_table_dep_queue_table_dep_id,
      queue_table_id_field,
      fact_loader_batch_id,
      maximum_cutoff_time,
      min_missed_id
    $$
      AS outer_shared_select_columns,
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END
      AS changed_at_tz_correction
  FROM fact_loader.queue_deps_all c
  WHERE c.fact_table_id = p_fact_table_id
)

, non_recursive_sql AS (
  SELECT
  /****
  Separate select list for:
    - raw queue_ids from queue tables
    - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables
   */
  -- gathering all queue_ids from queue tables
    queue_table_dep_id,
    outer_shared_select_columns,
    format($$
      %s,
      %s %s AS changed_at,
      %s AS queue_table_id
    $$,
    inner_shared_select_columns,
    'q.'||quote_ident(queue_table_timestamp),
    changed_at_tz_correction,
    queue_table_id
    )
      AS inner_metadata_select_columns,
    format($$
      %s,
      queue_table_id
    $$,
    outer_shared_select_columns
    )
      AS outer_metadata_select_columns,
  -- gathering actual keys to update in fact tables by joining from queue_ids to source tables
    format($$
      %s,
      %s AS operation,
      %s %s AS changed_at,
      %s::REGPROC AS insert_merge_proid,
      %s::REGPROC AS update_merge_proid,
      %s::REGPROC AS delete_merge_proid
    $$,
    inner_shared_select_columns,
    'q.'||quote_ident(queue_table_op),
    'q.'||quote_ident(queue_table_timestamp),
    changed_at_tz_correction,
    CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END,
    CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END,
    CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END
    )
      AS inner_data_select_columns,
    format($$
      %s,
      operation,
      changed_at,
      insert_merge_proid,
      update_merge_proid,
      delete_merge_proid,
      key,
      source_change_date
    $$,
    outer_shared_select_columns
    )
      AS outer_data_select_columns,
  -- This is simply the queue table aliased as q
    format('%s q', queue_table_relid::TEXT) AS queue_table_aliased,
  -- This is the SQL to join from the queue table to the base table
    format($$
      INNER JOIN %s b
        ON q.%s::%s = b.%s
    $$,
    queue_of_base_table_relid::TEXT,
    quote_ident(queue_table_key),
    queue_of_base_table_key_type,
    quote_ident(queue_of_base_table_key))
      AS base_join_sql,
  -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process.
  -- There is a further filter based on the window min_missed_id after this subquery
    format($$ %s
    $$,
    CASE
      WHEN last_cutoff_id IS NOT NULL
        THEN 'q.fact_loader_batch_id > '||last_cutoff_id
      ELSE
        'TRUE'
      END)
      AS inner_global_where_sql,
    format($$
      -- changed_at is guaranteed now to be in timestamptz - any time zone casting is only in subquery
      changed_at < %s
      AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id))
    $$,
    quote_literal(c.maximum_cutoff_time)
    )
      AS outer_global_where_sql,
    format($$
      AND q.%s = 'I'
    $$,
    queue_table_op)
      AS where_for_insert_sql,
    format($$
      AND (q.%s = 'U' AND %s)
    $$,
    queue_table_op,
    CASE
      WHEN relevant_change_columns IS NULL
        THEN 'TRUE'
      ELSE
        format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,','))
      END)
      AS where_for_update_sql,
    format($$
      AND q.%s = 'D'
    $$,
    queue_table_op)
      AS where_for_delete_sql
  FROM field_vars c
)

, insert_sql_builder_final AS
  (SELECT DISTINCT ON (queue_table_dep_id) *
  FROM insert_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

, update_sql_builder_final AS
  (SELECT DISTINCT ON (queue_table_dep_id) *
  FROM update_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

, delete_sql_builder_final AS
  (SELECT DISTINCT ON (queue_table_dep_id) *
  FROM delete_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

, all_queues_sql AS (
  SELECT
    format($$
      SELECT %s FROM (
      SELECT %s FROM %s
      %s
      WHERE %s ) sub
      WHERE %s
    $$,
    nrs.outer_data_select_columns,
    nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select,
    nrs.queue_table_aliased||nrs.base_join_sql,
    isbf.key_retrieval_sql,
    nrs.inner_global_where_sql||nrs.where_for_insert_sql,
    nrs.outer_global_where_sql) AS queue_insert_sql,
    format($$
      SELECT %s FROM (
      SELECT %s FROM %s
      %s
      WHERE %s ) sub
      WHERE %s
    $$,
    nrs.outer_data_select_columns,
    nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select,
    nrs.queue_table_aliased||nrs.base_join_sql,
    usbf.key_retrieval_sql,
    nrs.inner_global_where_sql||nrs.where_for_update_sql,
    nrs.outer_global_where_sql) AS queue_update_sql,
    format($$
      SELECT %s FROM (
      SELECT %s FROM %s
      %s
      WHERE %s ) sub
      WHERE %s
    $$,
    nrs.outer_data_select_columns,
    nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select,
    nrs.queue_table_aliased,
    dsbf.delete_key_retrieval_sql,
    nrs.inner_global_where_sql||nrs.where_for_delete_sql,
    nrs.outer_global_where_sql) AS queue_delete_sql,
    format($$
      SELECT %s FROM (
      SELECT %s FROM %s
      WHERE %s ) sub
      WHERE %s
    $$,
    nrs.outer_metadata_select_columns,
    nrs.inner_metadata_select_columns,
    nrs.queue_table_aliased,
    nrs.inner_global_where_sql,
    nrs.outer_global_where_sql) AS queue_ids_sql
  FROM non_recursive_sql nrs
  INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id
  INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id
  INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id
)

, final_queue_sql AS
  (SELECT string_agg(
  /****
  This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table
   */
    format($$
    %s
    UNION ALL
    %s
    UNION ALL
    %s
    $$,
    queue_insert_sql,
    queue_update_sql,
    queue_delete_sql)
  /****
  This second UNION as the second arg of string_agg
  is the union together ALL queue tables for this fact table
   */
    , E'\nUNION ALL\n') AS event_sql,
    string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out
  FROM all_queues_sql)

, final_outputs AS (
  SELECT raw_queued_changes_sql_out,
  $$
  WITH all_changes AS (
  ($$||event_sql||$$)
  ORDER BY changed_at)

  , base_execution_groups AS
  (SELECT fact_table_id,
    queue_table_dep_id,
    queue_table_id_field,
    operation,
    changed_at,
    source_change_date,
    insert_merge_proid,
    update_merge_proid,
    delete_merge_proid,
    maximum_cutoff_time,
    key,
    CASE WHEN operation = 'I' THEN insert_merge_proid
    WHEN operation = 'U' THEN update_merge_proid
    WHEN operation = 'D' THEN delete_merge_proid
    END AS proid,
    RANK() OVER (
      PARTITION BY
      CASE
        WHEN operation = 'I' THEN insert_merge_proid
        WHEN operation = 'U' THEN update_merge_proid
        WHEN operation = 'D' THEN delete_merge_proid
      END
    ) AS execution_group
  FROM all_changes
  WHERE key IS NOT NULL)

  SELECT fact_table_id, proid, key, source_change_date
  FROM base_execution_groups beg
  WHERE proid IS NOT NULL
  GROUP BY execution_group, fact_table_id, proid, key, source_change_date
  /****
  This ordering is particularly important for date-range history tables
  where order of inserts is critical and usually expected to follow a pattern
  ***/
  ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field);
  $$ AS gathered_queued_changes_sql_out
  ,

  $$
  DROP TABLE IF EXISTS process_queue;
  CREATE TEMP TABLE process_queue
  (process_queue_id serial,
  fact_table_id int,
  proid regproc,
  key_value text,
  source_change_date date);
  INSERT INTO process_queue
  (fact_table_id, proid, key_value, source_change_date)
  $$ AS process_queue_snippet,

  $$
  WITH all_ids AS
  ($$||raw_queued_changes_sql_out||$$)

  , new_metadata AS
  (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
    maximum_cutoff_time,
    queue_table_dep_id
  FROM all_ids
  --Exclude dependent fact tables from updates directly to queue_table_deps
  WHERE fact_table_dep_id IS NULL
  GROUP BY queue_table_dep_id, maximum_cutoff_time)

  /****
  The dependent fact table uses the same queue_table_id_field as last_cutoff
  We are going to update fact_table_deps metadata instead of queue_table_deps
  ****/
  , new_metadata_fact_dep AS
  (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
    maximum_cutoff_time,
    fact_table_dep_queue_table_dep_id
  FROM all_ids
  --Include dependent fact tables only
  WHERE fact_table_dep_id IS NOT NULL
  GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time)

  , update_key AS (
  SELECT qdwr.queue_table_dep_id,
    --Cutoff the id to that newly found, otherwise default to last value
    COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
    --This cutoff time must always be the same for all queue tables for given fact table.
    --Even if there are no new records, we move this forward to wherever the stream is at
    qdwr.maximum_cutoff_time AS last_cutoff_source_time
  FROM fact_loader.queue_deps_all qdwr
  LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id
  WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
    --Exclude dependent fact tables from updates directly to queue_table_deps
    AND qdwr.fact_table_dep_id IS NULL
  )

  /****
  This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up
  ****/
  , update_key_fact_dep AS (
  SELECT qdwr.fact_table_dep_queue_table_dep_id,
    qdwr.fact_table_id,
    COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
    qdwr.maximum_cutoff_time AS last_cutoff_source_time
  FROM fact_loader.queue_deps_all qdwr
  LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id
  WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
    --Include dependent fact tables only
    AND qdwr.fact_table_dep_id IS NOT NULL
  )

  , updated_queue_table_deps AS (
  UPDATE fact_loader.queue_table_deps qtd
  SET last_cutoff_id = uk.last_cutoff_id,
    last_cutoff_source_time = uk.last_cutoff_source_time
  FROM update_key uk
  WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id
  RETURNING qtd.*)

  , updated_fact_table_deps AS (
  UPDATE fact_loader.fact_table_dep_queue_table_deps ftd
  SET last_cutoff_id = uk.last_cutoff_id,
    last_cutoff_source_time = uk.last_cutoff_source_time
  FROM update_key_fact_dep uk
  WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id
  RETURNING uk.*)

  UPDATE fact_loader.fact_tables ft
  SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time,
    last_refresh_attempted_at = now(),
    last_refresh_succeeded = TRUE
  FROM
  (SELECT fact_table_id, last_cutoff_source_time
  FROM updated_queue_table_deps
  --Must use UNION to get only distinct values
  UNION
  SELECT fact_table_id, last_cutoff_source_time
  FROM updated_fact_table_deps) uqtd
  WHERE uqtd.fact_table_id = ft.fact_table_id;
  $$ AS metadata_update_sql_out
  FROM final_queue_sql)

SELECT raw_queued_changes_sql_out,
  gathered_queued_changes_sql_out
  ,
  format($$
  %s
  %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out,
  metadata_update_sql_out
FROM final_outputs;

$BODY$
LANGUAGE SQL;
/* pg_fact_loader--1.5--1.6.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit

-- pg_fact_loader 1.5 -> 1.6 migration body: drop the dependent views, convert
-- every *_proid column from REGPROC to TEXT, then recreate the views.
DROP VIEW fact_loader.queue_deps_all_with_retrieval;
DROP VIEW fact_loader.queue_deps_all;
DROP VIEW fact_loader.prioritized_jobs;

-- Must ensure we have the fully schema-qualified regprod before converting to text
SET search_path TO '';

-- Each paired CHECK constraint below forces the stored TEXT to still cast back
-- to an existing function (::REGPROC raises if it does not resolve); 'boolin'
-- is only a non-null COALESCE fallback so NULL column values remain allowed.
ALTER TABLE fact_loader.debug_process_queue ALTER COLUMN proid TYPE TEXT;
ALTER TABLE fact_loader.debug_process_queue ADD CONSTRAINT check_proid CHECK (COALESCE(proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_delete_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_delete_merge_proid CHECK (COALESCE(default_delete_merge_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_insert_merge_proid CHECK (COALESCE(default_insert_merge_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_update_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_update_merge_proid CHECK (COALESCE(default_update_merge_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.fact_tables ALTER COLUMN daily_scheduled_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_daily_scheduled_proid CHECK (COALESCE(daily_scheduled_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.fact_tables ALTER COLUMN fact_table_agg_proid TYPE TEXT;
ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_fact_table_agg_proid CHECK (COALESCE(fact_table_agg_proid::REGPROC, 'boolin') IS NOT NULL);

ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL);
ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT;
ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL);

RESET search_path;

-- prioritized_jobs: the ordered work list for workers — enabled jobs, with
-- daily-scheduled jobs admitted only once eligible per the logic below.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
WITH jobs_with_daily_variables AS (
SELECT
   ft.*,
/***
Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
 */
  (--If this is the first run of a scheduled job, it is eligible
    ft.last_refresh_attempted_at IS NULL
  OR (
     --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible
      (
       ft.last_refresh_succeeded AND
       ft.last_refresh_attempted_at::DATE <
        -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent
        (now() AT TIME ZONE COALESCE(
                              ft.daily_scheduled_tz,
                              base.daily_scheduled_tz
                              )
        )::DATE
      )
      OR
     --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time
      NOT ft.last_refresh_succeeded
     )
  ) AS daily_not_attempted_today,

  (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME
    BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed,

  base.use_daily_schedule
    AND base.last_refresh_succeeded
    AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE
    AS daily_base_job_finished,

  ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent,

  -- This should only be used in combination with daily_has_only_one_parent
  parent.use_daily_schedule
    AND parent.last_refresh_succeeded
    AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE
    AS parent_job_finished
FROM fact_loader.fact_tables ft
LEFT JOIN LATERAL
  (SELECT ftb.use_daily_schedule,
    ftb.last_refresh_succeeded,
    ftb.last_refresh_attempted_at,
    ftb.daily_scheduled_tz
  FROM fact_loader.fact_tables ftb
  WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE
LEFT JOIN LATERAL
  (SELECT ftp.use_daily_schedule,
    ftp.last_refresh_succeeded,
    ftp.last_refresh_attempted_at,
    ftp.daily_scheduled_tz
  FROM fact_loader.fact_tables ftp
  WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE
WHERE enabled
)

, jobs_with_daily_schedule_eligibility AS (
SELECT
   *,
   --Only run this job according to the same day of the daily_scheduled_time
   --according to configured timezone
   (use_daily_schedule AND daily_not_attempted_today
     AND
    (
      daily_scheduled_time_passed
      OR
     (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished))
    )
   ) AS daily_schedule_eligible
FROM jobs_with_daily_variables)

SELECT *
FROM jobs_with_daily_schedule_eligibility
WHERE NOT use_daily_schedule OR daily_schedule_eligible
ORDER BY
  CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
  --If a job has a daily schedule, once the time has come for the next refresh,
  --prioritize it first
  CASE
    WHEN daily_schedule_eligible
    THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME
    ELSE NULL
  END NULLS LAST,
  --This may be improved in the future but is a good start
  last_refresh_attempted_at NULLS FIRST,
  priority
;

-- queue_deps_all: one row per (fact table, queue table dependency), including
-- dependencies inherited from parent fact tables, with the most conservative
-- maximum_cutoff_time a refresh may process up to.
CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS
WITH RECURSIVE fact_table_dep_cutoffs AS
(SELECT
    1 AS level
    , qtd.queue_table_dep_id
    , ftdqc.fact_table_dep_id
    , ftdqc.fact_table_dep_queue_table_dep_id
    --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
    --fact table has been updated
    , qtd.last_cutoff_id AS dep_maximum_cutoff_id
    , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time
    , ftd.parent_id AS parent_fact_table_id
    , ftd.child_id AS child_fact_table_id
    , ftd.child_id AS base_fact_table_id
    , queue_table_id
    , relevant_change_columns
    , ftdqc.last_cutoff_id
    , ftdqc.last_cutoff_source_time
    , ftdqc.insert_merge_proid
    , ftdqc.update_merge_proid
    , ftdqc.delete_merge_proid
  FROM fact_loader.queue_table_deps qtd
  INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
  INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id
  UNION ALL
  /****
  In this recursive part, we walk UP the chain to the base level in order to get the
  last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those.

  The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs.
  That means we can get our resultant data below by simply selecting distinct ON the right fields and order
  by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst
  the queue tables and any PARENT fact table cutoffs.

  That means if, for example,
    - IF a queue table has been cutoff up until 11:00:00
    - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00
    - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed.
  */
  SELECT
    ftdc.level + 1 AS level
    , ftdc.queue_table_dep_id
    , ftdc.fact_table_dep_id
    , ftdc.fact_table_dep_queue_table_dep_id
    --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
    --fact table has been updated
    , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id
    , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time
    , ftd.parent_id AS parent_fact_table_id
    , ftd.child_id AS child_fact_table_id
    , ftdc.base_fact_table_id
    , ftdc.queue_table_id
    , ftdc.relevant_change_columns
    , ftdc.last_cutoff_id
    , ftdc.last_cutoff_source_time
    , ftdc.insert_merge_proid
    , ftdc.update_merge_proid
    , ftdc.delete_merge_proid
  FROM fact_loader.queue_table_deps qtd
  INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
  INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id
  INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id
)

, adjusted_fact_table_deps AS (
/****
The reason we look at distinct queue_table_dep_id and not simply queue_table_id
is because two parent fact tables could have differing logic for retrieving changes
for the same base queue_tables.
 */
SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id)
  *
FROM fact_table_dep_cutoffs
ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time
)

, queue_table_info AS (
SELECT * FROM fact_loader.queue_table_delay_info()
)

/****
For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent
fact table, and just reuse this exactly, with these distinctions:
  - From the fact_table_dep table, we do use the proids, and the last_cutoff_id
  - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent
  - We pass the information of which table for which to update metadata in the end
 */
, queue_table_deps_with_nested AS (
/****
This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables
 */
SELECT
  queue_table_dep_id
  , NULL :: INT AS fact_table_dep_id
  , NULL :: INT AS fact_table_dep_queue_table_dep_id
  , NULL :: BIGINT AS dep_maximum_cutoff_id
  , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time
  , fact_table_id
  , queue_table_id
  , relevant_change_columns
  , last_cutoff_id
  , last_cutoff_source_time
  , insert_merge_proid
  , update_merge_proid
  , delete_merge_proid
FROM fact_loader.queue_table_deps
UNION ALL
/****
This part of the union is for fact tables with other dependent fact tables
 */
SELECT
  queue_table_dep_id
  , fact_table_dep_id
  , fact_table_dep_queue_table_dep_id
  , aftd.dep_maximum_cutoff_id
  , aftd.dep_maximum_cutoff_time
  , base_fact_table_id AS fact_table_id
  , queue_table_id
  , relevant_change_columns
  , aftd.last_cutoff_id
  , aftd.last_cutoff_source_time
  , aftd.insert_merge_proid
  , aftd.update_merge_proid
  , aftd.delete_merge_proid
FROM adjusted_fact_table_deps aftd
)

SELECT
  ft.fact_table_id,
  ft.fact_table_relid,
  ft.fact_table_agg_proid,
  qt.queue_table_id,
  qt.queue_table_relid,
  qt.queue_of_base_table_relid,
  qtd.relevant_change_columns,
  qtd.last_cutoff_id,
  qtd.last_cutoff_source_time,
  rt.if_name AS provider_name,
  rt.replication_set_name,
  qtd.dep_maximum_cutoff_id,  --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter
  LEAST(
      MIN(qtd.dep_maximum_cutoff_time)
      OVER (
        PARTITION BY qtd.fact_table_id ),
      MIN(rt.source_time)
      OVER (
        PARTITION BY qtd.fact_table_id )
  ) AS maximum_cutoff_time,
  aqt.queue_table_id_field,
  'primary_key'::name AS queue_table_key,
  'operation'::name AS queue_table_op,
  'change'::name AS queue_table_change,
  'changed_at'::name AS queue_table_timestamp,
  qt.queue_table_tz,
  aqbt.queue_of_base_table_key,
  aqbt.queue_of_base_table_key_type,
  queue_table_dep_id,
  fact_table_dep_id,
  fact_table_dep_queue_table_dep_id,
  insert_merge_proid,
  update_merge_proid,
  delete_merge_proid,
  qt.purge
FROM queue_table_deps_with_nested qtd
INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id
INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id
INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid
-- Look up the base table's primary key column name/type from the catalogs
INNER JOIN LATERAL
  (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type
  FROM (SELECT
          i.indrelid
          , unnest(indkey) AS ik
          , row_number()
            OVER ()        AS rn
        FROM pg_index i
        WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk
  INNER JOIN pg_attribute a
    ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE
-- Look up the queue table's primary key column name from the catalogs
INNER JOIN LATERAL
  (SELECT a.attname AS queue_table_id_field
  FROM (SELECT
          i.indrelid
          , unnest(indkey) AS ik
          , row_number()
            OVER ()        AS rn
        FROM pg_index i
        WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk
  INNER JOIN pg_attribute a
    ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE
ORDER BY ft.fact_table_relid;

-- queue_deps_all_with_retrieval: queue_deps_all joined to key_retrieval_sequences,
-- adding the column types and the shared source_change_date select snippet.
CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS
SELECT
  qtd.*,
  krs.filter_scope,
  krs.level,
  krs.return_columns, --we need not get the type separately. It must match queue_of_base_table_key_type
  krs.is_fact_key,
  krs.join_to_relation,
  qtk.queue_table_relid AS join_to_relation_queue,
  krs.join_to_column,
  ctypes.join_column_type,
  krs.return_columns_from_join,
  ctypes.return_columns_from_join_type,
  krs.join_return_is_fact_key,
  /***
  We include this in this view def to be easily shared by all events (I, U, D) in sql_builder,
  as those may be different in terms of passing source_change_date.
   */
  format(', %s::DATE AS source_change_date',
    CASE
      WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL
      /***
      For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time).
      Then, we cast it to the timezone of interest on which the date should be based.
      For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time.
      Thus, any date-based fact table must decide in what time zone to consider the date.
       */
      THEN format('(%s %s AT TIME ZONE %s)',
        'q.'||quote_ident(qtd.queue_table_timestamp),
        CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END,
        quote_literal(krs.pass_queue_table_change_date_at_tz))
      ELSE 'NULL'
    END) AS source_change_date_select
FROM fact_loader.queue_deps_all qtd
INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id
LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation
LEFT JOIN LATERAL
  (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type,
          MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type
  FROM pg_attribute a
  WHERE a.attrelid IN(krs.join_to_relation)
  /****
  We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type.
  Undue complexity would ensue if we did away with that rule.
   */
  AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE;
-- NOTE(review): the following line is tar archive member-header residue from the
-- source tarball rendering (start of pg_fact_loader--1.7--2.0.sql), not SQL.
pg_fact_loader-2.0.1/pg_fact_loader--1.7--2.0.sql000066400000000000000000000363401451107006500210540ustar00rootroot00000000000000/* pg_fact_loader--1.7--2.0.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

-- 1.7 -> 2.0: drop the worker-launching and subscription helpers so they can be
-- recreated with native logical replication support alongside pglogical.
DROP VIEW fact_loader.queue_deps_all_with_retrieval;
DROP VIEW fact_loader.queue_deps_all;
DROP FUNCTION fact_loader.safely_terminate_workers();
DROP FUNCTION fact_loader.launch_workers(int);
DROP FUNCTION fact_loader.launch_worker();
DROP FUNCTION fact_loader._launch_worker(oid);
DROP FUNCTION fact_loader.queue_table_delay_info();
DROP FUNCTION fact_loader.logical_subscription();

-- Which replication technology feeds a given subscription.
CREATE TYPE fact_loader.driver AS ENUM ('pglogical', 'native');

/***
This function exists mostly to easily mock out for testing purposes.
 */
CREATE FUNCTION fact_loader.subscription()
RETURNS TABLE (oid OID, subpublications text[], subconninfo text)
AS $BODY$
BEGIN

RETURN QUERY
SELECT s.oid, s.subpublications, s.subconninfo FROM pg_subscription s;

END;
$BODY$
LANGUAGE plpgsql;

/***
This function exists mostly to easily mock out for testing purposes.
 */
CREATE FUNCTION fact_loader.subscription_rel()
RETURNS TABLE (srsubid OID, srrelid OID)
AS $BODY$
BEGIN

RETURN QUERY
SELECT sr.srsubid, sr.srrelid FROM pg_subscription_rel sr;

END;
$BODY$
LANGUAGE plpgsql;

/***
This function exists mostly to easily mock out for testing purposes.
*/ CREATE FUNCTION fact_loader.logical_subscription() RETURNS TABLE (subid OID, subpublications text[], subconninfo text, dbname text, driver fact_loader.driver) AS $BODY$ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN RETURN QUERY EXECUTE $$ SELECT sub_origin_if AS subid, sub_replication_sets AS subpublications, null::text AS subconninfo, null::text AS dbname, 'pglogical'::fact_loader.driver AS driver FROM pglogical.subscription UNION ALL SELECT oid, subpublications, subconninfo, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription(); $$; ELSE RETURN QUERY SELECT oid, subpublications, subconninfo, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription(); END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("publication_name" text, "queue_of_base_table_relid" regclass, "publisher" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. 
If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ -- pglogical SELECT unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , n.if_name AS publisher , t.source_time FROM fact_loader.queue_tables qt JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.subid AND s.driver = 'pglogical' JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name UNION ALL -- native logical SELECT unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , t.db AS publisher , t.tick_time AS source_time FROM fact_loader.queue_tables qt JOIN fact_loader.subscription_rel() psr ON psr.srrelid = qt.queue_table_relid JOIN fact_loader.logical_subscription() s ON psr.srsubid = s.subid JOIN logical_ticker.tick t ON t.db = s.dbname UNION ALL -- local SELECT NULL::text AS publication_name , qt.queue_of_base_table_relid , NULL::name AS publisher , now() AS source_time FROM fact_loader.queue_tables qt WHERE qt.pglogical_node_if_id IS NULL AND NOT EXISTS ( SELECT 1 FROM fact_loader.subscription_rel() psr WHERE psr.srrelid = qt.queue_table_relid );$$; ELSE RETURN QUERY -- local SELECT NULL::TEXT AS publication_name , qt.queue_of_base_table_relid , NULL::NAME AS publisher --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt WHERE NOT EXISTS (SELECT 1 FROM fact_loader.subscription_rel() psr WHERE psr.srrelid = qt.queue_table_relid) UNION ALL -- native logical (WITH logical_subscription_with_db AS ( SELECT *, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS db FROM fact_loader.logical_subscription() ) SELECT 
unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , t.db AS publisher , t.tick_time AS source_time FROM fact_loader.queue_tables qt JOIN fact_loader.subscription_rel() psr ON psr.srrelid = qt.queue_table_relid JOIN logical_subscription_with_db s ON psr.srsubid = s.subid JOIN logical_ticker.tick t ON t.db = s.db); END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. 
That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.publisher AS provider_name, rt.publication_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; pg_fact_loader-2.0.1/pg_fact_loader--1.7.sql000066400000000000000000007067511451107006500205140ustar00rootroot00000000000000/* pg_fact_loader--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE FUNCTION fact_loader._launch_worker(oid) RETURNS pg_catalog.INT4 STRICT AS 'MODULE_PATHNAME', 'pg_fact_loader_worker' LANGUAGE C; CREATE FUNCTION fact_loader.launch_worker() RETURNS pg_catalog.INT4 STRICT AS 'SELECT fact_loader._launch_worker(oid) FROM pg_database WHERE datname = current_database();' LANGUAGE SQL; CREATE TABLE fact_loader.fact_tables ( fact_table_id SERIAL PRIMARY KEY, fact_table_relid REGCLASS NOT NULL, fact_table_agg_proid REGPROC NULL, --This may only be used to generate a merge function but is not used in automation enabled BOOLEAN NOT NULL DEFAULT FALSE, priority INT, attempt_number INT, retries_allowed INT DEFAULT 0, force_worker_priority BOOLEAN NOT NULL DEFAULT FALSE, last_refresh_source_cutoff TIMESTAMPTZ, last_refresh_attempted_at TIMESTAMPTZ, --TODO - answer if we want the worker to bail or record messages on ERROR (or both) last_refresh_succeeded BOOLEAN, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_tables UNIQUE (fact_table_relid) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables', ''); CREATE TABLE fact_loader.fact_table_deps ( fact_table_dep_id SERIAL PRIMARY KEY, parent_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), child_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), /***** In very many cases, you will use the same procs for insert, update, and delete even with multiple dependencies. 
This is why you must give defaults here which will be used to auto-populate fact_loader.fact_table_dep_queue_table_deps which can be overridden if necessary for each queue table. After you configure all of your fact tables and queue tables, run the function refresh_fact_table_dep_queue_table_deps manually to populate fact_table_dep_queue_table_deps, then make any changes as necessary. You can see an example of this in the test suite "seeds" file. You can also see an override example with order_emails_fact having a different proc for orders and reorders delete cases. */ default_insert_merge_proid REGPROC NOT NULL, default_update_merge_proid REGPROC NOT NULL, default_delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_deps UNIQUE (parent_id, child_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps', ''); CREATE TABLE fact_loader.queue_tables ( queue_table_id SERIAL PRIMARY KEY, queue_table_relid REGCLASS NOT NULL, queue_of_base_table_relid REGCLASS NOT NULL, /**** NOTE - the reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information. */ pglogical_node_if_id INT NOT NULL, --This is the timezone for the changed_at column - if null, we assume it is timestamptz (we could check that actually) queue_table_tz TEXT, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_table UNIQUE (queue_table_relid), CONSTRAINT unique_base_table UNIQUE (queue_of_base_table_relid) ); COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$The reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. 
We hope that some time down the road, this will change, and we can derive this information.$$; SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables', ''); CREATE TABLE fact_loader.queue_table_deps ( queue_table_dep_id SERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), queue_table_id INT NOT NULL REFERENCES fact_loader.queue_tables (queue_table_id), relevant_change_columns NAME[], last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_deps UNIQUE (fact_table_id, queue_table_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps', ''); CREATE TABLE fact_loader.key_retrieval_sequences ( key_retrieval_sequence_id SERIAL PRIMARY KEY, queue_table_dep_id INT NOT NULL REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), /**** In almost all cases, we only need to write one way to retrieve keys. The only exception is, for example, when in a delete case, you need to pass a different field (customer_id instead of order_id) to the delete_merge_proid function. You then need a different key_retrieval_sequence to handle a different field name for this delete case. By default this is NULL, meaning there is no filter, meaning the sequence applies to all events I, U, D. Otherwise, you can add scopes in which case you must have one for each of 'I','U','D'. 
*/ filter_scope CHAR(1) NULL, level INT NOT NULL, return_columns NAME[] NOT NULL, is_fact_key BOOLEAN NOT NULL, join_to_relation REGCLASS NULL, join_to_column NAME NULL, return_columns_from_join NAME[] NULL, join_return_is_fact_key BOOLEAN NULL, CONSTRAINT unique_retrievals UNIQUE (queue_table_dep_id, filter_scope, level), CONSTRAINT valid_scopes CHECK (filter_scope IN ('I','U','D')) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences', ''); CREATE TABLE fact_loader.fact_table_dep_queue_table_deps ( fact_table_dep_queue_table_dep_id SERIAL PRIMARY KEY, fact_table_dep_id INT REFERENCES fact_loader.fact_table_deps (fact_table_dep_id), queue_table_dep_id INT REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_cutoffs UNIQUE (fact_table_dep_id, queue_table_dep_id) ); CREATE OR REPLACE FUNCTION fact_loader.unique_scopes() RETURNS TRIGGER AS $BODY$ BEGIN IF (NEW.filter_scope IS NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NOT NULL )) OR (NEW.filter_scope IS NOT NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NULL )) THEN RAISE EXCEPTION $$You must either use a NULL filter_scope to cover all 3 events I, U, D or you must specify all 3 events separately I, U, D (For queue_table_dep_id %). 
$$, NEW.queue_table_dep_id; END IF; RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER unique_scopes BEFORE INSERT OR UPDATE ON fact_loader.key_retrieval_sequences FOR EACH ROW EXECUTE PROCEDURE fact_loader.unique_scopes(); /*** This table is unlogged because it only has data mid-transaction and should always be empty */ CREATE UNLOGGED TABLE fact_loader.process_queue ( process_queue_id BIGSERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), proid REGPROC NOT NULL, key_value TEXT NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ ); CREATE OR REPLACE FUNCTION fact_loader.set_row_updated_at_to_now() RETURNS TRIGGER AS $BODY$ BEGIN NEW.row_updated_at = now(); RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_dep_queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.process_queue FOR EACH ROW WHEN 
(NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TYPE fact_loader.table_load_type AS ENUM('delta','full_refresh'); CREATE OR REPLACE FUNCTION fact_loader.create_table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS REGPROC AS $BODY$ DECLARE v_new_proc TEXT; v_sql TEXT; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT function_name, function_sql INTO v_new_proc, v_sql FROM fact_loader.table_loader_function(p_source_proc, p_destination_relation, p_ignore_diff_for_columns); EXECUTE v_sql; RETURN v_new_proc::REGPROC; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id, proid, key_value, --TODO - either infer the data type of the function args, which is not super easy with postgres, --or add configuration fields for the name and data type of these. 
This will suffice for now --because we only have integer args for all functions 'integer' AS queue_of_base_table_key_type FROM fact_loader.process_queue pq WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT format('%s(%s::%s)', proid::TEXT, 'key_value', queue_of_base_table_key_type) AS function_call, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id) AS execute_sql FROM with_rank GROUP BY execution_group, function_call ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is the actual load to the destination table, and assumes that 'prepare' phase has already been run, which is supposed to have gathered the actual minimal delta and determine what to do here. 
*/ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT execute_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_insert_to_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Using the process_queue data, execute the delta load of the fact table 3. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT insert_to_process_queue_sql, metadata_update_sql INTO v_insert_to_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue */ RAISE LOG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_insert_to_process_queue_sql; EXECUTE COALESCE(v_insert_to_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') LIKE 'debug%' THEN FOR v_debug_rec IN SELECT * FROM fact_loader.process_queue LOOP v_debug_text = v_debug_text||E'\n'||format('%s', v_debug_rec.process_queue_id||chr(9)||v_debug_rec.fact_table_id||chr(9)||v_debug_rec.proid||chr(9)||v_debug_rec.key_value); END LOOP; IF v_debug_text <> '' THEN v_debug_text = E'\n'||format('%s', (SELECT string_agg(column_name,chr(9)) FROM information_schema.columns WHERE table_name = 'process_queue' AND table_schema = 'fact_loader' AND column_name NOT LIKE 'row_%_at')) ||v_debug_text; RAISE DEBUG '%', v_debug_text; END IF; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE LOG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE LOG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.prepare_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is not going to lock any of the destination table for writing, which is precisely why it is separated from the 'execute' phase which actually writes to the table in the shortest transaction possible. */ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT prepare_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. 
*/ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s); $$, queue_table_relid, queue_table_id_field, min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE LOG 'Purging Queue: %', v_sql; EXECUTE v_sql; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. 
*/
-- Recursive CTE: seed with direct parent->child fact table deps joined to the parent's
-- queue table deps, then walk down through arbitrarily nested child fact tables.
WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id )
/**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */
, removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) )
/**** Add any new keys or ignore if they already exist */
INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING;
END;
$BODY$ LANGUAGE plpgsql;

/* Generate (but do not execute) the prepare/execute SQL pair for refreshing a
   destination table from a source relation, matched on the destination's primary key.
   Also returns columns present on only one side, for the validator to inspect. */
CREATE OR REPLACE FUNCTION fact_loader.table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type) RETURNS TABLE (prepare_sql text, execute_sql text, unmapped_src_columns text[], unmapped_dest_columns text[]) AS $BODY$ DECLARE v_pkey_fields TEXT[]; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */
-- NOTE(review): row_number() OVER () with no ORDER BY relies on the unnest expansion
-- order to keep pkey columns in index order — appears to hold in practice; verify.
SELECT array_agg(a.attname ORDER BY pk.rn) INTO v_pkey_fields FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik;
RETURN QUERY WITH source_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_source_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position )
, destination_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position )
, unmapped_source_columns AS ( SELECT 
array_agg(s.column_name::text) AS unmapped_columns_src FROM source_columns s WHERE NOT EXISTS (SELECT 1 FROM destination_columns d WHERE d.column_name = s.column_name) )
, unmapped_dest_columns AS ( SELECT array_agg(d.column_name::text) AS unmapped_columns_dest FROM destination_columns d WHERE NOT EXISTS (SELECT 1 FROM source_columns s WHERE d.column_name = s.column_name) )
-- pkey_fields: comma list for SELECT/ON CONFLICT; pkey_join: s/d equality join on the key
, pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT unnest AS pkey_field FROM unnest(v_pkey_fields)) pk )
-- info: column lists for the generated SQL — only columns present in BOTH relations
-- are loaded, and ignored-diff columns are excluded from the change comparison only.
, info AS ( SELECT string_agg( CASE WHEN sc.column_name IS NOT NULL THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN sc.column_name IS NOT NULL AND (p_ignore_diff_for_columns IS NULL OR sc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN sc.column_name IS NOT NULL AND NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_join FROM destination_columns dc CROSS JOIN pkeys LEFT JOIN source_columns sc ON dc.column_name = sc.column_name GROUP BY pkeys.pkey_fields, pkeys.pkey_join )
-- sql_snippets: the building blocks assembled below into prepare_sql / execute_sql
, sql_snippets AS ( SELECT $$ DROP TABLE IF EXISTS count_tracker; CREATE TEMP TABLE count_tracker (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)); INSERT INTO count_tracker VALUES (NULL, NULL, FALSE, NULL); $$::TEXT AS count_tracker_sql
, $$ DROP TABLE IF EXISTS actual_delta; CREATE TEMP TABLE actual_delta AS WITH final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_source_relation::TEXT||$$ EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql
, $$ DROP TABLE IF EXISTS removed_keys; CREATE TEMP TABLE removed_keys AS SELECT $$||pkey_fields||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE NOT EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$); $$ AS removed_keys_sql
, $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM $$||p_source_relation::TEXT||$$ s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ); $$ AS except_join_to_source_sql
, $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql
, $$ /*** We add a primary key to the actual_delta table to ensure there are no duplicate keys. ***/ ALTER TABLE actual_delta ADD PRIMARY KEY ($$||pkey_fields||$$); $$ AS add_delta_pkey_sql
, $$ /**** This part is not implemented yet, but partially complete. If we decide we want to figure out that >50% of the table will be updated, we could decide to truncate. But then we have to balance the desire for that with more read queries to figure it out. To implement, add the type full_refresh_truncate to fact_loader.table_load_type, and uncomment code. We would also have to add the logic to find actual keys added, then subtract it from actual_delta to get the net updates expected. If this is over 50%, we should truncate and re-insert all data. ***/ DROP TABLE IF EXISTS percent_of_destination; CREATE TEMP TABLE percent_of_destination AS SELECT (((SELECT COUNT(1) FROM actual_delta) - (SELECT COUNT(1) FROM added_keys))::NUMERIC / (SELECT COUNT(1) FROM $$||p_destination_relation::TEXT||$$)::NUMERIC)::NUMERIC(8,2) AS pct; UPDATE count_tracker SET pct_dest = (SELECT pct FROM percent_of_destination); $$ AS percent_change_sql
,$$ DO $LOCK_SAFE_DDL$ BEGIN SET lock_timeout TO '10ms'; IF (SELECT pct FROM percent_of_destination) >= 0.5 THEN LOOP BEGIN TRUNCATE $$||p_destination_relation::TEXT||$$; UPDATE count_tracker SET truncated = true; EXIT; EXCEPTION WHEN lock_not_available THEN RAISE WARNING 'Could not obtain immediate lock for SQL %, retrying', p_sql; PERFORM pg_sleep(3); WHEN OTHERS THEN RAISE; END; END LOOP; END IF; RESET lock_timeout; END $LOCK_SAFE_DDL$ ; $$ AS lock_safe_truncate_sql
,$$ --Delete keys that are no longer in your new version
DELETE FROM $$||p_destination_relation::TEXT||$$ d WHERE EXISTS (SELECT 1 FROM removed_keys s WHERE $$||pkey_join||$$); GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET deleted = v_row_count; $$ AS delete_sql
,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET upserted = v_row_count; $$ AS upsert_sql FROM info )
-- Assemble: prepare_sql builds temp tables (no destination writes); execute_sql is a
-- single DO block meant to run in one short transaction that writes the destination.
SELECT count_tracker_sql|| CASE /*** not implemented truncate pattern WHEN p_load_type IN('full_refresh','full_refresh_truncate') THEN ***/ WHEN p_load_type = 'full_refresh' THEN removed_keys_sql||actual_delta_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ WHEN p_load_type = 'delta' THEN actual_delta_sql||key_join_exists_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ END||$$ $$|| /*** not implemented truncate pattern CASE WHEN p_load_type = 'full_refresh_truncate' THEN percent_change_sql ELSE '' END ***/ '' AS prepare_sql
, $$ --THIS SHOULD BE RUN IN A TRANSACTION
DO $SCRIPT$ DECLARE v_row_count INT; v_results RECORD; BEGIN $$|| CASE /*** not implemented truncate pattern WHEN p_load_type = 'full_refresh_truncate' THEN lock_safe_truncate_sql||delete_sql||upsert_sql ***/ WHEN p_load_type = 'full_refresh' THEN delete_sql||upsert_sql WHEN p_load_type = 'delta' THEN upsert_sql END||$$ FOR v_results IN SELECT * FROM count_tracker LOOP RAISE LOG 'upserted: %, deleted: %, truncated: %, pct_dest: %', v_results.upserted, v_results.deleted, v_results.truncated, v_results.pct_dest; END LOOP; END $SCRIPT$; $$ AS execute_sql
, (SELECT unmapped_columns_src FROM unmapped_source_columns) AS unmapped_src_columns
, (SELECT unmapped_columns_dest FROM unmapped_dest_columns) AS unmapped_dest_columns FROM sql_snippets;
END;
$BODY$ LANGUAGE plpgsql;

/* Generate a merge function (<dest>_merge) that refreshes p_destination_relation from
   the set-returning function p_source_proc, upserting on the destination primary key. */
CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. 
This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */
RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik)
, pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields)
-- Parse the source function's identity arguments into bare parameter names (arg_params)
-- and the full "name type" definitions (arg_defs) for the generated signature.
, function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg)
, function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args)
, destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position )
, pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk )
, info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs )
-- Snippets concatenated at the end into one CREATE OR REPLACE FUNCTION statement
, sql_snippets AS ( SELECT proposed_function_name
, $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start
, $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end
, $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte
, $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql
, $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm
SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql
, $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql
,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info )
SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets;
END;
$BODY$ LANGUAGE plpgsql;

/* Raise a descriptive exception when source/destination columns do not map 1:1 by
   name, unless the caller explicitly chose to ignore the unmapped columns. */
CREATE OR REPLACE FUNCTION fact_loader.table_loader_validator (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_unmapped_src_columns TEXT[], p_unmapped_dest_columns TEXT[], p_ignore_unmapped_columns BOOLEAN) RETURNS VOID AS $BODY$ DECLARE v_messages TEXT = ''; BEGIN
IF NOT p_ignore_unmapped_columns AND p_unmapped_src_columns IS NOT NULL THEN v_messages = format($$You have unmapped columns (%s) in the source table %s. All source columns must be named identically to destination in order to map. If you are certain you want to ignore these columns, meaning they will not update anything in destination table %s, add the final argument to this function as TRUE. $$ , array_to_string(p_unmapped_src_columns,', ') , p_source_relation::TEXT , p_destination_relation::TEXT); END IF;
IF NOT p_ignore_unmapped_columns AND p_unmapped_dest_columns IS NOT NULL THEN v_messages = v_messages||format($$ You have unmapped columns (%s) in the destination table %s. All destination columns must be named identically to source in order to map. 
If you are certain you want to ignore these columns, meaning the source table %s does not contain all columns in destination table, add the final argument to this function as TRUE.$$ , array_to_string(p_unmapped_dest_columns,', ') , p_destination_relation::TEXT , p_source_relation::TEXT); END IF;
IF v_messages <> '' THEN RAISE EXCEPTION '%', v_messages; END IF;
END;
$BODY$ LANGUAGE plpgsql;

/* v1.0 worker: load at most ONE eligible fact table per call, guarded by advisory
   locks so concurrent workers skip jobs already being processed. Returns TRUE if a
   job was loaded, FALSE if none could be (all locked or none enabled). */
CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; c_lock_cutoff_refresh INT = 99995; BEGIN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */
IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF;
/**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */
FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.fact_tables WHERE enabled ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
--This may be improved in the future but is a good start
last_refresh_attempted_at NULLS FIRST, priority
LOOP
IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = v_fact_record.fact_table_id) THEN
--Load fact table
PERFORM fact_loader.load(v_fact_record.fact_table_id);
/*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */
PERFORM fact_loader.purge_queues();
RETURN TRUE;
END IF;
END LOOP;
RETURN FALSE;
END;
$BODY$ LANGUAGE plpgsql;

/* pg_fact_loader--1.0--1.1.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit
-- 1.1: optional per-retrieval date casting for date-range based fact tables
ALTER TABLE fact_loader.key_retrieval_sequences ADD COLUMN pass_queue_table_change_date_at_tz TEXT NULL;
COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$;
-- now() AT TIME ZONE <tz> raises for an invalid tz name, so these CHECKs validate the value
ALTER TABLE fact_loader.key_retrieval_sequences ADD CONSTRAINT verify_valid_tz CHECK (pass_queue_table_change_date_at_tz IS NULL OR (now() AT TIME ZONE pass_queue_table_change_date_at_tz IS NOT NULL));
--This check constraint could have been added in v. 1.0
ALTER TABLE fact_loader.queue_tables ADD CONSTRAINT verify_valid_tz CHECK (queue_table_tz IS NULL OR (now() AT TIME ZONE queue_table_tz IS NOT NULL));
ALTER TABLE fact_loader.process_queue ADD COLUMN source_change_date DATE NULL;
COMMENT ON COLUMN fact_loader.process_queue.source_change_date IS 'Corresponds to fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz. If this field is populated, a function will be expected that has args (key_value, source_change_date) based on this process_queue table.';
--This should have already been added in v.
1.0 SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_deps', '');
-- 1.1 relaxes the merge proc columns: a NULL proid now means "skip this change type"
ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL;
ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid DROP NOT NULL, ALTER COLUMN update_merge_proid DROP NOT NULL, ALTER COLUMN delete_merge_proid DROP NOT NULL;
ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid DROP NOT NULL, ALTER COLUMN default_update_merge_proid DROP NOT NULL, ALTER COLUMN default_delete_merge_proid DROP NOT NULL;

/* Build the SQL that drains fact_loader.process_queue for one fact table: groups
   contiguous runs of the same merge proc, calls it per key in process_queue_id order,
   and deletes each processed row. Returns 'SELECT NULL' when the queue is empty. */
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN
RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id , proid , key_value , source_change_date , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg FROM fact_loader.process_queue pq LEFT JOIN pg_proc pp ON pp.oid = proid WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id)
, with_rank AS (SELECT /**** If source_change_date is NULL, we assume the proid has one arg and pass it. If not, we assume the proid has two args and pass source_change_date as the second. */ format('%s(%s::%s%s)' , proid::TEXT , 'key_value' , proid_first_arg , CASE WHEN source_change_date IS NOT NULL THEN format(', %s::DATE',quote_literal(source_change_date)) ELSE '' END ) AS function_call, proid, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue )
-- NOTE(review): RANK() with no ORDER BY ranks all rows in a partition equally, so each
-- proid collapses to one execution_group spanning MIN..MAX process_queue_id — confirm intended.
, execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s AND proid = %s::REGPROC ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql FROM with_rank GROUP BY execution_group, function_call, proid ORDER BY execution_group )
SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups;
END;
$BODY$ LANGUAGE plpgsql;

/* v1.1 table_loader_function: same generator as 1.0 but scopes the destination-side
   EXCEPT to the source function's key arguments (pkey_join_to_arg) to avoid full scans. */
CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. 
*/
RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik)
, pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields)
, function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg)
, function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args)
, destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position )
-- New in 1.1: pkey_join_to_arg joins destination pkey columns to the source function's
-- parameter names, so the destination side of EXCEPT only scans the affected keys.
, pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join, string_agg($$d.$$||quote_ident(pkey_field)||$$ = $$||(SELECT arg_params FROM function_schema),E'\nAND ') AS pkey_join_to_arg FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk )
, info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs , pkey_join_to_arg FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs, pkey_join_to_arg )
, sql_snippets AS ( SELECT proposed_function_name
, $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start
, $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end
, $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte
, $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE $$||pkey_join_to_arg AS actual_delta_sql
, $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm
SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql
, $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql
,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info )
SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets;
END;
$BODY$ LANGUAGE plpgsql;

/* pg_fact_loader--1.1--1.2.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit
--To support non-replicated queue tables
ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id DROP NOT NULL;
-- 1.2: error logging plus optional once-daily scheduling for fact tables
CREATE TABLE fact_loader.fact_table_refresh_logs (fact_table_refresh_log_id SERIAL PRIMARY KEY, fact_table_id INT REFERENCES fact_loader.fact_tables (fact_table_id), refresh_attempted_at TIMESTAMPTZ, messages TEXT);
ALTER TABLE fact_loader.fact_tables ADD COLUMN use_daily_schedule BOOLEAN NOT NULL DEFAULT FALSE, ADD COLUMN daily_scheduled_time TIME NULL, ADD COLUMN daily_scheduled_tz TEXT NULL, ADD COLUMN daily_scheduled_proid REGPROC, ADD CONSTRAINT verify_valid_daily_tz CHECK (daily_scheduled_tz IS NULL OR (now() AT TIME ZONE daily_scheduled_tz IS NOT NULL)), ADD CONSTRAINT daily_schedule_configured_correctly CHECK ((NOT use_daily_schedule) OR (use_daily_schedule AND daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL));
-- Failed jobs are auto-disabled by try_load; this view surfaces the ones still disabled
CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT
last_refresh_succeeded;

/* Ordered list of enabled jobs for the worker: forced-priority jobs first, then
   due daily-scheduled jobs, then least-recently-attempted queue-based jobs. */
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS SELECT * FROM fact_loader.fact_tables WHERE enabled AND (NOT use_daily_schedule OR
--Only run this job according to the same day of the daily_scheduled_time
--according to configured timezone
( (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME ) )
ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
--If a job has a daily schedule, once the time has come for the next refresh,
--prioritize it first
CASE WHEN (use_daily_schedule AND (last_refresh_attempted_at IS NULL OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE ) AND (now() AT TIME ZONE daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME) THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST,
--This may be improved in the future but is a good start
last_refresh_attempted_at NULLS FIRST, priority ;

/* Run the configured once-daily refresh proc for a fact table and record the attempt.
   Returns FALSE when the table is not configured for daily scheduling. */
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ DECLARE v_execute_sql text; BEGIN /*** There are 2 basic steps to this load: 1. Execute the single daily-refresh function 2. Update the metadata indicating the last attempt time */
SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()' INTO v_execute_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id AND use_daily_schedule;
IF v_execute_sql IS NULL THEN RETURN FALSE; END IF;
EXECUTE v_execute_sql;
UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = p_fact_table_id;
RETURN TRUE;
END;
$BODY$ LANGUAGE plpgsql;

/* Attempt to load one fact table under an advisory lock. On any error the job is
   auto-disabled and the error logged; returns TRUE only on a successful load. */
CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; BEGIN
IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN
/**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */
IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF;
--Load fact table and handle exceptions to auto-disable job and log errors in case of error
BEGIN
--Scheduled daily job
IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN PERFORM fact_loader.daily_scheduled_load(p_fact_table_id);
--Queue-based job
ELSE PERFORM fact_loader.load(p_fact_table_id);
/*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */
PERFORM fact_loader.purge_queues();
END IF;
RETURN TRUE;
EXCEPTION WHEN OTHERS THEN
-- Disable the job so it does not fail repeatedly; surfaced via unresolved_failures
UPDATE fact_loader.fact_tables SET last_refresh_succeeded = FALSE, last_refresh_attempted_at = now(), enabled = FALSE WHERE fact_table_id = p_fact_table_id;
INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, messages) VALUES (p_fact_table_id, now(), SQLERRM);
RETURN FALSE;
END;
ELSE
RETURN FALSE;
END IF;
END;
$BODY$ LANGUAGE plpgsql;

/* v1.2 worker: walk prioritized_jobs and load the first job try_load succeeds on. */
CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */
FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP
IF fact_loader.try_load(v_fact_record.fact_table_id) THEN RETURN TRUE; END IF;
END LOOP;
--If no jobs returned true, then return false
RETURN FALSE;
END;
$BODY$ LANGUAGE plpgsql;

/* pg_fact_loader--1.2--1.3.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit

-- These views are dropped here and re-created later in this script because
-- their definitions change in this version
DROP VIEW IF EXISTS fact_loader.queue_deps_all_with_retrieval;
DROP VIEW IF EXISTS fact_loader.queue_deps_all;
DROP VIEW IF EXISTS fact_loader.logical_subscription;
DROP VIEW IF EXISTS fact_loader.prioritized_jobs;
DROP VIEW IF EXISTS fact_loader.unresolved_failures;
DROP FUNCTION IF EXISTS fact_loader.sql_builder(int);

/***
Add a fact_loader_batch_id column, defaulted from the fact_loader.batch_id
sequence, to every configured queue table that does not already have one.
Idempotent: queue tables that already have the column are skipped.
*/
CREATE OR REPLACE FUNCTION fact_loader.add_batch_id_fields()
RETURNS VOID AS
$BODY$
DECLARE
  v_rec RECORD;
  v_sql TEXT;
BEGIN

FOR v_rec IN
  SELECT queue_table_relid
  FROM fact_loader.queue_tables qt
  INNER JOIN pg_class c ON c.oid = qt.queue_table_relid
  INNER JOIN pg_namespace n ON n.oid = c.relnamespace
  WHERE NOT EXISTS
    (SELECT 1
     FROM information_schema.columns col
     WHERE col.column_name = 'fact_loader_batch_id'
       AND col.table_schema = n.nspname
       AND col.table_name = c.relname)
LOOP
  -- regclass::text renders a schema-qualified, properly quoted relation name,
  -- so plain %s is safe here.  (A redundant duplicate argument to format()
  -- was removed; format() only consumes one argument for the single %s.)
  v_sql = format($F$
ALTER TABLE %s
ADD COLUMN fact_loader_batch_id
BIGINT
DEFAULT nextval('fact_loader.batch_id');
$F$, v_rec.queue_table_relid::text);

  RAISE LOG 'ADDING fact_loader_batch_id COLUMN TO queue table %: %', v_rec.queue_table_relid::text, v_sql;
  EXECUTE v_sql;
END LOOP;

END
$BODY$
LANGUAGE plpgsql;

-- When purge is false, fully-processed rows are retained in the queue table
ALTER TABLE fact_loader.queue_tables ADD COLUMN purge BOOLEAN NOT NULL DEFAULT TRUE;

-- Wrap existing plain-text messages so the column can be converted to jsonb below
UPDATE fact_loader.fact_table_refresh_logs
SET messages = jsonb_build_object('Message', messages)
WHERE messages IS NOT NULL;

--Will be re-added via \i in sql file
ALTER TABLE fact_loader.fact_table_refresh_logs
ALTER COLUMN messages TYPE jsonb USING messages::jsonb;

--This was a problem from the start
ALTER TABLE fact_loader.queue_tables
ALTER COLUMN pglogical_node_if_id TYPE OID;

--This should have been done from the start
SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_de_fact_table_dep_queue_table_de_seq', '');
SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps_fact_table_dep_id_seq', '');
SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables_fact_table_id_seq', '');
SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences_key_retrieval_sequence_id_seq', '');
SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps_queue_table_dep_id_seq', '');
SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables_queue_table_id_seq', '');

--No indexes or anything but allow debugging
CREATE UNLOGGED TABLE fact_loader.debug_process_queue (LIKE fact_loader.process_queue);
ALTER TABLE fact_loader.debug_process_queue ADD PRIMARY KEY (process_queue_id);

-- Now a temp table to avoid serialization contention
DROP TABLE fact_loader.process_queue;

--Make this a trigger to check dep fact tables
ALTER TABLE fact_loader.fact_tables
  ADD COLUMN depends_on_base_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id);
ALTER TABLE fact_loader.fact_tables
  ADD COLUMN depends_on_parent_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id);

-- A daily job must either carry its own schedule (time + tz + proid), or be
-- chained to both a base and a parent daily job
ALTER TABLE fact_loader.fact_tables DROP CONSTRAINT daily_schedule_configured_correctly;
ALTER TABLE fact_loader.fact_tables
  ADD CONSTRAINT daily_schedule_configured_correctly
  CHECK (NOT use_daily_schedule OR
         (use_daily_schedule AND
          ((daily_scheduled_time IS NOT NULL AND
            daily_scheduled_tz IS NOT NULL AND
            daily_scheduled_proid IS NOT NULL) OR
           (depends_on_base_daily_job_id IS NOT NULL AND
            depends_on_parent_daily_job_id IS NOT NULL))));

--These columns have never been used
ALTER TABLE fact_loader.fact_tables
  DROP COLUMN attempt_number,
  DROP COLUMN retries_allowed;

--This is the usual case and makes sense
ALTER TABLE fact_loader.key_retrieval_sequences ALTER COLUMN level SET DEFAULT 1;

--Need to have a more reliable dependency knowledge for scheduled jobs
ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_deps REGCLASS[];
ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_dep_delay_tolerance INTERVAL;

-- Both dep columns must be set together or not at all
ALTER TABLE fact_loader.fact_tables
  ADD CONSTRAINT daily_deps_correctly_configured
  CHECK ((daily_scheduled_deps IS NULL AND daily_scheduled_dep_delay_tolerance IS NULL) OR
         (daily_scheduled_deps IS NOT NULL AND daily_scheduled_dep_delay_tolerance IS NOT NULL));

--Log all events and add pruning
ALTER TABLE fact_loader.fact_table_refresh_logs ADD COLUMN refresh_finished_at TIMESTAMPTZ;
ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN fact_table_refresh_log_id TYPE BIGINT;

-- Handle race conditions by changing to batch usage
CREATE SEQUENCE fact_loader.batch_id;
SELECT fact_loader.add_batch_id_fields();

CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info()
RETURNS TABLE("replication_set_name" text,
              "queue_of_base_table_relid" regclass,
              "if_id" oid,
              "if_name" name,
              "source_time" timestamp with time zone)
AS
$BODY$
/***
This function exists to allow no necessary dependency
to exist on pglogical_ticker.  If the extension is used,
it will return data from its native functions, if not,
it will return a null data set matching the structure
***/
BEGIN

IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN
RETURN QUERY EXECUTE $$
SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name
  , qt.queue_of_base_table_relid
  , n.if_id
  , n.if_name
  --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise
  , CASE
      WHEN qt.pglogical_node_if_id IS NULL
        THEN now()
      ELSE t.source_time
    END AS source_time
FROM fact_loader.queue_tables qt
LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if
LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id
LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$;
ELSE
RETURN QUERY
SELECT
  NULL::TEXT AS replication_set_name
  , qt.queue_of_base_table_relid
  , NULL::OID AS if_id
  , NULL::NAME AS if_name
  --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker
  , now() AS source_time
FROM fact_loader.queue_tables qt;
END IF;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS
WITH RECURSIVE fact_table_dep_cutoffs AS
(SELECT
    1 AS level
  , qtd.queue_table_dep_id
  , ftdqc.fact_table_dep_id
  , ftdqc.fact_table_dep_queue_table_dep_id
  --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
  --fact table has been updated
  , qtd.last_cutoff_id AS dep_maximum_cutoff_id
  , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time
  , ftd.parent_id AS parent_fact_table_id
  , ftd.child_id AS child_fact_table_id
  , ftd.child_id AS base_fact_table_id
  , queue_table_id
  , relevant_change_columns
  , ftdqc.last_cutoff_id
  , ftdqc.last_cutoff_source_time
  , ftdqc.insert_merge_proid
  , ftdqc.update_merge_proid
  , ftdqc.delete_merge_proid
 FROM fact_loader.queue_table_deps qtd
 INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
 INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id

 UNION ALL

 /****
 In this recursive part, we walk UP the chain to the base level in order to get the
 last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those.

 The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs.
 That means we can get our resultant data below by simply selecting distinct ON the right fields and order
 by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst
 the queue tables and any PARENT fact table cutoffs.

 That means if, for example,
  - IF a queue table has been cutoff up until 11:00:00
  - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00
  - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed.
 */
 SELECT
    ftdc.level + 1 AS level
  , ftdc.queue_table_dep_id
  , ftdc.fact_table_dep_id
  , ftdc.fact_table_dep_queue_table_dep_id
  --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the
  --fact table has been updated
  , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id
  , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time
  , ftd.parent_id AS parent_fact_table_id
  , ftd.child_id AS child_fact_table_id
  , ftdc.base_fact_table_id
  , ftdc.queue_table_id
  , ftdc.relevant_change_columns
  , ftdc.last_cutoff_id
  , ftdc.last_cutoff_source_time
  , ftdc.insert_merge_proid
  , ftdc.update_merge_proid
  , ftdc.delete_merge_proid
 FROM fact_loader.queue_table_deps qtd
 INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id
 INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id
 INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id
)

, adjusted_fact_table_deps AS (
/****
The reason we look at distinct queue_table_dep_id and not simply queue_table_id
is because two parent fact tables could have differing logic for retrieving changes
for the same base queue_tables.
*/
SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id)
  *
FROM fact_table_dep_cutoffs
ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time
)

, queue_table_info AS (
SELECT * FROM fact_loader.queue_table_delay_info()
)

/****
For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent
fact table, and just reuse this exactly, with these distinctions:
  - From the fact_table_dep table, we do use the proids, and the last_cutoff_id
  - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent
  - We pass the information of which table for which to update metadata in the end
*/
, queue_table_deps_with_nested AS (
/****
This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables
*/
SELECT
  queue_table_dep_id
  , NULL :: INT AS fact_table_dep_id
  , NULL :: INT AS fact_table_dep_queue_table_dep_id
  , NULL :: BIGINT AS dep_maximum_cutoff_id
  , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time
  , fact_table_id
  , queue_table_id
  , relevant_change_columns
  , last_cutoff_id
  , last_cutoff_source_time
  , insert_merge_proid
  , update_merge_proid
  , delete_merge_proid
FROM fact_loader.queue_table_deps

UNION ALL

/****
This part of the union is for fact tables with other dependent fact tables
*/
SELECT
  queue_table_dep_id
  , fact_table_dep_id
  , fact_table_dep_queue_table_dep_id
  , aftd.dep_maximum_cutoff_id
  , aftd.dep_maximum_cutoff_time
  , base_fact_table_id AS fact_table_id
  , queue_table_id
  , relevant_change_columns
  , aftd.last_cutoff_id
  , aftd.last_cutoff_source_time
  , aftd.insert_merge_proid
  , aftd.update_merge_proid
  , aftd.delete_merge_proid
FROM adjusted_fact_table_deps aftd
)

SELECT
  ft.fact_table_id,
  ft.fact_table_relid,
  ft.fact_table_agg_proid,
  qt.queue_table_id,
  qt.queue_table_relid,
  qt.queue_of_base_table_relid,
  qtd.relevant_change_columns,
  qtd.last_cutoff_id,
  qtd.last_cutoff_source_time,
  rt.if_name AS provider_name,
  rt.replication_set_name,
  qtd.dep_maximum_cutoff_id,  --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter
  -- The most conservative cutoff: the minimum over this fact table's parent
  -- fact table cutoffs and its queue tables' replicated source times
  LEAST(
      MIN(qtd.dep_maximum_cutoff_time)
      OVER (
        PARTITION BY qtd.fact_table_id ),
      MIN(rt.source_time)
      OVER (
        PARTITION BY qtd.fact_table_id )
  ) AS maximum_cutoff_time,
  aqt.queue_table_id_field,
  'primary_key'::name AS queue_table_key,
  'operation'::name AS queue_table_op,
  'change'::name AS queue_table_change,
  'changed_at'::name AS queue_table_timestamp,
  qt.queue_table_tz,
  aqbt.queue_of_base_table_key,
  aqbt.queue_of_base_table_key_type,
  queue_table_dep_id,
  fact_table_dep_id,
  fact_table_dep_queue_table_dep_id,
  insert_merge_proid,
  update_merge_proid,
  delete_merge_proid,
  qt.purge
FROM queue_table_deps_with_nested qtd
INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id
INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id
INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid
-- Primary key column name and type of the base table behind this queue table
INNER JOIN LATERAL
  (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type
  FROM (SELECT
          i.indrelid
          , unnest(indkey) AS ik
          , row_number()
            OVER ()        AS rn
        FROM pg_index i
        WHERE i.indrelid = qt.queue_of_base_table_relid
          AND i.indisprimary) pk
  INNER JOIN pg_attribute a
    ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE
-- Primary key column name of the queue table itself
INNER JOIN LATERAL
  (SELECT a.attname AS queue_table_id_field
  FROM (SELECT
          i.indrelid
          , unnest(indkey) AS ik
          , row_number()
            OVER ()        AS rn
        FROM pg_index i
        WHERE i.indrelid = qt.queue_table_relid
          AND i.indisprimary) pk
  INNER JOIN pg_attribute a
    ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE
ORDER BY ft.fact_table_relid;


CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS
SELECT
  qtd.*,
  krs.filter_scope,
  krs.level,
  krs.return_columns, --we need not get the type separately.  It must match queue_of_base_table_key_type
  krs.is_fact_key,
  krs.join_to_relation,
  qtk.queue_table_relid AS join_to_relation_queue,
  krs.join_to_column,
  ctypes.join_column_type,
  krs.return_columns_from_join,
  ctypes.return_columns_from_join_type,
  krs.join_return_is_fact_key,
  /***
  We include this in this view def to be easily shared by all events (I, U, D) in sql_builder,
  as those may be different in terms of passing source_change_date.
  */
  format(', %s::DATE AS source_change_date',
    CASE
      WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL
      /***
      For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time).
      Then, we cast it to the timezone of interest on which the date should be based.
      For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time.
      Thus, any date-based fact table must decide in what time zone to consider the date.
      */
        THEN format('(%s %s AT TIME ZONE %s)',
                    'q.'||quote_ident(qtd.queue_table_timestamp),
                    CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END,
                    quote_literal(krs.pass_queue_table_change_date_at_tz))
      ELSE 'NULL'
    END) AS source_change_date_select
FROM fact_loader.queue_deps_all qtd
INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id
LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation
-- Resolve the data types of the join column and the first returned join column
LEFT JOIN LATERAL
  (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type,
          MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type
  FROM pg_attribute a
  WHERE a.attrelid IN(krs.join_to_relation)
  /****
  We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type.
  Undue complexity would ensue if we did away with that rule.
  */
  AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE;


CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour')
RETURNS VOID AS
$BODY$
/*****
The interval overlap is only important for delete cases in which you may need to join
to another audit table in order to get a deleted row's data.  1 hour is somewhat arbitrary,
but in the delete case, any related deleted rows would seem to normally appear very close to
another relation's deleted rows.  1 hour is probably generous but also safe.
*/
DECLARE
  v_sql TEXT;
BEGIN

WITH eligible_queue_tables_for_purge AS
(SELECT
  /****
  This logic should handle dependent fact tables as well,
  because they share the same queue tables but they have separately
  logged last_cutoffs.
  */
  qt.queue_table_relid
  , queue_table_timestamp
  , queue_table_tz
  , MIN(last_cutoff_id)          AS min_cutoff_id
  , MIN(last_cutoff_source_time) AS min_source_time
 FROM fact_loader.queue_deps_all qt
 WHERE qt.last_cutoff_id IS NOT NULL
   AND qt.purge
  /***
  There must be no other fact tables using the same queue
  which have not yet been processed at all
  */
  AND NOT EXISTS
    (SELECT 1
     FROM fact_loader.queue_deps_all qtdx
     WHERE qtdx.queue_table_id = qt.queue_table_id
       AND qtdx.last_cutoff_id IS NULL)
 GROUP BY qt.queue_table_relid
   , queue_table_timestamp
   , queue_table_tz)

-- Build one DELETE per eligible queue table; rows are only purged up to the
-- minimum processed batch id AND older than min_source_time - p_add_interval
SELECT
  string_agg(
    format($$
    DELETE FROM %s
    WHERE %s IN
      (SELECT %s
       FROM %s
       WHERE %s <= %s
         AND %s %s < (%s::TIMESTAMPTZ - interval %s)
       FOR UPDATE SKIP LOCKED
      );
    $$,
      queue_table_relid,
      'fact_loader_batch_id',
      'fact_loader_batch_id',
      queue_table_relid,
      'fact_loader_batch_id',
      min_cutoff_id,
      quote_ident(queue_table_timestamp),
      CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
      quote_literal(min_source_time),
      quote_literal(p_add_interval::TEXT)
    )
    , E'\n\n')
INTO v_sql
FROM eligible_queue_tables_for_purge;

IF v_sql IS NOT NULL THEN
  RAISE DEBUG 'Purging Queue: %', v_sql;
  BEGIN
    EXECUTE v_sql;
  EXCEPTION
    WHEN
serialization_failure THEN
      -- Purging is best-effort; it will be retried on a later worker run
      RAISE LOG 'Serialization failure in queue purging for transaction % - skipping.', txid_current()::text;
    WHEN OTHERS THEN
      RAISE;
  END;
END IF;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION fact_loader.worker()
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_fact_record RECORD;
BEGIN

/****
Acquire an advisory lock on the row indicating this job, which will cause the function
to simply return false if another session is running it concurrently.
It will be released upon transaction commit or rollback.

NOTE(review): in this version try_load() takes the per-job lock via
SELECT ... FOR UPDATE SKIP LOCKED rather than an advisory lock - confirm
and update this comment.
*/
FOR v_fact_record IN
  SELECT fact_table_id
  FROM fact_loader.prioritized_jobs
LOOP

  IF fact_loader.try_load(v_fact_record.fact_table_id) THEN
    --If any configured functions use temp tables,
    --must discard to avoid them hanging around in the idle background worker session
    DISCARD TEMP;

    --Log job times
    INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at)
    VALUES (v_fact_record.fact_table_id, now(), clock_timestamp());

    --Return true meaning the fact table was refreshed (this applies even if there was no new data)
    RETURN TRUE;
  END IF;

END LOOP;

--If no jobs returned true, then return false
RETURN FALSE;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
/***
This will be used by the worker, but can also be used safely if a DBA
wants to run a job manually.

Returns TRUE only when the load completed without error.  Returns FALSE when
the job row is locked by another session, when a serialization failure
occurred while locking, or when the load raised an error (in which case the
job is auto-disabled and the error detail logged as jsonb).
*/
DECLARE
  c_lock_cutoff_refresh INT = 99995;
  v_err JSONB;
  v_errmsg TEXT;
  v_errdetail TEXT;
  v_errhint TEXT;
  v_errcontext TEXT;
BEGIN

-- We except rare serialization failures here which we will ignore and move to the next record
-- Anything else should be raised
BEGIN
IF EXISTS (SELECT fact_table_id
           FROM fact_loader.fact_tables
           WHERE fact_table_id = p_fact_table_id
           FOR UPDATE SKIP LOCKED) THEN
  /****
  Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress.
  The constant c_lock_cutoff_refresh serializes this refresh across all workers.
  */
  IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN
    PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps();
  END IF;

  --Load fact table and handle exceptions to auto-disable job and log errors in case of error
  BEGIN
    --Scheduled daily job
    IF (SELECT use_daily_schedule
        FROM fact_loader.fact_tables
        WHERE fact_table_id = p_fact_table_id) THEN
      PERFORM fact_loader.daily_scheduled_load(p_fact_table_id);

    --Queue-based job
    ELSE
      PERFORM fact_loader.load(p_fact_table_id);
      /***
      Run purge process.  This need not run every launch of worker but
      it should not hurt. It is better for it to run after the fact table
      load is successful so as to avoid a rollback and more dead bloat
      */
      PERFORM fact_loader.purge_queues();
    END IF;
    RETURN TRUE;

  EXCEPTION
    WHEN OTHERS THEN
      GET STACKED DIAGNOSTICS
        v_errmsg = MESSAGE_TEXT,
        v_errdetail = PG_EXCEPTION_DETAIL,
        v_errhint = PG_EXCEPTION_HINT,
        v_errcontext = PG_EXCEPTION_CONTEXT;

      -- Auto-disable the failing job and record full error detail
      UPDATE fact_loader.fact_tables
      SET last_refresh_succeeded = FALSE,
          last_refresh_attempted_at = now(),
          enabled = FALSE
      WHERE fact_table_id = p_fact_table_id;

      -- Empty diagnostic fields are nulled and then stripped from the jsonb
      v_err = jsonb_strip_nulls(
          jsonb_build_object(
          'Message', v_errmsg,
          'Detail', case when v_errdetail = '' then null else v_errdetail end,
          'Hint', case when v_errhint = '' then null else v_errhint end,
          'Context', case when v_errcontext = '' then null else v_errcontext end)
          );

      INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at, messages)
      VALUES (p_fact_table_id, now(), clock_timestamp(), v_err);
      RETURN FALSE;
  END;

ELSE
  -- Another session holds the row lock for this job
  RETURN FALSE;
END IF;

EXCEPTION
  WHEN serialization_failure THEN
    RAISE LOG 'Serialization failure on transaction % attempting to lock % - skipping.', txid_current()::text, p_fact_table_id::text;
    RETURN FALSE;
  WHEN OTHERS THEN
    RAISE;
END;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
  v_process_queue_sql text;
  v_execute_sql text;
  v_metadata_update_sql text;
  v_debug_rec record;
  v_debug_text text = '';
BEGIN

/***
There are 3 basic steps to this load:
  1. Gather all queue table changes and insert them into a consolidated process_queue
  2. Build and execute the SQL from the process_queue to load the fact table
  3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
*/
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue.  Requires setting log_min_messages to DEBUG.
*/
IF current_setting('log_min_messages') = 'debug3' THEN
  INSERT INTO fact_loader.debug_process_queue
  SELECT * FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.

last_refresh_succeeded will be marked true always for now.  It could in the future
be used to indicate a failure in case of a caught error.
*/
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
  format(
  $$UPDATE fact_loader.fact_tables ft
  SET last_refresh_attempted_at = now(),
      last_refresh_succeeded = TRUE
  WHERE fact_table_id = %s;
  $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;


-- A job appears here when its latest refresh attempt failed and disabled it;
-- re-enabling the job clears it from this view
CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS
SELECT ft.fact_table_id,
  fact_table_relid,
  refresh_attempted_at,
  messages
FROM fact_loader.fact_tables ft
INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON
  ft.fact_table_id = ftrl.fact_table_id
  AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at
WHERE NOT enabled
  AND NOT last_refresh_succeeded;


CREATE OR REPLACE FUNCTION fact_loader.safely_terminate_workers()
RETURNS TABLE (number_terminated INT, number_still_live INT, pids_still_live INT[]) AS
$BODY$
/****
It is not a requirement to use this function to terminate workers.  Because workers
are transactional, you can simply terminate them and no data loss will result in pg_fact_loader.

Likewise, a hard crash of any system using pg_fact_loader will recover just fine upon
re-launching workers.

Still, it is ideal to avoid bloat to cleanly terminate workers and restart them using
this function to kill them, and launch_workers(int) to re-launch them.
*/
BEGIN

RETURN QUERY
WITH try_term_pids AS (
SELECT
  pid,
  CASE WHEN
    -- Only terminate sessions that are idle and have been so for a bounded
    -- window (SYMMETRIC handles the reversed bounds)
    state = 'idle' AND
    state_change BETWEEN SYMMETRIC
      now() - interval '5 seconds' AND
      now() - interval '55 seconds'
  THEN
    pg_terminate_backend(pid)
  ELSE FALSE
  END AS terminated
FROM pg_stat_activity
-- NOTE(review): worker sessions are assumed to run as the postgres user and
-- with this exact query text - confirm for non-default deployments
WHERE usename = 'postgres'
  AND query = 'SELECT fact_loader.worker();')

SELECT SUM(CASE WHEN terminated THEN 1 ELSE 0 END)::INT AS number_terminated_out,
  SUM(CASE WHEN NOT terminated THEN 1 ELSE 0 END)::INT AS number_still_live_out,
  (SELECT array_agg(pid) FROM try_term_pids WHERE NOT terminated) AS pids_still_live_out
FROM try_term_pids;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE FUNCTION fact_loader.launch_workers(number_to_launch int)
RETURNS INT[] AS
$BODY$
DECLARE
  v_pids INT[];
BEGIN

FOR i IN 1..number_to_launch
LOOP
  v_pids = array_append(v_pids, fact_loader.launch_worker());
  /*
  It's not strictly required to not launch all workers simultaneously, but it's
  also a little more invasive to do that, probably requiring more advisory lock skips.
  Better to just sleep 1 second between launches.
  */
  PERFORM pg_sleep(1);
END LOOP;

RETURN v_pids;

END;
$BODY$
LANGUAGE plpgsql;


CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
WITH jobs_with_daily_variables AS (
SELECT
  ft.*,
  /***
  Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
  */
  (--If this is the first run of a scheduled job, it is eligible
    ft.last_refresh_attempted_at IS NULL
    OR (
      --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible
      (
        ft.last_refresh_succeeded AND
        ft.last_refresh_attempted_at::DATE <
          -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent
          (now() AT TIME ZONE COALESCE(
                                ft.daily_scheduled_tz,
                                base.daily_scheduled_tz
                                )
          )::DATE
      )
      OR
      --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time
      NOT ft.last_refresh_succeeded
    )
  ) AS daily_not_attempted_today,

  -- TRUE once the clock (in the job's own timezone) has reached its scheduled time
  (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME
    BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed,

  -- TRUE when the base daily job already succeeded today (in the base job's timezone)
  base.use_daily_schedule
    AND base.last_refresh_succeeded
    AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE
    AS daily_base_job_finished,

  ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent,

  -- This should only be used in combination with daily_has_only_one_parent
  parent.use_daily_schedule
    AND parent.last_refresh_succeeded
    AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE
    AS parent_job_finished
FROM fact_loader.fact_tables ft
LEFT JOIN LATERAL
  (SELECT ftb.use_daily_schedule,
          ftb.last_refresh_succeeded,
          ftb.last_refresh_attempted_at,
          ftb.daily_scheduled_tz
   FROM fact_loader.fact_tables ftb
   WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE
LEFT JOIN LATERAL
  (SELECT ftp.use_daily_schedule,
          ftp.last_refresh_succeeded,
          ftp.last_refresh_attempted_at,
          ftp.daily_scheduled_tz
   FROM fact_loader.fact_tables ftp
   WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE
WHERE enabled
)

, jobs_with_daily_schedule_eligibility AS (
SELECT
  *,
  --Only run this job according to the same day of the daily_scheduled_time
  --according to configured timezone
  (use_daily_schedule AND daily_not_attempted_today
    AND
    (
      daily_scheduled_time_passed
      OR
      (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished))
    )
  ) AS daily_schedule_eligible
FROM jobs_with_daily_variables)

SELECT *
FROM jobs_with_daily_schedule_eligibility
WHERE NOT use_daily_schedule OR daily_schedule_eligible
ORDER BY
  CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
  --If a job has a daily schedule, once the time has come for the next refresh,
  --prioritize it first
  CASE
    WHEN daily_schedule_eligible
    THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME
    ELSE NULL
  END NULLS LAST,
  --This may be improved in the future but is a good start
  last_refresh_attempted_at NULLS FIRST,
  priority
;


CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
              gathered_queued_changes_sql text,
              process_queue_sql text,
              metadata_update_sql text) AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.
In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null
Otherwise, they must be specified separately.
*/ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. */ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', 
unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN 
'(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), 
'(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, format($$ %s AS fact_table_id, %s AS queue_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time)) AS metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid, %s::TIMESTAMPTZ AS maximum_cutoff_time $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 
'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END, quote_literal(maximum_cutoff_time)) AS global_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table E'\nINNER JOIN '||queue_of_base_table_relid::TEXT||' b'|| E'\n ON q.'||quote_ident(queue_table_key)||'::'||queue_of_base_table_key_type||' = b.'||quote_ident(queue_of_base_table_key) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. 
format($$ %s AND q.%s < %s %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END) AS global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_insert_sql) AS queue_insert_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.global_where_sql||nrs.where_for_update_sql) AS queue_update_sql, format($$ SELECT %s FROM %s %s WHERE %s $$, nrs.global_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.global_where_sql||nrs.where_for_delete_sql) AS 
queue_delete_sql, format($$ SELECT %s FROM %s WHERE %s $$, nrs.metadata_select_columns, nrs.queue_table_aliased, nrs.global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to 
follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. 
--Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = 
ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql)
SELECT raw_queued_changes_sql_out,
  gathered_queued_changes_sql_out,
  format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out,
  metadata_update_sql_out
FROM final_outputs;
$BODY$
LANGUAGE SQL;

/* Build the SQL that executes the queued merge-function calls for one fact table.
   Reads the temp table process_queue (created by the process_queue snippet from
   sql_builder) and returns a single SQL string that invokes each proid once per
   key, in process_queue_id order, batched into contiguous per-proid groups. */
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT)
RETURNS TABLE (sql TEXT) AS
$BODY$
BEGIN

RETURN QUERY
WITH ordered_process_queue AS
(SELECT process_queue_id
   , proid
   , key_value
   , source_change_date
   -- Declared type of the proc's first argument; used to cast key_value in the generated call
   , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg
 FROM process_queue pq
   LEFT JOIN pg_proc pp ON pp.oid = proid
 WHERE pq.fact_table_id = p_fact_table_id
 ORDER BY process_queue_id)

, with_rank AS
(SELECT
 /****
 If source_change_date is NULL, we assume the proid has one arg and pass it.
 If not, we assume the proid has two args and pass source_change_date as the second.
 */
 format('%s(%s::%s%s)'
   , proid::TEXT
   , 'key_value'
   , proid_first_arg
   , CASE
       WHEN source_change_date IS NOT NULL
         THEN format(', %s::DATE',quote_literal(source_change_date))
       ELSE ''
     END
 ) AS function_call,
 proid,
 process_queue_id,
 -- All rows for the same proid share an execution_group so they can be run in one statement
 RANK() OVER (PARTITION BY proid) AS execution_group
 FROM ordered_process_queue
)

, execute_sql_groups AS
(
SELECT execution_group, format($$
SELECT process_queue_id, %s
FROM (
/****
Must wrap this to execute in order of ids
***/
SELECT *
FROM process_queue
WHERE process_queue_id BETWEEN %s AND %s
  AND fact_table_id = %s
  AND proid = %s::REGPROC
ORDER BY process_queue_id) q;
$$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql
FROM with_rank
GROUP BY execution_group, function_call, proid
ORDER BY execution_group
)

SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql
FROM execute_sql_groups;

END;
$BODY$
LANGUAGE plpgsql;

/* Visibility/debugging function: returns the raw queued change rows (one row per
   queue record inside the cutoff window) by executing the raw SQL that
   sql_builder produced for the given fact table. */
CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT, queue_table_id INT,
queue_table_dep_id INT,
  fact_table_dep_id INT,
  fact_table_dep_queue_table_dep_id INT,
  queue_table_id_field BIGINT,
  fact_loader_batch_id BIGINT,
  maximum_cutoff_time TIMESTAMPTZ) AS
$BODY$
DECLARE
  v_raw_sql text;
BEGIN

-- sql_builder pre-builds the SQL; this function only executes it
SELECT raw_queued_changes_sql
INTO v_raw_sql
FROM fact_loader.sql_builder(p_fact_table_id);

RETURN QUERY EXECUTE v_raw_sql;

END;
$BODY$
LANGUAGE plpgsql;

/* Visibility/debugging function: returns the gathered (keyed and de-duplicated)
   queued changes that would be processed for the given fact table, by executing
   the gathered SQL that sql_builder produced. */
CREATE OR REPLACE FUNCTION fact_loader.gathered_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT, proid REGPROC, key_value TEXT, source_change_date DATE) AS
$BODY$
DECLARE
  v_gather_sql text;
BEGIN

SELECT gathered_queued_changes_sql
INTO v_gather_sql
FROM fact_loader.sql_builder(p_fact_table_id);

RETURN QUERY EXECUTE v_gather_sql;

END;
$BODY$
LANGUAGE plpgsql;

/* Run a daily-scheduled (non-queue-based) fact table job.
   Returns FALSE if the fact table is not configured with use_daily_schedule;
   raises an exception if any configured dependency is delayed beyond the
   configured tolerance. */
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_execute_sql text;
  v_deps regclass[];
  v_dep_delay_tolerance interval;
  v_delayed_msg text;
BEGIN
/***
There are 3 basic steps to this load:
  1. If dependencies are listed, verify they are up to date enough
  2. Execute the single daily-refresh function
  3.
Update the metadata indicating the last attempt time
*/
SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()',
  daily_scheduled_deps,
  daily_scheduled_dep_delay_tolerance
INTO
  v_execute_sql,
  v_deps,
  v_dep_delay_tolerance
FROM fact_loader.fact_tables
WHERE fact_table_id = p_fact_table_id
  AND use_daily_schedule;

-- Not configured as a daily-scheduled job: nothing to do
IF v_execute_sql IS NULL THEN
  RETURN FALSE;
END IF;

IF v_deps IS NOT NULL THEN
  -- Dependencies may be either replicated queue base tables or other fact tables;
  -- both are checked for staleness against the configured tolerance.
  WITH deps AS
  (SELECT unnest(v_deps) AS dep)

  , delays AS (
  SELECT dep, now() - source_time as delay_interval
  FROM fact_loader.queue_table_delay_info() qtd
    INNER JOIN deps d ON d.dep = qtd.queue_of_base_table_relid
  UNION ALL
  SELECT dep, now() - last_refresh_source_cutoff as delay_interval
  FROM fact_loader.fact_tables ft
    INNER JOIN deps d ON d.dep = ft.fact_table_relid
  )

  SELECT string_agg(dep::text||': Delayed '||delay_interval::text, ', ')
  INTO v_delayed_msg
  FROM delays
  WHERE delay_interval > v_dep_delay_tolerance;

  IF v_delayed_msg IS NOT NULL THEN
    RAISE EXCEPTION '%', v_delayed_msg;
  END IF;
END IF;

EXECUTE v_execute_sql;

UPDATE fact_loader.fact_tables ft
SET last_refresh_attempted_at = now(),
    last_refresh_succeeded = TRUE
WHERE fact_table_id = p_fact_table_id;

RETURN TRUE;

END;
$BODY$
LANGUAGE plpgsql;

/* Trigger function: opportunistically prune fact_table_refresh_logs rows older
   than 90 days.  The trigger fires only every 1000th insert (see the trigger's
   WHEN clause below), deleting at most step * overdrive rows per firing. */
CREATE OR REPLACE FUNCTION fact_loader.fact_table_refresh_logs_pruner() RETURNS trigger
LANGUAGE plpgsql
AS $$
declare
  step int := 1000; -- step should equal the firing frequency in trigger definition
  overdrive int := 2; -- overdrive times step = max rows (see below)

  max_rows int := step * overdrive;
  rows int;

begin
  delete from fact_loader.fact_table_refresh_logs
  where fact_table_refresh_log_id in (
    select fact_table_refresh_log_id
    from fact_loader.fact_table_refresh_logs
    where refresh_attempted_at < now() - '90 days'::interval
    -- do not do the literal interval value above as a declare parameter
    order by fact_table_refresh_log_id
    limit max_rows
    for update skip locked
  );

  get diagnostics rows = row_count;

  return null;
end
$$;

CREATE TRIGGER fact_table_refresh_logs_pruner AFTER INSERT ON
fact_loader.fact_table_refresh_logs
FOR EACH ROW
WHEN ((new.fact_table_refresh_log_id % 1000::bigint) = 0)
EXECUTE PROCEDURE fact_loader.fact_table_refresh_logs_pruner();

/***
This function exists mostly to easily mock out for testing purposes.
*/
CREATE FUNCTION fact_loader.logical_subscription()
RETURNS TABLE (sub_origin_if OID, sub_replication_sets text[])
AS $BODY$
BEGIN

IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN

  RETURN QUERY EXECUTE $$
  SELECT sub_origin_if, sub_replication_sets
  FROM pglogical.subscription;
  $$;
ELSE
  -- pglogical is not installed: return a single all-NULL row matching the shape
  RETURN QUERY
  SELECT NULL::OID, NULL::TEXT[];

END IF;

END;
$BODY$
LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info()
RETURNS TABLE("replication_set_name" text,
  "queue_of_base_table_relid" regclass,
  "if_id" oid,
  "if_name" name,
  "source_time" timestamp with time zone)
AS
$BODY$
/***
This function exists to allow no necessary dependency
to exist on pglogical_ticker.  If the extension is used,
it will return data from its native functions, if not,
it will return a null data set matching the structure
***/
BEGIN

IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN
  RETURN QUERY EXECUTE $$
  SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name
    , qt.queue_of_base_table_relid
    , n.if_id
    , n.if_name
    --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise
    , CASE
        WHEN qt.pglogical_node_if_id IS NULL
          THEN now()
        ELSE t.source_time
      END AS source_time
  FROM fact_loader.queue_tables qt
    LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if
    LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id
    LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$;
ELSE
  RETURN QUERY
  SELECT
    NULL::TEXT AS replication_set_name
    , qt.queue_of_base_table_relid
    , NULL::OID AS if_id
    , NULL::NAME AS if_name
    --source_time is now() if queue
tables are not pglogical-replicated, which is assumed if no ticker
    , now() AS source_time
  FROM fact_loader.queue_tables qt;
END IF;

END;
$BODY$
LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps()
RETURNS VOID AS
$BODY$
BEGIN
/****
This function will be used to refresh
the fact_table_dep_queue_table_deps table.
The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables.
This will be run with every call of load().
This may not be the most efficient method, but it is certainly reliable and fast.
*/

/****
Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables)
to build the fact_table_dep_queue_table_deps table.
*/
WITH RECURSIVE all_fact_table_deps AS (
  SELECT
    qtd.queue_table_dep_id
    , ftd.fact_table_dep_id
    , parent_id AS parent_fact_table_id
    , child_id AS fact_table_id
    , qtd.queue_table_id
    , qt.queue_table_relid
    , ftp.fact_table_relid AS parent_fact_table
    , ftc.fact_table_relid AS child_fact_table
    , ftd.default_insert_merge_proid
    , ftd.default_update_merge_proid
    , ftd.default_delete_merge_proid
  FROM fact_loader.queue_table_deps qtd
    INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id
    INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id
    INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id)
    INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id
  UNION ALL
  -- Recursive term: walk down chains where a dependent fact table is itself
  -- the parent of further dependent fact tables
  SELECT
    qtd.queue_table_dep_id
    , ftd.fact_table_dep_id
    , parent_id AS parent_fact_table_id
    , child_id AS fact_table_id
    , qtd.queue_table_id
    , qt.queue_table_relid
    , ftp.fact_table_relid AS parent_fact_table
    , ft.fact_table_relid AS child_fact_table
    , ftd.default_insert_merge_proid
    , ftd.default_update_merge_proid
    , ftd.default_delete_merge_proid
  FROM all_fact_table_deps qtd
    INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id
    INNER JOIN fact_loader.fact_table_deps ftd
ON ftd.parent_id = qtd.fact_table_id
    INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id
    INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id
)

/****
Remove fact_table_dep_queue_table_deps that no longer exist if applicable
*/
, removed AS (
  DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc
  WHERE NOT EXISTS(SELECT 1
                   FROM all_fact_table_deps aftd
                   WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id
                     AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id)
)

/****
Add any new keys or ignore if they already exist
Add not exists because we think allowing all records to insert and conflict
could be cause of serialization errors in repeatable read isolation.
*/
INSERT INTO fact_loader.fact_table_dep_queue_table_deps
(fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid)
SELECT fact_table_dep_id, queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid
FROM all_fact_table_deps new
WHERE NOT EXISTS
  (SELECT 1
   FROM fact_loader.fact_table_dep_queue_table_deps existing
   WHERE existing.fact_table_dep_id = new.fact_table_dep_id
     AND existing.queue_table_dep_id = new.queue_table_dep_id)
ON CONFLICT (fact_table_dep_id, queue_table_dep_id)
DO NOTHING;

END;
$BODY$
LANGUAGE plpgsql;

-- These fields are now based on batch, not on queue_table_id_field:
-- one-time migration of existing cutoff metadata to the batch-id scheme.
DO $BODY$
DECLARE
  v_rec RECORD;
  v_sql TEXT;
BEGIN

FOR v_rec IN
  SELECT format($$
  UPDATE fact_loader.%s
  SET last_cutoff_id =
  (SELECT fact_loader_batch_id
   FROM %s
   WHERE %s = %s)
  WHERE %s = %s;
  $$,
  CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_deps' ELSE 'fact_table_dep_queue_table_deps' END,
  queue_table_relid::text,
  queue_table_id_field::text,
  last_cutoff_id::text,
  CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_dep_id' ELSE 'fact_table_dep_queue_table_dep_id' END,
  CASE WHEN fact_table_dep_id IS NULL THEN queue_table_dep_id ELSE fact_table_dep_queue_table_dep_id END
  ) AS sql
FROM fact_loader.queue_deps_all
  WHERE last_cutoff_id IS NOT NULL
LOOP

v_sql = v_rec.sql;
RAISE LOG 'Updating Extension pg_fact_loader Executed: %', v_sql;
EXECUTE v_sql;

END LOOP;

END$BODY$;

-- Catalog documentation for pg_fact_loader tables and columns
COMMENT ON TABLE fact_loader.debug_process_queue IS 'A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG.';

COMMENT ON TABLE fact_loader.fact_table_dep_queue_table_deps IS
$$Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables
that depend on other fact table changes.
Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table.
Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps.
The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples of this.
$$;
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_queue_table_dep_id IS 'Unique identifier';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_id IS 'fact_table_dep for this specific dependency.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.queue_table_dep_id IS 'Inherited queue_table_dep that this dependent fact table depends on.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_id IS
$$This is unique and maintained separately from last_cutoff_id in queue_table_deps,
as it refers to the last_cutoff_id for this dependent fact table.
It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair.
After this job runs, records that have this id and lower are eligible to be pruned,
assuming no other fact tables also depend on those same records.
The next time the job runs, only records after this id are considered.$$;

COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_source_time IS
$$This is unique and maintained separately from last_cutoff_source_time in queue_table_deps,
as it refers to the last_cutoff_source_time for this dependent fact table.
It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair.
This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes
that are early enough, and not purge records that are later than these cutoff times.
It will also never go past its parent(s) in time.
THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records
as does last_cutoff_id - it is only used as an ending-point barrier.
$$;

COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.insert_merge_proid IS
$$Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required.
This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg
which is typically the key that has changed and needs to be updated.
The way to retrieve this key for this queue table - dependent fact table pair
is configured in key_retrieval_sequences *for the parent(s)*.
NULL to ignore insert events.
See the regression suite in ./sql and ./expected for examples of this.$$;

COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.update_merge_proid IS
$$Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required.
This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg
which is typically the key that has changed and needs to be updated.
The way to retrieve this key for this queue table - dependent fact table pair
is configured in key_retrieval_sequences *for the parent(s)*.
NULL to ignore update events.
See the regression suite in ./sql and ./expected for examples of this.$$;

COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.delete_merge_proid IS
$$Initially populated by default_delete_merge_proid from fact_table_deps, but can be overridden if a different proid is required.
This is the function oid to execute on DELETE events *for this dependent fact table* - it accepts a single value as its arg
which is typically the key that has changed and needs to be updated.
The way to retrieve this key for this queue table - dependent fact table pair
is configured in key_retrieval_sequences *for the parent(s)*.
NULL to ignore delete events.
See the regression suite in ./sql and ./expected for examples of this.$$;

COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';

COMMENT ON TABLE fact_loader.fact_table_deps IS 'For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here.';
COMMENT ON COLUMN fact_loader.fact_table_deps.fact_table_dep_id IS 'Unique identifier.';
COMMENT ON COLUMN fact_loader.fact_table_deps.parent_id IS 'The parent fact_table_id that the child depends on.';
COMMENT ON COLUMN fact_loader.fact_table_deps.child_id IS 'The child fact_table_id that will run only after the parent is updated.';
COMMENT ON COLUMN fact_loader.fact_table_deps.default_insert_merge_proid IS
$$Default function to use for insert events to update child tables.
This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples.$$;
COMMENT ON COLUMN fact_loader.fact_table_deps.default_update_merge_proid IS
$$Default function to use for update events to update child tables.
This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples.$$;
COMMENT ON COLUMN fact_loader.fact_table_deps.default_delete_merge_proid IS
$$Default function to use for delete events to update child tables.
This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples.$$;
COMMENT ON COLUMN fact_loader.fact_table_deps.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.fact_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';

COMMENT ON TABLE fact_loader.fact_table_refresh_logs IS 'Used to log both job run times and exceptions.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_refresh_log_id IS 'Unique identifier.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_id IS 'Fact table that created the log.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_attempted_at IS 'The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures).';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_finished_at IS 'The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time.';
COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.messages IS 'Only for failures - Error message content in JSON format - including message, message detail, context, and hint.';

COMMENT ON TABLE fact_loader.fact_tables IS 'Each fact table to be built via
pg_fact_loader, which also drives the worker. These are also referred to as "jobs".'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_id IS 'Unique identifier for the fact table or job - also referred to as job_id'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_relid IS 'The oid of the fact table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_agg_proid IS $$NOT REQUIRED. The aggregate function definition for the fact table. This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now.$$; COMMENT ON COLUMN fact_loader.fact_tables.enabled IS 'Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE.'; COMMENT ON COLUMN fact_loader.fact_tables.priority IS 'Determines the order in which the job runs (in combination with other sorting factors)'; COMMENT ON COLUMN fact_loader.fact_tables.force_worker_priority IS 'If marked TRUE, this fact table will be prioritized in execution order above all other factors.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_source_cutoff IS 'The data cutoff time of the last refresh - only records older than this have been updated.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_attempted_at IS 'The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_succeeded IS 'Whether or not the last run of the job succeeded. 
NULL if it has never been run.'; COMMENT ON COLUMN fact_loader.fact_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.fact_tables.use_daily_schedule IS 'If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_time IS 'The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_tz IS 'The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_proid IS $$The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples.$$; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_base_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time.'; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_parent_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_deps IS 'OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. 
This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_dep_delay_tolerance IS 'OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON TABLE fact_loader.key_retrieval_sequences IS $$How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. 
NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.key_retrieval_sequence_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.queue_table_dep_id IS 'Which fact table - queue table record this is for (queue_table_deps)'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.filter_scope IS $$NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.level IS $$Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns IS $$What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.is_fact_key IS 'Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. 
Otherwise, FALSE if you need to join to get more information.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_relation IS 'Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_column IS 'Join to this column of join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns_from_join IS 'Return these columns from join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_return_is_fact_key IS 'If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; COMMENT ON VIEW fact_loader.queue_deps_all_with_retrieval IS 'The master view which builds on queue_deps_all to include key_retrieval_sequences. This is the main view used by sql_builder(int) to gather all queued changes.'; COMMENT ON TABLE fact_loader.queue_table_deps IS $$Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. 
**NOTE** that anything that exists in queue_table_dep is assumed to be require its queue data not to be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.queue_table_deps.fact_table_id IS 'Fact table to tie together with a queue table it depends on.'; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_id IS 'Queue table to tie together with a fact table that needs its changes.'; COMMENT ON COLUMN fact_loader.queue_table_deps.relevant_change_columns IS $$Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_id IS $$The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_source_time IS $$The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. 
THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.queue_table_deps.insert_merge_proid IS $$Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.update_merge_proid IS $$Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore update events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.delete_merge_proid IS $$Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. 
NULL to ignore delete events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.queue_tables IS 'Each queue table along with the base table to which it belongs.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_id IS 'Unique identifier for queue tables.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_relid IS 'The oid of the queue table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_of_base_table_relid IS 'The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table.$$; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_tz IS $$**NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ. 
If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz.$$;
COMMENT ON COLUMN fact_loader.queue_tables.row_created_at IS 'Timestamp of when this row was first created.';
COMMENT ON COLUMN fact_loader.queue_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).';
COMMENT ON COLUMN fact_loader.queue_tables.purge IS 'Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table.';
COMMENT ON VIEW fact_loader.unresolved_failures IS 'Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring.';

/* pg_fact_loader--1.3--1.4.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

-- 1.4 replaces the old raw_queued_changes(int) signature and relaxes the
-- debug_process_queue PK (presumably to allow duplicate ids - TODO confirm against 1.4 notes)
DROP FUNCTION fact_loader.raw_queued_changes(int);
ALTER TABLE fact_loader.debug_process_queue DROP CONSTRAINT debug_process_queue_pkey;

-- Run one queue-based fact table job: gather queued changes, execute the merge
-- functions for them, and advance the job/queue metadata cutoffs.
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
    v_process_queue_sql text;     -- SQL that populates the temp process_queue
    v_execute_sql text;           -- SQL that executes the merge functions
    v_metadata_update_sql text;   -- SQL that advances cutoff metadata
BEGIN

/***
There are 3 basic steps to this load:
    1. Gather all queue table changes and insert them into a consolidated process_queue
    2. Using the process_queue, execute the configured merge functions to update the fact table
    3. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
*/
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue. Requires setting
log_min_messages to debug3 (the check below is an exact match on 'debug3').
*/
IF current_setting('log_min_messages') = 'debug3' THEN
    INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date)
    -- the row timestamps are not populated, so we set them here
    SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date
    FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.

last_refresh_succeeded will be marked true always for now. It could in the future
be used to indicate a failure in case of a caught error.
*/
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
    format(
        $$UPDATE fact_loader.fact_tables ft
        SET last_refresh_attempted_at = now(),
            last_refresh_succeeded = TRUE
        WHERE fact_table_id = %s;
        $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
              gathered_queued_changes_sql text,
              process_queue_sql text,
              metadata_update_sql text) AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.
In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null

Otherwise, they must be specified separately.
*/
WITH RECURSIVE queue_deps_with_insert_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'I' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_update_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'U' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_delete_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'D' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

/****
Recursively build the SQL for any INSERT events found in the queues.

The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table.
*/ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER 
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN 
insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. -- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ %s < %s %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), changed_at_tz_correction) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON 
(queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, 
and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, 
queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE 
fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; /* fact_loader.raw_queued_changes(p_fact_table_id): thin plpgsql wrapper that fetches the dynamically generated raw_queued_changes_sql string from fact_loader.sql_builder() for the given fact table id and executes it with RETURN QUERY EXECUTE. Returns one row of queue metadata (queue/dep ids, fact_loader_batch_id, maximum_cutoff_time, min_missed_id, queue_table_id) per pending queue entry. NOTE(review): the executed SQL is built entirely by sql_builder from catalog metadata, not from user input, so dynamic execution here is presumably safe -- confirm that all identifier interpolation in sql_builder goes through quote_ident/quote_literal. */ CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, queue_table_dep_id INT, fact_table_dep_id INT, fact_table_dep_queue_table_dep_id INT, queue_table_id_field BIGINT, fact_loader_batch_id BIGINT, maximum_cutoff_time TIMESTAMPTZ, min_missed_id BIGINT, queue_table_id INT ) AS $BODY$ DECLARE v_raw_sql text; BEGIN SELECT raw_queued_changes_sql INTO v_raw_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_raw_sql; END; $BODY$ LANGUAGE plpgsql; COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; /* pg_fact_loader--1.4--1.5.sql */ -- complain if script is sourced in
psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit /* fact_loader.sql_builder(p_fact_table_id) -- 1.5 version. Pure SQL function that GENERATES SQL: for one fact table it returns four text columns: 1) raw_queued_changes_sql - selects raw queue ids/metadata from every dependent queue table; 2) gathered_queued_changes_sql - joins queue rows back to source tables to produce (fact_table_id, proid, key, source_change_date) execution groups ordered by changed_at; 3) process_queue_sql - the same gather wrapped with creation of the temp process_queue table; 4) metadata_update_sql - advances last_cutoff_id/last_cutoff_source_time on queue_table_deps / fact_table_dep_queue_table_deps and stamps fact_tables as refreshed. The recursive CTEs (insert/update/delete_sql_builder) only recurse when key_retrieval_sequences has multiple levels, i.e. multiple joins are needed to reach the fact key; the *_final CTEs then keep the deepest level per queue_table_dep_id via DISTINCT ON ... ORDER BY level DESC. Change vs 1.4: the outer WHERE now filters on the already time-zone-corrected changed_at instead of re-applying the tz correction outside the subquery. NOTE(review): in the recursive arms the multi-column key branches look inconsistent with their level-1 counterparts: the insert/update recursion emits unnest(b....) without the array[...] wrapper, and the delete recursion emits array[... without the closing ']' and drops the '.'-suffix in the array_to_string separator -- these branches would generate invalid SQL for multi-column return_columns at level > 1; presumably never exercised by the regression suite -- confirm upstream before relying on multi-column nested key retrieval. Identifier quoting is mixed: some interpolations use quote_ident, others concatenate raw -- verify metadata is trusted. */ CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. */ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table.
*/ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n  ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n  ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN
array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n  ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n  ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN
array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ',
j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN
insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. -- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ -- changed_at is guaranteed now to be in timestamptz - any time zone casting is only in subquery changed_at < %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_literal(c.maximum_cutoff_time) ) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) ,
update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This
first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT
MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL
) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; /* pg_fact_loader--1.5--1.6.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit /* 1.5 -> 1.6 upgrade: converts every *_proid column from REGPROC to TEXT so that function OIDs survive dump/restore and logical replication; each converted column gets a named CHECK constraint that casts back to REGPROC (COALESCE with 'boolin' keeps NULLs valid) to guarantee the stored text still resolves to a real function. The three dependent views are dropped first because their definitions reference the altered columns, then recreated below. search_path is emptied so the REGPROC-to-text casts emit fully schema-qualified names. */ DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP VIEW fact_loader.prioritized_jobs; -- Must ensure we have the fully schema-qualified regprod before converting to text SET search_path TO ''; ALTER TABLE fact_loader.debug_process_queue ALTER COLUMN proid TYPE TEXT; ALTER TABLE fact_loader.debug_process_queue ADD CONSTRAINT check_proid CHECK (COALESCE(proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_delete_merge_proid CHECK (COALESCE(default_delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_insert_merge_proid CHECK (COALESCE(default_insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_update_merge_proid CHECK (COALESCE(default_update_merge_proid::REGPROC, 'boolin') IS NOT
NULL); ALTER TABLE fact_loader.fact_tables ALTER COLUMN daily_scheduled_proid TYPE TEXT; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_daily_scheduled_proid CHECK (COALESCE(daily_scheduled_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_tables ALTER COLUMN fact_table_agg_proid TYPE TEXT; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_fact_table_agg_proid CHECK (COALESCE(fact_table_agg_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL); RESET search_path; /* prioritized_jobs: one row per enabled fact table, ordered by refresh priority for the worker. jobs_with_daily_variables computes, per job, the boolean inputs to daily-schedule eligibility (first run, attempted-yesterday, re-enabled-after-failure, scheduled time passed, base/parent daily job finished today); jobs_with_daily_schedule_eligibility combines them. Non-daily jobs always appear; daily jobs appear only when eligible. Ordering: force_worker_priority first, then eligible daily jobs by their scheduled time, then least-recently-attempted (NULLS FIRST), then configured priority. NOTE(review): daily_base_job_finished compares last_refresh_attempted_at dates in the base job's timezone -- presumably assumes base and dependent share a scheduling day; confirm for cross-timezone chains. */ CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; /* queue_deps_all: recursive view that walks fact-table dependency chains upward so child fact tables never read queue data past their parents' last cutoffs (definition continues beyond this point). */ CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those.
The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; /* pg_fact_loader--1.6--1.7.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP VIEW fact_loader.prioritized_jobs; ALTER TABLE fact_loader.fact_tables ADD COLUMN pre_execute_hook_sql TEXT; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; v_pre_execute_hook_sql text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** Pre-execute hook */ SELECT pre_execute_hook_sql INTO v_pre_execute_hook_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id; EXECUTE COALESCE(v_pre_execute_hook_sql, $$SELECT 'No pre-execute hook.' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date) -- the row timestamps are not populated, so we set them here SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. */ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!! 
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. 
The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; pg_fact_loader-2.0.1/pg_fact_loader--2.0.sql000066400000000000000000007406441451107006500205050ustar00rootroot00000000000000/* pg_fact_loader--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE TABLE fact_loader.fact_tables ( fact_table_id SERIAL PRIMARY KEY, fact_table_relid REGCLASS NOT NULL, fact_table_agg_proid REGPROC NULL, --This may only be used to generate a merge function but is not used in automation enabled BOOLEAN NOT NULL DEFAULT FALSE, priority INT, attempt_number INT, retries_allowed INT DEFAULT 0, force_worker_priority BOOLEAN NOT NULL DEFAULT FALSE, last_refresh_source_cutoff TIMESTAMPTZ, last_refresh_attempted_at TIMESTAMPTZ, --TODO - answer if we want the worker to bail or record messages on ERROR (or both) last_refresh_succeeded BOOLEAN, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_tables UNIQUE (fact_table_relid) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables', ''); CREATE TABLE fact_loader.fact_table_deps ( fact_table_dep_id SERIAL PRIMARY KEY, parent_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), child_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), /***** In very many cases, you will use the same procs for insert, update, and delete even with multiple dependencies. This is why you must give defaults here which will be used to auto-populate fact_loader.fact_table_dep_queue_table_deps which can be overridden if necessary for each queue table. After you configure all of your fact tables and queue tables, run the function refresh_fact_table_dep_queue_table_deps manually to populate fact_table_dep_queue_table_deps, then make any changes as necessary. You can see an example of this in the test suite "seeds" file. 
You can also see an override example with order_emails_fact having a different proc for orders and reorders delete cases. */ default_insert_merge_proid REGPROC NOT NULL, default_update_merge_proid REGPROC NOT NULL, default_delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_deps UNIQUE (parent_id, child_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps', ''); CREATE TABLE fact_loader.queue_tables ( queue_table_id SERIAL PRIMARY KEY, queue_table_relid REGCLASS NOT NULL, queue_of_base_table_relid REGCLASS NOT NULL, /**** NOTE - the reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information. */ pglogical_node_if_id INT NOT NULL, --This is the timezone for the changed_at column - if null, we assume it is timestamptz (we could check that actually) queue_table_tz TEXT, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_table UNIQUE (queue_table_relid), CONSTRAINT unique_base_table UNIQUE (queue_of_base_table_relid) ); COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$The reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. 
We hope that some time down the road, this will change, and we can derive this information.$$; SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables', ''); CREATE TABLE fact_loader.queue_table_deps ( queue_table_dep_id SERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), queue_table_id INT NOT NULL REFERENCES fact_loader.queue_tables (queue_table_id), relevant_change_columns NAME[], last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_deps UNIQUE (fact_table_id, queue_table_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps', ''); CREATE TABLE fact_loader.key_retrieval_sequences ( key_retrieval_sequence_id SERIAL PRIMARY KEY, queue_table_dep_id INT NOT NULL REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), /**** In almost all cases, we only need to write one way to retrieve keys. The only exception is, for example, when in a delete case, you need to pass a different field (customer_id instead of order_id) to the delete_merge_proid function. You then need a different key_retrieval_sequence to handle a different field name for this delete case. By default this is NULL, meaning there is no filter, meaning the sequence applies to all events I, U, D. Otherwise, you can add scopes in which case you must have one for each of 'I','U','D'. 
*/ filter_scope CHAR(1) NULL, level INT NOT NULL, return_columns NAME[] NOT NULL, is_fact_key BOOLEAN NOT NULL, join_to_relation REGCLASS NULL, join_to_column NAME NULL, return_columns_from_join NAME[] NULL, join_return_is_fact_key BOOLEAN NULL, CONSTRAINT unique_retrievals UNIQUE (queue_table_dep_id, filter_scope, level), CONSTRAINT valid_scopes CHECK (filter_scope IN ('I','U','D')) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences', ''); CREATE TABLE fact_loader.fact_table_dep_queue_table_deps ( fact_table_dep_queue_table_dep_id SERIAL PRIMARY KEY, fact_table_dep_id INT REFERENCES fact_loader.fact_table_deps (fact_table_dep_id), queue_table_dep_id INT REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_cutoffs UNIQUE (fact_table_dep_id, queue_table_dep_id) ); CREATE OR REPLACE FUNCTION fact_loader.unique_scopes() RETURNS TRIGGER AS $BODY$ BEGIN IF (NEW.filter_scope IS NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NOT NULL )) OR (NEW.filter_scope IS NOT NULL AND EXISTS ( SELECT 1 FROM fact_loader.key_retrieval_sequences WHERE queue_table_dep_id <> NEW.queue_table_dep_id AND NEW.filter_scope IS NULL )) THEN RAISE EXCEPTION $$You must either use a NULL filter_scope to cover all 3 events I, U, D or you must specify all 3 events separately I, U, D (For queue_table_dep_id %). 
$$, NEW.queue_table_dep_id; END IF; RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER unique_scopes BEFORE INSERT OR UPDATE ON fact_loader.key_retrieval_sequences FOR EACH ROW EXECUTE PROCEDURE fact_loader.unique_scopes(); /*** This table is unlogged because it only has data mid-transaction and should always be empty */ CREATE UNLOGGED TABLE fact_loader.process_queue ( process_queue_id BIGSERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), proid REGPROC NOT NULL, key_value TEXT NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ ); CREATE OR REPLACE FUNCTION fact_loader.set_row_updated_at_to_now() RETURNS TRIGGER AS $BODY$ BEGIN NEW.row_updated_at = now(); RETURN NEW; END; $BODY$ LANGUAGE plpgsql; CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_tables FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.fact_table_dep_queue_table_deps FOR EACH ROW WHEN (NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TRIGGER set_row_updated_at_to_now BEFORE INSERT OR UPDATE ON fact_loader.process_queue FOR EACH ROW WHEN 
(NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); CREATE TYPE fact_loader.table_load_type AS ENUM('delta','full_refresh'); CREATE OR REPLACE FUNCTION fact_loader.create_table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS REGPROC AS $BODY$ DECLARE v_new_proc TEXT; v_sql TEXT; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT function_name, function_sql INTO v_new_proc, v_sql FROM fact_loader.table_loader_function(p_source_proc, p_destination_relation, p_ignore_diff_for_columns); EXECUTE v_sql; RETURN v_new_proc::REGPROC; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT) RETURNS TABLE (sql TEXT) AS $BODY$ BEGIN RETURN QUERY WITH ordered_process_queue AS (SELECT process_queue_id, proid, key_value, --TODO - either infer the data type of the function args, which is not super easy with postgres, --or add configuration fields for the name and data type of these. 
This will suffice for now --because we only have integer args for all functions 'integer' AS queue_of_base_table_key_type FROM fact_loader.process_queue pq WHERE pq.fact_table_id = p_fact_table_id ORDER BY process_queue_id) , with_rank AS (SELECT format('%s(%s::%s)', proid::TEXT, 'key_value', queue_of_base_table_key_type) AS function_call, process_queue_id, RANK() OVER (PARTITION BY proid) AS execution_group FROM ordered_process_queue ) , execute_sql_groups AS ( SELECT execution_group, format($$ WITH newly_processed AS ( SELECT process_queue_id, %s FROM ( /**** Must wrap this to execute in order of ids ***/ SELECT * FROM fact_loader.process_queue WHERE process_queue_id BETWEEN %s AND %s AND fact_table_id = %s ORDER BY process_queue_id) q ) DELETE FROM fact_loader.process_queue pq USING newly_processed np WHERE np.process_queue_id = pq.process_queue_id; $$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id) AS execute_sql FROM with_rank GROUP BY execution_group, function_call ORDER BY execution_group ) SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql FROM execute_sql_groups; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.execute_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is the actual load to the destination table, and assumes that 'prepare' phase has already been run, which is supposed to have gathered the actual minimal delta and determine what to do here. 
*/ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT execute_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_insert_to_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Using the process_queue data, execute the delta load of the fact table 3. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT insert_to_process_queue_sql, metadata_update_sql INTO v_insert_to_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue */ RAISE LOG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_insert_to_process_queue_sql; EXECUTE COALESCE(v_insert_to_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') LIKE 'debug%' THEN FOR v_debug_rec IN SELECT * FROM fact_loader.process_queue LOOP v_debug_text = v_debug_text||E'\n'||format('%s', v_debug_rec.process_queue_id||chr(9)||v_debug_rec.fact_table_id||chr(9)||v_debug_rec.proid||chr(9)||v_debug_rec.key_value); END LOOP; IF v_debug_text <> '' THEN v_debug_text = E'\n'||format('%s', (SELECT string_agg(column_name,chr(9)) FROM information_schema.columns WHERE table_name = 'process_queue' AND table_schema = 'fact_loader' AND column_name NOT LIKE 'row_%_at')) ||v_debug_text; RAISE DEBUG '%', v_debug_text; END IF; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE LOG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE LOG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.prepare_table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type, p_ignore_unmapped_columns BOOLEAN = FALSE) RETURNS TABLE (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)) AS $BODY$ /*** The SQL executed within this container is not going to lock any of the destination table for writing, which is precisely why it is separated from the 'execute' phase which actually writes to the table in the shortest transaction possible. */ DECLARE v_sql TEXT; v_unmapped_src_columns TEXT[]; v_unmapped_dest_columns TEXT[]; BEGIN SELECT prepare_sql, unmapped_src_columns, unmapped_dest_columns INTO v_sql, v_unmapped_src_columns, v_unmapped_dest_columns FROM fact_loader.table_loader( p_source_relation, p_destination_relation, p_ignore_diff_for_columns, p_load_type); PERFORM fact_loader.table_loader_validator(p_source_relation, p_destination_relation, v_unmapped_src_columns, v_unmapped_dest_columns, p_ignore_unmapped_columns); RAISE LOG 'Executing SQL: %', v_sql; EXECUTE v_sql; RETURN QUERY SELECT * FROM count_tracker; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. 
*/ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , qt.queue_table_id_field , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s); $$, queue_table_relid, queue_table_id_field, min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE LOG 'Purging Queue: %', v_sql; EXECUTE v_sql; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps() RETURNS VOID AS $BODY$ BEGIN /**** This function will be used to refresh the fact_table_dep_queue_table_deps table. The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables. This will be run with every call of load(). This may not be the most efficient method, but it is certainly reliable and fast. */ /**** Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables) to build the fact_table_dep_queue_table_deps table. 
*/ WITH RECURSIVE all_fact_table_deps AS ( SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ftc.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id) INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id UNION ALL SELECT qtd.queue_table_dep_id , ftd.fact_table_dep_id , parent_id AS parent_fact_table_id , child_id AS fact_table_id , qtd.queue_table_id , qt.queue_table_relid , ftp.fact_table_relid AS parent_fact_table , ft.fact_table_relid AS child_fact_table , ftd.default_insert_merge_proid , ftd.default_update_merge_proid , ftd.default_delete_merge_proid FROM all_fact_table_deps qtd INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id ) /**** Remove fact_table_dep_queue_table_deps that no longer exist if applicable */ , removed AS ( DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc WHERE NOT EXISTS(SELECT 1 FROM all_fact_table_deps aftd WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id) ) /**** Add any new keys or ignore if they already exist */ INSERT INTO fact_loader.fact_table_dep_queue_table_deps (fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_dep_id, 
queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid FROM all_fact_table_deps ON CONFLICT (fact_table_dep_id, queue_table_dep_id) DO NOTHING; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[], p_load_type fact_loader.table_load_type) RETURNS TABLE (prepare_sql text, execute_sql text, unmapped_src_columns text[], unmapped_dest_columns text[]) AS $BODY$ DECLARE v_pkey_fields TEXT[]; BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ SELECT array_agg(a.attname ORDER BY pk.rn) INTO v_pkey_fields FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik; RETURN QUERY WITH source_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_source_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , destination_columns AS ( SELECT column_name, ordinal_position, CASE WHEN column_name = ANY(v_pkey_fields) THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname ORDER BY ordinal_position ) , unmapped_source_columns AS ( SELECT 
array_agg(s.column_name::text) AS unmapped_columns_src FROM source_columns s WHERE NOT EXISTS (SELECT 1 FROM destination_columns d WHERE d.column_name = s.column_name) ) , unmapped_dest_columns AS ( SELECT array_agg(d.column_name::text) AS unmapped_columns_dest FROM destination_columns d WHERE NOT EXISTS (SELECT 1 FROM source_columns s WHERE d.column_name = s.column_name) ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT unnest AS pkey_field FROM unnest(v_pkey_fields)) pk ) , info AS ( SELECT string_agg( CASE WHEN sc.column_name IS NOT NULL THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN sc.column_name IS NOT NULL AND (p_ignore_diff_for_columns IS NULL OR sc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN sc.column_name IS NOT NULL AND NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' 
|| dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_join FROM destination_columns dc CROSS JOIN pkeys LEFT JOIN source_columns sc ON dc.column_name = sc.column_name GROUP BY pkeys.pkey_fields, pkeys.pkey_join ) , sql_snippets AS ( SELECT $$ DROP TABLE IF EXISTS count_tracker; CREATE TEMP TABLE count_tracker (upserted INT, deleted INT, truncated BOOLEAN, pct_dest NUMERIC(8,2)); INSERT INTO count_tracker VALUES (NULL, NULL, FALSE, NULL); $$::TEXT AS count_tracker_sql , $$ DROP TABLE IF EXISTS actual_delta; CREATE TEMP TABLE actual_delta AS WITH final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_source_relation::TEXT||$$ EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ DROP TABLE IF EXISTS removed_keys; CREATE TEMP TABLE removed_keys AS SELECT $$||pkey_fields||$$ FROM $$||p_destination_relation::TEXT||$$ d WHERE NOT EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$); $$ AS removed_keys_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM $$||p_source_relation::TEXT||$$ s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ); $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. ***/ WHERE EXISTS (SELECT 1 FROM $$||p_source_relation::TEXT||$$ s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql , $$ /*** We add a primary key to the actual_delta table to ensure there are no duplicate keys. 
***/ ALTER TABLE actual_delta ADD PRIMARY KEY ($$||pkey_fields||$$); $$ AS add_delta_pkey_sql , $$ /**** This part is not implemented yet, but partially complete. If we decide we want to figure out that >50% of the table will be updated, we could decide to truncate. But then we have to balance the desire for that with more read queries to figure it out. To implement, add the type full_refresh_truncate to fact_loader.table_load_type, and uncomment code. We would also have to add the logic to find actual keys added, then subtract it from actual_delta to get the net updates expected. If this is over 50%, we should truncate and re-insert all data. ***/ DROP TABLE IF EXISTS percent_of_destination; CREATE TEMP TABLE percent_of_destination AS SELECT (((SELECT COUNT(1) FROM actual_delta) - (SELECT COUNT(1) FROM added_keys))::NUMERIC / (SELECT COUNT(1) FROM $$||p_destination_relation::TEXT||$$)::NUMERIC)::NUMERIC(8,2) AS pct; UPDATE count_tracker SET pct_dest = (SELECT pct FROM percent_of_destination); $$ AS percent_change_sql ,$$ DO $LOCK_SAFE_DDL$ BEGIN SET lock_timeout TO '10ms'; IF (SELECT pct FROM percent_of_destination) >= 0.5 THEN LOOP BEGIN TRUNCATE $$||p_destination_relation::TEXT||$$; UPDATE count_tracker SET truncated = true; EXIT; EXCEPTION WHEN lock_not_available THEN RAISE WARNING 'Could not obtain immediate lock for SQL %, retrying', p_sql; PERFORM pg_sleep(3); WHEN OTHERS THEN RAISE; END; END LOOP; END IF; RESET lock_timeout; END $LOCK_SAFE_DDL$ ; $$ AS lock_safe_truncate_sql ,$$ --Delete keys that are no longer in your new version DELETE FROM $$||p_destination_relation::TEXT||$$ d WHERE EXISTS (SELECT 1 FROM removed_keys s WHERE $$||pkey_join||$$); GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET deleted = v_row_count; $$ AS delete_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET 
$$||upsert_list||$$ ; GET DIAGNOSTICS v_row_count = ROW_COUNT; UPDATE count_tracker SET upserted = v_row_count; $$ AS upsert_sql FROM info ) SELECT count_tracker_sql|| CASE /*** not implemented truncate pattern WHEN p_load_type IN('full_refresh','full_refresh_truncate') THEN ***/ WHEN p_load_type = 'full_refresh' THEN removed_keys_sql||actual_delta_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ WHEN p_load_type = 'delta' THEN actual_delta_sql||key_join_exists_sql||except_join_to_source_sql||add_delta_pkey_sql||$$;$$ END||$$ $$|| /*** not implemented truncate pattern CASE WHEN p_load_type = 'full_refresh_truncate' THEN percent_change_sql ELSE '' END ***/ '' AS prepare_sql , $$ --THIS SHOULD BE RUN IN A TRANSACTION DO $SCRIPT$ DECLARE v_row_count INT; v_results RECORD; BEGIN $$|| CASE /*** not implemented truncate pattern WHEN p_load_type = 'full_refresh_truncate' THEN lock_safe_truncate_sql||delete_sql||upsert_sql ***/ WHEN p_load_type = 'full_refresh' THEN delete_sql||upsert_sql WHEN p_load_type = 'delta' THEN upsert_sql END||$$ FOR v_results IN SELECT * FROM count_tracker LOOP RAISE LOG 'upserted: %, deleted: %, truncated: %, pct_dest: %', v_results.upserted, v_results.deleted, v_results.truncated, v_results.pct_dest; END LOOP; END $SCRIPT$; $$ AS execute_sql , (SELECT unmapped_columns_src FROM unmapped_source_columns) AS unmapped_src_columns , (SELECT unmapped_columns_dest FROM unmapped_dest_columns) AS unmapped_dest_columns FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_function (p_source_proc REGPROC, p_destination_relation REGCLASS, p_ignore_diff_for_columns TEXT[]) RETURNS TABLE (function_name text, function_sql text) AS $BODY$ BEGIN /**** Find the primary key for the destination table. This is required. If the destination table does not have a primary key, it should. 
This is partly for simplicity, and partly to encourage good practice that we build and refresh tables based on chosen primary key to match records 1 for 1, which is basic DB design 101. */ RETURN QUERY WITH get_pkey_fields AS ( SELECT a.attname, format_type(a.atttypid, a.atttypmod) AS atttype, pk.rn FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) , pkey_fields_sorted AS (SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields) , function_args AS (SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg) , function_schema AS (SELECT string_agg(arg[1],', ') AS arg_params, pg_get_function_identity_arguments(p_source_proc) AS arg_defs FROM function_args) , destination_columns AS ( SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field FROM information_schema.columns c INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name ORDER BY ordinal_position ) , pkeys AS ( SELECT string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields, string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl, string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join FROM (SELECT attname AS pkey_field, atttype AS pkey_type FROM get_pkey_fields ORDER BY rn) pk ) , info AS ( SELECT string_agg( dc.column_name, E'\n , ' ORDER BY dc.ordinal_position ) AS matching_column_list , string_agg( CASE WHEN (p_ignore_diff_for_columns IS NULL OR dc.column_name != ALL (p_ignore_diff_for_columns) ) THEN dc.column_name ELSE NULL END, E'\n , ' ORDER 
BY dc.ordinal_position ) AS matching_column_list_without_ignored , string_agg( CASE WHEN NOT dc.pkey_field THEN dc.column_name || ' = EXCLUDED.' || dc.column_name ELSE NULL END, E'\n , ' ORDER BY dc.ordinal_position ) AS upsert_list , pkeys.pkey_fields , pkeys.pkey_fields_ddl , pkeys.pkey_join , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name , fs.arg_params , fs.arg_defs FROM destination_columns dc CROSS JOIN pkeys CROSS JOIN function_schema fs GROUP BY pkeys.pkey_fields, pkeys.pkey_fields_ddl, pkeys.pkey_join, quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'), fs.arg_params, fs.arg_defs ) , sql_snippets AS ( SELECT proposed_function_name , $$ CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$) RETURNS VOID AS $FUNC$ BEGIN $$::TEXT AS function_start , $$ END; $FUNC$ LANGUAGE plpgsql; $$::TEXT AS function_end , $$ WITH actual_delta AS ( $$::TEXT AS actual_delta_cte , $$ WITH data AS ( SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$) ) , final_diff AS ( SELECT $$||pkey_fields||$$ FROM (SELECT $$||matching_column_list_without_ignored||$$ FROM data EXCEPT SELECT $$||matching_column_list_without_ignored||$$ FROM $$||p_destination_relation::TEXT||$$ d $$ AS actual_delta_sql , $$ ) full_diff) --This extra step is necessarily precisely because we may want to not except every column, like load_dttm SELECT * FROM data s WHERE EXISTS ( SELECT 1 FROM final_diff d WHERE $$||pkey_join||$$ ) $$ AS except_join_to_source_sql , $$ /*** We add the exists here because we are only looking for column-level differences for the given keys that have changed. This may be a very small portion of the table. Without the exists clause, this second part of EXCEPT would do a full table scan unnecessarily. 
***/ WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql ,$$ INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$|| matching_column_list||$$) SELECT $$||matching_column_list|| $$ FROM actual_delta ON CONFLICT ($$||pkey_fields||$$) DO UPDATE SET $$||upsert_list||$$ ; $$ AS upsert_sql FROM info ) SELECT proposed_function_name AS function_name , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end AS function_sql FROM sql_snippets; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.table_loader_validator (p_source_relation REGCLASS, p_destination_relation REGCLASS, p_unmapped_src_columns TEXT[], p_unmapped_dest_columns TEXT[], p_ignore_unmapped_columns BOOLEAN) RETURNS VOID AS $BODY$ DECLARE v_messages TEXT = ''; BEGIN IF NOT p_ignore_unmapped_columns AND p_unmapped_src_columns IS NOT NULL THEN v_messages = format($$You have unmapped columns (%s) in the source table %s. All source columns must be named identically to destination in order to map. If you are certain you want to ignore these columns, meaning they will not update anything in destination table %s, add the final argument to this function as TRUE. $$ , array_to_string(p_unmapped_src_columns,', ') , p_source_relation::TEXT , p_destination_relation::TEXT); END IF; IF NOT p_ignore_unmapped_columns AND p_unmapped_dest_columns IS NOT NULL THEN v_messages = v_messages||format($$ You have unmapped columns (%s) in the destination table %s. All destination columns must be named identically to source in order to map. 
If you are certain you want to ignore these columns, meaning the source table %s does not contain all columns in destination table, add the final argument to this function as TRUE.$$ , array_to_string(p_unmapped_dest_columns,', ') , p_destination_relation::TEXT , p_source_relation::TEXT); END IF; IF v_messages <> '' THEN RAISE EXCEPTION '%', v_messages; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; c_lock_cutoff_refresh INT = 99995; BEGIN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. */ IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps(); END IF; /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.fact_tables WHERE enabled ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority LOOP IF (SELECT pg_try_advisory_xact_lock(fact_table_id) FROM fact_loader.fact_tables WHERE fact_table_id = v_fact_record.fact_table_id) THEN --Load fact table PERFORM fact_loader.load(v_fact_record.fact_table_id); /*** Run purge process. This need not run every launch of worker but it should not hurt. It is better for it to run after the fact table load is successful so as to avoid a rollback and more dead bloat */ PERFORM fact_loader.purge_queues(); RETURN TRUE; END IF; END LOOP; RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; /* pg_fact_loader--1.0--1.1.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. 
\quit ALTER TABLE fact_loader.key_retrieval_sequences ADD COLUMN pass_queue_table_change_date_at_tz TEXT NULL; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; ALTER TABLE fact_loader.key_retrieval_sequences ADD CONSTRAINT verify_valid_tz CHECK (pass_queue_table_change_date_at_tz IS NULL OR (now() AT TIME ZONE pass_queue_table_change_date_at_tz IS NOT NULL)); --This check constraint could have been added in v. 1.0 ALTER TABLE fact_loader.queue_tables ADD CONSTRAINT verify_valid_tz CHECK (queue_table_tz IS NULL OR (now() AT TIME ZONE queue_table_tz IS NOT NULL)); ALTER TABLE fact_loader.process_queue ADD COLUMN source_change_date DATE NULL; COMMENT ON COLUMN fact_loader.process_queue.source_change_date IS 'Corresponds to fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz. If this field is populated, a function will be expected that has args (key_value, source_change_date) based on this process_queue table.'; --This should have already been added in v. 
1.0 SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_deps', '');

-- Relax NOT NULL on the merge proc columns: a dependency may intentionally
-- skip handling one of insert/update/delete.
ALTER TABLE fact_loader.queue_table_deps
  ALTER COLUMN insert_merge_proid DROP NOT NULL,
  ALTER COLUMN update_merge_proid DROP NOT NULL,
  ALTER COLUMN delete_merge_proid DROP NOT NULL;

ALTER TABLE fact_loader.fact_table_dep_queue_table_deps
  ALTER COLUMN insert_merge_proid DROP NOT NULL,
  ALTER COLUMN update_merge_proid DROP NOT NULL,
  ALTER COLUMN delete_merge_proid DROP NOT NULL;

ALTER TABLE fact_loader.fact_table_deps
  ALTER COLUMN default_insert_merge_proid DROP NOT NULL,
  ALTER COLUMN default_update_merge_proid DROP NOT NULL,
  ALTER COLUMN default_delete_merge_proid DROP NOT NULL;

-- Build the SQL that drains fact_loader.process_queue for one fact table:
-- each contiguous run of the same merge proc is executed as one statement
-- (in process_queue_id order) and the processed rows are deleted.
-- Returns a single row whose "sql" column is the statement batch to EXECUTE.
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT)
RETURNS TABLE (sql TEXT) AS
$BODY$
BEGIN

RETURN QUERY
-- All pending work for this fact table, in queue order, with the declared
-- type of each proc's first argument so key_value can be cast correctly.
WITH ordered_process_queue AS
(SELECT process_queue_id
  , proid
  , key_value
  , source_change_date
  , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg
  FROM fact_loader.process_queue pq
  LEFT JOIN pg_proc pp ON pp.oid = proid
  WHERE pq.fact_table_id = p_fact_table_id
  ORDER BY process_queue_id)

, with_rank AS
(SELECT
  /****
  If source_change_date is NULL, we assume the proid has one arg and pass it.
  If not, we assume the proid has two args and pass source_change_date as the second.
  */
  format('%s(%s::%s%s)'
    , proid::TEXT
    , 'key_value'
    , proid_first_arg
    , CASE
        WHEN source_change_date IS NOT NULL
          THEN format(', %s::DATE',quote_literal(source_change_date))
        ELSE ''
      END
  ) AS function_call,
  proid,
  process_queue_id,
  -- NOTE(review): no ORDER BY in this window, so every row of a given proid
  -- gets the same rank; the grouping below effectively collapses to one
  -- execution group per proid/function_call -- confirm this is intended.
  RANK() OVER (PARTITION BY proid) AS execution_group
  FROM ordered_process_queue
)

, execute_sql_groups AS
(
SELECT execution_group,
format($$
WITH newly_processed AS (
SELECT process_queue_id, %s
FROM (
/****
Must wrap this to execute in order of ids
***/
SELECT *
FROM fact_loader.process_queue
WHERE process_queue_id BETWEEN %s AND %s
  AND fact_table_id = %s
  AND proid = %s::REGPROC
ORDER BY process_queue_id) q
)
DELETE FROM fact_loader.process_queue pq USING newly_processed np
WHERE np.process_queue_id = pq.process_queue_id;
$$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql
FROM with_rank
GROUP BY execution_group, function_call, proid
ORDER BY execution_group
)

-- Concatenate all group statements; fall back to a no-op SELECT when the
-- queue is empty so callers can EXECUTE the result unconditionally.
SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql
FROM execute_sql_groups;

END;
$BODY$
LANGUAGE plpgsql;

-- Code generator: given a source-of-truth function and a destination table,
-- emit (function_name, function_sql) for a "<table>_merge" upsert function
-- that diffs the source output against the destination by primary key and
-- upserts only changed rows.  p_ignore_diff_for_columns lists columns (e.g.
-- load_dttm) excluded from the change comparison.
CREATE OR REPLACE FUNCTION fact_loader.table_loader_function
(p_source_proc REGPROC,
  p_destination_relation REGCLASS,
  p_ignore_diff_for_columns TEXT[])
RETURNS TABLE (function_name text, function_sql text) AS
$BODY$
BEGIN
/****
Find the primary key for the destination table.  This is required.
If the destination table does not have a primary key, it should.
This is partly for simplicity, and partly to encourage good practice
that we build and refresh tables based on chosen primary key to match
records 1 for 1, which is basic DB design 101.
*/
RETURN QUERY
WITH get_pkey_fields AS (
SELECT
  a.attname,
  format_type(a.atttypid, a.atttypmod) AS atttype,
  pk.rn
FROM (SELECT
        i.indrelid
        , unnest(indkey) AS ik
        , row_number() OVER () AS rn
    FROM pg_index i
    WHERE i.indrelid = p_destination_relation AND i.indisprimary) pk
INNER JOIN pg_attribute a
ON a.attrelid = pk.indrelid AND a.attnum = pk.ik)

, pkey_fields_sorted AS
(SELECT array_agg(attname ORDER BY rn) AS pkey_fields FROM get_pkey_fields)

-- Parse the source function's argument names so the generated function can
-- forward them and join them to the destination's pkey columns.
, function_args AS
(SELECT regexp_matches(pg_get_function_identity_arguments(p_source_proc),'(?:^|, )(\w+)','g') AS arg)

, function_schema AS
(SELECT string_agg(arg[1],', ') AS arg_params,
  pg_get_function_identity_arguments(p_source_proc) AS arg_defs
FROM function_args)

, destination_columns AS
(
  SELECT c.table_schema, c.table_name, column_name, ordinal_position, CASE WHEN gpf.attname IS NOT NULL THEN TRUE ELSE FALSE END AS pkey_field
  FROM information_schema.columns c
  INNER JOIN pg_class pc ON pc.relname = c.table_name AND pc.oid = p_destination_relation
  INNER JOIN pg_namespace n ON n.oid = pc.relnamespace AND c.table_schema = n.nspname
  LEFT JOIN get_pkey_fields gpf ON gpf.attname = c.column_name
  ORDER BY ordinal_position
)

-- Pre-built SQL fragments keyed off the primary key: select lists, DDL,
-- self-join conditions, and the join of pkey columns to function arguments.
, pkeys AS
(
  SELECT
    string_agg(quote_ident(pkey_field),E'\n, ') AS pkey_fields,
    string_agg(quote_ident(pkey_field)||' '||pkey_type,', ') AS pkey_fields_ddl,
    string_agg($$s.$$||quote_ident(pkey_field)||$$ = d.$$||quote_ident(pkey_field),E'\nAND ') AS pkey_join,
    string_agg($$d.$$||quote_ident(pkey_field)||$$ = $$||(SELECT arg_params FROM function_schema),E'\nAND ') AS pkey_join_to_arg
  FROM
  (SELECT attname AS pkey_field,
    atttype AS pkey_type
  FROM get_pkey_fields
  ORDER BY rn) pk
)

-- Column lists for the generated function: full list, diff list (minus the
-- ignored columns), and the SET list for ON CONFLICT DO UPDATE.
, info AS
(
    SELECT
        string_agg(
          dc.column_name, E'\n  , '
          ORDER BY dc.ordinal_position
        )
          AS matching_column_list
      , string_agg(
          CASE
            WHEN (p_ignore_diff_for_columns IS NULL
                  OR dc.column_name != ALL (p_ignore_diff_for_columns)
                 )
              THEN dc.column_name
            ELSE
              NULL
          END, E'\n  , '
          ORDER BY dc.ordinal_position
        )
          AS matching_column_list_without_ignored
      , string_agg(
          CASE
            WHEN NOT dc.pkey_field
              THEN dc.column_name || ' = EXCLUDED.' || dc.column_name
            ELSE
              NULL
          END, E'\n  , '
          ORDER BY dc.ordinal_position
        )
          AS upsert_list
      , pkeys.pkey_fields
      , pkeys.pkey_fields_ddl
      , pkeys.pkey_join
      , quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge') AS proposed_function_name
      , fs.arg_params
      , fs.arg_defs
      , pkey_join_to_arg
    FROM destination_columns dc
      CROSS JOIN pkeys
      CROSS JOIN function_schema fs
    GROUP BY pkeys.pkey_fields,
      pkeys.pkey_fields_ddl,
      pkeys.pkey_join,
      quote_ident(dc.table_schema)||'.'||quote_ident(table_name||'_merge'),
      fs.arg_params,
      fs.arg_defs,
      pkey_join_to_arg
)

, sql_snippets AS
(
  SELECT
    proposed_function_name
    , $$
    CREATE OR REPLACE FUNCTION $$||proposed_function_name||$$($$||arg_defs||$$)
    RETURNS VOID AS
    $FUNC$
    BEGIN
    $$::TEXT AS function_start
    , $$
    END;
    $FUNC$
    LANGUAGE plpgsql;
    $$::TEXT AS function_end
    , $$
    WITH actual_delta AS (
    $$::TEXT AS actual_delta_cte
    , $$
    WITH data AS (
    SELECT * FROM $$||p_source_proc::TEXT||$$($$||arg_params||$$)
    )

    , final_diff AS (
    SELECT $$||pkey_fields||$$
    FROM
    (SELECT $$||matching_column_list_without_ignored||$$
    FROM data
    EXCEPT
    SELECT $$||matching_column_list_without_ignored||$$
    FROM $$||p_destination_relation::TEXT||$$ d
    WHERE $$||pkey_join_to_arg
     AS actual_delta_sql
    , $$
    ) full_diff)

    --This extra step is necessarily precisely because we may want to not except every column, like load_dttm
    SELECT *
    FROM data s
    WHERE EXISTS (
      SELECT 1
      FROM final_diff d
      WHERE $$||pkey_join||$$
    )
    $$
     AS except_join_to_source_sql
    , $$
    /***
      We add the exists here because we are only looking for column-level differences
      for the given keys that have changed.  This may be a very small portion of the
      table.  Without the exists clause, this second part of EXCEPT would do a full
      table scan unnecessarily.
    ***/
    WHERE EXISTS (SELECT 1 FROM data s WHERE $$||pkey_join||$$)$$ AS key_join_exists_sql
    ,$$
    INSERT INTO $$||p_destination_relation::TEXT||$$ AS t ($$||
    matching_column_list||$$)
    SELECT $$||matching_column_list||
    $$ FROM actual_delta
    ON CONFLICT ($$||pkey_fields||$$)
    DO UPDATE
    SET $$||upsert_list||$$
    ;
    $$ AS upsert_sql
  FROM info
)

SELECT
  proposed_function_name AS function_name
  , function_start||actual_delta_cte||actual_delta_sql||except_join_to_source_sql||')'||upsert_sql||function_end
      AS function_sql
FROM sql_snippets;

END;
$BODY$
LANGUAGE plpgsql;

/* pg_fact_loader--1.1--1.2.sql */

-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit

--To support non-replicated queue tables
ALTER TABLE fact_loader.queue_tables
  ALTER COLUMN pglogical_node_if_id DROP NOT NULL;

-- One row per refresh attempt that needs recording (v1.2: failures only).
CREATE TABLE fact_loader.fact_table_refresh_logs
(fact_table_refresh_log_id SERIAL PRIMARY KEY,
  fact_table_id INT REFERENCES fact_loader.fact_tables (fact_table_id),
  refresh_attempted_at TIMESTAMPTZ,
  messages TEXT);

-- Daily-scheduled (non-queue-driven) jobs: run one proc once per day at a
-- configured local time.  verify_valid_daily_tz rejects unknown timezone
-- names by evaluating now() AT TIME ZONE at constraint-check time.
ALTER TABLE fact_loader.fact_tables
  ADD COLUMN use_daily_schedule BOOLEAN NOT NULL DEFAULT FALSE,
  ADD COLUMN daily_scheduled_time TIME NULL,
  ADD COLUMN daily_scheduled_tz TEXT NULL,
  ADD COLUMN daily_scheduled_proid REGPROC,
  ADD CONSTRAINT verify_valid_daily_tz
    CHECK (daily_scheduled_tz IS NULL OR
      (now() AT TIME ZONE daily_scheduled_tz IS NOT NULL)),
  ADD CONSTRAINT daily_schedule_configured_correctly
    CHECK ((NOT use_daily_schedule) OR
      (use_daily_schedule AND daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL));

-- Jobs that were auto-disabled by a failure, with the message from the
-- failing attempt (matched on last_refresh_attempted_at).
CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS
SELECT ft.fact_table_id,
  fact_table_relid,
  refresh_attempted_at,
  messages
FROM fact_loader.fact_tables ft
INNER JOIN fact_loader.fact_table_refresh_logs ftrl
  ON ft.fact_table_id = ftrl.fact_table_id
  AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at
WHERE NOT enabled
  AND NOT last_refresh_succeeded;

-- Work list for fact_loader.worker(): enabled jobs, filtered and ordered so
-- due daily-scheduled jobs come first, then least-recently-attempted.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
SELECT *
FROM fact_loader.fact_tables
WHERE enabled
  AND (NOT use_daily_schedule OR
    --Only run this job according to the same day of the daily_scheduled_time
    --according to configured timezone
    (
      (last_refresh_attempted_at IS NULL
        OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE
      )
      AND (now() AT TIME ZONE daily_scheduled_tz)::TIME
        BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME
    )
  )
ORDER BY
  CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
  --If a job has a daily schedule, once the time has come for the next refresh,
  --prioritize it first
  CASE
    WHEN (use_daily_schedule AND
      (last_refresh_attempted_at IS NULL
        OR last_refresh_attempted_at::DATE < (now() AT TIME ZONE daily_scheduled_tz)::DATE
      )
      AND (now() AT TIME ZONE daily_scheduled_tz)::TIME
        BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME)
      THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME
    ELSE NULL
  END NULLS LAST,
  --This may be improved in the future but is a good start
  last_refresh_attempted_at NULLS FIRST,
  priority
;

-- Run the single configured daily-refresh proc for one fact table.
-- Returns FALSE if the job is not a daily-schedule job; TRUE after a
-- successful run (metadata is updated here, not by the caller).
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_execute_sql text;
BEGIN
/***
There are 2 basic steps to this load:
  1. Execute the single daily-refresh function
  2. Update the metadata indicating the last attempt time
*/
SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()'
INTO v_execute_sql
FROM fact_loader.fact_tables
WHERE fact_table_id = p_fact_table_id
  AND use_daily_schedule;

IF v_execute_sql IS NULL THEN
  RETURN FALSE;
END IF;

EXECUTE v_execute_sql;

UPDATE fact_loader.fact_tables ft
SET last_refresh_attempted_at = now(),
  last_refresh_succeeded = TRUE
WHERE fact_table_id = p_fact_table_id;

RETURN TRUE;

END;
$BODY$
LANGUAGE plpgsql;

-- Attempt to load one fact table under a per-job advisory xact lock.
-- Returns FALSE if another session holds the job.  On error, the job is
-- auto-disabled and the error is logged to fact_table_refresh_logs.
CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
/***
This will be used by the worker, but can also be used safely if a DBA
wants to run a job manually.
*/
DECLARE
  c_lock_cutoff_refresh INT = 99995;
BEGIN

IF (SELECT pg_try_advisory_xact_lock(fact_table_id)
    FROM fact_loader.fact_tables
    WHERE fact_table_id = p_fact_table_id) THEN

  /****
  Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress.
  */
  IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN
    PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps();
  END IF;

  --Load fact table and handle exceptions to auto-disable job and log errors in case of error
  BEGIN
    --Scheduled daily job
    IF (SELECT use_daily_schedule
        FROM fact_loader.fact_tables
        WHERE fact_table_id = p_fact_table_id) THEN

      PERFORM fact_loader.daily_scheduled_load(p_fact_table_id);

    --Queue-based job
    ELSE

      PERFORM fact_loader.load(p_fact_table_id);

      /***
      Run purge process.  This need not run every launch of worker but it should not hurt.
      It is better for it to run after the fact table load is successful so as to avoid a
      rollback and more dead bloat
      */
      PERFORM fact_loader.purge_queues();

    END IF;
    RETURN TRUE;
  EXCEPTION
    WHEN OTHERS THEN
      UPDATE fact_loader.fact_tables
      SET last_refresh_succeeded = FALSE,
        last_refresh_attempted_at = now(),
        enabled = FALSE
      WHERE fact_table_id = p_fact_table_id;

      INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, messages)
      VALUES (p_fact_table_id, now(), SQLERRM);

      RETURN FALSE;
  END;

ELSE
  RETURN FALSE;
END IF;

END;
$BODY$
LANGUAGE plpgsql;

-- Process at most one job per call: walk prioritized_jobs and stop at the
-- first job successfully loaded.  Returns FALSE when no job ran.
CREATE OR REPLACE FUNCTION fact_loader.worker()
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_fact_record RECORD;
BEGIN

/****
Acquire an advisory lock on the row indicating this job, which will cause the function
to simply return false if another session is running it concurrently.
It will be released upon transaction commit or rollback.
*/
FOR v_fact_record IN
  SELECT fact_table_id
  FROM fact_loader.prioritized_jobs
LOOP

  IF fact_loader.try_load(v_fact_record.fact_table_id) THEN
    RETURN TRUE;
  END IF;

END LOOP;

--If no jobs returned true, then return false
RETURN FALSE;

END;
$BODY$
LANGUAGE plpgsql;

/* pg_fact_loader--1.2--1.3.sql */

-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit DROP VIEW IF EXISTS fact_loader.queue_deps_all_with_retrieval; DROP VIEW IF EXISTS fact_loader.queue_deps_all; DROP VIEW IF EXISTS fact_loader.logical_subscription; DROP VIEW IF EXISTS fact_loader.prioritized_jobs; DROP VIEW IF EXISTS fact_loader.unresolved_failures; DROP FUNCTION IF EXISTS fact_loader.sql_builder(int); CREATE OR REPLACE FUNCTION fact_loader.add_batch_id_fields() RETURNS VOID AS $BODY$ DECLARE v_rec RECORD; v_sql TEXT; BEGIN FOR v_rec IN SELECT queue_table_relid FROM fact_loader.queue_tables qt INNER JOIN pg_class c ON c.oid = qt.queue_table_relid INNER JOIN pg_namespace n ON n.oid = c.relnamespace WHERE NOT EXISTS (SELECT 1 FROM information_schema.columns col WHERE col.column_name = 'fact_loader_batch_id' AND col.table_schema = n.nspname AND col.table_name = c.relname) LOOP v_sql = format($F$ ALTER TABLE %s ADD COLUMN fact_loader_batch_id BIGINT DEFAULT nextval('fact_loader.batch_id'); $F$, v_rec.queue_table_relid::text, v_rec.queue_table_relid::text); RAISE LOG 'ADDING fact_loader_batch_id COLUMN TO queue table %: %', v_rec.queue_table_relid::text, v_sql; EXECUTE v_sql; END LOOP; END $BODY$ LANGUAGE plpgsql; ALTER TABLE fact_loader.queue_tables ADD COLUMN purge BOOLEAN NOT NULL DEFAULT TRUE; UPDATE fact_loader.fact_table_refresh_logs SET messages = jsonb_build_object('Message', messages) WHERE messages IS NOT NULL; --Will be re-added via \i in sql file ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN messages TYPE jsonb USING messages::jsonb; --This was a problem from the start ALTER TABLE fact_loader.queue_tables ALTER COLUMN pglogical_node_if_id TYPE OID; --This should have been done from the start SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_dep_queue_table_de_fact_table_dep_queue_table_de_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps_fact_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables_fact_table_id_seq', ''); SELECT 
pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences_key_retrieval_sequence_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps_queue_table_dep_id_seq', ''); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables_queue_table_id_seq', ''); --No indexes or anything but allow debugging CREATE UNLOGGED TABLE fact_loader.debug_process_queue (LIKE fact_loader.process_queue); ALTER TABLE fact_loader.debug_process_queue ADD PRIMARY KEY (process_queue_id); -- Now a temp table to avoid serialization contention DROP TABLE fact_loader.process_queue; --Make this a trigger to check dep fact tables ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_base_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables ADD COLUMN depends_on_parent_daily_job_id INT REFERENCES fact_loader.fact_tables (fact_table_id); ALTER TABLE fact_loader.fact_tables DROP CONSTRAINT daily_schedule_configured_correctly; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_schedule_configured_correctly CHECK (NOT use_daily_schedule OR (use_daily_schedule AND ((daily_scheduled_time IS NOT NULL AND daily_scheduled_tz IS NOT NULL AND daily_scheduled_proid IS NOT NULL) OR (depends_on_base_daily_job_id IS NOT NULL AND depends_on_parent_daily_job_id IS NOT NULL)))); --These columns have never been used ALTER TABLE fact_loader.fact_tables DROP COLUMN attempt_number, DROP COLUMN retries_allowed; --This is the usual case and makes sense ALTER TABLE fact_loader.key_retrieval_sequences ALTER COLUMN level SET DEFAULT 1; --Need to have a more reliable dependency knowledge for scheduled jobs ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_deps REGCLASS[]; ALTER TABLE fact_loader.fact_tables ADD COLUMN daily_scheduled_dep_delay_tolerance INTERVAL; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT daily_deps_correctly_configured CHECK ((daily_scheduled_deps IS NULL AND 
daily_scheduled_dep_delay_tolerance IS NULL) OR (daily_scheduled_deps IS NOT NULL AND daily_scheduled_dep_delay_tolerance IS NOT NULL)); --Log all events and add pruning ALTER TABLE fact_loader.fact_table_refresh_logs ADD COLUMN refresh_finished_at TIMESTAMPTZ; ALTER TABLE fact_loader.fact_table_refresh_logs ALTER COLUMN fact_table_refresh_log_id TYPE BIGINT; -- Handle race conditions by changing to batch usage CREATE SEQUENCE fact_loader.batch_id; SELECT fact_loader.add_batch_id_fields(); CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info() RETURNS TABLE("replication_set_name" text, "queue_of_base_table_relid" regclass, "if_id" oid, "if_name" name, "source_time" timestamp with time zone) AS $BODY$ /*** This function exists to allow no necessary dependency to exist on pglogical_ticker. If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ SELECT unnest(coalesce(sub_replication_sets,'{NULL}')) AS replication_set_name , qt.queue_of_base_table_relid , n.if_id , n.if_name --source_time is now() for local tables (pglogical_node_if_id is null), and based on pglogical_ticker time otherwise , CASE WHEN qt.pglogical_node_if_id IS NULL THEN now() ELSE t.source_time END AS source_time FROM fact_loader.queue_tables qt LEFT JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.sub_origin_if LEFT JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id LEFT JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name;$$; ELSE RETURN QUERY SELECT NULL::TEXT AS replication_set_name , qt.queue_of_base_table_relid , NULL::OID AS if_id , NULL::NAME AS if_name --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt; END IF; END; 
$BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. 
*/ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; CREATE OR REPLACE FUNCTION fact_loader.purge_queues (p_add_interval INTERVAL = '1 hour') RETURNS VOID AS $BODY$ /***** The interval overlap is only important for delete cases in which you may need to join to another audit table in order to get a deleted row's data. 1 hour is somewhat arbitrary, but in the delete case, any related deleted rows would seem to normally appear very close to another relation's deleted rows. 1 hour is probably generous but also safe. */ DECLARE v_sql TEXT; BEGIN WITH eligible_queue_tables_for_purge AS (SELECT /**** This logic should handle dependent fact tables as well, because they share the same queue tables but they have separately logged last_cutoffs. */ qt.queue_table_relid , queue_table_timestamp , queue_table_tz , MIN(last_cutoff_id) AS min_cutoff_id , MIN(last_cutoff_source_time) AS min_source_time FROM fact_loader.queue_deps_all qt WHERE qt.last_cutoff_id IS NOT NULL AND qt.purge /*** There must be no other fact tables using the same queue which have not yet been processed at all */ AND NOT EXISTS (SELECT 1 FROM fact_loader.queue_deps_all qtdx WHERE qtdx.queue_table_id = qt.queue_table_id AND qtdx.last_cutoff_id IS NULL) GROUP BY qt.queue_table_relid , queue_table_timestamp , queue_table_tz) SELECT string_agg( format($$ DELETE FROM %s WHERE %s IN (SELECT %s FROM %s WHERE %s <= %s AND %s %s < (%s::TIMESTAMPTZ - interval %s) FOR UPDATE SKIP LOCKED ); $$, queue_table_relid, 'fact_loader_batch_id', 'fact_loader_batch_id', queue_table_relid, 'fact_loader_batch_id', min_cutoff_id, quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(min_source_time), quote_literal(p_add_interval::TEXT) ) , E'\n\n') INTO v_sql FROM eligible_queue_tables_for_purge; IF v_sql IS NOT NULL THEN RAISE DEBUG 'Purging Queue: %', v_sql; BEGIN EXECUTE v_sql; EXCEPTION WHEN 
serialization_failure THEN RAISE LOG 'Serialization failure in queue purging for transaction % - skipping.', txid_current()::text; WHEN OTHERS THEN RAISE; END; END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.worker() RETURNS BOOLEAN AS $BODY$ DECLARE v_fact_record RECORD; BEGIN /**** Acquire an advisory lock on the row indicating this job, which will cause the function to simply return false if another session is running it concurrently. It will be released upon transaction commit or rollback. */ FOR v_fact_record IN SELECT fact_table_id FROM fact_loader.prioritized_jobs LOOP IF fact_loader.try_load(v_fact_record.fact_table_id) THEN --If any configured functions use temp tables, --must discard to avoid them hanging around in the idle background worker session DISCARD TEMP; --Log job times INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at) VALUES (v_fact_record.fact_table_id, now(), clock_timestamp()); --Return true meaning the fact table was refreshed (this applies even if there was no new data) RETURN TRUE; END IF; END LOOP; --If no jobs returned true, then return false RETURN FALSE; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.try_load(p_fact_table_id INT) RETURNS BOOLEAN AS $BODY$ /*** This will be used by the worker, but can also be used safely if a DBA wants to run a job manually. */ DECLARE c_lock_cutoff_refresh INT = 99995; v_err JSONB; v_errmsg TEXT; v_errdetail TEXT; v_errhint TEXT; v_errcontext TEXT; BEGIN -- We except rare serialization failures here which we will ignore and move to the next record -- Anything else should be raised BEGIN IF EXISTS (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id FOR UPDATE SKIP LOCKED) THEN /**** Attempt to refresh fact_table_dep_queue_table_deps or ignore if refresh is in progress. 
*/
IF (SELECT pg_try_advisory_xact_lock(c_lock_cutoff_refresh)) THEN
  PERFORM fact_loader.refresh_fact_table_dep_queue_table_deps();
END IF;

--Load fact table and handle exceptions to auto-disable job and log errors in case of error
BEGIN
  --Scheduled daily job
  IF (SELECT use_daily_schedule FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id) THEN
    PERFORM fact_loader.daily_scheduled_load(p_fact_table_id);
  --Queue-based job
  ELSE
    PERFORM fact_loader.load(p_fact_table_id);
    /***
    Run purge process.  This need not run every launch of worker but it should not hurt.
    It is better for it to run after the fact table load is successful so as to avoid a
    rollback and more dead bloat
    */
    PERFORM fact_loader.purge_queues();
  END IF;
  RETURN TRUE;
EXCEPTION
  WHEN OTHERS THEN
    GET STACKED DIAGNOSTICS
      v_errmsg = MESSAGE_TEXT,
      v_errdetail = PG_EXCEPTION_DETAIL,
      v_errhint = PG_EXCEPTION_HINT,
      v_errcontext = PG_EXCEPTION_CONTEXT;
    UPDATE fact_loader.fact_tables
    SET last_refresh_succeeded = FALSE,
        last_refresh_attempted_at = now(),
        enabled = FALSE
    WHERE fact_table_id = p_fact_table_id;
    v_err = jsonb_strip_nulls(
      jsonb_build_object(
        'Message', v_errmsg,
        'Detail', case when v_errdetail = '' then null else v_errdetail end,
        'Hint', case when v_errhint = '' then null else v_errhint end,
        'Context', case when v_errcontext = '' then null else v_errcontext end)
    );
    INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_id, refresh_attempted_at, refresh_finished_at, messages)
    VALUES (p_fact_table_id, now(), clock_timestamp(), v_err);
    RETURN FALSE;
END;
ELSE
  RETURN FALSE;
END IF;
EXCEPTION
  WHEN serialization_failure THEN
    RAISE LOG 'Serialization failure on transaction % attempting to lock % - skipping.', txid_current()::text, p_fact_table_id::text;
    RETURN FALSE;
  WHEN OTHERS THEN
    RAISE;
END;
END;
$BODY$
LANGUAGE plpgsql;

-- Run one queue-based load for a single fact table.  Three phases:
--   1. sql_builder() generates the SQL to populate a temp process_queue of changed keys;
--   2. execute_queue() generates and runs the per-key merge-function calls;
--   3. the metadata SQL advances last_cutoff_id/last_cutoff_source_time and marks the refresh attempted/succeeded.
-- Exceptions are NOT caught here - the caller (worker) traps them and disables the job.
CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT)
RETURNS VOID AS
$BODY$
DECLARE
v_process_queue_sql text;
v_execute_sql text;
v_metadata_update_sql text;
v_debug_rec record;
v_debug_text text = '';
BEGIN
/***
There are 3 basic steps to this load:
    1. Gather all queue table changes and insert them into a consolidated process_queue
    2. Update the metadata indicating the last records updated for both the queue tables and fact table
*/

/****
Get SQL to insert new data into the consolidated process_queue,
and SQL to update metadata for last_cutoffs.
*/
SELECT process_queue_sql, metadata_update_sql
INTO v_process_queue_sql, v_metadata_update_sql
FROM fact_loader.sql_builder(p_fact_table_id);

/****
Populate the consolidated queue
This just creates a temp table with all changes to be processed
*/
RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql;
EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$);

/****
For DEBUG purposes only to view the actual process_queue.  Requires setting log_min_messages to DEBUG.
*/
IF current_setting('log_min_messages') = 'debug3' THEN
  INSERT INTO fact_loader.debug_process_queue
  SELECT * FROM process_queue;
END IF;

/****
With data now in the process_queue, the execute_queue function builds the SQL to execute.
Save this SQL in a variable and execute it.
If there is no data to execute, this is a no-op select statement.
*/
SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id);
RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql;
EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$);

/****
With everything finished, we now update the metadata for the fact_table.
Even if no data was processed, we will still move forward last_refresh_attempted_at.

last_refresh_succeeded will be marked true always for now.  It could in the future
be used to indicate a failure in case of a caught error.
*/
RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql;
EXECUTE COALESCE(v_metadata_update_sql,
  format(
  $$UPDATE fact_loader.fact_tables ft
    SET last_refresh_attempted_at = now(),
        last_refresh_succeeded = TRUE
    WHERE fact_table_id = %s;
  $$, p_fact_table_id));

END;
$BODY$
LANGUAGE plpgsql;

-- Jobs whose most recent refresh attempt failed and which were auto-disabled by the
-- worker's exception handler; join on last_refresh_attempted_at pulls the matching log row.
CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS
SELECT ft.fact_table_id,
  fact_table_relid,
  refresh_attempted_at,
  messages
FROM fact_loader.fact_tables ft
INNER JOIN fact_loader.fact_table_refresh_logs ftrl
  ON ft.fact_table_id = ftrl.fact_table_id
  AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at
WHERE NOT enabled
  AND NOT last_refresh_succeeded;

-- Ordered list of enabled jobs a worker should pick from: force_worker_priority first,
-- then daily-schedule-eligible jobs, then least-recently-attempted, then configured priority.
CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS
WITH jobs_with_daily_variables AS (
SELECT
  ft.*,
  /***
  Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!!
  */
  (--If this is the first run of a scheduled job, it is eligible
    ft.last_refresh_attempted_at IS NULL
    OR (
        --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible
        (
          ft.last_refresh_succeeded AND
          ft.last_refresh_attempted_at::DATE <
            -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent
            (now() AT TIME ZONE COALESCE(
                                  ft.daily_scheduled_tz,
                                  base.daily_scheduled_tz
                                  )
            )::DATE
        )
        OR
        --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time
        NOT ft.last_refresh_succeeded
      )
  ) AS daily_not_attempted_today,

  (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME
    BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed,

  base.use_daily_schedule
    AND base.last_refresh_succeeded
    AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE
    AS daily_base_job_finished,

  ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS
daily_has_only_one_parent,

  -- This should only be used in combination with daily_has_only_one_parent
  parent.use_daily_schedule
    AND parent.last_refresh_succeeded
    AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE
    AS parent_job_finished
FROM fact_loader.fact_tables ft
LEFT JOIN LATERAL
  (SELECT ftb.use_daily_schedule,
    ftb.last_refresh_succeeded,
    ftb.last_refresh_attempted_at,
    ftb.daily_scheduled_tz
  FROM fact_loader.fact_tables ftb
  WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE
LEFT JOIN LATERAL
  (SELECT ftp.use_daily_schedule,
    ftp.last_refresh_succeeded,
    ftp.last_refresh_attempted_at,
    ftp.daily_scheduled_tz
  FROM fact_loader.fact_tables ftp
  WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE
WHERE enabled
)

, jobs_with_daily_schedule_eligibility AS (
SELECT
  *,
  --Only run this job according to the same day of the daily_scheduled_time
  --according to configured timezone
  (use_daily_schedule AND daily_not_attempted_today
    AND
    (
      daily_scheduled_time_passed
      OR
      (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished))
    )
  ) AS daily_schedule_eligible
FROM jobs_with_daily_variables)

SELECT *
FROM jobs_with_daily_schedule_eligibility
WHERE NOT use_daily_schedule OR daily_schedule_eligible
ORDER BY
  CASE WHEN force_worker_priority THEN 0 ELSE 1 END,
  --If a job has a daily schedule, once the time has come for the next refresh,
  --prioritize it first
  CASE
    WHEN daily_schedule_eligible
    THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME
    ELSE NULL
  END NULLS LAST,
  --This may be improved in the future but is a good start
  last_refresh_attempted_at NULLS FIRST,
  priority
;

-- Pure SQL generator for one fact table's queue-based load.  Returns four SQL strings:
--   raw_queued_changes_sql      - select raw queue rows (batch ids) for cutoff metadata;
--   gathered_queued_changes_sql - join queue rows to source tables to derive fact keys;
--   process_queue_sql           - create/populate the temp process_queue from the gathered changes;
--   metadata_update_sql         - advance last_cutoff_id/last_cutoff_source_time after a load.
-- It executes nothing itself; load(), raw_queued_changes() and gathered_queued_changes() run its output.
CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT)
RETURNS TABLE(raw_queued_changes_sql text,
  gathered_queued_changes_sql text,
  process_queue_sql text,
  metadata_update_sql text) AS
$BODY$

/****
The recursive part of this CTE are only the sql_builder parts.
In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set.

The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null
Otherwise, they must be specified separately.
 */
WITH RECURSIVE queue_deps_with_insert_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'I' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_update_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'U' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

, queue_deps_with_delete_retrieval AS (
  SELECT *
  FROM fact_loader.queue_deps_all_with_retrieval
  WHERE (filter_scope = 'D' OR filter_scope IS NULL)
    AND fact_table_id = p_fact_table_id
)

/****
Recursively build the SQL for any INSERT events found in the queues.

The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data,
in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an
example of this, see the test cases involving the test.order_product_promos table.
 */
, insert_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE
      WHEN is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', b.'||return_columns[1]||'::TEXT AS key'
            ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
            ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
        E'\n  ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_insert_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE
      WHEN c.is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
            ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
            ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
        E'\n  ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM insert_sql_builder r
  INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

-- Same shape as insert_sql_builder but driven by the update-scoped retrieval CTE.
, update_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    CASE
      WHEN is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', b.'||return_columns[1]||'::TEXT AS key'
            ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key'
            ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level||
        E'\n  ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_update_retrieval c
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    key_select_column||CASE
      WHEN c.is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key'
            ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key'
            ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
          END
      ELSE ''
    END AS key_select_column,
    key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level||
        E'\n  ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column)
    END AS key_retrieval_sql,
    r.source_change_date_select
  FROM update_sql_builder r
  INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

, delete_sql_builder AS (
  SELECT queue_table_dep_id,
    level,
    --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead
    CASE
      WHEN is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
            ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
            ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key'
          END
      ELSE ''
    END AS delete_key_select_column,
    CASE
      WHEN is_fact_key
        THEN ''
      ELSE format($$
          --Join to either the base table, or the audit table, one of which
          --will be missing the key in a delete case
            INNER JOIN LATERAL (
            SELECT %s FROM %s jb WHERE %s = %s
            UNION ALL
            SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
            $$,
            quote_ident(return_columns_from_join[1]),
            join_to_relation::TEXT,
            (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
            'jb.'||quote_ident(join_to_column),
            '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
            join_to_relation_queue::TEXT,
            (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END),
            '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
            /****
            We use the higher level here just to be consistent with aliases from insert/update key retrieval
            */
            'j'||level
            )
    END AS delete_key_retrieval_sql,
    source_change_date_select
  FROM queue_deps_with_delete_retrieval
  WHERE level = 1
    AND fact_table_id = p_fact_table_id
  UNION ALL
  SELECT c.queue_table_dep_id,
    c.level,
    delete_key_select_column||CASE
      WHEN c.is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key'
            ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key'
          END
      WHEN join_return_is_fact_key
        THEN
          CASE
            WHEN array_length(return_columns, 1) = 1
            THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key'
            ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key'
          END
      ELSE ''
    END AS delete_key_select_column,
    delete_key_retrieval_sql||CASE
      WHEN is_fact_key
        THEN ''
      ELSE format($$
          --Join to either the base table, or the audit table, one of which
          --will be missing the key in a delete case
            INNER JOIN LATERAL (
            SELECT %s FROM %s jb WHERE %s = %s
            UNION ALL
            SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE
            $$,
            quote_ident(return_columns_from_join[1]),
            join_to_relation::TEXT,
            'j'||r.level||'.'||quote_ident(return_columns[1]),
            'jb.'||quote_ident(join_to_column),
            '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type,
            join_to_relation_queue::TEXT,
            'j'||r.level||'.'||quote_ident(return_columns[1]),
            '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type,
            /****
            We use the higher level here just to be consistent with aliases from insert/update key retrieval
            */
            'j'||c.level
            )
    END AS delete_key_retrieval_sql,
    r.source_change_date_select
  FROM delete_sql_builder r
  INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id)
  WHERE c.level = r.level + 1
)

, non_recursive_sql AS (
  SELECT
  /****
  Separate select list for:
    - raw queue_ids from queue tables
    - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables
  */
  -- gathering all queue_ids from queue tables
    queue_table_dep_id,
    format($$
 %s AS fact_table_id,
 %s AS queue_table_id,
 %s AS queue_table_dep_id,
 %s::INT AS fact_table_dep_id,
 %s::INT AS fact_table_dep_queue_table_dep_id,
 %s AS queue_table_id_field,
 q.fact_loader_batch_id,
 %s::TIMESTAMPTZ AS maximum_cutoff_time
 $$,
    fact_table_id,
    queue_table_id,
    queue_table_dep_id,
    (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END),
    (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END),
    'q.'||quote_ident(queue_table_id_field),
    quote_literal(maximum_cutoff_time)) AS metadata_select_columns,

  -- gathering actual keys to update in fact tables by joining from queue_ids to source tables
    format($$
 %s AS fact_table_id,
 %s AS queue_table_dep_id,
 %s::INT AS fact_table_dep_id,
 %s::INT AS fact_table_dep_queue_table_dep_id,
 %s AS queue_table_id_field,
 q.fact_loader_batch_id,
 %s AS operation,
 %s %s AS changed_at,
 %s::REGPROC AS insert_merge_proid,
 %s::REGPROC AS update_merge_proid,
 %s::REGPROC AS delete_merge_proid,
 %s::TIMESTAMPTZ AS maximum_cutoff_time
 $$,
    fact_table_id,
    queue_table_dep_id,
    (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END),
    (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END),
    'q.'||quote_ident(queue_table_id_field),
    'q.'||quote_ident(queue_table_op),
    'q.'||quote_ident(queue_table_timestamp),
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END,
    CASE WHEN insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END,
    CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END,
    CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END,
    quote_literal(maximum_cutoff_time)) AS global_select_columns,

  -- This is simply the queue table aliased as q
    format('%s q', queue_table_relid::TEXT) AS queue_table_aliased,

  -- This is the SQL to join from the queue table to the base table
    E'\nINNER JOIN '||queue_of_base_table_relid::TEXT||' b'||
      E'\n  ON q.'||quote_ident(queue_table_key)||'::'||queue_of_base_table_key_type||' = b.'||quote_ident(queue_of_base_table_key) AS base_join_sql,

  -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process.
    format($$ %s
    AND q.%s < %s %s
    $$,
    CASE
      WHEN last_cutoff_id IS NOT NULL
      THEN 'q.fact_loader_batch_id > '||last_cutoff_id
      ELSE
        'TRUE'
    END,
    quote_ident(c.queue_table_timestamp),
    quote_literal(c.maximum_cutoff_time),
    CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END) AS global_where_sql,
    format($$
      AND q.%s = 'I'
      $$,
      queue_table_op) AS where_for_insert_sql,
    format($$
      AND (q.%s = 'U' AND %s)
      $$,
      queue_table_op,
      CASE
        WHEN relevant_change_columns IS NULL
          THEN 'TRUE'
        ELSE
          format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,','))
      END) AS where_for_update_sql,
    format($$
      AND q.%s = 'D'
      $$,
      queue_table_op) AS where_for_delete_sql
  FROM fact_loader.queue_deps_all c
  WHERE c.fact_table_id = p_fact_table_id
)

-- Keep only the deepest (final) level of each recursive key-retrieval chain.
, insert_sql_builder_final AS
(SELECT DISTINCT ON (queue_table_dep_id) *
  FROM insert_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)
, update_sql_builder_final AS
(SELECT DISTINCT ON (queue_table_dep_id) *
  FROM update_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)
, delete_sql_builder_final AS
(SELECT DISTINCT ON (queue_table_dep_id) *
  FROM delete_sql_builder
  ORDER BY queue_table_dep_id, level DESC
)

, all_queues_sql AS (
SELECT
  format($$
  SELECT %s
  FROM %s
  %s
  WHERE %s $$,
  nrs.global_select_columns||isbf.key_select_column||isbf.source_change_date_select,
  nrs.queue_table_aliased||nrs.base_join_sql,
  isbf.key_retrieval_sql,
  nrs.global_where_sql||nrs.where_for_insert_sql) AS queue_insert_sql,
  format($$
  SELECT %s
  FROM %s
  %s
  WHERE %s $$,
  nrs.global_select_columns||usbf.key_select_column||usbf.source_change_date_select,
  nrs.queue_table_aliased||nrs.base_join_sql,
  usbf.key_retrieval_sql,
  nrs.global_where_sql||nrs.where_for_update_sql) AS queue_update_sql,
  format($$
  SELECT %s
  FROM %s
  %s
  WHERE %s $$,
  nrs.global_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select,
  nrs.queue_table_aliased,
  dsbf.delete_key_retrieval_sql,
  nrs.global_where_sql||nrs.where_for_delete_sql) AS queue_delete_sql,
  format($$
  SELECT %s
  FROM %s
  WHERE %s $$,
  nrs.metadata_select_columns,
  nrs.queue_table_aliased,
  nrs.global_where_sql) AS queue_ids_sql
FROM non_recursive_sql nrs
INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id
INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id
INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id
)

, final_queue_sql AS
(SELECT string_agg(
  /****
  This first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table
  */
  format($$
  %s
  UNION ALL
  %s
  UNION ALL
  %s
  $$,
    queue_insert_sql,
    queue_update_sql,
    queue_delete_sql)
  /****
  This second UNION as the second arg of string_agg
  is the union together ALL queue tables for this fact table
  */
  , E'\nUNION ALL\n') AS event_sql,
  string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out
 FROM all_queues_sql)

, final_outputs AS
(SELECT raw_queued_changes_sql_out,
$$
WITH all_changes AS (
($$||event_sql||$$)
ORDER BY changed_at)

, base_execution_groups AS
(SELECT fact_table_id,
  queue_table_dep_id,
  queue_table_id_field,
  operation,
  changed_at,
  source_change_date,
  insert_merge_proid,
  update_merge_proid,
  delete_merge_proid,
  maximum_cutoff_time,
  key,
  CASE WHEN operation = 'I' THEN insert_merge_proid
  WHEN operation = 'U' THEN update_merge_proid
  WHEN operation = 'D' THEN delete_merge_proid
  END AS proid,
  RANK() OVER (
    PARTITION BY
      CASE
        WHEN operation = 'I' THEN insert_merge_proid
        WHEN operation = 'U' THEN update_merge_proid
        WHEN operation = 'D' THEN delete_merge_proid
      END
  ) AS execution_group
  FROM all_changes
  WHERE key IS NOT NULL)

SELECT fact_table_id, proid, key, source_change_date
FROM base_execution_groups beg
WHERE proid IS NOT NULL
GROUP BY execution_group, fact_table_id, proid, key, source_change_date
/****
This ordering is particularly important for date-range history tables
where order of inserts is critical and usually expected to follow a pattern
***/
ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field);
$$ AS gathered_queued_changes_sql_out
,

$$
DROP TABLE IF EXISTS process_queue;
CREATE TEMP TABLE process_queue
(process_queue_id serial,
 fact_table_id int,
 proid regproc,
 key_value text,
 source_change_date date);
INSERT INTO process_queue
(fact_table_id, proid, key_value, source_change_date)
$$ AS process_queue_snippet,

$$
WITH all_ids AS
($$||raw_queued_changes_sql_out||$$)

, new_metadata AS
(SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
  maximum_cutoff_time,
  queue_table_dep_id
 FROM all_ids
 --Exclude dependent fact tables from updates directly to queue_table_deps
 WHERE fact_table_dep_id IS NULL
 GROUP BY queue_table_dep_id, maximum_cutoff_time)

/****
The dependent fact table uses the same queue_table_id_field as last_cutoff
We are going to update fact_table_deps metadata instead of queue_table_deps
****/
, new_metadata_fact_dep AS
(SELECT MAX(fact_loader_batch_id) AS last_cutoff_id,
  maximum_cutoff_time,
  fact_table_dep_queue_table_dep_id
 FROM all_ids
 --Include dependent fact tables only
 WHERE fact_table_dep_id IS NOT NULL
 GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time)

, update_key AS (
SELECT qdwr.queue_table_dep_id,
  --Cutoff the id to that newly found, otherwise default to last value
  COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
  --This cutoff time must always be the same for all queue tables for given fact table.
  --Even if there are no new records, we move this forward to wherever the stream is at
  qdwr.maximum_cutoff_time AS last_cutoff_source_time
FROM fact_loader.queue_deps_all qdwr
LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id
WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
  --Exclude dependent fact tables from updates directly to queue_table_deps
  AND qdwr.fact_table_dep_id IS NULL
)

/****
This SQL also nearly matches that for the queue_table_deps but would be a little
ugly to try to DRY up
****/
, update_key_fact_dep AS (
SELECT qdwr.fact_table_dep_queue_table_dep_id,
  qdwr.fact_table_id,
  COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id,
  qdwr.maximum_cutoff_time AS last_cutoff_source_time
FROM fact_loader.queue_deps_all qdwr
LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id
WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$
  --Include dependent fact tables only
  AND qdwr.fact_table_dep_id IS NOT NULL
)

, updated_queue_table_deps AS (
UPDATE fact_loader.queue_table_deps qtd
SET last_cutoff_id = uk.last_cutoff_id,
  last_cutoff_source_time = uk.last_cutoff_source_time
FROM update_key uk
WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id
RETURNING qtd.*)

, updated_fact_table_deps AS (
UPDATE fact_loader.fact_table_dep_queue_table_deps ftd
SET last_cutoff_id = uk.last_cutoff_id,
  last_cutoff_source_time = uk.last_cutoff_source_time
FROM update_key_fact_dep uk
WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id
RETURNING uk.*)

UPDATE fact_loader.fact_tables ft
SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time,
  last_refresh_attempted_at = now(),
  last_refresh_succeeded = TRUE
FROM
(SELECT fact_table_id, last_cutoff_source_time
FROM updated_queue_table_deps
--Must use UNION to get only distinct values
UNION
SELECT fact_table_id, last_cutoff_source_time
FROM updated_fact_table_deps) uqtd
WHERE uqtd.fact_table_id = ft.fact_table_id;
$$ AS metadata_update_sql_out
FROM final_queue_sql)

SELECT raw_queued_changes_sql_out,
  gathered_queued_changes_sql_out
  ,
  format($$
  %s
  %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out,
  metadata_update_sql_out
FROM final_outputs;

$BODY$
LANGUAGE SQL;

-- Build the batched merge-function invocation SQL from the temp process_queue.
-- Keys are grouped by proid into execution groups so each function is called in
-- process_queue_id order; returns a single SQL string (or 'SELECT NULL' if empty).
CREATE OR REPLACE FUNCTION fact_loader.execute_queue(p_fact_table_id INT)
RETURNS TABLE (sql TEXT) AS
$BODY$
BEGIN

RETURN QUERY
WITH ordered_process_queue AS
(SELECT process_queue_id
  , proid
  , key_value
  , source_change_date
  , (pp.proargtypes::REGTYPE[])[0] AS proid_first_arg
 FROM process_queue pq
   LEFT JOIN pg_proc pp ON pp.oid = proid
 WHERE pq.fact_table_id = p_fact_table_id
 ORDER BY process_queue_id)

, with_rank AS
(SELECT
  /****
  If source_change_date is NULL, we assume the proid has one arg and pass it.
  If not, we assume the proid has two args and pass source_change_date as the second.
  */
  format('%s(%s::%s%s)'
    , proid::TEXT
    , 'key_value'
    , proid_first_arg
    , CASE
        WHEN source_change_date IS NOT NULL
        THEN format(', %s::DATE',quote_literal(source_change_date))
        ELSE ''
      END
  ) AS function_call,
  proid,
  process_queue_id,
  RANK() OVER (PARTITION BY proid) AS execution_group
 FROM ordered_process_queue
)

, execute_sql_groups AS
(
SELECT execution_group,
format($$
SELECT process_queue_id, %s
FROM (
/****
Must wrap this to execute in order of ids
***/
SELECT *
FROM process_queue
WHERE process_queue_id BETWEEN %s AND %s
  AND fact_table_id = %s
  AND proid = %s::REGPROC
ORDER BY process_queue_id) q;
$$, function_call, MIN(process_queue_id), MAX(process_queue_id), p_fact_table_id, quote_literal(proid::TEXT)) AS execute_sql
FROM with_rank
GROUP BY execution_group, function_call, proid
ORDER BY execution_group
)

SELECT COALESCE(string_agg(execute_sql,''),'SELECT NULL') AS final_execute_sql
FROM execute_sql_groups;

END;
$BODY$
LANGUAGE plpgsql;

-- Debugging/introspection helper: run sql_builder's raw_queued_changes_sql for a fact table.
CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT,
  queue_table_id INT,
queue_table_dep_id INT,
  fact_table_dep_id INT,
  fact_table_dep_queue_table_dep_id INT,
  queue_table_id_field BIGINT,
  fact_loader_batch_id BIGINT,
  maximum_cutoff_time TIMESTAMPTZ) AS
$BODY$
DECLARE
  v_raw_sql text;
BEGIN

SELECT raw_queued_changes_sql
INTO v_raw_sql
FROM fact_loader.sql_builder(p_fact_table_id);

RETURN QUERY EXECUTE v_raw_sql;

END;
$BODY$
LANGUAGE plpgsql;

-- Debugging/introspection helper: run sql_builder's gathered_queued_changes_sql for a fact
-- table, returning the (proid, key_value, source_change_date) rows that a load would process.
CREATE OR REPLACE FUNCTION fact_loader.gathered_queued_changes(p_fact_table_id INT)
RETURNS TABLE (fact_table_id INT, proid REGPROC, key_value TEXT, source_change_date DATE) AS
$BODY$
DECLARE
  v_gather_sql text;
BEGIN

SELECT gathered_queued_changes_sql
INTO v_gather_sql
FROM fact_loader.sql_builder(p_fact_table_id);

RETURN QUERY EXECUTE v_gather_sql;

END;
$BODY$
LANGUAGE plpgsql;

-- Run a daily-scheduled (non-queue-based) refresh: verify any declared dependencies are
-- within daily_scheduled_dep_delay_tolerance, execute daily_scheduled_proid, then mark
-- the refresh attempted/succeeded.  Returns FALSE if the job is not a daily-schedule job;
-- raises if a dependency is too delayed.
CREATE OR REPLACE FUNCTION fact_loader.daily_scheduled_load(p_fact_table_id INT)
RETURNS BOOLEAN AS
$BODY$
DECLARE
  v_execute_sql text;
  v_deps regclass[];
  v_dep_delay_tolerance interval;
  v_delayed_msg text;
BEGIN
/***
There are 3 basic steps to this load:
  1. If dependencies are listed, verify they are up to date enough
  2. Execute the single daily-refresh function
  3. Update the metadata indicating the last attempt time
*/
SELECT 'SELECT '||daily_scheduled_proid::TEXT||'()',
  daily_scheduled_deps,
  daily_scheduled_dep_delay_tolerance
INTO
  v_execute_sql,
  v_deps,
  v_dep_delay_tolerance
FROM fact_loader.fact_tables
WHERE fact_table_id = p_fact_table_id
  AND use_daily_schedule;

IF v_execute_sql IS NULL THEN
  RETURN FALSE;
END IF;

IF v_deps IS NOT NULL THEN
  WITH deps AS
  (SELECT unnest(v_deps) AS dep)

  , delays AS (
  SELECT dep, now() - source_time as delay_interval
  FROM fact_loader.queue_table_delay_info() qtd
  INNER JOIN deps d ON d.dep = qtd.queue_of_base_table_relid
  UNION ALL
  SELECT dep, now() - last_refresh_source_cutoff as delay_interval
  FROM fact_loader.fact_tables ft
  INNER JOIN deps d ON d.dep = ft.fact_table_relid
  )

  SELECT string_agg(dep::text||': Delayed '||delay_interval::text, ', ')
  INTO v_delayed_msg
  FROM delays
  WHERE delay_interval > v_dep_delay_tolerance;

  IF v_delayed_msg IS NOT NULL THEN
    RAISE EXCEPTION '%', v_delayed_msg;
  END IF;
END IF;

EXECUTE v_execute_sql;

UPDATE fact_loader.fact_tables ft
SET last_refresh_attempted_at = now(),
  last_refresh_succeeded = TRUE
WHERE fact_table_id = p_fact_table_id;

RETURN TRUE;

END;
$BODY$
LANGUAGE plpgsql;

-- Trigger function: prune refresh-log rows older than 90 days in capped batches,
-- using FOR UPDATE SKIP LOCKED so concurrent inserts/workers are never blocked.
CREATE OR REPLACE FUNCTION fact_loader.fact_table_refresh_logs_pruner() RETURNS trigger
LANGUAGE plpgsql
AS $$
declare
  step int := 1000; -- step should equal the firing frequency in trigger definition
  overdrive int := 2; -- overdrive times step = max rows (see below)

  max_rows int := step * overdrive;
  rows int;

begin
  delete from fact_loader.fact_table_refresh_logs
  where fact_table_refresh_log_id in (
    select fact_table_refresh_log_id
    from fact_loader.fact_table_refresh_logs
    where refresh_attempted_at < now() - '90 days'::interval
    -- do not do the literal interval value above as a declare parameter
    order by fact_table_refresh_log_id
    limit max_rows
    for update skip locked
  );

  get diagnostics rows = row_count;

  return null;
end
$$;

-- Fire the pruner once every 1000 inserted log rows (id divisible by 1000).
CREATE TRIGGER fact_table_refresh_logs_pruner
AFTER INSERT ON
--tables are not pglogical-replicated, which is assumed if no ticker
      , now() AS source_time
    FROM fact_loader.queue_tables qt;
END IF;

END;
$BODY$
LANGUAGE plpgsql;

-- Synchronize fact_loader.fact_table_dep_queue_table_deps with the current
-- queue_table_deps/fact_table_deps graph: recursively expand nested fact-table
-- dependencies, delete rows that no longer apply, and insert any new pairs with
-- the default merge proids.  Called from the worker under an advisory lock.
CREATE OR REPLACE FUNCTION fact_loader.refresh_fact_table_dep_queue_table_deps()
RETURNS VOID AS
$BODY$
BEGIN
/****
This function will be used to refresh the fact_table_dep_queue_table_deps table.
The purpose of this table is to easily figure out queue data for fact tables that depend on other fact tables.
This will be run with every call of load().
This may not be the most efficient method, but it is certainly reliable and fast.
 */

/****
Recursively find all fact table deps including nested ones (fact tables that depend on other fact tables)
to build the fact_table_dep_queue_table_deps table.
 */
WITH RECURSIVE all_fact_table_deps AS (
  SELECT
    qtd.queue_table_dep_id
    , ftd.fact_table_dep_id
    , parent_id AS parent_fact_table_id
    , child_id AS fact_table_id
    , qtd.queue_table_id
    , qt.queue_table_relid
    , ftp.fact_table_relid AS parent_fact_table
    , ftc.fact_table_relid AS child_fact_table
    , ftd.default_insert_merge_proid
    , ftd.default_update_merge_proid
    , ftd.default_delete_merge_proid
  FROM fact_loader.queue_table_deps qtd
    INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id
    INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id
    INNER JOIN fact_loader.fact_tables ftp USING (fact_table_id)
    INNER JOIN fact_loader.fact_tables ftc ON ftc.fact_table_id = ftd.child_id
  UNION ALL
  SELECT
    qtd.queue_table_dep_id
    , ftd.fact_table_dep_id
    , parent_id AS parent_fact_table_id
    , child_id AS fact_table_id
    , qtd.queue_table_id
    , qt.queue_table_relid
    , ftp.fact_table_relid AS parent_fact_table
    , ft.fact_table_relid AS child_fact_table
    , ftd.default_insert_merge_proid
    , ftd.default_update_merge_proid
    , ftd.default_delete_merge_proid
  FROM all_fact_table_deps qtd
    INNER JOIN fact_loader.queue_tables qt ON qtd.queue_table_id = qt.queue_table_id
    INNER JOIN fact_loader.fact_table_deps ftd ON ftd.parent_id = qtd.fact_table_id
    INNER JOIN fact_loader.fact_tables ftp ON ftp.fact_table_id = ftd.parent_id
    INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id
)

/****
Remove fact_table_dep_queue_table_deps that no longer exist if applicable
 */
, removed AS (
  DELETE FROM fact_loader.fact_table_dep_queue_table_deps ftdqc
  WHERE NOT EXISTS(SELECT 1
                   FROM all_fact_table_deps aftd
                   WHERE aftd.fact_table_dep_id = ftdqc.fact_table_dep_id
                     AND aftd.queue_table_dep_id = ftdqc.queue_table_dep_id)
)

/****
Add any new keys or ignore if they already exist
Add not exists because we think allowing all records to insert and conflict could be cause
of serialization errors in repeatable read isolation.
 */
INSERT INTO fact_loader.fact_table_dep_queue_table_deps
(fact_table_dep_id, queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid)
SELECT fact_table_dep_id, queue_table_dep_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid
FROM all_fact_table_deps new
WHERE NOT EXISTS
  (SELECT 1
   FROM fact_loader.fact_table_dep_queue_table_deps existing
   WHERE existing.fact_table_dep_id = new.fact_table_dep_id
     AND existing.queue_table_dep_id = new.queue_table_dep_id)
ON CONFLICT (fact_table_dep_id, queue_table_dep_id)
DO NOTHING;

END;
$BODY$
LANGUAGE plpgsql;

-- One-time upgrade migration: translate stored last_cutoff_id values from the old
-- queue_table_id_field basis to the new fact_loader_batch_id basis.
-- These fields now becomes based on batch, not based on queue_table_id_field
DO $BODY$
DECLARE
  v_rec RECORD;
  v_sql TEXT;
BEGIN

FOR v_rec IN
  SELECT format($$
  UPDATE fact_loader.%s
  SET last_cutoff_id =
    (SELECT fact_loader_batch_id
    FROM %s
    WHERE %s = %s)
  WHERE %s = %s;
  $$,
    CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_deps' ELSE 'fact_table_dep_queue_table_deps' END,
    queue_table_relid::text,
    queue_table_id_field::text,
    last_cutoff_id::text,
    CASE WHEN fact_table_dep_id IS NULL THEN 'queue_table_dep_id' ELSE 'fact_table_dep_queue_table_dep_id' END,
    CASE WHEN fact_table_dep_id IS NULL THEN queue_table_dep_id ELSE fact_table_dep_queue_table_dep_id END
  ) AS sql
FROM fact_loader.queue_deps_all WHERE last_cutoff_id IS NOT NULL LOOP v_sql = v_rec.sql; RAISE LOG 'Updating Extension pg_fact_loader Executed: %', v_sql; EXECUTE v_sql; END LOOP; END$BODY$; COMMENT ON TABLE fact_loader.debug_process_queue IS 'A mirror of process_queue for debugging only (unlogged) - only populated with log_min_duration set to DEBUG.'; COMMENT ON TABLE fact_loader.fact_table_dep_queue_table_deps IS $$Data in this table is by default auto-generated by refresh_fact_table_dep_queue_table_deps() only for queue-based fact tables that depend on other fact table changes. Each row represents a parent's queue_table_dep, updates of which will trickle down to this dependent fact table. Even though the default proids from fact_table_deps are used initially, they may not be appropriate as generalized across all of these queue_table_deps. The proids may need to be overridden for individual fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples of this. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_queue_table_dep_id IS 'Unique identifier'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.fact_table_dep_id IS 'fact_table_dep for this specific dependency.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.queue_table_dep_id IS 'Inherited queue_table_dep that this dependent fact table depends on.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_id IS $$This is unique and maintained separately from last_cutoff_id in queue_table_deps, as it refers to the last_cutoff_id for this dependent fact table. It is the last fact_loader_batch_id of the queue table that was processed for this queue table - dependent fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. 
The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.last_cutoff_source_time IS $$This is unique and maintained separately from last_cutoff_source_time in queue_table_deps, as it refers to the last_cutoff_source_time for this dependent fact table. It is the source data change time of the last queue table record that was processed for this queue table - dependent fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times. It will also never go past its parent(s) in time. THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.insert_merge_proid IS $$Initially populated by default_insert_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on INSERT events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore insert events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.update_merge_proid IS $$Initially populated by default_update_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on UPDATE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. 
The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore update events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.delete_merge_proid IS $$Initially populated by default_delete_merge_proid from fact_table_deps, but can be overridden if a different proid is required. This is the function oid to execute on DELETE events *for this dependent fact table* - it accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - dependent fact table pair is configured in key_retrieval_sequences *for the parent(s)*. NULL to ignore delete events. See the regression suite in ./sql and ./expected for examples of this.$$; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_dep_queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.fact_table_deps IS 'For queue-based fact tables that depend on other fact table changes ONLY. Add those dependencies here.'; COMMENT ON COLUMN fact_loader.fact_table_deps.fact_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.fact_table_deps.parent_id IS 'The parent fact_table_id that the child depends on.'; COMMENT ON COLUMN fact_loader.fact_table_deps.child_id IS 'The child fact_table_id that will run only after the parent is updated.'; COMMENT ON COLUMN fact_loader.fact_table_deps.default_insert_merge_proid IS $$Default function to use for insert events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible.
See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_update_merge_proid IS $$Default function to use for update events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.default_delete_merge_proid IS $$Default function to use for delete events to update child tables. This may need to be modified for each individual inherited fact_table_dep_queue_table_deps if that generalization isn't possible. See the regression suite in ./sql and ./expected for examples.$$; COMMENT ON COLUMN fact_loader.fact_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.fact_table_refresh_logs IS 'Used to log both job run times and exceptions.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_refresh_log_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.fact_table_id IS 'Fact table that created the log.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_attempted_at IS 'The time of the attempt (transaction begin time), which can be correlated to fact_table.last_refresh_attempted_at (see also unresolved_failures).'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.refresh_finished_at IS 'The transaction commit time of the attempt, which can be used with refresh_attempted_at to get actual run time.'; COMMENT ON COLUMN fact_loader.fact_table_refresh_logs.messages IS 'Only for failures - Error message content in JSON format - including message, message detail, context, and hint.'; COMMENT ON TABLE fact_loader.fact_tables IS 'Each fact table to be built via
pg_fact_loader, which also drives the worker. These are also referred to as "jobs".'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_id IS 'Unique identifier for the fact table or job - also referred to as job_id'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_relid IS 'The oid of the fact table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.fact_tables.fact_table_agg_proid IS $$NOT REQUIRED. The aggregate function definition for the fact table. This can be used when passed to create_table_loader_function to auto-create a merge function. It can also be a reference for dq checks because it indicates what function returns the correct results for a fact table as it should appear now.$$; COMMENT ON COLUMN fact_loader.fact_tables.enabled IS 'Indicates whether or not the job is enabled. The worker will skip this table unless marked TRUE.'; COMMENT ON COLUMN fact_loader.fact_tables.priority IS 'Determines the order in which the job runs (in combination with other sorting factors)'; COMMENT ON COLUMN fact_loader.fact_tables.force_worker_priority IS 'If marked TRUE, this fact table will be prioritized in execution order above all other factors.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_source_cutoff IS 'The data cutoff time of the last refresh - only records older than this have been updated.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_attempted_at IS 'The last time the worker ran on this fact table. The oldest will be prioritized first, ahead of priority.'; COMMENT ON COLUMN fact_loader.fact_tables.last_refresh_succeeded IS 'Whether or not the last run of the job succeeded. 
NULL if it has never been run.'; COMMENT ON COLUMN fact_loader.fact_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.fact_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.fact_tables.use_daily_schedule IS 'If TRUE, this job is scheduled to run daily instead of using queue tables according to other daily column configuration. Also must be marked TRUE for dependent jobs.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_time IS 'The time of day *after which* to run the job (the system will attempt to run until midnight). If you have a chain of daily scheduled jobs, only the base job has time filled in.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_tz IS 'The timezone your time is in. This is critical to know when to allow a daily refresh from the standpoint of the business logic you require for a timezone-based date.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_proid IS $$The single function oid to execute at the scheduled time. No arguments supported. It is assumed to contain all the logic necessary to add any new daily entries, if applicable. See the unit tests in sql/16_1_2_features.sql for examples.$$; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_base_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. This is the fact_table_id of the FIRST job in a chain which is actually the only one with a scheduled_time.'; COMMENT ON COLUMN fact_loader.fact_tables.depends_on_parent_daily_job_id IS 'For jobs that depend on other daily scheduled jobs only. Immediate parent which must complete before this job will run.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_deps IS 'OPTIONAL for daily scheduled jobs. The only purpose of this column is to consider if we should wait to run a scheduled job because dependent tables are out of date. 
This is a regclass array of tables that this scheduled job depends on, which will only be considered if they are either listed in fact_loader.queue_tables or fact_loader.fact_tables. If the former, replication delay will be considered (if table is not local). If the latter, last_refresh_source_cutoff will be considered. Works in combination with daily_scheduled_dep_delay_tolerance which says how much time delay is tolerated. Job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON COLUMN fact_loader.fact_tables.daily_scheduled_dep_delay_tolerance IS 'OPTIONAL for daily scheduled jobs. Amount of time interval allowed that dependent tables can be out of date before running this job. For example, if 10 minutes, then if ANY of the dependent tables are more than 10 minutes out of date, this job will FAIL if the time delay constraint is not met for all tables - this is intended to be configured as a rare occurrence and thus we want to raise an alarm about it.'; COMMENT ON TABLE fact_loader.key_retrieval_sequences IS $$How to go from a change in the queue table itself to retrieve the key that needs to be updated in the fact table. That key specifically will be passed to the insert/update/delete merge_proids configured in queue_table_deps. When multiple joins are required to get there, you will have more than one key_retrieval_sequence for a single queue_table_dep. You can also optionally have a different key_retrieval_sequence if your insert/update/delete merge_proids don't all accept the exact same field as an arg. 
NOTE - The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.key_retrieval_sequence_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.queue_table_dep_id IS 'Which fact table - queue table record this is for (queue_table_deps)'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.filter_scope IS $$NULL or one of I, U, D. Optional and likely rare. By default, this key_retrieval_sequence will tell pg_fact_loader how to get the key for all events - insert, update, delete. But if your insert/update/delete merge_proids don't all accept the exact same field as an arg, you will have to tell it a different way to retrieve the different I, U, D events on separate rows. The regression suite has examples of this.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.level IS $$Default 1. When there are multiple joins required to retrieve a key, this indicates the order in which to perform the joins. It will start at level 1, then the return_columns_from_join field will be used to join to the join_to_relation - join_to_column for the level 2 record, and so on.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns IS $$What field to return from the base table (if this is level 1), or (if this level 2+) this should be the same as the return_columns_from_join from the previous level.$$; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.is_fact_key IS 'Only true if the base table itself contains the key. If return_columns contains the keys to pass into the functions without any additional join, TRUE. 
Otherwise, FALSE if you need to join to get more information.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_relation IS 'Join from the base table (or if this is level 2+, the join_to_relation from the previous level) to this table to get the key or to do yet a further join.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_to_column IS 'Join to this column of join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.return_columns_from_join IS 'Return these columns from join_to_relation.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.join_return_is_fact_key IS 'If return_columns_from_join are your fact keys, true. Otherwise false, and that means you need another level to get your key.'; COMMENT ON COLUMN fact_loader.key_retrieval_sequences.pass_queue_table_change_date_at_tz IS $$If this is set to a time zone, then the changed_at field will be cast to this time zone and then cast to a date, for the purpose of creating a date-range based fact table. For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date.$$; COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; COMMENT ON VIEW fact_loader.queue_deps_all_with_retrieval IS 'The master view which builds on queue_deps_all to include key_retrieval_sequences. This is the main view used by sql_builder(int) to gather all queued changes.'; COMMENT ON TABLE fact_loader.queue_table_deps IS $$Ties together which fact tables depend on which queue tables, along with holding information on the last cutoff ids for each queue table. 
**NOTE** that anything that exists in queue_table_dep is assumed to require its queue data not to be pruned even if the fact_tables job is disabled. That means that even if a job is disabled, you will not lose data, but you will also have your queue tables building up in size until you either enable (successfully) or drop the job. The regression suite in ./sql and ./expected has abundant examples of different configurations.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_dep_id IS 'Unique identifier.'; COMMENT ON COLUMN fact_loader.queue_table_deps.fact_table_id IS 'Fact table to tie together with a queue table it depends on.'; COMMENT ON COLUMN fact_loader.queue_table_deps.queue_table_id IS 'Queue table to tie together with a fact table that needs its changes.'; COMMENT ON COLUMN fact_loader.queue_table_deps.relevant_change_columns IS $$Optional. For UPDATE changes to data, you can specify to only consider changes to these columns as sufficient to update the fact table. If NULL, all columns will be considered as potentially changing the fact table data.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_id IS $$The last fact_loader_batch_id of the queue table that was processed for this queue table - fact table pair. After this job runs, records that have this id and lower are eligible to be pruned, assuming no other fact tables also depend on those same records. The next time the job runs, only records after this id are considered.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.last_cutoff_source_time IS $$The source data change time of the last queue table record that was processed for this queue table - fact table pair. This helps pg_fact_loader synchronize time across multiple queue tables and only pull changes that are early enough, and not purge records that are later than these cutoff times.
THIS DOES NOT DETERMINE filter conditions for the starting point at which to pull new records as does last_cutoff_id - it is only used as an ending-point barrier. $$; COMMENT ON COLUMN fact_loader.queue_table_deps.insert_merge_proid IS $$Function oid to execute on insert events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore insert events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.update_merge_proid IS $$Function oid to execute on update events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. NULL to ignore update events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.delete_merge_proid IS $$Function oid to execute on delete events - accepts a single value as its arg which is typically the key that has changed and needs to be updated. The way to retrieve this key for this queue table - fact table pair is configured in key_retrieval_sequences. 
NULL to ignore delete events.$$; COMMENT ON COLUMN fact_loader.queue_table_deps.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_table_deps.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON TABLE fact_loader.queue_tables IS 'Each queue table along with the base table to which it belongs.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_id IS 'Unique identifier for queue tables.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_relid IS 'The oid of the queue table itself regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.queue_of_base_table_relid IS 'The oid of the base table for which the queue table contains an audited log of changes. regclass type to accept only valid relations.'; COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$Optional - If NULL, we assume this is a local queue table and we need not synchronize time for potential replication delay. For use with tables that are replicated via pglogical. This is the pglogical.node_interface of the table. This also requires pglogical_ticker and is used to synchronize time and ensure we don't continue to move forward in time when replication is delayed for this queue table.$$; COMMENT ON COLUMN fact_loader.queue_tables.queue_table_tz IS $$**NOTE CAREFULLY** - If this is NULL, it assumes that changed_at in the queue tables is stored in TIMESTAMPTZ. 
If it IS set, it assumes you are telling it that changed_at is of TIMESTAMP data type which is stored in the provided time zone of queue_table_tz.$$; COMMENT ON COLUMN fact_loader.queue_tables.row_created_at IS 'Timestamp of when this row was first created.'; COMMENT ON COLUMN fact_loader.queue_tables.row_updated_at IS 'Timestamp of when this row was last updated (this is updated via trigger).'; COMMENT ON COLUMN fact_loader.queue_tables.purge IS 'Default is true because we prune queue tables as data is no longer needed. Can be set to false and no pruning will happen on this table.'; COMMENT ON VIEW fact_loader.unresolved_failures IS 'Will only show fact table and error messages for a job that just failed and has not been re-enabled since last failure. Useful for monitoring.'; /* pg_fact_loader--1.3--1.4.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit DROP FUNCTION fact_loader.raw_queued_changes(int); ALTER TABLE fact_loader.debug_process_queue DROP CONSTRAINT debug_process_queue_pkey; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Using the process_queue, execute the delta load of the fact table 3. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs.
*/ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. */ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date) -- the row timestamps are not populated, so we set them here SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. 
*/ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. */ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table. 
*/ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN 
array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', 
j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER 
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN 
insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. -- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql, format($$ %s < %s %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_ident(c.queue_table_timestamp), quote_literal(c.maximum_cutoff_time), changed_at_tz_correction) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) , update_sql_builder_final AS (SELECT DISTINCT ON 
(queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This first UNION is to union together INSERT, UPDATE, 
and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, 
queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL ) , updated_queue_table_deps AS ( UPDATE 
fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL;
/* Introspection helper: builds the raw_queued_changes_sql string via
   fact_loader.sql_builder() for the given fact table and executes it,
   returning the queued-change id rows (batch ids, cutoff metadata) without
   applying any merge functions. Useful for debugging which queue rows a
   fact table refresh would pick up. */
CREATE OR REPLACE FUNCTION fact_loader.raw_queued_changes(p_fact_table_id INT) RETURNS TABLE (fact_table_id INT, queue_table_dep_id INT, fact_table_dep_id INT, fact_table_dep_queue_table_dep_id INT, queue_table_id_field BIGINT, fact_loader_batch_id BIGINT, maximum_cutoff_time TIMESTAMPTZ, min_missed_id BIGINT, queue_table_id INT ) AS $BODY$ DECLARE v_raw_sql text; BEGIN SELECT raw_queued_changes_sql INTO v_raw_sql FROM fact_loader.sql_builder(p_fact_table_id); RETURN QUERY EXECUTE v_raw_sql; END; $BODY$ LANGUAGE plpgsql;
/* Documentation for the helper view the generated SQL reads from. */
COMMENT ON VIEW fact_loader.queue_deps_all IS 'A view which gathers all fact table data in order to process queued changes and update it, including nested dependencies.'; /* pg_fact_loader--1.4--1.5.sql */ -- complain if script is sourced in
psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit
/* pg_fact_loader 1.5 upgrade: redefines fact_loader.sql_builder(p_fact_table_id INT).
   For one fact table it generates four SQL strings:
     - raw_queued_changes_sql: gather new queue-table ids past the last cutoff
     - gathered_queued_changes_sql: join queue rows to source tables to resolve fact keys
     - process_queue_sql: create/populate the temp process_queue table from the above
     - metadata_update_sql: advance per-queue-table cutoffs and mark the refresh succeeded
   The nested $$..$$ literals are the SQL text being generated; do not edit their contents. */
CREATE OR REPLACE FUNCTION fact_loader.sql_builder(p_fact_table_id INT) RETURNS TABLE(raw_queued_changes_sql text, gathered_queued_changes_sql text, process_queue_sql text, metadata_update_sql text) AS $BODY$ /**** The recursive part of this CTE are only the sql_builder parts. In Postgres, if any of your CTEs are recursive, you only use the RECURSIVE keyword on the first of a set. The retrieval info may be the same for all 3 events (insert, update, delete), in which case filter_scope is null Otherwise, they must be specified separately. */ WITH RECURSIVE queue_deps_with_insert_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'I' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_update_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'U' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) , queue_deps_with_delete_retrieval AS ( SELECT * FROM fact_loader.queue_deps_all_with_retrieval WHERE (filter_scope = 'D' OR filter_scope IS NULL) AND fact_table_id = p_fact_table_id ) /**** Recursively build the SQL for any INSERT events found in the queues. The recursive part ONLY applies to cases where multiple joins have to be made to get at the source data, in which case there are multiple levels of key_retrieval_sequences for a given queue_table_dep_id. For an example of this, see the test cases involving the test.order_product_promos table.
*/ , insert_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n  ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_insert_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n  ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM insert_sql_builder r INNER JOIN queue_deps_with_insert_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , update_sql_builder AS ( SELECT queue_table_dep_id, level, CASE WHEN is_fact_key THEN CASE WHEN
array_length(return_columns, 1) = 1 THEN ', b.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(array[b.'||array_to_string(return_columns, ',b.')||'])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level)||'])::TEXT AS key' END ELSE '' END AS key_select_column, CASE WHEN is_fact_key THEN '' ELSE 'INNER JOIN '||join_to_relation::TEXT||' j'||level|| E'\n  ON b.'||quote_ident(return_columns[1])||' = j'||level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, source_change_date_select FROM queue_deps_with_update_retrieval c WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||r.level||'.'||return_columns[1]||'::TEXT AS key' ELSE ', unnest(b.'||array_to_string(return_columns,',j'||r.level)||')::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||return_columns_from_join[1]||'::TEXT AS key' ELSE ', unnest(j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS key_select_column, key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE E'\nINNER JOIN '||join_to_relation::TEXT||' j'||c.level|| E'\n  ON j'||r.level||'.'||quote_ident(return_columns[1])||' = j'||c.level||'.'||quote_ident(join_to_column) END AS key_retrieval_sql, r.source_change_date_select FROM update_sql_builder r INNER JOIN queue_deps_with_update_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , delete_sql_builder AS ( SELECT queue_table_dep_id, level, --For deletes, same pattern as key_select_column but instead, we may be selecting from the audit tables instead CASE WHEN is_fact_key THEN CASE WHEN
array_length(return_columns, 1) = 1 THEN ', q.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns, ''', before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||level||'.'||array_to_string(return_columns_from_join, ',j'||level||'.')||'])::TEXT AS key' END ELSE '' END AS delete_key_select_column, CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, (CASE WHEN level = 1 THEN '(q'||'.before_change->>'||quote_literal(return_columns[1])||')::'||join_column_type ELSE 'j'||level||'.'||quote_ident(return_columns[1]) END), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||level ) END AS delete_key_retrieval_sql, source_change_date_select FROM queue_deps_with_delete_retrieval WHERE level = 1 AND fact_table_id = p_fact_table_id UNION ALL SELECT c.queue_table_dep_id, c.level, delete_key_select_column||CASE WHEN c.is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ',
j'||r.level||'.before_change->>'||quote_literal(return_columns[1])||'::TEXT AS key' ELSE ', unnest(array[before_change->>'''||array_to_string(return_columns,',j'||r.level||'.before_change->>''')||'''])::TEXT AS key' END WHEN join_return_is_fact_key THEN CASE WHEN array_length(return_columns, 1) = 1 THEN ', j'||c.level||'.'||quote_ident(return_columns_from_join[1])||'::TEXT AS key' ELSE ', unnest(array[j'||c.level||'.'||array_to_string(return_columns_from_join,',j'||c.level)||')::TEXT AS key' END ELSE '' END AS delete_key_select_column, delete_key_retrieval_sql||CASE WHEN is_fact_key THEN '' ELSE format($$ --Join to either the base table, or the audit table, one of which --will be missing the key in a delete case INNER JOIN LATERAL ( SELECT %s FROM %s jb WHERE %s = %s UNION ALL SELECT %s FROM %s jq WHERE operation = 'D' AND %s = %s) %s ON TRUE $$, quote_ident(return_columns_from_join[1]), join_to_relation::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), 'jb.'||quote_ident(join_to_column), '(before_change->>'||quote_literal(return_columns_from_join[1])||')::'||return_columns_from_join_type, join_to_relation_queue::TEXT, 'j'||r.level||'.'||quote_ident(return_columns[1]), '(jq.before_change->>'||quote_literal(join_to_column)||')::'||join_column_type, /**** We use the higher level here just to be consistent with aliases from insert/update key retrieval */ 'j'||c.level ) END AS delete_key_retrieval_sql, r.source_change_date_select FROM delete_sql_builder r INNER JOIN queue_deps_with_delete_retrieval c USING (queue_table_dep_id) WHERE c.level = r.level + 1 ) , field_vars AS ( SELECT *, format($$ %s AS fact_table_id, %s AS queue_table_dep_id, %s::INT AS fact_table_dep_id, %s::INT AS fact_table_dep_queue_table_dep_id, %s AS queue_table_id_field, q.fact_loader_batch_id, %s::TIMESTAMPTZ AS maximum_cutoff_time, -- We must not ignore ids which are above maximum_cutoff_time -- but below the highest id which is below maximum_cutoff_time MIN(q.fact_loader_batch_id) FILTER
( WHERE %s %s > %s::TIMESTAMPTZ) OVER() AS min_missed_id $$, fact_table_id, queue_table_dep_id, (CASE WHEN fact_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_id::TEXT END), (CASE WHEN fact_table_dep_queue_table_dep_id IS NULL THEN 'NULL'::TEXT ELSE fact_table_dep_queue_table_dep_id::TEXT END), 'q.'||quote_ident(queue_table_id_field), quote_literal(maximum_cutoff_time), 'q.'||quote_ident(queue_table_timestamp), CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END, quote_literal(maximum_cutoff_time) ) AS inner_shared_select_columns, $$ fact_table_id, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, queue_table_id_field, fact_loader_batch_id, maximum_cutoff_time, min_missed_id $$ AS outer_shared_select_columns, CASE WHEN queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(queue_table_tz) END AS changed_at_tz_correction FROM fact_loader.queue_deps_all c WHERE c.fact_table_id = p_fact_table_id ) , non_recursive_sql AS ( SELECT /**** Separate select list for: - raw queue_ids from queue tables - gathered data from joining queue_ids to source tables to get actual keys to update in fact tables */ -- gathering all queue_ids from queue tables queue_table_dep_id, outer_shared_select_columns, format($$ %s, %s %s AS changed_at, %s AS queue_table_id $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, queue_table_id ) AS inner_metadata_select_columns, format($$ %s, queue_table_id $$, outer_shared_select_columns ) AS outer_metadata_select_columns, -- gathering actual keys to update in fact tables by joining from queue_ids to source tables format($$ %s, %s AS operation, %s %s AS changed_at, %s::REGPROC AS insert_merge_proid, %s::REGPROC AS update_merge_proid, %s::REGPROC AS delete_merge_proid $$, inner_shared_select_columns, 'q.'||quote_ident(queue_table_op), 'q.'||quote_ident(queue_table_timestamp), changed_at_tz_correction, CASE WHEN
insert_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(insert_merge_proid) END, CASE WHEN update_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(update_merge_proid) END, CASE WHEN delete_merge_proid IS NULL THEN 'NULL' ELSE quote_literal(delete_merge_proid) END ) AS inner_data_select_columns, format($$ %s, operation, changed_at, insert_merge_proid, update_merge_proid, delete_merge_proid, key, source_change_date $$, outer_shared_select_columns ) AS outer_data_select_columns, -- This is simply the queue table aliased as q format('%s q', queue_table_relid::TEXT) AS queue_table_aliased, -- This is the SQL to join from the queue table to the base table format($$ INNER JOIN %s b ON q.%s::%s = b.%s $$, queue_of_base_table_relid::TEXT, quote_ident(queue_table_key), queue_of_base_table_key_type, quote_ident(queue_of_base_table_key)) AS base_join_sql, -- This is a WHERE statement to be added to ALL gathering of new queue_ids to process. -- There is a further filter based on the window min_missed_id after this subquery format($$ %s $$, CASE WHEN last_cutoff_id IS NOT NULL THEN 'q.fact_loader_batch_id > '||last_cutoff_id ELSE 'TRUE' END) AS inner_global_where_sql,
/* 1.5 behavior change vs 1.4: the outer cutoff predicate now compares the
   already tz-corrected changed_at column from the subquery, instead of
   re-applying the time zone correction on the raw timestamp column in the
   outer query (compare the 1.4 definition of outer_global_where_sql). */
format($$ -- changed_at is guaranteed now to be in timestamptz - any time zone casting is only in subquery changed_at < %s AND (min_missed_id IS NULL OR (fact_loader_batch_id < min_missed_id)) $$, quote_literal(c.maximum_cutoff_time) ) AS outer_global_where_sql, format($$ AND q.%s = 'I' $$, queue_table_op) AS where_for_insert_sql, format($$ AND (q.%s = 'U' AND %s) $$, queue_table_op, CASE WHEN relevant_change_columns IS NULL THEN 'TRUE' ELSE format($$q.%s ?| '{%s}'$$, queue_table_change, array_to_string(relevant_change_columns,',')) END) AS where_for_update_sql, format($$ AND q.%s = 'D' $$, queue_table_op) AS where_for_delete_sql FROM field_vars c ) , insert_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM insert_sql_builder ORDER BY queue_table_dep_id, level DESC ) ,
update_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM update_sql_builder ORDER BY queue_table_dep_id, level DESC ) , delete_sql_builder_final AS (SELECT DISTINCT ON (queue_table_dep_id) * FROM delete_sql_builder ORDER BY queue_table_dep_id, level DESC ) , all_queues_sql AS ( SELECT format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||isbf.key_select_column||isbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, isbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_insert_sql, nrs.outer_global_where_sql) AS queue_insert_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns, nrs.inner_data_select_columns||usbf.key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased||nrs.base_join_sql, usbf.key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_update_sql, nrs.outer_global_where_sql) AS queue_update_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s %s WHERE %s ) sub WHERE %s $$, nrs.outer_data_select_columns,
/* NOTE(review): the DELETE branch reuses usbf.source_change_date_select (from the
   update builder) rather than dsbf's own source_change_date_select; this matches
   the 1.4 definition but looks inconsistent with the per-scope builders — confirm
   upstream intent before ever changing a released upgrade script. */
nrs.inner_data_select_columns||dsbf.delete_key_select_column||usbf.source_change_date_select, nrs.queue_table_aliased, dsbf.delete_key_retrieval_sql, nrs.inner_global_where_sql||nrs.where_for_delete_sql, nrs.outer_global_where_sql) AS queue_delete_sql, format($$ SELECT %s FROM ( SELECT %s FROM %s WHERE %s ) sub WHERE %s $$, nrs.outer_metadata_select_columns, nrs.inner_metadata_select_columns, nrs.queue_table_aliased, nrs.inner_global_where_sql, nrs.outer_global_where_sql) AS queue_ids_sql FROM non_recursive_sql nrs INNER JOIN insert_sql_builder_final isbf ON isbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN update_sql_builder_final usbf ON usbf.queue_table_dep_id = nrs.queue_table_dep_id INNER JOIN delete_sql_builder_final dsbf ON dsbf.queue_table_dep_id = nrs.queue_table_dep_id ) , final_queue_sql AS (SELECT string_agg( /**** This
first UNION is to union together INSERT, UPDATE, and DELETE events for a single queue table */ format($$ %s UNION ALL %s UNION ALL %s $$, queue_insert_sql, queue_update_sql, queue_delete_sql) /**** This second UNION as the second arg of string_agg is the union together ALL queue tables for this fact table */ , E'\nUNION ALL\n') AS event_sql, string_agg(queue_ids_sql, E'\nUNION ALL\n') AS raw_queued_changes_sql_out FROM all_queues_sql) , final_outputs AS ( SELECT raw_queued_changes_sql_out, $$ WITH all_changes AS ( ($$||event_sql||$$) ORDER BY changed_at) , base_execution_groups AS (SELECT fact_table_id, queue_table_dep_id, queue_table_id_field, operation, changed_at, source_change_date, insert_merge_proid, update_merge_proid, delete_merge_proid, maximum_cutoff_time, key, CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END AS proid, RANK() OVER ( PARTITION BY CASE WHEN operation = 'I' THEN insert_merge_proid WHEN operation = 'U' THEN update_merge_proid WHEN operation = 'D' THEN delete_merge_proid END ) AS execution_group FROM all_changes WHERE key IS NOT NULL) SELECT fact_table_id, proid, key, source_change_date FROM base_execution_groups beg WHERE proid IS NOT NULL GROUP BY execution_group, fact_table_id, proid, key, source_change_date /**** This ordering is particularly important for date-range history tables where order of inserts is critical and usually expected to follow a pattern ***/ ORDER BY execution_group, MIN(changed_at), MIN(queue_table_id_field); $$ AS gathered_queued_changes_sql_out , $$ DROP TABLE IF EXISTS process_queue; CREATE TEMP TABLE process_queue (process_queue_id serial, fact_table_id int, proid regproc, key_value text, source_change_date date); INSERT INTO process_queue (fact_table_id, proid, key_value, source_change_date) $$ AS process_queue_snippet, $$ WITH all_ids AS ($$||raw_queued_changes_sql_out||$$) , new_metadata AS (SELECT
MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, queue_table_dep_id FROM all_ids --Exclude dependent fact tables from updates directly to queue_table_deps WHERE fact_table_dep_id IS NULL GROUP BY queue_table_dep_id, maximum_cutoff_time) /**** The dependent fact table uses the same queue_table_id_field as last_cutoff We are going to update fact_table_deps metadata instead of queue_table_deps ****/ , new_metadata_fact_dep AS (SELECT MAX(fact_loader_batch_id) AS last_cutoff_id, maximum_cutoff_time, fact_table_dep_queue_table_dep_id FROM all_ids --Include dependent fact tables only WHERE fact_table_dep_id IS NOT NULL GROUP BY fact_table_dep_queue_table_dep_id, maximum_cutoff_time) , update_key AS ( SELECT qdwr.queue_table_dep_id, --Cutoff the id to that newly found, otherwise default to last value COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, --This cutoff time must always be the same for all queue tables for given fact table. --Even if there are no new records, we move this forward to wherever the stream is at qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata mu ON mu.queue_table_dep_id = qdwr.queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Exclude dependent fact tables from updates directly to queue_table_deps AND qdwr.fact_table_dep_id IS NULL ) /**** This SQL also nearly matches that for the queue_table_deps but would be a little ugly to try to DRY up ****/ , update_key_fact_dep AS ( SELECT qdwr.fact_table_dep_queue_table_dep_id, qdwr.fact_table_id, COALESCE(mu.last_cutoff_id, qdwr.last_cutoff_id) AS last_cutoff_id, qdwr.maximum_cutoff_time AS last_cutoff_source_time FROM fact_loader.queue_deps_all qdwr LEFT JOIN new_metadata_fact_dep mu ON mu.fact_table_dep_queue_table_dep_id = qdwr.fact_table_dep_queue_table_dep_id WHERE qdwr.fact_table_id = $$||p_fact_table_id||$$ --Include dependent fact tables only AND qdwr.fact_table_dep_id IS NOT NULL
) , updated_queue_table_deps AS ( UPDATE fact_loader.queue_table_deps qtd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key uk WHERE qtd.queue_table_dep_id = uk.queue_table_dep_id RETURNING qtd.*) , updated_fact_table_deps AS ( UPDATE fact_loader.fact_table_dep_queue_table_deps ftd SET last_cutoff_id = uk.last_cutoff_id, last_cutoff_source_time = uk.last_cutoff_source_time FROM update_key_fact_dep uk WHERE ftd.fact_table_dep_queue_table_dep_id = uk.fact_table_dep_queue_table_dep_id RETURNING uk.*) UPDATE fact_loader.fact_tables ft SET last_refresh_source_cutoff = uqtd.last_cutoff_source_time, last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE FROM (SELECT fact_table_id, last_cutoff_source_time FROM updated_queue_table_deps --Must use UNION to get only distinct values UNION SELECT fact_table_id, last_cutoff_source_time FROM updated_fact_table_deps) uqtd WHERE uqtd.fact_table_id = ft.fact_table_id; $$ AS metadata_update_sql_out FROM final_queue_sql) SELECT raw_queued_changes_sql_out, gathered_queued_changes_sql_out , format($$ %s %s$$, process_queue_snippet, gathered_queued_changes_sql_out) AS process_queue_sql_out, metadata_update_sql_out FROM final_outputs; $BODY$ LANGUAGE SQL; /* pg_fact_loader--1.5--1.6.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file.
\quit DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP VIEW fact_loader.prioritized_jobs; -- Must ensure we have the fully schema-qualified regprod before converting to text SET search_path TO ''; ALTER TABLE fact_loader.debug_process_queue ALTER COLUMN proid TYPE TEXT; ALTER TABLE fact_loader.debug_process_queue ADD CONSTRAINT check_proid CHECK (COALESCE(proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_delete_merge_proid CHECK (COALESCE(default_delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_insert_merge_proid CHECK (COALESCE(default_insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_update_merge_proid CHECK (COALESCE(default_update_merge_proid::REGPROC, 'boolin') IS NOT 
NULL); ALTER TABLE fact_loader.fact_tables ALTER COLUMN daily_scheduled_proid TYPE TEXT; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_daily_scheduled_proid CHECK (COALESCE(daily_scheduled_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_tables ALTER COLUMN fact_table_agg_proid TYPE TEXT; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_fact_table_agg_proid CHECK (COALESCE(fact_table_agg_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL); RESET search_path; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!! 
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. 
The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; /* pg_fact_loader--1.6--1.7.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP VIEW fact_loader.prioritized_jobs; ALTER TABLE fact_loader.fact_tables ADD COLUMN pre_execute_hook_sql TEXT; CREATE OR REPLACE FUNCTION fact_loader.load(p_fact_table_id INT) RETURNS VOID AS $BODY$ DECLARE v_process_queue_sql text; v_execute_sql text; v_metadata_update_sql text; v_debug_rec record; v_debug_text text = ''; v_pre_execute_hook_sql text = ''; BEGIN /*** There are 3 basic steps to this load: 1. Gather all queue table changes and insert them into a consolidated process_queue 2. Update the metadata indicating the last records updated for both the queue tables and fact table */ /**** Get SQL to insert new data into the consolidated process_queue, and SQL to update metadata for last_cutoffs. */ SELECT process_queue_sql, metadata_update_sql INTO v_process_queue_sql, v_metadata_update_sql FROM fact_loader.sql_builder(p_fact_table_id); /**** Populate the consolidated queue This just creates a temp table with all changes to be processed */ RAISE DEBUG 'Populating Queue for fact_table_id %: %', p_fact_table_id, v_process_queue_sql; EXECUTE COALESCE(v_process_queue_sql, $$SELECT 'No queue data' AS result$$); /**** Pre-execute hook */ SELECT pre_execute_hook_sql INTO v_pre_execute_hook_sql FROM fact_loader.fact_tables WHERE fact_table_id = p_fact_table_id; EXECUTE COALESCE(v_pre_execute_hook_sql, $$SELECT 'No pre-execute hook.' AS result$$); /**** For DEBUG purposes only to view the actual process_queue. Requires setting log_min_messages to DEBUG. 
*/ IF current_setting('log_min_messages') = 'debug3' THEN INSERT INTO fact_loader.debug_process_queue (process_queue_id, fact_table_id, proid, key_value, row_created_at, row_updated_at, source_change_date) -- the row timestamps are not populated, so we set them here SELECT process_queue_id, fact_table_id, proid, key_value, now(), now(), source_change_date FROM process_queue; END IF; /**** With data now in the process_queue, the execute_queue function builds the SQL to execute. Save this SQL in a variable and execute it. If there is no data to execute, this is a no-op select statement. */ SELECT sql INTO v_execute_sql FROM fact_loader.execute_queue(p_fact_table_id); RAISE DEBUG 'Executing Queue for fact_table_id %: %', p_fact_table_id, v_execute_sql; EXECUTE COALESCE(v_execute_sql, $$SELECT 'No queue data to execute' AS result$$); /**** With everything finished, we now update the metadata for the fact_table. Even if no data was processed, we will still move forward last_refresh_attempted_at. last_refresh_succeeded will be marked true always for now. It could in the future be used to indicate a failure in case of a caught error. */ RAISE DEBUG 'Updating metadata for fact_table_id %: %', p_fact_table_id, v_metadata_update_sql; EXECUTE COALESCE(v_metadata_update_sql, format( $$UPDATE fact_loader.fact_tables ft SET last_refresh_attempted_at = now(), last_refresh_succeeded = TRUE WHERE fact_table_id = %s; $$, p_fact_table_id)); END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!! 
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. 
The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.if_name AS provider_name, rt.replication_set_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; /* pg_fact_loader--1.7--2.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP FUNCTION fact_loader.queue_table_delay_info(); DROP FUNCTION fact_loader.logical_subscription(); CREATE TYPE fact_loader.driver AS ENUM ('pglogical', 'native'); /*** This function exists mostly to easily mock out for testing purposes. */ CREATE FUNCTION fact_loader.subscription() RETURNS TABLE (oid OID, subpublications text[], subconninfo text) AS $BODY$ BEGIN RETURN QUERY SELECT s.oid, s.subpublications, s.subconninfo FROM pg_subscription s; END; $BODY$ LANGUAGE plpgsql; /*** This function exists mostly to easily mock out for testing purposes. */ CREATE FUNCTION fact_loader.subscription_rel() RETURNS TABLE (srsubid OID, srrelid OID) AS $BODY$ BEGIN RETURN QUERY SELECT sr.srsubid, sr.srrelid FROM pg_subscription_rel sr; END; $BODY$ LANGUAGE plpgsql; /*** This function exists mostly to easily mock out for testing purposes. 
*/
-- Unified view of all logical subscriptions feeding this database, tagged
-- with which driver (pglogical or native) each one comes from. Columns not
-- applicable to a driver are returned as NULL (e.g. subconninfo/dbname for
-- pglogical rows).
CREATE FUNCTION fact_loader.logical_subscription()
RETURNS TABLE (subid OID, subpublications text[], subconninfo text, dbname text, driver fact_loader.driver)
AS $BODY$
BEGIN

IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN

-- Dynamic SQL here so this function body parses even when the pglogical
-- schema is not installed; the pglogical.subscription reference is only
-- resolved at execution time inside this branch.
RETURN QUERY EXECUTE $$
SELECT sub_origin_if AS subid, sub_replication_sets AS subpublications, null::text AS subconninfo, null::text AS dbname, 'pglogical'::fact_loader.driver AS driver
FROM pglogical.subscription
UNION ALL
SELECT oid, subpublications, subconninfo, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver
FROM fact_loader.subscription();
$$;
ELSE

-- NOTE(review): regexp_matches() in the select list returns zero rows when
-- subconninfo contains no "dbname=" token, which would silently drop that
-- subscription from the result -- confirm subconninfo always carries dbname.
RETURN QUERY
SELECT oid, subpublications, subconninfo, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver
FROM fact_loader.subscription();

END IF;

END;
$BODY$
LANGUAGE plpgsql;

-- Per queue table: which publication/publisher feeds it and the most recent
-- replicated "tick" time (source_time), used as the safe upper bound when
-- processing queue changes.
CREATE OR REPLACE FUNCTION fact_loader.queue_table_delay_info()
RETURNS TABLE("publication_name" text,
"queue_of_base_table_relid" regclass,
"publisher" name,
"source_time" timestamp with time zone)
AS
$BODY$
/***
This function exists to allow no necessary dependency
to exist on pglogical_ticker.
If the extension is used, it will return data from its native functions, if not, it will return a null data set matching the structure ***/ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical_ticker') THEN RETURN QUERY EXECUTE $$ -- pglogical SELECT unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , n.if_name AS publisher , t.source_time FROM fact_loader.queue_tables qt JOIN fact_loader.logical_subscription() s ON qt.pglogical_node_if_id = s.subid AND s.driver = 'pglogical' JOIN pglogical.node_interface n ON n.if_id = qt.pglogical_node_if_id JOIN pglogical_ticker.all_subscription_tickers() t ON t.provider_name = n.if_name UNION ALL -- native logical SELECT unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , t.db AS publisher , t.tick_time AS source_time FROM fact_loader.queue_tables qt JOIN fact_loader.subscription_rel() psr ON psr.srrelid = qt.queue_table_relid JOIN fact_loader.logical_subscription() s ON psr.srsubid = s.subid JOIN logical_ticker.tick t ON t.db = s.dbname UNION ALL -- local SELECT NULL::text AS publication_name , qt.queue_of_base_table_relid , NULL::name AS publisher , now() AS source_time FROM fact_loader.queue_tables qt WHERE qt.pglogical_node_if_id IS NULL AND NOT EXISTS ( SELECT 1 FROM fact_loader.subscription_rel() psr WHERE psr.srrelid = qt.queue_table_relid );$$; ELSE RETURN QUERY -- local SELECT NULL::TEXT AS publication_name , qt.queue_of_base_table_relid , NULL::NAME AS publisher --source_time is now() if queue tables are not pglogical-replicated, which is assumed if no ticker , now() AS source_time FROM fact_loader.queue_tables qt WHERE NOT EXISTS (SELECT 1 FROM fact_loader.subscription_rel() psr WHERE psr.srrelid = qt.queue_table_relid) UNION ALL -- native logical (WITH logical_subscription_with_db AS ( SELECT *, (regexp_matches(subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS db FROM fact_loader.logical_subscription() ) SELECT 
unnest(coalesce(subpublications,'{NULL}')) AS publication_name , qt.queue_of_base_table_relid , t.db AS publisher , t.tick_time AS source_time FROM fact_loader.queue_tables qt JOIN fact_loader.subscription_rel() psr ON psr.srrelid = qt.queue_table_relid JOIN logical_subscription_with_db s ON psr.srsubid = s.subid JOIN logical_ticker.tick t ON t.db = s.db); END IF; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. 
That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.publisher AS provider_name, rt.publication_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; CREATE OR REPLACE VIEW fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. 
It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. */ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. 
*/ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE;
pg_fact_loader-2.0.1/pg_fact_loader-sql-maker.sh000077500000000000000000000022651451107006500216310ustar00rootroot00000000000000#!/usr/bin/env bash

# Assembles the extension SQL scripts for a new release:
# builds pg_fact_loader--<last>--<new>.sql (the upgrade script) from the
# individual schema/functions/views source files, then appends it to a copy
# of the previous full install script to produce pg_fact_loader--<new>.sql.
set -eu

last_version=1.7
new_version=2.0
last_version_file=pg_fact_loader--${last_version}.sql
new_version_file=pg_fact_loader--${new_version}.sql
update_file=pg_fact_loader--${last_version}--${new_version}.sql

# Start from scratch each run so reruns are idempotent.
rm -f $update_file
rm -f $new_version_file

# Write the standard extension-script preamble that aborts plain psql sourcing.
create_update_file_with_header() {
cat << EOM > $update_file
/* $update_file */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit
EOM
}

# Append a literal SQL snippet to a file.
# NOTE(review): not referenced anywhere else in this script -- possibly kept
# for ad-hoc use; confirm before removing.
add_sql_to_file() {
sql=$1
file=$2
echo "$sql" >> $file
}

# Append source file $1 to destination $2, separated by a blank line.
add_file() {
s=$1
d=$2
(cat "${s}"; echo; echo) >> "$d"
}

create_update_file_with_header

# Only copy diff and new files after last version, and add the update script
add_file schema/2.0.sql $update_file
add_file functions/subscription.sql $update_file
add_file functions/subscription_rel.sql $update_file
add_file functions/logical_subscription.sql $update_file
add_file functions/queue_table_delay_info.sql $update_file
add_file views/queue_deps_all.sql $update_file
add_file views/queue_deps_all_with_retrieval.sql $update_file

# make new version file
cp $last_version_file $new_version_file
cat $update_file >> $new_version_file
pg_fact_loader-2.0.1/pg_fact_loader.control000066400000000000000000000002411451107006500207720ustar00rootroot00000000000000# pg_fact_loader extension
comment = 'build fact tables with Postgres'
module_pathname = '$libdir/pg_fact_loader'
default_version = '2.0'
schema = 'fact_loader'
pg_fact_loader-2.0.1/schema/000077500000000000000000000000001451107006500157025ustar00rootroot00000000000000pg_fact_loader-2.0.1/schema/1.4.sql000066400000000000000000000001261451107006500167240ustar00rootroot00000000000000ALTER TABLE fact_loader.debug_process_queue DROP CONSTRAINT
debug_process_queue_pkey; pg_fact_loader-2.0.1/schema/1.6.sql000066400000000000000000000061601451107006500167320ustar00rootroot00000000000000DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP VIEW fact_loader.prioritized_jobs; -- Must ensure we have the fully schema-qualified regprod before converting to text SET search_path TO ''; ALTER TABLE fact_loader.debug_process_queue ALTER COLUMN proid TYPE TEXT; ALTER TABLE fact_loader.debug_process_queue ADD CONSTRAINT check_proid CHECK (COALESCE(proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_dep_queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_delete_merge_proid CHECK (COALESCE(default_delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.fact_table_deps ADD CONSTRAINT check_default_insert_merge_proid CHECK (COALESCE(default_insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_table_deps ALTER COLUMN default_update_merge_proid TYPE TEXT; ALTER TABLE 
fact_loader.fact_table_deps ADD CONSTRAINT check_default_update_merge_proid CHECK (COALESCE(default_update_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_tables ALTER COLUMN daily_scheduled_proid TYPE TEXT; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_daily_scheduled_proid CHECK (COALESCE(daily_scheduled_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.fact_tables ALTER COLUMN fact_table_agg_proid TYPE TEXT; ALTER TABLE fact_loader.fact_tables ADD CONSTRAINT check_fact_table_agg_proid CHECK (COALESCE(fact_table_agg_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN delete_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_delete_merge_proid CHECK (COALESCE(delete_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN insert_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_insert_merge_proid CHECK (COALESCE(insert_merge_proid::REGPROC, 'boolin') IS NOT NULL); ALTER TABLE fact_loader.queue_table_deps ALTER COLUMN update_merge_proid TYPE TEXT; ALTER TABLE fact_loader.queue_table_deps ADD CONSTRAINT check_update_merge_proid CHECK (COALESCE(update_merge_proid::REGPROC, 'boolin') IS NOT NULL); RESET search_path; pg_fact_loader-2.0.1/schema/1.7.sql000066400000000000000000000003161451107006500167300ustar00rootroot00000000000000DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP VIEW fact_loader.prioritized_jobs; ALTER TABLE fact_loader.fact_tables ADD COLUMN pre_execute_hook_sql TEXT; pg_fact_loader-2.0.1/schema/2.0.sql000066400000000000000000000007011451107006500167200ustar00rootroot00000000000000DROP VIEW fact_loader.queue_deps_all_with_retrieval; DROP VIEW fact_loader.queue_deps_all; DROP FUNCTION fact_loader.safely_terminate_workers(); DROP FUNCTION fact_loader.launch_workers(int); DROP FUNCTION 
fact_loader.launch_worker(); DROP FUNCTION fact_loader._launch_worker(oid); DROP FUNCTION fact_loader.queue_table_delay_info(); DROP FUNCTION fact_loader.logical_subscription(); CREATE TYPE fact_loader.driver AS ENUM ('pglogical', 'native'); pg_fact_loader-2.0.1/schema/schema.sql000066400000000000000000000223711451107006500176700ustar00rootroot00000000000000/* pg_fact_loader--1.0.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION pg_fact_loader" to load this file. \quit CREATE FUNCTION fact_loader._launch_worker(oid) RETURNS pg_catalog.INT4 STRICT AS 'MODULE_PATHNAME', 'pg_fact_loader_worker' LANGUAGE C; CREATE FUNCTION fact_loader.launch_worker() RETURNS pg_catalog.INT4 STRICT AS 'SELECT fact_loader._launch_worker(oid) FROM pg_database WHERE datname = current_database();' LANGUAGE SQL; CREATE TABLE fact_loader.fact_tables ( fact_table_id SERIAL PRIMARY KEY, fact_table_relid REGCLASS NOT NULL, fact_table_agg_proid REGPROC NULL, --This may only be used to generate a merge function but is not used in automation enabled BOOLEAN NOT NULL DEFAULT FALSE, priority INT, attempt_number INT, retries_allowed INT DEFAULT 0, force_worker_priority BOOLEAN NOT NULL DEFAULT FALSE, last_refresh_source_cutoff TIMESTAMPTZ, last_refresh_attempted_at TIMESTAMPTZ, --TODO - answer if we want the worker to bail or record messages on ERROR (or both) last_refresh_succeeded BOOLEAN, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_tables UNIQUE (fact_table_relid) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_tables', ''); CREATE TABLE fact_loader.fact_table_deps ( fact_table_dep_id SERIAL PRIMARY KEY, parent_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), child_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), /***** In very many cases, you will use the same procs for insert, update, and delete even with multiple dependencies. 
This is why you must give defaults here which will be used to auto-populate fact_loader.fact_table_dep_queue_table_deps which can be overridden if necessary for each queue table. After you configure all of your fact tables and queue tables, run the function refresh_fact_table_dep_queue_table_deps manually to populate fact_table_dep_queue_table_deps, then make any changes as necessary. You can see an example of this in the test suite "seeds" file. You can also see an override example with order_emails_fact having a different proc for orders and reorders delete cases. */ default_insert_merge_proid REGPROC NOT NULL, default_update_merge_proid REGPROC NOT NULL, default_delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_fact_deps UNIQUE (parent_id, child_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.fact_table_deps', ''); CREATE TABLE fact_loader.queue_tables ( queue_table_id SERIAL PRIMARY KEY, queue_table_relid REGCLASS NOT NULL, queue_of_base_table_relid REGCLASS NOT NULL, /**** NOTE - the reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. We hope that some time down the road, this will change, and we can derive this information. */ pglogical_node_if_id INT NOT NULL, --This is the timezone for the changed_at column - if null, we assume it is timestamptz (we could check that actually) queue_table_tz TEXT, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_table UNIQUE (queue_table_relid), CONSTRAINT unique_base_table UNIQUE (queue_of_base_table_relid) ); COMMENT ON COLUMN fact_loader.queue_tables.pglogical_node_if_id IS $$The reason for this config existing here is that we have no built-in way in pglogical to know which tables belong to which pglogical node. Therefore, we need to configure that. 
We hope that some time down the road, this will change, and we can derive this information.$$; SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_tables', ''); CREATE TABLE fact_loader.queue_table_deps ( queue_table_dep_id SERIAL PRIMARY KEY, fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id), queue_table_id INT NOT NULL REFERENCES fact_loader.queue_tables (queue_table_id), relevant_change_columns NAME[], last_cutoff_id BIGINT, last_cutoff_source_time TIMESTAMPTZ, insert_merge_proid REGPROC NOT NULL, update_merge_proid REGPROC NOT NULL, delete_merge_proid REGPROC NOT NULL, row_created_at TIMESTAMPTZ DEFAULT NOW(), row_updated_at TIMESTAMPTZ, CONSTRAINT unique_queue_deps UNIQUE (fact_table_id, queue_table_id) ); SELECT pg_catalog.pg_extension_config_dump('fact_loader.queue_table_deps', ''); CREATE TABLE fact_loader.key_retrieval_sequences ( key_retrieval_sequence_id SERIAL PRIMARY KEY, queue_table_dep_id INT NOT NULL REFERENCES fact_loader.queue_table_deps (queue_table_dep_id), /**** In almost all cases, we only need to write one way to retrieve keys. The only exception is, for example, when in a delete case, you need to pass a different field (customer_id instead of order_id) to the delete_merge_proid function. You then need a different key_retrieval_sequence to handle a different field name for this delete case. By default this is NULL, meaning there is no filter, meaning the sequence applies to all events I, U, D. Otherwise, you can add scopes in which case you must have one for each of 'I','U','D'. 
*/
  filter_scope CHAR(1) NULL,
  level INT NOT NULL,
  return_columns NAME[] NOT NULL,
  is_fact_key BOOLEAN NOT NULL,
  join_to_relation REGCLASS NULL,
  join_to_column NAME NULL,
  return_columns_from_join NAME[] NULL,
  join_return_is_fact_key BOOLEAN NULL,
  CONSTRAINT unique_retrievals UNIQUE (queue_table_dep_id, filter_scope, level),
  CONSTRAINT valid_scopes CHECK (filter_scope IN ('I','U','D'))
);
SELECT pg_catalog.pg_extension_config_dump('fact_loader.key_retrieval_sequences', '');

-- Per-(fact table dependency, queue table dependency) cutoff bookkeeping and
-- merge-proc overrides; auto-populated from the defaults on fact_table_deps.
CREATE TABLE fact_loader.fact_table_dep_queue_table_deps (
  fact_table_dep_queue_table_dep_id SERIAL PRIMARY KEY,
  fact_table_dep_id INT REFERENCES fact_loader.fact_table_deps (fact_table_dep_id),
  queue_table_dep_id INT REFERENCES fact_loader.queue_table_deps (queue_table_dep_id),
  last_cutoff_id BIGINT,
  last_cutoff_source_time TIMESTAMPTZ,
  insert_merge_proid REGPROC NOT NULL,
  update_merge_proid REGPROC NOT NULL,
  delete_merge_proid REGPROC NOT NULL,
  row_created_at TIMESTAMPTZ DEFAULT NOW(),
  row_updated_at TIMESTAMPTZ,
  CONSTRAINT unique_cutoffs UNIQUE (fact_table_dep_id, queue_table_dep_id)
);

/***
Trigger function enforcing that, within one queue_table_dep_id, the rows of
key_retrieval_sequences either all use a NULL filter_scope (one sequence
covering I, U and D) or all use non-NULL scopes (separate sequences per
event) -- never a mix.

BUG FIX: the previous version tested NEW.filter_scope inside each EXISTS
subquery (directly contradicting the outer guard on NEW.filter_scope, so
both EXISTS conditions were always false and the exception could never be
raised) and compared queue_table_dep_id with <> instead of =. The check must
look at the *existing* rows' filter_scope for the *same* queue_table_dep_id,
which is also what the error message describes.
*/
CREATE OR REPLACE FUNCTION fact_loader.unique_scopes()
RETURNS TRIGGER AS
$BODY$
BEGIN
  IF (NEW.filter_scope IS NULL AND EXISTS (
    SELECT 1
    FROM fact_loader.key_retrieval_sequences
    WHERE queue_table_dep_id = NEW.queue_table_dep_id
      AND filter_scope IS NOT NULL
    ))
    OR
    (NEW.filter_scope IS NOT NULL AND EXISTS (
    SELECT 1
    FROM fact_loader.key_retrieval_sequences
    WHERE queue_table_dep_id = NEW.queue_table_dep_id
      AND filter_scope IS NULL
    ))
    THEN
    RAISE EXCEPTION $$You must either use a NULL filter_scope to cover all 3 events I, U, D
 or you must specify all 3 events separately I, U, D (For queue_table_dep_id %).
    $$, NEW.queue_table_dep_id;
  END IF;
  RETURN NEW;
END;
$BODY$
LANGUAGE plpgsql;

CREATE TRIGGER unique_scopes
BEFORE INSERT OR UPDATE ON fact_loader.key_retrieval_sequences
FOR EACH ROW
EXECUTE PROCEDURE fact_loader.unique_scopes();

/***
This table is unlogged because it only has data mid-transaction and should always be empty
*/
CREATE UNLOGGED TABLE fact_loader.process_queue (
  process_queue_id BIGSERIAL PRIMARY KEY,
  fact_table_id INT NOT NULL REFERENCES fact_loader.fact_tables (fact_table_id),
  proid REGPROC NOT NULL,
  key_value TEXT NOT NULL,
  row_created_at TIMESTAMPTZ DEFAULT NOW(),
  row_updated_at TIMESTAMPTZ
);

-- Generic audit-column maintenance: stamps row_updated_at on every write.
CREATE OR REPLACE FUNCTION fact_loader.set_row_updated_at_to_now()
RETURNS TRIGGER AS
$BODY$
BEGIN
  NEW.row_updated_at = now();
  RETURN NEW;
END;
$BODY$
LANGUAGE plpgsql;

-- The WHEN clause skips the trigger when the caller already set the current
-- timestamp, avoiding a redundant row rewrite.
CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.fact_tables
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.fact_table_deps
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.queue_tables
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.queue_table_deps
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.fact_table_dep_queue_table_deps
FOR EACH ROW
WHEN (NEW.row_updated_at IS DISTINCT FROM now())
EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now();

CREATE TRIGGER set_row_updated_at_to_now
BEFORE INSERT OR UPDATE ON fact_loader.process_queue
FOR EACH ROW
WHEN
(NEW.row_updated_at IS DISTINCT FROM now()) EXECUTE PROCEDURE fact_loader.set_row_updated_at_to_now(); --TODO -- CREATE TRIGGER verify_columns_are_really_columns pg_fact_loader-2.0.1/sql/000077500000000000000000000000001451107006500152415ustar00rootroot00000000000000pg_fact_loader-2.0.1/sql/01_create_ext.sql000066400000000000000000000003501451107006500204030ustar00rootroot00000000000000-- Allow running regression suite with upgrade paths \set v `echo ${FROMVERSION:-2.0}` SET client_min_messages TO warning; CREATE EXTENSION pglogical; CREATE EXTENSION pglogical_ticker; CREATE EXTENSION pg_fact_loader VERSION :'v'; pg_fact_loader-2.0.1/sql/02_schema.sql000066400000000000000000000310671451107006500175320ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; DROP SCHEMA IF EXISTS test, test_fact, audit, test_audit_raw CASCADE ; TRUNCATE fact_loader.fact_tables CASCADE; TRUNCATE fact_loader.queue_tables CASCADE; --We use no serial/identity types here purely to be able to have consistency across multiple re-testing CREATE SCHEMA test; CREATE TABLE test.customers (customer_id INT PRIMARY KEY, customer_number text, phone TEXT, age INT); CREATE TABLE test.orders (order_id INT PRIMARY KEY, customer_id INT REFERENCES test.customers (customer_id) ON DELETE CASCADE, order_date DATE, total NUMERIC(10,2), row_updated_at TIMESTAMPTZ); CREATE TABLE test.emails (email_id INT PRIMARY KEY, customer_id INT REFERENCES test.customers (customer_id) ON DELETE CASCADE, read BOOLEAN); CREATE TABLE test.promos (promo_id INT PRIMARY KEY, description TEXT); CREATE TABLE test.email_promos (email_promo_id INT PRIMARY KEY, email_id INT REFERENCES test.emails (email_id) ON DELETE CASCADE, promo_id INT REFERENCES test.promos (promo_id) ON DELETE CASCADE); CREATE TABLE test.products (product_id INT PRIMARY KEY, product_name NAME); CREATE TABLE test.order_products (order_product_id INT 
PRIMARY KEY, order_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE, product_id INT REFERENCES test.products (product_id) ON DELETE CASCADE); --This table will test having to do multiple joins from changes to a table - join to orders, join to customers, in order to update customers_fact CREATE TABLE test.order_product_promos (order_product_promo_id INT PRIMARY KEY, order_product_id INT NOT NULL REFERENCES test.order_products (order_product_id) ON DELETE CASCADE, promo_id INT NOT NULL REFERENCES test.promos (promo_id) ON DELETE CASCADE); --This table will test multiple columns referring to a key of a fact table (orders.order_id) CREATE TABLE test.reorders (reorder_id INT PRIMARY KEY, base_order_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE, reorder_from_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE, reorder_to_id INT REFERENCES test.orders (order_id) ON DELETE CASCADE); CREATE SCHEMA test_fact; CREATE TABLE test_fact.customers_fact (customer_id INT PRIMARY KEY, phone TEXT, age INT, last_order_id INT, order_product_count INT, order_product_promo_ids INT[], row_updated_at TIMESTAMPTZ); CREATE TABLE test_fact.orders_fact (order_id INT PRIMARY KEY, customer_id INT, order_date DATE, total NUMERIC(10,2), is_reorder BOOLEAN, row_updated_at TIMESTAMPTZ); --This is a silly dependent fact table definition, but will test correct updating of a fact table that depends on other fact tables CREATE TABLE test_fact.customersorders_fact (order_id INT PRIMARY KEY, customer_id INT, phone TEXT, age INT, max_order_date DATE, min_total NUMERIC(10,2), row_updated_at TIMESTAMPTZ); --This fact table def is an example of both a fact and base table dependency CREATE TABLE test_fact.order_emails_fact (order_id INT PRIMARY KEY, customer_id INT, order_date DATE, total NUMERIC(10,2), is_reorder BOOLEAN, num_emails INT, num_read INT, row_updated_at TIMESTAMPTZ); --This fact table tests nested fact table deps CREATE TABLE test_fact.customersorders_summary_fact 
(customer_id INT PRIMARY KEY, rows_in_customersorders_fact INT); --This fact table depends only on customers, which other fact tables depend on, and also emails, which the customers and test_fact.orders_fact do not depend on CREATE TABLE test_fact.emails_fact (email_id INT PRIMARY KEY, read BOOLEAN, promo_count INT); --This is to test range value tables CREATE TABLE test_fact.customer_order_history_fact (as_of_date daterange, customer_id INT, total_orders INT, last_order_date DATE, row_updated_at TIMESTAMPTZ, PRIMARY KEY (customer_id, as_of_date)); CREATE OR REPLACE FUNCTION test_fact.customers_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.customers_fact AS $BODY$ BEGIN RETURN QUERY SELECT customer_id, phone, age, os.last_order_id, ops.order_product_count::INT, oppi.order_product_promo_ids, now() AS row_updated_at FROM test.customers c LEFT JOIN LATERAL (SELECT MAX(order_id) AS last_order_id FROM test.orders o WHERE o.customer_id = c.customer_id) os ON TRUE LEFT JOIN LATERAL (SELECT COUNT(1) AS order_product_count FROM test.orders o INNER JOIN test.order_products op ON op.order_id = o.order_id WHERE o.customer_id = c.customer_id ) ops ON TRUE LEFT JOIN LATERAL (SELECT array_agg(opp.promo_id ORDER BY opp.promo_id) AS order_product_promo_ids FROM test.order_product_promos opp INNER JOIN test.order_products op ON opp.order_product_id = op.order_product_id INNER JOIN test.orders o ON op.order_id = o.order_id WHERE o.customer_id = c.customer_id) oppi ON TRUE WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customers_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customers_fact c WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.orders_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.orders_fact AS $BODY$ BEGIN RETURN QUERY SELECT order_id, customer_id, order_date, total, is_reorder, now() AS row_updated_at FROM test.orders o LEFT 
JOIN LATERAL (SELECT EXISTS (SELECT 1 FROM test.reorders ro WHERE ro.reorder_to_id = o.order_id) AS is_reorder) ros ON TRUE WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.orders_fact_delete(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.orders_fact c WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION test_fact.customersorders_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.customersorders_fact AS $BODY$ BEGIN RETURN QUERY SELECT order_id, customer_id, phone, age, MAX(order_date), MIN(total)::NUMERIC(10,2), now() AS row_updated_at FROM test_fact.customers_fact ff INNER JOIN test_fact.orders_fact bf USING (customer_id) WHERE ff.customer_id = p_customer_id GROUP BY order_id, customer_id, phone, age; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customersorders_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customersorders_fact c WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION test_fact.customersorders_summary_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.customersorders_summary_fact AS $BODY$ BEGIN RETURN QUERY SELECT customer_id, COUNT(1)::INT AS rows_in_customersorders_fact FROM test_fact.customersorders_fact WHERE customer_id = p_customer_id GROUP BY customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customersorders_summary_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customersorders_summary_fact c WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; /*** This fact table def is an example of both a fact and base table dependency */ CREATE OR REPLACE FUNCTION test_fact.order_emails_fact_aggregator(p_customer_id INT) RETURNS SETOF test_fact.order_emails_fact AS $BODY$ BEGIN RETURN QUERY SELECT order_id, customer_id, order_date, total, is_reorder, es.num_emails::INT, es.num_read::INT, now() AS 
row_updated_at FROM test_fact.orders_fact of LEFT JOIN LATERAL (SELECT COUNT(1) AS num_emails, SUM(CASE WHEN read THEN 1 ELSE 0 END) AS num_read FROM test.emails e WHERE e.customer_id = of.customer_id) es ON TRUE WHERE of.customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.order_emails_fact_delete(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.order_emails_fact c WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION test_fact.emails_fact_aggregator(p_email_id INT) RETURNS SETOF test_fact.emails_fact AS $BODY$ BEGIN RETURN QUERY SELECT email_id, read, promo_count::INT FROM test.emails e LEFT JOIN LATERAL (SELECT COUNT(1) AS promo_count FROM test.email_promos ep WHERE ep.email_id = e.email_id) eps ON TRUE WHERE email_id = p_email_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.emails_fact_delete(p_email_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.emails_fact c WHERE email_id = p_email_id; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customer_order_history_fact_merge(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN PERFORM test_fact.customer_order_history_fact_record_merge(o.*) FROM test.orders o WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; --TODO - this assumes inserts always have a greater or equal order_date - but is that just implementation? 
CREATE FUNCTION test_fact.customer_order_history_fact_record_merge(p_order test.orders) RETURNS VOID AS $BODY$ DECLARE v_add_to_total_orders integer = 1; BEGIN WITH ended_last_fact AS (UPDATE test_fact.customer_order_history_fact SET as_of_date = daterange(lower(as_of_date), p_order.order_date) , row_updated_at = p_order.row_updated_at WHERE customer_id = p_order.customer_id AND lower(as_of_date) <> p_order.order_date AND upper(as_of_date) = 'infinity' RETURNING *) INSERT INTO test_fact.customer_order_history_fact AS f (as_of_date, customer_id, total_orders, last_order_date, row_updated_at) SELECT daterange(p_order.order_date, 'infinity'), p_order.customer_id, COALESCE(ended_last_fact.total_orders, 0) + v_add_to_total_orders AS total_orders, p_order.order_date, now() FROM (SELECT p_order.customer_id) nes LEFT JOIN ended_last_fact ON nes.customer_id = ended_last_fact.customer_id ON CONFLICT (customer_id, as_of_date) DO UPDATE SET total_orders = f.total_orders + v_add_to_total_orders , last_order_date = p_order.order_date , row_updated_at = now(); END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customer_order_history_fact_update(p_order_id INT) RETURNS VOID AS $BODY$ DECLARE v_customer_id INT = (SELECT customer_id FROM test.orders WHERE order_id = p_order_id); BEGIN --For simplicities sake for this unusual event, just drop and rebuild history DELETE FROM test_fact.customer_order_history_fact cohf WHERE customer_id = v_customer_id; PERFORM test_fact.customer_order_history_fact_record_merge(o_ordered.*) FROM (SELECT * FROM test.orders WHERE customer_id = v_customer_id ORDER BY order_id) o_ordered; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customer_order_history_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN --For simplicities sake for this unusual event, just drop and rebuild history DELETE FROM test_fact.customer_order_history_fact cohf WHERE customer_id = p_customer_id; PERFORM 
test_fact.customer_order_history_fact_record_merge(o_ordered.*) FROM (SELECT * FROM test.orders WHERE customer_id = p_customer_id ORDER BY order_id) o_ordered; END; $BODY$ LANGUAGE plpgsql; SELECT fact_loader.create_table_loader_function((schemaname||'.'||relname||'_aggregator')::REGPROC,relid,'{row_updated_at}') FROM pg_stat_user_tables WHERE relname IN('customers_fact','orders_fact','customersorders_fact','emails_fact','order_emails_fact','customersorders_summary_fact') ORDER BY schemaname, relname; INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customers_fact'::REGCLASS, 'test_fact.customers_fact_aggregator'::REGPROC, 1); INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.orders_fact'::REGCLASS, 'test_fact.orders_fact_aggregator'::REGPROC, 2); --TODO feature INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customersorders_fact'::REGCLASS, 'test_fact.customersorders_fact_aggregator'::REGPROC, 3); INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.emails_fact' ::REGCLASS, 'test_fact.emails_fact_aggregator'::REGPROC, 4); --TODO feature INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.order_emails_fact' ::REGCLASS, 'test_fact.order_emails_fact_aggregator'::REGPROC, 5); --TODO feature INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customer_order_history_fact' ::REGCLASS, NULL, 6); --Nested fact table deps INSERT INTO fact_loader.fact_tables (fact_table_relid, fact_table_agg_proid, priority) VALUES ('test_fact.customersorders_summary_fact' ::REGCLASS, 'test_fact.customersorders_summary_fact_aggregator'::REGPROC, 7); 
pg_fact_loader-2.0.1/sql/03_audit.sql000066400000000000000000001002771451107006500174010ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; CREATE EXTENSION IF NOT EXISTS hstore; CREATE SCHEMA IF NOT EXISTS audit; CREATE OR REPLACE FUNCTION audit.no_dml_on_audit_table() RETURNS TRIGGER AS $$ BEGIN RAISE EXCEPTION 'No common-case updates/deletes/truncates allowed on audit table'; RETURN NULL; END; $$ LANGUAGE plpgsql; /*** TO BUILD THE REST OF THE AUDIT SQL: SELECT format('./audit.sh %s %s %s >> sql/03_audit.sql', schemaname, relname, pkey) AS script FROM pg_stat_user_tables st INNER JOIN LATERAL (SELECT a.attname AS pkey FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = st.relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aft ON TRUE WHERE st.schemaname = 'test' ORDER BY schemaname, relname; */ CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.customers_audit ( customers_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_customers"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.customers_audit_customers_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."customers_audit"("customers_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.customers FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_customers" ('customer_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.email_promos_audit ( email_promos_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_email_promos"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.email_promos_audit_email_promos_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."email_promos_audit"("email_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.email_promos FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_email_promos" ('email_promo_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.emails_audit ( emails_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_emails"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.emails_audit_emails_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."emails_audit"("emails_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.emails FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_emails" ('email_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.order_product_promos_audit ( order_product_promos_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_order_product_promos"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.order_product_promos_audit_order_product_promos_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."order_product_promos_audit"("order_product_promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.order_product_promos FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_order_product_promos" ('order_product_promo_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.order_products_audit ( order_products_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_order_products"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.order_products_audit_order_products_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."order_products_audit"("order_products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.order_products FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_order_products" ('order_product_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.orders_audit ( orders_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_orders"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.orders_audit_orders_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."orders_audit"("orders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.orders FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_orders" ('order_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.products_audit ( products_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_products"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.products_audit_products_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."products_audit"("products_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.products FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_products" ('product_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.promos_audit ( promos_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_promos"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.promos_audit_promos_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."promos_audit"("promos_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.promos FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_promos" ('promo_id'); CREATE SCHEMA IF NOT EXISTS test_audit_raw; CREATE TABLE test_audit_raw.reorders_audit ( reorders_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp without time zone NOT NULL, operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_audit_raw"."audit_test_reorders"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_audit_raw.reorders_audit_reorders_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_audit_raw"."reorders_audit"("reorders_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test.reorders FOR EACH ROW EXECUTE PROCEDURE "test_audit_raw"."audit_test_reorders" ('reorder_id'); pg_fact_loader-2.0.1/sql/04_seeds.sql000066400000000000000000000513141451107006500173740ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; --Test at least a column as timestamptz ALTER TABLE test_audit_raw.emails_audit ALTER COLUMN changed_at TYPE TIMESTAMPTZ; INSERT INTO test.customers (customer_id, customer_number, phone, age) SELECT generate_series, 'cust2018'||generate_series, '000123456'||RIGHT(generate_series::TEXT,1), 35 FROM generate_series(1,10); INSERT INTO test.products (product_id, product_name) VALUES (1,'Sour Candy'), (2,'Pool Table'), (3,'One Pocket Book'), (4,'Fast Turbo Car'), (5,'Black Cassock'), (6,'Pacifier'), (7,'Book Light'), (8,'A Dozen Roses'); INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (1, 1, '2018-04-10', 100.00), (2, 3, '2018-04-11', 200.00), (3, 5, '2018-04-12', 2000.00); INSERT INTO test.order_products (order_product_id, order_id, product_id) VALUES (1, 1, 1), (2, 1, 3), (3, 1, 5), 
(4, 2, 7), (5, 2, 8), (6, 3, 2); INSERT INTO test.promos (promo_id, description) VALUES (1, '50% off 9 foot pool table with real Italian slate'); INSERT INTO test.emails (email_id, customer_id, read) VALUES (1, 5, true); INSERT INTO test.email_promos (email_promo_id, email_id, promo_id) VALUES (1, 1, 1); INSERT INTO test.order_product_promos (order_product_promo_id, order_product_id, promo_id) VALUES (1, 6, 1); INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (4, 1, '2018-04-13', 100.00); INSERT INTO test.reorders (reorder_id, base_order_id, reorder_from_id, reorder_to_id) VALUES (1, 1, 1, 4); INSERT INTO fact_loader.queue_tables (queue_table_relid, queue_of_base_table_relid, pglogical_node_if_id, queue_table_tz) SELECT st.relid::REGCLASS, sts.relid::REGCLASS, 0, CASE WHEN st.relname = 'emails_audit' THEN NULL ELSE 'America/Chicago' END FROM (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) st INNER JOIN (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) sts ON sts.schemaname||'_audit_raw' = st.schemaname AND sts.relname||'_audit' = st.relname WHERE st.schemaname = 'test_audit_raw'; SELECT fact_loader.add_batch_id_fields(); /**** Configuration for customers_fact */ --Queue tables WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.customers_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.customers_fact_merge'::REGPROC AS update_merge_proid, CASE WHEN queue_of_base_table_relid = 'test.customers'::REGCLASS THEN 'test_fact.customers_fact_delete'::REGPROC ELSE 'test_fact.customers_fact_merge'::REGPROC END AS delete_merge_proid, CASE WHEN queue_of_base_table_relid = 'test.customers'::REGCLASS THEN '{phone, age}'::TEXT[] WHEN queue_of_base_table_relid = 'test.orders'::REGCLASS --This update may be implausible, but would affect the fact table THEN '{customer_id}'::TEXT[] --Let's just 
consider that any update to the other tables should cause concern and we want to be safe and refresh all ELSE NULL END AS relevant_change_columns FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test.customers'::REGCLASS, 'test.orders'::REGCLASS, 'test.order_products'::REGCLASS, 'test.order_product_promos'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, level, return_columns, is_fact_key, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key) SELECT queue_table_dep_id, 1, '{customer_id}'::name[], true, null, null, null::name[], null FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.customers'::REGCLASS, 'test.orders'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 1, '{order_id}', false, 'test.orders'::REGCLASS, 'order_id', '{customer_id}', true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.order_products'::REGCLASS) 
UNION ALL /**** These 2 are an example of a dependency requiring multiple joins to get the customer_id key needed to update the customers_fact table */ SELECT queue_table_dep_id, 1, '{order_product_id}', false, 'test.order_products'::REGCLASS, 'order_product_id', '{order_id}', false FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.order_product_promos'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 2, '{order_id}', false, 'test.orders'::REGCLASS, 'order_id', '{customer_id}', true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.order_product_promos'::REGCLASS); /**** Configuration for orders_fact */ --Queue tables INSERT INTO fact_loader.queue_table_deps ( fact_table_id, queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid ) SELECT fact_table_id ,(SELECT queue_table_id FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN ('test.orders'::REGCLASS)) , '{order_date, total}'::TEXT[] , 'test_fact.orders_fact_merge'::REGPROC AS insert_merge_proid , 'test_fact.orders_fact_merge'::REGPROC AS update_merge_proid , 'test_fact.orders_fact_delete'::REGPROC AS delete_merge_proid FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; INSERT INTO fact_loader.queue_table_deps ( fact_table_id, queue_table_id, relevant_change_columns, insert_merge_proid, update_merge_proid, delete_merge_proid ) SELECT fact_table_id ,(SELECT queue_table_id FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN ('test.reorders'::REGCLASS)) , NULL , 
'test_fact.orders_fact_merge'::REGPROC AS insert_merge_proid , 'test_fact.orders_fact_merge'::REGPROC AS update_merge_proid , 'test_fact.orders_fact_merge'::REGPROC AS delete_merge_proid FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; --Key retrieval INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key) SELECT queue_table_dep_id, evts.evt, 1, evts.return_columns, true, null, null, null::name[], null::boolean FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) CROSS JOIN (VALUES ('I','{customer_id}'::name[]), ('U','{customer_id}'::name[]), ('D','{order_id}'::name[])) evts (evt, return_columns) WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.orders'::REGCLASS) UNION ALL SELECT queue_table_dep_id, NULL, 1, '{base_order_id,reorder_from_id,reorder_to_id}', false, 'test.orders', 'order_id', '{customer_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.reorders'::REGCLASS); /**** Configuration for customersorders_fact_aggregator */ --Only deps in fact_table_deps for this fact table because it depends on no queue tables directly --TODO - revisit and add delete functions as appropriate INSERT INTO fact_loader.fact_table_deps (parent_id, child_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid) VALUES ((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS),(SELECT fact_table_id FROM fact_loader.fact_tables WHERE 
fact_table_relid = 'test_fact.customersorders_fact'::REGCLASS),'test_fact.customersorders_fact_merge','test_fact.customersorders_fact_merge','test_fact.customersorders_fact_delete'), ((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS),(SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customersorders_fact'::REGCLASS),'test_fact.customersorders_fact_merge','test_fact.customersorders_fact_merge','test_fact.customersorders_fact_delete'); /**** Configuration for order_emails_fact */ --Queue tables WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.order_emails_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.order_emails_fact_merge'::REGPROC AS update_merge_proid, 'test_fact.order_emails_fact_merge'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN('test.emails'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, level, return_columns, is_fact_key) SELECT queue_table_dep_id, 1, '{customer_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS AND queue_of_base_table_relid IN('test.emails'::REGCLASS); --Fact table deps INSERT INTO fact_loader.fact_table_deps (parent_id, child_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid) VALUES ((SELECT fact_table_id FROM 
fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS),(SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS),'test_fact.order_emails_fact_merge','test_fact.order_emails_fact_merge','test_fact.order_emails_fact_delete'); /**** Configuration for emails_fact */ --Queue tables WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.emails_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.emails_fact_merge'::REGPROC AS update_merge_proid, CASE WHEN queue_of_base_table_relid = 'test.emails'::REGCLASS THEN 'test_fact.emails_fact_delete'::REGPROC ELSE 'test_fact.emails_fact_merge'::REGPROC END AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test.emails'::REGCLASS, 'test.email_promos'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.emails_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, level, return_columns, is_fact_key) SELECT queue_table_dep_id, 1, '{email_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.emails_fact'::REGCLASS AND queue_of_base_table_relid IN( 'test.emails'::REGCLASS, 'test.email_promos'::REGCLASS); /**** Configuration for customer_order_history_fact */ --Queue tables WITH qt AS ( 
SELECT *, 'test_fact.customer_order_history_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.customer_order_history_fact_update'::REGPROC AS update_merge_proid, 'test_fact.customer_order_history_fact_delete'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN ('test.orders'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, qt.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN qt WHERE fact_table_relid = 'test_fact.customer_order_history_fact'::REGCLASS; /**** For this fact table, we need a different key_retrieval for deletes, so we enter all 3 separately */ INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key) SELECT queue_table_dep_id, evts.evt, 1, '{order_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) CROSS JOIN (VALUES ('I'),('U')) evts (evt) WHERE fact_table_relid = 'test_fact.customer_order_history_fact'::REGCLASS AND queue_of_base_table_relid IN('test.orders'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 'D', 1, '{customer_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customer_order_history_fact'::REGCLASS AND queue_of_base_table_relid IN('test.orders'::REGCLASS); /**** Configuration for test_fact.customersorders_summary_fact */ INSERT INTO fact_loader.fact_table_deps (parent_id, child_id, default_insert_merge_proid, default_update_merge_proid, default_delete_merge_proid) VALUES ((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 
'test_fact.customersorders_fact' :: REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customersorders_summary_fact' :: REGCLASS), 'test_fact.customersorders_summary_fact_merge', 'test_fact.customersorders_summary_fact_merge', 'test_fact.customersorders_summary_fact_delete'); /**** Because we need to manually adjust the dependent fact table config for at least one table, we do this manually 1. Now that configs are all in place, run fact_loader.refresh_fact_table_dep_queue_cutoffs(); to build the deps table 2. Query based on fact_table_relid and queue_table_relid to find the correct fact_table_dep_queue_table_dep_id to update 3. Set this dep to have a different delete function for this queue table */ SELECT fact_loader.refresh_fact_table_dep_queue_table_deps(); WITH to_update AS ( SELECT ftdqtd.fact_table_dep_queue_table_dep_id , qtd.queue_table_id , qt.queue_table_relid , ft.fact_table_id FROM fact_loader.fact_table_deps ftd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqtd ON ftdqtd.fact_table_dep_id = ftd.fact_table_dep_id INNER JOIN fact_loader.queue_table_deps qtd ON qtd.queue_table_dep_id = ftdqtd.queue_table_dep_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE fact_table_relid = 'test_fact.order_emails_fact'::REGCLASS AND qt.queue_table_relid = 'test_audit_raw.reorders_audit'::REGCLASS ) UPDATE fact_loader.fact_table_dep_queue_table_deps SET delete_merge_proid = 'test_fact.order_emails_fact_merge' WHERE fact_table_dep_queue_table_dep_id = (SELECT fact_table_dep_queue_table_dep_id FROM to_update); /**** Both of these next 2 are the same situation because one depends on the other ****/ WITH to_update AS ( SELECT ftdqtd.fact_table_dep_queue_table_dep_id , qtd.queue_table_id , qt.queue_table_relid , ft.fact_table_id , ft.fact_table_relid FROM fact_loader.fact_table_deps ftd INNER JOIN 
fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqtd ON ftdqtd.fact_table_dep_id = ftd.fact_table_dep_id INNER JOIN fact_loader.queue_table_deps qtd ON qtd.queue_table_dep_id = ftdqtd.queue_table_dep_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE fact_table_relid = 'test_fact.customersorders_fact'::REGCLASS AND qt.queue_table_relid IN('test_audit_raw.reorders_audit'::REGCLASS,'test_audit_raw.order_product_promos_audit'::REGCLASS,'test_audit_raw.order_products_audit'::REGCLASS) ) UPDATE fact_loader.fact_table_dep_queue_table_deps SET delete_merge_proid = 'test_fact.customersorders_fact_merge' WHERE fact_table_dep_queue_table_dep_id IN (SELECT fact_table_dep_queue_table_dep_id FROM to_update); WITH to_update AS ( SELECT ftdqtd.fact_table_dep_queue_table_dep_id , qtd.queue_table_id , qt.queue_table_relid , ft.fact_table_id , ft.fact_table_relid FROM fact_loader.fact_table_deps ftd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = ftd.child_id INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqtd ON ftdqtd.fact_table_dep_id = ftd.fact_table_dep_id INNER JOIN fact_loader.queue_table_deps qtd ON qtd.queue_table_dep_id = ftdqtd.queue_table_dep_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE fact_table_relid = 'test_fact.customersorders_summary_fact'::REGCLASS AND qt.queue_table_relid IN('test_audit_raw.reorders_audit'::REGCLASS,'test_audit_raw.order_product_promos_audit'::REGCLASS,'test_audit_raw.order_products_audit'::REGCLASS) ) UPDATE fact_loader.fact_table_dep_queue_table_deps SET delete_merge_proid = 'test_fact.customersorders_summary_fact_merge' WHERE fact_table_dep_queue_table_dep_id IN (SELECT fact_table_dep_queue_table_dep_id FROM to_update); /**** DEMO SELECT * FROM fact_loader.fact_tables ORDER BY priority; SELECT * FROM fact_loader.queue_tables ORDER BY queue_table_relid::REGCLASS::TEXT; 
SELECT ft.fact_table_relid, qt.queue_table_relid, krs.* FROM fact_loader.key_retrieval_sequences krs INNER JOIN fact_loader.queue_table_deps qtd USING (queue_table_dep_id) INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) ORDER BY qtd.queue_table_dep_id, krs.filter_scope, krs.level; SELECT qtd.queue_table_dep_id, ft.fact_table_relid, qt.queue_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, qtd.last_cutoff_source_time, qtd.insert_merge_proid, qtd.update_merge_proid, qtd.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) ORDER BY ft.fact_table_relid::TEXT, qt.queue_table_relid::TEXT; */ pg_fact_loader-2.0.1/sql/05_pgl_setup.sql000066400000000000000000000132371451107006500202760ustar00rootroot00000000000000\set d `echo ${TESTDRIVER:-pglogical}` \set x `echo ${TESTDROPEXT:-false}` SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; SELECT pglogical.create_node('test','host=localhost') INTO TEMP foonode; DROP TABLE foonode; WITH sets AS ( SELECT 'test'||generate_series AS set_name FROM generate_series(1,1) ) SELECT pglogical.create_replication_set (set_name:=s.set_name ,replicate_insert:=TRUE ,replicate_update:=TRUE ,replicate_delete:=TRUE ,replicate_truncate:=TRUE) AS result INTO TEMP repsets FROM sets s WHERE NOT EXISTS ( SELECT 1 FROM pglogical.replication_set WHERE set_name = s.set_name); DROP TABLE repsets; -- native equivalent CREATE PUBLICATION test1 WITH (publish = 'insert,update,delete'); SELECT pglogical_ticker.deploy_ticker_tables(); -- native equivalent CREATE SCHEMA IF NOT EXISTS logical_ticker; CREATE TABLE IF NOT EXISTS logical_ticker.tick ( db text DEFAULT current_database() NOT NULL PRIMARY KEY, tick_time TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, tier 
SMALLINT DEFAULT 1 NULL ); --As of pglogical_ticker 1.2, we don't tick tables not in replication uselessly, but this --would break our tests which did exactly that. So we can fix the test breakage by just adding these tables --to replication as they would be on an actual provider SELECT pglogical_ticker.add_ticker_tables_to_replication(); --The tests will manually run tick() before new data is needed -- native equivalent ALTER PUBLICATION test1 ADD TABLE logical_ticker.tick; CREATE TEMP TABLE vars AS SELECT :'d'::text as driver, :'x'::boolean as drop_ext; DO $$ DECLARE v_record RECORD; BEGIN IF (SELECT driver FROM vars) = 'native' THEN FOR v_record IN SELECT schemaname, tablename FROM pg_tables WHERE schemaname IN('test', 'test_audit_raw') LOOP EXECUTE format('ALTER PUBLICATION test1 ADD TABLE %s.%s', v_record.schemaname, v_record.tablename); END LOOP; CREATE OR REPLACE FUNCTION test.tick() RETURNS VOID AS $BODY$ BEGIN INSERT INTO logical_ticker.tick (tick_time) VALUES (now()) ON CONFLICT (db) DO UPDATE SET tick_time = now(); END;$BODY$ LANGUAGE plpgsql; CREATE TABLE public.mock_pg_subscription ( oid oid NOT NULL, subdbid oid NOT NULL, subname name NOT NULL, subowner oid NOT NULL, subenabled boolean NOT NULL, subconninfo text NOT NULL, subslotname name NOT NULL, subsynccommit text NOT NULL, subpublications text[] NOT NULL ); INSERT INTO mock_pg_subscription (oid, subdbid, subname, subowner, subenabled, subconninfo, subslotname, subsynccommit, subpublications) VALUES (10000, (SELECT oid FROM pg_database WHERE datname = current_database()), 'test', 16384, true, 'host=example.com dbname=contrib_regression', 'test', 'off', '{test1}'); CREATE OR REPLACE FUNCTION fact_loader.subscription() RETURNS TABLE (oid OID, subpublications text[], subconninfo text) AS $BODY$ BEGIN RETURN QUERY SELECT s.oid, s.subpublications, s.subconninfo FROM mock_pg_subscription s; END; $BODY$ LANGUAGE plpgsql; CREATE TABLE public.mock_pg_subscription_rel ( srsubid oid NOT NULL, srrelid oid NOT 
NULL, srsubstate "char" NOT NULL, srsublsn pg_lsn NOT NULL ); INSERT INTO mock_pg_subscription_rel (srsubid, srrelid, srsubstate, srsublsn) SELECT (SELECT oid FROM mock_pg_subscription LIMIT 1), c.oid, 'r', '0/0' FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname IN('test', 'test_audit_raw') AND c.relkind = 'r'; CREATE OR REPLACE FUNCTION fact_loader.subscription_rel() RETURNS TABLE (srsubid OID, srrelid OID) AS $BODY$ BEGIN RETURN QUERY SELECT sr.srsubid, sr.srrelid FROM mock_pg_subscription_rel sr; END; $BODY$ LANGUAGE plpgsql; IF (SELECT drop_ext FROM vars) THEN DROP EXTENSION pglogical CASCADE; END IF; ELSE UPDATE fact_loader.queue_tables SET pglogical_node_if_id = (SELECT if_id FROM pglogical.node_interface); CREATE OR REPLACE FUNCTION test.tick() RETURNS VOID AS $BODY$ BEGIN PERFORM pglogical_ticker.tick(); END;$BODY$ LANGUAGE plpgsql; END IF; END$$; /*** Mock this function so that we find results locally */ CREATE OR REPLACE FUNCTION pglogical_ticker.all_subscription_tickers() RETURNS TABLE (provider_name NAME, set_name NAME, source_time TIMESTAMPTZ) AS $BODY$ BEGIN RETURN QUERY SELECT t.provider_name, 'test1'::NAME AS set_name, t.source_time FROM pglogical_ticker.test1 t; END; $BODY$ LANGUAGE plpgsql; /*** Mock so we get what we want here also */ CREATE OR REPLACE FUNCTION fact_loader.logical_subscription() RETURNS TABLE (subid OID, subpublications text[], subconninfo text, dbname text, driver fact_loader.driver) AS $BODY$ BEGIN IF EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pglogical') THEN RETURN QUERY EXECUTE $$ SELECT if_id AS subid, '{test1}'::text[] as subpublications, null::text AS subconninfo, null::text AS dbname, 'pglogical'::fact_loader.driver AS driver FROM pglogical.node_interface UNION ALL SELECT s.oid, s.subpublications, s.subconninfo, (regexp_matches(s.subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription() s; $$; ELSE RETURN QUERY SELECT 
s.oid, s.subpublications, s.subconninfo, (regexp_matches(s.subconninfo, 'dbname=(.*?)(?=\s|$)'))[1] AS dbname, 'native'::fact_loader.driver AS driver FROM fact_loader.subscription() s; END IF; END; $BODY$ LANGUAGE plpgsql; pg_fact_loader-2.0.1/sql/06_basic_workers.sql000066400000000000000000000076451451107006500211400ustar00rootroot00000000000000SET client_min_messages TO warning; -- Client time zone should not change functionality of worker - use a different one here SET TIMEZONE TO 'UTC'; --Enable all except dep tables for now UPDATE fact_loader.fact_tables ft SET enabled = TRUE WHERE NOT EXISTS (SELECT 1 FROM fact_loader.fact_table_deps d WHERE d.child_id = ft.fact_table_id); --Move the mock replication stream forward to now SELECT test.tick(); SELECT fact_loader.worker(); SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; --test debugging feature on this table SET log_min_messages TO debug3; SELECT fact_loader.worker(); SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; RESET log_min_messages; DO $$ BEGIN IF NOT (SELECT COUNT(1) FROM fact_loader.debug_process_queue) = 3 THEN RAISE EXCEPTION '%', 'No worky'; END IF; END$$; SELECT fact_loader.worker(); SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; SELECT fact_loader.worker(); SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; SELECT fact_loader.worker(); SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; SELECT fact_loader.worker(); SELECT customer_id, as_of_date, total_orders, last_order_date FROM test_fact.customer_order_history_fact ORDER BY customer_id, as_of_date; -- Set time zone back to America/Chicago because the audit data is being logged at that time zone -- (another 
great reason NEVER to use timestamp, but functionality we need at any rate) SET TIMEZONE TO 'America/Chicago'; UPDATE test.customers SET age = 40 WHERE customer_id = 2; -- We need to make deletes handled with recursive joins as well first before testing this -- DELETE FROM test.customers WHERE customer_id = 3; /**** This should not update the fact table, because the replication stream is behind these last 2 updates */ SELECT fact_loader.worker(); SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE, -- Test 1.7 pre-hook feature pre_execute_hook_sql = 'CREATE TABLE cool_pre_execute_hook_sql (id int);' WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; SELECT test.tick(); SELECT fact_loader.worker(); SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; SELECT * FROM cool_pre_execute_hook_sql; UPDATE fact_loader.fact_tables SET pre_execute_hook_sql = NULL WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; --This would simulate an application's changes being out of order now UPDATE test.customers SET age = 41 WHERE customer_id = 2; SELECT test.tick(); SELECT fact_loader.worker(); --Pretend the transaction for this began before the update above - by lowering the actual audit_id and tx time UPDATE test.customers SET age = 42 WHERE customer_id = 2; UPDATE test_audit_raw.customers_audit SET customers_audit_id = customers_audit_id - 1000, changed_at = changed_at - interval '1 minute' WHERE customers_audit_id = (SELECT MAX(customers_audit_id) FROM test_audit_raw.customers_audit); --However, we assume fact_loader_batch_id is still in order because we have a single-threaded --predicatable order with pglogical or a local queue table fed by pg_fact_loader --This will be missed by version 1.2, but not 1.3 SELECT 
test.tick(); SELECT fact_loader.worker(); SELECT (age = 42) AS age_is_updated FROM test_fact.customers_fact WHERE customer_id = 2 ORDER BY customer_id; ALTER EXTENSION pg_fact_loader UPDATE; UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; pg_fact_loader-2.0.1/sql/07_launch_worker.sql000066400000000000000000000017621451107006500211410ustar00rootroot00000000000000-- NOTE: Original functionality of background worker has been removed. Retaining this test for consistency, -- replacing calls to launch the worker with instead direct calls to SELECT fact_loader.worker(); SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; --Make one change to guarantee we want to see a fact table change, ensure rep stream is up to date UPDATE test.customers SET phone = '0001234577' WHERE customer_id = 10; SELECT test.tick(); --Ensure this one table is prioritized UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; SELECT fact_loader.worker(); SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.customers_fact'::REGCLASS; pg_fact_loader-2.0.1/sql/08_fact_table_deps.sql000066400000000000000000000015301451107006500213670ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; --Enable the dep tables - which thereby should be placed first in line! 
UPDATE fact_loader.fact_tables ft SET enabled = TRUE WHERE EXISTS (SELECT 1 FROM fact_loader.fact_table_deps d WHERE d.child_id = ft.fact_table_id); SELECT fact_loader.worker(); SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; SELECT fact_loader.worker(); SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; /**** Nested fact table deps */ SELECT fact_loader.worker(); SELECT customer_id, rows_in_customersorders_fact FROM test_fact.customersorders_summary_fact ORDER BY customer_id; pg_fact_loader-2.0.1/sql/09_purge.sql000066400000000000000000000007021451107006500174130ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; SELECT COUNT(1) FROM test_audit_raw.customers_audit; --We call this explicitly, because the worker will take the default add_interval of 1 hour, thus --won't see any actual purging in the test suite. 
SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); SELECT COUNT(1) FROM test_audit_raw.customers_audit;pg_fact_loader-2.0.1/sql/10_delete.sql000066400000000000000000000040401451107006500175220ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; DELETE FROM test.customers WHERE customer_id = 3; SELECT pg_sleep(1); SELECT test.tick(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; SELECT customer_id, as_of_date, total_orders, last_order_date FROM test_fact.customer_order_history_fact ORDER BY customer_id, as_of_date; SELECT customer_id, rows_in_customersorders_fact FROM test_fact.customersorders_summary_fact ORDER BY customer_id; SELECT COUNT(1) FROM test_audit_raw.customers_audit; --We call this explicitly, because the worker will take the default add_interval of 1 hour, thus --won't see any actual purging in the test suite. 
SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); SELECT COUNT(1) FROM test_audit_raw.customers_audit; DELETE FROM test.reorders; SELECT pg_sleep(1); SELECT test.tick(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; pg_fact_loader-2.0.1/sql/11_more_data.sql000066400000000000000000000030231451107006500202140ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /*** Try this odd case to be sure we process all events in order correctly ***/ UPDATE test.orders SET total = 1000.00 WHERE order_id = 3; DELETE FROM test.orders WHERE order_id = 3; INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (3, 5, '2018-04-12', 2000.00); --Move the mock replication stream forward to now SELECT test.tick(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT fact_loader.worker(); SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact ORDER BY customer_id; SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; SELECT order_id, customer_id, phone, age, max_order_date, min_total FROM test_fact.customersorders_fact ORDER BY order_id; SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id; 
SELECT customer_id, as_of_date, total_orders, last_order_date FROM test_fact.customer_order_history_fact ORDER BY customer_id, as_of_date; SELECT customer_id, rows_in_customersorders_fact FROM test_fact.customersorders_summary_fact ORDER BY customer_id; pg_fact_loader-2.0.1/sql/12_no_proid.sql000066400000000000000000000050211451107006500200730ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /**** This makes no sense in reality for the fact table, but we are trying to simulate the potential issue */ WITH to_update AS (SELECT qtd.queue_table_dep_id FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE ft.fact_table_relid = 'test_fact.emails_fact'::REGCLASS AND qt.queue_table_relid = 'test_audit_raw.emails_audit'::REGCLASS) UPDATE fact_loader.queue_table_deps qtd SET insert_merge_proid = NULL FROM to_update tu WHERE tu.queue_table_dep_id = qtd.queue_table_dep_id; --We have configured for this NOT to show up as a change to the fact table INSERT INTO test.emails (email_id, customer_id, read) VALUES (2, 6, true), (3, 7, false); --The bug would have caused this to be missed UPDATE test.emails SET read = FALSE WHERE email_id = 1; --We have configured for this NOT to show up as a change to the fact table INSERT INTO test.emails (email_id, customer_id, read) VALUES (4, 8, true), (5, 9, false); SELECT test.tick(); SELECT fact_loader.worker() FROM generate_series(1,6); SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; SELECT test.tick(); SELECT fact_loader.worker() FROM generate_series(1,6); SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); SELECT COUNT(1) FROM test_audit_raw.emails_audit; /**** Now fix what we broke */ WITH to_update AS (SELECT qtd.queue_table_dep_id FROM 
fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id WHERE ft.fact_table_relid = 'test_fact.emails_fact'::REGCLASS AND qt.queue_table_relid = 'test_audit_raw.emails_audit'::REGCLASS) UPDATE fact_loader.queue_table_deps qtd SET insert_merge_proid = 'test_fact.emails_fact_merge'::REGPROC FROM to_update tu WHERE tu.queue_table_dep_id = qtd.queue_table_dep_id; SELECT test_fact.emails_fact_merge(email_id) FROM test.emails; SELECT test_fact.order_emails_fact_merge(customer_id) FROM test.customers c WHERE EXISTS (SELECT 1 FROM test.emails e WHERE e.customer_id = c.customer_id); SELECT email_id, read, promo_count FROM test_fact.emails_fact ORDER BY email_id; SELECT order_id, customer_id, order_date, total, is_reorder, num_emails, num_read FROM test_fact.order_emails_fact ORDER BY order_id;pg_fact_loader-2.0.1/sql/13_cutoff_no_dep_on_filter.sql000066400000000000000000000013771451107006500231500ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /*** Based on our config, no actual changes will be processed based on these updates. But we still want the queue to be cleared. 
*/ UPDATE test.customers SET customer_number = customer_number||'1'; SELECT COUNT(1) FROM test_audit_raw.customers_audit; SELECT test.tick(); SELECT fact_loader.worker() FROM generate_series(1,6); --Should now handle dep fact tables SELECT test.tick(); SELECT fact_loader.worker() FROM generate_series(1,6); SELECT test.tick(); SELECT fact_loader.worker() FROM generate_series(1,6); SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); SELECT COUNT(1) FROM test_audit_raw.customers_audit; pg_fact_loader-2.0.1/sql/14_null_key.sql000066400000000000000000000022321451107006500201070ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /*** Based on our config, this should not create an ERROR but should not do anything. */ INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES (5, NULL, '2018-04-10', 100.00); SELECT COUNT(1) FROM test_audit_raw.orders_audit; /**** We limit everything to this 1 table because the above grossly violates our schema and will create errors on other tables. We just want to verify that this actually runs without error when processed. 
*/ UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS; SELECT test.tick(); SELECT fact_loader.worker() FROM generate_series(1,6); SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; SELECT fact_loader.purge_queues('0 seconds'::INTERVAL); SELECT COUNT(1) FROM test_audit_raw.orders_audit; TRUNCATE test_audit_raw.orders_audit; UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.orders_fact'::REGCLASS;pg_fact_loader-2.0.1/sql/15_source_change_date.sql000066400000000000000000000155551451107006500221040ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; /**** This example tests not only using the queue_table timestamp to build a date-based table, but doing that in a different time zone than we might expect, just to show the functionality. 
So we are going to show a history table of customers from the perspective of the UK */ CREATE TABLE test_fact.customers_history_uktime_fact (customer_id INT, as_of_date DATERANGE, customer_number text, phone TEXT, age INT, PRIMARY KEY (customer_id, as_of_date)); CREATE FUNCTION test_fact.customers_history_uktime_fact_merge(p_customer_id INT, p_as_of_date DATE) RETURNS VOID AS $BODY$ BEGIN WITH it_really_changed AS ( SELECT customer_id, daterange(p_as_of_date, 'infinity') AS as_of_date, customer_number, phone, age FROM test.customers WHERE customer_id = p_customer_id EXCEPT SELECT customer_id, as_of_date, customer_number, phone, age FROM test_fact.customers_history_uktime_fact WHERE customer_id = p_customer_id AND upper(as_of_date) = 'infinity' ) , ended_last_fact AS (UPDATE test_fact.customers_history_uktime_fact f SET as_of_date = daterange(lower(f.as_of_date), lower(irc.as_of_date)) FROM it_really_changed irc WHERE f.customer_id = irc.customer_id AND lower(f.as_of_date) <> lower(irc.as_of_date) AND upper(f.as_of_date) = 'infinity' RETURNING *) INSERT INTO test_fact.customers_history_uktime_fact AS f (customer_id, as_of_date, customer_number, phone, age) SELECT customer_id, as_of_date, customer_number, phone, age FROM it_really_changed nes ON CONFLICT (customer_id, as_of_date) DO UPDATE SET customer_number = EXCLUDED.customer_number , phone = EXCLUDED.phone , age = EXCLUDED.age; END; $BODY$ LANGUAGE plpgsql; CREATE FUNCTION test_fact.customers_history_uktime_fact_delete(p_customer_id INT) RETURNS VOID AS $BODY$ BEGIN DELETE FROM test_fact.customers_history_uktime_fact WHERE customer_id = p_customer_id; END; $BODY$ LANGUAGE plpgsql; INSERT INTO fact_loader.fact_tables (fact_table_relid, priority) VALUES ('test_fact.customers_history_uktime_fact'::REGCLASS, 8); WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.customers_history_uktime_fact_merge'::REGPROC AS insert_merge_proid, 'test_fact.customers_history_uktime_fact_merge'::REGPROC AS update_merge_proid, 
'test_fact.customers_history_uktime_fact_delete'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test.customers'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key, join_to_relation, join_to_column, return_columns_from_join, join_return_is_fact_key, pass_queue_table_change_date_at_tz) SELECT queue_table_dep_id, evts.evt, 1, '{customer_id}'::name[], true, null, null, null::name[], null::boolean, --THIS is the key of which time zone the date is seen from 'Europe/London'::TEXT FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) CROSS JOIN (VALUES ('I'),('U')) evts (evt) WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS AND queue_of_base_table_relid IN('test.customers'::REGCLASS) UNION ALL SELECT queue_table_dep_id, 'D', 1, '{customer_id}'::name[], true, null, null, null::name[], null::boolean, null::TEXT FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS AND queue_of_base_table_relid 
IN('test.customers'::REGCLASS); SELECT test_fact.customers_history_uktime_fact_merge(customer_id, '2018-04-22'::DATE) FROM test.customers; UPDATE test.customers SET customer_number = customer_number||'a' WHERE customer_id BETWEEN 1 AND 5; UPDATE test.customers SET customer_number = customer_number||'b' WHERE customer_id BETWEEN 1 AND 5; UPDATE test.customers SET customer_number = customer_number||'c' WHERE customer_id BETWEEN 6 AND 10; UPDATE test.customers SET customer_number = customer_number||'d' WHERE customer_id BETWEEN 6 AND 10; UPDATE test.customers SET customer_number = customer_number||'e' WHERE customer_id BETWEEN 1 AND 5; /**** Now we have to mock that this actually happened on different days. */ UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-24'::DATE WHERE change ->> 'customer_number' ~ '1a$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-24'::DATE WHERE change ->> 'customer_number' ~ '1ab$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-25'::DATE WHERE change ->> 'customer_number' ~ '1c$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-26'::DATE WHERE change ->> 'customer_number' ~ '1cd$'; UPDATE test_audit_raw.customers_audit SET changed_at = '2018-04-27'::DATE WHERE change ->> 'customer_number' ~ '1abe$'; --Ensure this one table is prioritized UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE, enabled = TRUE WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS; SELECT test.tick(); DO $$ BEGIN IF NOT (SELECT COUNT(1) FROM fact_loader.gathered_queued_changes((SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS))) = 18 THEN RAISE EXCEPTION '%', 'No worky'; END IF; END$$; SELECT fact_loader.worker(); SELECT * FROM test_fact.customers_history_uktime_fact ORDER BY upper(as_of_date), customer_id; --Let's verify the current records are the same as the actual table SELECT 
customer_id, customer_number, phone, age FROM test.customers INTERSECT SELECT customer_id, customer_number, phone, age FROM test_fact.customers_history_uktime_fact WHERE upper(as_of_date) = 'infinity' ORDER BY customer_id; UPDATE fact_loader.fact_tables SET force_worker_priority = FALSE WHERE fact_table_relid = 'test_fact.customers_history_uktime_fact'::REGCLASS; pg_fact_loader-2.0.1/sql/16_1_2_features.sql000066400000000000000000000441421451107006500205540ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; CREATE TABLE test_fact.orders_fact_chain (LIKE test_fact.orders_fact); /**** First make a bad function def to test exception handling */ CREATE FUNCTION test_fact.orders_fact_chain_merge(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN INSERT INTO test_fact.orders_fact_chain SELECT * FROM test_fact.orders_fact WHERE p_order_id = NULL::JSONB; END; $BODY$ LANGUAGE plpgsql; INSERT INTO fact_loader.fact_tables (fact_table_relid, priority) VALUES ('test_fact.orders_fact_chain'::REGCLASS, 9); /**** This example will use a local fact table as a queue table */ CREATE SCHEMA IF NOT EXISTS test_fact_audit_raw; CREATE TABLE test_fact_audit_raw.orders_fact_audit ( orders_fact_audit_id BIGSERIAL PRIMARY KEY, changed_at timestamp with time zone NOT NULL, --NOTE THE TIMESTAMPTZ operation character varying(1) NOT NULL, row_before_change jsonb, change jsonb, primary_key text, before_change jsonb ); CREATE OR REPLACE FUNCTION "test_fact_audit_raw"."audit_test_fact_orders_fact"() RETURNS TRIGGER AS $$ DECLARE value_row HSTORE = hstore(NULL); new_row HSTORE = hstore(NULL); audit_id BIGINT; BEGIN SELECT nextval('test_fact_audit_raw.orders_fact_audit_orders_fact_audit_id_seq') INTO audit_id; IF (TG_OP = 'UPDATE') THEN new_row = hstore(NEW); SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h.h)).key AS key, substring((each(h.h)).value 
FROM 1 FOR 500) AS value FROM (SELECT hstore(OLD) - hstore(NEW) AS h) h) sq; IF new_row ? TG_ARGV[0] THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), new_row -> TG_ARGV[0]); ELSE INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), hstore_to_jsonb(hstore(NEW) - hstore(OLD)), NULL); END IF; ELSIF (TG_OP = 'INSERT') THEN value_row = hstore(NEW); IF value_row ? TG_ARGV[0] THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); END IF; ELSIF (TG_OP = 'DELETE') THEN SELECT hstore(array_agg(sq.key), array_agg(sq.value)) INTO value_row FROM (SELECT (each(h)).key AS key, substring((each(h)).value FROM 1 FOR 500) AS value FROM hstore(OLD) h) sq; IF value_row ? 
TG_ARGV[0] THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, value_row -> TG_ARGV[0]); ELSE INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), hstore_to_jsonb(value_row), NULL, NULL); END IF; ELSIF (TG_OP = 'TRUNCATE') THEN INSERT INTO "test_fact_audit_raw"."orders_fact_audit"("orders_fact_audit_id", changed_at, operation, before_change, change, primary_key) VALUES(audit_id, now(), substring(TG_OP,1,1), NULL, NULL, NULL); ELSE RETURN NULL; END IF; RETURN NULL; END; $$ LANGUAGE plpgsql; CREATE TRIGGER row_audit_star AFTER INSERT OR DELETE OR UPDATE ON test_fact.orders_fact FOR EACH ROW EXECUTE PROCEDURE "test_fact_audit_raw"."audit_test_fact_orders_fact" ('order_id'); --Note that we DO NOT insert a pglogical_node_if_id - because this queue table is local INSERT INTO fact_loader.queue_tables (queue_table_relid, queue_of_base_table_relid, queue_table_tz) SELECT st.relid::REGCLASS, sts.relid::REGCLASS, NULL FROM (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) st INNER JOIN (SELECT c.oid AS relid, c.relname, n.nspname AS schemaname FROM pg_class c INNER JOIN pg_namespace n ON n.oid = c.relnamespace) sts ON sts.schemaname||'_audit_raw' = st.schemaname AND sts.relname||'_audit' = st.relname WHERE st.schemaname = 'test_fact_audit_raw'; SELECT fact_loader.add_batch_id_fields(); WITH queue_tables_with_proids AS ( SELECT *, 'test_fact.orders_fact_chain_merge'::REGPROC AS insert_merge_proid, 'test_fact.orders_fact_chain_merge'::REGPROC AS update_merge_proid, 'test_fact.orders_fact_chain_merge'::REGPROC AS delete_merge_proid FROM fact_loader.queue_tables WHERE queue_of_base_table_relid IN /*** These 
are the tables that are involved in test_fact.customers_fact_aggregator Find this out for each function in order to properly configure all possible changes that could affect the tables */ ('test_fact.orders_fact'::REGCLASS) ) INSERT INTO fact_loader.queue_table_deps (fact_table_id, queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid) SELECT fact_table_id, queue_tables_with_proids.queue_table_id, insert_merge_proid, update_merge_proid, delete_merge_proid FROM fact_loader.fact_tables CROSS JOIN queue_tables_with_proids WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS; --Key retrieval for updates INSERT INTO fact_loader.key_retrieval_sequences ( queue_table_dep_id, filter_scope, level, return_columns, is_fact_key) SELECT queue_table_dep_id, NULL, 1, '{order_id}'::name[], true FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.queue_tables qt USING (queue_table_id) INNER JOIN fact_loader.fact_tables ft USING (fact_table_id) WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS AND queue_of_base_table_relid IN('test_fact.orders_fact'::REGCLASS); --Force orders_fact update UPDATE test.orders SET total = 2010.00 WHERE order_id = 3; UPDATE fact_loader.fact_tables SET enabled = (fact_table_relid = 'test_fact.orders_fact'::REGCLASS); SELECT test.tick(); SELECT fact_loader.worker(); SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact ORDER BY order_id; --Don't tick - because this table is LOCAL and should update regardless of ticker. 
UPDATE fact_loader.fact_tables SET enabled = FALSE; UPDATE fact_loader.fact_tables SET force_worker_priority = TRUE, enabled = TRUE WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS; --This should also return false in case of error SELECT fact_loader.worker(); --We should see an error now SELECT fact_table_id, fact_table_relid, CASE WHEN current_setting('server_version_num')::INT >= 110000 THEN REPLACE(messages::TEXT, 'types', 'type(s)')::JSONB ELSE messages END FROM fact_loader.unresolved_failures; --No data SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact_chain ORDER BY order_id; --Let's fix the function def and re-run CREATE OR REPLACE FUNCTION test_fact.orders_fact_chain_merge(p_order_id INT) RETURNS VOID AS $BODY$ BEGIN INSERT INTO test_fact.orders_fact_chain SELECT * FROM test_fact.orders_fact WHERE order_id = p_order_id; END; $BODY$ LANGUAGE plpgsql; --Now re-enable and re-run UPDATE fact_loader.fact_tables SET enabled = TRUE WHERE fact_table_relid = 'test_fact.orders_fact_chain'::REGCLASS; SELECT fact_loader.worker(); --We should see nothing here now SELECT fact_table_id, fact_table_relid, messages FROM fact_loader.unresolved_failures; --1 row SELECT order_id, customer_id, order_date, total, is_reorder FROM test_fact.orders_fact_chain ORDER BY order_id; --This is NOT a new feature but a new test coverage - testing concurrency. \! psql contrib_regression -c 'BEGIN; SELECT fact_loader.worker() INTO try1; SELECT pg_sleep(2); COMMIT;' & SELECT pg_sleep(1); \! psql contrib_regression -c ' SELECT fact_loader.worker() INTO try2;' SELECT pg_sleep(4); SELECT * FROM try1; SELECT * FROM try2; --Daily schedule test - with range format suggestions included!!! 
--This kind of table should have a gist exclusion index for the daterange but we won't do it in the test CREATE TABLE test_fact.daily_customers_fact (LIKE test_fact.customers_fact); ALTER TABLE test_fact.daily_customers_fact ADD COLUMN as_of_date daterange; ALTER TABLE test_fact.daily_customers_fact ADD PRIMARY KEY (customer_id, as_of_date); CREATE FUNCTION test_fact.daily_customers_fact_merge() RETURNS VOID AS $BODY$ BEGIN DROP TABLE IF EXISTS changes; CREATE TEMP TABLE changes AS SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.customers_fact EXCEPT SELECT customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.daily_customers_fact WHERE upper(as_of_date) = 'infinity'; UPDATE test_fact.daily_customers_fact SET as_of_date = daterange(lower(as_of_date), current_date) WHERE customer_id IN (SELECT customer_id FROM changes) AND upper(as_of_date) = 'infinity'; INSERT INTO test_fact.daily_customers_fact (as_of_date, customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids) SELECT daterange(current_date,'infinity') AS as_of_date, customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM changes ON CONFLICT (customer_id, as_of_date) DO UPDATE SET phone = EXCLUDED.phone, age = EXCLUDED.age, last_order_id = EXCLUDED.last_order_id, order_product_count = EXCLUDED.order_product_count, order_product_promo_ids = EXCLUDED.order_product_promo_ids; END; $BODY$ LANGUAGE plpgsql; UPDATE fact_loader.fact_tables SET enabled = FALSE; BEGIN; --Keep the same transaction time to make these tests possible INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid) VALUES ('test_fact.daily_customers_fact', TRUE, 10, TRUE, now() + interval '1 second', 'America/Chicago', 'test_fact.daily_customers_fact_merge'::REGPROC); UPDATE 
fact_loader.fact_tables SET enabled = TRUE WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; /***** Dependent scheduled job - as of 1.3 ****/ CREATE TABLE silly (id int); CREATE FUNCTION itran() RETURNS VOID AS $BODY$ BEGIN INSERT INTO silly VALUES (1); END; $BODY$ LANGUAGE plpgsql; CREATE TABLE willy (id int); CREATE FUNCTION itrantoo() RETURNS VOID AS $BODY$ BEGIN INSERT INTO willy VALUES (1); END; $BODY$ LANGUAGE plpgsql; CREATE TABLE nilly (id int); CREATE FUNCTION itrantootoo() RETURNS VOID AS $BODY$ BEGIN INSERT INTO nilly VALUES (1); END; $BODY$ LANGUAGE plpgsql; INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid, depends_on_base_daily_job_id, depends_on_parent_daily_job_id) VALUES ('silly', TRUE, 11, TRUE, NULL, NULL, 'itran'::REGPROC, (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS)); INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid, depends_on_base_daily_job_id, depends_on_parent_daily_job_id) VALUES ('willy', TRUE, 12, TRUE, NULL, NULL, 'itrantoo'::REGPROC, (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'silly'::REGCLASS)); INSERT INTO fact_loader.fact_tables (fact_table_relid, enabled, priority, use_daily_schedule, daily_scheduled_time, daily_scheduled_tz, daily_scheduled_proid, depends_on_base_daily_job_id, depends_on_parent_daily_job_id) VALUES ('nilly', TRUE, 13, TRUE, NULL, NULL, 'itrantootoo'::REGPROC, (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 
'test_fact.daily_customers_fact'::REGCLASS), (SELECT fact_table_id FROM fact_loader.fact_tables WHERE fact_table_relid = 'willy'::REGCLASS)); --BELOW we will try to run it only after our first one did successfully. --Should not show the daily job because we set the daily schedule ahead in time SELECT fact_table_id FROM fact_loader.prioritized_jobs; UPDATE fact_loader.fact_tables SET daily_scheduled_time = now() - interval '1 second' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Now it should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; SELECT fact_loader.worker(); --We have to mock out the date so it appears the same any day we run this test SELECT daterange('2018-04-15'::DATE + (lower(as_of_date) - current_date),upper(as_of_date)), customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.daily_customers_fact ORDER BY customer_id, as_of_date; --Pretend we ran this yesterday UPDATE test_fact.daily_customers_fact SET as_of_date = daterange(lower(as_of_date) - 1,'infinity'); --Job should not show because it just ran - but if it has dependent job it should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; --Pretend it ran yesterday UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = last_refresh_attempted_at - interval '1 day' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Job should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; --Change something silly UPDATE test_fact.customers_fact SET phone = NULL WHERE customer_id = 10; SELECT fact_loader.worker(); --Job should not show because it just ran - but if it has dependent job it should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; --This should run the dependent job SELECT fact_loader.worker(); TABLE silly; TABLE willy; --Now 2nd level dep should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; SELECT fact_loader.worker(); TABLE willy; TABLE nilly; 
--Now 3rd level dep should show --Just check if enabling regular jobs is ok UPDATE fact_loader.fact_tables SET enabled = true WHERE fact_table_id IN(1,2); SELECT fact_table_id FROM fact_loader.prioritized_jobs; SELECT fact_loader.worker(); TABLE nilly; UPDATE fact_loader.fact_tables SET enabled = false WHERE fact_table_id IN(1,2); -- Need to test the next day's run when last_refresh_attempted_at is not null UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = last_refresh_attempted_at - interval '1 day' WHERE use_daily_schedule; SELECT fact_loader.worker(); SELECT fact_table_id FROM fact_loader.prioritized_jobs; SELECT fact_loader.worker(); SELECT fact_table_id FROM fact_loader.prioritized_jobs; TABLE silly; SELECT fact_loader.worker(); SELECT fact_table_id FROM fact_loader.prioritized_jobs; TABLE willy; SELECT fact_loader.worker(); SELECT fact_table_id FROM fact_loader.prioritized_jobs; TABLE nilly; --We should see one changed range --We have to mock out the date so it appears the same any day we run this test SELECT daterange('2018-04-15'::DATE + (lower(as_of_date) - current_date), CASE WHEN upper(as_of_date) = 'infinity' THEN 'infinity' ELSE '2018-04-15'::DATE + (upper(as_of_date) - current_date) END), customer_id, phone, age, last_order_id, order_product_count, order_product_promo_ids FROM test_fact.daily_customers_fact ORDER BY customer_id, as_of_date; --Verify it still shows if we simulate a job failure UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = now(), last_refresh_succeeded = FALSE WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; SELECT fact_table_id FROM fact_loader.prioritized_jobs; --Here it should not show - if we mark that it did succeed UPDATE fact_loader.fact_tables SET last_refresh_succeeded = TRUE WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; SELECT fact_table_id FROM fact_loader.prioritized_jobs; /*** TEST ADDING DEPS TO SCHEDULED JOBS ***/ --AGAIN Pretend it ran yesterday 
UPDATE fact_loader.fact_tables SET last_refresh_attempted_at = last_refresh_attempted_at - interval '1 day' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Job should show SELECT fact_table_id FROM fact_loader.prioritized_jobs; --Change something silly UPDATE test_fact.customers_fact SET phone = NULL WHERE customer_id = 10; --Now add deps that are not met UPDATE fact_loader.fact_tables SET daily_scheduled_deps = ARRAY['test.customers'::REGCLASS,'test.orders'::REGCLASS, 'test_fact.customers_fact'::REGCLASS], daily_scheduled_dep_delay_tolerance = '1 millisecond' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Should fail because no way they have been updated 1 millisecond ago SELECT fact_loader.worker(); --We fail jobs that don't meet deps because as configured, it should be an exceptional occurrence and we want to raise an alarm. Should show an error message containing "Delayed" lingo SELECT fact_table_id FROM fact_loader.unresolved_failures WHERE messages ->> 'Message' LIKE '%Delayed%'; --Now make the tolerance such that we know the deps are met UPDATE fact_loader.fact_tables SET enabled = TRUE, daily_scheduled_deps = ARRAY['test.customers'::REGCLASS,'test.orders'::REGCLASS, 'test_fact.customers_fact'::REGCLASS], daily_scheduled_dep_delay_tolerance = '1 minute' WHERE fact_table_relid = 'test_fact.daily_customers_fact'::REGCLASS; --Shows up again SELECT fact_table_id FROM fact_loader.prioritized_jobs; --Succeeds SELECT fact_loader.worker(); --Does not show now SELECT fact_table_id FROM fact_loader.prioritized_jobs; ROLLBACK; pg_fact_loader-2.0.1/sql/17_1_3_features.sql000066400000000000000000000031421451107006500205510ustar00rootroot00000000000000SET client_min_messages TO warning; --This is for testing functionality of timezone-specific timestamps SET TIMEZONE TO 'America/Chicago'; -- These 2 calls replace legacy tests for background worker launches (functionality now removed) SELECT fact_loader.worker(); SELECT 
fact_loader.worker(); INSERT INTO test.orders (order_id, customer_id, order_date, total) VALUES ((SELECT MAX(order_id)+1 FROM test.orders) ,5, '2018-07-27', 2500.00); SELECT test.tick(); -- For some reason queue_table_id seems indeterminate so don't show it DO $$ BEGIN IF NOT (SELECT COUNT(1) FROM fact_loader.raw_queued_changes(1)) = 24 OR NOT (SELECT COUNT(1) FROM fact_loader.gathered_queued_changes(1)) = 1 THEN RAISE EXCEPTION '%', 'No worky'; END IF; END$$; --Count could be different if we are doing FROMVERSION=1.2 or lower but should be at least 50 (actually should be 66 for 1.2 and 76 for 1.3) SELECT COUNT(1) > 50 AS got_enough_logs FROM fact_loader.fact_table_refresh_logs; --Test the auto-pruning BEGIN; UPDATE fact_loader.fact_table_refresh_logs SET refresh_attempted_at = refresh_attempted_at - interval '1 year' WHERE messages IS NULL; INSERT INTO fact_loader.fact_table_refresh_logs (fact_table_refresh_log_id) VALUES (1000); SELECT COUNT(1) FROM fact_loader.fact_table_refresh_logs; ROLLBACK; --Test support for extension without deps (older tests for version 1.2 are removed as no longer relevant) BEGIN; DROP EXTENSION pg_fact_loader CASCADE; DROP EXTENSION IF EXISTS pglogical_ticker CASCADE; DROP EXTENSION IF EXISTS pglogical CASCADE; CREATE EXTENSION pg_fact_loader; DROP EXTENSION pg_fact_loader; ROLLBACK; pg_fact_loader-2.0.1/test_all_versions.sh000077500000000000000000000020031451107006500205330ustar00rootroot00000000000000#!/bin/bash set -eu orig_path=$PATH newest_version=2.0 unset PGSERVICE set_path() { version=$1 export PATH=/usr/lib/postgresql/$version/bin:$orig_path } get_port() { version=$1 pg_lsclusters | awk -v version=$version '$1 == version { print $3 }' } make_and_test() { version=$1 from_version=${2:-$newest_version} set_path $version make clean sudo "PATH=$PATH" make uninstall sudo "PATH=$PATH" make install port=$(get_port $version) PGPORT=$port psql contrib_regression -v "ON_ERROR_STOP=1" << 'EOM' DROP EXTENSION IF EXISTS pglogical CASCADE; 
SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = 'contrib_regression' AND pid <> pg_backend_pid(); EOM FROMVERSION=$from_version PGPORT=$port make installcheck } test_all_versions() { from_version="$1" cat << EOM *******************FROM VERSION $from_version****************** EOM make_and_test "10" make_and_test "11" make_and_test "12" make_and_test "13" #make_and_test "14" } test_all_versions "2.0" #test_all_versions "1.7" #test_all_versions "1.6" pg_fact_loader-2.0.1/types/000077500000000000000000000000001451107006500156065ustar00rootroot00000000000000pg_fact_loader-2.0.1/types/driver.sql000066400000000000000000000001001451107006500176110ustar00rootroot00000000000000CREATE TYPE fact_loader.driver AS ENUM ('pglogical', 'native'); pg_fact_loader-2.0.1/types/table_load_type.sql000066400000000000000000000001111451107006500214470ustar00rootroot00000000000000CREATE TYPE fact_loader.table_load_type AS ENUM('delta','full_refresh'); pg_fact_loader-2.0.1/views/000077500000000000000000000000001451107006500155775ustar00rootroot00000000000000pg_fact_loader-2.0.1/views/prioritized_jobs.sql000066400000000000000000000067111451107006500217060ustar00rootroot00000000000000CREATE OR REPLACE VIEW fact_loader.prioritized_jobs AS WITH jobs_with_daily_variables AS ( SELECT ft.*, /*** Keep all this logic of daily jobs as variables to ease visualization of logic in the next cte below!! 
*/ (--If this is the first run of a scheduled job, it is eligible ft.last_refresh_attempted_at IS NULL OR ( --If it was last attempted successfully prior to this scheduled time only - meaning yesterday, it is eligible ( ft.last_refresh_succeeded AND ft.last_refresh_attempted_at::DATE < -- Timezone taken from daily_scheduled_tz if base job, otherwise look up the timezone of the base job if this is dependent (now() AT TIME ZONE COALESCE( ft.daily_scheduled_tz, base.daily_scheduled_tz ) )::DATE ) OR --If a job has failed and been re-enabled, it is eligible again even though it has been attempted at or after the scheduled time NOT ft.last_refresh_succeeded ) ) AS daily_not_attempted_today, (now() AT TIME ZONE ft.daily_scheduled_tz)::TIME BETWEEN daily_scheduled_time AND '23:59:59.999999'::TIME AS daily_scheduled_time_passed, base.use_daily_schedule AND base.last_refresh_succeeded AND base.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE base.daily_scheduled_tz) :: DATE AS daily_base_job_finished, ft.depends_on_base_daily_job_id = ft.depends_on_parent_daily_job_id AS daily_has_only_one_parent, -- This should only be used in combination with daily_has_only_one_parent parent.use_daily_schedule AND parent.last_refresh_succeeded AND parent.last_refresh_attempted_at :: DATE = (now() AT TIME ZONE COALESCE(parent.daily_scheduled_tz, base.daily_scheduled_tz)) :: DATE AS parent_job_finished FROM fact_loader.fact_tables ft LEFT JOIN LATERAL (SELECT ftb.use_daily_schedule, ftb.last_refresh_succeeded, ftb.last_refresh_attempted_at, ftb.daily_scheduled_tz FROM fact_loader.fact_tables ftb WHERE ftb.fact_table_id = ft.depends_on_base_daily_job_id) base ON TRUE LEFT JOIN LATERAL (SELECT ftp.use_daily_schedule, ftp.last_refresh_succeeded, ftp.last_refresh_attempted_at, ftp.daily_scheduled_tz FROM fact_loader.fact_tables ftp WHERE ftp.fact_table_id = ft.depends_on_parent_daily_job_id) parent ON TRUE WHERE enabled ) , jobs_with_daily_schedule_eligibility AS ( SELECT *, --Only run 
this job according to the same day of the daily_scheduled_time --according to configured timezone (use_daily_schedule AND daily_not_attempted_today AND ( daily_scheduled_time_passed OR (daily_base_job_finished AND (daily_has_only_one_parent OR parent_job_finished)) ) ) AS daily_schedule_eligible FROM jobs_with_daily_variables) SELECT * FROM jobs_with_daily_schedule_eligibility WHERE NOT use_daily_schedule OR daily_schedule_eligible ORDER BY CASE WHEN force_worker_priority THEN 0 ELSE 1 END, --If a job has a daily schedule, once the time has come for the next refresh, --prioritize it first CASE WHEN daily_schedule_eligible THEN (now() AT TIME ZONE daily_scheduled_tz)::TIME ELSE NULL END NULLS LAST, --This may be improved in the future but is a good start last_refresh_attempted_at NULLS FIRST, priority ; pg_fact_loader-2.0.1/views/queue_deps_all.sql000066400000000000000000000171771451107006500213240ustar00rootroot00000000000000CREATE OR REPLACE VIEW fact_loader.queue_deps_all AS WITH RECURSIVE fact_table_dep_cutoffs AS (SELECT 1 AS level , qtd.queue_table_dep_id , ftdqc.fact_table_dep_id , ftdqc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , qtd.last_cutoff_id AS dep_maximum_cutoff_id , qtd.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftd.child_id AS base_fact_table_id , queue_table_id , relevant_change_columns , ftdqc.last_cutoff_id , ftdqc.last_cutoff_source_time , ftdqc.insert_merge_proid , ftdqc.update_merge_proid , ftdqc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id UNION ALL /**** In this recursive part, we walk UP the chain to the base 
level in order to get the last_cutoff_id and last_cutoff_source_time of parent_ids because children must never surpass those. The ONLY difference between this recursive part and the non-recursive part are the dep_maximum_cutoffs. That means we can get our resultant data below by simply selecting distinct ON the right fields and order by dep_maximum_cutoffs to get the most conservative cutoff window, that is, the minimum cutoff amongst the queue tables and any PARENT fact table cutoffs. That means if, for example, - IF a queue table has been cutoff up until 11:00:00 - AND IF a level 1 fact table dependent on that queue table was last cutoff at 10:55:00 - THEN a level 2 fact table dependent on level 1 fact table must not go past 10:55:00 when it is processed. */ SELECT ftdc.level + 1 AS level , ftdc.queue_table_dep_id , ftdc.fact_table_dep_id , ftdc.fact_table_dep_queue_table_dep_id --This dep_maximum_cutoff_time is being taken from the queue_table_deps, because we cannot go past when the --fact table has been updated , ftdqc.last_cutoff_id AS dep_maximum_cutoff_id , ftdqc.last_cutoff_source_time AS dep_maximum_cutoff_time , ftd.parent_id AS parent_fact_table_id , ftd.child_id AS child_fact_table_id , ftdc.base_fact_table_id , ftdc.queue_table_id , ftdc.relevant_change_columns , ftdc.last_cutoff_id , ftdc.last_cutoff_source_time , ftdc.insert_merge_proid , ftdc.update_merge_proid , ftdc.delete_merge_proid FROM fact_loader.queue_table_deps qtd INNER JOIN fact_loader.fact_table_dep_queue_table_deps ftdqc ON ftdqc.queue_table_dep_id = qtd.queue_table_dep_id INNER JOIN fact_loader.fact_table_deps ftd ON ftd.fact_table_dep_id = ftdqc.fact_table_dep_id INNER JOIN fact_table_dep_cutoffs ftdc ON ftdc.parent_fact_table_id = ftd.child_id ) , adjusted_fact_table_deps AS ( /**** The reason we look at distinct queue_table_dep_id and not simply queue_table_id is because two parent fact tables could have differing logic for retrieving changes for the same base queue_tables. 
*/ SELECT DISTINCT ON(base_fact_table_id, queue_table_dep_id) * FROM fact_table_dep_cutoffs ORDER BY base_fact_table_id, queue_table_dep_id, dep_maximum_cutoff_time ) , queue_table_info AS ( SELECT * FROM fact_loader.queue_table_delay_info() ) /**** For fact tables that depend on other fact tables, we join the child fact table to the queue_table_deps of the parent fact table, and just reuse this exactly, with these distinctions: - From the fact_table_dep table, we do use the proids, and the last_cutoff_id - We use the parent last_cutoff_source_time as the maximum_cutoff, because we can only update those records already updated on the parent - We pass the information of which table for which to update metadata in the end */ , queue_table_deps_with_nested AS ( /**** This part of the union is for the base level of queue_table_deps - for fact tables with no other dependent fact tables */ SELECT queue_table_dep_id , NULL :: INT AS fact_table_dep_id , NULL :: INT AS fact_table_dep_queue_table_dep_id , NULL :: BIGINT AS dep_maximum_cutoff_id , NULL :: TIMESTAMPTZ AS dep_maximum_cutoff_time , fact_table_id , queue_table_id , relevant_change_columns , last_cutoff_id , last_cutoff_source_time , insert_merge_proid , update_merge_proid , delete_merge_proid FROM fact_loader.queue_table_deps UNION ALL /**** This part of the union is for fact tables with other dependent fact tables */ SELECT queue_table_dep_id , fact_table_dep_id , fact_table_dep_queue_table_dep_id , aftd.dep_maximum_cutoff_id , aftd.dep_maximum_cutoff_time , base_fact_table_id AS fact_table_id , queue_table_id , relevant_change_columns , aftd.last_cutoff_id , aftd.last_cutoff_source_time , aftd.insert_merge_proid , aftd.update_merge_proid , aftd.delete_merge_proid FROM adjusted_fact_table_deps aftd ) SELECT ft.fact_table_id, ft.fact_table_relid, ft.fact_table_agg_proid, qt.queue_table_id, qt.queue_table_relid, qt.queue_of_base_table_relid, qtd.relevant_change_columns, qtd.last_cutoff_id, 
qtd.last_cutoff_source_time, rt.publisher AS provider_name, rt.publication_name, qtd.dep_maximum_cutoff_id, --Not used yet - TODO - think about if it needs to be used to filter as cutoff MAX in addition to the time filter LEAST( MIN(qtd.dep_maximum_cutoff_time) OVER ( PARTITION BY qtd.fact_table_id ), MIN(rt.source_time) OVER ( PARTITION BY qtd.fact_table_id ) ) AS maximum_cutoff_time, aqt.queue_table_id_field, 'primary_key'::name AS queue_table_key, 'operation'::name AS queue_table_op, 'change'::name AS queue_table_change, 'changed_at'::name AS queue_table_timestamp, qt.queue_table_tz, aqbt.queue_of_base_table_key, aqbt.queue_of_base_table_key_type, queue_table_dep_id, fact_table_dep_id, fact_table_dep_queue_table_dep_id, insert_merge_proid, update_merge_proid, delete_merge_proid, qt.purge FROM queue_table_deps_with_nested qtd INNER JOIN fact_loader.fact_tables ft ON ft.fact_table_id = qtd.fact_table_id INNER JOIN fact_loader.queue_tables qt ON qt.queue_table_id = qtd.queue_table_id INNER JOIN queue_table_info rt ON rt.queue_of_base_table_relid = qt.queue_of_base_table_relid INNER JOIN LATERAL (SELECT a.attname AS queue_of_base_table_key, format_type(atttypid, atttypmod) AS queue_of_base_table_key_type FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_of_base_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqbt ON TRUE INNER JOIN LATERAL (SELECT a.attname AS queue_table_id_field FROM (SELECT i.indrelid , unnest(indkey) AS ik , row_number() OVER () AS rn FROM pg_index i WHERE i.indrelid = qt.queue_table_relid AND i.indisprimary) pk INNER JOIN pg_attribute a ON a.attrelid = pk.indrelid AND a.attnum = pk.ik) aqt ON TRUE ORDER BY ft.fact_table_relid; pg_fact_loader-2.0.1/views/queue_deps_all_with_retrieval.sql000066400000000000000000000046631451107006500244300ustar00rootroot00000000000000CREATE OR REPLACE VIEW 
fact_loader.queue_deps_all_with_retrieval AS SELECT qtd.*, krs.filter_scope, krs.level, krs.return_columns, --we need not get the type separately. It must match queue_of_base_table_key_type krs.is_fact_key, krs.join_to_relation, qtk.queue_table_relid AS join_to_relation_queue, krs.join_to_column, ctypes.join_column_type, krs.return_columns_from_join, ctypes.return_columns_from_join_type, krs.join_return_is_fact_key, /*** We include this in this view def to be easily shared by all events (I, U, D) in sql_builder, as those may be different in terms of passing source_change_date. */ format(', %s::DATE AS source_change_date', CASE WHEN krs.pass_queue_table_change_date_at_tz IS NOT NULL /*** For casting queue_table_timestamp to a date, we first ensure we have it as timestamptz (objective UTC time). Then, we cast it to the timezone of interest on which the date should be based. For example, 02:00:00 UTC time on 2018-05-02 is actually 2018-05-01 in America/Chicago time. Thus, any date-based fact table must decide in what time zone to consider the date. 
*/ THEN format('(%s %s AT TIME ZONE %s)', 'q.'||quote_ident(qtd.queue_table_timestamp), CASE WHEN qtd.queue_table_tz IS NULL THEN '' ELSE 'AT TIME ZONE '||quote_literal(qtd.queue_table_tz) END, quote_literal(krs.pass_queue_table_change_date_at_tz)) ELSE 'NULL' END) AS source_change_date_select FROM fact_loader.queue_deps_all qtd INNER JOIN fact_loader.key_retrieval_sequences krs ON qtd.queue_table_dep_id = krs.queue_table_dep_id LEFT JOIN fact_loader.queue_tables qtk ON qtk.queue_of_base_table_relid = krs.join_to_relation LEFT JOIN LATERAL (SELECT MAX(CASE WHEN attname = krs.join_to_column THEN format_type(atttypid, atttypmod) ELSE NULL END) AS join_column_type, MAX(CASE WHEN attname = krs.return_columns_from_join[1] THEN format_type(atttypid, atttypmod) ELSE NULL END) AS return_columns_from_join_type FROM pg_attribute a WHERE a.attrelid IN(krs.join_to_relation) /**** We stubbornly assume that if there are multiple columns in return_columns_from_join, they all have the same type. Undue complexity would ensue if we did away with that rule. */ AND a.attname IN(krs.join_to_column,krs.return_columns_from_join[1])) ctypes ON TRUE; pg_fact_loader-2.0.1/views/unresolved_failures.sql000066400000000000000000000005731451107006500224050ustar00rootroot00000000000000CREATE OR REPLACE VIEW fact_loader.unresolved_failures AS SELECT ft.fact_table_id, fact_table_relid, refresh_attempted_at, messages FROM fact_loader.fact_tables ft INNER JOIN fact_loader.fact_table_refresh_logs ftrl ON ft.fact_table_id = ftrl.fact_table_id AND ft.last_refresh_attempted_at = ftrl.refresh_attempted_at WHERE NOT enabled AND NOT last_refresh_succeeded;