pax_global_header00006660000000000000000000000064134213503770014517gustar00rootroot0000000000000052 comment=25c937879affdfffb37210b8dc125de86b1db7e2 pgloader-3.6.1/000077500000000000000000000000001342135037700133235ustar00rootroot00000000000000pgloader-3.6.1/.dockerignore000066400000000000000000000000551342135037700157770ustar00rootroot00000000000000.git .vagrant build Dockerfile Dockerfile.cclpgloader-3.6.1/.gitattributes000066400000000000000000000000401342135037700162100ustar00rootroot00000000000000test/**/*.sql linguist-vendored pgloader-3.6.1/.gitignore000066400000000000000000000005361342135037700153170ustar00rootroot00000000000000.vagrant local-data pgloader.html pgloader.pdf debian/pgloader.debhelper.log debian/pgloader.substvars debian/pgloader/ debian/files web/howto/csv.html web/howto/dBase.html web/howto/fixed.html web/howto/geolite.html web/howto/mysql.html web/howto/pgloader.1.html web/howto/quickstart.html web/howto/sqlite.html .DS_Store system-index.txt docs/_buildpgloader-3.6.1/.travis.sh000077500000000000000000000047521342135037700152600ustar00rootroot00000000000000#!/bin/bash set -eu lisp_install() { case "$LISP" in ccl) ccl_checksum='08e885e8c2bb6e4abd42b8e8e2b60f257c6929eb34b8ec87ca1ecf848fac6d70' ccl_version='1.11' remote_file "/tmp/ccl-${ccl_version}.tgz" "https://github.com/Clozure/ccl/releases/download/v${ccl_version}/ccl-${ccl_version}-linuxx86.tar.gz" "$ccl_checksum" tar --file "/tmp/ccl-${ccl_version}.tgz" --extract --exclude='.svn' --directory '/tmp' sudo mv --no-target-directory '/tmp/ccl' '/usr/local/src/ccl' sudo ln --no-dereference --force --symbolic "/usr/local/src/ccl/scripts/ccl64" '/usr/local/bin/ccl' ;; sbcl) sbcl_checksum='eb44d9efb4389f71c05af0327bab7cd18f8bb221fb13a6e458477a9194853958' sbcl_version='1.3.18' remote_file "/tmp/sbcl-${sbcl_version}.tgz" "http://prdownloads.sourceforge.net/sbcl/sbcl-${sbcl_version}-x86-64-linux-binary.tar.bz2" "$sbcl_checksum" tar --file "/tmp/sbcl-${sbcl_version}.tgz" --extract --directory '/tmp' ( cd "/tmp/sbcl-${sbcl_version}-x86-64-linux" && sudo ./install.sh ) ;; *) echo "Unrecognized Lisp: '$LISP'" exit 1 ;; esac } pgdg_repositories() { local sourcelist='sources.list.d/pgdg.list' sudo tee "/etc/apt/$sourcelist" <<-repositories deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg main deb http://apt.postgresql.org/pub/repos/apt/ $(lsb_release -cs)-pgdg-testing main 10 repositories sudo apt-key adv --keyserver 'hkp://ha.pool.sks-keyservers.net' --recv-keys 'ACCC4CF8' sudo apt-get -o Dir::Etc::sourcelist="$sourcelist" -o Dir::Etc::sourceparts='-' -o APT::Get::List-Cleanup='0' update } postgresql_install() { if [ -z "${PGVERSION:-}" ]; then PGVERSION="$( psql -d postgres -XAtc "select regexp_replace(current_setting('server_version'), '[.][0-9]+$', '')" )" else sudo service postgresql stop xargs sudo apt-get -y --purge remove <<-packages libpq-dev libpq5 postgresql postgresql-client-common postgresql-common packages sudo rm -rf /var/lib/postgresql fi xargs sudo apt-get -y install <<-packages postgresql-${PGVERSION} postgresql-${PGVERSION}-ip4r packages sudo tee /etc/postgresql/${PGVERSION}/main/pg_hba.conf > /dev/null <<-config local all all trust host all all 127.0.0.1/32 trust config sudo service postgresql restart } remote_file() { local target="$1" origin="$2" sum="$3" local check="shasum --algorithm $(( 4 * ${#sum} )) --check" local filesum="$sum $target" curl --location --output "$target" "$origin" && $check <<< "$filesum" } $1 
pgloader-3.6.1/.travis.yml000066400000000000000000000014131342135037700154330ustar00rootroot00000000000000language: common-lisp sudo: required env: matrix: - LISP=ccl - LISP=ccl PGVERSION=9.6 - LISP=sbcl - LISP=sbcl PGVERSION=9.6 install: - ./.travis.sh lisp_install - ./.travis.sh pgdg_repositories - ./.travis.sh postgresql_install - sudo apt-get install -y unzip libsqlite3-dev gawk freetds-dev before_script: - PGUSER=postgres createuser -S -R -D -E -l pgloader - PGUSER=postgres createdb -E UTF8 -O pgloader pgloader - PGUSER=postgres psql -d pgloader -c "create extension ip4r;" - PGUSER=pgloader psql -d pgloader -c "create schema expected;" - PGUSER=pgloader psql -d pgloader -c "create schema err;" - make --version - make "CL=$LISP" script: - PGUSER=pgloader make "CL=$LISP" check notifications: email: - dim@tapoueh.org pgloader-3.6.1/Dockerfile000066400000000000000000000017641342135037700153250ustar00rootroot00000000000000FROM debian:stable-slim as builder RUN apt-get update \ && apt-get install -y --no-install-recommends \ bzip2 \ ca-certificates \ curl \ freetds-dev \ gawk \ git \ libsqlite3-dev \ libssl1.1 \ libzip-dev \ make \ openssl \ patch \ sbcl \ time \ unzip \ wget \ cl-ironclad \ cl-babel \ && rm -rf /var/lib/apt/lists/* COPY ./ /opt/src/pgloader RUN mkdir -p /opt/src/pgloader/build/bin \ && cd /opt/src/pgloader \ && make FROM debian:stable-slim RUN apt-get update \ && apt-get install -y --no-install-recommends \ curl \ freetds-dev \ gawk \ libsqlite3-dev \ libzip-dev \ make \ sbcl \ unzip \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /opt/src/pgloader/build/bin/pgloader /usr/local/bin LABEL maintainer="Dimitri Fontaine " pgloader-3.6.1/Dockerfile.ccl000066400000000000000000000023211342135037700160530ustar00rootroot00000000000000FROM debian:stable-slim as builder RUN apt-get update \ && apt-get install -y --no-install-recommends \ bzip2 \ ca-certificates \ curl \ freetds-dev \ gawk \ git \ libsqlite3-dev \ libssl1.1 \ libzip-dev \ make \ openssl \ patch \ sbcl \ time \ unzip \ wget \ cl-ironclad \ cl-babel \ && rm -rf /var/lib/apt/lists/* RUN curl -SL https://github.com/Clozure/ccl/releases/download/v1.11.5/ccl-1.11.5-linuxx86.tar.gz \ | tar xz -C /usr/local/src/ \ && mv /usr/local/src/ccl/scripts/ccl64 /usr/local/bin/ccl COPY ./ /opt/src/pgloader RUN mkdir -p /opt/src/pgloader/build/bin \ && cd /opt/src/pgloader \ && make CL=ccl DYNSIZE=256 FROM debian:stable-slim RUN apt-get update \ && apt-get install -y --no-install-recommends \ curl \ freetds-dev \ gawk \ libsqlite3-dev \ libzip-dev \ make \ sbcl \ unzip \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /opt/src/pgloader/build/bin/pgloader /usr/local/bin LABEL maintainer="Dimitri Fontaine " pgloader-3.6.1/INSTALL.md000066400000000000000000000072211342135037700147550ustar00rootroot00000000000000# Installing pgloader pgloader version 3.x is written in Common Lisp. ## Dependencies The steps depend on the OS you are currently using. ### debian If you're using debian, it's quite simple actually, see the file `bootstrap-debian.sh` within the main pgloader distribution to get yourself started. You will note in particular: sudo apt-get install -y sbcl \ git curl patch unzip \ devscripts pandoc \ libsqlite3-dev \ freetds-dev We need a recent enough [SBCL](http://sbcl.org/) version and that means backporting the one found in `sid` rather than using the very old one found in current *stable* debian release. See `bootstrap-debian.sh` for details about how to backport a recent enough SBCL here (1.2.5 or newer). 
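Putting those pieces together, an end-to-end build session on a fresh Debian system might look like the following sketch (the package list is the one quoted above from `bootstrap-debian.sh`; the `make` step and the resulting `./build/bin/pgloader` binary are described in the sections below):

    $ sudo apt-get install -y sbcl git curl patch unzip devscripts pandoc libsqlite3-dev freetds-dev
    $ cd /path/to/pgloader
    $ make
    $ ./build/bin/pgloader --help
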
### Redhat / CentOS You will need to install the Steel Bank Common Lisp package (sbcl) from EPEL, as well as the freetds-devel package for some shared libraries. With RHEL/CentOS 6, if the packaged version isn't >=1.3.6, you'll need to build it from source. With v7, after installing freetds, you also need to create a softlink from the versioned shared library `libsybdb.so.5` to `libsybdb.so`. The above steps are prepared for you with `boostrap-centos.sh` and `bootstrap-centos7.sh` respectively. Please report to us if your standard RHEL/CentOS installation required additional steps. ### Mac OS X We suppose you already have `git` and `make` available, if that's not the case now is the time to install those tools. The SQLite lib that comes in MacOSX is fine, no need for extra software here. You will need to install either SBCL or CCL separately, and when using [brew](http://brew.sh/) it's as simple as: brew install sbcl brew install clozure-cl NOTE: Make sure you installed the universal binaries of Freetds, so that they can be loaded correctly. brew install freetds --universal --build-from-source ### Compiling SBCL by yourself If you ended up building SBCL yourself or you just want to do that, you can download the source from http://www.sbcl.org/ . You will need to build SBCL with the following command and options: sh make.sh --with-sb-core-compression --with-sb-thread NOTE: You could also remove the --compress-core option. ## Building pgloader Now that the dependences are installed, just type make. make If your `SBCL` supports core compression, the make process will use it to generate a smaller binary. To force disabling core compression, you may use: make COMPRESS_CORE=no Then you will have a new tool to play with: ./build/bin/pgloader --help This command should spit out the *usage* information on which parameters are accepted in the command line actually. ## Building pgloader with CCL It's possible to pick [ccl](http://ccl.clozure.com/) rather than SBCL when compiling pgloader: make CL=ccl ## Building pgloader for use in low RAM environments It's possible to tweak the size of RAM pgloader will use in its binary image, at compile time. This defaults to 4 GB. make DYNSIZE=1024 Now the `./build/bin/pgloader` that you get only uses 1GB. ## Building a docker image A `Dockerfile` is provided, to use it: docker build -t pgloader:debian . docker run --rm --name pgloader pgloader:debian bash -c "pgloader --version" The `build` step install build dependencies in a debian jessie container, then `git clone` and build `pgloader` in `/opt/src/pgloader` and finally copy the resulting binary image in `/usr/local/bin/pgloader` so that it's easily available. pgloader-3.6.1/ISSUE_TEMPLATE.md000066400000000000000000000053741342135037700160410ustar00rootroot00000000000000Thanks for contributing to [pgloader](https://pgloader.io) by reporting an issue! Reporting an issue is the only way we can solve problems, fix bugs, and improve both the software and its user experience in general. The best bug reports follow those 3 simple steps: 1. show what you did, 2. show the result you got, 3. explain how the result is not what you expected. In the case of pgloader, here's the information I will need to read in your bug report. Having all of this is a big help, and often means the bug you reported can be fixed very efficiently as soon as I get to it. Please provide the following information: - [ ] pgloader --version ``` ``` - [ ] did you test a fresh compile from the source tree? 
Compiling pgloader from sources is documented in the [README](https://github.com/dimitri/pgloader#build-from-sources), it's easy to do, and if patches are to be made to fix your bug, you're going to have to build from sources to get the fix anyway… - [ ] did you search for other similar issues? - [ ] how can I reproduce the bug? Incude a self-contained pgloader command file. If you're loading from a database, consider attaching a database dump to your issue. For MySQL, use `mysqldump`. For SQLite, just send over your source file, that's easy. Maybe be the one with your production data, of course, the one with just the sample of data that allows me to reproduce your bug. When using a proprietary database system as a source, consider creating a sample database on some Cloud service or somewhere you can then give me access to, and see my email address on my GitHub profile to send me the credentials. Still open a public issue for tracking and as documentation for other users. ``` -- -- EDIT THIS FILE TO MATCH YOUR BUG REPORT -- LOAD CSV FROM INLINE with encoding 'ascii' INTO postgresql:///pgloader TARGET TABLE jordane WITH truncate, fields terminated by '|', fields not enclosed, fields escaped by backslash-quote SET work_mem to '128MB', standard_conforming_strings to 'on' BEFORE LOAD DO $$ drop table if exists jordane; $$, $$ CREATE TABLE jordane ( "NOM" character(20), "PRENOM" character(20) ) $$; BORDET|Jordane BORDET|Audrey LASTNAME|"opening quote BONNIER|testprenombe~aucouptroplong JOURDAIN|héhé¶ ``` - [ ] pgloader output you obtain ``` PASTE HERE THE OUTPUT OF THE PGLOADER COMMAND ``` - [ ] data that is being loaded, if relevant ``` PASTE HERE THE DATA THAT HAS BEEN LOADED ``` - [ ] How the data is different from what you expected, if relevant pgloader-3.6.1/LICENSE000066400000000000000000000020111342135037700143220ustar00rootroot00000000000000pgloader Copyright (c) 2005-2017, The PostgreSQL Global Development Group Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.pgloader-3.6.1/Makefile000066400000000000000000000214231342135037700147650ustar00rootroot00000000000000# pgloader build tool APP_NAME = pgloader VERSION = 3.6.1 # use either sbcl or ccl CL = sbcl # default to 4096 MB of RAM size in the image DYNSIZE = 4096 LISP_SRC = $(wildcard src/*lisp) \ $(wildcard src/monkey/*lisp) \ $(wildcard src/utils/*lisp) \ $(wildcard src/load/*lisp) \ $(wildcard src/parsers/*lisp) \ $(wildcard src/pg-copy/*lisp) \ $(wildcard src/pgsql/*lisp) \ $(wildcard src/sources/*lisp) \ pgloader.asd BUILDDIR = build LIBS = $(BUILDDIR)/libs.stamp QLDIR = $(BUILDDIR)/quicklisp MANIFEST = $(BUILDDIR)/manifest.ql LATEST = $(BUILDDIR)/pgloader-latest.tgz BUNDLEDIST = 2019-01-07 BUNDLENAME = pgloader-bundle-$(VERSION) BUNDLEDIR = $(BUILDDIR)/bundle/$(BUNDLENAME) BUNDLE = $(BUILDDIR)/$(BUNDLENAME).tgz BUNDLETESTD= $(BUILDDIR)/bundle/test ifeq ($(OS),Windows_NT) EXE = .exe COMPRESS_CORE = no DYNSIZE = 1024 # support for windows 32 bits else EXE = endif PGLOADER = $(BUILDDIR)/bin/$(APP_NAME)$(EXE) BUILDAPP_CCL = $(BUILDDIR)/bin/buildapp.ccl$(EXE) BUILDAPP_SBCL = $(BUILDDIR)/bin/buildapp.sbcl$(EXE) ifeq ($(CL),sbcl) BUILDAPP = $(BUILDAPP_SBCL) BUILDAPP_OPTS = --require sb-posix \ --require sb-bsd-sockets \ --require sb-rotate-byte CL_OPTS = --noinform --no-sysinit --no-userinit else BUILDAPP = $(BUILDAPP_CCL) CL_OPTS = --no-init endif ifeq ($(CL),sbcl) COMPRESS_CORE ?= $(shell $(CL) --noinform \ --quit \ --eval '(when (member :sb-core-compression cl:*features*) (write-string "yes"))') endif # note: on Windows_NT, we never core-compress; see above. ifeq ($(COMPRESS_CORE),yes) COMPRESS_CORE_OPT = --compress-core endif DEBUILD_ROOT = /tmp/pgloader all: $(PGLOADER) clean: rm -rf $(LIBS) $(QLDIR) $(MANIFEST) $(BUILDAPP) $(PGLOADER) docs/_build $(QLDIR)/local-projects/qmynd: git clone --depth 1 https://github.com/qitab/qmynd.git $@ $(QLDIR)/local-projects/cl-ixf: git clone --depth 1 https://github.com/dimitri/cl-ixf.git $@ $(QLDIR)/local-projects/cl-db3: git clone --depth 1 https://github.com/dimitri/cl-db3.git $@ $(QLDIR)/local-projects/cl-csv: git clone --depth 1 https://github.com/AccelerationNet/cl-csv.git $@ $(QLDIR)/setup.lisp: mkdir -p $(BUILDDIR) curl -o $(BUILDDIR)/quicklisp.lisp http://beta.quicklisp.org/quicklisp.lisp $(CL) $(CL_OPTS) --load $(BUILDDIR)/quicklisp.lisp \ --load src/getenv.lisp \ --eval '(quicklisp-quickstart:install :path "$(BUILDDIR)/quicklisp" :proxy (getenv "http_proxy"))' \ --eval '(quit)' quicklisp: $(QLDIR)/setup.lisp ; clones: $(QLDIR)/local-projects/cl-ixf \ $(QLDIR)/local-projects/cl-db3 \ $(QLDIR)/local-projects/cl-csv \ $(QLDIR)/local-projects/qmynd ; $(LIBS): $(QLDIR)/setup.lisp $(CL) $(CL_OPTS) --load $(QLDIR)/setup.lisp \ --eval '(push :pgloader-image *features*)' \ --eval '(setf *print-circle* t *print-pretty* t)' \ --eval '(ql:quickload "pgloader")' \ --eval '(push "$(PWD)/" ql:*local-project-directories*)' \ --eval '(ql:quickload "pgloader")' \ --eval '(quit)' touch $@ libs: $(LIBS) ; $(MANIFEST): $(LIBS) $(CL) $(CL_OPTS) --load $(QLDIR)/setup.lisp \ --eval '(ql:write-asdf-manifest-file "$(MANIFEST)")' \ --eval '(quit)' manifest: $(MANIFEST) ; $(BUILDAPP_CCL): $(QLDIR)/setup.lisp mkdir -p $(BUILDDIR)/bin $(CL) $(CL_OPTS) --load $(QLDIR)/setup.lisp \ --eval '(ql:quickload "buildapp")' \ --eval '(buildapp:build-buildapp "$@")' \ --eval '(quit)' $(BUILDAPP_SBCL): 
$(QLDIR)/setup.lisp mkdir -p $(BUILDDIR)/bin $(CL) $(CL_OPTS) --load $(QLDIR)/setup.lisp \ --eval '(ql:quickload "buildapp")' \ --eval '(buildapp:build-buildapp "$@")' \ --eval '(quit)' buildapp: $(BUILDAPP) ; $(PGLOADER): $(MANIFEST) $(BUILDAPP) $(LISP_SRC) mkdir -p $(BUILDDIR)/bin $(BUILDAPP) --logfile /tmp/build.log \ $(BUILDAPP_OPTS) \ --sbcl $(CL) \ --asdf-path . \ --asdf-tree $(QLDIR)/local-projects \ --manifest-file $(MANIFEST) \ --asdf-tree $(QLDIR)/dists \ --asdf-path . \ --load-system cffi \ --load-system cl+ssl \ --load-system mssql \ --load src/hooks.lisp \ --load-system $(APP_NAME) \ --entry pgloader:main \ --dynamic-space-size $(DYNSIZE) \ $(COMPRESS_CORE_OPT) \ --output $@.tmp # that's ugly, but necessary when building on Windows :( mv $@.tmp $@ pgloader: $(PGLOADER) ; pgloader-standalone: $(BUILDAPP) $(BUILDAPP_OPTS) \ --sbcl $(CL) \ --load-system $(APP_NAME) \ --load src/hooks.lisp \ --entry pgloader:main \ --dynamic-space-size $(DYNSIZE) \ $(COMPRESS_CORE_OPT) \ --output $(PGLOADER) test: $(PGLOADER) $(MAKE) PGLOADER=$(realpath $(PGLOADER)) CL=$(CL) -C test regress save: ./src/save.lisp $(LISP_SRC) $(CL) $(CL_OPTS) --load ./src/save.lisp check-saved: save $(MAKE) PGLOADER=$(realpath $(PGLOADER)) CL=$(CL) -C test regress clean-bundle: rm -rf $(BUNDLEDIR) rm -rf $(BUNDLETESTD)/$(BUNDLENAME)/* $(BUNDLETESTD): mkdir -p $@ $(BUNDLEDIR): mkdir -p $@ $(CL) $(CL_OPTS) --load $(QLDIR)/setup.lisp \ --eval '(defvar *bundle-dir* "$@")' \ --eval '(defvar *pwd* "$(PWD)/")' \ --eval '(defvar *ql-dist* "$(BUNDLEDIST)")' \ --load bundle/ql.lisp $(BUNDLEDIR)/version.sexp: $(BUNDLEDIR) echo "\"$(VERSION)\"" > $@ $(BUNDLE): $(BUNDLEDIR) $(BUNDLEDIR)/version.sexp cp bundle/README.md $(BUNDLEDIR) cp bundle/save.lisp $(BUNDLEDIR) sed -e s/%VERSION%/$(VERSION)/ < bundle/Makefile > $(BUNDLEDIR)/Makefile git archive --format=tar --prefix=pgloader-$(VERSION)/ master \ | tar -C $(BUNDLEDIR)/local-projects/ -xf - make QLDIR=$(BUNDLEDIR) clones tar -C build/bundle \ --exclude bin \ --exclude test/sqlite \ -czf $@ $(BUNDLENAME) bundle: clean-bundle $(BUNDLE) $(BUNDLETESTD) tar -C $(BUNDLETESTD) -xf $(BUNDLE) make -C $(BUNDLETESTD)/$(BUNDLENAME) $(BUNDLETESTD)/$(BUNDLENAME)/bin/pgloader --version test-bundle: $(MAKE) -C $(BUNDLEDIR) test deb: # intended for use on a debian system mkdir -p $(DEBUILD_ROOT) && rm -rf $(DEBUILD_ROOT)/* rsync -Ca --exclude 'build' \ --exclude '.vagrant' \ ./ $(DEBUILD_ROOT)/ cd $(DEBUILD_ROOT) && make -f debian/rules orig cd $(DEBUILD_ROOT) && debuild -us -uc -sa cp -a /tmp/pgloader_* /tmp/cl-pgloader* build/ rpm: # intended for use on a CentOS or other RPM based system mkdir -p $(DEBUILD_ROOT) && rm -rf $(DEBUILD_ROOT) rsync -Ca --exclude=build/* ./ $(DEBUILD_ROOT)/ cd /tmp && tar czf $(HOME)/rpmbuild/SOURCES/pgloader-$(VERSION).tar.gz pgloader cd $(DEBUILD_ROOT) && rpmbuild -ba pgloader.spec cp -a $(HOME)/rpmbuild/SRPMS/*rpm build cp -a $(HOME)/rpmbuild/RPMS/x86_64/*rpm build pkg: # intended for use on a MacOSX system mkdir -p $(DEBUILD_ROOT) && rm -rf $(DEBUILD_ROOT)/* mkdir -p $(DEBUILD_ROOT)/usr/local/bin/ mkdir -p $(DEBUILD_ROOT)/usr/local/share/man/man1/ cp ./pgloader.1 $(DEBUILD_ROOT)/usr/local/share/man/man1/ cp ./build/bin/pgloader $(DEBUILD_ROOT)/usr/local/bin/ pkgbuild --identifier org.tapoueh.pgloader \ --root $(DEBUILD_ROOT) \ --version $(VERSION) \ ./build/pgloader-$(VERSION).pkg latest: git archive --format=tar --prefix=pgloader-$(VERSION)/ v$(VERSION) \ | gzip -9 > $(LATEST) check: test ; .PHONY: test pgloader-standalone docs bundle 
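As a quick reference sketch for the Makefile above, and assuming its default `BUILDDIR` (`build`) and `VERSION` (`3.6.1`), the main targets are used like this:

    $ make pgloader    # self-contained binary at ./build/bin/pgloader
    $ make test        # build, then run the regression suite under test/
    $ make bundle      # source bundle tarball at ./build/pgloader-bundle-3.6.1.tgz
    $ make latest      # source tarball of tag v3.6.1 at ./build/pgloader-latest.tgz

The `deb`, `rpm` and `pkg` targets are packaging helpers intended to be run on Debian, CentOS/RPM and macOS systems respectively, as noted in their comments.
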
pgloader-3.6.1/README.md

# PGLoader

[![Build Status](https://travis-ci.org/dimitri/pgloader.svg?branch=master)](https://travis-ci.org/dimitri/pgloader)
[![Join the chat at https://gitter.im/dimitri/pgloader](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dimitri/pgloader?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

pgloader is a data loading tool for PostgreSQL, using the `COPY` command.

Its main advantage over just using `COPY` or `\copy`, and over using a
*Foreign Data Wrapper*, is its transaction behaviour, where *pgloader* will
keep a separate file of rejected data, but continue trying to `copy` good
data into your database.

The default PostgreSQL behaviour is transactional, which means that *any*
erroneous line in the input data (file or remote database) will stop the
entire bulk load for the table.

pgloader also implements data reformatting, a typical example of that being
the transformation of MySQL datestamps `0000-00-00` and `0000-00-00 00:00:00`
to the PostgreSQL `NULL` value (because our calendar never had a *year zero*).

## Versioning

pgloader version 1.x is quite old and was developed in `TCL`. When faced
with maintaining that code, the new emerging development team (hi!) picked
`python` instead because that made sense at the time. So pgloader version
2.x was written in python.

The current version of pgloader is the 3.x series, which is written in
[Common Lisp](http://cliki.net/) for better development flexibility, runtime
performance, and support of real threading.

The versioning now follows the Emacs model, where any X.0 release number
means you're using a development version (alpha, beta, or release
candidate). The next stable versions are going to be `3.1` then `3.2` etc.

When using a development snapshot rather than a released version the
version number includes the git hash (in its abbreviated form):

  - `pgloader version "3.0.99"`

    Release candidate 9 for pgloader version 3.1, with a *git tag* named
    `v3.0.99` so that it's easy to check out the same sources as the
    released code.

  - `pgloader version "3.0.fecae2c"`

    Development snapshot against *git hash* `fecae2c`. It's possible to have
    the same sources on another setup by using the git command
    `git checkout fecae2c`.

  - `pgloader version "3.1.0"`

    Stable release.

## LICENCE

pgloader is available under
[The PostgreSQL Licence](http://www.postgresql.org/about/licence/).

## INSTALL

You can install pgloader directly from
[apt.postgresql.org](https://wiki.postgresql.org/wiki/Apt) and from official
debian repositories, see
[packages.debian.org/pgloader](https://packages.debian.org/search?keywords=pgloader).

    $ apt-get install pgloader

You can also use a **docker** image for pgloader:

    $ docker pull dimitri/pgloader
    $ docker run --rm --name pgloader dimitri/pgloader:latest pgloader --version
    $ docker run --rm --name pgloader dimitri/pgloader:latest pgloader --help

## Build from sources

pgloader is now a Common Lisp program, tested using the
[SBCL](http://sbcl.org/) (>= 1.2.5) and [Clozure CL](http://ccl.clozure.com/)
implementations with [Quicklisp](http://www.quicklisp.org/beta/).

When building from sources, you should always build from the current git
`HEAD` as it's basically the only source that is managed in a way to ensure
it builds against the current set of dependency versions.
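As a concrete sketch of that recommendation (assuming `git` is installed; the OS-specific package lists follow in the next sections), fetching the current `HEAD` before building looks like:

    $ git clone https://github.com/dimitri/pgloader.git
    $ cd pgloader
    $ make pgloader    # see the platform notes below for the packages this needs
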
### Building from sources on debian

    $ apt-get install sbcl unzip libsqlite3-dev make curl gawk freetds-dev libzip-dev
    $ cd /path/to/pgloader
    $ make pgloader
    $ ./build/bin/pgloader --help

### Building from sources on macOS

When using [brew](https://brew.sh), it should be a simple
`brew install --HEAD pgloader`.

When using [macports](https://www.macports.org), there is a situation to
deal with concerning the shared objects pgloader depends on, as reported in
issue #161:

> I was able to get a clean build without having to disable compression
> after symlinking /usr/local/lib to /opt/local/lib. Note that I did not
> have anything installed to /usr/local/lib so I didn't lose anything here.

### Building from sources on Windows

Building pgloader on Windows is supported, thanks to Common Lisp
implementations being available on that platform, and to the Common Lisp
Standard for making it easy to write actually portable code.

It is recommended to have a look at the issues labelled with *Windows
support* if you run into trouble when building pgloader.

### Building Docker image from sources

You can build a Docker image from source using SBCL by default:

    $ docker build .

Or Clozure CL (CCL):

    $ docker build -f Dockerfile.ccl .

## More options when building from source

The `Makefile` target `pgloader` knows how to produce a Self Contained
Binary file for pgloader, found at `./build/bin/pgloader`:

    $ make pgloader

By default, the `Makefile` uses [SBCL](http://sbcl.org/) to compile your
binary image, though it's possible to build using
[CCL](http://ccl.clozure.com/).

    $ make CL=ccl pgloader

If using `SBCL` and it supports core compression, the make process will use
it to generate a smaller binary. To force disabling core compression, you
may use:

    $ make COMPRESS_CORE=no pgloader

The `--compress-core` option is unique to SBCL, so it is not used when `CL`
is set to something other than `sbcl`.

You can also tweak the default amount of memory that the `pgloader` image
will allow itself to use when running through your data (don't ask for more
than your current RAM, though):

    $ make DYNSIZE=8192 pgloader

The `make pgloader` command, when successful, outputs a
`./build/bin/pgloader` file for you to use.

## Usage

You can either give a command file to pgloader or run it all from the
command line, see the
[pgloader quick start](https://pgloader.readthedocs.io/en/latest/tutorial/tutorial.html#pgloader-quick-start)
for more details.

    $ ./build/bin/pgloader --help
    $ ./build/bin/pgloader

For example, for a full migration from SQLite:

    $ createdb newdb
    $ pgloader ./test/sqlite/sqlite.db postgresql:///newdb

Or for a full migration from MySQL, including schema definition (tables,
indexes, foreign keys, comments) and parallel loading of the corrected
data:

    $ createdb pagila
    $ pgloader mysql://user@localhost/sakila postgresql:///pagila

See the documentation file `pgloader.1.md` for details. You can compile
that file into a manual page or an HTML page thanks to the `ronn`
application:

    $ apt-get install ruby-ronn
    $ make docs

pgloader-3.6.1/TODO.md

# TODO

Some notes about what I intend to be working on next. You can sponsor any
and all of those ideas if you actually need them today, and you can also
sponsor new ideas not on the list yet.

## New Features

### Filtering

Add commands to pick different target tables depending on the data found
when reading from the source.
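No such commands exist yet; as a stop-gap sketch (every table, column, database, and file name below is hypothetical), one can load everything into a single staging table with pgloader and then route rows to their target tables with plain SQL:

    $ pgloader ./orders.load    # hypothetical command file loading into a "staging" table
    $ psql -d orders <<'SQL'
    INSERT INTO orders_2013 SELECT * FROM staging WHERE extract(year FROM created_at) = 2013;
    INSERT INTO orders_2014 SELECT * FROM staging WHERE extract(year FROM created_at) = 2014;
    SQL
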
## Data Formats ### CSV - see about schema discovery (column names and types) ### JSON Propose to load JSON either in a "document" column, or to normalize it by applying some advanced filtering. Implement PostgreSQL JSON operators and functions in pgloader to help setup the normalisation steps: [PostgreSQL JSON Functions and Operators](http://www.postgresql.org/docs/9.3/interactive/functions-json.html). ### XML Add an XML reader to load XML documents into the database as a column value, and XSLT capabilities to normalize the XML contents into a proper relational model. ### Other databases Add support for full data and schema migrations for the following: - SQL Server - Sybase - Oracle ## User Interface, User Experience ### Improve parse error messages WIP, see https://github.com/nikodemus/esrap/issues/26 ### Graphical User Interface Most probably a web based tool, with guidance to setup the migration, maybe not even something very sophisticated, but making the simple cases way simpler. ## Database support ### MySQL Support - Convert SQL dialect for SQL views - Triggers and Stored Procedures ### SQLite support - implement CAST rules support ## Compat - add parsing for SQL*Loader file format ## Other ### error management - add input line number to log file? ### data output - PostgreSQL COPY Text format output for any supported input ### performances - some more parallelizing options - support for partitioning in pgloader itself ### UI - add a web controller with pretty monitoring - launch new jobs from the web controller ### crazy ideas - MySQL replication, reading from the binlog directly - plproxy (re-)sharding support - partitioning support - remote archiving support (with (delete returning *) insert into) pgloader-3.6.1/Vagrantfile000066400000000000000000000030141342135037700155060ustar00rootroot00000000000000# -*- mode: ruby -*- # vi: set ft=ruby : # Vagrantfile API/syntax version. Don't touch unless you know what you're doing! 
VAGRANTFILE_API_VERSION = "2" Vagrant.configure("2") do |config| config.vm.define "wheezy" do |wheezy| wheezy.vm.box = "wheezy64" config.vm.provision :file do |file| file.source = 'conf/gpg-agent.conf' file.destination = '/home/vagrant/.gnupg/gpg-agent.conf' end config.vm.provision :file do |file| file.source = 'conf/gpg.conf' file.destination = '/home/vagrant/.gnupg/gpg.conf' end config.vm.provision :file do |file| file.source = 'conf/devscripts' file.destination = '/home/vagrant/.devscripts' end config.vm.provision "shell" do |s| s.path = "bootstrap-debian.sh" s.privileged = false end config.vm.network :forwarded_port, guest: 4505, host: 4505 end config.vm.define "centos" do |centos| centos.vm.box = "CentOS64" config.vm.provision "shell" do |s| s.path = "bootstrap-centos.sh" s.privileged = false end end config.vm.define "w7" do |centos| centos.vm.box = "w7" config.vm.communicator = "winrm" config.vm.network :forwarded_port, guest: 5985, host: 5985, id: "winrm", auto_correct: true config.vm.network :forwarded_port, guest: 3389, host: 3389, id: "rdp", auto_correct: true config.vm.network :forwarded_port, guest: 1433, host: 1433, id: "mssql", auto_correct: true end end pgloader-3.6.1/bootstrap-centos.sh000066400000000000000000000012411342135037700171630ustar00rootroot00000000000000#!/usr/bin/env bash sudo yum -y install yum-utils rpmdevtools @"Development Tools" \ sqlite-devel zlib-devel # SBCL 1.3, we'll overwrite the repo version of sbcl with a more recent one sudo yum -y install epel-release sudo yum install -y sbcl.x86_64 --enablerepo=epel wget http://downloads.sourceforge.net/project/sbcl/sbcl/1.3.6/sbcl-1.3.6-source.tar.bz2 tar xfj sbcl-1.3.6-source.tar.bz2 cd sbcl-1.3.6 ./make.sh --with-sb-thread --with-sb-core-compression --prefix=/usr > /dev/null 2>&1 sudo sh install.sh cd # Missing dependencies sudo yum -y install freetds-devel # prepare the rpmbuild setup rpmdev-setuptree # pgloader #make -C /vagrant rpm pgloader-3.6.1/bootstrap-centos7.sh000066400000000000000000000006031342135037700172530ustar00rootroot00000000000000#!/usr/bin/env bash sudo yum -y install yum-utils rpmdevtools @"Development Tools" \ sqlite-devel zlib-devel # Enable epel for sbcl sudo yum -y install epel-release sudo yum -y install sbcl # Missing dependency sudo yum install freetds freetds-devel -y sudo ln -s /usr/lib64/libsybdb.so.5 /usr/lib64/libsybdb.so # prepare the rpmbuild setup rpmdev-setuptree pgloader-3.6.1/bootstrap-debian.sh000066400000000000000000000035451342135037700171230ustar00rootroot00000000000000#!/usr/bin/env bash if [ ! 
-f /etc/apt/sources.list.old ] then sudo mv /etc/apt/sources.list /etc/apt/sources.list.old sudo cp /vagrant/conf/sources.list /etc/apt/sources.list fi sudo apt-get update sudo apt-get dist-upgrade -y cat /vagrant/conf/bashrc.sh >> ~/.bashrc # PostgreSQL sidsrc=/etc/apt/sources.list.d/sid-src.list echo "deb-src http://ftp.fr.debian.org/debian/ sid main" | sudo tee $sidsrc pgdg=/etc/apt/sources.list.d/pgdg.list pgdgkey=https://www.postgresql.org/media/keys/ACCC4CF8.asc echo "deb http://apt.postgresql.org/pub/repos/apt/ wheezy-pgdg main" | sudo tee $pgdg wget --quiet -O - ${pgdgkey} | sudo apt-key add - # MariaDB sudo apt-get install -y python-software-properties sudo apt-key adv --recv-keys --keyserver keyserver.ubuntu.com 0xcbcb082a1bb943db sudo add-apt-repository 'deb http://mirrors.linsrv.net/mariadb/repo/10.0/debian wheezy main' sudo apt-get update sudo apt-get install -y postgresql-9.3 postgresql-contrib-9.3 \ postgresql-9.3-ip4r \ sbcl \ git patch unzip \ devscripts pandoc \ freetds-dev libsqlite3-dev \ gnupg gnupg-agent sudo DEBIAN_FRONTEND=noninteractive \ apt-get install -y --allow-unauthenticated mariadb-server # SBCL # # we used to need to backport SBCL, it's only the case now in wheezy, all # the later distributions are uptodate enough for our needs here. sudo apt-get -y install sbcl HBA=/etc/postgresql/9.3/main/pg_hba.conf echo "local all all trust" | sudo tee $HBA echo "host all all 127.0.0.1/32 trust" | sudo tee -a $HBA sudo pg_ctlcluster 9.3 main reload createuser -U postgres -SdR `whoami` make -C /vagrant pgloader make -C /vagrant test pgloader-3.6.1/build/000077500000000000000000000000001342135037700144225ustar00rootroot00000000000000pgloader-3.6.1/build/.gitignore000066400000000000000000000001231342135037700164060ustar00rootroot00000000000000# Ignore everything in this directory * # Except this file !bin !bundle !.gitignorepgloader-3.6.1/build/bin/000077500000000000000000000000001342135037700151725ustar00rootroot00000000000000pgloader-3.6.1/build/bin/.gitignore000066400000000000000000000001061342135037700171570ustar00rootroot00000000000000# Ignore everything in this directory * # Except this file !.gitignorepgloader-3.6.1/build/bundle/000077500000000000000000000000001342135037700156735ustar00rootroot00000000000000pgloader-3.6.1/build/bundle/.gitignore000066400000000000000000000001061342135037700176600ustar00rootroot00000000000000# Ignore everything in this directory * # Except this file !.gitignorepgloader-3.6.1/bundle/000077500000000000000000000000001342135037700145745ustar00rootroot00000000000000pgloader-3.6.1/bundle/Makefile000066400000000000000000000046131342135037700162400ustar00rootroot00000000000000# pgloader build tool for bundle tarball # only supports SBCL CL = sbcl APP_NAME = pgloader VERSION = %VERSION% ifeq ($(OS),Windows_NT) EXE = .exe COMPRESS_CORE = no DYNSIZE = 1024 # support for windows 32 bits else DYNSIZE = 4096 EXE = endif BUILDDIR = bin BUILDAPP = $(BUILDDIR)/buildapp$(EXE) PGLOADER = ./bin/pgloader SRCDIR = local-projects/pgloader-$(VERSION) BUILDAPP_OPTS = --require sb-posix \ --require sb-bsd-sockets \ --require sb-rotate-byte CL_OPTS = --noinform --no-sysinit --no-userinit COMPRESS_CORE ?= $(shell $(CL) --noinform \ --quit \ --eval '(when (member :sb-core-compression cl:*features*) (write-string "yes"))') ifeq ($(COMPRESS_CORE),yes) COMPRESS_CORE_OPT = --compress-core endif pgloader: $(PGLOADER) ; buildapp: $(BUILDAPP) ; $(BUILDAPP): mkdir -p $(BUILDDIR) $(CL) $(CL_OPTS) --load bundle.lisp \ --eval '(asdf:load-system :buildapp)' \ --eval 
'(buildapp:build-buildapp "$@")' \ --eval '(quit)' $(PGLOADER): $(BUILDAPP) $(BUILDAPP) --logfile /tmp/pgloader-bundle-build.log \ $(BUILDAPP_OPTS) \ --sbcl $(CL) \ --asdf-tree . \ --load-system cffi \ --load-system cl+ssl \ --load-system mssql \ --load $(SRCDIR)/src/hooks.lisp \ --load-system $(APP_NAME) \ --eval '(setf pgloader.params::*version-string* "$(VERSION)")' \ --entry pgloader:main \ --dynamic-space-size $(DYNSIZE) \ $(COMPRESS_CORE_OPT) \ --output $@.tmp # that's ugly, but necessary when building on Windows :( mv $@.tmp $@ test: $(PGLOADER) $(MAKE) PGLOADER=$(realpath $(PGLOADER)) -C $(SRCDIR)/test regress save: sbcl --no-userinit --load ./save.lisp check: test ; pgloader-3.6.1/bundle/README.md000066400000000000000000000016251342135037700160570ustar00rootroot00000000000000# pgloader source bundle In order to ease building pgloader for non-lisp users, the *bundle* distribution is a tarball containing pgloader and its build dependencies. See the the following documentation for more details: The *bundle* comes with a specific `Makefile` so that building it is as simple as the following (which includes testing the resulting binary): make LANG=en_US.UTF-8 make test The compilation might takes a while, it's because SBCL is trying hard to generate run-time binary code that is fast and efficient. Yes you need to be in a unicide environment to run the test suite, so that it matches with the encoding of the test *.load files. You can then package or use the pgloader binary: ./bin/pgloader --version ./bin/pgloader --help Note that the SQLite test files are not included in the bundle, for weithing too much here. pgloader-3.6.1/bundle/ql.lisp000066400000000000000000000021271342135037700161030ustar00rootroot00000000000000;;; ;;; Script used to prepare a pgloader bundle ;;; ;; fetch a list of recent candidates with ;; (subseq (ql-dist:available-versions (ql-dist:dist "quicklisp")) 0 5) ;; ;; the 2017-06-30 QL release is broken, avoid it. ;; (defvar *ql-dist* :latest) (defvar *ql-dist-url-format* "http://beta.quicklisp.org/dist/quicklisp/~a/distinfo.txt") (let ((pkgs (append '("pgloader" "buildapp") (getf (read-from-string (uiop:read-file-string (uiop:merge-pathnames* "pgloader.asd" *pwd*))) :depends-on))) (dist (if (or (eq :latest *ql-dist*) (string= "latest" *ql-dist*)) (cdr ;; available-versions is an alist of (date . url), and the ;; first one is the most recent one (first (ql-dist:available-versions (ql-dist:dist "quicklisp")))) (format nil *ql-dist-url-format* *ql-dist*)))) (ql-dist:install-dist dist :prompt nil :replace t) (ql:bundle-systems pkgs :to *bundle-dir*)) (quit) pgloader-3.6.1/bundle/save.lisp000066400000000000000000000032271342135037700164270ustar00rootroot00000000000000;;; ;;; Create a build/bin/pgloader executable from the source code, using ;;; Quicklisp to load pgloader and its dependencies. 
;;; (in-package #:cl-user) (require :asdf) ; should work in SBCL and CCL (let* ((cwd (uiop:getcwd)) (bundle.lisp (uiop:merge-pathnames* "bundle.lisp" cwd)) (version-file (uiop:merge-pathnames* "version.sexp" cwd)) (version-string (uiop:read-file-form version-file)) (asdf:*central-registry* (list cwd))) (format t "Loading bundle.lisp~%") (load bundle.lisp) (format t "Loading system pgloader ~a~%" version-string) (asdf:load-system :pgloader :verbose nil) (load (asdf:system-relative-pathname :pgloader "src/hooks.lisp")) (let* ((pgl (find-package "PGLOADER")) (version-symbol (find-symbol "*VERSION-STRING*" pgl))) (setf (symbol-value version-symbol) version-string))) (defun pgloader-image-main () (let ((argv #+sbcl sb-ext:*posix-argv* #+ccl ccl:*command-line-argument-list*)) (pgloader::main argv))) (let* ((cwd (uiop:getcwd)) (bin-dir (uiop:merge-pathnames* "bin/" cwd)) (bin-filename (uiop:merge-pathnames* "pgloader" bin-dir))) (ensure-directories-exist bin-dir) #+ccl (ccl:save-application bin-filename :toplevel-function #'cl-user::pgloader-image-main :prepend-kernel t) #+sbcl (sb-ext:save-lisp-and-die bin-filename :toplevel #'cl-user::pgloader-image-main :executable t :save-runtime-options t :compression t)) pgloader-3.6.1/conf/000077500000000000000000000000001342135037700142505ustar00rootroot00000000000000pgloader-3.6.1/conf/bashrc.sh000066400000000000000000000002631342135037700160470ustar00rootroot00000000000000 # GnuPG Agent export GPG_TTY=`tty` eval "$(gpg-agent --daemon)" # DEBIAN export DEBEMAIL="dim@tapoueh.org" export DEBFULLNAME="Dimitri Fontaine" export DEBSIGN_KEYID="60B1CB4E" pgloader-3.6.1/conf/devscripts000066400000000000000000000001231342135037700163550ustar00rootroot00000000000000DEBEMAIL="dim@tapoueh.org" DEBFULLNAME="Dimitri Fontaine" DEBSIGN_KEYID="60B1CB4E" pgloader-3.6.1/conf/gpg-agent.conf000066400000000000000000000001321342135037700167640ustar00rootroot00000000000000# Keyboard control no-grab # PIN entry program pinentry-program /usr/bin/pinentry-curses pgloader-3.6.1/conf/gpg.conf000066400000000000000000000003421342135037700156730ustar00rootroot00000000000000default-key 60B1CB4E keyserver hkp://keys.gnupg.net use-agent personal-digest-preferences SHA512 cert-digest-algo SHA512 default-preference-list SHA512 SHA384 SHA256 SHA224 AES256 AES192 AES CAST5 ZLIB BZIP2 ZIP Uncompressed pgloader-3.6.1/conf/sources.list000066400000000000000000000004231342135037700166270ustar00rootroot00000000000000deb http://ftp.fr.debian.org/debian sid main non-free contrib deb-src http://ftp.fr.debian.org/debian sid main non-free contrib # deb http://security.debian.org/ wheezy/updates main contrib non-free # deb-src http://security.debian.org/ wheezy/updates main contrib non-free pgloader-3.6.1/debian/000077500000000000000000000000001342135037700145455ustar00rootroot00000000000000pgloader-3.6.1/debian/changelog000066400000000000000000000262661342135037700164330ustar00rootroot00000000000000pgloader (3.5.2-3) UNRELEASED; urgency=medium * Make cl-pgloader test depend on ca-certificates so the snakeoil certificate is recognized as a valid CA. (Needs the /etc/ssl/certs/*.0 file.) -- Christoph Berg Thu, 05 Jul 2018 19:04:08 +0200 pgloader (3.5.2-2) unstable; urgency=medium * Install pgloader.asd into correct location. (Closes: #857226) * Test cl-pgloader through sbcl --eval. * Skip building and manpage generation in arch-indep builds. -- Christoph Berg Tue, 03 Jul 2018 22:51:48 +0200 pgloader (3.5.2-1) unstable; urgency=medium * New upstream version. 
* All included test data has been verified as free, stop building a +dfsg tarball. * debian/source/options: Ignore changes in src/params.lisp (release vs non-release). * Enable SSL in src/hooks.lisp. * Run wrap-and-sort -st. * Add new B-D cl-mustache, cl-yason, cl-zs3, sync Depends to cl-pgloader. * Depend on the libssl version cl-plus-ssl depends on. (Closes: #864309) * Build and install new sphinx docs instead. * Build manpage using help2man. * Priority: optional, move cl-pgloader to Section: lisp. * Update S-V. * Add watch file. -- Christoph Berg Tue, 03 Jul 2018 16:59:07 +0200 pgloader (3.4.1+dfsg-1) unstable; urgency=medium * New release, bugfixes and new features -- Dimitri Fontaine Thu, 06 Jul 2017 16:51:53 +0300 pgloader (3.3.2+dfsg-1) unstable; urgency=medium * Fixes github issue 453 (Closes: #843555) * Maintenance release. -- Dimitri Fontaine Sat, 03 Dec 2016 19:36:56 +0300 pgloader (3.3.1+dfsg-2) unstable; urgency=medium * Add tzdata to build-depends (Closes: #839468) -- Christoph Berg Thu, 03 Nov 2016 14:32:28 +0100 pgloader (3.3.1+dfsg-1) unstable; urgency=medium * New release, bugfixes and new features -- Dimitri Fontaine Sun, 28 Aug 2016 21:07:47 +0300 pgloader (3.2.2+dfsg-1) unstable; urgency=medium * New release, lots of bugfixes, some new features * New build dependencies -- Dimitri Fontaine Thu, 03 Sep 2015 19:17:12 +0300 pgloader (3.2.1~preview+dfsg-2) unstable; urgency=medium * Interim bugfix release -- Dimitri Fontaine Thu, 22 Jan 2015 04:06:51 +0400 pgloader (3.2.0+dfsg-1) unstable; urgency=medium * Implement COPY files support * Implement MS SQL source database support * Lots of bug fixes * Full command line operations support * Misc improvements, cleanup, refactoring -- Dimitri Fontaine Thu, 15 Jan 2015 19:51:02 +0300 pgloader (3.1.1+dfsg-1) unstable; urgency=medium * Fix --root-dir option when target directory doesn't exists (Closes:#767288) * New release, lots of bugfixes * Add support for CAST rules for SQLite * Code refactoring * New file based sources filters and options -- Dimitri Fontaine Mon, 03 Nov 2014 16:08:19 +0300 pgloader (3.1.0+dfsg-3) unstable; urgency=medium * Fix build dependencies problems, see Bug#765162 -- Dimitri Fontaine Tue, 28 Oct 2014 12:41:23 +0300 pgloader (3.1.0+dfsg-2) unstable; urgency=medium * Fix building on i386 machines, using 1GB of dynamic space there. -- Dimitri Fontaine Mon, 22 Sep 2014 23:04:18 +0400 pgloader (3.1.0+dfsg-1) unstable; urgency=medium * Stable release of the new pgloader version, 3.1 -- Dimitri Fontaine Wed, 10 Sep 2014 16:48:11 +0400 pgloader (3.0.99-1) unstable; urgency=medium * Release Candidate 9 * Can be built against either CCL or SBCL * Lots of bug fixes * Smarted batch memory usage -- Dimitri Fontaine Tue, 29 Apr 2014 13:50:26 +0400 pgloader (3.0.98-1) unstable; urgency=low * Release Candidate 8 * Fixes retry behavior * Assorted fixes -- Dimitri Fontaine Thu, 23 Jan 2014 03:19:07 +0400 pgloader (3.0.97-1) unstable; urgency=low * Release Candidate 7 * Fix log-filename location * Switch to the new cl-csv version * Add a documentation website with tutorials * Fix ASDF dependencies -- Dimitri Fontaine Thu, 15 Jan 2014 01:33:16 +0400 pgloader (3.0.96-1) unstable; urgency=low * Release Candidate 6 * Document then rework the batch retry behavior. * Various bug fixes, including CCL compatibility. -- Dimitri Fontaine Fri, 27 Dec 2013 15:11:02 +0400 pgloader (3.0.95-1) unstable; urgency=low * Release Candidate 5. * Minimize memory usage, bug fixes. 
-- Dimitri Fontaine Wed, 18 Dec 2013 23:32:33 +0100 pgloader (3.0.94-1) unstable; urgency=low * Release Candidate 4. -- Dimitri Fontaine Mon, 09 Dec 2013 14:58:51 +0400 pgloader (2.3.3~dev3-1.1) unstable; urgency=low * Non-maintainer upload. * Don't hardcode dependency on python-support, use ${python:Depends} (closes: #542052). Thanks to Faheem Mitha for the bug report. -- Jakub Wilk Fri, 06 Jan 2012 02:19:25 +0100 pgloader (2.3.3~dev3-1) unstable; urgency=low * Implement -f --field-sep to overwrite the default from command line * Add support for filename arguments, which use defaults * Implement --reject-log and --reject-data * Add support for --max-parallel-sections and --section-threads * Support setting any PG option (-o and config file) * Have --debug show a traceback * Fix a bug where pgloader would freeze on early error (no such file) * Implement an option to set csv field size limit * Implement --load-from-stdin * Implement --boundaries * use gettempdir() rather than hard-coded "/tmp" * Handle C-c the default system's way * Fix pgloader thread error management * Fix exit status to be non-zero in case of known errors -- Dimitri Fontaine Tue, 09 Nov 2010 16:10:01 +0100 pgloader (2.3.2-1) unstable; urgency=low * Implement --from support in all readers (Closes: #531034) * Use psycopg cursor.copy_expert() when available (> 2.0.6) * FIX fixedreader: it now know about -C * FIX Round Robin Reader with respect to offsets in readlines() * support python 2.3 if not using RRR (not importing collections.deque) * change logger initialisation to support python 2.3 * FIX bad usage of STDERR in the code * Implement skip_head_lines option in configuration (superseded by -F) * Do not sort() section list when it's been given on command line * Catch InterfaceError when trying to close connection -- Dimitri Fontaine Mon, 29 Jun 2009 09:39:33 +0200 pgloader (2.3.1-2) unstable; urgency=low * FIX Missing Build-Depends (Closes: #485067) -- Dimitri Fontaine Mon, 09 Jun 2008 15:23:56 +0200 pgloader (2.3.1-1) unstable; urgency=low * FIX: Add database opening into critical path (BoundedSemaphore) * FIX: close database connection as soon as possible * Add support for fixed format * Add support for escaped field_sep, such as \t -- Dimitri Fontaine Wed, 21 May 2008 12:19:42 +0200 pgloader (2.3.0-1) unstable; urgency=low * FIX the cluttered test case, see BUGS.txt * Better release it as stable now and fix bugs when found than wait for more testing of the ~dev release candidates. 
-- Dimitri Fontaine Mon, 10 Mar 2008 15:36:04 +0100 pgloader (2.3.0~dev3-1) experimental; urgency=low * Add options for forcing psycopg version to use (-1, -2, --psycopg-version) -- Dimitri Fontaine Wed, 27 Feb 2008 12:54:46 +0100 pgloader (2.3.0~dev2-1) experimental; urgency=low * columns = * is now supported -- Dimitri Fontaine Mon, 25 Feb 2008 14:58:46 +0100 pgloader (2.3.0~dev-1) unstable; urgency=low * Mutli-threaded pgloader (see options max_parallel_sections, section_threads and split_file_reading) * FIX for -C and -I options (replace sys.log with self.log), per user request -- Dimitri Fontaine Mon, 11 Feb 2008 15:04:40 +0100 pgloader (2.2.6-1) unstable; urgency=low * pgloader -V now VACUUM each table separately, no more vacuumdb issued * New option -D to DISABLE Triggers while loading (ENABLE them one done) -- Dimitri Fontaine Fri, 01 Feb 2008 11:01:34 +0100 pgloader (2.2.5-1) unstable; urgency=low * Now using proper python logging module * New client_min_messages, log_min_messages, log_file and lc_messages options * Better reporting of DatabaseError (pkey violation, e.g.) * Have logging of error data work again in all cases (DatabaseError) * Protect some more settings from begin overwritten when using templates -- Dimitri Fontaine Fri, 07 Dec 2007 23:24:58 +0100 pgloader (2.2.5~dev-1) unstable; urgency=low * Configuration now supports templates * Command line option for setting --reformat_path, -R * Upload to Debian (Closes: #453434) -- Dimitri Fontaine Mon, 26 Nov 2007 21:53:11 +0100 pgloader (2.2.4) unstable; urgency=low * Reformat modules to change input on-the-fly * mysql module with timestamp function provided * some fixes, all tests green again -- Dimitri Fontaine Tue, 20 Nov 2007 16:48:19 +0100 pgloader (2.2.3) unstable; urgency=low * User Defined Columns * Temporary files with copy data content now suffixed .pgloader * New option --version * Fix TextReader newline_escapes configuration option reading * Fix Reader reject initialisation * Skip database related settings when in DRY_RUN mode (-n) * List all command line options from man page synopsis -- Dimitri Fontaine Wed, 14 Nov 2007 21:57:39 +0100 pgloader (2.2.2) unstable; urgency=low * New command line options --quiet and --summary (-qs for short) -- Dimitri Fontaine Sat, 20 Oct 2007 16:20:18 +0200 pgloader (2.2.1) unstable; urgency=low * Support for datestyle setting * Support for omiting column numbering * Change documentation source format from SGML to asciidoc -- Dimitri Fontaine Thu, 23 Aug 2007 12:35:34 +0200 pgloader (2.2.0) unstable; urgency=low * Support for partial loading of data (subrange(s) of columns) * COPY table (col1, col2, ..., coln) systematically used * Support for CSV format (with quoting) -- Dimitri Fontaine Mon, 04 Jun 2007 11:13:21 +0200 pgloader (2.1.0) unstable; urgency=low * Added support for partial COPY table definition * Documentation and example update (see serial) -- Dimitri Fontaine Fri, 19 Jan 2007 12:25:39 +0100 pgloader (2.0.2) unstable; urgency=low * configurable null and empty_string representations * bugfix on newline_escapes behavior when all lines are not escaped * new global newline_escapes setting * uses by default psycopg2, failback to psycopg1 if not available * client_encoding can now be set on each table * documentation (manpage) update -- Dimitri Fontaine Wed, 15 Nov 2006 22:26:46 +0100 pgloader (2.0.1-2) unstable; urgency=low * package cleaning (lintian warnings and error) -- Dimitri Fontaine Tue, 14 Nov 2006 18:14:57 +0100 pgloader (2.0.1-1) unstable; urgency=low * 
Initial release -- Dimitri Fontaine Mon, 13 Nov 2006 22:56:15 +0100 pgloader-3.6.1/debian/cl-pgloader.dirs000066400000000000000000000001041342135037700176140ustar00rootroot00000000000000usr/share/common-lisp/source/pgloader usr/share/common-lisp/systems pgloader-3.6.1/debian/cl-pgloader.install000066400000000000000000000002211342135037700203210ustar00rootroot00000000000000pgloader.asd usr/share/common-lisp/source/pgloader pgloader.lisp usr/share/common-lisp/source/pgloader src usr/share/common-lisp/source/pgloader pgloader-3.6.1/debian/cl-pgloader.links000066400000000000000000000001361342135037700200000ustar00rootroot00000000000000usr/share/common-lisp/source/pgloader/pgloader.asd usr/share/common-lisp/systems/pgloader.asd pgloader-3.6.1/debian/compat000066400000000000000000000000021342135037700157430ustar00rootroot000000000000009 pgloader-3.6.1/debian/control000066400000000000000000000061501342135037700161520ustar00rootroot00000000000000Source: pgloader Section: database Priority: optional Maintainer: Dimitri Fontaine Uploaders: Christoph Berg Build-Depends: buildapp (>= 1.5), cl-abnf, cl-alexandria, cl-asdf (>= 3.0.3), cl-asdf-finalizers, cl-asdf-system-connections, cl-bordeaux-threads (>= 0.8.3), cl-cffi (>= 1:0.12.0), cl-command-line-arguments, cl-csv, cl-db3, cl-drakma, cl-esrap, cl-fad, cl-flexi-streams, cl-interpol, cl-ixf, cl-local-time, cl-log, cl-lparallel, cl-markdown, cl-md5, cl-metabang-bind, cl-mssql, cl-mustache, cl-postmodern, cl-ppcre, cl-py-configparser, cl-qmynd, cl-quri, cl-simple-date, cl-split-sequence, cl-sqlite, cl-trivial-backtrace, cl-trivial-utf-8, cl-unicode, cl-usocket, cl-utilities, cl-uuid, cl-yason, cl-zs3, debhelper (>= 9.0.0), gawk, help2man, python3-sphinx, sbcl (>= 1.1.13), tzdata, Standards-Version: 4.1.4 Homepage: https://github.com/dimitri/pgloader Vcs-Git: https://github.com/dimitri/pgloader.git Vcs-Browser: https://github.com/dimitri/pgloader Package: pgloader Architecture: any Depends: freetds-dev, ${misc:Depends}, ${shlibs:Depends}, ${ssl:Depends} Description: extract, transform and load data into PostgreSQL pgloader imports data from different kind of sources and COPY it into PostgreSQL. . The command language is described in the manual page and allows one to describe where to find the data source, its format, and to describe data processing and transformation. . Supported source formats include CSV, fixed width flat files, dBase3 files (DBF), and SQLite and MySQL databases. In most of those formats, pgloader is able to auto-discover the schema and create the tables and the indexes in PostgreSQL. In the MySQL case it's possible to edit CASTing rules from the pgloader command directly. Package: cl-pgloader Section: lisp Architecture: all Depends: cl-abnf, cl-alexandria, cl-asdf (>= 3.0.3), cl-asdf-finalizers, cl-asdf-system-connections, cl-bordeaux-threads (>= 0.8.3), cl-cffi (>= 1:0.12.0), cl-command-line-arguments, cl-csv, cl-db3, cl-drakma, cl-esrap, cl-fad, cl-flexi-streams, cl-interpol, cl-ixf, cl-local-time, cl-log, cl-lparallel, cl-markdown, cl-md5, cl-metabang-bind, cl-mssql, cl-mustache, cl-postmodern, cl-ppcre, cl-py-configparser, cl-qmynd, cl-quri, cl-simple-date, cl-split-sequence, cl-sqlite, cl-trivial-backtrace, cl-trivial-utf-8, cl-unicode, cl-usocket, cl-utilities, cl-uuid, cl-yason, cl-zs3, ${misc:Depends}, Description: extract, transform and load data into PostgreSQL pgloader imports data from different kind of sources and COPY it into PostgreSQL. . 
The command language is described in the manual page and allows one to describe where to find the data source, its format, and to describe data processing and transformation. . Supported source formats include CSV, fixed width flat files, dBase3 files (DBF), and SQLite and MySQL databases. In most of those formats, pgloader is able to auto-discover the schema and create the tables and the indexes in PostgreSQL. In the MySQL case it's possible to edit CASTing rules from the pgloader command directly. pgloader-3.6.1/debian/copyright000066400000000000000000000117741342135037700165120ustar00rootroot00000000000000Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: pgloader Source: https://github.com/dimitri/pgloader Files: * Copyright: 2013 Dimitri Fontaine License: PostgreSQL Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. . IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. . THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. Files: test/sqlite/Chinook* Copyright: Copyright (c) 2008-2017 Luis Rocha License: MIT Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: . The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Files: test/data/2013_Gaz_113CDs_national.txt Copyright: public domain License: us-public-domain All U.S. Census Bureau materials, regardless of the media, are entirely in the public domain. There are no user fees, site licenses, or any special agreements etc for the public or private use, and or reuse of any census title. As tax funded product, it's all in the public record. Files: test/data/reg2013.dbf Copyright: public comain License: fr-public-domain Les publications et données mises à disposition sur le présent site sont consultables et téléchargeables gratuitement. 
Sauf spécification contraire, elles peuvent être réutilisées, y compris à des fins commerciales, sans licence et sans versement de redevances autres que celles collectées par les sociétés de perception et de répartition des droits d'auteur régies par le titre II du livre III du code de la propriété intellectuelle. La réutilisation est toutefois subordonnée au respect de l'intégrité de l'information et des données et à la mention précise des sources. . https://www.insee.fr/fr/information/2008466 Files: test/data/sakila-db.zip Copyright: Copyright © 2007, 2018, Oracle and/or its affiliates. All rights reserved. License: new-bsd-license The contents of the sakila-schema.sql and sakila-data.sql files are licensed under the New BSD license. . Information on the New BSD license can be found at http://www.opensource.org/licenses/bsd-license.php and http://en.wikipedia.org/wiki/BSD_License. . Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: . 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. . 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. . THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pgloader-3.6.1/debian/patches/000077500000000000000000000000001342135037700161745ustar00rootroot00000000000000pgloader-3.6.1/debian/patches/enable-ssl000066400000000000000000000010321342135037700201400ustar00rootroot00000000000000In d4737a39ca8702e32ad0a47941f93fef2872966f SSL loading automatically was disabled for general use, but we want to have it in the Debian package. --- a/src/hooks.lisp +++ b/src/hooks.lisp @@ -30,10 +30,8 @@ ;; handles some context and things around loading with CFFI. 
(cl+ssl:reload))) -#| #+ccl (push #'open-foreign-libs *lisp-startup-functions*) #+sbcl (push #'open-foreign-libs sb-ext:*init-hooks*) -|# #+ccl (push #'close-foreign-libs *save-exit-functions*) #+sbcl (push #'close-foreign-libs sb-ext:*save-hooks*) pgloader-3.6.1/debian/patches/series000066400000000000000000000000131342135037700174030ustar00rootroot00000000000000enable-ssl pgloader-3.6.1/debian/pgloader.docs000066400000000000000000000000331342135037700172100ustar00rootroot00000000000000README.md docs/_build/html pgloader-3.6.1/debian/pgloader.install000066400000000000000000000000341342135037700177270ustar00rootroot00000000000000build/bin/pgloader /usr/bin pgloader-3.6.1/debian/rules000077500000000000000000000031411342135037700156240ustar00rootroot00000000000000#!/usr/bin/make -f include /usr/share/dpkg/pkg-info.mk # make pgloader depend on the libssl package cl-plus-ssl depends on LIBSSL := $(shell dpkg-query --showformat='$${Depends}' --show cl-plus-ssl | grep -o 'libssl[^ ]*') BITS = $(shell dpkg-architecture -qDEB_BUILD_ARCH_BITS) ifeq ($(BITS),32) SIZE=1024 else SIZE=4096 endif # buildd provides a build environment where $HOME is not writable, but the # CL compilers here will need to fill-in a per-user cache export HOME = $(CURDIR)/debian/home override_dh_auto_clean: dh_auto_clean rm -rf debian/home override_dh_auto_build-indep: # do nothing override_dh_auto_build-arch: mkdir -p build/bin mkdir -p $(HOME) buildapp --require sb-posix \ --require sb-bsd-sockets \ --load /usr/share/common-lisp/source/cl-asdf/build/asdf.lisp \ --asdf-path . \ --asdf-tree /usr/share/common-lisp/systems \ --load-system asdf-finalizers \ --load-system asdf-system-connections \ --load-system pgloader \ --load src/hooks.lisp \ --entry pgloader:main \ --dynamic-space-size $(SIZE) \ --compress-core \ --output build/bin/pgloader $(MAKE) -C docs html override_dh_auto_test: # do nothing override_dh_strip: # do nothing override_dh_installman-arch: mkdir -p debian/pgloader/usr/share/man/man1/ PATH=debian/pgloader/usr/bin:$(PATH) \ help2man --version-string $(DEB_VERSION_UPSTREAM) \ --no-info \ --name "extract, transform and load data into PostgreSQL" \ pgloader > \ debian/pgloader/usr/share/man/man1/pgloader.1 override_dh_gencontrol: dh_gencontrol -- -V"ssl:Depends=$(LIBSSL)" %: dh $@ pgloader-3.6.1/debian/source/000077500000000000000000000000001342135037700160455ustar00rootroot00000000000000pgloader-3.6.1/debian/source/format000066400000000000000000000000141342135037700172530ustar00rootroot000000000000003.0 (quilt) pgloader-3.6.1/debian/source/lintian-overrides000066400000000000000000000001751342135037700214310ustar00rootroot00000000000000# don't bug people uploading from @work source: changelog-should-mention-nmu source: source-nmu-has-incorrect-version-number pgloader-3.6.1/debian/source/options000066400000000000000000000001071342135037700174610ustar00rootroot00000000000000# ignore release/non-release status extend-diff-ignore=src/params.lisp pgloader-3.6.1/debian/tests/000077500000000000000000000000001342135037700157075ustar00rootroot00000000000000pgloader-3.6.1/debian/tests/cl-pgloader000077500000000000000000000006551342135037700200340ustar00rootroot00000000000000#!/bin/sh set -eux trap "rm -rf /tmp/pgloader debian/home" 0 2 3 15 pg_virtualenv <<-EOF set -eux HOME=$PWD/debian/home createdb pgloader sbcl --eval '(require :asdf)' \ --eval '(setf *compile-print* nil *compile-verbose* nil *load-verbose* nil *load-print* nil asdf:*asdf-verbose* nil)' \ --eval '(asdf:load-system "pgloader")' \ --eval '(pgloader::main 
SB-EXT:*POSIX-ARGV*)' \ test/csv.load 2> /dev/null EOF pgloader-3.6.1/debian/tests/control000066400000000000000000000002671342135037700173170ustar00rootroot00000000000000Depends: pgloader, ca-certificates, postgresql Tests: ssl Restrictions: allow-stderr, needs-root Depends: cl-pgloader, postgresql, sbcl Tests: cl-pgloader Restrictions: allow-stderr pgloader-3.6.1/debian/tests/ssl000077500000000000000000000011151342135037700164340ustar00rootroot00000000000000#!/bin/sh # test needs root so we have a SSL certificate set -eux trap "rm -rf /tmp/pgloader" 0 2 3 15 pg_virtualenv <<-'EOF' set -eux # force SSL connection HBA=$(psql -XAtc 'SHOW hba_file') sed -i -e 's/^host/hostssl/' $HBA psql -XAtc 'SELECT pg_reload_conf()' createdb pgloader export PGDATABASE=pgloader psql -XAtc 'create schema expected' # test UNIX socket rm -rf /tmp/pgloader PGHOST=/var/run/postgresql su -c 'pgloader --regress test/allcols.load' postgres # test SSL connection rm -rf /tmp/pgloader PGSSLMODE=require pgloader --regress test/allcols.load EOF pgloader-3.6.1/debian/watch000066400000000000000000000001071342135037700155740ustar00rootroot00000000000000version=4 https://github.com/dimitri/pgloader/releases .*/v(.*).tar.gz pgloader-3.6.1/docs/000077500000000000000000000000001342135037700142535ustar00rootroot00000000000000pgloader-3.6.1/docs/CNAME000066400000000000000000000000141342135037700150140ustar00rootroot00000000000000pgloader.orgpgloader-3.6.1/docs/Makefile000066400000000000000000000011351342135037700157130ustar00rootroot00000000000000# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = pgloader SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)pgloader-3.6.1/docs/bugreport.rst000066400000000000000000000024461342135037700170240ustar00rootroot00000000000000Reporting Bugs ============== pgloader is a software and as such contains bugs. Most bugs are easy to solve and taken care of in a short delay. 
For this to be possible though, bug reports need to follow those recommandations: - include pgloader version, - include problematic input and output, - include a description of the output you expected, - explain the difference between the ouput you have and the one you expected, - include a self-reproducing test-case Test Cases to Reproduce Bugs ---------------------------- Use the *inline* source type to help reproduce a bug, as in the pgloader tests:: LOAD CSV FROM INLINE INTO postgresql://dim@localhost/pgloader?public."HS" WITH truncate, fields terminated by '\t', fields not enclosed, fields escaped by backslash-quote, quote identifiers SET work_mem to '128MB', standard_conforming_strings to 'on', application_name to 'my app name' BEFORE LOAD DO $$ create extension if not exists hstore; $$, $$ drop table if exists "HS"; $$, $$ CREATE TABLE "HS" ( id serial primary key, kv hstore ) $$; 1 email=>foo@example.com,a=>b 2 test=>value 3 a=>b,c=>"quoted hstore value",d=>other 4 baddata pgloader-3.6.1/docs/conf.py000066400000000000000000000124011342135037700155500ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # pgloader documentation build configuration file, created by # sphinx-quickstart on Tue Dec 5 19:23:32 2017. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['sphinx.ext.githubpages'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = '.rst' source_suffix = ['.rst', '.md'] # The master toctree document. master_doc = 'index' # General information about the project. project = 'pgloader' copyright = '2017, Dimitri Fontaine' author = 'Dimitri Fontaine' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = '3.4' # The full version, including alpha/beta/rc tags. release = '3.4.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # The name of the Pygments (syntax highlighting) style to use. 
pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # #html_theme = 'alabaster' html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} html_theme_options = { 'github_user': 'dimitri', 'github_repo': 'pgloader', 'description': 'your migration companion', 'travis_button': True, 'show_related': True, #'sidebar_collapse': False, } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # This is required for the alabaster theme # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars html_sidebars = { '**': [ 'relations.html', # needs 'show_related': True theme option to display 'searchbox.html', ] } # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'pgloaderdoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'pgloader.tex', 'pgloader Documentation', 'Dimitri Fontaine', 'manual'), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'pgloader', 'pgloader Documentation', [author], 1) ] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'pgloader', 'pgloader Documentation', author, 'pgloader', 'One line description of project.', 'Miscellaneous'), ] pgloader-3.6.1/docs/index.rst000066400000000000000000000265061342135037700161250ustar00rootroot00000000000000.. pgloader documentation master file, created by sphinx-quickstart on Tue Dec 5 19:23:32 2017. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to pgloader's documentation! ==================================== pgloader loads data from various sources into PostgreSQL. It can transform the data it reads on the fly and submit raw SQL before and after the loading. It uses the `COPY` PostgreSQL protocol to stream the data into the server, and manages errors by filling a pair of *reject.dat* and *reject.log* files. 
Thanks to being able to load data directly from a database source, pgloader
also supports migrations from other products to PostgreSQL. In this mode of
operation, pgloader handles both the schema and data parts of the migration,
in a single unattended command, allowing to implement **Continuous
Migration**.

Features Overview
=================

pgloader has two modes of operation: loading from files, migrating
databases. In both cases, pgloader uses the PostgreSQL COPY protocol which
implements **streaming** to send data in a very efficient way.

Loading file content in PostgreSQL
----------------------------------

When loading from files, pgloader implements the following features:

Many source formats supported
    Support for a wide variety of file based formats is included in
    pgloader: the CSV family, fixed columns formats, dBase files (``db3``),
    and IBM IXF files.

    The SQLite database engine is accounted for in the next section:
    pgloader considers SQLite as a database source and implements schema
    discovery from SQLite catalogs.

On the fly data transformation
    Often enough the data as read from a CSV file (or another format) needs
    some tweaking and clean-up before being sent to PostgreSQL.

    For instance in the `geolite `_ example we can see that integer values
    are being rewritten as IP address ranges, allowing to target an
    ``ip4r`` column directly.

Full Field projections
    pgloader supports loading data into fewer fields than found in the
    file, or more, doing some computation on the data read before sending
    it to PostgreSQL.

Reading files from an archive
    Archive formats *zip*, *tar*, and *gzip* are supported by pgloader: the
    archive is extracted into a temporary directory and the expanded files
    are then loaded.

HTTP(S) support
    pgloader knows how to download a source file or a source archive using
    HTTP directly. It might be better to use ``curl -O- http://... | pgloader``
    and read the data from *standard input*, thus allowing for streaming of
    the data from its source down to PostgreSQL.

Target schema discovery
    When loading into an existing table, pgloader takes into account the
    existing columns and may automatically guess the CSV format for you.

On error stop / On error resume next
    In some cases the source data is so damaged as to be impossible to
    migrate in full, and when loading from a file the default for pgloader
    is to use the ``on error resume next`` option, where the rows rejected
    by PostgreSQL are saved away and the migration continues with the other
    rows.

    In other cases loading only a part of the input data might not be a
    great idea, and in such cases it's possible to use the
    ``on error stop`` option.

Pre/Post SQL commands
    This feature allows pgloader commands to include SQL commands to run
    before and after loading a file. It might be about creating a table
    first, then loading the data into it, and then doing more processing
    on-top of the data (implementing an *ELT* pipeline then), or creating
    specific indexes as soon as the data has been made ready.

One-command migration to PostgreSQL
-----------------------------------

When migrating a full database in a single command, pgloader implements the
following features:

One-command migration
    The whole migration is started with a single command line and then runs
    unattended. pgloader is meant to be integrated into fully automated
    tooling that you can repeat as many times as needed.
Schema discovery The source database is introspected using its SQL catalogs to get the list of tables, attributes (with data types, default values, not null constraints, etc), primary key constraints, foreign key constraints, indexes, comments, etc. This feeds an internal database catalog of all the objects to migrate from the source database to the target database. User defined casting rules Some source database have ideas about their data types that might not be compatible with PostgreSQL implementaion of equivalent data types. For instance, SQLite since version 3 has a `Dynamic Type System `_ which of course isn't compatible with the idea of a `Relation `_. Or MySQL accepts datetime for year zero, which doesn't exists in our calendar, and doesn't have a boolean data type. When migrating from another source database technology to PostgreSQL, data type casting choices must be made. pgloader implements solid defaults that you can rely upon, and a facility for **user defined data type casting rules** for specific cases. The idea is to allow users to specify the how the migration should be done, in order for it to be repeatable and included in a *Continuous Migration* process. On the fly data transformations The user defined casting rules come with on the fly rewrite of the data. For instance zero dates (it's not just the year, MySQL accepts ``0000-00-00`` as a valid datetime) are rewritten to NULL values by default. Partial Migrations It is possible to include only a partial list of the source database tables in the migration, or to exclude some of the tables on the source database. Schema only, Data only This is the **ORM compatibility** feature of pgloader, where it is possible to create the schema using your ORM and then have pgloader migrate the data targeting this already created schema. When doing this, it is possible for pgloader to *reindex* the target schema: before loading the data from the source database into PostgreSQL using COPY, pgloader DROPs the indexes and constraints, and reinstalls the exact same definitions of them once the data has been loaded. The reason for operating that way is of course data load performance. Repeatable (DROP+CREATE) By default, pgloader issues DROP statements in the target PostgreSQL database before issing any CREATE statement, so that you can repeat the migration as many times as necessary until migration specifications and rules are bug free. The schedule the data migration to run every night (or even more often!) for the whole duration of the code migration project. See the `Continuous Migration `_ methodology for more details about the approach. On error stop / On error resume next The default behavior of pgloader when migrating from a database is ``on error stop``. The idea is to let the user fix either the migration specifications or the source data, and run the process again, until it works. In some cases the source data is so damaged as to be impossible to migrate in full, and it might be necessary to then resort to the ``on error resume next`` option, where the rows rejected by PostgreSQL are saved away and the migration continues with the other rows. Pre/Post SQL commands, Post-Schema SQL commands While pgloader takes care of rewriting the schema to PostgreSQL expectations, and even provides *user-defined data type casting rules* support to that end, sometimes it is necessary to add some specific SQL commands around the migration. It's of course supported right from pgloader itself, without having to script around it. 
Online ALTER schema At times migrating to PostgreSQL is also a good opportunity to review and fix bad decisions that were made in the past, or simply that are not relevant to PostgreSQL. The pgloader command syntax allows to ALTER pgloader's internal representation of the target catalogs so that the target schema can be created a little different from the source one. Changes supported include target a different *schema* or *table* name. Materialized Views, or schema rewrite on-the-fly In some cases the schema rewriting goes deeper than just renaming the SQL objects to being a full normalization exercise. Because PostgreSQL is great at running a normalized schema in production under most workloads. pgloader implements full flexibility in on-the-fly schema rewriting, by making it possible to migrate from a view definition. The view attribute list becomes a table definition in PostgreSQL, and the data is fetched by querying the view on the source system. A SQL view allows to implement both content filtering at the column level using the SELECT projection clause, and at the row level using the WHERE restriction clause. And backfilling from reference tables thanks to JOINs. Distribute to Citus When migrating from PostgreSQL to Citus, a important part of the process consists of adjusting the schema to the distribution key. Read `Preparing Tables and Ingesting Data `_ in the Citus documentation for a complete example showing how to do that. When using pgloader it's possible to specify the distribution keys and reference tables and let pgloader take care of adjusting the table, indexes, primary keys and foreign key definitions all by itself. Encoding Overrides MySQL doesn't actually enforce the encoding of the data in the database to match the encoding known in the metadata, defined at the database, table, or attribute level. Sometimes, it's necessary to override the metadata in order to make sense of the text, and pgloader makes it easy to do so. Continuous Migration -------------------- pgloader is meant to migrate a whole database in a single command line and without any manual intervention. The goal is to be able to setup a *Continuous Integration* environment as described in the `Project Methodology `_ document of the `MySQL to PostgreSQL `_ webpage. 1. Setup your target PostgreSQL Architecture 2. Fork a Continuous Integration environment that uses PostgreSQL 3. Migrate the data over and over again every night, from production 4. As soon as the CI is all green using PostgreSQL, schedule the D-Day 5. Migrate without suprise and enjoy! In order to be able to follow this great methodology, you need tooling to implement the third step in a fully automated way. That's pgloader. .. toctree:: :maxdepth: 2 :caption: Table Of Contents: intro quickstart tutorial/tutorial pgloader ref/csv ref/fixed ref/copy ref/dbf ref/ixf ref/archive ref/mysql ref/sqlite ref/mssql ref/pgsql ref/pgsql-citus-target ref/pgsql-redshift ref/transforms bugreport Indices and tables ================== * :ref:`genindex` * :ref:`modindex` * :ref:`search` pgloader-3.6.1/docs/intro.rst000066400000000000000000000101371342135037700161420ustar00rootroot00000000000000Introduction ============ pgloader loads data from various sources into PostgreSQL. It can transform the data it reads on the fly and submit raw SQL before and after the loading. It uses the `COPY` PostgreSQL protocol to stream the data into the server, and manages errors by filling a pair of *reject.dat* and *reject.log* files. 
pgloader knows how to read data from different kind of sources: * Files * CSV * Fixed Format * DBF * Databases * SQLite * MySQL * MS SQL Server * PostgreSQL * Redshift pgloader knows how to target different products using the PostgresQL Protocol: * PostgreSQL * `Citus `_ * Redshift The level of automation provided by pgloader depends on the data source type. In the case of CSV and Fixed Format files, a full description of the expected input properties must be given to pgloader. In the case of a database, pgloader connects to the live service and knows how to fetch the metadata it needs directly from it. Features Matrix --------------- Here's a comparison of the features supported depending on the source database engine. Some features that are not supported can be added to pgloader, it's just that nobody had the need to do so yet. Those features are marked with ✗. Empty cells are used when the feature doesn't make sense for the selected source database. ========================== ======= ====== ====== =========== ========= Feature SQLite MySQL MS SQL PostgreSQL Redshift ========================== ======= ====== ====== =========== ========= One-command migration ✓ ✓ ✓ ✓ ✓ Continuous Migration ✓ ✓ ✓ ✓ ✓ Schema discovery ✓ ✓ ✓ ✓ ✓ Partial Migrations ✓ ✓ ✓ ✓ ✓ Schema only ✓ ✓ ✓ ✓ ✓ Data only ✓ ✓ ✓ ✓ ✓ Repeatable (DROP+CREATE) ✓ ✓ ✓ ✓ ✓ User defined casting rules ✓ ✓ ✓ ✓ ✓ Encoding Overrides ✓ On error stop ✓ ✓ ✓ ✓ ✓ On error resume next ✓ ✓ ✓ ✓ ✓ Pre/Post SQL commands ✓ ✓ ✓ ✓ ✓ Post-Schema SQL commands ✗ ✓ ✓ ✓ ✓ Primary key support ✓ ✓ ✓ ✓ ✓ Foreign key support ✓ ✓ ✓ ✓ Online ALTER schema ✓ ✓ ✓ ✓ ✓ Materialized views ✗ ✓ ✓ ✓ ✓ Distribute to Citus ✗ ✓ ✓ ✓ ✓ ========================== ======= ====== ====== =========== ========= For more details about what the features are about, see the specific reference pages for your database source. For some of the features, missing support only means that the feature is not needed for the other sources, such as the capability to override MySQL encoding metadata about a table or a column. Only MySQL in this list is left completely unable to guarantee text encoding. Or Redshift not having foreign keys. Commands -------- pgloader implements its own *Command Language*, a DSL that allows to specify every aspect of the data load and migration to implement. Some of the features provided in the language are only available for a specific source type. Command Line ------------ The pgloader command line accepts those two variants:: pgloader [] []... pgloader [] SOURCE TARGET Either you have a *command-file* containing migration specifications in the pgloader *Command Language*, or you can give a *Source* for the data and a PostgreSQL database connection *Target* where to load the data into. pgloader-3.6.1/docs/pgloader-usage-examples.rst000066400000000000000000000150601342135037700215220ustar00rootroot00000000000000Pgloader Usage Examples ======================= Currently not included, because redundant with the tutorial. 
Usage Examples -------------- Review the command line options and pgloader's version:: pgloader --help pgloader --version Loading from a complex command ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Use the command file as the pgloader command argument, pgloader will parse that file and execute the commands found in it:: pgloader --verbose ./test/csv-districts.load CSV ^^^ Load data from a CSV file into a pre-existing table in your database, having pgloader guess the CSV properties (separator, quote and escape character):: pgloader ./test/data/matching-1.csv pgsql:///pgloader?tablename=matching Load data from a CSV file into a pre-existing table in your database, with expanded options:: pgloader --type csv \ --field id --field field \ --with truncate \ --with "fields terminated by ','" \ ./test/data/matching-1.csv \ postgres:///pgloader?tablename=matching In that example the whole loading is driven from the command line, bypassing the need for writing a command in the pgloader command syntax entirely. As there's no command though, the extra inforamtion needed must be provided on the command line using the `--type` and `--field` and `--with` switches. For documentation about the available syntaxes for the `--field` and `--with` switches, please refer to the CSV section later in the man page. Note also that the PostgreSQL URI includes the target *tablename*. Reading from STDIN ^^^^^^^^^^^^^^^^^^ File based pgloader sources can be loaded from the standard input, as in the following example:: pgloader --type csv \ --field "usps,geoid,aland,awater,aland_sqmi,awater_sqmi,intptlat,intptlong" \ --with "skip header = 1" \ --with "fields terminated by '\t'" \ - \ postgresql:///pgloader?districts_longlat \ < test/data/2013_Gaz_113CDs_national.txt The dash (`-`) character as a source is used to mean *standard input*, as usual in Unix command lines. It's possible to stream compressed content to pgloader with this technique, using the Unix pipe: gunzip -c source.gz | pgloader --type csv ... - pgsql:///target?foo Loading from CSV available through HTTP ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The same command as just above can also be run if the CSV file happens to be found on a remote HTTP location:: pgloader --type csv \ --field "usps,geoid,aland,awater,aland_sqmi,awater_sqmi,intptlat,intptlong" \ --with "skip header = 1" \ --with "fields terminated by '\t'" \ http://pgsql.tapoueh.org/temp/2013_Gaz_113CDs_national.txt \ postgresql:///pgloader?districts_longlat Some more options have to be used in that case, as the file contains a one-line header (most commonly that's column names, could be a copyright notice). Also, in that case, we specify all the fields right into a single `--field` option argument. Again, the PostgreSQL target connection string must contain the *tablename* option and you have to ensure that the target table exists and may fit the data. Here's the SQL command used in that example in case you want to try it yourself:: create table districts_longlat ( usps text, geoid text, aland bigint, awater bigint, aland_sqmi double precision, awater_sqmi double precision, intptlat double precision, intptlong double precision ); Also notice that the same command will work against an archived version of the same data, e.g. http://pgsql.tapoueh.org/temp/2013_Gaz_113CDs_national.txt.gz. Finally, it's important to note that pgloader first fetches the content from the HTTP URL it to a local file, then expand the archive when it's recognized to be one, and only then processes the locally expanded file. 
In some cases, either because pgloader has no direct support for your archive format or maybe because expanding the archive is not feasible in your environment, you might want to *stream* the content straight from its remote location into PostgreSQL. Here's how to do that, using the old battle tested Unix Pipes trick:: curl http://pgsql.tapoueh.org/temp/2013_Gaz_113CDs_national.txt.gz \ | gunzip -c \ | pgloader --type csv \ --field "usps,geoid,aland,awater,aland_sqmi,awater_sqmi,intptlat,intptlong" --with "skip header = 1" \ --with "fields terminated by '\t'" \ - \ postgresql:///pgloader?districts_longlat Now the OS will take care of the streaming and buffering between the network and the commands and pgloader will take care of streaming the data down to PostgreSQL. Migrating from SQLite ^^^^^^^^^^^^^^^^^^^^^ The following command will open the SQLite database, discover its tables definitions including indexes and foreign keys, migrate those definitions while *casting* the data type specifications to their PostgreSQL equivalent and then migrate the data over:: createdb newdb pgloader ./test/sqlite/sqlite.db postgresql:///newdb Migrating from MySQL ^^^^^^^^^^^^^^^^^^^^ Just create a database where to host the MySQL data and definitions and have pgloader do the migration for you in a single command line:: createdb pagila pgloader mysql://user@localhost/sakila postgresql:///pagila Fetching an archived DBF file from a HTTP remote location ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It's possible for pgloader to download a file from HTTP, unarchive it, and only then open it to discover the schema then load the data:: createdb foo pgloader --type dbf http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2013/dbf/historiq2013.zip postgresql:///foo Here it's not possible for pgloader to guess the kind of data source it's being given, so it's necessary to use the `--type` command line switch. pgloader-3.6.1/docs/pgloader.rst000066400000000000000000000660731342135037700166160ustar00rootroot00000000000000PgLoader Reference Manual ========================= pgloader loads data from various sources into PostgreSQL. It can transform the data it reads on the fly and submit raw SQL before and after the loading. It uses the `COPY` PostgreSQL protocol to stream the data into the server, and manages errors by filling a pair of *reject.dat* and *reject.log* files. pgloader operates either using commands which are read from files:: pgloader commands.load or by using arguments and options all provided on the command line:: pgloader SOURCE TARGET Arguments --------- The pgloader arguments can be as many load files as needed, or a couple of connection strings to a specific input file. Source Connection String ^^^^^^^^^^^^^^^^^^^^^^^^ The source connection string format is as follows:: format:///absolute/path/to/file.ext format://./relative/path/to/file.ext Where format might be one of `csv`, `fixed`, `copy`, `dbf`, `db3` or `ixf`.:: db://user:pass@host:port/dbname Where db might be of `sqlite`, `mysql` or `mssql`. When using a file based source format, pgloader also support natively fetching the file from an http location and decompressing an archive if needed. In that case it's necessary to use the `--type` option to specify the expected format of the file. See the examples below. Also note that some file formats require describing some implementation details such as columns to be read and delimiters and quoting when loading from csv. 
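For instance, here is a minimal sketch of driving such a CSV load entirely
from the command line; the file name, field names and target table below
are hypothetical, for illustration only::

    pgloader --type csv                        \
             --field "id,name"                 \
             --with "fields terminated by ','" \
             ./data/items.csv                  \
             postgresql:///pgloader?tablename=items

The `--type`, `--field` and `--with` switches used here are described in
the Command Line Only Operations section below.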
For more complex loading scenarios, you will need to write a full fledge load command in the syntax described later in this document. Target Connection String ^^^^^^^^^^^^^^^^^^^^^^^^ The target connection string format is described in details later in this document, see Section Connection String. Options ------- Inquiry Options ^^^^^^^^^^^^^^^ Use these options when you want to know more about how to use `pgloader`, as those options will cause `pgloader` not to load any data. * `-h`, `--help` Show command usage summary and exit. * `-V`, `--version` Show pgloader version string and exit. * `-E`, `--list-encodings` List known encodings in this version of pgloader. * `-U`, `--upgrade-config` Parse given files in the command line as `pgloader.conf` files with the `INI` syntax that was in use in pgloader versions 2.x, and output the new command syntax for pgloader on standard output. General Options ^^^^^^^^^^^^^^^ Those options are meant to tweak `pgloader` behavior when loading data. * `-v`, `--verbose` Be verbose. * `-q`, `--quiet` Be quiet. * `-d`, `--debug` Show debug level information messages. * `-D`, `--root-dir` Set the root working directory (default to "/tmp/pgloader"). * `-L`, `--logfile` Set the pgloader log file (default to "/tmp/pgloader/pgloader.log"). * `--log-min-messages` Minimum level of verbosity needed for log message to make it to the logfile. One of critical, log, error, warning, notice, info or debug. * `--client-min-messages` Minimum level of verbosity needed for log message to make it to the console. One of critical, log, error, warning, notice, info or debug. * `-S`, `--summary` A filename where to copy the summary output. When relative, the filename is expanded into `*root-dir*`. The format of the filename defaults to being *human readable*. It is possible to have the output in machine friendly formats such as *CSV*, *COPY* (PostgreSQL's own COPY format) or *JSON* by specifying a filename with the extension resp. `.csv`, `.copy` or `.json`. * `-l `, `--load-lisp-file ` Specify a lisp to compile and load into the pgloader image before reading the commands, allowing to define extra transformation function. Those functions should be defined in the `pgloader.transforms` package. This option can appear more than once in the command line. * `--dry-run` Allow testing a `.load` file without actually trying to load any data. It's useful to debug it until it's ok, in particular to fix connection strings. * `--on-error-stop` Alter pgloader behavior: rather than trying to be smart about error handling and continue loading good data, separating away the bad one, just stop as soon as PostgreSQL refuses anything sent to it. Useful to debug data processing, transformation function and specific type casting. * `--self-upgrade ` Specify a where to find pgloader sources so that one of the very first things it does is dynamically loading-in (and compiling to machine code) another version of itself, usually a newer one like a very recent git checkout. * `--no-ssl-cert-verification` Uses the OpenSSL option to accept a locally issued server-side certificate, avoiding the following error message:: SSL verify error: 20 X509_V_ERR_UNABLE_TO_GET_ISSUER_CERT_LOCALLY The right way to fix the SSL issue is to use a trusted certificate, of course. Sometimes though it's useful to make progress with the pgloader setup while the certificate chain of trust is being fixed, maybe by another team. That's when this option is useful. 
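As a sketch of how several of these general options combine on a single
invocation (the command file name here is hypothetical)::

    pgloader --verbose                      \
             --root-dir /tmp/pgloader       \
             --logfile /tmp/pgloader/my.log \
             --summary summary.csv          \
             ./my-migration.load

Here the summary file name is relative, so it is expanded into the
*root-dir*, and its `.csv` extension selects the CSV summary format
described above.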
Command Line Only Operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Those options are meant to be used when using `pgloader` from the command line only, rather than using a command file and the rich command clauses and parser. In simple cases, it can be much easier to use the *SOURCE* and *TARGET* directly on the command line, then tweak the loading with those options: * `--with "option"` Allows setting options from the command line. You can use that option as many times as you want. The option arguments must follow the *WITH* clause for the source type of the `SOURCE` specification, as described later in this document. * `--set "guc_name='value'"` Allows setting PostgreSQL configuration from the command line. Note that the option parsing is the same as when used from the *SET* command clause, in particular you must enclose the guc value with single-quotes. * `--field "..."` Allows setting a source field definition. Fields are accumulated in the order given on the command line. It's possible to either use a `--field` option per field in the source file, or to separate field definitions by a comma, as you would do in the *HAVING FIELDS* clause. * `--cast "..."` Allows setting a specific casting rule for loading the data. * `--type csv|fixed|db3|ixf|sqlite|mysql|mssql` Allows forcing the source type, in case when the *SOURCE* parsing isn't satisfying. * `--encoding ` Set the encoding of the source file to load data from. * `--before ` Parse given filename for SQL queries and run them against the target database before loading the data from the source. The queries are parsed by pgloader itself: they need to be terminated by a semi-colon (;) and the file may include `\i` or `\ir` commands to *include* another file. * `--after ` Parse given filename for SQL queries and run them against the target database after having loaded the data from the source. The queries are parsed in the same way as with the `--before` option, see above. More Debug Information ^^^^^^^^^^^^^^^^^^^^^^ To get the maximum amount of debug information, you can use both the `--verbose` and the `--debug` switches at the same time, which is equivalent to saying `--client-min-messages data`. Then the log messages will show the data being processed, in the cases where the code has explicit support for it. Batches And Retry Behaviour --------------------------- To load data to PostgreSQL, pgloader uses the `COPY` streaming protocol. While this is the faster way to load data, `COPY` has an important drawback: as soon as PostgreSQL emits an error with any bit of data sent to it, whatever the problem is, the whole data set is rejected by PostgreSQL. To work around that, pgloader cuts the data into *batches* of 25000 rows each, so that when a problem occurs it's only impacting that many rows of data. Each batch is kept in memory while the `COPY` streaming happens, in order to be able to handle errors should some happen. When PostgreSQL rejects the whole batch, pgloader logs the error message then isolates the bad row(s) from the accepted ones by retrying the batched rows in smaller batches. 
To do that, pgloader parses the *CONTEXT* error message from the failed
COPY, as the message contains the line number where the error was found in
the batch, as in the following example::

    CONTEXT: COPY errors, line 3, column b: "2006-13-11"

Using that information, pgloader will reload all rows in the batch before
the erroneous one, log the erroneous one as rejected, then try loading the
remaining of the batch in a single attempt, which may or may not contain
other erroneous data.

At the end of a load containing rejected rows, you will find two files in
the *root-dir* location, under a directory named the same as the target
database of your setup. The filenames are the target table, and their
extensions are `.dat` for the rejected data and `.log` for the file
containing the full PostgreSQL client side logs about the rejected data.

The `.dat` file is formatted in the PostgreSQL text COPY format as
documented at
`http://www.postgresql.org/docs/9.2/static/sql-copy.html#AEN66609`.

It is possible to use the following WITH options to control pgloader batch
behavior:

- *on error stop*, *on error resume next*

  This option controls whether pgloader is building batches of data at all.
  The batch implementation allows pgloader to recover errors by sending the
  data that PostgreSQL accepts again, and by keeping away the data that
  PostgreSQL rejects.

  To enable retrying the data and loading the good parts, use the option
  *on error resume next*, which is the default for file based data loads
  (such as CSV, IXF or DBF).

  When migrating from another RDBMS technology, it's best to have a
  reproducible loading process. In that case it's possible to use *on error
  stop* and fix either the casting rules, the data transformation functions
  or in some cases the input data until your migration runs through
  completion. That's why *on error stop* is the default for SQLite, MySQL
  and MS SQL source kinds.

A Note About Performance
------------------------

pgloader has been developed with performance in mind, to be able to cope
with ever growing needs in loading large amounts of data into PostgreSQL.

The basic architecture it uses is the old Unix pipe model, where a thread
is responsible for loading the data (reading a CSV file, querying MySQL,
etc) and fills pre-processed data into a queue. Another thread feeds from
the queue, applies some more *transformations* to the input data and
streams the end result to PostgreSQL using the COPY protocol.

When given a file that the PostgreSQL `COPY` command knows how to parse,
and if the file contains no erroneous data, then pgloader will never be as
fast as just using the PostgreSQL `COPY` command.

Note that while the `COPY` command is restricted to read either from its
standard input or from a local file on the server's file system, the
command line tool `psql` implements a `\copy` command that knows how to
stream a file local to the client over the network and into the PostgreSQL
server, using the same protocol as pgloader uses.

A Note About Parallelism
------------------------

pgloader uses several concurrent tasks to process the data being loaded:

- a reader task reads the data in and pushes it to a queue,

- at least one writer task feeds from the queue and formats the raw data
  into the PostgreSQL COPY format in batches (so that it's possible to
  retry a failed batch without reading the data from source again), and
  then sends the data to PostgreSQL using the COPY protocol.
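The *workers* and *concurrency* settings detailed below control how many of
those tasks may be running at the same time. As a sketch only (the source
and target connection strings are made up for illustration), they are set
in the *WITH* clause of a load command::

    LOAD DATABASE
         FROM mysql://user@localhost/sakila
         INTO postgresql:///pagila
    WITH workers = 8, concurrency = 2;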
The parameter *workers* allows to control how many worker threads are
allowed to be active at any time (that's the parallelism level); and the
parameter *concurrency* allows to control how many tasks are started to
handle the data (they may not all run at the same time, depending on the
*workers* setting).

We allow *workers* simultaneous workers to be active at the same time in
the context of a single table. A single unit of work consists of several
kinds of workers:

- a reader getting raw data from the source,
- N writers preparing and sending the data down to PostgreSQL.

The N here is set to the *concurrency* parameter: with a *concurrency* of
2, we start (+ 1 2) = 3 concurrent tasks, with a *concurrency* of 4 we
start (+ 1 4) = 5 concurrent tasks, of which only *workers* may be active
simultaneously.

The defaults are `workers = 4, concurrency = 1` when loading from a
database source, and `workers = 8, concurrency = 2` when loading from
something else (currently, a file). Those defaults are arbitrary and
waiting for feedback from users, so please consider providing feedback if
you play with the settings.

As the `CREATE INDEX` threads started by pgloader are only waiting until
PostgreSQL is done with the real work, those threads are *NOT* counted into
the concurrency levels as detailed here. By default, pgloader starts as
many `CREATE INDEX` threads as the maximum number of indexes per table
found in your source schema. It is possible to set the
`max parallel create index` *WITH* option to another number in case there
are just too many of them to create.

Source Formats
--------------

pgloader supports the following input formats:

- csv, which includes also tsv and other common variants where you can
  change the *separator* and the *quoting* rules and how to *escape* the
  *quotes* themselves;

- fixed columns file, where pgloader is flexible enough to accommodate
  source files missing columns (*ragged fixed length column files* do
  exist);

- PostgreSQL COPY formatted files, following the COPY TEXT documentation of
  PostgreSQL, such as the reject files prepared by pgloader;

- dBase files known as db3 or dbf files;

- ixf formatted files, ixf being a binary storage format from IBM;

- sqlite databases with fully automated discovery of the schema and
  advanced cast rules;

- mysql databases with fully automated discovery of the schema and advanced
  cast rules;

- MS SQL databases with fully automated discovery of the schema and
  advanced cast rules.

Pgloader Commands Syntax
------------------------

pgloader implements a Domain Specific Language allowing to set up complex
data loading scripts handling computed columns and on-the-fly sanitization
of the input data. For more complex data loading scenarios, you will be
required to learn that DSL's syntax. It's meant to look familiar to DBAs by
being inspired by SQL where it makes sense, which is not that much after
all.

The pgloader commands follow the same global grammar rules. Each of them
might support only a subset of the general options and provide specific
options::

    LOAD <source-type>
         FROM <source-url>
              [ HAVING FIELDS <source-level-options> ]
         INTO <postgresql-url>
              [ TARGET TABLE [ "<schema>" ]."<table name>" ]
              [ TARGET COLUMNS <columns-and-options> ]

    [ WITH <load-options> ]

    [ SET <postgresql-settings> ]

    [ BEFORE LOAD [ DO <sql statements> | EXECUTE <sql file> ] ... ]
    [  AFTER LOAD [ DO <sql statements> | EXECUTE <sql file> ] ... ]
    ;

The main clauses are the `LOAD`, `FROM`, `INTO` and `WITH` clauses that
each command implements. Some commands then implement the `SET` clause, or
some specific clauses such as the `CAST` clause.
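To make the grammar concrete, here is a hedged sketch of a complete
command, exercising the `LOAD`, `FROM`, `INTO`, `WITH` and `SET` clauses
together; the file path, field names and table name are hypothetical::

    LOAD CSV
         FROM '/path/to/points.csv'
              HAVING FIELDS (id, latitude, longitude)
         INTO postgresql:///pgloader
              TARGET TABLE "public"."points"
              TARGET COLUMNS (id, latitude, longitude)
         WITH truncate,
              fields terminated by ','
          SET work_mem to '64MB';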
Templating with Mustache ------------------------ pgloader implements the https://mustache.github.io/ templating system so that you may have dynamic parts of your commands. See the documentation for this template system online. A specific feature of pgloader is the ability to fetch a variable from the OS environment of the pgloader process, making it possible to run pgloader as in the following example:: $ DBPATH=sqlite/sqlite.db pgloader ./test/sqlite-env.load or in several steps:: $ export DBPATH=sqlite/sqlite.db $ pgloader ./test/sqlite-env.load The variable can then be used in a typical mustache fashion:: load database from '{{DBPATH}}' into postgresql:///pgloader; It's also possible to prepare a INI file such as the following:: [pgloader] DBPATH = sqlite/sqlite.db And run the following command, feeding the INI values as a *context* for pgloader templating system:: $ pgloader --context ./test/sqlite.ini ./test/sqlite-ini.load The mustache templates implementation with OS environment support replaces former `GETENV` implementation, which didn't work anyway. Common Clauses -------------- Some clauses are common to all commands: FROM ^^^^ The *FROM* clause specifies where to read the data from, and each command introduces its own variant of sources. For instance, the *CSV* source supports `inline`, `stdin`, a filename, a quoted filename, and a *FILENAME MATCHING* clause (see above); whereas the *MySQL* source only supports a MySQL database URI specification. INTO ^^^^ The PostgreSQL connection URI must contains the name of the target table where to load the data into. That table must have already been created in PostgreSQL, and the name might be schema qualified. Then *INTO* option also supports an optional comma separated list of target columns, which are either the name of an input *field* or the white space separated list of the target column name, its PostgreSQL data type and a *USING* expression. The *USING* expression can be any valid Common Lisp form and will be read with the current package set to `pgloader.transforms`, so that you can use functions defined in that package, such as functions loaded dynamically with the `--load` command line parameter. Each *USING* expression is compiled at runtime to native code. This feature allows pgloader to load any number of fields in a CSV file into a possibly different number of columns in the database, using custom code for that projection. WITH ^^^^ Set of options to apply to the command, using a global syntax of either: - *key = value* - *use option* - *do not use option* See each specific command for details. All data sources specific commands support the following options: - *on error stop*, *on error resume next* - *batch rows = R* - *batch size = ... MB* - *prefetch rows = ...* See the section BATCH BEHAVIOUR OPTIONS for more details. In addition, the following settings are available: - *workers = W* - *concurrency = C* - *max parallel create index = I* See section A NOTE ABOUT PARALLELISM for more details. SET ^^^ This clause allows to specify session parameters to be set for all the sessions opened by pgloader. It expects a list of parameter name, the equal sign, then the single-quoted value as a comma separated list. The names and values of the parameters are not validated by pgloader, they are given as-is to PostgreSQL. BEFORE LOAD DO ^^^^^^^^^^^^^^ You can run SQL queries against the database before loading the data from the `CSV` file. Most common SQL queries are `CREATE TABLE IF NOT EXISTS` so that the data can be loaded. 
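For instance, as part of a larger load command, such a clause could look
like the following sketch, which follows the dollar-quoting and
comma-separation rules described next; the table used here is
hypothetical::

    BEFORE LOAD DO
     -- both queries below target a hypothetical "points" table
     $$ drop table if exists points; $$,
     $$ create table if not exists points
        (
          id        serial primary key,
          latitude  double precision,
          longitude double precision
        );
     $$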
Each command must be *dollar-quoted*: it must begin and end with a double dollar sign, `$$`. Dollar-quoted queries are then comma separated. No extra punctuation is expected after the last SQL query. BEFORE LOAD EXECUTE ^^^^^^^^^^^^^^^^^^^ Same behaviour as in the *BEFORE LOAD DO* clause. Allows you to read the SQL queries from a SQL file. Implements support for PostgreSQL dollar-quoting and the `\i` and `\ir` include facilities as in `psql` batch mode (where they are the same thing). AFTER LOAD DO ^^^^^^^^^^^^^ Same format as *BEFORE LOAD DO*, the dollar-quoted queries found in that section are executed once the load is done. That's the right time to create indexes and constraints, or re-enable triggers. AFTER LOAD EXECUTE ^^^^^^^^^^^^^^^^^^ Same behaviour as in the *AFTER LOAD DO* clause. Allows you to read the SQL queries from a SQL file. Implements support for PostgreSQL dollar-quoting and the `\i` and `\ir` include facilities as in `psql` batch mode (where they are the same thing). AFTER CREATE SCHEMA DO ^^^^^^^^^^^^^^^^^^^^^^ Same format as *BEFORE LOAD DO*, the dollar-quoted queries found in that section are executed once the schema has been craeted by pgloader, and before the data is loaded. It's the right time to ALTER TABLE or do some custom implementation on-top of what pgloader does, like maybe partitioning. AFTER CREATE SCHEMA EXECUTE ^^^^^^^^^^^^^^^^^^^^^^^^^^^ Same behaviour as in the *AFTER CREATE SCHEMA DO* clause. Allows you to read the SQL queries from a SQL file. Implements support for PostgreSQL dollar-quoting and the `\i` and `\ir` include facilities as in `psql` batch mode (where they are the same thing). Connection String ^^^^^^^^^^^^^^^^^ The `` parameter is expected to be given as a *Connection URI* as documented in the PostgreSQL documentation at http://www.postgresql.org/docs/9.3/static/libpq-connect.html#LIBPQ-CONNSTRING. :: postgresql://[user[:password]@][netloc][:port][/dbname][?option=value&...] Where: - *user* Can contain any character, including colon (`:`) which must then be doubled (`::`) and at-sign (`@`) which must then be doubled (`@@`). When omitted, the *user* name defaults to the value of the `PGUSER` environment variable, and if it is unset, the value of the `USER` environment variable. - *password* Can contain any character, including the at sign (`@`) which must then be doubled (`@@`). To leave the password empty, when the *user* name ends with at at sign, you then have to use the syntax user:@. When omitted, the *password* defaults to the value of the `PGPASSWORD` environment variable if it is set, otherwise the password is left unset. When no *password* is found either in the connection URI nor in the environment, then pgloader looks for a `.pgpass` file as documented at https://www.postgresql.org/docs/current/static/libpq-pgpass.html. The implementation is not that of `libpq` though. As with `libpq` you can set the environment variable `PGPASSFILE` to point to a `.pgpass` file, and pgloader defaults to `~/.pgpass` on unix like systems and `%APPDATA%\postgresql\pgpass.conf` on windows. Matching rules and syntax are the same as with `libpq`, refer to its documentation. - *netloc* Can be either a hostname in dotted notation, or an ipv4, or an Unix domain socket path. Empty is the default network location, under a system providing *unix domain socket* that method is preferred, otherwise the *netloc* default to `localhost`. 
It's possible to force the *unix domain socket* path by using the syntax `unix:/path/to/where/the/socket/file/is`, so to force a non default socket path and a non default port, you would have: postgresql://unix:/tmp:54321/dbname The *netloc* defaults to the value of the `PGHOST` environment variable, and if it is unset, to either the default `unix` socket path when running on a Unix system, and `localhost` otherwise. Socket path containing colons are supported by doubling the colons within the path, as in the following example: postgresql://unix:/tmp/project::region::instance:5432/dbname - *dbname* Should be a proper identifier (letter followed by a mix of letters, digits and the punctuation signs comma (`,`), dash (`-`) and underscore (`_`). When omitted, the *dbname* defaults to the value of the environment variable `PGDATABASE`, and if that is unset, to the *user* value as determined above. - *options* The optional parameters must be supplied with the form `name=value`, and you may use several parameters by separating them away using an ampersand (`&`) character. Only some options are supported here, *tablename* (which might be qualified with a schema name) *sslmode*, *host*, *port*, *dbname*, *user* and *password*. The *sslmode* parameter values can be one of `disable`, `allow`, `prefer` or `require`. For backward compatibility reasons, it's possible to specify the *tablename* option directly, without spelling out the `tablename=` parts. The options override the main URI components when both are given, and using the percent-encoded option parameters allow using passwords starting with a colon and bypassing other URI components parsing limitations. Regular Expressions ^^^^^^^^^^^^^^^^^^^ Several clauses listed in the following accept *regular expressions* with the following input rules: - A regular expression begins with a tilde sign (`~`), - is then followed with an opening sign, - then any character is allowed and considered part of the regular expression, except for the closing sign, - then a closing sign is expected. The opening and closing sign are allowed by pair, here's the complete list of allowed delimiters:: ~// ~[] ~{} ~() ~<> ~"" ~'' ~|| ~## Pick the set of delimiters that don't collide with the *regular expression* you're trying to input. If your expression is such that none of the solutions allow you to enter it, the places where such expressions are allowed should allow for a list of expressions. Comments ^^^^^^^^ Any command may contain comments, following those input rules: - the `--` delimiter begins a comment that ends with the end of the current line, - the delimiters `/*` and `*/` respectively start and end a comment, which can be found in the middle of a command or span several lines. Any place where you could enter a *whitespace* will accept a comment too. Batch behaviour options ^^^^^^^^^^^^^^^^^^^^^^^ All pgloader commands have support for a *WITH* clause that allows for specifying options. Some options are generic and accepted by all commands, such as the *batch behaviour options*, and some options are specific to a data source kind, such as the CSV *skip header* option. The global batch behaviour options are: - *batch rows* Takes a numeric value as argument, used as the maximum number of rows allowed in a batch. The default is `25 000` and can be changed to try having better performance characteristics or to control pgloader memory usage; - *batch size* Takes a memory unit as argument, such as *20 MB*, its default value. 
Accepted multipliers are *kB*, *MB*, *GB*, *TB* and *PB*. The case is important so as not to be confused about bits versus bytes, we're only talking bytes here. - *prefetch rows* Takes a numeric value as argument, defaults to `100000`. That's the number of rows that pgloader is allowed to read in memory in each reader thread. See the *workers* setting for how many reader threads are allowed to run at the same time. Other options are specific to each input source, please refer to specific parts of the documentation for their listing and covering. A batch is then closed as soon as either the *batch rows* or the *batch size* threshold is crossed, whichever comes first. In cases when a batch has to be closed because of the *batch size* setting, a *debug* level log message is printed with how many rows did fit in the *oversized* batch. pgloader-3.6.1/docs/quickstart.rst000066400000000000000000000137071342135037700172070ustar00rootroot00000000000000Pgloader Quick Start ==================== In simple cases, pgloader is very easy to use. CSV --- Load data from a CSV file into a pre-existing table in your database:: pgloader --type csv \ --field id --field field \ --with truncate \ --with "fields terminated by ','" \ ./test/data/matching-1.csv \ postgres:///pgloader?tablename=matching In that example the whole loading is driven from the command line, bypassing the need for writing a command in the pgloader command syntax entirely. As there's no command though, the extra information needed must be provided on the command line using the `--type` and `--field` and `--with` switches. For documentation about the available syntaxes for the `--field` and `--with` switches, please refer to the CSV section later in the man page. Note also that the PostgreSQL URI includes the target *tablename*. Reading from STDIN ------------------ File based pgloader sources can be loaded from the standard input, as in the following example:: pgloader --type csv \ --field "usps,geoid,aland,awater,aland_sqmi,awater_sqmi,intptlat,intptlong" \ --with "skip header = 1" \ --with "fields terminated by '\t'" \ - \ postgresql:///pgloader?districts_longlat \ < test/data/2013_Gaz_113CDs_national.txt The dash (`-`) character as a source is used to mean *standard input*, as usual in Unix command lines. It's possible to stream compressed content to pgloader with this technique, using the Unix pipe:: gunzip -c source.gz | pgloader --type csv ... - pgsql:///target?foo Loading from CSV available through HTTP --------------------------------------- The same command as just above can also be run if the CSV file happens to be found on a remote HTTP location:: pgloader --type csv \ --field "usps,geoid,aland,awater,aland_sqmi,awater_sqmi,intptlat,intptlong" \ --with "skip header = 1" \ --with "fields terminated by '\t'" \ http://pgsql.tapoueh.org/temp/2013_Gaz_113CDs_national.txt \ postgresql:///pgloader?districts_longlat Some more options have to be used in that case, as the file contains a one-line header (most commonly that's column names, could be a copyright notice). Also, in that case, we specify all the fields right into a single `--field` option argument. Again, the PostgreSQL target connection string must contain the *tablename* option and you have to ensure that the target table exists and may fit the data. 
Here's the SQL command used in that example in case you want to try it yourself:: create table districts_longlat ( usps text, geoid text, aland bigint, awater bigint, aland_sqmi double precision, awater_sqmi double precision, intptlat double precision, intptlong double precision ); Also notice that the same command will work against an archived version of the same data. Streaming CSV data from an HTTP compressed file ----------------------------------------------- Finally, it's important to note that pgloader first fetches the content from the HTTP URL it to a local file, then expand the archive when it's recognized to be one, and only then processes the locally expanded file. In some cases, either because pgloader has no direct support for your archive format or maybe because expanding the archive is not feasible in your environment, you might want to *stream* the content straight from its remote location into PostgreSQL. Here's how to do that, using the old battle tested Unix Pipes trick:: curl http://pgsql.tapoueh.org/temp/2013_Gaz_113CDs_national.txt.gz \ | gunzip -c \ | pgloader --type csv \ --field "usps,geoid,aland,awater,aland_sqmi,awater_sqmi,intptlat,intptlong" --with "skip header = 1" \ --with "fields terminated by '\t'" \ - \ postgresql:///pgloader?districts_longlat Now the OS will take care of the streaming and buffering between the network and the commands and pgloader will take care of streaming the data down to PostgreSQL. Migrating from SQLite --------------------- The following command will open the SQLite database, discover its tables definitions including indexes and foreign keys, migrate those definitions while *casting* the data type specifications to their PostgreSQL equivalent and then migrate the data over:: createdb newdb pgloader ./test/sqlite/sqlite.db postgresql:///newdb Migrating from MySQL -------------------- Just create a database where to host the MySQL data and definitions and have pgloader do the migration for you in a single command line:: createdb pagila pgloader mysql://user@localhost/sakila postgresql:///pagila Fetching an archived DBF file from a HTTP remote location --------------------------------------------------------- It's possible for pgloader to download a file from HTTP, unarchive it, and only then open it to discover the schema then load the data:: createdb foo pgloader --type dbf http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2013/dbf/historiq2013.zip postgresql:///foo Here it's not possible for pgloader to guess the kind of data source it's being given, so it's necessary to use the `--type` command line switch. pgloader-3.6.1/docs/ref/000077500000000000000000000000001342135037700150275ustar00rootroot00000000000000pgloader-3.6.1/docs/ref/archive.rst000066400000000000000000000067411342135037700172120ustar00rootroot00000000000000Loading From an Archive ======================= This command instructs pgloader to load data from one or more files contained in an archive. Currently the only supported archive format is *ZIP*, and the archive might be downloaded from an *HTTP* URL. 
Here's an example:: LOAD ARCHIVE FROM /Users/dim/Downloads/GeoLiteCity-latest.zip INTO postgresql:///ip4r BEFORE LOAD DO $$ create extension if not exists ip4r; $$, $$ create schema if not exists geolite; $$, EXECUTE 'geolite.sql' LOAD CSV FROM FILENAME MATCHING ~/GeoLiteCity-Location.csv/ WITH ENCODING iso-8859-1 ( locId, country, region null if blanks, city null if blanks, postalCode null if blanks, latitude, longitude, metroCode null if blanks, areaCode null if blanks ) INTO postgresql:///ip4r?geolite.location ( locid,country,region,city,postalCode, location point using (format nil "(~a,~a)" longitude latitude), metroCode,areaCode ) WITH skip header = 2, fields optionally enclosed by '"', fields escaped by double-quote, fields terminated by ',' AND LOAD CSV FROM FILENAME MATCHING ~/GeoLiteCity-Blocks.csv/ WITH ENCODING iso-8859-1 ( startIpNum, endIpNum, locId ) INTO postgresql:///ip4r?geolite.blocks ( iprange ip4r using (ip-range startIpNum endIpNum), locId ) WITH skip header = 2, fields optionally enclosed by '"', fields escaped by double-quote, fields terminated by ',' FINALLY DO $$ create index blocks_ip4r_idx on geolite.blocks using gist(iprange); $$; The `archive` command accepts the following clauses and options. Archive Source Specification: FROM ---------------------------------- Filename or HTTP URI where to load the data from. When given an HTTP URL the linked file will get downloaded locally before processing. If the file is a `zip` file, the command line utility `unzip` is used to expand the archive into files in `$TMPDIR`, or `/tmp` if `$TMPDIR` is unset or set to a non-existing directory. Then the following commands are used from the top level directory where the archive has been expanded. Archive Sub Commands -------------------- - command [ *AND* command ... ] A series of commands against the contents of the archive, at the moment only `CSV`,`'FIXED` and `DBF` commands are supported. Note that commands are supporting the clause *FROM FILENAME MATCHING* which allows the pgloader command not to depend on the exact names of the archive directories. The same clause can also be applied to several files with using the spelling *FROM ALL FILENAMES MATCHING* and a regular expression. The whole *matching* clause must follow the following rule:: FROM [ ALL FILENAMES | [ FIRST ] FILENAME ] MATCHING Archive Final SQL Commands -------------------------- - *FINALLY DO* SQL Queries to run once the data is loaded, such as `CREATE INDEX`. pgloader-3.6.1/docs/ref/copy.rst000066400000000000000000000074621342135037700165440ustar00rootroot00000000000000Loading COPY Formatted Files ============================ This commands instructs pgloader to load from a file containing COPY TEXT data as described in the PostgreSQL documentation. Here's an example:: LOAD COPY FROM copy://./data/track.copy ( trackid, track, album, media, genre, composer, milliseconds, bytes, unitprice ) INTO postgresql:///pgloader TARGET TABLE track_full WITH truncate SET work_mem to '14MB', standard_conforming_strings to 'on' BEFORE LOAD DO $$ drop table if exists track_full; $$, $$ create table track_full ( trackid bigserial, track text, album text, media text, genre text, composer text, milliseconds bigint, bytes bigint, unitprice numeric ); $$; The `COPY` format command accepts the following clauses and options. COPY Formatted Files Source Specification: FROM ----------------------------------------------- Filename where to load the data from. 
This support local files, HTTP URLs and zip files containing a single dbf file of the same name. Fetch such a zip file from an HTTP address is of course supported. - *inline* The data is found after the end of the parsed commands. Any number of empty lines between the end of the commands and the beginning of the data is accepted. - *stdin* Reads the data from the standard input stream. - *FILENAMES MATCHING* The whole *matching* clause must follow the following rule:: [ ALL FILENAMES | [ FIRST ] FILENAME ] MATCHING regexp [ IN DIRECTORY '...' ] The *matching* clause applies given *regular expression* (see above for exact syntax, several options can be used here) to filenames. It's then possible to load data from only the first match of all of them. The optional *IN DIRECTORY* clause allows specifying which directory to walk for finding the data files, and can be either relative to where the command file is read from, or absolute. The given directory must exists. COPY Formatted File Options: WITH --------------------------------- When loading from a `COPY` file, the following options are supported: - *delimiter* Takes a single character as argument, which must be found inside single quotes, and might be given as the printable character itself, the special value \t to denote a tabulation character, or `0x` then an hexadecimal value read as the ASCII code for the character. This character is used as the *delimiter* when reading the data, in a similar way to the PostgreSQL `COPY` option. - *null* Takes a quoted string as an argument (quotes can be either double quotes or single quotes) and uses that string as the `NULL` representation in the data. This is similar to the *null* `COPY` option in PostgreSQL. - *truncate* When this option is listed, pgloader issues a `TRUNCATE` command against the PostgreSQL target table before reading the data file. - *disable triggers* When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE TRIGGER ALL` command against the PostgreSQL target table before copying the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the `COPY` is done. This option allows loading data into a pre-existing table ignoring the *foreign key constraints* and user defined triggers and may result in invalid *foreign key constraints* once the data is loaded. Use with care. - *skip header* Takes a numeric value as argument. Instruct pgloader to skip that many lines at the beginning of the input file. pgloader-3.6.1/docs/ref/csv.rst000066400000000000000000000207231342135037700163600ustar00rootroot00000000000000Loading CSV data ================ This command instructs pgloader to load data from a `CSV` file. Here's an example:: LOAD CSV FROM 'GeoLiteCity-Blocks.csv' WITH ENCODING iso-646-us HAVING FIELDS ( startIpNum, endIpNum, locId ) INTO postgresql://user@localhost:54393/dbname TARGET TABLE geolite.blocks TARGET COLUMNS ( iprange ip4r using (ip-range startIpNum endIpNum), locId ) WITH truncate, skip header = 2, fields optionally enclosed by '"', fields escaped by backslash-quote, fields terminated by '\t' SET work_mem to '32 MB', maintenance_work_mem to '64 MB'; The `csv` format command accepts the following clauses and options. CSV Source Specification: FROM ------------------------------ Filename where to load the data from. Accepts an *ENCODING* option. Use the `--list-encodings` option to know which encoding names are supported. 
The filename may be enclosed by single quotes, and could be one of the following special values: - *inline* The data is found after the end of the parsed commands. Any number of empty lines between the end of the commands and the beginning of the data is accepted. - *stdin* Reads the data from the standard input stream. - *FILENAMES MATCHING* The whole *matching* clause must follow the following rule:: [ ALL FILENAMES | [ FIRST ] FILENAME ] MATCHING regexp [ IN DIRECTORY '...' ] The *matching* clause applies given *regular expression* (see above for exact syntax, several options can be used here) to filenames. It's then possible to load data from only the first match of all of them. The optional *IN DIRECTORY* clause allows specifying which directory to walk for finding the data files, and can be either relative to where the command file is read from, or absolute. The given directory must exists. Fields Specifications --------------------- The *FROM* option also supports an optional comma separated list of *field* names describing what is expected in the `CSV` data file, optionally introduced by the clause `HAVING FIELDS`. Each field name can be either only one name or a name following with specific reader options for that field, enclosed in square brackets and comma-separated. Supported per-field reader options are: - *terminated by* See the description of *field terminated by* below. The processing of this option is not currently implemented. - *date format* When the field is expected of the date type, then this option allows to specify the date format used in the file. Date format string are template strings modeled against the PostgreSQL `to_char` template strings support, limited to the following patterns: - YYYY, YYY, YY for the year part - MM for the numeric month part - DD for the numeric day part - HH, HH12, HH24 for the hour part - am, AM, a.m., A.M. - pm, PM, p.m., P.M. - MI for the minutes part - SS for the seconds part - MS for the milliseconds part (4 digits) - US for the microseconds part (6 digits) - unparsed punctuation signs: - . * # @ T / \ and space Here's an example of a *date format* specification:: column-name [date format 'YYYY-MM-DD HH24-MI-SS.US'] - *null if* This option takes an argument which is either the keyword *blanks* or a double-quoted string. When *blanks* is used and the field value that is read contains only space characters, then it's automatically converted to an SQL `NULL` value. When a double-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL `NULL` value. - *trim both whitespace*, *trim left whitespace*, *trim right whitespace* This option allows to trim whitespaces in the read data, either from both sides of the data, or only the whitespace characters found on the left of the streaing, or only those on the right of the string. CSV Loading Options: WITH ------------------------- When loading from a `CSV` file, the following options are supported: - *truncate* When this option is listed, pgloader issues a `TRUNCATE` command against the PostgreSQL target table before reading the data file. - *drop indexes* When this option is listed, pgloader issues `DROP INDEX` commands against all the indexes defined on the target table before copying the data, then `CREATE INDEX` commands once the `COPY` is done. In order to get the best performance possible, all the indexes are created in parallel and when done the primary keys are built again from the unique indexes just created. 
This two step process allows creating the primary key index in parallel with the other indexes, as only the `ALTER TABLE` command needs an *access exclusive lock* on the target table. - *disable triggers* When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE TRIGGER ALL` command against the PostgreSQL target table before copying the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the `COPY` is done. This option allows loading data into a pre-existing table ignoring the *foreign key constraints* and user defined triggers and may result in invalid *foreign key constraints* once the data is loaded. Use with care. - *skip header* Takes a numeric value as argument. Instruct pgloader to skip that many lines at the beginning of the input file. - *csv header* Use the first line read after *skip header* as the list of csv field names to be found in the CSV file, using the same CSV parameters as for the CSV data. - *trim unquoted blanks* When reading unquoted values in the `CSV` file, remove the blanks found in between the separator and the value. That behaviour is the default. - *keep unquoted blanks* When reading unquoted values in the `CSV` file, keep blanks found in between the separator and the value. - *fields optionally enclosed by* Takes a single character as argument, which must be found inside single quotes, and might be given as the printable character itself, the special value \t to denote a tabulation character, the special value \' to denote a single-quote, or `0x` then an hexadecimal value read as the ASCII code for the character. The following options specify the same enclosing character, a single quote:: fields optionally enclosed by '\'' fields optionally enclosed by '0x27' This character is used as the quoting character in the `CSV` file, and defaults to double-quote. - *fields not enclosed* By default, pgloader will use the double-quote character as the enclosing character. If you have a CSV file where fields are not enclosed and are using double-quote as an expected ordinary character, then use the option *fields not enclosed* for the CSV parser to accept those values. - *fields escaped by* Takes either the special value *backslash-quote* or *double-quote*, or any value supported by the *fields terminated by* option (see below). This value is used to recognize escaped field separators when they are to be found within the data fields themselves. Defaults to *double-quote*. - *csv escape mode* Takes either the special value *quote* (the default) or *following* and allows the CSV parser to parse either only escaped field separator or any character (including CSV data) when using the *following* value. - *fields terminated by* Takes a single character as argument, which must be found inside single quotes, and might be given as the printable character itself, the special value \t to denote a tabulation character, or `0x` then an hexadecimal value read as the ASCII code for the character. This character is used as the *field separator* when reading the `CSV` data. - *lines terminated by* Takes a single character as argument, which must be found inside single quotes, and might be given as the printable character itself, the special value \t to denote a tabulation character, or `0x` then an hexadecimal value read as the ASCII code for the character. This character is used to recognize *end-of-line* condition when reading the `CSV` data. 
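To see how the per-field options and the file-level *WITH* options combine, here is a minimal sketch of a complete CSV load command. The file name, field names, and target connection string are invented for the example; adjust them to your own data::

    LOAD CSV
         FROM 'events.csv' WITH ENCODING utf-8
              HAVING FIELDS
              (
                 id,
                 label      [null if blanks, trim right whitespace],
                 created_at [date format 'YYYY-MM-DD HH24-MI-SS']
              )
         INTO postgresql:///pgloader
              TARGET TABLE events
              TARGET COLUMNS ( id, label, created_at )
         WITH skip header = 1,
              fields optionally enclosed by '"',
              fields escaped by double-quote,
              fields terminated by ',';

The square-bracket options control how each individual field is read, while the *WITH* options apply to the file as a whole.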
pgloader-3.6.1/docs/ref/dbf.rst000066400000000000000000000034171342135037700163210ustar00rootroot00000000000000Loading DBF data ================= This command instructs pgloader to load data from a `DBF` file. Here's an example:: LOAD DBF FROM http://www.insee.fr/fr/methodes/nomenclatures/cog/telechargement/2013/dbf/reg2013.dbf INTO postgresql://user@localhost/dbname WITH truncate, create table; The `dbf` format command accepts the following clauses and options. DBF Source Specification: FROM ------------------------------ Filename where to load the data from. This support local files, HTTP URLs and zip files containing a single dbf file of the same name. Fetch such a zip file from an HTTP address is of course supported. DBF Loading Options: WITH ------------------------- When loading from a `DBF` file, the following options are supported: - *truncate* When this option is listed, pgloader issues a `TRUNCATE` command against the PostgreSQL target table before reading the data file. - *disable triggers* When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE TRIGGER ALL` command against the PostgreSQL target table before copying the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the `COPY` is done. This option allows loading data into a pre-existing table ignoring the *foreign key constraints* and user defined triggers and may result in invalid *foreign key constraints* once the data is loaded. Use with care. - *create table* When this option is listed, pgloader creates the table using the meta data found in the `DBF` file, which must contain a list of fields with their data type. A standard data type conversion from DBF to PostgreSQL is done. - *table name* This options expects as its value the possibly qualified name of the table to create. pgloader-3.6.1/docs/ref/fixed.rst000066400000000000000000000134361342135037700166670ustar00rootroot00000000000000Loading Fixed Cols File Formats =============================== This command instructs pgloader to load data from a text file containing columns arranged in a *fixed size* manner. Here's an example:: LOAD FIXED FROM inline ( a from 0 for 10, b from 10 for 8, c from 18 for 8, d from 26 for 17 [null if blanks, trim right whitespace] ) INTO postgresql:///pgloader TARGET TABLE fixed ( a, b, c time using (time-with-no-separator c), d ) WITH truncate SET work_mem to '14MB', standard_conforming_strings to 'on' BEFORE LOAD DO $$ drop table if exists fixed; $$, $$ create table fixed ( a integer, b date, c time, d text ); $$; 01234567892008052011431250firstline 01234562008052115182300left blank-padded 12345678902008052208231560another line 2345609872014092914371500 2345678902014092914371520 The `fixed` format command accepts the following clauses and options. Fixed File Format Source Specification: FROM -------------------------------------------- Filename where to load the data from. Accepts an *ENCODING* option. Use the `--list-encodings` option to know which encoding names are supported. The filename may be enclosed by single quotes, and could be one of the following special values: - *inline* The data is found after the end of the parsed commands. Any number of empty lines between the end of the commands and the beginning of the data is accepted. - *stdin* Reads the data from the standard input stream. - *FILENAMES MATCHING* The whole *matching* clause must follow the following rule:: [ ALL FILENAMES | [ FIRST ] FILENAME ] MATCHING regexp [ IN DIRECTORY '...' 
] The *matching* clause applies given *regular expression* (see above for exact syntax, several options can be used here) to filenames. It's then possible to load data from only the first match of all of them. The optional *IN DIRECTORY* clause allows specifying which directory to walk for finding the data files, and can be either relative to where the command file is read from, or absolute. The given directory must exists. Fields Specifications --------------------- The *FROM* option also supports an optional comma separated list of *field* names describing what is expected in the `FIXED` data file. Each field name is composed of the field name followed with specific reader options for that field. Supported per-field reader options are the following, where only *start* and *length* are required. - *start* Position in the line where to start reading that field's value. Can be entered with decimal digits or `0x` then hexadecimal digits. - *length* How many bytes to read from the *start* position to read that field's value. Same format as *start*. Those optional parameters must be enclosed in square brackets and comma-separated: - *terminated by* See the description of *field terminated by* below. The processing of this option is not currently implemented. - *date format* When the field is expected of the date type, then this option allows to specify the date format used in the file. Date format string are template strings modeled against the PostgreSQL `to_char` template strings support, limited to the following patterns: - YYYY, YYY, YY for the year part - MM for the numeric month part - DD for the numeric day part - HH, HH12, HH24 for the hour part - am, AM, a.m., A.M. - pm, PM, p.m., P.M. - MI for the minutes part - SS for the seconds part - MS for the milliseconds part (4 digits) - US for the microseconds part (6 digits) - unparsed punctuation signs: - . * # @ T / \ and space Here's an example of a *date format* specification:: column-name [date format 'YYYY-MM-DD HH24-MI-SS.US'] - *null if* This option takes an argument which is either the keyword *blanks* or a double-quoted string. When *blanks* is used and the field value that is read contains only space characters, then it's automatically converted to an SQL `NULL` value. When a double-quoted string is used and that string is read as the field value, then the field value is automatically converted to an SQL `NULL` value. - *trim both whitespace*, *trim left whitespace*, *trim right whitespace* This option allows to trim whitespaces in the read data, either from both sides of the data, or only the whitespace characters found on the left of the streaing, or only those on the right of the string. Fixed File Format Loading Options: WITH --------------------------------------- When loading from a `FIXED` file, the following options are supported: - *truncate* When this option is listed, pgloader issues a `TRUNCATE` command against the PostgreSQL target table before reading the data file. - *disable triggers* When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE TRIGGER ALL` command against the PostgreSQL target table before copying the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the `COPY` is done. This option allows loading data into a pre-existing table ignoring the *foreign key constraints* and user defined triggers and may result in invalid *foreign key constraints* once the data is loaded. Use with care. - *skip header* Takes a numeric value as argument. 
Instruct pgloader to skip that many lines at the beginning of the input file.
pgloader-3.6.1/docs/ref/ixf.rst000066400000000000000000000041751342135037700163550ustar00rootroot00000000000000Loading IXF Data
================

This command instructs pgloader to load data from an IBM `IXF` file.

Here's an example::

    LOAD IXF
         FROM data/nsitra.test1.ixf
         INTO postgresql:///pgloader
       TARGET TABLE nsitra.test1
         WITH truncate, create table, timezone UTC

      BEFORE LOAD DO
       $$ create schema if not exists nsitra; $$,
       $$ drop table if exists nsitra.test1; $$;

The `ixf` format command accepts the following clauses and options.

IXF Source Specification: FROM
------------------------------

Filename where to load the data from. This supports local files, HTTP URLs and zip files containing a single ixf file of the same name. Fetching such a zip file from an HTTP address is of course supported.

IXF Loading Options: WITH
-------------------------

When loading from an `IXF` file, the following options are supported:

- *truncate*

  When this option is listed, pgloader issues a `TRUNCATE` command against the PostgreSQL target table before reading the data file.

- *disable triggers*

  When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE TRIGGER ALL` command against the PostgreSQL target table before copying the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the `COPY` is done.

  This option allows loading data into a pre-existing table ignoring the *foreign key constraints* and user defined triggers and may result in invalid *foreign key constraints* once the data is loaded. Use with care.

- *create table*

  When this option is listed, pgloader creates the table using the meta data found in the `IXF` file, which must contain a list of fields with their data type. A standard data type conversion from IXF to PostgreSQL is done.

- *table name*

  This option expects as its value the possibly qualified name of the table to create.

- *timezone*

  This option allows to specify which timezone is used when parsing timestamps from an IXF file, and defaults to *UTC*. Expected values are either `UTC`, `GMT` or a single quoted location name such as `'Universal'` or `'Europe/Paris'`.
pgloader-3.6.1/docs/ref/mssql.rst000066400000000000000000000157301342135037700167260ustar00rootroot00000000000000Migrating a MS SQL Database to PostgreSQL
=========================================

This command instructs pgloader to load data from a MS SQL database. Automatic discovery of the schema is supported, including build of the indexes, primary and foreign key constraints.

Here's an example::

    load database
         from mssql://user@host/dbname
         into postgresql:///dbname

    including only table names like 'GlobalAccount' in schema 'dbo'

    set work_mem to '16MB', maintenance_work_mem to '512 MB'

    before load do $$ drop schema if exists dbo cascade; $$;

The `mssql` command accepts the following clauses and options.

MS SQL Database Source Specification: FROM
------------------------------------------

Connection string to an existing MS SQL database server that listens and welcomes external TCP/IP connections. As pgloader currently piggybacks on the FreeTDS driver, to change the port of the server please export the `TDSPORT` environment variable.

MS SQL Database Migration Options: WITH
---------------------------------------

When loading from a `MS SQL` database, the same options as when loading a `MySQL` database are supported. Please refer to the MySQL section.
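By way of illustration, here is a sketch of what such a *WITH* clause might look like in an MS SQL migration command; the connection strings are placeholders and the options listed are the shared ones documented in the MySQL section::

    load database
         from mssql://user@host/dbname
         into postgresql:///dbname

    with include drop, create tables, create indexes, reset sequences;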
The following options are added: - *create schemas* When this option is listed, pgloader creates the same schemas as found on the MS SQL instance. This is the default. - *create no schemas* When this option is listed, pgloader refrains from creating any schemas at all, you must then ensure that the target schema do exist. MS SQL Database Casting Rules ----------------------------- CAST ^^^^ The cast clause allows to specify custom casting rules, either to overload the default casting rules or to amend them with special cases. Please refer to the MS SQL CAST clause for details. MS SQL Views Support -------------------- MS SQL views support allows pgloader to migrate view as if they were base tables. This feature then allows for on-the-fly transformation from MS SQL to PostgreSQL, as the view definition is used rather than the base data. MATERIALIZE VIEWS ^^^^^^^^^^^^^^^^^ This clause allows you to implement custom data processing at the data source by providing a *view definition* against which pgloader will query the data. It's not possible to just allow for plain `SQL` because we want to know a lot about the exact data types of each column involved in the query output. This clause expect a comma separated list of view definitions, each one being either the name of an existing view in your database or the following expression:: *name* `AS` `$$` *sql query* `$$` The *name* and the *sql query* will be used in a `CREATE VIEW` statement at the beginning of the data loading, and the resulting view will then be dropped at the end of the data loading. MATERIALIZE ALL VIEWS ^^^^^^^^^^^^^^^^^^^^^ Same behaviour as *MATERIALIZE VIEWS* using the dynamic list of views as returned by MS SQL rather than asking the user to specify the list. MS SQL Partial Migration ------------------------ INCLUDING ONLY TABLE NAMES LIKE ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Introduce a comma separated list of table name patterns used to limit the tables to migrate to a sublist. More than one such clause may be used, they will be accumulated together. Example:: including only table names like 'GlobalAccount' in schema 'dbo' EXCLUDING TABLE NAMES LIKE ^^^^^^^^^^^^^^^^^^^^^^^^^^ Introduce a comma separated list of table name patterns used to exclude table names from the migration. This filter only applies to the result of the *INCLUDING* filter. :: excluding table names matching 'LocalAccount' in schema 'dbo' MS SQL Schema Transformations ----------------------------- ALTER SCHEMA '...' RENAME TO '...' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Allows to rename a schema on the flight, so that for instance the tables found in the schema 'dbo' in your source database will get migrated into the schema 'public' in the target database with this command:: alter schema 'dbo' rename to 'public' ALTER TABLE NAMES MATCHING ... IN SCHEMA '...' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Introduce a comma separated list of table names or *regular expressions* that you want to target in the pgloader *ALTER TABLE* command. Available actions are *SET SCHEMA*, *RENAME TO*, and *SET*:: ALTER TABLE NAMES MATCHING ~/_list$/, 'sales_by_store', ~/sales_by/ IN SCHEMA 'dbo' SET SCHEMA 'mv' ALTER TABLE NAMES MATCHING 'film' IN SCHEMA 'dbo' RENAME TO 'films' ALTER TABLE NAMES MATCHING ~/./ IN SCHEMA 'dbo' SET (fillfactor='40') ALTER TABLE NAMES MATCHING ~/./ IN SCHEMA 'dbo' SET TABLESPACE 'tlbspc' You can use as many such rules as you need. 
The list of tables to be migrated is searched in pgloader memory against the *ALTER TABLE* matching rules, and for each command pgloader stops at the first matching criterion (regexp or string). No *ALTER TABLE* command is sent to PostgreSQL, the modification happens at the level of the pgloader in-memory representation of your source database schema. In case of a name change, the mapping is kept and reused in the *foreign key* and *index* support.

The *SET ()* action takes effect as a *WITH* clause for the `CREATE TABLE` command that pgloader will run when it has to create a table.

The *SET TABLESPACE* action takes effect as a *TABLESPACE* clause for the `CREATE TABLE` command that pgloader will run when it has to create a table.

The matching is done in pgloader itself, with a Common Lisp regular expression lib, so it doesn't depend on the *LIKE* implementation of MS SQL, nor on the lack of support for regular expressions in the engine.

MS SQL Driver setup and encoding
--------------------------------

pgloader uses the `FreeTDS` driver, and internally expects the data to be sent in utf-8. To achieve that, you can configure the FreeTDS driver with these defaults, in the file `~/.freetds.conf`::

    [global]
        tds version = 7.4
        client charset = UTF-8

Default MS SQL Casting Rules
----------------------------

When migrating from MS SQL the following Casting Rules are provided:

Numbers::

    type tinyint to smallint

    type float to float using float-to-string
    type real to real using float-to-string
    type double to double precision using float-to-string
    type numeric to numeric using float-to-string
    type decimal to numeric using float-to-string
    type money to numeric using float-to-string
    type smallmoney to numeric using float-to-string

Texts::

    type char to text drop typemod
    type nchar to text drop typemod
    type varchar to text drop typemod
    type nvarchar to text drop typemod
    type xml to text drop typemod

Binary::

    type binary to bytea using byte-vector-to-bytea
    type varbinary to bytea using byte-vector-to-bytea

Date::

    type datetime to timestamptz
    type datetime2 to timestamptz

Others::

    type bit to boolean
    type hierarchyid to bytea
    type geography to bytea
    type uniqueidentifier to uuid using sql-server-uniqueidentifier-to-uuid
pgloader-3.6.1/docs/ref/mysql.rst000066400000000000000000000566401342135037700167350ustar00rootroot00000000000000Migrating a MySQL Database to PostgreSQL
========================================

This command instructs pgloader to load data from a database connection. pgloader supports dynamically converting the schema of the source database and building the indexes. A default set of casting rules is provided and might be overloaded and appended to by the command.

Here's an example using as many options as possible, some of them even being defaults. Chances are you don't need that complex a setup; don't copy and paste it, use it only as a reference!
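Before reading the full listing, note that when the defaults suit you the command reduces to its FROM and INTO clauses; the extended example right after this note shows most of the available clauses. A minimal sketch, assuming a local `sakila` source database and an already created target database::

    LOAD DATABASE
         FROM mysql://root@localhost/sakila
         INTO postgresql:///sakila;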
:: LOAD DATABASE FROM mysql://root@localhost/sakila INTO postgresql://localhost:54393/sakila WITH include drop, create tables, create indexes, reset sequences, workers = 8, concurrency = 1, multiple readers per thread, rows per range = 50000 SET PostgreSQL PARAMETERS maintenance_work_mem to '128MB', work_mem to '12MB', search_path to 'sakila, public, "$user"' SET MySQL PARAMETERS net_read_timeout = '120', net_write_timeout = '120' CAST type bigint when (= precision 20) to bigserial drop typemod, type date drop not null drop default using zero-dates-to-null, -- type tinyint to boolean using tinyint-to-boolean, type year to integer MATERIALIZE VIEWS film_list, staff_list -- INCLUDING ONLY TABLE NAMES MATCHING ~/film/, 'actor' -- EXCLUDING TABLE NAMES MATCHING ~ -- DECODING TABLE NAMES MATCHING ~/messed/, ~/encoding/ AS utf8 -- ALTER TABLE NAMES MATCHING 'film' RENAME TO 'films' -- ALTER TABLE NAMES MATCHING ~/_list$/ SET SCHEMA 'mv' ALTER TABLE NAMES MATCHING ~/_list$/, 'sales_by_store', ~/sales_by/ SET SCHEMA 'mv' ALTER TABLE NAMES MATCHING 'film' RENAME TO 'films' ALTER TABLE NAMES MATCHING ~/./ SET (fillfactor='40') ALTER SCHEMA 'sakila' RENAME TO 'pagila' BEFORE LOAD DO $$ create schema if not exists pagila; $$, $$ create schema if not exists mv; $$, $$ alter database sakila set search_path to pagila, mv, public; $$; The `database` command accepts the following clauses and options. MySQL Database Source Specification: FROM ----------------------------------------- Must be a connection URL pointing to a MySQL database. If the connection URI contains a table name, then only this table is migrated from MySQL to PostgreSQL. See the `SOURCE CONNECTION STRING` section above for details on how to write the connection string. The MySQL connection string accepts the same parameter *sslmode* as the PostgreSQL connection string, but the *verify* mode is not implemented (yet). :: mysql://[user[:password]@][netloc][:port][/dbname][?option=value&...] - *options* The same notation rules as found in the *Connection String* parts of the documentation apply, and we have a specific MySQL option: `useSSL`. The value for `useSSL` can be either `false` or `true`. If both `sslmode` and `useSSL` are used in the same connection string, pgloader behavior is undefined. The MySQL connection string also accepts the *useSSL* parameter with values being either *false* or *true*. Environment variables described in can be used as default values too. If the user is not provided, then it defaults to `USER` environment variable value. The password can be provided with the environment variable `MYSQL_PWD`. The host can be provided with the environment variable `MYSQL_HOST` and otherwise defaults to `localhost`. The port can be provided with the environment variable `MYSQL_TCP_PORT` and otherwise defaults to `3306`. MySQL Database Migration Options: WITH -------------------------------------- When loading from a `MySQL` database, the following options are supported, and the default *WITH* clause is: *no truncate*, *create schema*, *create tables*, *include drop*, *create indexes*, *reset sequences*, *foreign keys*, *downcase identifiers*, *uniquify index names*. - *include drop* When this option is listed, pgloader drops all the tables in the target PostgreSQL database whose names appear in the MySQL database. This option allows for using the same command several times in a row until you figure out all the options, starting automatically from a clean environment. 
Please note that `CASCADE` is used to ensure that tables are dropped even if there are foreign keys pointing to them. This is precisely what `include drop` is intended to do: drop all target tables and recreate them. Great care needs to be taken when using `include drop`, as it will cascade to *all* objects referencing the target tables, possibly including other tables that are not being loaded from the source DB. - *include no drop* When this option is listed, pgloader will not include any `DROP` statement when loading the data. - *truncate* When this option is listed, pgloader issue the `TRUNCATE` command against each PostgreSQL table just before loading data into it. - *no truncate* When this option is listed, pgloader issues no `TRUNCATE` command. - *disable triggers* When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE TRIGGER ALL` command against the PostgreSQL target table before copying the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the `COPY` is done. This option allows loading data into a pre-existing table ignoring the *foreign key constraints* and user defined triggers and may result in invalid *foreign key constraints* once the data is loaded. Use with care. - *create tables* When this option is listed, pgloader creates the table using the meta data found in the `MySQL` file, which must contain a list of fields with their data type. A standard data type conversion from DBF to PostgreSQL is done. - *create no tables* When this option is listed, pgloader skips the creation of table before loading data, target tables must then already exist. Also, when using *create no tables* pgloader fetches the metadata from the current target database and checks type casting, then will remove constraints and indexes prior to loading the data and install them back again once the loading is done. - *create indexes* When this option is listed, pgloader gets the definitions of all the indexes found in the MySQL database and create the same set of index definitions against the PostgreSQL database. - *create no indexes* When this option is listed, pgloader skips the creating indexes. - *drop indexes* When this option is listed, pgloader drops the indexes in the target database before loading the data, and creates them again at the end of the data copy. - *uniquify index names*, *preserve index names* MySQL index names are unique per-table whereas in PostgreSQL index names have to be unique per-schema. The default for pgloader is to change the index name by prefixing it with `idx_OID` where `OID` is the internal numeric identifier of the table the index is built against. In somes cases like when the DDL are entirely left to a framework it might be sensible for pgloader to refrain from handling index unique names, that is achieved by using the *preserve index names* option. The default is to *uniquify index names*. Even when using the option *preserve index names*, MySQL primary key indexes named "PRIMARY" will get their names uniquified. Failing to do so would prevent the primary keys to be created again in PostgreSQL where the index names must be unique per schema. - *drop schema* When this option is listed, pgloader drops the target schema in the target PostgreSQL database before creating it again and all the objects it contains. The default behavior doesn't drop the target schemas. 
- *foreign keys* When this option is listed, pgloader gets the definitions of all the foreign keys found in the MySQL database and create the same set of foreign key definitions against the PostgreSQL database. - *no foreign keys* When this option is listed, pgloader skips creating foreign keys. - *reset sequences* When this option is listed, at the end of the data loading and after the indexes have all been created, pgloader resets all the PostgreSQL sequences created to the current maximum value of the column they are attached to. The options *schema only* and *data only* have no effects on this option. - *reset no sequences* When this option is listed, pgloader skips resetting sequences after the load. The options *schema only* and *data only* have no effects on this option. - *downcase identifiers* When this option is listed, pgloader converts all MySQL identifiers (table names, index names, column names) to *downcase*, except for PostgreSQL *reserved* keywords. The PostgreSQL *reserved* keywords are determined dynamically by using the system function `pg_get_keywords()`. - *quote identifiers* When this option is listed, pgloader quotes all MySQL identifiers so that their case is respected. Note that you will then have to do the same thing in your application code queries. - *schema only* When this option is listed pgloader refrains from migrating the data over. Note that the schema in this context includes the indexes when the option *create indexes* has been listed. - *data only* When this option is listed pgloader only issues the `COPY` statements, without doing any other processing. - *single reader per thread*, *multiple readers per thread* The default is *single reader per thread* and it means that each MySQL table is read by a single thread as a whole, with a single `SELECT` statement using no `WHERE` clause. When using *multiple readers per thread* pgloader may be able to divide the reading work into several threads, as many as the *concurrency* setting, which needs to be greater than 1 for this option to kick be activated. For each source table, pgloader searches for a primary key over a single numeric column, or a multiple-column primary key index for which the first column is of a numeric data type (one of `integer` or `bigint`). When such an index exists, pgloader runs a query to find the *min* and *max* values on this column, and then split that range into many ranges containing a maximum of *rows per range*. When the range list we then obtain contains at least as many ranges than our concurrency setting, then we distribute those ranges to each reader thread. So when all the conditions are met, pgloader then starts as many reader thread as the *concurrency* setting, and each reader thread issues several queries with a `WHERE id >= x AND id < y`, where `y - x = rows per range` or less (for the last range, depending on the max value just obtained. - *rows per range* How many rows are fetched per `SELECT` query when using *multiple readers per thread*, see above for details. - *SET MySQL PARAMETERS* The *SET MySQL PARAMETERS* allows setting MySQL parameters using the MySQL `SET` command each time pgloader connects to it. MySQL Database Casting Rules ---------------------------- The command *CAST* introduces user-defined casting rules. The cast clause allows to specify custom casting rules, either to overload the default casting rules or to amend them with special cases. A casting rule is expected to follow one of the forms:: type [ ... ] to [
<option> ... ]

When the target of the migration is a Citus cluster, pgloader also accepts *distribute* rules that declare how each table is to be distributed. The *distribute* command is expected to follow one of the forms::

    distribute <table name> using <column name>
    distribute <table name> using <column name> from <table> [, <table>, ...]
    distribute <table name>
as reference table When using the distribute command, the following steps are added to pgloader operations when migrating the schema: - if the distribution column does not exist in the table, it is added as the first column of the table - if the distribution column does not exists in the primary key of the table, it is added as the first column of the primary of the table - all the foreign keys that point to the table are added the distribution key automatically too, including the source tables of the foreign key constraints - once the schema has been created on the target database, pgloader then issues Citus specific command `create_reference_table() `_ and `create_distributed_table() `_ to make the tables distributed Those operations are done in the schema section of pgloader, before the data is loaded. When the data is loaded, the newly added columns need to be backfilled from referenced data. pgloader knows how to do that by generating a query like the following and importing the result set of such a query rather than the raw data from the source table. Citus Migration Example ^^^^^^^^^^^^^^^^^^^^^^^ With the migration command as above, pgloader adds the column ``company_id`` to the tables that have a direct or indirect foreign key reference to the ``companies`` table. We run pgloader using the following command, where the file `./test/citus/company.load `_ contains the pgloader command as shown above. :: $ pgloader --client-min-messages sql ./test/citus/company.load The following SQL statements are all extracted from the log messages that the pgloader command outputs. We are going to have a look at the `impressions` table. It gets created with a new column `company_id` in the first position, as follows: :: CREATE TABLE "public"."impressions" ( company_id bigint, "id" bigserial, "ad_id" bigint default NULL, "seen_at" timestamp with time zone default NULL, "site_url" text default NULL, "cost_per_impression_usd" numeric(20,10) default NULL, "user_ip" inet default NULL, "user_data" jsonb default NULL ); The original schema for this table does not have the `company_id` column, which means pgloader now needs to change the primary key definition, the foreign keys constraints definitions from and to this table, and also to *backfill* the `company_id` data to this table when doing the COPY phase of the migration. Then once the tables have been created, pgloader executes the following SQL statements:: SELECT create_distributed_table('"public"."companies"', 'id'); SELECT create_distributed_table('"public"."campaigns"', 'company_id'); SELECT create_distributed_table('"public"."ads"', 'company_id'); SELECT create_distributed_table('"public"."clicks"', 'company_id'); SELECT create_distributed_table('"public"."impressions"', 'company_id'); Then when copying the data from the source PostgreSQL database to the new Citus tables, the new column (here ``company_id``) needs to be backfilled from the source tables. Here's the SQL query that pgloader uses as a data source for the ``ads`` table in our example: :: SELECT "campaigns".company_id::text, "ads".id::text, "ads".campaign_id::text, "ads".name::text, "ads".image_url::text, "ads".target_url::text, "ads".impressions_count::text, "ads".clicks_count::text, "ads".created_at::text, "ads".updated_at::text FROM "public"."ads" JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id The ``impressions`` table has an indirect foreign key reference to the ``company`` table, which is the table where the distribution key is specified. 
pgloader will discover that itself from walking the PostgreSQL catalogs, and you may also use the following specification in the pgloader command to explicitly add the indirect dependency:

::

    distribute impressions using company_id from ads, campaigns

Given this schema, the SQL query used by pgloader to fetch the data for the `impressions` table is the following, implementing online backfilling of the data:

::

    SELECT "campaigns".company_id::text, "impressions".id::text,
           "impressions".ad_id::text, "impressions".seen_at::text,
           "impressions".site_url::text, "impressions".cost_per_impression_usd::text,
           "impressions".user_ip::text, "impressions".user_data::text
      FROM "public"."impressions"
           JOIN "public"."ads" ON impressions.ad_id = ads.id
           JOIN "public"."campaigns" ON ads.campaign_id = campaigns.id

When the data copying is done, pgloader also has to install the indexes supporting the primary keys, and add the foreign key definitions to the schema. Those definitions are not the same as in the source schema, because of the addition of the distribution column to the table: we need to also add the column to the primary key and the foreign key constraints. Here are the commands issued by pgloader for the ``impressions`` table:

::

    CREATE UNIQUE INDEX "impressions_pkey" ON "public"."impressions" (company_id, id);

    ALTER TABLE "public"."impressions"
          ADD CONSTRAINT "impressions_ad_id_fkey"
          FOREIGN KEY(company_id,ad_id)
          REFERENCES "public"."ads"(company_id,id)

Given a single line of specification ``distribute companies using id``, pgloader implements all the necessary schema changes on the fly when migrating to Citus, and also dynamically backfills the data.

Citus Migration: Limitations
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The way pgloader implements *reset sequence* does not work with Citus at this point, so sequences need to be taken care of separately.
pgloader-3.6.1/docs/ref/pgsql-redshift.rst000066400000000000000000000055201342135037700205170ustar00rootroot00000000000000Support for Redshift in pgloader
================================

The command and behavior are the same as when migrating from a PostgreSQL database source. pgloader automatically discovers that it's talking to a Redshift database by parsing the output of the `SELECT version()` SQL query.

Redshift as a data source
^^^^^^^^^^^^^^^^^^^^^^^^^

Redshift is a variant of PostgreSQL version 8.0.2, which allows pgloader to work with only a very small amount of adaptation in the catalog queries used. In other words, migrating from Redshift to PostgreSQL works just the same as when migrating from a PostgreSQL data source, including the connection string specification.

Redshift as a data destination
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The Redshift variant of PostgreSQL 8.0.2 does not have support for the ``COPY FROM STDIN`` feature that pgloader normally relies upon. To use COPY with Redshift, the data must first be made available in an S3 bucket.

First, pgloader must authenticate to Amazon S3. pgloader uses the following setup for that:

- ``~/.aws/config``

  This INI formatted file contains sections with your default region and other global values relevant to using the S3 API. pgloader parses it to get the region when it's set up in the ``default`` INI section. The environment variable ``AWS_DEFAULT_REGION`` can be used to override the configuration file value.
- ``~/.aws/credentials`` The INI formatted file contains your authentication setup to Amazon, with the properties ``aws_access_key_id`` and ``aws_secret_access_key`` in the section ``default``. pgloader parses this file for those keys, and uses their values when communicating with Amazon S3. The environment variables ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` can be used to override the configuration file - ``AWS_S3_BUCKET_NAME`` Finally, the value of the environment variable ``AWS_S3_BUCKET_NAME`` is used by pgloader as the name of the S3 bucket where to upload the files to COPY to the Redshift database. The bucket name defaults to ``pgloader``. Then pgloader works as usual, see the other sections of the documentation for the details, depending on the data source (files, other databases, etc). When preparing the data for PostgreSQL, pgloader now uploads each batch into a single CSV file, and then issue such as the following, for each batch: :: COPY FROM 's3:///' FORMAT CSV TIMEFORMAT 'auto' REGION '' ACCESS_KEY_ID '' SECRET_ACCESS_KEY '; This is the only difference with a PostgreSQL core version, where pgloader can rely on the classic ``COPY FROM STDIN`` command, which allows to send data through the already established connection to PostgreSQL. pgloader-3.6.1/docs/ref/pgsql.rst000066400000000000000000000345601342135037700167170ustar00rootroot00000000000000Migrating a PostgreSQL Database to PostgreSQL ============================================= This command instructs pgloader to load data from a database connection. Automatic discovery of the schema is supported, including build of the indexes, primary and foreign keys constraints. A default set of casting rules are provided and might be overloaded and appended to by the command. Here's a short example of migrating a database from a PostgreSQL server to another: :: load database from pgsql://localhost/pgloader into pgsql://localhost/copy including only table names matching 'bits', ~/utilisateur/ in schema 'mysql' including only table names matching ~/geolocations/ in schema 'public' ; PostgreSQL Database Source Specification: FROM ---------------------------------------------- Must be a connection URL pointing to a PostgreSQL database. See the `SOURCE CONNECTION STRING` section above for details on how to write the connection string. :: pgsql://[user[:password]@][netloc][:port][/dbname][?option=value&...] PostgreSQL Database Migration Options: WITH ------------------------------------------- When loading from a `PostgreSQL` database, the following options are supported, and the default *WITH* clause is: *no truncate*, *create schema*, *create tables*, *include drop*, *create indexes*, *reset sequences*, *foreign keys*, *downcase identifiers*, *uniquify index names*, *reindex*. - *include drop* When this option is listed, pgloader drops all the tables in the target PostgreSQL database whose names appear in the MySQL database. This option allows for using the same command several times in a row until you figure out all the options, starting automatically from a clean environment. Please note that `CASCADE` is used to ensure that tables are dropped even if there are foreign keys pointing to them. This is precisely what `include drop` is intended to do: drop all target tables and recreate them. Great care needs to be taken when using `include drop`, as it will cascade to *all* objects referencing the target tables, possibly including other tables that are not being loaded from the source DB. 
pgloader-3.6.1/docs/ref/pgsql.rst

Migrating a PostgreSQL Database to PostgreSQL
=============================================

This command instructs pgloader to load data from a database connection.
Automatic discovery of the schema is supported, including building the
indexes and the primary and foreign key constraints. A default set of
casting rules is provided and may be overloaded and appended to by the
command.

Here's a short example of migrating a database from a PostgreSQL server to
another: ::

    load database
         from pgsql://localhost/pgloader
         into pgsql://localhost/copy

     including only table names matching 'bits', ~/utilisateur/ in schema 'mysql'
     including only table names matching ~/geolocations/ in schema 'public'
    ;

PostgreSQL Database Source Specification: FROM
----------------------------------------------

Must be a connection URL pointing to a PostgreSQL database. See the `SOURCE
CONNECTION STRING` section above for details on how to write the connection
string. ::

   pgsql://[user[:password]@][netloc][:port][/dbname][?option=value&...]

PostgreSQL Database Migration Options: WITH
-------------------------------------------

When loading from a `PostgreSQL` database, the following options are
supported, and the default *WITH* clause is: *no truncate*, *create schema*,
*create tables*, *include drop*, *create indexes*, *reset sequences*,
*foreign keys*, *downcase identifiers*, *uniquify index names*, *reindex*. A
sketch of a load command combining several of these options follows the
list.

  - *include drop*

    When this option is listed, pgloader drops all the tables in the target
    PostgreSQL database whose names appear in the source database. This
    option allows for using the same command several times in a row until
    you figure out all the options, starting automatically from a clean
    environment. Please note that `CASCADE` is used to ensure that tables
    are dropped even if there are foreign keys pointing to them. This is
    precisely what `include drop` is intended to do: drop all target tables
    and recreate them.

    Great care needs to be taken when using `include drop`, as it will
    cascade to *all* objects referencing the target tables, possibly
    including other tables that are not being loaded from the source DB.

  - *include no drop*

    When this option is listed, pgloader will not include any `DROP`
    statement when loading the data.

  - *truncate*

    When this option is listed, pgloader issues the `TRUNCATE` command
    against each PostgreSQL table just before loading data into it.

  - *no truncate*

    When this option is listed, pgloader issues no `TRUNCATE` command.

  - *disable triggers*

    When this option is listed, pgloader issues an `ALTER TABLE ... DISABLE
    TRIGGER ALL` command against the PostgreSQL target table before copying
    the data, then the command `ALTER TABLE ... ENABLE TRIGGER ALL` once the
    `COPY` is done.

    This option allows loading data into a pre-existing table ignoring the
    *foreign key constraints* and user-defined triggers, and may result in
    invalid *foreign key constraints* once the data is loaded. Use with
    care.

  - *create tables*

    When this option is listed, pgloader creates the tables using the
    metadata found in the source database catalogs, which provide the list
    of columns with their data types. A standard data type conversion from
    the source to PostgreSQL is then applied.

  - *create no tables*

    When this option is listed, pgloader skips creating tables before
    loading data; target tables must then already exist.

    Also, when using *create no tables*, pgloader fetches the metadata from
    the current target database and checks type casting, then removes
    constraints and indexes prior to loading the data and installs them back
    again once the loading is done.

  - *create indexes*

    When this option is listed, pgloader gets the definitions of all the
    indexes found in the source database and creates the same set of index
    definitions against the target PostgreSQL database.

  - *create no indexes*

    When this option is listed, pgloader skips creating indexes.

  - *drop indexes*

    When this option is listed, pgloader drops the indexes in the target
    database before loading the data, and creates them again at the end of
    the data copy.

  - *reindex*

    When this option is used, pgloader does both *drop indexes* before
    loading the data and *create indexes* once the data is loaded.

  - *drop schema*

    When this option is listed, pgloader drops the target schema in the
    target PostgreSQL database before creating it again, along with all the
    objects it contains. The default behavior is to not drop the target
    schemas.

  - *foreign keys*

    When this option is listed, pgloader gets the definitions of all the
    foreign keys found in the source database and creates the same set of
    foreign key definitions against the target PostgreSQL database.

  - *no foreign keys*

    When this option is listed, pgloader skips creating foreign keys.

  - *reset sequences*

    When this option is listed, at the end of the data loading and after the
    indexes have all been created, pgloader resets all the PostgreSQL
    sequences created to the current maximum value of the column they are
    attached to.

    The options *schema only* and *data only* have no effect on this option.

  - *reset no sequences*

    When this option is listed, pgloader skips resetting sequences after the
    load.

    The options *schema only* and *data only* have no effect on this option.

  - *downcase identifiers*

    When this option is listed, pgloader converts all source identifiers
    (table names, index names, column names) to *downcase*, except for
    PostgreSQL *reserved* keywords.

    The PostgreSQL *reserved* keywords are determined dynamically by using
    the system function `pg_get_keywords()`.

  - *quote identifiers*

    When this option is listed, pgloader quotes all source identifiers so
    that their case is respected. Note that you will then have to do the
    same thing in your application code queries.

  - *schema only*

    When this option is listed, pgloader refrains from migrating the data
    over. Note that the schema in this context includes the indexes when the
    option *create indexes* has been listed.

  - *data only*

    When this option is listed, pgloader only issues the `COPY` statements,
    without doing any other processing.

  - *rows per range*

    How many rows are fetched per `SELECT` query when using *multiple
    readers per thread*; see above for details.
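As an illustrative sketch only (reusing the made-up connection strings from
the example above), a load command combining several of these options might
look like the following: ::

    load database
         from pgsql://localhost/pgloader
         into pgsql://localhost/copy

    with include drop, create tables, create indexes,
         reset sequences, foreign keys, quote identifiers
    ;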
PostgreSQL Database Casting Rules
---------------------------------

The command *CAST* introduces user-defined casting rules.

The cast clause allows specifying custom casting rules, either to overload
the default casting rules or to amend them with special cases.

A casting rule is expected to follow one of the forms::

    type [ ... ] to [