pax_global_header00006660000000000000000000000064145046132470014520gustar00rootroot0000000000000052 comment=0c6358dc3559474d9baaa6c525a7ee47effe595c icu_ext-1.8.0/000077500000000000000000000000001450461324700131665ustar00rootroot00000000000000icu_ext-1.8.0/.gitignore000066400000000000000000000006561450461324700151650ustar00rootroot00000000000000# Prerequisites *.d # Object files *.o *.ko *.obj *.elf # Linker output *.ilk *.map *.exp # Precompiled Headers *.gch *.pch # Libraries *.lib *.a *.la *.lo # Shared objects (inc. Windows DLLs) *.dll *.so *.so.* *.dylib # Executables *.exe a.out *.app *.i*86 *.x86_64 *.hex # Debug files *.dSYM/ *.su *.idb *.pdb # Kernel Module Compile Results *.mod* *.cmd .tmp_versions/ modules.order Module.symvers Mkfile.old dkms.conf icu_ext-1.8.0/LICENSE.md000066400000000000000000000017151450461324700145760ustar00rootroot00000000000000# Copyright and License Copyright (c) 2018-2023, Daniel Vérité Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. In no event shall Daniel Vérité be liable to any party for direct, indirect, special, incidental, or consequential damages, including lost profits, arising out of the use of this software and its documentation, even if Daniel Vérité has been advised of the possibility of such damage. Daniel Vérité specifically disclaims any warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The software provided hereunder is on an "AS IS" basis, and Daniel Vérité has no obligations to provide maintenance, support, updates, enhancements, or modifications. 
icu_ext-1.8.0/META.json000066400000000000000000000020671450461324700146140ustar00rootroot00000000000000{ "name": "icu_ext", "abstract": "Extension to expose functionality from the ICU (Unicode) library", "version": "1.8.0", "release_status": "stable", "maintainer": "Daniel Vérité ", "license": "postgresql", "prereqs": { "runtime": { "requires": { "PostgreSQL": "10.0.0" } } }, "provides": { "icu_ext": { "file": "sql/icu_ext--1.3.sql", "version": "1.8.0", "abstract": "Extension to expose functionality from the ICU (Unicode) library" } }, "resources": { "bugtracker": { "web": "https://github.com/dverite/icu_ext/issues" }, "repository": { "url": "git://github.com/dverite/icu_ext.git" , "web": "https://github.com/dverite/icu_ext", "type": "git" } }, "meta-spec": { "version": "1.0.0", "url": "http://pgxn.org/meta/spec.txt" }, "tags": [ "icu", "unicode", "collation" ] } icu_ext-1.8.0/Makefile000066400000000000000000000012021450461324700146210ustar00rootroot00000000000000EXTENSION = icu_ext EXTVERSION = 1.8 PG_CONFIG = pg_config DATA = $(wildcard sql/icu_*.sql) MODULE_big = icu_ext OBJS = icu_ext.o icu_break.o icu_num.o icu_spoof.o icu_transform.o \ icu_search.o icu_normalize.o icu_date.o icu_timestamptz.o icu_interval.o SHLIB_LINK = $(ICU_LIBS) REGRESS = tests-01 tests-datetime EXTRA_CLEAN = expected/tests.out all: PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) override CFLAGS += -g # added with PG16 built with meson. Not sure it should be kept. 
dist: tar cjf $(EXTENSION)-$(EXTVERSION).tar.bz2 Makefile META.json icu_ext.control *.md *.c *.h sql/ expected/ META.json .PHONY: dist icu_ext-1.8.0/README-datetime.md000066400000000000000000000235161450461324700162460ustar00rootroot00000000000000# Date and time functionalities in icu_ext Postgres core provides a comprehensive set of types and functions that work with the widely used gregorian calendar, but does not support the [traditional calendars](https://en.wikipedia.org/wiki/List_of_calendars) used in some parts of the world. These calendars differ mostly by when they start, how many months there are in years and how they're named, and how many days there are in months. Since the ICU library can handle many of these traditional calendars, `icu_ext` exposes them in Postgres through an alternate set of SQL functions, types and operators. ## Locale settings The calendar and the language used for date and time are defined through a locale string: `language[_country][@calendar=caltype]`. `language` and `country` are the usual short codes, as in `en_US` or `fr_CA` (see the output of `icu_locales_list()` for a full list). The choice of language selects the associated translations, and along with the country it influences how dates are displayed when using the basic formats with respect to cultural conventions (see the formatting options below). Default values will be guessed from the environment when the language or calendar are not specified. The accepted values for `caltype` are, as of ICU 70: * buddhist * chinese * coptic * dangi * ethiopic * ethiopic-amete-alem * gregorian * hebrew * indian * islamic * islamic-civil * islamic-rgsa * islamic-tbla * islamic-umalqura * iso8601 * japanese * persian * roc The locale can be passed to the `icu_parse_date()` and `icu_format_date()` functions, or assigned to the `icu_ext.locale` configuration setting to affect the behavior of the `icu_date` and `icu_timestamptz` types implemented by the extension. 
## Format strings for dates and timestamp The fields available in the text representation of date and timestamps are described in [Formatting Dates and Times](https://unicode-org.github.io/icu/userguide/format_parse/datetime/) (ICU documentation). The format strings composed of these fields are passed to `icu_format_date`, `icu_parse_date`, and used in the configuration settings `icu_ext.timestamptz_format` and `icu_ext.date_format` described below. As an alternative to specifying individual fields and separators, the format string can consist of a reference to a basic format, as described in the [CLDR](https://cldr.unicode.org/translation/date-time/date-time-patterns) : - `{short}` - `{medium}` - `{long}` - `{full}` The format code must be enclosed by curly brackets as shown in the list, with nothing else in the format string. When using these forms, which fields are displayed and in what order is determined by the language and country of the ICU locale. These values match the ICU enum [UDateFormatStyle](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/udat_8h.html#adb4c5a95efb888d04d38db7b3efff0c5) Dates can also be expressed relative to the current day with the `relative` keyword added. The formats can be expressed as: - `{short relative}` - `{medium relative}` - `{long relative}` - `{full relative}` ## Functions taking core types ### icu_format_date (`input` date, `format` text [,`locale` text]) Return the string representing the input date with the given `format` and `locale` as described above. If `locale` is not specified, the current ICU locale is used. Example: ```sql => select icu_format_date('2020-12-31'::date, '{medium}', 'en@calendar=ethiopic'); icu_format_date ---------------------- Tahsas 22, 2013 ERA1 ``` ### icu_format_datetime (`input` timestamptz, `format` text [,`locale` text]) Return the string representing the timestamp with time zone `input` with the given `format` and `locale` as described above. 
If `locale` is not specified, the current ICU locale is used. Example: => SELECT icu_format_datetime( now(), 'GGGG dd/MMMM/yyyy HH:mm:ss.SSS z', 'fr@calendar=buddhist' ); icu_format_datetime ------------------------------------------------ ère bouddhique 22/septembre/2566 14:55:48.133 UTC+2 ### icu_parse_date (`input` text, `format` text [,`locale` text]) Return a `date` resulting from parsing the input string according to `format` (see "format strings" above). The function will error out if the input string interpreted with the given `format` and `locale` does not strictly match the format or cannot be converted into a date. When `locale` is not specified, the current ICU locale is used. Example: => SET icu_ext.locale TO '@calendar=buddhist'; => SELECT icu_parse_date('25/09/2566', 'dd/MM/yyyy'); icu_parse_date ---------------- 2023-09-25 ### icu_parse_datetime (`input` text, `format` text [,`locale` text]) Return a `timestamp with time zone` resulting from parsing the input string according to `format`. This is similar to `icu_parse_date()` except that it parses a full timestamp instead of a date. Example: => SELECT icu_parse_datetime( '11/Meskerem/2016 14:57:17', 'dd/MMMM/yyyy HH:mm:ss', 'en@calendar=ethiopic' ); icu_parse_datetime ------------------------ 2023-09-22 14:57:17+02 ## Custom types ### icu_date It differs from the core built-in type `date` in the input and output formats that are accepted. `icu_date` text representation works with respect to `icu_ext.date_format` if set, and otherwise with the default format of the current ICU locale. To express non-finite dates, use `'infinity'::date::icu_date`. Internally, the representation is the same as the `date` type, and `icu_date` can be cast implicitly to and from `date`. 
Example: ```sql CREATE TABLE events(ev_name text, ev_date icu_date); INSERT INTO events VALUES('birthday', '2023-07-31'::date); SET icu_ext.locale TO 'orm@calendar=ethiopic'; SELECT * FROM events; +----------+--------------------+ | ev_name | ev_date | +----------+--------------------+ | birthday | 24-Hamle-2015 ERA1 | +----------+--------------------+ ``` ### icu_timestamptz It differs from the core built-in type `timestamp with time zone` (or `timestamptz` in short) in the input and output formats that are accepted. The text representation for `icu_timestamptz` works with respect to `icu_ext.timestamptz_format` if set, and otherwise with the default format of the current ICU locale. To express non-finite timestamps, use `'infinity'::timestamptz::icu_timestamptz`. Internally, the representation is the same as the `timestamptz` type, and `icu_timestamptz` can be cast directly to and from `timestamptz`. ### icu_interval Like the `interval` built-in data type, it represents spans of time with years, months, days and microseconds components that are meant to process calendar-aware calculations. It differs from `interval` in not assuming that one year always equals 12 months. For instance, in the ethiopic calendar, there are 13 months in a year. How spans of time are added to dates and timestamps depends on the current calendar. `icu_interval` accepts the same textual inputs as the `interval` data type. It also shares pretty much the same output except for not converting months to years. `icu_interval` can be cast from `interval`. Example: ```sql select '25 months'::interval, '25 months'::icu_interval; +---------------+--------------+ | interval | icu_interval | +---------------+--------------+ | 2 years 1 mon | 25 mons | +---------------+--------------+ ``` ## Operators ### icu_interval * int Multiply each component of the interval (years, months...) by the integer number. This operator is commutative. 
### icu_date + icu_interval Add the years, months, days and time from the interval to the date, with respect to the rules of the calendar of the current locale (`icu_ext.locale`). ### icu_date - icu_interval Subtract the years, months, days and time of the interval from the date, with respect to the rules of the calendar of the current locale (`icu_ext.locale`). ### icu_timestamptz + icu_interval Add the years, months, days and time from the interval to the timestamp, with respect to the rules of the calendar of the current locale (`icu_ext.locale`). This operator is commutative. ### icu_timestamptz - icu_interval Subtract the years, months, days and time of the interval from the timestamp, with respect to the rules of the calendar of the current locale (`icu_ext.locale`). ### icu_interval + icu_interval Add the intervals. The result does not depend on the current calendar. ### icu_interval - icu_interval Subtract the intervals. The result does not depend on the current calendar. ## Configurable settings There are three configuration settings that work together to control input and output of the `icu_date` and `icu_timestamptz` types. ### icu_ext.locale Locale to use for input/output and calendar-dependent calculations, as described in "Locale settings" above. ``` -- vietnamese language, buddhist calendar SET icu_ext.locale TO 'vi@calendar=buddhist'; SET icu_ext.timestamptz_format TO '{long}'; SELECT now()::icu_timestamptz; now ------------------------------------------------ Ngày 22 tháng 9 năm 2566 BE lúc 15:57:13 GMT+2 ``` ### icu_ext.date_format Format string used for the text representation of the `icu_date` datatype, both for input and output. The format is described in [Formatting Dates and Times](https://unicode-org.github.io/icu/userguide/format_parse/datetime/) (ICU documentation). The default value for this setting is `{medium}`. 
### icu_ext.timestamptz_format Format string used for the text representation of the `icu_timestamptz` datatype, both for input and output. The format is described in [Formatting Dates and Times](https://unicode-org.github.io/icu/userguide/format_parse/datetime/) (ICU documentation). This setting also accepts the same references to basic formats (short, medium, ...) as `icu_ext.date_format`, and its default value is `{medium}`. icu_ext-1.8.0/README.md000066400000000000000000000666711450461324700144650ustar00rootroot00000000000000# icu_ext An extension to expose functionality from [ICU](https://icu.unicode.org/) to PostgreSQL applications. It requires PostgreSQL version 11 or newer, configured with ICU (--with-icu). Note: this text is in GitHub Flavored Markdown format. Please see the version [on github](https://github.com/dverite/icu_ext/blob/master/README.md) if it's rendered weirdly elsewhere. ## Installation The Makefile uses the [PGXS infrastructure](https://www.postgresql.org/docs/current/static/extend-pgxs.html) to find include and library files and determine the install location. Build and install with: $ make $ (sudo) make install ## Types See [README-datetime.md](README-datetime.md) for the date and time data types and functionalities. 
## Functions ### Quick links (in alphabetical order) [icu_char_name](#icu_char_name) [icu_character_boundaries](#icu_character_boundaries) [icu_collation_attributes](#icu_collation_attributes) [icu_compare](#icu_compare) [icu_confusable_strings_check](#icu_confusable_strings_check) [icu_confusable_string_skeleton](#icu_confusable_string_skeleton) [icu_default_locale](#icu_default_locale) [icu_format_date](README-datetime.md#icu_format_date) [icu_format_datetime](README-datetime.md#icu_format_datetime) [icu_is_normalized](#icu_is_normalized) [icu_line_boundaries](#icu_line_boundaries) [icu_locales_list](#icu_locales_list) [icu_normalize](#icu_normalize) [icu_number_spellout](#icu_number_spellout) [icu_parse_date](README-datetime.md#icu_parse_date) [icu_parse_datetime](README-datetime.md#icu_parse_datetime) [icu_replace](#icu_replace) [icu_sentence_boundaries](#icu_sentence_boundaries) [icu_set_default_locale](#icu_set_default_locale) [icu_sort_key](#icu_sort_key) [icu_spoof_check](#icu_spoof_check) [icu_strpos](#icu_strpos) [icu_transform](#icu_transform) [icu_transforms_list](#icu_transforms_list) [icu_unicode_version](#icu_unicode_version) [icu_version](#icu_version) [icu_word_boundaries](#icu_word_boundaries) These functions work in both Unicode and non-Unicode databases. ### icu_version() Returns the version of the ICU library linked with the server. ### icu_unicode_version() Returns the version of the Unicode standard used by the ICU library linked with the server. ### icu_locales_list() Returns a table-type list of available ICU locales with their main properties (country code and name, language code and name, script, direction). When translations are available, the country and language names are localized with the default ICU locale, configurable with `icu_set_default_locale()`. Set it to `en` to force english names. 
Examples: =# SELECT * FROM icu_locales_list() where name like 'es%' limit 5; name | country | country_code | language | language_code | script | direction --------+---------------+--------------+----------+---------------+--------+----------- es | | | Spanish | spa | | LTR es_419 | Latin America | | Spanish | spa | | LTR es_AR | Argentina | ARG | Spanish | spa | | LTR es_BO | Bolivia | BOL | Spanish | spa | | LTR es_CL | Chile | CHL | Spanish | spa | | LTR =# SELECT name,country FROM icu_locales_list() where script='Simplified Han'; name | country ------------+--------------------- zh_Hans | zh_Hans_CN | China zh_Hans_HK | Hong Kong SAR China zh_Hans_MO | Macau SAR China zh_Hans_SG | Singapore This list is obtained independently from the collations declared to PostgreSQL (found in `pg_collation`). ### icu_collation_attributes(`collator` text [, `exclude_defaults` bool]) Lists the attributes, version and display name of an ICU collation, returned as a set of `(attribute,value)` tuples. The `collator` argument must designate an [ICU collator](https://unicode-org.github.io/icu/userguide/collation/api) and accepts several different syntaxes. In particular, a [locale ID](https://unicode-org.github.io/icu/userguide/locale) or (if ICU>=54) [language tags](https://unicode.org/reports/tr35/tr35-collation.html#Collation_Settings) may be used. Note that this argument is **not** a reference to a PostgreSQL collation, and that this function does not depend on whether a corresponding collation has been instantiated in the database with [`CREATE COLLATION`](https://www.postgresql.org/docs/current/static/sql-createcollation.html). To query the properties of an already created PostgreSQL ICU collation, refer to `pg_collation.collcollate` (which corresponds to the `lc_collate` argument of CREATE COLLATION). 
=# SELECT a.attribute,a.value FROM pg_collation JOIN LATERAL icu_collation_attributes(collcollate) a ON (collname='fr-CA-x-icu'); attribute | value -------------+------------------- displayname | français (Canada) kn | false kb | true kk | false ka | noignore ks | level3 kf | false kc | false kv | punct version | 153.80.33 `icu_collation_attributes()` is useful to check that the settings embedded into a collation name activate the intended options, because ICU parses them in a way that non-conformant parts tend to be silently ignored, and because the interpretation somewhat depends on the ICU version (in particular, pre-54 versions do not support options expressed as BCP-47 tags). It may be also useful to search existing collations by their properties. When `exclude_defaults` is set to `true`, attributes that have their default value are filtered out, to put in evidence the specifics of collations. For instance, to find the only collations that use `shifted` for the `Alternate` attribute: =# SELECT collname,collcollate,a.attribute,a.value FROM pg_collation JOIN LATERAL icu_collation_attributes(collcollate,true) a ON (attribute='ka') ; collname | collcollate | attribute | value -------------+-------------+-----------+--------- th-x-icu | th | ka | shifted th-TH-x-icu | th-TH | ka | shifted (2 rows) By default there is no filtering (`exclude_defaults` = false) so that all attributes known by the function as well as the collation version number are reported. Example of checking a collation without any reference to `pg_collation`: =# SELECT * FROM icu_collation_attributes('fr-u-ks-level2-kn'); attribute | value -----------+---------- kn | true kb | false kk | false ka | noignore ks | level2 kf | false kc | false version | 153.64 `icu_collation_attributes()` will error out if ICU is unable to open a collator with the given argument. 
### icu_sort_key(`string` text [, `collator` text]) Returns the binary sort key (type: `bytea`) corresponding to the string with the given collation. See https://unicode-org.github.io/icu/userguide/collation/architecture#sort-keys When a `collator` argument is passed, it is interpreted as an ICU locale independently of the persistent collations instantiated in the database. When there is no `collator` argument, the collation associated to `string` gets used to generate the sort key. It must be an ICU collation or the function will error out. This form with a single argument is faster due to Postgres keeping its collations "open" (in the sense of `ucol_open()/ucol_close()`) for the duration of the session, whereas the other form with the explicit `collator` argument does open and close the ICU collation for each call. Binary sort keys may be useful to circumvent a core PostgreSQL limitation that two strings that differ in their byte representation are never considered equal by deterministic collations (see for instance [this thread](https://www.postgresql.org/message-id/7f0120e8945c4befac964777d31912d7%40exmbdft5.ad.twosigma.com) in the pgsql-bugs mailing-list for a discussion of this problem in relation with the ICU integration). With PostgreSQL 12 or newer versions, the "deterministic" property can be set to `false` by [`CREATE COLLATION`](https://www.postgresql.org/docs/current/sql-createcollation.html) to request that string comparisons with these collations skip the tie-breaker. With older versions, "deterministic" is always `true`. You may order or rank by binary sort keys, or materialize them in a unique index to achieve at the SQL level what cannot be done internally by persistent collations, either because PostgreSQL is not recent enough or because you don't want or lack the permission to instantiate nondeterministic collations. 
The function is declared IMMUTABLE to be usable in indexes, but please be aware that it's only true as far as the "version" of the collation doesn't change. (Typically it changes with every version of Unicode). In short, consider rebuilding the affected indexes on ICU upgrades. To simply compare pairs of strings, consider `icu_compare()` instead. Example demonstrating a case-insensitive, accent-insensitive unique index: =# CREATE TABLE uniq(name text); =# CREATE UNIQUE INDEX idx ON uniq((icu_sort_key(name, 'fr-u-ks-level1'))); =# INSERT INTO uniq values('été'); INSERT 0 1 =# INSERT INTO uniq values('Ête'); ERROR: duplicate key value violates unique constraint "idx" DETAIL: Key (icu_sort_key(name, 'fr-u-ks-level1'::text))=(\x314f31) already exists. =# insert into uniq values('Êtes'); INSERT 0 1 ### icu_compare(`string1` text, `string2` text [, `collator` text]) Compare two strings with the given collation. Return the result as a signed integer, similarly to strcoll(), that is, the result is negative if string1 < string2, zero if string1 = string2, and positive if string1 > string2. When a `collator` argument is passed, it is taken as the ICU locale (independently of the collations instantiated in the database) to use to collate the strings. When there is no `collator` argument, the collation associated to `string1` and `string2` gets used for the comparison. It must be an ICU collation and it must be the same for the two arguments or the function will error out. With PostgreSQL 12 or newer, it can be nondeterministic, but whether it is nondeterministic or deterministic will not make any difference in the result of `icu_compare`, contrary to comparisons done by PostgreSQL core with the equality operator. 
The two-argument form is significantly faster due to Postgres keeping its collations "open" (in the sense of `ucol_open()/ucol_close()`) for the duration of the session, whereas the other form with the explicit `collator` argument does open and close the ICU collation for each call. Example: case-sensitive, accent-insensitive comparison: =# SELECT icu_compare('abcé', 'abce', 'en-u-ks-level1-kc-true'); icu_compare ------------- 0 =# SELECT icu_compare('Abcé', 'abce', 'en-u-ks-level1-kc-true'); icu_compare ------------- 1 With two arguments and a collation determined by the COLLATE clause: =# SELECT icu_compare('Abcé', 'abce' COLLATE "fr-x-icu"); icu_compare ------------- 1 With an implicit Postgres collation: =# CREATE COLLATION mycoll (locale='fr-u-ks-level1', provider='icu'); CREATE COLLATION =# CREATE TABLE books (id int, title text COLLATE "mycoll"); CREATE TABLE =# insert into books values(1, $$C'est l'été$$); INSERT 0 1 =# select id,title from books where icu_compare (title, $$c'est l'ete$$) = 0; id | title ----+------------- 1 | C'est l'été ### icu_set_default_locale(`locale` text) Sets the default ICU locale for the session, and returns a canonicalized version of the locale name. The POSIX syntax (`lang[_country[@attr]]`) is accepted. Call this function to change the output language of `icu_locales_list()`. This setting should not have any effect on PostgreSQL core functions, at least as of PG version 10. Warning: passing bogus contents to this function may freeze the backend with older versions of ICU (seen with 52.1). ### icu_default_locale() Returns the name of the default ICU locale as a text. The initial value is automatically set by ICU from the environment. For date and time localization, use the `icu_ext.locale` instead (see [README-datetime.md](README-datetime.md)). ### icu_character_boundaries(`string` text, `locale` text) Break down the string into its characters and return them as a set of text. 
This is comparable to calling `regexp_split_to_table` with an empty regexp, with some differences, for instance: - CRLF sequences do not get split into two characters. - Sequences with a base and a combining character are kept together. - Legacy and extended grapheme clusters are extracted as one result per grapheme. Example (the "e" followed by the combining acute accent U+0301 may be rendered as an accented e or differently depending on your browser): =# SELECT * FROM icu_character_boundaries('Ete'||E'\u0301', 'fr') as chars; chars ------- E t é See [Boundary Analysis](https://unicode-org.github.io/icu/userguide/boundaryanalysis/) in the ICU User Guide and [UAX #29 (Unicode Text Segmentation)](https://unicode.org/reports/tr29/) for more information. ### icu_word_boundaries (`string` text, `locale` text) Break down the string into words and non-words constituents, and return them in a set of (tag, contents) tuples. `tag` has values from the [UWordBreak enum][ubrk_source] defined in ubrk.h indicating the nature of the piece of contents. The current values are: UBRK_WORD_NONE = 0, UBRK_WORD_NUMBER = 100, UBRK_WORD_LETTER = 200, UBRK_WORD_KANA = 300, UBRK_WORD_IDEO = 400, /* up to 500 */ (strictly speaking, any number between the lower and the upper bounds may be counted, as these numbers are meant to be intervals inside which new subdivisions may be added in future versions of ICU). Example: =# SELECT * FROM icu_word_boundaries($$I like O'Reilly books, like the japanese 初めてのPerl 第7版.$$ , 'en'); tag | contents -----+---------- 200 | I 0 | 200 | like 0 | 200 | O'Reilly 0 | 200 | books 0 | , 0 | 200 | like 0 | 200 | the 0 | 200 | japanese 0 | 400 | 初めて 400 | の 200 | Perl 0 | 400 | 第 100 | 7 400 | 版 0 | . 
or to count words in english: =# SELECT count(*) FROM icu_word_boundaries($$piece of text$$, 'en_US') WHERE tag=200; ### icu_line_boundaries (`string` text, `locale` text) Split the string into pieces where a line break may occur, according to the Unicode line breaking algorithm defined in [UAX #14](https://unicode.org/reports/tr14/), and return them in a set of (tag, contents) tuples. `tag` has values from the [ULineBreakTag enum][ubrk_source] defined in ubrk.h indicating the nature of the break. The current values are: UBRK_LINE_SOFT = 0, UBRK_LINE_HARD = 100, /* up to 200 */ (strictly speaking, any number between the lower and the upper bounds may be counted, as these numbers are meant to be intervals inside which new subdivisions may be added in future versions of ICU). Example: =# SELECT *,convert_to( contents, 'utf-8') from icu_line_boundaries( $$Thus much let me avow--You are not wrong, who deem That my days have been a dream; Yet if hope has flown away In a night, or in a day,$$ , 'en'); tag | contents | convert_to -----+----------+------------------ 100 | +| \x0a | | 0 | Thus | \x5468757320 0 | much | \x6d75636820 0 | let | \x6c657420 0 | me | \x6d6520 100 | avow-- +| \x61766f772d2d0a | | 0 | You | \x596f7520 0 | are | \x61726520 0 | not | \x6e6f7420 0 | wrong, | \x77726f6e672c20 0 | who | \x77686f20 100 | deem +| \x6465656d0a | | 0 | That | \x5468617420 0 | my | \x6d7920 0 | days | \x6461797320 0 | have | \x6861766520 0 | been | \x6265656e20 0 | a | \x6120 100 | dream; +| \x647265616d3b0a | | 0 | Yet | \x59657420 0 | if | \x696620 0 | hope | \x686f706520 0 | has | \x68617320 0 | flown | \x666c6f776e20 100 | away +| \x617761790a | | 0 | In | \x496e20 0 | a | \x6120 0 | night, | \x6e696768742c20 0 | or | \x6f7220 0 | in | \x696e20 0 | a | \x6120 0 | day, | \x6461792c ### icu_sentence_boundaries (`string` text, `locale` text) Split the string into sentences, according to the Unicode text segmentation rules defined in [UAX 
#29](https://unicode.org/reports/tr29/), and return them in a set of (tag, contents) tuples. `tag` has values from the [USentenceBreakTag enum][ubrk_source] defined in ubrk.h indicating the nature of the break. The current values are: UBRK_SENTENCE_TERM = 0, UBRK_SENTENCE_SEP = 100, /* up to 200 */ (strictly speaking, any number between the lower and the upper bounds may be counted, as these numbers are meant to be intervals inside which new subdivisions may be added in future versions of ICU). Example: =# SELECT * FROM icu_sentence_boundaries('Mr. Barry Sheene was born in 1950. He was a motorcycle racer.', 'en-u-ss-standard'); tag | contents -----+------------------------------------- 0 | Mr. Barry Sheene was born in 1950. 0 | He was a motorcycle racer. Note: "Mr." followed by a space is recognized by virtue of the locale as an abbreviation of the english "Mister", rather than the end of a sentence. ### icu_number_spellout (`number` double precision, `locale` text) Return the spelled out text corresponding to the number expressed in the given locale. Example: =# SELECT loc, icu_number_spellout(1234, loc) FROM (values ('en'),('fr'),('de'),('ru'),('ja')) AS s(loc); loc | icu_number_spellout -----+------------------------------------------- en | one thousand two hundred thirty-four fr | mille deux cent trente-quatre de | ein­tausend­zwei­hundert­vier­und­dreißig ru | одна тысяча двести тридцать четыре ja | 千二百三十四 (Note: the german output uses U+00AD (SOFT HYPHEN) to separate words. Github's markdown to HTML conversion seems to remove them, so in the above text the spellout might appear like a single long word.) ### icu_char_name(`c` character) Return the Unicode character name corresponding to the first codepoint of the input. 
Example: =# SELECT c, to_hex(ascii(c)), icu_char_name(c) FROM regexp_split_to_table('El Niño', '') as c; c | to_hex | icu_char_name ---+--------+--------------------------------- E | 45 | LATIN CAPITAL LETTER E l | 6c | LATIN SMALL LETTER L | 20 | SPACE N | 4e | LATIN CAPITAL LETTER N i | 69 | LATIN SMALL LETTER I ñ | f1 | LATIN SMALL LETTER N WITH TILDE o | 6f | LATIN SMALL LETTER O ### icu_spoof_check (`string` text) Return a boolean indicating whether the argument is likely to be an attempt at confusing a reader. The implementation is based on Unicode Technical Reports [#36](https://unicode.org/reports/tr36) and [#39](https://unicode.org/reports/tr39) and uses the ICU default settings for spoof checks. Example: =# SELECT txt, icu_spoof_check(txt) FROM (VALUES ('paypal'), (E'p\u0430ypal')) AS s(txt); txt | icu_spoof_check --------+----------------- paypal | f pаypal | t (Note: The second character in the second row is U+0430 (CYRILLIC SMALL LETTER A) instead of the genuine ASCII U+0061 (LATIN SMALL LETTER A)) ### icu_confusable_strings_check(`string1` text, `string2` text) Return a boolean indicating whether the string arguments are visually confusable with each other, according to data described in [Unicode Technical Report #39](https://unicode.org/reports/tr39/#Confusable_Detection). The settings and comparison levels are ICU defaults. For strictly identical strings, it returns true. Example: =# SELECT txt, icu_confusable_strings_check('phil', txt) AS confusable FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l')) AS s(txt); txt | confusable ------+------------ phiL | f phiI | t phi1 | t phıl | t ### icu_confusable_string_skeleton(`string` text) Return the skeleton transformation of the input string as specified in the [Unicode Technical Report #39](https://unicode.org/reports/tr39/#def-skeleton). Two strings are visually confusable if they produce the same skeleton. 
Example: =# SELECT txt, icu_confusable_string_skeleton(txt) AS skeleton FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l'), (E'\u2026\u2026')) AS s(txt); txt | skeleton ------+---------- phiL | phiL phiI | phil phi1 | phil phıl | phil …… | ...... ### icu_transform (`string` text, `transformations` text) Return a string with some transformations applied. This function essentially calls ICU's [utrans_transUChars()](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/utrans_8h.html#ad71dddd14877497f386b727d152ee89a). The first argument is the string to transform, and the second is the transformation to apply, expressed as a sequence of transforms and filters (see the [ICU user guide on transforms](https://unicode-org.github.io/icu/userguide/transforms/general/) and the output of `icu_transforms_list()` mentioned below). Examples: Transliterate: =# select icu_transform('Владимир Путин', 'Cyrl-Latn'); -- just 'Latin' would work here too icu_transform ---------------- Vladimir Putin Transform Unicode names into the corresponding characters: =# select icu_transform('10\N{SUPERSCRIPT MINUS}\N{SUPERSCRIPT FOUR}' '\N{MICRO SIGN}m = 1 \N{ANGSTROM SIGN}', 'Name-Any'); icu_transform --------------- 10⁻⁴µm = 1 Å Remove diacritics (generalized "unaccent") through Unicode decomposition. =# select icu_transform('1 Å', 'any-NFD; [:nonspacing mark:] any-remove; any-NFC'); icu_transform --------------- 1 A Generate hexadecimal codepoints for non-ASCII characters: =# select icu_transform('Ich muß essen.', '[:^ascii:]; Hex'); icu_transform --------------------- Ich mu\u00DF essen. ### icu_transforms_list () Return the list of built-in transliterations or transforms, as a set of text, corresponding to "Basic IDs" in [ICU documentation](https://unicode-org.github.io/icu/userguide/transforms/general). 
The initial set of transforms are transliterations between scripts (like `Katakana-Latin` or `Latin-Cyrillic`), but they're supplemented with functionalities related to accents, casing, Unicode composition and decomposition with combining characters and other conversions. Values from this list are meant to be used individually as the 2nd argument of `icu_transform()`, or assembled with semi-colon separators to form compound transforms, possibly with filters added to limit the set of characters to transform. ### icu_strpos(`string` text, `substring` text [, `collator` text]) Like `strpos(text,text)` in Postgres core, except that it uses the linguistic rules of `collator` to search `substring` in `string`, and that it supports nondeterministic collations seamlessly. When the substring is not found, it returns 0. Otherwise, it returns the 1-based position of the first match of `substring` inside `string`, or 1 if `substring` is empty. When `collator` is not passed, the collation of the arguments is used. As with the other functions in this extension, the two-argument form is faster since it can keep the ICU collation open across function calls. Example: -- Search in names independently of punctuation, case and accents =# select name from addresses where icu_strpos(name, 'jeanrene', 'fr-u-ks-level1-ka-shifted') > 0 name ------------------ jean-rené dupont Jean-René Dupont jeanrenédupont ### icu_replace(`string` text, `from` text, `to` text [, `collator` text]) Like `replace(string text, from text, to text)` in Postgres core, except it uses the linguistic rules of `collator` to search `from` in `string` instead of a byte-wise comparison. It also supports nondeterministic collations to search `from` as a substring. It returns `string` with all substrings that match `from` replaced by `to`. When `collator` is not passed, the collation of the arguments is used, which is faster because the ICU collation can be kept open across function calls. 
Example: -- Collation comparing independently of punctuation, case and accents =# CREATE COLLATION ciaipi (provider = icu, locale = 'und-u-ks-level1-ka-shifted'); -- Replace names matching 'jeanrene' by a placeholder =# select s.n, icu_replace(n, 'jeanrene', '{firstname}' collate "ciaipi") from (values('jeanrenédupont'),('Jean-René Dupont')) as s(n) ; n | icu_replace -------------------+--------------------- jeanrenédupont | {firstname}dupont Jean-René Dupont | {firstname} Dupont ### icu_normalize(`string` text, `form` text) Return `string` transformed into the Unicode normalized `form`, which must be `nfc`, `nfkc`, `nfd`, or `nfkd` (upper case or mixed case variants are accepted). Returns NULL if any input argument is NULL. The database must use a Unicode encoding, which means UTF-8 in practice. See the Unicode Annex [UAX #15](https://unicode.org/reports/tr15/#Introduction) for an introduction on Unicode normal forms. Example: =# select icu_normalize('éte'||E'\u0301', 'nfc') = E'ét\u00E9'; ?column? ---------- t ### icu_is_normalized(`string` text, `form` text) Return true if `string` is in the Unicode normalized `form`, which must be `nfc`, `nfkc`, `nfd`, or `nfkd` (upper case or mixed case variants are accepted). Returns false otherwise, or NULL if any input argument is NULL. The database must use a Unicode encoding, which means UTF-8 in practice. Example: =# SELECT icu_is_normalized('ét'||E'\u0301', 'nfc'); icu_is_normalized ------------------- f =# SELECT icu_is_normalized('ét'||E'\u0301', 'nfd'); icu_is_normalized ------------------- t ## License This project is licensed under the PostgreSQL License -- see [LICENSE.md](LICENSE.md). 
[ubrk_source]: https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ubrk_8h_source.html icu_ext-1.8.0/expected/000077500000000000000000000000001450461324700147675ustar00rootroot00000000000000icu_ext-1.8.0/expected/tests-01.out000066400000000000000000000130241450461324700171000ustar00rootroot00000000000000-- regression tests for icu_ext CREATE EXTENSION icu_ext; -- Check that the database has the built-in ICU collations -- required by the tests SELECT collname FROM pg_collation WHERE collname IN ('und-x-icu', 'en-x-icu') ORDER BY collname; collname ----------- en-x-icu und-x-icu (2 rows) -- icu_char_name SELECT c, to_hex(ascii(c)), icu_char_name(c) FROM regexp_split_to_table('El Niño', '') as c; c | to_hex | icu_char_name ---+--------+--------------------------------- E | 45 | LATIN CAPITAL LETTER E l | 6c | LATIN SMALL LETTER L | 20 | SPACE N | 4e | LATIN CAPITAL LETTER N i | 69 | LATIN SMALL LETTER I ñ | f1 | LATIN SMALL LETTER N WITH TILDE o | 6f | LATIN SMALL LETTER O (7 rows) -- icu_character_boundaries SELECT * FROM icu_character_boundaries('Ete'||E'\u0301', 'fr') as chars; chars ------- E t é (3 rows) -- icu_collation_attributes SELECT * FROM icu_collation_attributes('en') WHERE attribute <> 'version'; attribute | value -------------+---------- displayname | English kn | false kb | false kk | false ka | noignore ks | level3 kf | false kc | false kv | punct (9 rows) -- icu_compare SELECT icu_compare('abcé', 'abce', 'en@colStrength=primary;colCaseLevel=yes'); icu_compare ------------- 0 (1 row) SELECT icu_compare('Abcé', 'abce' COLLATE "en-x-icu"); icu_compare ------------- 1 (1 row) -- icu_confusable_strings_check SELECT txt, icu_confusable_strings_check('phil', txt) AS confusable FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l')) AS s(txt); txt | confusable ------+------------ phiL | f phiI | t phi1 | t phıl | t (4 rows) -- icu_confusable_string_skeleton SELECT txt, icu_confusable_string_skeleton(txt) AS skeleton FROM (VALUES ('phiL'), 
('phiI'), ('phi1'), (E'ph\u0131l'), (E'\u2026\u2026')) AS s(txt); txt | skeleton ------+---------- phiL | phiL phiI | phil phi1 | phil phıl | phil …… | ...... (5 rows) -- icu_line_boundaries SELECT *,convert_to( contents, 'utf-8') FROM icu_line_boundaries( $$Thus much let me avow You are not wrong, who deem That my days have been a dream; Yet if hope has flown away In a night, or in a day,$$ , 'en'); tag | contents | convert_to -----+----------+------------------ 0 | Thus | \x5468757320 0 | much | \x6d75636820 0 | let | \x6c657420 0 | me | \x6d6520 100 | avow +| \x61766f770a | | 0 | You | \x596f7520 0 | are | \x61726520 0 | not | \x6e6f7420 0 | wrong, | \x77726f6e672c20 0 | who | \x77686f20 100 | deem +| \x6465656d0a | | 0 | That | \x5468617420 0 | my | \x6d7920 0 | days | \x6461797320 0 | have | \x6861766520 0 | been | \x6265656e20 0 | a | \x6120 100 | dream; +| \x647265616d3b0a | | 0 | Yet | \x59657420 0 | if | \x696620 0 | hope | \x686f706520 0 | has | \x68617320 0 | flown | \x666c6f776e20 100 | away +| \x617761790a | | 0 | In | \x496e20 0 | a | \x6120 0 | night, | \x6e696768742c20 0 | or | \x6f7220 0 | in | \x696e20 0 | a | \x6120 0 | day, | \x6461792c (31 rows) -- icu_number_spellout /* use the unaligned format for this test. 
With the aligned format, there are environment-related differences in how psql computes the width of strings containing U+00AD (soft hyphen) */ \pset format unaligned SELECT loc, icu_number_spellout(1234, loc) FROM (values ('en'),('fr'),('de'),('ru'),('ja')) AS s(loc); loc|icu_number_spellout en|one thousand two hundred thirty-four fr|mille deux cent trente-quatre de|ein­tausend­zwei­hundert­vier­und­dreißig ru|одна тысяча двести тридцать четыре ja|千二百三十四 (5 rows) \pset format aligned -- icu_replace SELECT n, icu_replace( n, 'jeanrene', '{firstname}', 'und@colStrength=primary;colAlternate=shifted') FROM (values('jeanrenédupont'),('Jean-René Dupont')) as s(n) ORDER BY n COLLATE "C"; n | icu_replace -------------------+--------------------- Jean-René Dupont | {firstname} Dupont jeanrenédupont | {firstname}dupont (2 rows) -- icu_sentence_boundaries SELECT * FROM icu_sentence_boundaries('Call me Mr. Brown. It''s a movie.', 'en@ss=standard'); tag | contents -----+--------------------- 0 | Call me Mr. Brown. 0 | It's a movie. (2 rows) -- icu_strpos SELECT v,icu_strpos('hey rene', v, 'und@colStrength=primary;colAlternate=shifted') FROM (VALUES ('René'), ('rené'), ('Rene'), ('n'), ('në'), ('no'), (''), (null)) AS s(v) ORDER BY v COLLATE "C"; v | icu_strpos ------+------------ | 1 Rene | 5 René | 5 n | 7 no | 0 në | 7 rené | 5 | (8 rows) -- icu_transform SELECT icu_transform('10\N{SUPERSCRIPT MINUS}\N{SUPERSCRIPT FOUR}' '\N{MICRO SIGN}m = 1 \N{ANGSTROM SIGN}', 'Name-Any'); icu_transform --------------- 10⁻⁴µm = 1 Å (1 row) SELECT icu_transform('Ich muß essen.', '[:^ascii:]; Hex'); icu_transform --------------------- Ich mu\u00DF essen. (1 row) -- icu_word_boundaries SELECT * FROM icu_word_boundaries($$Do you like O'Reilly books?$$, 'en'); tag | contents -----+---------- 200 | Do 0 | 200 | you 0 | 200 | like 0 | 200 | O'Reilly 0 | 200 | books 0 | ? 
(10 rows) icu_ext-1.8.0/expected/tests-datetime.out000066400000000000000000000037031450461324700204570ustar00rootroot00000000000000/* test date and time support */ \set format unaligned set icu_ext.locale to 'en@calendar=gregorian'; set icu_ext.timestamptz_format to 'YYYY-MM-dd HH:mm:ss'; set timezone to 'Europe/Paris'; -- DST transition to summer time select '2023-03-25 00:00:00'::timestamptz + '26.5 hours'::interval AS "core", '2023-03-25 00:00:00'::icu_timestamptz + '26.5 hours'::icu_interval AS "ext"; core | ext -------------------------------+--------------------- Sun Mar 26 03:30:00 2023 CEST | 2023-03-26 03:30:00 (1 row) set icu_ext.locale to 'en@calendar=ethiopic'; set icu_ext.date_format to '{short}'; set icu_ext.timestamptz_format to '{short}'; -- 13-month year with 5 days in the last month select '1/13/2016 ERA1'::icu_date + icu_interval '12 months' as d1, '1/13/2016 ERA1'::icu_date + icu_interval '13 months' as d2, '1/13/2016 ERA1'::icu_date + icu_interval '1 year' as d3; d1 | d2 | d3 --------------------------+--------------------------+-------------------------- 13/5/2016 ERA1, 12:00 AM | 1/13/2017 ERA1, 12:00 AM | 1/13/2017 ERA1, 12:00 AM (1 row) select '13/5/2016 ERA1'::icu_date + 1; ?column? 
--------------- 1/1/2017 ERA1 (1 row) set icu_ext.locale to 'en@calendar=gregorian'; select icu_parse_date('17/10/2023', 'dd/MM/yyyy'); icu_parse_date ---------------- 10-17-2023 (1 row) select icu_parse_datetime('17/10/2023', 'dd/MM/yyyy'); icu_parse_datetime ------------------------------- Tue Oct 17 00:00:00 2023 CEST (1 row) select icu_parse_datetime('17/10/2023 12:02:40.653', 'dd/MM/yyyy HH:mm:ss.S'); icu_parse_datetime ----------------------------------- Tue Oct 17 12:02:40.653 2023 CEST (1 row) set timezone to 'GMT'; select icu_parse_datetime('17/10/2023 12:02:40.653', 'dd/MM/yyyy HH:mm:ss.S'); icu_parse_datetime ---------------------------------- Tue Oct 17 12:02:40.653 2023 GMT (1 row) icu_ext-1.8.0/icu_break.c000066400000000000000000000143501450461324700152610ustar00rootroot00000000000000/* * icu_break.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ #include "postgres.h" #include "access/htup_details.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "mb/pg_wchar.h" #include "unicode/ubrk.h" #include "unicode/ucnv.h" #include "unicode/ucol.h" #include "unicode/uloc.h" #include "unicode/ustring.h" #include "unicode/utext.h" /* * PG set-returning functions exposing ICU's BreakIterator API for * characters, words, line-wrapping, sentences */ PG_FUNCTION_INFO_V1(icu_character_boundaries); PG_FUNCTION_INFO_V1(icu_word_boundaries); PG_FUNCTION_INFO_V1(icu_sentence_boundaries); PG_FUNCTION_INFO_V1(icu_line_boundaries); struct ubreak_ctxt { UBreakIterator *iter; UText* ut; char* source_text; UChar* cnv_text; /* unused and NULL if the database encoding is UTF-8 */ int32_t len; TupleDesc tupdesc; }; /* * Initialize the context to iterate on the input. 
* arg1=input string, arg2=locale * The main difference between break iterators is: * - UBRK_CHARACTER: return SETOF text * - others: return SETOF (int,text) */ static void init_srf_first_call(UBreakIteratorType break_type, PG_FUNCTION_ARGS) { MemoryContext oldcontext; const char *brk_locale; UErrorCode status = U_ZERO_ERROR; FuncCallContext *funcctx; struct ubreak_ctxt *ctxt; funcctx = SRF_FIRSTCALL_INIT(); /* * Switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); ctxt = palloc(sizeof(struct ubreak_ctxt)); if (break_type != UBRK_CHARACTER) { TupleDesc tupdesc; /* Construct tuple descriptor */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("function returning record called in context that cannot accept type record"))); ctxt->tupdesc = BlessTupleDesc(tupdesc); } else ctxt->tupdesc = NULL; /* Use the UTF-8 ICU functions if our string is in UTF-8 */ if (GetDatabaseEncoding() == PG_UTF8) { text *txt = PG_GETARG_TEXT_PP(0); ctxt->len = VARSIZE_ANY_EXHDR(txt); ctxt->source_text = (char*)palloc(ctxt->len); ctxt->cnv_text = NULL; memcpy(ctxt->source_text, VARDATA_ANY(txt), ctxt->len); ctxt->ut = utext_openUTF8(NULL, ctxt->source_text, ctxt->len, &status); if (U_FAILURE(status)) elog(ERROR, "utext_openUTF8() failed: %s", u_errorName(status)); } else { text *input = PG_GETARG_TEXT_PP(0); /* database encoding to UChar buffer */ ctxt->len = icu_to_uchar(&ctxt->cnv_text, text_to_cstring(input), VARSIZE_ANY_EXHDR(input)); ctxt->ut = utext_openUChars(NULL, ctxt->cnv_text, ctxt->len, &status); if (U_FAILURE(status)) elog(ERROR, "utext_openUChars() failed: %s", u_errorName(status)); } funcctx->user_fctx = (void *) ctxt; brk_locale = text_to_cstring(PG_GETARG_TEXT_PP(1)); MemoryContextSwitchTo(oldcontext); ctxt->iter = ubrk_open(break_type, brk_locale, NULL, 0, &status); if (U_FAILURE(status)) { 
utext_close(ctxt->ut); elog(ERROR, "ubrk_open failed: %s", u_errorName(status)); } ubrk_setUText(ctxt->iter, ctxt->ut, &status); if (U_FAILURE(status)) { ubrk_close(ctxt->iter); utext_close(ctxt->ut); elog(ERROR, "ubrk_setText() failed: %s", u_errorName(status)); } } /* * Return substrings (SETOF text). In general, they're are * one-character only but CRLF are returned in one piece, * and combining+base characters are also pieced together. * In this respect it differs from regexp_split_to_table(text, '') */ Datum icu_character_boundaries(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; int32_t pos0, pos; struct ubreak_ctxt *ctxt; if (SRF_IS_FIRSTCALL()) { init_srf_first_call(UBRK_CHARACTER, fcinfo); } funcctx = SRF_PERCALL_SETUP(); ctxt = (struct ubreak_ctxt*) funcctx->user_fctx; if (ctxt->len == 0) SRF_RETURN_DONE(funcctx); /* no result */ pos0 = ubrk_current(ctxt->iter); pos = ubrk_next(ctxt->iter); if (pos != UBRK_DONE) { text *item; if (ctxt->source_text != NULL) item = cstring_to_text_with_len(ctxt->source_text+pos0, pos-pos0); else { char *buf; /* convert UChar to a buffer in the database encoding */ int32_t len = icu_from_uchar(&buf, ctxt->cnv_text+pos0, pos-pos0); item = cstring_to_text_with_len(buf, len); } SRF_RETURN_NEXT(funcctx, PointerGetDatum(item)); } else /* end of SRF iteration */ { ubrk_close(ctxt->iter); utext_close(ctxt->ut); SRF_RETURN_DONE(funcctx); } } /* * Return (tag,content) tuples */ static Datum icu_boundaries_internal(UBreakIteratorType break_type, PG_FUNCTION_ARGS) { FuncCallContext *funcctx; int32_t pos0, pos1; struct ubreak_ctxt *ctxt; if (SRF_IS_FIRSTCALL()) { init_srf_first_call(break_type, fcinfo); } funcctx = SRF_PERCALL_SETUP(); ctxt = (struct ubreak_ctxt*) funcctx->user_fctx; if (ctxt->len == 0) SRF_RETURN_DONE(funcctx); /* no result */ pos0 = ubrk_current(ctxt->iter); do { pos1 = ubrk_next(ctxt->iter); if (pos1 != UBRK_DONE) { Datum values[2]; bool nulls[2]; HeapTuple tuple; text *item; if (ctxt->source_text != NULL) { item = 
cstring_to_text_with_len(ctxt->source_text + pos0, pos1-pos0); } else { char *buf; /* convert back UChar to a buffer in the database encoding */ int32_t len = icu_from_uchar(&buf, ctxt->cnv_text + pos0, pos1-pos0); item = cstring_to_text_with_len(buf, len); } values[0] = Int32GetDatum(ubrk_getRuleStatus(ctxt->iter)); nulls[0] = false; values[1] = PointerGetDatum(item); nulls[1] = false; tuple = heap_form_tuple(ctxt->tupdesc, values, nulls); SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); } } while (pos1 != UBRK_DONE); /* end of SRF iteration */ ubrk_close(ctxt->iter); utext_close(ctxt->ut); SRF_RETURN_DONE(funcctx); } Datum icu_word_boundaries(PG_FUNCTION_ARGS) { return icu_boundaries_internal(UBRK_WORD, fcinfo); } Datum icu_line_boundaries(PG_FUNCTION_ARGS) { return icu_boundaries_internal(UBRK_LINE, fcinfo); } Datum icu_sentence_boundaries(PG_FUNCTION_ARGS) { return icu_boundaries_internal(UBRK_SENTENCE, fcinfo); } icu_ext-1.8.0/icu_date.c000066400000000000000000000365341450461324700151220ustar00rootroot00000000000000/* * icu_calendar.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. 
See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "pgtime.h" #include "utils/builtins.h" #include "utils/timestamp.h" #include "utils/pg_locale.h" #include "utils/date.h" #include "utils/datetime.h" /* ICU includes */ #include "unicode/ucal.h" #include "unicode/udat.h" #include "unicode/ustring.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_format_date_locale); PG_FUNCTION_INFO_V1(icu_format_date_default_locale); PG_FUNCTION_INFO_V1(icu_format_datetime_locale); PG_FUNCTION_INFO_V1(icu_format_datetime_default_locale); PG_FUNCTION_INFO_V1(icu_parse_date_locale); PG_FUNCTION_INFO_V1(icu_parse_date_default_locale); PG_FUNCTION_INFO_V1(icu_parse_datetime_locale); PG_FUNCTION_INFO_V1(icu_parse_datetime_default_locale); PG_FUNCTION_INFO_V1(icu_date_in); PG_FUNCTION_INFO_V1(icu_date_out); PG_FUNCTION_INFO_V1(icu_date_add_days); PG_FUNCTION_INFO_V1(icu_date_days_add); PG_FUNCTION_INFO_V1(icu_date_plus_interval); PG_FUNCTION_INFO_V1(icu_date_minus_interval); /* Convert a postgres date (number of days since 1/1/2000) to a UDate */ static UDate dateadt_to_udate(DateADT pg_date) { /* simple version */ return (UDate)( (double)(pg_date+(POSTGRES_EPOCH_JDATE-UNIX_EPOCH_JDATE)) /* days since Unix epoch */ *86400.0*1000 /* multiplied by the number of milliseconds in a day */ ); } /* * Return a text representation of a PG timestamp given the locale and ICU format. * locale==NULL means the default locale. 
*/ static Datum format_timestamp(TimestampTz pg_tstz, text *date_fmt, const char *locale) { const char* icu_date_format = text_to_cstring(date_fmt); UErrorCode status = U_ZERO_ERROR; char *result; int32_t result_len; int32_t pattern_length; UChar* pattern_buf; UDateFormat* df = NULL; UDate dat; UChar* tzid; int32_t tzid_length; const char *pg_tz_name = pg_get_timezone_name(session_timezone); UDateFormatStyle style; if (TIMESTAMP_NOT_FINITE(pg_tstz)) { char buf[MAXDATELEN + 1]; EncodeSpecialTimestamp(pg_tstz, buf); /* produces [-]infinity */ result = pstrdup(buf); PG_RETURN_TEXT_P(cstring_to_text(result)); } dat = TS_TO_UDATE(pg_tstz); style = date_format_style(icu_date_format); if (style == UDAT_NONE) { pattern_length = icu_to_uchar(&pattern_buf, icu_date_format, strlen(icu_date_format)); style = UDAT_PATTERN; } else { pattern_length = -1; pattern_buf = NULL; } tzid_length = icu_to_uchar(&tzid, pg_tz_name, /* or UCAL_UNKNOWN_ZONE_ID, like GMT */ strlen(pg_tz_name)); if (!locale) locale = icu_ext_default_locale; /* if UDAT_PATTERN is passed, it must for both timeStyle and dateStyle */ df = udat_open(style, /* timeStyle */ style, /* dateStyle */ locale, /* NULL for the default locale */ tzid, /* tzID (NULL=default). 
*/ tzid_length, /* tzIDLength */ pattern_buf, pattern_length, &status); if (U_FAILURE(status)) elog(ERROR, "udat_open failed with code %d\n", status); { /* Try first to convert into a buffer on the stack, and palloc() it only if udat_format says it's too small */ UChar local_buf[MAXDATELEN]; int32_t u_buffer_size = udat_format(df, dat, local_buf, sizeof(local_buf)/sizeof(UChar), NULL, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { UChar* u_buffer; status = U_ZERO_ERROR; u_buffer = (UChar*) palloc(u_buffer_size*sizeof(UChar)); udat_format(df, dat, u_buffer, u_buffer_size, NULL, &status); result_len = icu_from_uchar(&result, u_buffer, u_buffer_size); } else { result_len = icu_from_uchar(&result, local_buf, u_buffer_size); } } if (df) udat_close(df); PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len)); } /* * Return a text representation of a PG timestamp given the locale and ICU format. * locale==NULL means the default locale. */ static Datum format_date(DateADT pg_date, text *date_fmt, const char *locale) { const char* date_format = text_to_cstring(date_fmt); UErrorCode status = U_ZERO_ERROR; char *result; int32_t result_len; int32_t pattern_length; UChar* pattern_buf; UDateFormat* df = NULL; UDate dat; UChar* tzid; int32_t tzid_length; UDateFormatStyle style; if (DATE_NOT_FINITE(pg_date)) { char buf[MAXDATELEN + 1]; EncodeSpecialDate(pg_date, buf); /* produces [-]infinity */ result = pstrdup(buf); PG_RETURN_TEXT_P(cstring_to_text(result)); } dat = dateadt_to_udate(pg_date); style = date_format_style(date_format); if (style == UDAT_NONE) { pattern_length = icu_to_uchar(&pattern_buf, date_format, strlen(date_format)); style = UDAT_PATTERN; } else { pattern_length = -1; pattern_buf = NULL; } tzid_length = icu_to_uchar(&tzid, "GMT", 3); if (!locale) locale = icu_ext_default_locale; /* if UDAT_PATTERN is passed, it must for both timeStyle and dateStyle */ df = udat_open(style==UDAT_PATTERN ? 
style : UDAT_NONE, /* timeStyle */ style, /* dateStyle */ locale, /* NULL for the default locale */ tzid, /* tzID (NULL=default). */ tzid_length, /* tzIDLength */ pattern_buf, pattern_length, &status); if (U_FAILURE(status)) elog(ERROR, "udat_open failed with code %d\n", status); { /* Try first to convert into a buffer on the stack, and palloc() it only if udat_format says it's too small */ UChar local_buf[MAXDATELEN]; int32_t u_buffer_size = udat_format(df, dat, local_buf, sizeof(local_buf)/sizeof(UChar), NULL, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { UChar* u_buffer; status = U_ZERO_ERROR; u_buffer = (UChar*) palloc(u_buffer_size*sizeof(UChar)); udat_format(df, dat, u_buffer, u_buffer_size, NULL, &status); result_len = icu_from_uchar(&result, u_buffer, u_buffer_size); } else { result_len = icu_from_uchar(&result, local_buf, u_buffer_size); } } if (df) udat_close(df); PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len)); } Datum icu_format_date_locale(PG_FUNCTION_ARGS) { return format_date(PG_GETARG_DATEADT(0), PG_GETARG_TEXT_PP(1), text_to_cstring(PG_GETARG_TEXT_PP(2))); } Datum icu_format_date_default_locale(PG_FUNCTION_ARGS) { return format_date(PG_GETARG_DATEADT(0), PG_GETARG_TEXT_PP(1), NULL); } Datum icu_format_datetime_locale(PG_FUNCTION_ARGS) { return format_timestamp(PG_GETARG_TIMESTAMPTZ(0), PG_GETARG_TEXT_PP(1), text_to_cstring(PG_GETARG_TEXT_PP(2))); } Datum icu_format_datetime_default_locale(PG_FUNCTION_ARGS) { return format_timestamp(PG_GETARG_TIMESTAMPTZ(0), PG_GETARG_TEXT_PP(1), NULL); } /* * Parse a user-supplied ICU-formatted string into a Postgres * timestamptz (if include_time is true) or date (include_time is * false). * if locale=NULL the default locale is used. 
*/ static Datum parse_timestamp(const text *input_date, const text *input_format, const char *locale, bool include_time) { const char* date_string = text_to_cstring(input_date); const char* date_format = text_to_cstring(input_format); int32_t pattern_length; UChar* pattern_buf; UChar* u_date_string; int32_t u_date_length; UDateFormat* df = NULL; UDate udat; UErrorCode status = U_ZERO_ERROR; UChar* tzid; int32_t tzid_length; UDateFormatStyle style; style = date_format_style(date_format); if (style == UDAT_NONE) { pattern_length = icu_to_uchar(&pattern_buf, date_format, strlen(date_format)); style = UDAT_PATTERN; } else { pattern_length = -1; pattern_buf = NULL; } u_date_length = icu_to_uchar(&u_date_string, date_string, strlen(date_string)); if (!include_time) { tzid_length = icu_to_uchar(&tzid, "GMT", /* for dates, we ignore timezones */ 3); } else { const char *pg_tz_name = pg_get_timezone_name(session_timezone); /* use PG current timezone, hopefully compatible with ICU */ tzid_length = icu_to_uchar(&tzid, pg_tz_name, strlen(pg_tz_name)); } if (!locale) locale = icu_ext_default_locale; /* if UDAT_PATTERN is used, we must pass it for both timeStyle and dateStyle */ df = udat_open(include_time ? 
style : (style==UDAT_PATTERN?style:UDAT_NONE), style, locale, tzid, tzid_length, pattern_buf, pattern_length, &status); if (U_FAILURE(status)) { udat_close(df); elog(ERROR, "udat_open failed: %s\n", u_errorName(status)); } udat_setLenient(df, false); /* strict parsing */ udat = udat_parse(df, u_date_string, u_date_length, NULL, &status); udat_close(df); if (U_FAILURE(status)) elog(ERROR, "udat_parse failed: %s\n", u_errorName(status)); if (!include_time) { DateADT d = (udat/(86400*1000)) - (POSTGRES_EPOCH_JDATE-UNIX_EPOCH_JDATE); PG_RETURN_DATEADT(d); } else PG_RETURN_TIMESTAMPTZ(UDATE_TO_TS(udat)); } Datum icu_parse_date_locale(PG_FUNCTION_ARGS) { return parse_timestamp(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), text_to_cstring(PG_GETARG_TEXT_PP(2)), false); } Datum icu_parse_date_default_locale(PG_FUNCTION_ARGS) { /* Let the default ICU locale for now. Probably use a GUC later */ return parse_timestamp(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), NULL, false); } Datum icu_parse_datetime_locale(PG_FUNCTION_ARGS) { return parse_timestamp(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), text_to_cstring(PG_GETARG_TEXT_PP(2)), true); } Datum icu_parse_datetime_default_locale(PG_FUNCTION_ARGS) { /* Let the default ICU locale for now. Probably use a GUC later */ return parse_timestamp(PG_GETARG_TEXT_PP(0), PG_GETARG_TEXT_PP(1), NULL, true); } /* * Input function for text representation of icu_date. 
*/ Datum icu_date_in(PG_FUNCTION_ARGS) { char *date_string = PG_GETARG_CSTRING(0); int32_t pattern_length = -1; UChar *u_date_string; int32_t u_date_length; UDateFormat* df = NULL; UDate udat; UDateFormatStyle style = icu_ext_date_style; UErrorCode status = U_ZERO_ERROR; UChar *input_pattern = NULL; Timestamp pg_ts; const char *locale = NULL; DateADT result; struct pg_tm tm; fsec_t fsec; int32_t parse_pos = 0; UChar* tzid; int32_t tzid_length; if (icu_ext_date_format != NULL) { if (icu_ext_date_format[0] != '\0' && icu_ext_date_style == UDAT_NONE) { pattern_length = icu_to_uchar(&input_pattern, icu_ext_date_format, strlen(icu_ext_date_format)); } } u_date_length = icu_to_uchar(&u_date_string, date_string, strlen(date_string)); if (icu_ext_default_locale != NULL && icu_ext_default_locale[0] != '\0') { locale = icu_ext_default_locale; } tzid_length = icu_to_uchar(&tzid, "GMT", /* for dates, we ignore timezones */ 3); /* if UDAT_PATTERN is used, we must pass it for both timeStyle and dateStyle */ df = udat_open(input_pattern ? UDAT_PATTERN : UDAT_NONE, /* timeStyle */ input_pattern ? 
UDAT_PATTERN : style, /* dateStyle */ locale, tzid, /* tzID */ tzid_length, /* tzIDLength */ input_pattern, pattern_length, &status); if (U_FAILURE(status)) { udat_close(df); elog(ERROR, "udat_open failed: %s\n", u_errorName(status)); } udat_setLenient(df, false); /* strict parsing */ udat = udat_parse(df, u_date_string, u_date_length, &parse_pos, &status); udat_close(df); if (U_FAILURE(status)) elog(ERROR, "udat_parse failed: %s\n", u_errorName(status)); /* convert UDate to julian days, with an intermediate Timestamp to use date2j */ pg_ts = UDATE_TO_TS(udat); if (timestamp2tm(pg_ts, NULL, &tm, &fsec, NULL, NULL) != 0) ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("date out of range: \"%s\"", date_string))); result = date2j(tm.tm_year, tm.tm_mon, tm.tm_mday) - POSTGRES_EPOCH_JDATE; PG_RETURN_DATEADT(result); } Datum icu_date_out(PG_FUNCTION_ARGS) { DateADT date = PG_GETARG_DATEADT(0); char buf[MAXDATELEN + 1]; UErrorCode status = U_ZERO_ERROR; UDateFormat* df = NULL; UDate udate; const char *locale = NULL; char *result; UChar* tzid; int32_t tzid_length; if (DATE_NOT_FINITE(date)) { EncodeSpecialDate(date, buf); result = pstrdup(buf); } else { UChar *output_pattern = NULL; int32_t pattern_length = -1; UDateFormatStyle style = icu_ext_date_style; udate = dateadt_to_udate(date); if (icu_ext_date_format != NULL) { if (icu_ext_date_format[0] != '\0' && icu_ext_date_style == UDAT_NONE) { pattern_length = icu_to_uchar(&output_pattern, icu_ext_date_format, strlen(icu_ext_date_format)); } } if (icu_ext_default_locale != NULL && icu_ext_default_locale[0] != '\0') { locale = icu_ext_default_locale; } /* dates are not time-zone shifted when output */ tzid_length = icu_to_uchar(&tzid, UCAL_UNKNOWN_ZONE_ID, /*like GMT */ strlen(UCAL_UNKNOWN_ZONE_ID)); /* if UDAT_PATTERN is passed, it must for both timeStyle and dateStyle */ df = udat_open(output_pattern ? UDAT_PATTERN : UDAT_NONE, /* timeStyle */ output_pattern ? 
UDAT_PATTERN : style, /* dateStyle */ locale, /* NULL for the default locale */ tzid, /* tzID (NULL=default). */ tzid_length, /* tzIDLength */ output_pattern, /* pattern */ pattern_length, /* patternLength */ &status); if (U_FAILURE(status)) elog(ERROR, "udat_open failed with code %d\n", status); { /* Try first to convert into a buffer on the stack, and palloc() it only if udat_format says it's too small */ UChar local_buf[MAXDATELEN]; int32_t u_buffer_size = udat_format(df, udate, local_buf, sizeof(local_buf)/sizeof(UChar), NULL, &status); if(status == U_BUFFER_OVERFLOW_ERROR) { UChar* u_buffer; status = U_ZERO_ERROR; u_buffer = (UChar*) palloc(u_buffer_size*sizeof(UChar)); udat_format(df, udate, u_buffer, u_buffer_size, NULL, &status); icu_from_uchar(&result, u_buffer, u_buffer_size); } else { icu_from_uchar(&result, local_buf, u_buffer_size); } } if (df) udat_close(df); } PG_RETURN_CSTRING(result); } Datum icu_date_add_days(PG_FUNCTION_ARGS) { DateADT date = PG_GETARG_DATEADT(0); int32 days = PG_GETARG_INT32(1); /* same operation as the built-in date type */ return DirectFunctionCall2(date_pli, date, days); } Datum icu_date_days_add(PG_FUNCTION_ARGS) { int32 days = PG_GETARG_INT32(0); DateADT date = PG_GETARG_DATEADT(1); /* same operation as the built-in date type */ return DirectFunctionCall2(date_pli, date, days); } /* icu_date + icu_interval => icu_timestamptz */ Datum icu_date_plus_interval(PG_FUNCTION_ARGS) { Datum ts_datum; /* convert the date to a timestamptz */ ts_datum = DirectFunctionCall1(date_timestamptz, PG_GETARG_DATUM(0)); /* branch to icu_timestampz + icu_interval */ return DirectFunctionCall2(icu_timestamptz_add_interval, ts_datum, PG_GETARG_DATUM(1)); } /* icu_date - icu-interval => icu_timestamptz */ Datum icu_date_minus_interval(PG_FUNCTION_ARGS) { Datum ts_datum; /* convert the date to a timestamptz */ ts_datum = DirectFunctionCall1(date_timestamptz, PG_GETARG_DATUM(0)); /* branch to icu_timestampz + icu_interval */ return 
DirectFunctionCall2(icu_timestamptz_sub_interval, ts_datum, PG_GETARG_DATUM(1)); } icu_ext-1.8.0/icu_ext.c000066400000000000000000000621131450461324700147750ustar00rootroot00000000000000/* * icu_ext.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ #include "postgres.h" #include "catalog/pg_collation.h" #include "fmgr.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "miscadmin.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/pg_locale.h" #include "utils/tuplestore.h" #include "unicode/ucnv.h" #include "unicode/ucol.h" #include "unicode/udat.h" #include "unicode/uloc.h" #include "unicode/umachine.h" #include "unicode/uscript.h" #include "unicode/ustring.h" #include "unicode/utext.h" #include "unicode/uvernum.h" #include "icu_ext.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(icu_version); PG_FUNCTION_INFO_V1(icu_unicode_version); PG_FUNCTION_INFO_V1(icu_collation_attributes); PG_FUNCTION_INFO_V1(icu_locales_list); PG_FUNCTION_INFO_V1(icu_default_locale); PG_FUNCTION_INFO_V1(icu_set_default_locale); PG_FUNCTION_INFO_V1(icu_compare); PG_FUNCTION_INFO_V1(icu_compare_coll); PG_FUNCTION_INFO_V1(icu_case_compare); PG_FUNCTION_INFO_V1(icu_sort_key); PG_FUNCTION_INFO_V1(icu_sort_key_coll); PG_FUNCTION_INFO_V1(icu_char_name); /* * GUC parameters */ char *icu_ext_default_locale; char *icu_ext_date_format; char *icu_ext_timestamptz_format; /* Built-in ICU styles that are #define'd. 
See date_format_style() */ UDateFormatStyle icu_ext_date_style = UDAT_DEFAULT; UDateFormatStyle icu_ext_timestamptz_style = UDAT_DEFAULT; void _PG_init(void); Datum icu_version(PG_FUNCTION_ARGS) { UVersionInfo version; char buf[U_MAX_VERSION_STRING_LENGTH+1]; u_getVersion(version); u_versionToString(version, buf); PG_RETURN_TEXT_P(cstring_to_text(buf)); } Datum icu_unicode_version(PG_FUNCTION_ARGS) { UVersionInfo version; char buf[U_MAX_VERSION_STRING_LENGTH+1]; u_getUnicodeVersion(version); u_versionToString(version, buf); PG_RETURN_TEXT_P(cstring_to_text(buf)); } /* Get the value of a collation attribute, aborting on error. */ static UColAttributeValue get_attribute(const UCollator *coll, UColAttribute attr) { UColAttributeValue val; UErrorCode status = U_ZERO_ERROR; val = ucol_getAttribute(coll, attr, &status); if (status != U_ZERO_ERROR) { elog(ERROR, "ucol_getAttribute failed"); } return val; } /* * Return (attribute,value) tuples for all attributes of a collation, * with keys and values matching options defined at * http://unicode.org/reports/tr35/tr35-collation.html#Setting_Options * Optionally, the attributes kept at their default values are not * included in the results. 
*/ Datum icu_collation_attributes(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; MemoryContext per_query_ctx; MemoryContext oldcontext; TupleDesc tupdesc; Tuplestorestate *tupstore; Datum values[2]; bool nulls[2]; char *txt; const char *locale; bool include_defaults = !(PG_GETARG_BOOL(1)); UCollator *collator = NULL; UErrorCode status = U_ZERO_ERROR; UColAttributeValue u_attr_val; if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); /* Switch into long-lived context to construct returned data structures */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Open ICU collation */ locale = text_to_cstring(PG_GETARG_TEXT_P(0)); collator = ucol_open(locale, &status); if (!collator) { elog(ERROR, "failed to open collation"); } tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); memset(nulls, 0, sizeof(nulls)); /* name (not a real attribute, added for convenience) */ if (include_defaults) { /* Use a large initial buffer to avoid bug ICU-21157 */ UChar dname_local[500]; UChar *dname = dname_local; char *buf; int32_t ulen; ulen = uloc_getDisplayName(locale, NULL, dname, sizeof(dname_local)/sizeof(UChar), &status); if (status == U_BUFFER_OVERFLOW_ERROR) { dname = palloc((ulen+1)*sizeof(UChar)); status = U_ZERO_ERROR; ulen = uloc_getDisplayName(locale, NULL, dname, ulen, &status); } if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayName failed: %s", u_errorName(status)); icu_from_uchar(&buf, dname, ulen); values[0] = CStringGetTextDatum("displayname"); 
values[1] = CStringGetTextDatum(buf); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_NUMERIC_COLLATION (key:kn) */ u_attr_val = get_attribute(collator, UCOL_NUMERIC_COLLATION); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? "false" : "true"; values[0] = CStringGetTextDatum("kn"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_FRENCH_COLLATION (key:kb, rule:[backwards 2]) */ u_attr_val = get_attribute(collator, UCOL_FRENCH_COLLATION); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? "false" : "true"; values[0] = CStringGetTextDatum("kb"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_NORMALIZATION_MODE (key:kk)*/ u_attr_val = get_attribute(collator, UCOL_NORMALIZATION_MODE); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? "false" : "true"; values[0] = CStringGetTextDatum("kk"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_ALTERNATE_HANDLING (key:ka) */ u_attr_val = get_attribute(collator, UCOL_ALTERNATE_HANDLING); if (include_defaults || u_attr_val != UCOL_NON_IGNORABLE) { switch (u_attr_val) { case UCOL_NON_IGNORABLE: txt = "noignore"; break; case UCOL_SHIFTED: txt = "shifted"; break; default: txt = ""; break; } values[0] = CStringGetTextDatum("ka"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_STRENGTH (key:ks) */ u_attr_val = get_attribute(collator, UCOL_STRENGTH); if (include_defaults || u_attr_val != UCOL_TERTIARY) { switch(u_attr_val) { case UCOL_PRIMARY: txt = "level1"; break; case UCOL_SECONDARY: txt = "level2"; break; case UCOL_TERTIARY: txt = "level3"; break; case UCOL_QUATERNARY: txt = "level4"; break; case UCOL_IDENTICAL: txt = "identic"; break; default: txt = ""; break; } values[0] = 
CStringGetTextDatum("ks"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_CASE_FIRST (key:kf) */ u_attr_val = get_attribute(collator, UCOL_CASE_FIRST); if (include_defaults || u_attr_val != UCOL_OFF) { switch(u_attr_val) { case UCOL_OFF: txt = "false"; break; case UCOL_LOWER_FIRST: txt = "lower"; break; case UCOL_UPPER_FIRST: txt = "upper"; break; default: txt = ""; break; } values[0] = CStringGetTextDatum("kf"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_CASE_LEVEL (key:kc) */ u_attr_val = get_attribute(collator, UCOL_CASE_LEVEL); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? "false" : "true"; values[0] = CStringGetTextDatum("kc"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* Max variable (key:kv) */ { UColReorderCode reorder_code = ucol_getMaxVariable(collator); const char *code_name = NULL; switch(reorder_code) { case UCOL_REORDER_CODE_SPACE: code_name = "space"; break; case UCOL_REORDER_CODE_PUNCTUATION: code_name = "punct"; break; case UCOL_REORDER_CODE_SYMBOL: code_name = "symbol"; break; case UCOL_REORDER_CODE_CURRENCY: code_name = "currency"; break; case UCOL_REORDER_CODE_DIGIT: code_name = "digit"; break; default: break; } /* "punct" is the default. 
Omit it unless include_defaults is set */ if (code_name != NULL && (include_defaults || reorder_code != UCOL_REORDER_CODE_PUNCTUATION)) { values[0] = CStringGetTextDatum("kv"); values[1] = CStringGetTextDatum(code_name); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } } /* Reorder codes (key:kr) */ { StringInfoData aggr_values; /* 4-letter codes separated by hyphens */ int32_t *reorder_codes = NULL; int32_t nb_reorderings = ucol_getReorderCodes(collator, NULL, 0, &status); if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) elog(ERROR, "uloc_getReorderCodes failed: %s", u_errorName(status)); initStringInfo(&aggr_values); if (nb_reorderings > 0) { reorder_codes = palloc(nb_reorderings*sizeof(int32_t)); status = U_ZERO_ERROR; nb_reorderings = ucol_getReorderCodes(collator, reorder_codes, nb_reorderings, &status); if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) elog(ERROR, "uloc_getReorderCodes failed: %s", u_errorName(status)); } for (uint32_t idx=0; idx < nb_reorderings; idx++) { const char *value = NULL; if (reorder_codes[idx] >= UCOL_REORDER_CODE_FIRST) { switch(reorder_codes[idx]) { case UCOL_REORDER_CODE_SPACE: value = "space"; break; case UCOL_REORDER_CODE_PUNCTUATION: value = "punct"; break; case UCOL_REORDER_CODE_SYMBOL: value = "symbol"; break; case UCOL_REORDER_CODE_CURRENCY: value = "currency"; break; case UCOL_REORDER_CODE_DIGIT: value = "digit"; break; } } else { value = uscript_getShortName((UScriptCode)reorder_codes[idx]); } if (value != NULL) { if (idx >= 1) appendStringInfoChar(&aggr_values, '-'); appendStringInfoString(&aggr_values, value); } } if (aggr_values.len > 0) { values[0] = CStringGetTextDatum("kr"); values[1] = CStringGetTextDatum(aggr_values.data); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } } /* version (not a real attribute, added for convenience) */ if (include_defaults) { UVersionInfo version; char buf[U_MAX_VERSION_STRING_LENGTH+1]; ucol_getVersion(collator, version); 
u_versionToString(version, buf); values[0] = CStringGetTextDatum("version"); values[1] = CStringGetTextDatum(buf); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } tuplestore_donestoring(tupstore); ucol_close(collator); return (Datum) 0; } /* * Add a piece of text as a new Datum value, setting it to NULL * if it's empty. */ static int add_string(const char* value, int column, Datum *values, bool *nulls) { if (*value) values[column] = CStringGetTextDatum(value); else values[column] = (Datum)0; nulls[column] = (*value == '\0'); return column+1; } /* * Interface to uloc_getAvailable() for all locales. * Return a table of available locales with their main properties. */ Datum icu_locales_list(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; MemoryContext per_query_ctx; MemoryContext oldcontext; TupleDesc tupdesc; Tuplestorestate *tupstore; int32_t loc_count = uloc_countAvailable(); int32_t i; Datum values[7]; bool nulls[7]; if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); /* Switch into long-lived context to construct returned data structures */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); for (i=0; i < loc_count; i++) { UErrorCode status = U_ZERO_ERROR; int col_num = 0; const char *p = uloc_getAvailable(i); /* Name */ col_num = add_string(p, col_num, values, nulls); /* Country */ { UChar country_buf[200]; char* country; /* with the database encoding */ uloc_getDisplayCountry(p, NULL 
/*ULOC_ENGLISH*/, country_buf, sizeof(country_buf)/sizeof(UChar), &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayCountry() failed on locale '%s': %s", p, u_errorName(status)); icu_from_uchar(&country, country_buf, u_strlen(country_buf)); col_num = add_string(country, col_num, values, nulls); } /* Country code */ col_num = add_string(uloc_getISO3Country(p), col_num, values, nulls); /* Language */ { UChar lang_buf[200]; char* language; uloc_getDisplayLanguage(p, NULL, lang_buf, sizeof(lang_buf)/sizeof(UChar), &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayLanguage() failed on locale '%s': %s", p, u_errorName(status)); icu_from_uchar(&language, lang_buf, u_strlen(lang_buf)); col_num = add_string(language, col_num, values, nulls); } /* Language code */ col_num = add_string(uloc_getISO3Language(p), col_num, values, nulls); /* Script */ { UChar script_buf[100]; char* script; uloc_getDisplayScript(p, NULL, script_buf, sizeof(script_buf)/sizeof(UChar), &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayScript() failed on locale '%s': %s", p, u_errorName(status)); icu_from_uchar(&script, script_buf, u_strlen(script_buf)); col_num = add_string(script, col_num, values, nulls); } /* Character orientation */ { const char* layout; ULayoutType t = uloc_getCharacterOrientation(p, &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getCharacterOrientation() failed on locale '%s': %s", p, u_errorName(status)); switch (t) { case ULOC_LAYOUT_LTR: layout = "LTR"; break; case ULOC_LAYOUT_RTL: layout = "RTL"; break; case ULOC_LAYOUT_TTB: layout = "TTB"; break; case ULOC_LAYOUT_BTT: layout = "BTT"; break; default: layout = ""; break; } col_num = add_string(layout, col_num, values, nulls); } tuplestore_putvalues(tupstore, tupdesc, values, nulls); } tuplestore_donestoring(tupstore); return (Datum) 0; } /* * Return the default locale. 
*/ Datum icu_default_locale(PG_FUNCTION_ARGS) { PG_RETURN_TEXT_P(cstring_to_text(uloc_getDefault())); } /* * Set the default locale to some name and return its canonicalized * name. * Warning: seen with ICU-52, passing a locale name with BCP-47 * extensions makes ICU never return from uloc_setDefault() (it seems * to wait for some internal lock). * Note that ICU documentation says about uloc_setDefault(): * "Do not use unless you know what you are doing." * This is useful in icu_ext to get translated versions of country * and language names from icu_locales_list(). */ Datum icu_set_default_locale(PG_FUNCTION_ARGS) { UErrorCode status = U_ZERO_ERROR; const char *locname = text_to_cstring(PG_GETARG_TEXT_P(0)); char buf[1024]; uloc_setDefault(locname, &status); if (U_FAILURE(status)) elog(ERROR, "failed to set ICU locale: %s", u_errorName(status)); uloc_canonicalize(locname, buf, sizeof(buf), &status); if (U_FAILURE(status)) PG_RETURN_NULL(); else PG_RETURN_TEXT_P(cstring_to_text(buf)); } /* * Get the UCollator object corresponding to the collation in input. * This UCollator is kept open by the backend and pointed to by the * cached pg_locale_t object. */ UCollator* ucollator_from_coll_id(Oid collid) { pg_locale_t pg_locale; if (collid == DEFAULT_COLLATION_OID || !OidIsValid(collid)) { /* * This will need to be changed when a db will be able to have * an ICU collation by default (not possible as of PG11). */ ereport(ERROR, (errcode(ERRCODE_INDETERMINATE_COLLATION), errmsg("could not determine which ICU collation to use"), errhint("Use the COLLATE clause to set the collation explicitly."))); } pg_locale = pg_newlocale_from_collation(collid); if (!pg_locale || pg_locale->provider != 'i') { ereport(ERROR, (errcode(ERRCODE_COLLATION_MISMATCH), errmsg("the collation provider of the input string must be ICU"))); } return pg_locale->info.icu.ucol; } /* * The actual collation-aware comparison happens here. 
* the UCollator comes either from a cached pg_locale_t * or is just opened and closed immediately by icu_ext. */ static UCollationResult our_strcoll(text *txt1, text *txt2, UCollator *collator ) { UCollationResult result; int32_t len1 = VARSIZE_ANY_EXHDR(txt1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); if (GetDatabaseEncoding() == PG_UTF8) { /* use the UTF-8 representation directly if possible */ UErrorCode status = U_ZERO_ERROR; result = ucol_strcollUTF8(collator, text_to_cstring(txt1), len1, text_to_cstring(txt2), len2, &status); if (U_FAILURE(status)) elog(ERROR, "ICU strcoll failed: %s", u_errorName(status)); } else { int32_t ulen1, ulen2; UChar *uchar1, *uchar2; ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); ulen2 = icu_to_uchar(&uchar2, text_to_cstring(txt2), len2); result = ucol_strcoll(collator, uchar1, ulen1, uchar2, ulen2); pfree(uchar1); pfree(uchar2); } return result; } /* * Compare two strings with the given collation. * Return the result as a signed integer, similarly to strcoll(). */ Datum icu_compare_coll(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); text *txt2 = PG_GETARG_TEXT_PP(1); const char *collname = text_to_cstring(PG_GETARG_TEXT_P(2)); UCollator *collator = NULL; UErrorCode status = U_ZERO_ERROR; UCollationResult result; collator = ucol_open(collname, &status); if (!collator || U_FAILURE(status)) { elog(ERROR, "failed to open collation: %s", u_errorName(status)); } result = our_strcoll(txt1, txt2, collator); ucol_close(collator); PG_RETURN_INT32(result == UCOL_EQUAL ? 0 : (result == UCOL_GREATER ? 1 : -1)); } /* * Compare two strings with the collation of the function, * which must be an ICU collation. * Return the result as a signed integer, similarly to strcoll(). 
*/ Datum icu_compare(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); text *txt2 = PG_GETARG_TEXT_PP(1); UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION()); UCollationResult result; result = our_strcoll(txt1, txt2, collator); PG_RETURN_INT32(result == UCOL_EQUAL ? 0 : (result == UCOL_GREATER ? 1 : -1)); } /* * Compare two strings with full case folding. */ Datum icu_case_compare(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); text *txt2 = PG_GETARG_TEXT_PP(1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); int32_t result; UChar *uchar1, *uchar2; (void)icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); (void)icu_to_uchar(&uchar2, text_to_cstring(txt2), len2); result = u_strcasecmp(uchar1, uchar2, 0); pfree(uchar1); pfree(uchar2); PG_RETURN_INT32(result); } /* * Return a binary sort key corresponding to the string and * its collation (through a COLLATE clause). */ Datum icu_sort_key(PG_FUNCTION_ARGS) { text *txt = PG_GETARG_TEXT_PP(0); UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION()); int32_t o_len = 1024; /* first attempt */ int32_t ulen; UChar *ustring; bytea *output; ulen = icu_to_uchar(&ustring, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); do { int32_t effective_len; output = (bytea*) palloc(o_len + VARHDRSZ); effective_len = ucol_getSortKey(collator, ustring, ulen, (uint8_t*)VARDATA(output), o_len); if (effective_len == 0) { elog(ERROR, "ucol_getSortKey() failed: internal error"); } if (effective_len > o_len) { pfree(output); output = NULL; } o_len = effective_len; } while (output == NULL); /* should loop at most once, if buffer too small */ SET_VARSIZE(output, o_len + VARHDRSZ - 1); /* -1 excludes the ending NUL byte */ PG_RETURN_BYTEA_P(output); } /* * Return a binary sort key corresponding to the string and * the given collation. 
*/ Datum icu_sort_key_coll(PG_FUNCTION_ARGS) { text *txt = PG_GETARG_TEXT_PP(0); const char *locname = text_to_cstring(PG_GETARG_TEXT_P(1)); UCollator *collator; UErrorCode status = U_ZERO_ERROR; int32_t o_len = 1024; /* first attempt */ int32_t ulen; UChar *ustring; bytea *output; ulen = icu_to_uchar(&ustring, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); collator = ucol_open(locname, &status); if (!collator) elog(ERROR, "failed to open collation"); do { int32_t effective_len; output = (bytea*) palloc(o_len + VARHDRSZ); effective_len = ucol_getSortKey(collator, ustring, ulen, (uint8_t*)VARDATA(output), o_len); if (effective_len == 0) { ucol_close(collator); elog(ERROR, "ucol_getSortKey() failed: internal error"); } if (effective_len > o_len) { pfree(output); output = NULL; } o_len = effective_len; } while (output == NULL); /* should loop at most once, if buffer too small */ ucol_close(collator); SET_VARSIZE(output, o_len + VARHDRSZ - 1); /* -1 excludes the ending NUL byte */ PG_RETURN_BYTEA_P(output); } /* Return the first UChar32 of the char(1) string */ static UChar32 first_char32(BpChar* source) { UChar32 c; UText *ut; int32_t ulen; UChar *ustring; UErrorCode status = U_ZERO_ERROR; ulen = icu_to_uchar(&ustring, VARDATA_ANY(source), VARSIZE_ANY_EXHDR(source)); ut = utext_openUChars(NULL, ustring, ulen, &status); if (U_FAILURE(status)) elog(ERROR, "utext_openUChars() failed: %s", u_errorName(status)); c = utext_current32(ut); utext_close(ut); return c; } /* * Return the Unicode name corresponding to the the input character. 
*/ Datum icu_char_name(PG_FUNCTION_ARGS) { BpChar *source = PG_GETARG_BPCHAR_PP(0); char local_buf[80]; char *buffer; int32_t buflen = sizeof(local_buf); UChar32 first_char; int32_t ulen; UErrorCode status = U_ZERO_ERROR; first_char = first_char32(source); ulen = u_charName(first_char, U_EXTENDED_CHAR_NAME, local_buf, buflen, &status); if (status == U_BUFFER_OVERFLOW_ERROR) /* buffer too small */ { buffer = palloc((ulen+1)*sizeof(char)); status = U_ZERO_ERROR; ulen = u_charName(first_char, U_EXTENDED_CHAR_NAME, buffer, ulen+1, &status); } else buffer = local_buf; if (U_FAILURE(status)) elog(ERROR, "u_charName failed: %s", u_errorName(status)); else PG_RETURN_TEXT_P(cstring_to_text(buffer)); } /* * Convert {full|medium|...} into an UDateFormatStyle value, or UDAT_NONE * if not recognized */ UDateFormatStyle date_format_style(const char *fmt) { UDateFormatStyle style = UDAT_NONE; if (fmt[0] == '{') { if (!strcmp(fmt+1, "short}")) style = UDAT_SHORT; else if (!strcmp(fmt+1, "medium}")) style = UDAT_MEDIUM; else if (!strcmp(fmt+1, "long}")) style = UDAT_LONG; else if (!strcmp(fmt+1, "full}")) style = UDAT_FULL; if (!strcmp(fmt+1, "short relative}")) style = UDAT_SHORT_RELATIVE; else if (!strcmp(fmt+1, "medium relative}")) style = UDAT_MEDIUM_RELATIVE; else if (!strcmp(fmt+1, "long relative}")) style = UDAT_LONG_RELATIVE; else if (!strcmp(fmt+1, "full relative}")) style = UDAT_FULL_RELATIVE; } return style; } static void assign_guc_date_format(const char *newval, void *extra) { if (*newval == '{') icu_ext_date_style = date_format_style(newval); else icu_ext_date_style = UDAT_NONE; } static void assign_guc_timestamptz_format(const char *newval, void *extra) { if (*newval == '{') icu_ext_timestamptz_style = date_format_style(newval); else icu_ext_timestamptz_style = UDAT_NONE; } static bool check_guc_date_format(char **newval, void **extra, GucSource source) { UDateFormatStyle style = UDAT_NONE; if (**newval == '{') { style = date_format_style(*newval); if (style == 
UDAT_NONE) return false; } return true; } /* * Module load callback */ void _PG_init(void) { DefineCustomStringVariable("icu_ext.locale", "Sets the default locale to use by ICU functions.", NULL, &icu_ext_default_locale, NULL, PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomStringVariable("icu_ext.date_format", "Sets the default input/output format for dates.", NULL, &icu_ext_date_format, "{medium}", PGC_USERSET, 0, check_guc_date_format, assign_guc_date_format, NULL); DefineCustomStringVariable("icu_ext.timestamptz_format", "Sets the default input/output format for timestamptz values.", NULL, &icu_ext_timestamptz_format, "{medium}", PGC_USERSET, 0, check_guc_date_format, assign_guc_timestamptz_format, NULL); EmitWarningsOnPlaceholders("icu_ext"); } icu_ext-1.8.0/icu_ext.control000066400000000000000000000002041450461324700162240ustar00rootroot00000000000000# icu_ext extension comment = 'Access ICU functions' default_version = '1.8' module_pathname = '$libdir/icu_ext' relocatable = true icu_ext-1.8.0/icu_ext.h000066400000000000000000000034301450461324700147770ustar00rootroot00000000000000/* * icu_ext.h * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ #include "postgres.h" #include "datatype/timestamp.h" #include "unicode/ucol.h" #include "unicode/udat.h" /* * icu_interval_t is like Interval except for the additional year * field. Interval considers that 1 year = 12 months, whereas * icu_interval_t does not. 
*/ typedef struct { TimeOffset time; /* all time units other than days, months and * years */ int32 day; /* days, after time for alignment */ int32 month; /* months, after time for alignment */ int32 year; /* years */ } icu_interval_t; UCollator* ucollator_from_coll_id(Oid collid); extern char *icu_ext_default_locale; extern char *icu_ext_date_format; extern char *icu_ext_timestamptz_format; extern UDateFormatStyle icu_ext_date_style; extern UDateFormatStyle icu_ext_timestamptz_style; extern UDateFormatStyle date_format_style(const char *fmt); extern Datum icu_timestamptz_add_interval(PG_FUNCTION_ARGS); extern Datum icu_timestamptz_sub_interval(PG_FUNCTION_ARGS); /* * Convert a Postgres timestamp into an ICU timestamp * ICU's UDate is a number of milliseconds since the Unix Epoch, * (1970-01-01, 00:00 UTC), stored as a double. * Postgres' TimestampTz is a number of microseconds since 2000-01-01 00:00 UTC, * stored as an int64. * The code below translates directly between the epochs */ #define TS_TO_UDATE(pg_tstz) \ (UDate)(10957.0*86400*1000 + (pg_tstz)/1000) /* * Convert an ICU timestamp into a Postgres timestamp * Input: number of milliseconds since 1970-01-01 UTC * Output: number of microseconds since 2000-01-01 UTC */ #define UDATE_TO_TS(ud) \ (TimestampTz)((ud)*1000 - 10957LL*86400*1000*1000) icu_ext-1.8.0/icu_interval.c000066400000000000000000000236161450461324700160260ustar00rootroot00000000000000/* * icu_interval.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "common/int.h" #include "utils/builtins.h" #include "utils/timestamp.h" #include "utils/pg_locale.h" #include "utils/date.h" #include "utils/datetime.h" /* ICU includes */ #include "unicode/ucal.h" #include "unicode/ucnv.h" /* needed? 
*/ #include "unicode/udat.h" #include "unicode/ustring.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_interval_in); PG_FUNCTION_INFO_V1(icu_interval_out); PG_FUNCTION_INFO_V1(icu_from_interval); PG_FUNCTION_INFO_V1(icu_timestamptz_add_interval); PG_FUNCTION_INFO_V1(icu_interval_add_timestamptz); PG_FUNCTION_INFO_V1(icu_timestamptz_sub_interval); PG_FUNCTION_INFO_V1(icu_interval_mul); PG_FUNCTION_INFO_V1(icu_mul_i_interval); PG_FUNCTION_INFO_V1(icu_interv_plus_interv); PG_FUNCTION_INFO_V1(icu_interv_minus_interv); /* * Add an interval to a timestamp with timezone, given a localized calendar. * if locale==NULL, use the current ICU locale. */ static Datum add_interval(TimestampTz ts, const icu_interval_t *ival, const char *locale) { UErrorCode status = U_ZERO_ERROR; UDate date_time = TS_TO_UDATE(ts); UCalendar *ucal; UChar* tzid; int32_t tzid_length; const char *pg_tz_name = pg_get_timezone_name(session_timezone); tzid_length = icu_to_uchar(&tzid, pg_tz_name, /* or UCAL_UNKNOWN_ZONE_ID, like GMT */ strlen(pg_tz_name)); ucal = ucal_open(tzid, tzid_length, locale, UCAL_DEFAULT, &status); if (U_FAILURE(status)) { elog(ERROR, "ucal_open failed: %s\n", u_errorName(status)); } ucal_setMillis(ucal, date_time, &status); /* Add years, months, days, with the rules of the given calendar */ if (ival->year != 0) ucal_add(ucal, UCAL_YEAR, ival->year, &status); if (ival->month != 0) ucal_add(ucal, UCAL_MONTH, ival->month, &status); if (ival->day != 0) ucal_add(ucal, UCAL_DAY_OF_MONTH, ival->day, &status); if (ival->time != 0) ucal_add(ucal, UCAL_MILLISECOND, ival->time/1000, &status); /* Translate back to a UDate, and then to a postgres timestamptz */ date_time = ucal_getMillis(ucal, &status); ucal_close(ucal); if (U_FAILURE(status)) { elog(ERROR, "calendar translation failed: %s\n", u_errorName(status)); } PG_RETURN_TIMESTAMPTZ(UDATE_TO_TS(date_time)); } /* * This is similar to core's interval_in() except * there's no typmod support and no adjustment. 
*/ Datum icu_interval_in(PG_FUNCTION_ARGS) { icu_interval_t *result; char *str = PG_GETARG_CSTRING(0); /* int32 typmod = PG_GETARG_INT32(2); */ int dtype; int nf; int dterr; char *field[MAXDATEFIELDS]; int ftype[MAXDATEFIELDS]; char workbuf[256]; #if PG_VERSION_NUM >= 160000 Node *escontext = fcinfo->context; DateTimeErrorExtra extra; #endif #if PG_VERSION_NUM >= 150000 struct pg_itm_in tt, *itm_in = &tt; #else fsec_t fsec; struct pg_tm tt, *tm = &tt; #endif #if PG_VERSION_NUM >= 150000 itm_in->tm_year = 0; itm_in->tm_mon = 0; itm_in->tm_mday = 0; itm_in->tm_usec = 0; #else tm->tm_year = 0; tm->tm_mon = 0; tm->tm_mday = 0; tm->tm_hour = 0; tm->tm_min = 0; tm->tm_sec = 0; fsec = 0; #endif dterr = ParseDateTime(str, workbuf, sizeof(workbuf), field, ftype, MAXDATEFIELDS, &nf); if (dterr == 0) { #if PG_VERSION_NUM >= 150000 dterr = DecodeInterval(field, ftype, nf, INTERVAL_FULL_RANGE, &dtype, itm_in); /* if those functions think it's a bad format, try ISO8601 style */ if (dterr == DTERR_BAD_FORMAT) dterr = DecodeISO8601Interval(str, &dtype, itm_in); #else dterr = DecodeInterval(field, ftype, nf, INTERVAL_FULL_RANGE, &dtype, tm, &fsec); if (dterr == DTERR_BAD_FORMAT) dterr = DecodeISO8601Interval(str, &dtype, tm, &fsec); #endif } if (dterr != 0) { if (dterr == DTERR_FIELD_OVERFLOW) dterr = DTERR_INTERVAL_OVERFLOW; #if PG_VERSION_NUM >= 160000 DateTimeParseError(dterr, &extra, str, "interval", escontext); #else DateTimeParseError(dterr, str, "interval"); #endif PG_RETURN_NULL(); } result = (icu_interval_t*) palloc(sizeof(icu_interval_t)); switch (dtype) { case DTK_DELTA: /* do not call itm2interval() to not merge years into months */ #if PG_VERSION_NUM >= 150000 result->month = itm_in->tm_mon; result->day = itm_in->tm_mday; result->year = itm_in->tm_year; result->time = itm_in->tm_usec; #else result->month = tm->tm_mon; result->day = tm->tm_mday; result->year = tm->tm_year; result->time = (((((tm->tm_hour * INT64CONST(60)) + tm->tm_min) * INT64CONST(60)) + tm->tm_sec) * 
USECS_PER_SEC) + fsec; #endif break; default: elog(ERROR, "unexpected dtype %d while parsing interval \"%s\"", dtype, str); } return PointerGetDatum(result); } /* * Text representation for icu_interval. * It is essentially identical to "interval" except that * the year field is not months%12 */ Datum icu_interval_out(PG_FUNCTION_ARGS) { icu_interval_t *itv = (icu_interval_t*)PG_GETARG_DATUM(0); char buf[MAXDATELEN + 1]; TimeOffset time, tfrac; #if PG_VERSION_NUM >= 150000 struct pg_itm itm; itm.tm_year = itv->year; itm.tm_mon = itv->month; itm.tm_mday = itv->day; /* The following code is copied from interval2itm() in backend/utils/adt/timestamp.c */ time = itv->time; tfrac = time / USECS_PER_HOUR; time -= tfrac * USECS_PER_HOUR; itm.tm_hour = tfrac; tfrac = time / USECS_PER_MINUTE; time -= tfrac * USECS_PER_MINUTE; itm.tm_min = (int) tfrac; tfrac = time / USECS_PER_SEC; time -= tfrac * USECS_PER_SEC; itm.tm_sec = (int) tfrac; itm.tm_usec = (int) time; EncodeInterval(&itm, IntervalStyle, buf); #else /* see interval2tm() in backend/utils/adt/timestamp.c */ fsec_t fsec; struct pg_tm ptm, *tm = &ptm; tm->tm_year = itv->month / MONTHS_PER_YEAR; tm->tm_mon = itv->month % MONTHS_PER_YEAR; tm->tm_mday = itv->day; time = itv->time; tfrac = time / USECS_PER_HOUR; time -= tfrac * USECS_PER_HOUR; tm->tm_hour = tfrac; if ((tm->tm_hour < 0) != (tfrac < 0)) /* !SAMESIGN */ ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("interval out of range"))); tfrac = time / USECS_PER_MINUTE; time -= tfrac * USECS_PER_MINUTE; tm->tm_min = tfrac; tfrac = time / USECS_PER_SEC; fsec = time - (tfrac * USECS_PER_SEC); tm->tm_sec = tfrac; EncodeInterval(tm, fsec, IntervalStyle, buf); #endif PG_RETURN_CSTRING(pstrdup(buf)); } Datum icu_from_interval(PG_FUNCTION_ARGS) { Interval *pg_interval = PG_GETARG_INTERVAL_P(0); icu_interval_t *interval = (icu_interval_t*) palloc(sizeof(icu_interval_t)); interval->time = pg_interval->time; interval->day = pg_interval->day; interval->month = 
pg_interval->month; interval->year = 0; return PointerGetDatum(interval); } /* * icu_timestamptz + icu_interval */ Datum icu_timestamptz_add_interval(PG_FUNCTION_ARGS) { TimestampTz pg_ts = PG_GETARG_TIMESTAMPTZ(0); icu_interval_t *itv = (icu_interval_t*) PG_GETARG_DATUM(1); return add_interval(pg_ts, itv, icu_ext_default_locale); } /* * icu_interval + icu_timestamptz */ Datum icu_interval_add_timestamptz(PG_FUNCTION_ARGS) { icu_interval_t *itv = (icu_interval_t*) PG_GETARG_DATUM(0); TimestampTz pg_ts = PG_GETARG_TIMESTAMPTZ(1); return add_interval(pg_ts, itv, icu_ext_default_locale); } Datum icu_timestamptz_sub_interval(PG_FUNCTION_ARGS) { TimestampTz pg_ts = PG_GETARG_TIMESTAMPTZ(0); icu_interval_t *itv = (icu_interval_t*) PG_GETARG_DATUM(1); itv->year = -itv->year; itv->month = -itv->month; itv->day = -itv->day; itv->time = -itv->time; return add_interval(pg_ts, itv, icu_ext_default_locale); } Datum icu_interval_mul(PG_FUNCTION_ARGS) { icu_interval_t *itv = (icu_interval_t*) PG_GETARG_DATUM(0); int32 factor = PG_GETARG_INT32(1); icu_interval_t *result; result = (icu_interval_t *) palloc(sizeof(icu_interval_t)); if (pg_mul_s32_overflow(itv->day, factor, &result->day) || pg_mul_s32_overflow(itv->month, factor, &result->month) || pg_mul_s32_overflow(itv->year, factor, &result->year) || pg_mul_s64_overflow(itv->time, factor, &result->time)) { ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("interval out of range"))); } return PointerGetDatum(result); } /* integer multiplied by icu_interval */ Datum icu_mul_i_interval(PG_FUNCTION_ARGS) { Datum factor = PG_GETARG_DATUM(0); Datum itv = PG_GETARG_DATUM(1); return DirectFunctionCall2(icu_interval_mul, itv, factor); } /* icu_interval + icu_interval */ Datum icu_interv_plus_interv(PG_FUNCTION_ARGS) { icu_interval_t *i1 = (icu_interval_t*) PG_GETARG_DATUM(0); icu_interval_t *i2 = (icu_interval_t*) PG_GETARG_DATUM(1); icu_interval_t *result; result = (icu_interval_t *) palloc(sizeof(icu_interval_t)); if 
(pg_add_s32_overflow(i1->day, i2->day, &result->day) || pg_add_s32_overflow(i1->month, i2->month, &result->month) || pg_add_s32_overflow(i1->year, i2->year, &result->year) || pg_add_s64_overflow(i1->time, i2->time, &result->time)) { ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("interval out of range"))); } return PointerGetDatum(result); } /* icu_interval - icu_interval */ Datum icu_interv_minus_interv(PG_FUNCTION_ARGS) { icu_interval_t *i1 = (icu_interval_t*) PG_GETARG_DATUM(0); icu_interval_t *i2 = (icu_interval_t*) PG_GETARG_DATUM(1); icu_interval_t *result; result = (icu_interval_t *) palloc(sizeof(icu_interval_t)); if (pg_add_s32_overflow(i1->day, -i2->day, &result->day) || pg_add_s32_overflow(i1->month, -i2->month, &result->month) || pg_add_s32_overflow(i1->year, -i2->year, &result->year) || pg_add_s64_overflow(i1->time, -i2->time, &result->time)) { ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("interval out of range"))); } return PointerGetDatum(result); } /* TODO: - binary - cast from icu_interval to interval? - justify_interval? */ icu_ext-1.8.0/icu_normalize.c000066400000000000000000000074741450461324700162060ustar00rootroot00000000000000/* * icu_normalize.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. 
See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #if PG_VERSION_NUM >= 160000 #include "varatt.h" #endif /* ICU includes */ #include "unicode/unorm.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_is_normalized); PG_FUNCTION_INFO_V1(icu_normalize); typedef enum { UNICODE_NFC, UNICODE_NFD, UNICODE_NFKC, UNICODE_NFKD } norm_form_t; static norm_form_t name_to_norm(const char *formstr) { if (pg_strcasecmp(formstr, "NFC") == 0) return UNICODE_NFC; else if (pg_strcasecmp(formstr, "NFD") == 0) return UNICODE_NFD; else if (pg_strcasecmp(formstr, "NFKC") == 0) return UNICODE_NFKC; else if (pg_strcasecmp(formstr, "NFKD") == 0) return UNICODE_NFKD; else elog(ERROR, "invalid normalization form: %s", formstr); } static const UNormalizer2* norm_instance(norm_form_t form) { UErrorCode status = U_ZERO_ERROR; const UNormalizer2 *instance = NULL; switch (form) { case UNICODE_NFC: instance = unorm2_getNFCInstance(&status); break; case UNICODE_NFD: instance = unorm2_getNFDInstance(&status); break; case UNICODE_NFKC: instance = unorm2_getNFKCInstance(&status); break; case UNICODE_NFKD: instance = unorm2_getNFKDInstance(&status); break; } if (U_FAILURE(status)) elog(ERROR, "norm_instance failure: %s", u_errorName(status)); return instance; } /* * Return the string (1st arg) with the given Unicode normalization * (2nd arg). 
*/ Datum icu_normalize(PG_FUNCTION_ARGS) { text *src_text = PG_GETARG_TEXT_PP(0); const char* arg_form = text_to_cstring(PG_GETARG_TEXT_P(1)); norm_form_t form = name_to_norm(arg_form); const UNormalizer2 *instance = norm_instance(form); int32_t u_src_length, u_dest_length, effective_length, result_len; char *result; UChar *u_src, *u_dest; UErrorCode status = U_ZERO_ERROR; if (GetDatabaseEncoding() != PG_UTF8) elog(ERROR, "non-Unicode database encoding"); u_src_length = icu_to_uchar(&u_src, VARDATA_ANY(src_text), VARSIZE_ANY_EXHDR(src_text)); /* * The result may be expanded by the maximum factor given at: * https://unicode.org/faq/normalization.html#12 * (given that the UChar buffer is in UTF-16) */ switch(form) { case UNICODE_NFC: u_dest_length = u_src_length * 3; break; case UNICODE_NFD: u_dest_length = u_src_length * 4; break; case UNICODE_NFKC: case UNICODE_NFKD: default: u_dest_length = u_src_length * 18; break; } u_dest = (UChar*) palloc(u_dest_length*sizeof(UChar)); effective_length = unorm2_normalize(instance, u_src, u_src_length, u_dest, u_dest_length, &status); if (U_FAILURE(status)) elog(ERROR, "unorm2_normalize failure: %s", u_errorName(status)); result_len = icu_from_uchar(&result, u_dest, effective_length); PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len)); } /* * Check if a string (1st arg) is in the given Unicode normal form * (2nd arg). 
*/ Datum icu_is_normalized(PG_FUNCTION_ARGS) { text *src_text = PG_GETARG_TEXT_PP(0); const char* arg_form = text_to_cstring(PG_GETARG_TEXT_PP(1)); norm_form_t form = name_to_norm(arg_form); UErrorCode status = U_ZERO_ERROR; UChar *u_src; int32_t u_src_length; UBool is_norm; const UNormalizer2 *instance = norm_instance(form); if (GetDatabaseEncoding() != PG_UTF8) elog(ERROR, "non-Unicode database encoding"); u_src_length = icu_to_uchar(&u_src, VARDATA_ANY(src_text), VARSIZE_ANY_EXHDR(src_text)); is_norm = unorm2_isNormalized(instance, u_src, u_src_length, &status); if (U_FAILURE(status)) elog(ERROR, "unorm2_isNormalized failure: %s", u_errorName(status)); PG_RETURN_BOOL(is_norm == 1); } icu_ext-1.8.0/icu_num.c000066400000000000000000000031321450461324700147700ustar00rootroot00000000000000/* * icu_num.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ #include "postgres.h" #include "access/htup_details.h" #include "fmgr.h" #include "funcapi.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "mb/pg_wchar.h" #include "unicode/ucol.h" #include "unicode/uloc.h" #include "unicode/unum.h" #include "unicode/ustring.h" #include "unicode/utext.h" PG_FUNCTION_INFO_V1(icu_number_spellout); Datum icu_number_spellout(PG_FUNCTION_ARGS) { float8 number = PG_GETARG_FLOAT8(0); const char *locale = text_to_cstring(PG_GETARG_TEXT_PP(1)); UErrorCode status = U_ZERO_ERROR; UChar local_ubuf[256]; UChar *ubuf = local_ubuf; int32_t buf_len = sizeof(local_ubuf)/sizeof(UChar); UNumberFormat* nf; int32_t real_len; char *output; nf = unum_open(UNUM_SPELLOUT, NULL, /* pattern */ -1, /* pattern length */ locale, NULL, /* parseErr */ &status); if (U_FAILURE(status)) elog(ERROR, "unum_open failed: %s", u_errorName(status)); real_len = unum_formatDouble(nf, number, ubuf, buf_len, NULL, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { /* buffer too small */ ubuf = 
palloc((real_len+1)*sizeof(UChar)); status = U_ZERO_ERROR; real_len = unum_formatDouble(nf, number, ubuf, real_len+1, NULL, &status); } if (U_FAILURE(status)) { unum_close(nf); elog(ERROR, "unum_formatDouble failed: %s", u_errorName(status)); } icu_from_uchar(&output, ubuf, real_len); unum_close(nf); PG_RETURN_TEXT_P(cstring_to_text(output)); } icu_ext-1.8.0/icu_search.c000066400000000000000000000226021450461324700154410ustar00rootroot00000000000000/* * icu_search.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" /* ICU includes */ #include "unicode/ucol.h" #include "unicode/usearch.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_strpos); PG_FUNCTION_INFO_V1(icu_strpos_coll); PG_FUNCTION_INFO_V1(icu_replace); PG_FUNCTION_INFO_V1(icu_replace_coll); /* * Given @str in the database encoding and @str_utf16 its UTF-16 * representation, translate the character position @u16_pos (expressed in * UTF-16 code units and 0-based) to a character position in @str. * It differs from @u16_pos if @str_utf16 contains surrogate pairs. 
*
 * If @p_str is not NULL, make it point to the first byte
 * corresponding to @u16_pos in @str.
 *
 * @str_len is the length of @str in bytes, @u16_len the length of
 * @str_utf16 in UTF-16 code units.
 */
static int32_t
translate_char_pos(const char* str,
				   int32_t str_len,
				   const UChar* str_utf16,
				   int32_t u16_len, /* in 16-bit code units */
				   int32_t u16_pos,
				   const char **p_str)
{
	UChar32 c;
	int32_t u16_idx = 0;
	int32_t out_pos = 0;

	if (GetDatabaseEncoding() == PG_UTF8)
	{
		int32_t u8_offset = 0;

		/* for UTF-8, use ICU macros instead of calling pg_mblen() */
		/* both cursors advance by one code point per iteration */
		while (u16_idx < u16_pos)
		{
			U16_NEXT(str_utf16, u16_idx, u16_len, c);
			U8_NEXT(str, u8_offset, str_len, c);
			out_pos++;
		}
		if (p_str != NULL)
			*p_str = str + u8_offset;
	}
	else if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
	{
		/*
		 * for mono-byte encodings, assume a 1:1 mapping with UTF-16
		 * code units, since they should not contain characters
		 * outside of the BMP.
		 */
		out_pos = u16_pos;
		if (p_str != NULL)
			*p_str = str + out_pos;
	}
	else
	{
		/* for non-UTF-8 multi-byte encodings, use pg_mblen() */
		while (u16_idx < u16_pos)
		{
			U16_NEXT(str_utf16, u16_idx, u16_len, c);
			str += pg_mblen(str);
			out_pos++;
		}
		if (p_str != NULL)
			*p_str = str;
	}

	return out_pos;
}

/*
 * Do the bulk of the work for icu_strpos and icu_strpos_coll.
 * Return values:
 * 0: not found
 * >0: the 1-based position of txt2 into txt1
 *
 * An error is raised if the ICU search cannot be started or fails.
 */
static int32_t
internal_strpos(text *txt1, text *txt2, UCollator *collator)
{
	int32_t len1 = VARSIZE_ANY_EXHDR(txt1);
	int32_t len2 = VARSIZE_ANY_EXHDR(txt2);
	UErrorCode status = U_ZERO_ERROR;
	UStringSearch *usearch;
	UChar *uchar1, *uchar2;
	int32_t ulen1, ulen2;
	int32_t pos;

	/*
	 * A non-empty substring is never contained by an empty string.
	 */
	if (len1 == 0 && len2 != 0)
		return 0;

	/*
	 * An empty substring is always found at the first character (even
	 * inside an empty string), to be consistent with strpos() in
	 * core.
	 */
	if (len2 == 0)
		return 1;

	ulen1 = icu_to_uchar(&uchar1, VARDATA_ANY(txt1), len1);
	ulen2 = icu_to_uchar(&uchar2, VARDATA_ANY(txt2), len2);

	usearch = usearch_openFromCollator(uchar2, /* needle */
									   ulen2,
									   uchar1, /* haystack */
									   ulen1,
									   collator,
									   NULL,
									   &status);
	if (U_FAILURE(status))
		elog(ERROR, "failed to start search: %s", u_errorName(status));
	else
	{
		pos = usearch_first(usearch, &status);
		if (!U_FAILURE(status) && pos != USEARCH_DONE)
		{
			/*
			 * pos is in UTF-16 code units, with surrogate pairs counting
			 * as two, so we need a non-trivial translation to the corresponding
			 * position in the original string.
			 */
			pos = translate_char_pos(VARDATA_ANY(txt1), len1, uchar1, ulen1, pos, NULL);
		}
		else
			pos = -1;	/* so that pos + 1 below yields 0 (not found) */
	}

	/* release the ICU and UTF-16 resources before reporting any error */
	pfree(uchar1);
	pfree(uchar2);
	usearch_close(usearch);

	if (U_FAILURE(status))
		elog(ERROR, "failed to perform ICU search: %s", u_errorName(status));

	/* return 0 if not found or the 1-based position of txt2 inside txt1 */
	return pos + 1;
}

/*
 * Equivalent of strpos(haystack, needle) using ICU search,
 * with the collator derived from the collation of the call.
 */
Datum
icu_strpos(PG_FUNCTION_ARGS)
{
	UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION());

	PG_RETURN_INT32(internal_strpos(PG_GETARG_TEXT_PP(0), /* haystack */
									PG_GETARG_TEXT_PP(1), /* needle */
									collator));
}

/*
 * Equivalent of strpos(haystack, needle) using ICU search, with the
 * collator opened from the ICU collation name passed as 3rd argument.
 * The collator is closed before returning.
 */
Datum
icu_strpos_coll(PG_FUNCTION_ARGS)
{
	const char *collname = text_to_cstring(PG_GETARG_TEXT_PP(2));
	UCollator *collator = NULL;
	UErrorCode status = U_ZERO_ERROR;
	int32_t pos;

	collator = ucol_open(collname, &status);
	if (!collator || U_FAILURE(status)) {
		elog(ERROR, "failed to open collation: %s", u_errorName(status));
	}

	pos = internal_strpos(PG_GETARG_TEXT_PP(0), /* haystack */
						  PG_GETARG_TEXT_PP(1), /* needle */
						  collator);
	ucol_close(collator);

	PG_RETURN_INT32(pos);
}

/*
 * Search for @txt2 in @txt1 with the ICU @collator and replace the
 * matched substrings with @txt3.
* * The replacement text is always txt3, but the replaced text may not * be exactly txt2, and its length in bytes may differ too, depending on * the collation rules. For example in utf-8 with an accent-insensitive * collation, {LATIN SMALL LETTER E WITH ACUTE} (2 bytes) will match * {LATIN SMALL LETTER E} (1 byte). */ static text * internal_str_replace(text *txt1, /* not const because it may be returned */ const text *txt2, /* search for txt2 with collator */ const text *txt3, /* replace the matched substrings by txt3 */ UCollator *collator) { int32_t len1 = VARSIZE_ANY_EXHDR(txt1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); int32_t len3 = VARSIZE_ANY_EXHDR(txt3); UErrorCode status = U_ZERO_ERROR; UStringSearch *usearch; UChar *uchar1, *uchar2; int32_t ulen1, ulen2; /* in utf-16 units */ text *result; int32_t pos; StringInfoData resbuf; if (len1 == 0 || len2 == 0) return txt1; ulen1 = icu_to_uchar(&uchar1, VARDATA_ANY(txt1), len1); ulen2 = icu_to_uchar(&uchar2, VARDATA_ANY(txt2), len2); usearch = usearch_openFromCollator(uchar2, /* needle */ ulen2, uchar1, /* haystack */ ulen1, collator, NULL, &status); /* "nana" in "nananana" must be found 2 times, not 3 times. 
*/ usearch_setAttribute(usearch, USEARCH_OVERLAP, USEARCH_OFF, &status); pos = usearch_first(usearch, &status); if (U_FAILURE(status)) elog(ERROR, "failed to perform ICU search: %s", u_errorName(status)); if (pos != USEARCH_DONE) { const char *txt1_currptr; const char* txt1_startptr = VARDATA_ANY(txt1); initStringInfo(&resbuf); /* initialize the output string with the segment before the first match */ translate_char_pos(txt1_startptr, len1, uchar1, ulen1, pos, &txt1_currptr); appendBinaryStringInfo(&resbuf, txt1_startptr, txt1_currptr - txt1_startptr); /* append the replacement text */ appendBinaryStringInfo(&resbuf, VARDATA_ANY(txt3), len3); /* skip the replaced text in txt1 */ translate_char_pos( txt1_currptr, len1 - (txt1_currptr - txt1_startptr), uchar1 + pos, usearch_getMatchedLength(usearch), usearch_getMatchedLength(usearch), &txt1_currptr); do { int32_t previous_pos = pos + usearch_getMatchedLength(usearch); CHECK_FOR_INTERRUPTS(); pos = usearch_next(usearch, &status); if (U_FAILURE(status)) break; if (pos != USEARCH_DONE) { const char *txt1_nextptr; /* copy the segment before the match */ translate_char_pos( txt1_currptr, len1 - (txt1_currptr - txt1_startptr), uchar1 + previous_pos, len1 - previous_pos, pos - previous_pos, &txt1_nextptr); appendBinaryStringInfo(&resbuf, txt1_currptr, txt1_nextptr - txt1_currptr); /* compute the length of the replaced text in txt1 */ translate_char_pos( txt1_nextptr, len1 - (txt1_nextptr - txt1_startptr), uchar1 + pos, usearch_getMatchedLength(usearch), usearch_getMatchedLength(usearch), &txt1_currptr); /* append the replacement text */ appendBinaryStringInfo(&resbuf, VARDATA_ANY(txt3), len3); } } while (pos != USEARCH_DONE); /* copy the segment after the last match */ if (len1 - (txt1_currptr - txt1_startptr) > 0) { appendBinaryStringInfo(&resbuf, txt1_currptr, len1 - (txt1_currptr - txt1_startptr)); } result = cstring_to_text_with_len(resbuf.data, resbuf.len); pfree(resbuf.data); } else { /* * The substring is not found: 
return the original string */ result = txt1; } pfree(uchar1); pfree(uchar2); if (usearch != NULL) usearch_close(usearch); if (U_FAILURE(status)) elog(ERROR, "failed to perform ICU search: %s", u_errorName(status)); return result; } Datum icu_replace(PG_FUNCTION_ARGS) { UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION()); text *string; string = internal_str_replace( PG_GETARG_TEXT_PP(0), /* haystack */ PG_GETARG_TEXT_PP(1), /* needle */ PG_GETARG_TEXT_PP(2), /* replacement */ collator); PG_RETURN_TEXT_P(string); } Datum icu_replace_coll(PG_FUNCTION_ARGS) { const char *collname = text_to_cstring(PG_GETARG_TEXT_PP(3)); UCollator *collator = NULL; UErrorCode status = U_ZERO_ERROR; collator = ucol_open(collname, &status); if (!collator || U_FAILURE(status)) { elog(ERROR, "failed to open collation: %s", u_errorName(status)); } PG_RETURN_TEXT_P( internal_str_replace( PG_GETARG_TEXT_PP(0), /* haystack */ PG_GETARG_TEXT_PP(1), /* needle */ PG_GETARG_TEXT_PP(2), /* replacement */ collator) ); } icu_ext-1.8.0/icu_spoof.c000066400000000000000000000063661450461324700153330ustar00rootroot00000000000000/* * icu_spoof.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "unicode/uspoof.h" PG_FUNCTION_INFO_V1(icu_confusable_string_skeleton); PG_FUNCTION_INFO_V1(icu_spoof_check); PG_FUNCTION_INFO_V1(icu_confusable_strings_check); /* * Get the "skeleton" for an input string. * Two strings are confusable if their skeletons are identical. 
*/ Datum icu_confusable_string_skeleton(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); UErrorCode status = U_ZERO_ERROR; USpoofChecker *sc; int32_t ulen1, ulen_skel, result_len; UChar *uchar1, *uchar_skel; char *result; sc = uspoof_open(&status); if (!sc) elog(ERROR, "ICU uspoof_open failed"); ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); // maximum of equal length sounds like a sane guess for the first try ulen_skel = ulen1; uchar_skel = (UChar*) palloc((ulen_skel)*sizeof(UChar)); ulen_skel = uspoof_getSkeleton(sc, 0, uchar1, ulen1, uchar_skel, ulen_skel, &status); if (U_FAILURE(status) && status == U_BUFFER_OVERFLOW_ERROR) { // try again with a properly sized buffer status = U_ZERO_ERROR; pfree(uchar_skel); uchar_skel = (UChar*) palloc((ulen_skel)*sizeof(UChar)); ulen_skel = uspoof_getSkeleton(sc, 0, uchar1, ulen1, uchar_skel, ulen_skel, &status); } uspoof_close(sc); if (U_FAILURE(status)) elog(ERROR, "ICU uspoof_getSkeleton failed: %s", u_errorName(status)); result_len = icu_from_uchar(&result, uchar_skel, ulen_skel); PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len)); } /* * Check whether the input string is likely to be an attempt at * confusing a reader. */ Datum icu_spoof_check(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); UErrorCode status = U_ZERO_ERROR; USpoofChecker *sc; int32_t bitmask; int32_t ulen1; UChar *uchar1; sc = uspoof_open(&status); if (!sc) elog(ERROR, "ICU uspoof_open failed"); ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); bitmask = uspoof_check(sc, uchar1, ulen1, NULL, &status); uspoof_close(sc); if (U_FAILURE(status)) elog(ERROR, "ICU uspoof_areConfusable failed: %s", u_errorName(status)); PG_RETURN_BOOL(bitmask != 0); } /* * Check whether the two input strings are visually confusable with * each other. 
*/ Datum icu_confusable_strings_check(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); text *txt2 = PG_GETARG_TEXT_PP(1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); int32_t ulen1, ulen2; UChar *uchar1, *uchar2; USpoofChecker *sc; UErrorCode status = U_ZERO_ERROR; int32_t bitmask; sc = uspoof_open(&status); if (!sc) elog(ERROR, "ICU uspoof_open failed"); ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); ulen2 = icu_to_uchar(&uchar2, text_to_cstring(txt2), len2); bitmask = uspoof_areConfusable(sc, uchar1, ulen1, uchar2, ulen2, &status); uspoof_close(sc); if (U_FAILURE(status)) elog(ERROR, "ICU uspoof_areConfusable failed: %s", u_errorName(status)); PG_RETURN_BOOL(bitmask != 0); } icu_ext-1.8.0/icu_timestamptz.c000066400000000000000000000133541450461324700165610ustar00rootroot00000000000000/* * icu_timestamptz.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "pgtime.h" #include "utils/builtins.h" #include "utils/timestamp.h" #include "utils/pg_locale.h" #include "utils/date.h" #include "utils/datetime.h" /* ICU includes */ #include "unicode/ucal.h" #include "unicode/udat.h" #include "unicode/ustring.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_timestamptz_in); PG_FUNCTION_INFO_V1(icu_timestamptz_out); PG_FUNCTION_INFO_V1(icu_date_to_ts); PG_FUNCTION_INFO_V1(icu_ts_to_date); /* * icu_timestamptz_out() * Convert a timestamp to external form. 
*/ Datum icu_timestamptz_out(PG_FUNCTION_ARGS) { TimestampTz dt = PG_GETARG_TIMESTAMPTZ(0); char *result; int tz; struct pg_tm tt, *tm = &tt; fsec_t fsec; const char *tzn; char buf[MAXDATELEN + 1]; if (TIMESTAMP_NOT_FINITE(dt)) { EncodeSpecialTimestamp(dt, buf); result = pstrdup(buf); PG_RETURN_CSTRING(result); } else if (timestamp2tm(dt, &tz, tm, &fsec, &tzn, NULL) == 0) { UErrorCode status = U_ZERO_ERROR; UDateFormat* df = NULL; UDate udate = TS_TO_UDATE(dt); const char *locale = NULL; UChar *output_pattern = NULL; int32_t pattern_length = -1; UChar* tzid; int32_t tzid_length; UDateFormatStyle style = icu_ext_timestamptz_style; const char *pg_tz_name = pg_get_timezone_name(session_timezone); if (icu_ext_timestamptz_format != NULL) { if (icu_ext_timestamptz_format[0] != '\0' && icu_ext_timestamptz_style == UDAT_NONE) { pattern_length = icu_to_uchar(&output_pattern, icu_ext_timestamptz_format, strlen(icu_ext_timestamptz_format)); } } if (icu_ext_default_locale != NULL && icu_ext_default_locale[0] != '\0') { locale = icu_ext_default_locale; } /* use PG current timezone, hopefully compatible with ICU */ tzid_length = icu_to_uchar(&tzid, pg_tz_name, strlen(pg_tz_name)); /* if UDAT_PATTERN is passed, it must for both timeStyle and dateStyle */ df = udat_open(output_pattern ? UDAT_PATTERN : style, /* timeStyle */ output_pattern ? UDAT_PATTERN : style, /* dateStyle */ locale, /* NULL for the default locale */ tzid, /* tzID (NULL=default). 
*/ tzid_length, /* tzIDLength */ output_pattern, /* pattern */ pattern_length, /* patternLength */ &status); if (U_FAILURE(status)) elog(ERROR, "udat_open failed with code %d\n", status); { /* Try first to convert into a buffer on the stack, and palloc() it only if udat_format says it's too small */ UChar local_buf[MAXDATELEN]; int32_t u_buffer_size = udat_format(df, udate, local_buf, sizeof(local_buf)/sizeof(UChar), NULL, &status); if(status == U_BUFFER_OVERFLOW_ERROR) { UChar* u_buffer; status = U_ZERO_ERROR; u_buffer = (UChar*) palloc(u_buffer_size*sizeof(UChar)); udat_format(df, udate, u_buffer, u_buffer_size, NULL, &status); icu_from_uchar(&result, u_buffer, u_buffer_size); } else { icu_from_uchar(&result, local_buf, u_buffer_size); } } if (df) udat_close(df); PG_RETURN_CSTRING(result); } else ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); } /* * icu_timestamptz_in() * Convert a string to internal form. */ Datum icu_timestamptz_in(PG_FUNCTION_ARGS) { char *input_string = PG_GETARG_CSTRING(0); int32_t pattern_length = -1; UChar *u_ts_string; int32_t u_ts_length; UDateFormat* df = NULL; UDate udat; UDateFormatStyle style = icu_ext_timestamptz_style; UErrorCode status = U_ZERO_ERROR; UChar *input_pattern = NULL; const char *locale = NULL; int32_t parse_pos = 0; UChar* tzid; int32_t tzid_length; const char *pg_tz_name = pg_get_timezone_name(session_timezone); if (icu_ext_timestamptz_format != NULL) { if (icu_ext_timestamptz_format[0] != '\0' && style == UDAT_NONE) { pattern_length = icu_to_uchar(&input_pattern, icu_ext_timestamptz_format, strlen(icu_ext_timestamptz_format)); } } u_ts_length = icu_to_uchar(&u_ts_string, input_string, strlen(input_string)); if (icu_ext_default_locale != NULL && icu_ext_default_locale[0] != '\0') { locale = icu_ext_default_locale; } /* use PG current timezone, hopefully compatible with ICU */ tzid_length = icu_to_uchar(&tzid, pg_tz_name, strlen(pg_tz_name)); /* if UDAT_PATTERN is 
used, we must pass it for both timeStyle and dateStyle */ df = udat_open(input_pattern ? UDAT_PATTERN : style, /* timeStyle */ input_pattern ? UDAT_PATTERN : style, /* dateStyle */ locale, tzid, /* tzID */ tzid_length, /* tzIDLength */ input_pattern, pattern_length, &status); if (U_FAILURE(status)) { udat_close(df); elog(ERROR, "udat_open failed: %s\n", u_errorName(status)); } udat_setLenient(df, false); /* strict parsing */ udat = udat_parse(df, u_ts_string, u_ts_length, &parse_pos, &status); udat_close(df); if (U_FAILURE(status)) elog(ERROR, "udat_parse failed: %s\n", u_errorName(status)); PG_RETURN_TIMESTAMPTZ(UDATE_TO_TS(udat)); } /* * Conversions between icu_timestamptz and icu_date are exactly the * same as with the PG types timestamptz/date, since they share the * same internal representation. */ Datum icu_date_to_ts(PG_FUNCTION_ARGS) { return DirectFunctionCall2(date_timestamptz, PG_GETARG_DATUM(0), PG_GETARG_DATUM(1)); } Datum icu_ts_to_date(PG_FUNCTION_ARGS) { return DirectFunctionCall2(timestamptz_date, PG_GETARG_DATUM(0), PG_GETARG_DATUM(1)); } icu_ext-1.8.0/icu_transform.c000066400000000000000000000103771450461324700162150ustar00rootroot00000000000000/* * icu_transform.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2023. See LICENSE.md */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "utils/memutils.h" #include "unicode/uenum.h" #include "unicode/utrans.h" PG_FUNCTION_INFO_V1(icu_transforms_list); PG_FUNCTION_INFO_V1(icu_transform); /* * List the available pre-defined transforms/transliterations. 
 * Set-returning function: each call returns the next identifier of the
 * ICU enumeration opened on the first call, as text.
 */
Datum
icu_transforms_list(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	UErrorCode status = U_ZERO_ERROR;
	UEnumeration *en;
	const char *elt;

	if (SRF_IS_FIRSTCALL())
	{
		funcctx = SRF_FIRSTCALL_INIT();
		en = utrans_openIDs(&status);
		if (U_FAILURE(status))
			elog(ERROR, "utrans_openIDs failed: %s", u_errorName(status));
		/* keep the enumeration across calls */
		funcctx->user_fctx = (void *)en;
	}

	funcctx = SRF_PERCALL_SETUP();
	en = (UEnumeration*) funcctx->user_fctx;

	elt = uenum_next(en, NULL, &status);
	if (U_FAILURE(status))
		elog(ERROR, "uenum_next failed: %s", u_errorName(status));
	if (elt)
	{
		text* item = cstring_to_text(elt);
		SRF_RETURN_NEXT(funcctx, PointerGetDatum(item));
	}
	else
	{
		/* end of the enumeration: release it and finish the set */
		uenum_close(en);
		SRF_RETURN_DONE(funcctx);
	}
}

/*
 * Cache for the last transformation used.
 * This may come in handy in applications that use several times the same transformation
 *
 * utrans is the open transliterator, cached_utrans_id the identifier it
 * was opened with (allocated in TopMemoryContext, see icu_transform).
 */
static UTransliterator *utrans = NULL;
static char *cached_utrans_id = NULL;

/*
 * Main function to apply a transformation based on UTransliterator.
 * Input:
 *  1st arg: string to transform
 *  2nd arg: name (system identifier) of the transliterator
 *
 * Returns the transformed string. An error is raised if the
 * transliterator cannot be opened or if the transformation fails.
 */
Datum
icu_transform(PG_FUNCTION_ARGS)
{
	text *arg1 = PG_GETARG_TEXT_PP(0);
	text *arg2 = PG_GETARG_TEXT_PP(1);
	int32_t len1 = VARSIZE_ANY_EXHDR(arg1);
	const char *input_id = text_to_cstring(arg2);
	UErrorCode status = U_ZERO_ERROR;
	int32_t ulen, limit, capacity, start, original_ulen;
	int32_t result_len, in_ulen;
	UChar* utext;
	UChar* trans_id;
	char* result;
	UChar* original;
	bool done = false;

	/* drop the cached transliterator if a different one is requested */
	if (cached_utrans_id != NULL)
	{
		if (strcmp(cached_utrans_id, input_id) != 0)
		{
			pfree(cached_utrans_id);
			cached_utrans_id = NULL;
			utrans_close(utrans);
			utrans = NULL;
		}
	}

	if (utrans == NULL)
	{
		in_ulen = icu_to_uchar(&trans_id, input_id, strlen(input_id));

		utrans = utrans_openU(trans_id,
							  in_ulen,
							  UTRANS_FORWARD,
							  NULL, /* rules. NULL for system transliterators */
							  -1,
							  NULL, /* pointer to parseError. Not used */
							  &status);
		if (U_FAILURE(status) || !utrans)
		{
			elog(ERROR, "utrans_open failed: %s", u_errorName(status));
		}
		else
		{
			/* the identifier must outlive the current query */
			cached_utrans_id = MemoryContextStrdup(TopMemoryContext,
												   input_id);
		}
	}

	ulen = icu_to_uchar(&utext, text_to_cstring(arg1), len1);

	/* utext is terminated by a zero UChar that we include in the copy. */
	original = (UChar*) palloc((ulen+1)*sizeof(UChar));
	original_ulen = ulen;
	memcpy(original, utext, (ulen+1)*sizeof(UChar));

	limit = ulen;
	capacity = ulen + 1;
	start = 0;

	/*
	 * utrans_transUChars() updates the string in-place, stopping if
	 * it would go over `capacity`.
	 * The following loop doubles the capacity and restarts from
	 * scratch with a clean copy of the source if the buffer was
	 * too small.
	 * Although it looks like we could use `start` and `limit`
	 * to reallocate and make the transliteration continue from
	 * where it stopped, in practice this does not appear to work.
	 * The documentation is quite unclear about this function.
	 */
	do {
		status = U_ZERO_ERROR;
		utrans_transUChars(utrans,
						   utext,
						   &ulen,
						   capacity,
						   start, /* beginning index */
						   &limit,
						   &status);
		if (U_FAILURE(status))
		{
			if (status != U_BUFFER_OVERFLOW_ERROR)
			{
				elog(ERROR, "utrans_transUChars failed: %s", u_errorName(status));
			}
			else
			{
				pfree(utext);
				capacity = capacity * 2;
				utext = (UChar*) palloc(capacity*sizeof(UChar));
				/* restore the original text in the enlarged buffer */
				ulen = original_ulen;
				limit = ulen;
				memcpy(utext, original, (ulen+1)*sizeof(UChar));
			}
		}
		else
			done = true;
	} while (!done);

	/* convert the transformed UTF-16 buffer back for the text result */
	result_len = icu_from_uchar(&result, utext, ulen);

	PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len));
}
icu_ext-1.8.0/sql/000077500000000000000000000000001450461324700137655ustar00rootroot00000000000000icu_ext-1.8.0/sql/icu_ext--1.0--1.1.sql000066400000000000000000000022221450461324700170690ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.1'" to load this file.
\quit CREATE FUNCTION icu_char_name( c character ) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; COMMENT ON FUNCTION icu_char_name(character) IS 'Return the Unicode character name corresponding to the first codepoint of the input'; CREATE FUNCTION icu_number_spellout( num float8, locale text ) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; COMMENT ON FUNCTION icu_number_spellout(float8,text) IS 'Spell out the number according to the given locale'; CREATE FUNCTION icu_spoof_check( str text ) RETURNS boolean AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_spoof_check(text) IS 'Check whether the argument is likely to be an attempt at confusing a reader'; CREATE FUNCTION icu_confusable_strings_check( str1 text, str2 text ) RETURNS boolean AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_confusable_strings_check(text,text) IS 'Check whether the arguments are visually confusable with each other'; icu_ext-1.8.0/sql/icu_ext--1.1--1.2.sql000066400000000000000000000011251450461324700170720ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.2'" to load this file. \quit CREATE FUNCTION icu_transforms_list( ) RETURNS SETOF text AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_transforms_list() IS 'List the basic transforms available to icu_transform'; CREATE FUNCTION icu_transform(string text, trans text) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_transform(text,text) IS 'Apply a transformation through basic or compound transliterators and filters'; icu_ext-1.8.0/sql/icu_ext--1.2--1.3.sql000066400000000000000000000024101450461324700170720ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.3'" to load this file. 
\quit CREATE OR REPLACE FUNCTION icu_sort_key( str text, collator text ) RETURNS bytea AS 'MODULE_PATHNAME', 'icu_sort_key_coll' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 10; COMMENT ON FUNCTION icu_sort_key(text,text) IS 'Compute the binary sort key for the string given the collation'; CREATE OR REPLACE FUNCTION icu_sort_key( str text ) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 10; COMMENT ON FUNCTION icu_sort_key(text) IS 'Compute the binary sort key with the collate of the string'; CREATE OR REPLACE FUNCTION icu_compare( str1 text, str2 text ) RETURNS int AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_compare(text,text) IS 'Compare two strings with their ICU collation and return a signed int like strcoll'; CREATE OR REPLACE FUNCTION icu_compare( str1 text, str2 text, collator text ) RETURNS int AS 'MODULE_PATHNAME', 'icu_compare_coll' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_compare(text,text,text) IS 'Compare two strings with the given collation and return a signed int like strcoll'; icu_ext-1.8.0/sql/icu_ext--1.3--1.4.sql000066400000000000000000000016201450461324700170760ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.4'" to load this file. 
\quit CREATE OR REPLACE FUNCTION icu_strpos( string text, "substring" text ) RETURNS int4 AS 'MODULE_PATHNAME', 'icu_strpos' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 10; CREATE OR REPLACE FUNCTION icu_strpos( string text, "substring" text, collator text ) RETURNS int4 AS 'MODULE_PATHNAME', 'icu_strpos_coll' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 10; CREATE OR REPLACE FUNCTION icu_replace( string text, "from" text, "to" text ) RETURNS text AS 'MODULE_PATHNAME', 'icu_replace' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 100; CREATE OR REPLACE FUNCTION icu_replace( string text, "from" text, "to" text, collator text ) RETURNS text AS 'MODULE_PATHNAME', 'icu_replace_coll' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 100; icu_ext-1.8.0/sql/icu_ext--1.3.sql000066400000000000000000000131411450461324700165220ustar00rootroot00000000000000/* icu_ext.sql */ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "CREATE EXTENSION icu_ext" to load this file. 
\quit CREATE FUNCTION icu_version() RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; COMMENT ON FUNCTION icu_version() IS 'Version of the ICU library currently in use'; CREATE FUNCTION icu_unicode_version() RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; COMMENT ON FUNCTION icu_unicode_version() IS 'Version of the Unicode standard used by ICU'; CREATE FUNCTION icu_collation_attributes( IN collator text, IN exclude_defaults bool default false, OUT attribute text, OUT value text ) RETURNS SETOF record AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_collation_attributes(text,bool) IS 'List the attributes of an ICU collation'; CREATE FUNCTION icu_locales_list ( OUT name text, OUT country text, OUT country_code text, OUT language text, OUT language_code text, OUT script text, OUT direction text ) RETURNS SETOF record AS 'MODULE_PATHNAME' LANGUAGE C; COMMENT ON FUNCTION icu_locales_list() IS 'List the available ICU locales with their main properties'; CREATE FUNCTION icu_default_locale() RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; COMMENT ON FUNCTION icu_default_locale() IS 'Return the ICU locale currently used by default'; /* Set the default locale to some name and return the canonicalized name. 
*/ CREATE FUNCTION icu_set_default_locale(text) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_set_default_locale(text) IS 'Set the ICU locale used by default'; /* See http://userguide.icu-project.org/boundaryanalysis */ CREATE FUNCTION icu_character_boundaries( contents text, locale text ) RETURNS SETOF text AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_character_boundaries(text,text) IS 'Split text into characters, using boundary positions according to Unicode rules with the specified locale'; CREATE FUNCTION icu_word_boundaries( contents text, locale text, OUT tag int, OUT contents text ) RETURNS SETOF record AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_word_boundaries(text,text) IS 'Split text into words, using boundary positions according to Unicode rules with the specified locale'; CREATE FUNCTION icu_line_boundaries( contents text, locale text, OUT tag int, OUT contents text ) RETURNS SETOF record AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_line_boundaries(text,text) IS 'Split text into parts between which line breaks may occur, using rules of the specified locale'; CREATE FUNCTION icu_sentence_boundaries( contents text, locale text, OUT tag int, OUT contents text ) RETURNS SETOF record AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_sentence_boundaries(text,text) IS 'Split text into sentences, according to Unicode rules with the specified locale'; CREATE FUNCTION icu_compare( str1 text, str2 text ) RETURNS int AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_compare(text,text) IS 'Compare two strings with their ICU collation and return a signed int like strcoll'; CREATE FUNCTION icu_compare( str1 text, str2 text, collator text ) RETURNS int AS 'MODULE_PATHNAME', 'icu_compare_coll' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_compare(text,text,text) IS 'Compare two strings with the given collation 
and return a signed int like strcoll'; CREATE FUNCTION icu_case_compare( str1 text, str2 text ) RETURNS int AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_case_compare(text,text) IS 'Compare two strings case-insensitively using full case folding'; CREATE FUNCTION icu_sort_key( str text, collator text ) RETURNS bytea AS 'MODULE_PATHNAME', 'icu_sort_key_coll' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 10; COMMENT ON FUNCTION icu_sort_key(text,text) IS 'Compute the binary sort key for the string given the collation'; CREATE FUNCTION icu_sort_key( str text ) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE COST 10; COMMENT ON FUNCTION icu_sort_key(text) IS 'Compute the binary sort key with the collate of the string'; CREATE FUNCTION icu_char_name( c character ) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; COMMENT ON FUNCTION icu_char_name(character) IS 'Return the Unicode character name corresponding to the first codepoint of the input'; CREATE FUNCTION icu_number_spellout( num float8, locale text ) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE; COMMENT ON FUNCTION icu_number_spellout(float8,text) IS 'Spell out the number according to the given locale'; CREATE FUNCTION icu_spoof_check( str text ) RETURNS boolean AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_spoof_check(text) IS 'Check whether the argument is likely to be an attempt at confusing a reader'; CREATE FUNCTION icu_confusable_strings_check( str1 text, str2 text ) RETURNS boolean AS 'MODULE_PATHNAME' LANGUAGE C STRICT STABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_confusable_strings_check(text,text) IS 'Check whether the arguments are visually confusable with each other'; CREATE FUNCTION icu_transforms_list( ) RETURNS SETOF text AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_transforms_list() IS 'List the basic transforms available to icu_transform'; CREATE 
FUNCTION icu_transform(string text, trans text) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT; COMMENT ON FUNCTION icu_transform(text,text) IS 'Apply a transformation through basic or compound transliterators and filters'; icu_ext-1.8.0/sql/icu_ext--1.4--1.5.sql000066400000000000000000000002361450461324700171020ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.5'" to load this file. \quit icu_ext-1.8.0/sql/icu_ext--1.5--1.6.sql000066400000000000000000000013611450461324700171040ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.6'" to load this file. \quit CREATE OR REPLACE FUNCTION icu_normalize( string text, form text ) RETURNS text AS 'MODULE_PATHNAME', 'icu_normalize' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_normalize(text,text) IS 'Normalize the string into one of NFC, NFD, NFKC or NFKD Unicode forms'; CREATE OR REPLACE FUNCTION icu_is_normalized( string text, form text ) RETURNS bool AS 'MODULE_PATHNAME', 'icu_is_normalized' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_is_normalized(text,text) IS 'Test if the string is normalized in one of NFC, NFD, NFKC or NFKD Unicode forms'; icu_ext-1.8.0/sql/icu_ext--1.6--1.6.1.sql000066400000000000000000000002401450461324700172370ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.6.1'" to load this file. \quit icu_ext-1.8.0/sql/icu_ext--1.6.1--1.6.2.sql000066400000000000000000000002401450461324700173770ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.6.2'" to load this file. 
\quit icu_ext-1.8.0/sql/icu_ext--1.6.2--1.7.sql000066400000000000000000000006751450461324700172550ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.7'" to load this file. \quit CREATE OR REPLACE FUNCTION icu_confusable_string_skeleton( string text ) RETURNS text AS 'MODULE_PATHNAME', 'icu_confusable_string_skeleton' LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_confusable_string_skeleton(text) IS 'Get the skeleton for an input string'; icu_ext-1.8.0/sql/icu_ext--1.7--1.8.sql000066400000000000000000000304211450461324700171070ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.8'" to load this file. \quit /* Interface to udat_parse(). The calendar is typically set in the locale argument. */ CREATE FUNCTION icu_parse_date( date_string text, format text, locale text ) RETURNS date AS 'MODULE_PATHNAME', 'icu_parse_date_locale' LANGUAGE C STRICT STABLE PARALLEL SAFE; /* Interface to udat_parse(), without the locale argument. */ CREATE FUNCTION icu_parse_date( date_string text, format text ) RETURNS date AS 'MODULE_PATHNAME', 'icu_parse_date_default_locale' LANGUAGE C STRICT STABLE PARALLEL SAFE; COMMENT ON FUNCTION icu_parse_date(text,text,text) IS 'Convert a locale-formatted string into a date, using the supplied locale'; COMMENT ON FUNCTION icu_parse_date(text,text) IS 'Convert a locale-formatted string into a date, using the default locale'; CREATE FUNCTION icu_parse_datetime( date_string text, format text, locale text ) RETURNS timestamptz AS 'MODULE_PATHNAME', 'icu_parse_datetime_locale' LANGUAGE C STRICT STABLE PARALLEL SAFE; /* Interface to udat_parse(), without the locale argument. 
*/
CREATE FUNCTION icu_parse_datetime(
 date_string text,
 format text
) RETURNS timestamptz
AS 'MODULE_PATHNAME', 'icu_parse_datetime_default_locale'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_parse_datetime(text,text,text)
IS 'Convert a locale-formatted string into a timestamptz, using the supplied locale';

COMMENT ON FUNCTION icu_parse_datetime(text,text)
IS 'Convert a locale-formatted string into a timestamptz, using the default locale';

/* Interface to udat_format(). The calendar is typically set in the
   locale argument. */
-- Declared STABLE like every other date/time function of this script.
-- NOTE(review): the text produced for a timestamptz presumably depends on
-- the session time zone, which rules out IMMUTABLE; confirm against the C
-- implementation of icu_format_datetime_locale.
CREATE FUNCTION icu_format_datetime(
 tstamp timestamptz,
 format text,
 locale text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_format_datetime_locale'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_format_datetime(timestamptz,text,text)
IS 'Convert a time stamp into a string according to the given locale and format';

CREATE FUNCTION icu_format_datetime(
 tstamp timestamptz,
 format text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_format_datetime_default_locale'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_format_datetime(timestamptz,text)
IS 'Convert a time stamp into a string according to the given format and default locale';

-- Same as icu_format_datetime(), for an input of type date.
CREATE FUNCTION icu_format_date(
 input date,
 format text,
 locale text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_format_date_locale'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE FUNCTION icu_format_date(
 input date,
 format text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_format_date_default_locale'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

---
--- icu_date datatype
---
-- The type shares the storage properties of pg_catalog.date (LIKE) and
-- only differs by its text input/output functions.
CREATE FUNCTION icu_date_in(cstring)
RETURNS icu_date
AS 'MODULE_PATHNAME', 'icu_date_in'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE FUNCTION icu_date_out(icu_date)
RETURNS cstring
AS 'MODULE_PATHNAME', 'icu_date_out'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE TYPE icu_date (
 INPUT = icu_date_in,
 OUTPUT = icu_date_out,
 LIKE = pg_catalog.date
);

CREATE FUNCTION icu_date_add_days(icu_date, int4)
RETURNS icu_date
AS 'MODULE_PATHNAME', 'icu_date_add_days'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

CREATE FUNCTION icu_date_days_add(int4, icu_date)
RETURNS icu_date
AS 'MODULE_PATHNAME', 'icu_date_days_add'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

-- Comparisons reuse the built-in implementations for the date type.
CREATE FUNCTION icu_date_eq (icu_date, icu_date)
RETURNS bool
LANGUAGE internal AS 'date_eq' IMMUTABLE STRICT;

CREATE FUNCTION icu_date_ne (icu_date, icu_date)
RETURNS bool
LANGUAGE internal AS 'date_ne' IMMUTABLE STRICT;

CREATE FUNCTION icu_date_gt (icu_date, icu_date)
RETURNS bool
LANGUAGE internal AS 'date_gt' IMMUTABLE STRICT;

CREATE FUNCTION icu_date_ge (icu_date, icu_date)
RETURNS bool
LANGUAGE internal AS 'date_ge' IMMUTABLE STRICT;

CREATE FUNCTION icu_date_lt (icu_date, icu_date)
RETURNS bool
LANGUAGE internal AS 'date_lt' IMMUTABLE STRICT;

CREATE FUNCTION icu_date_le (icu_date, icu_date)
RETURNS bool
LANGUAGE internal AS 'date_le' IMMUTABLE STRICT;

CREATE FUNCTION icu_date_cmp (icu_date, icu_date)
RETURNS int4
LANGUAGE internal AS 'date_cmp' IMMUTABLE STRICT;

-- Comparison operators for icu_date.
-- Each operator names its true negator (= / <>, < / >=, > / <=) and its
-- commutator, because the planner uses these links to rewrite clauses.
-- MERGES is declared on "=" only, the sole equality operator; HASHES is
-- not declared because this script defines only a btree operator class
-- for the type, and a hash join needs a hash operator class.
CREATE OPERATOR = (
  PROCEDURE = icu_date_eq,
  LEFTARG = icu_date,
  RIGHTARG = icu_date,
  COMMUTATOR = =,
  NEGATOR = <>,
  MERGES
);

CREATE OPERATOR <> (
  PROCEDURE = icu_date_ne,
  LEFTARG = icu_date,
  RIGHTARG = icu_date,
  COMMUTATOR = <>,
  NEGATOR = =
);

CREATE OPERATOR > (
  PROCEDURE = icu_date_gt,
  LEFTARG = icu_date,
  RIGHTARG = icu_date,
  COMMUTATOR = <,
  NEGATOR = <=
);

CREATE OPERATOR >= (
  PROCEDURE = icu_date_ge,
  LEFTARG = icu_date,
  RIGHTARG = icu_date,
  COMMUTATOR = <=,
  NEGATOR = <
);

CREATE OPERATOR < (
  PROCEDURE = icu_date_lt,
  LEFTARG = icu_date,
  RIGHTARG = icu_date,
  COMMUTATOR = >,
  NEGATOR = >=
);

CREATE OPERATOR <= (
  PROCEDURE = icu_date_le,
  LEFTARG = icu_date,
  RIGHTARG = icu_date,
  COMMUTATOR = >=,
  NEGATOR = >
);

CREATE OPERATOR CLASS icu_date_ops
DEFAULT FOR TYPE icu_date USING btree AS
  OPERATOR 1 <,
  OPERATOR 2 <=,
  OPERATOR 3 =,
  OPERATOR 4 >=,
  OPERATOR 5 >,
  FUNCTION 1 icu_date_cmp(icu_date, icu_date);

CREATE OPERATOR + (
  PROCEDURE = icu_date_add_days,
  LEFTARG = icu_date,
  RIGHTARG = int4
);

CREATE OPERATOR + (
  PROCEDURE = icu_date_days_add,
  LEFTARG = int4,
  RIGHTARG = icu_date
);

CREATE CAST (icu_date AS date) WITHOUT FUNCTION AS IMPLICIT;
CREATE CAST (date AS icu_date) WITHOUT FUNCTION AS IMPLICIT;

---
--- icu_timestamptz datatype
---
-- The type shares the storage properties of pg_catalog.timestamptz (LIKE)
-- and only differs by its text input/output functions.
CREATE FUNCTION icu_timestamptz_in(cstring)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME', 'icu_timestamptz_in'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE FUNCTION icu_timestamptz_out(icu_timestamptz)
RETURNS cstring
AS 'MODULE_PATHNAME', 'icu_timestamptz_out'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE TYPE icu_timestamptz (
 INPUT = icu_timestamptz_in,
 OUTPUT = icu_timestamptz_out,
 LIKE = pg_catalog.timestamptz
);

-- Comparisons reuse the built-in timestamp implementations.
CREATE FUNCTION icu_timestamptz_eq (icu_timestamptz, icu_timestamptz)
RETURNS bool
LANGUAGE internal AS 'timestamp_eq' IMMUTABLE STRICT;

CREATE FUNCTION icu_timestamptz_ne (icu_timestamptz, icu_timestamptz)
RETURNS bool
LANGUAGE internal AS 'timestamp_ne' IMMUTABLE STRICT;

CREATE FUNCTION icu_timestamptz_gt (icu_timestamptz, icu_timestamptz)
RETURNS bool
LANGUAGE internal AS 'timestamp_gt' IMMUTABLE STRICT;

CREATE FUNCTION icu_timestamptz_ge (icu_timestamptz, icu_timestamptz)
RETURNS bool
LANGUAGE internal AS 'timestamp_ge' IMMUTABLE STRICT;

CREATE FUNCTION icu_timestamptz_lt (icu_timestamptz, icu_timestamptz)
RETURNS bool
LANGUAGE internal AS 'timestamp_lt' IMMUTABLE STRICT;

CREATE FUNCTION icu_timestamptz_le (icu_timestamptz, icu_timestamptz)
RETURNS bool
LANGUAGE internal AS 'timestamp_le' IMMUTABLE STRICT;

CREATE FUNCTION icu_timestamptz_cmp (icu_timestamptz, icu_timestamptz)
RETURNS int4
LANGUAGE internal AS 'timestamp_cmp' IMMUTABLE STRICT;

-- Comparison operators for icu_timestamptz.
-- Each operator names its true negator (= / <>, < / >=, > / <=) and its
-- commutator, because the planner uses these links to rewrite clauses.
-- MERGES is declared on "=" only, the sole equality operator; HASHES is
-- not declared because this script defines only a btree operator class
-- for the type, and a hash join needs a hash operator class.
CREATE OPERATOR = (
  PROCEDURE = icu_timestamptz_eq,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = =,
  NEGATOR = <>,
  MERGES
);

CREATE OPERATOR <> (
  PROCEDURE = icu_timestamptz_ne,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = <>,
  NEGATOR = =
);

CREATE OPERATOR > (
  PROCEDURE = icu_timestamptz_gt,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = <,
  NEGATOR = <=
);

CREATE OPERATOR >= (
  PROCEDURE = icu_timestamptz_ge,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = <=,
  NEGATOR = <
);

CREATE OPERATOR < (
  PROCEDURE = icu_timestamptz_lt,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = >,
  NEGATOR = >=
);

CREATE OPERATOR <= (
  PROCEDURE = icu_timestamptz_le,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = >=,
  NEGATOR = >
);

CREATE OPERATOR CLASS icu_timestamptz_ops
DEFAULT FOR TYPE icu_timestamptz USING btree AS
  OPERATOR 1 <,
  OPERATOR 2 <=,
  OPERATOR 3 =,
  OPERATOR 4 >=,
  OPERATOR 5 >,
  FUNCTION 1 icu_timestamptz_cmp(icu_timestamptz, icu_timestamptz);

CREATE CAST (icu_timestamptz AS timestamptz) WITHOUT FUNCTION AS IMPLICIT;
CREATE CAST (timestamptz AS icu_timestamptz) WITHOUT FUNCTION AS IMPLICIT;

--
-- icu_interval datatype
---
CREATE FUNCTION icu_interval_in(cstring)
RETURNS icu_interval
AS 'MODULE_PATHNAME', 'icu_interval_in'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE FUNCTION icu_interval_out(icu_interval)
RETURNS cstring
AS 'MODULE_PATHNAME', 'icu_interval_out'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE TYPE icu_interval (
 INPUT = icu_interval_in,
 OUTPUT = icu_interval_out,
 INTERNALLENGTH = 20,
 ALIGNMENT = 'double'
-- SEND = icu_interval_send,
-- RECEIVE = interval_recv
);

CREATE FUNCTION icu_from_interval (interval)
RETURNS icu_interval
AS 'MODULE_PATHNAME', 'icu_from_interval'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE CAST (interval AS icu_interval)
 WITH FUNCTION icu_from_interval(interval)
 AS IMPLICIT;

--
-- Non-implicit casts
--
CREATE FUNCTION icu_date_to_ts(icu_date)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME', 'icu_date_to_ts'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE CAST (icu_date AS icu_timestamptz)
 WITH FUNCTION icu_date_to_ts(icu_date)
 AS ASSIGNMENT;

CREATE FUNCTION icu_ts_to_date(icu_timestamptz)
RETURNS icu_date
AS
'MODULE_PATHNAME', 'icu_ts_to_date'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

CREATE CAST (icu_timestamptz AS icu_date)
 WITH FUNCTION icu_ts_to_date(icu_timestamptz)
 AS ASSIGNMENT;

CREATE CAST (timestamptz AS icu_date)
 WITH FUNCTION icu_ts_to_date(icu_timestamptz)
 AS ASSIGNMENT;

--
-- Functions and operators combining types
--

/* icu_timestamptz plus icu_interval */
CREATE FUNCTION icu_timestamptz_add_interval(icu_timestamptz, icu_interval)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME', 'icu_timestamptz_add_interval'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

/* icu_interval plus icu_timestamptz */
CREATE FUNCTION icu_interval_add_timestamptz(icu_interval, icu_timestamptz)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME', 'icu_interval_add_timestamptz'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

/* icu_timestamptz minus icu_interval */
CREATE FUNCTION icu_timestamptz_sub_interval(icu_timestamptz, icu_interval)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME', 'icu_timestamptz_sub_interval'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

/* icu_interval multiplied by integer */
CREATE FUNCTION icu_interval_mul(icu_interval, int)
RETURNS icu_interval
AS 'MODULE_PATHNAME', 'icu_interval_mul'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

/* integer multiplied by icu_interval */
CREATE FUNCTION icu_mul_i_interval(int, icu_interval)
RETURNS icu_interval
AS 'MODULE_PATHNAME', 'icu_mul_i_interval'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

-- The two "+" operators below are commutators of each other
-- (operand types swapped), as are the two "*" operators.
CREATE OPERATOR + (
  PROCEDURE = icu_timestamptz_add_interval,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_interval,
  COMMUTATOR = +
);

CREATE OPERATOR + (
  PROCEDURE = icu_interval_add_timestamptz,
  LEFTARG = icu_interval,
  RIGHTARG = icu_timestamptz,
  COMMUTATOR = +
);

CREATE OPERATOR - (
  PROCEDURE = icu_timestamptz_sub_interval,
  LEFTARG = icu_timestamptz,
  RIGHTARG = icu_interval
);

CREATE OPERATOR * (
  PROCEDURE = icu_interval_mul,
  LEFTARG = icu_interval,
  RIGHTARG = int,
  COMMUTATOR = *
);

CREATE OPERATOR * (
  PROCEDURE = icu_mul_i_interval,
  LEFTARG = int,
  RIGHTARG = icu_interval,
  COMMUTATOR = *
);

/* icu_interval plus icu_interval */
CREATE FUNCTION icu_interv_plus_interv(icu_interval, icu_interval)
RETURNS icu_interval
AS 'MODULE_PATHNAME', 'icu_interv_plus_interv'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

/* icu_interval minus icu_interval */
CREATE FUNCTION icu_interv_minus_interv(icu_interval, icu_interval)
RETURNS icu_interval
AS 'MODULE_PATHNAME', 'icu_interv_minus_interv'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

CREATE OPERATOR + (
  PROCEDURE = icu_interv_plus_interv,
  LEFTARG = icu_interval,
  RIGHTARG = icu_interval,
  COMMUTATOR = +
);

CREATE OPERATOR - (
  PROCEDURE = icu_interv_minus_interv,
  LEFTARG = icu_interval,
  RIGHTARG = icu_interval
);

/* icu_date plus icu_interval */
CREATE FUNCTION icu_date_plus_interval(icu_date, icu_interval)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

/* icu_date minus icu_interval */
CREATE FUNCTION icu_date_minus_interval(icu_date, icu_interval)
RETURNS icu_timestamptz
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

-- No COMMUTATOR is declared here: this script defines no
-- "icu_interval + icu_date" operator, so naming one would only leave an
-- unusable shell operator behind in the catalog.
CREATE OPERATOR + (
  PROCEDURE = icu_date_plus_interval,
  LEFTARG = icu_date,
  RIGHTARG = icu_interval
);

CREATE OPERATOR - (
  PROCEDURE = icu_date_minus_interval,
  LEFTARG = icu_date,
  RIGHTARG = icu_interval
);
icu_ext-1.8.0/sql/tests-01.sql000066400000000000000000000047731450461324700161010ustar00rootroot00000000000000-- regression tests for icu_ext CREATE EXTENSION icu_ext; -- Check that the database has the built-in ICU collations -- required by the tests SELECT collname FROM pg_collation WHERE collname IN ('und-x-icu', 'en-x-icu') ORDER BY collname; -- icu_char_name SELECT c, to_hex(ascii(c)), icu_char_name(c) FROM regexp_split_to_table('El Niño', '') as c; -- icu_character_boundaries SELECT * FROM icu_character_boundaries('Ete'||E'\u0301', 'fr') as chars; -- icu_collation_attributes SELECT * FROM icu_collation_attributes('en') WHERE attribute <> 'version'; -- icu_compare SELECT icu_compare('abcé', 'abce', 'en@colStrength=primary;colCaseLevel=yes'); SELECT icu_compare('Abcé', 
'abce' COLLATE "en-x-icu"); -- icu_confusable_strings_check SELECT txt, icu_confusable_strings_check('phil', txt) AS confusable FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l')) AS s(txt); -- icu_confusable_string_skeleton SELECT txt, icu_confusable_string_skeleton(txt) AS skeleton FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l'), (E'\u2026\u2026')) AS s(txt); -- icu_line_boundaries SELECT *,convert_to( contents, 'utf-8') FROM icu_line_boundaries( $$Thus much let me avow You are not wrong, who deem That my days have been a dream; Yet if hope has flown away In a night, or in a day,$$ , 'en'); -- icu_number_spellout /* use the unaligned format for this test. With the aligned format, there are environment-related differences in how psql computes the width of strings containing U+00AD (soft hyphen) */ \pset format unaligned SELECT loc, icu_number_spellout(1234, loc) FROM (values ('en'),('fr'),('de'),('ru'),('ja')) AS s(loc); \pset format aligned -- icu_replace SELECT n, icu_replace( n, 'jeanrene', '{firstname}', 'und@colStrength=primary;colAlternate=shifted') FROM (values('jeanrenédupont'),('Jean-René Dupont')) as s(n) ORDER BY n COLLATE "C"; -- icu_sentence_boundaries SELECT * FROM icu_sentence_boundaries('Call me Mr. Brown. 
It''s a movie.', 'en@ss=standard'); -- icu_strpos SELECT v,icu_strpos('hey rene', v, 'und@colStrength=primary;colAlternate=shifted') FROM (VALUES ('René'), ('rené'), ('Rene'), ('n'), ('në'), ('no'), (''), (null)) AS s(v) ORDER BY v COLLATE "C"; -- icu_transform SELECT icu_transform('10\N{SUPERSCRIPT MINUS}\N{SUPERSCRIPT FOUR}' '\N{MICRO SIGN}m = 1 \N{ANGSTROM SIGN}', 'Name-Any'); SELECT icu_transform('Ich muß essen.', '[:^ascii:]; Hex'); -- icu_word_boundaries SELECT * FROM icu_word_boundaries($$Do you like O'Reilly books?$$, 'en'); icu_ext-1.8.0/sql/tests-datetime.sql000066400000000000000000000021551450461324700174450ustar00rootroot00000000000000/* test date and time support */ \set format unaligned set icu_ext.locale to 'en@calendar=gregorian'; set icu_ext.timestamptz_format to 'YYYY-MM-dd HH:mm:ss'; set timezone to 'Europe/Paris'; -- DST transition to summer time select '2023-03-25 00:00:00'::timestamptz + '26.5 hours'::interval AS "core", '2023-03-25 00:00:00'::icu_timestamptz + '26.5 hours'::icu_interval AS "ext"; set icu_ext.locale to 'en@calendar=ethiopic'; set icu_ext.date_format to '{short}'; set icu_ext.timestamptz_format to '{short}'; -- 13-month year with 5 days in the last month select '1/13/2016 ERA1'::icu_date + icu_interval '12 months' as d1, '1/13/2016 ERA1'::icu_date + icu_interval '13 months' as d2, '1/13/2016 ERA1'::icu_date + icu_interval '1 year' as d3; select '13/5/2016 ERA1'::icu_date + 1; set icu_ext.locale to 'en@calendar=gregorian'; select icu_parse_date('17/10/2023', 'dd/MM/yyyy'); select icu_parse_datetime('17/10/2023', 'dd/MM/yyyy'); select icu_parse_datetime('17/10/2023 12:02:40.653', 'dd/MM/yyyy HH:mm:ss.S'); set timezone to 'GMT'; select icu_parse_datetime('17/10/2023 12:02:40.653', 'dd/MM/yyyy HH:mm:ss.S');