pax_global_header00006660000000000000000000000064137604712410014517gustar00rootroot0000000000000052 comment=4e9e44cd96332c1a9c1d8011a9731ef407437f2c icu_ext-1.6.2/000077500000000000000000000000001376047124100131655ustar00rootroot00000000000000icu_ext-1.6.2/.gitignore000066400000000000000000000006561376047124100151640ustar00rootroot00000000000000# Prerequisites *.d # Object files *.o *.ko *.obj *.elf # Linker output *.ilk *.map *.exp # Precompiled Headers *.gch *.pch # Libraries *.lib *.a *.la *.lo # Shared objects (inc. Windows DLLs) *.dll *.so *.so.* *.dylib # Executables *.exe a.out *.app *.i*86 *.x86_64 *.hex # Debug files *.dSYM/ *.su *.idb *.pdb # Kernel Module Compile Results *.mod* *.cmd .tmp_versions/ modules.order Module.symvers Mkfile.old dkms.conf icu_ext-1.6.2/LICENSE.md000066400000000000000000000017151376047124100145750ustar00rootroot00000000000000# Copyright and License Copyright (c) 2018-2020, Daniel Vérité Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. In no event shall Daniel Vérité be liable to any party for direct, indirect, special, incidental, or consequential damages, including lost profits, arising out of the use of this software and its documentation, even if Daniel Vérité has been advised of the possibility of such damage. Daniel Vérité specifically disclaims any warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The software provided hereunder is on an "AS IS" basis, and Daniel Vérité has no obligations to provide maintenance, support, updates, enhancements, or modifications. 
icu_ext-1.6.2/META.json000066400000000000000000000020671376047124100146130ustar00rootroot00000000000000{ "name": "icu_ext", "abstract": "Extension to expose functionality from the ICU (Unicode) library", "version": "1.6.2", "release_status": "stable", "maintainer": "Daniel Vérité ", "license": "postgresql", "prereqs": { "runtime": { "requires": { "PostgreSQL": "10.0.0" } } }, "provides": { "icu_ext": { "file": "sql/icu_ext--1.3.sql", "version": "1.6.2", "abstract": "Extension to expose functionality from the ICU (Unicode) library" } }, "resources": { "bugtracker": { "web": "https://github.com/dverite/icu_ext/issues" }, "repository": { "url": "git://github.com/dverite/icu_ext.git" , "web": "https://github.com/dverite/icu_ext", "type": "git" } }, "meta-spec": { "version": "1.0.0", "url": "http://pgxn.org/meta/spec.txt" }, "tags": [ "icu", "unicode", "collation" ] } icu_ext-1.6.2/Makefile000066400000000000000000000005461376047124100146320ustar00rootroot00000000000000EXTENSION = icu_ext EXTVERSION = 1.6.2 PG_CONFIG = pg_config DATA = $(wildcard sql/icu_*.sql) MODULE_big = icu_ext OBJS = icu_ext.o icu_break.o icu_num.o icu_spoof.o icu_transform.o \ icu_search.o icu_normalize.o SHLIB_LINK = $(ICU_LIBS) REGRESS = tests-01 EXTRA_CLEAN = expected/tests.out all: PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) icu_ext-1.6.2/README.md000066400000000000000000000641251376047124100144540ustar00rootroot00000000000000# icu_ext An extension to expose functionality from [ICU](http://icu-project.org) to PostgreSQL applications. It requires PostgreSQL version 10 or newer, configured with ICU (--with-icu). Note: this text is in GitHub Flavored Markdown format. Please see the version [on github](https://github.com/dverite/icu_ext/blob/master/README.md) if it's rendered weirdly elsewhere. ## Installation The Makefile uses the [PGXS infrastructure](https://www.postgresql.org/docs/current/static/extend-pgxs.html) to find include and library files and determine the install location. 
Build and install with: $ make $ (sudo) make install ## Functions ### Quick links (in alphabetical order) [icu_char_name](#icu_char_name) [icu_character_boundaries](#icu_character_boundaries) [icu_collation_attributes](#icu_collation_attributes) [icu_compare](#icu_compare) [icu_confusable_strings_check](#icu_confusable_strings_check) [icu_default_locale](#icu_default_locale) [icu_is_normalized](#icu_is_normalized) [icu_line_boundaries](#icu_line_boundaries) [icu_locales_list](#icu_locales_list) [icu_normalize](#icu_normalize) [icu_number_spellout](#icu_number_spellout) [icu_replace](#icu_replace) [icu_sentence_boundaries](#icu_sentence_boundaries) [icu_set_default_locale](#icu_set_default_locale) [icu_sort_key](#icu_sort_key) [icu_spoof_check](#icu_spoof_check) [icu_strpos](#icu_strpos) [icu_transform](#icu_transform) [icu_transforms_list](#icu_transforms_list) [icu_unicode_version](#icu_unicode_version) [icu_version](#icu_version) [icu_word_boundaries](#icu_word_boundaries) These functions work in both Unicode and non-Unicode databases. ### icu_version() Returns the version of the ICU library linked with the server. ### icu_unicode_version() Returns the version of the Unicode standard used by the ICU library linked with the server. ### icu_locales_list() Returns a table-type list of available ICU locales with their main properties (country code and name, language code and name, script, direction). When translations are available, the country and language names are localized with the default ICU locale, configurable with `icu_set_default_locale()`. Set it to `en` to force english names. 
Examples: =# SELECT * FROM icu_locales_list() where name like 'es%' limit 5; name | country | country_code | language | language_code | script | direction --------+---------------+--------------+----------+---------------+--------+----------- es | | | Spanish | spa | | LTR es_419 | Latin America | | Spanish | spa | | LTR es_AR | Argentina | ARG | Spanish | spa | | LTR es_BO | Bolivia | BOL | Spanish | spa | | LTR es_CL | Chile | CHL | Spanish | spa | | LTR =# SELECT name,country FROM icu_locales_list() where script='Simplified Han'; name | country ------------+--------------------- zh_Hans | zh_Hans_CN | China zh_Hans_HK | Hong Kong SAR China zh_Hans_MO | Macau SAR China zh_Hans_SG | Singapore This list is obtained independently from the collations declared to PostgreSQL (found in `pg_collation`). ### icu_collation_attributes(`collator` text [, `exclude_defaults` bool]) Lists the attributes, version and display name of an ICU collation, returned as a set of `(attribute,value)` tuples. The `collator` argument must designate an [ICU collator](http://userguide.icu-project.org/collation/api) and accepts several different syntaxes. In particular, a [locale ID](http://userguide.icu-project.org/locale) or (if ICU>=54) [language tags](http://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Settings) may be used. Note that this argument is **not** a reference to a PostgreSQL collation, and that this function does not depend on whether a corresponding collation has been instantiated in the database with [`CREATE COLLATION`](https://www.postgresql.org/docs/current/static/sql-createcollation.html). To query the properties of an already created PostgreSQL ICU collation, refer to `pg_collation.collcollate` (which corresponds to the `lc_collate` argument of CREATE COLLATION). 
=# SELECT a.attribute,a.value FROM pg_collation JOIN LATERAL icu_collation_attributes(collcollate) a ON (collname='fr-CA-x-icu'); attribute | value -------------+------------------- displayname | français (Canada) kn | false kb | true kk | false ka | noignore ks | level3 kf | false kc | false kv | punct version | 153.80.33 `icu_collation_attributes()` is useful to check that the settings embedded into a collation name activate the intended options, because ICU parses them in a way that non-conformant parts tend to be silently ignored, and because the interpretation somewhat depends on the ICU version (in particular, pre-54 versions do not support options expressed as BCP-47 tags). It may be also useful to search existing collations by their properties. When `exclude_defaults` is set to `true`, attributes that have their default value are filtered out, to put in evidence the specifics of collations. For instance, to find the only collations that use `shifted` for the `Alternate` attribute: =# SELECT collname,collcollate,a.attribute,a.value FROM pg_collation JOIN LATERAL icu_collation_attributes(collcollate,true) a ON (attribute='ka') ; collname | collcollate | attribute | value -------------+-------------+-----------+--------- th-x-icu | th | ka | shifted th-TH-x-icu | th-TH | ka | shifted (2 rows) By default there is no filtering (`exclude_defaults` = false) so that all attributes known by the function as well as the collation version number are reported. Example of checking a collation without any reference to `pg_collation`: =# SELECT * FROM icu_collation_attributes('fr-u-ks-level2-kn'); attribute | value -----------+---------- kn | true kb | false kk | false ka | noignore ks | level2 kf | false kc | false version | 153.64 `icu_collation_attributes()` will error out if ICU is unable to open a collator with the given argument. 
### icu_sort_key(`string` text [, `collator` text]) Returns the binary sort key (type: `bytea`) corresponding to the string with the given collation. See http://userguide.icu-project.org/collation/architecture#TOC-Sort-Keys When a `collator` argument is passed, it is interpreted as an ICU locale independently of the persistent collations instantiated in the database. When there is no `collator` argument, the collation associated to `string` gets used to generate the sort key. It must be an ICU collation or the function will error out. This form with a single argument is faster due to Postgres keeping its collations "open" (in the sense of `ucol_open()/ucol_close()`) for the duration of the session, whereas the other form with the explicit `collator` argument does open and close the ICU collation for each call. Binary sort keys may be useful to circumvent a core PostgreSQL limitation that two strings that differ in their byte representation are never considered equal by deterministic collations (see for instance [this thread](https://www.postgresql.org/message-id/7f0120e8945c4befac964777d31912d7%40exmbdft5.ad.twosigma.com) in the pgsql-bugs mailing-list for a discussion of this problem in relation with the ICU integration). With PostgreSQL 12 or newer versions, the "deterministic" property can be set to `false` by [`CREATE COLLATION`](https://www.postgresql.org/docs/current/sql-createcollation.html) to request that string comparisons with these collations skip the tie-breaker. With older versions, "deterministic" is always `true`. You may order or rank by binary sort keys, or materialize them in a unique index to achieve at the SQL level what cannot be done internally by persistent collations, either because PostgreSQL is not recent enough or because you don't want or lack the permission to instantiate nondeterministic collations. 
The function is declared IMMUTABLE to be usable in indexes, but please be aware that it's only true as far as the "version" of the collation doesn't change. (Typically it changes with every version of Unicode). In short, consider rebuilding the affected indexes on ICU upgrades. To simply compare pairs of strings, consider `icu_compare()` instead. Example demonstrating a case-insensitive, accent-insensitive unique index: =# CREATE TABLE uniq(name text); =# CREATE UNIQUE INDEX idx ON uniq((icu_sort_key(name, 'fr-u-ks-level1'))); =# INSERT INTO uniq values('été'); INSERT 0 1 =# INSERT INTO uniq values('Ête'); ERROR: duplicate key value violates unique constraint "idx" DETAIL: Key (icu_sort_key(name, 'fr-u-ks-level1'::text))=(\x314f31) already exists. =# insert into uniq values('Êtes'); INSERT 0 1 ### icu_compare(`string1` text, `string2` text [, `collator` text]) Compare two strings with the given collation. Return the result as a signed integer, similarly to strcoll(), that is, the result is negative if string1 < string2, zero if string1 = string2, and positive if string1 > string2. When a `collator` argument is passed, it is taken as the ICU locale (independently of the collations instantiated in the database) to use to collate the strings. When there is no `collator` argument, the collation associated to `string1` and `string2` gets used for the comparison. It must be an ICU collation and it must be the same for the two arguments or the function will error out. With PostgreSQL 12 or newer, it can be nondeterministic, but whether it is nondeterministic or deterministic will not make any difference in the result of `icu_compare`, contrary to comparisons done by PostgreSQL core with the equality operator. 
The two-argument form is significantly faster due to Postgres keeping its collations "open" (in the sense of `ucol_open()/ucol_close()`) for the duration of the session, whereas the other form with the explicit `collator` argument does open and close the ICU collation for each call. Example: case-sensitive, accent-insensitive comparison: =# SELECT icu_compare('abcé', 'abce', 'en-u-ks-level1-kc-true'); icu_compare ------------- 0 =# SELECT icu_compare('Abcé', 'abce', 'en-u-ks-level1-kc-true'); icu_compare ------------- 1 With two arguments and a collation determined by the COLLATE clause: =# SELECT icu_compare('Abcé', 'abce' COLLATE "fr-x-icu"); icu_compare ------------- 1 With an implicit Postgres collation: =# CREATE COLLATION mycoll (locale='fr-u-ks-level1', provider='icu'); CREATE COLLATION =# CREATE TABLE books (id int, title text COLLATE "mycoll"); CREATE TABLE =# insert into books values(1, $$C'est l'été$$); INSERT 0 1 =# select id,title from books where icu_compare (title, $$c'est l'ete$$) = 0; id | title ----+------------- 1 | C'est l'été ### icu_set_default_locale(`locale` text) Sets the default ICU locale for the session, and returns a canonicalized version of the locale name. The POSIX syntax (`lang[_country[@attr]]`) is accepted. Call this function to change the output language of `icu_locales_list()`. This setting should not have any effect on PostgreSQL core functions, at least as of PG version 10. Warning: passing bogus contents to this function may freeze the backend with older versions of ICU (seen with 52.1). ### icu_default_locale() Returns the name of the default ICU locale as a text. The initial value is automatically set by ICU from the environment. ### icu_character_boundaries(`string` text, `locale` text) Break down the string into its characters and return them as a set of text. 
This is comparable to calling `regexp_split_to_table` with an empty regexp, with some differences, for instance: - CRLF sequences do not get split into two characters. - Sequences with a base and a combining character are kept together. Example (the "e" followed by the combining acute accent U+0301 may be rendered as an accented e or differently depending on your browser): =# SELECT * FROM icu_character_boundaries('Ete'||E'\u0301', 'fr') as chars; chars ------- E t é See [Boundary Analysis](http://userguide.icu-project.org/boundaryanalysis) in the ICU User Guide for more information. ### icu_word_boundaries (`string` text, `locale` text) Break down the string into word and non-word constituents, and return them in a set of (tag, contents) tuples. `tag` has values from the [UWordBreak enum](http://icu-project.org/apiref/icu4c/ubrk_8h_source.html) defined in ubrk.h indicating the nature of the piece of contents. The current values are: UBRK_WORD_NONE = 0, UBRK_WORD_NUMBER = 100, UBRK_WORD_LETTER = 200, UBRK_WORD_KANA = 300, UBRK_WORD_IDEO = 400, /* up to 500 */ (strictly speaking, any number between the lower and the upper bounds may be counted, as these numbers are meant to be intervals inside which new subdivisions may be added in future versions of ICU). Example: =# SELECT * FROM icu_word_boundaries($$I like O'Reilly books, like the japanese 初めてのPerl 第7版.$$ , 'en'); tag | contents -----+---------- 200 | I 0 | 200 | like 0 | 200 | O'Reilly 0 | 200 | books 0 | , 0 | 200 | like 0 | 200 | the 0 | 200 | japanese 0 | 400 | 初めて 400 | の 200 | Perl 0 | 400 | 第 100 | 7 400 | 版 0 | . or to count words in English: =# SELECT count(*) FROM icu_word_boundaries($$piece of text$$, 'en_US') WHERE tag=200; ### icu_line_boundaries (`string` text, `locale` text) Split the string into pieces where a line break may occur, according to the Unicode line breaking algorithm defined in [UAX #14](http://unicode.org/reports/tr14/), and return them in a set of (tag, contents) tuples. 
`tag` has values from the [ULineBreakTag enum](http://icu-project.org/apiref/icu4c/ubrk_8h_source.html) defined in ubrk.h indicating the nature of the break. The current values are: UBRK_LINE_SOFT = 0, UBRK_LINE_HARD = 100, /* up to 200 */ (strictly speaking, any number between the lower and the upper bounds may be counted, as these numbers are meant to be intervals inside which new subdivisions may be added in future versions of ICU). Example: =# SELECT *,convert_to( contents, 'utf-8') from icu_line_boundaries( $$Thus much let me avow--You are not wrong, who deem That my days have been a dream; Yet if hope has flown away In a night, or in a day,$$ , 'en'); tag | contents | convert_to -----+----------+------------------ 100 | +| \x0a | | 0 | Thus | \x5468757320 0 | much | \x6d75636820 0 | let | \x6c657420 0 | me | \x6d6520 100 | avow-- +| \x61766f772d2d0a | | 0 | You | \x596f7520 0 | are | \x61726520 0 | not | \x6e6f7420 0 | wrong, | \x77726f6e672c20 0 | who | \x77686f20 100 | deem +| \x6465656d0a | | 0 | That | \x5468617420 0 | my | \x6d7920 0 | days | \x6461797320 0 | have | \x6861766520 0 | been | \x6265656e20 0 | a | \x6120 100 | dream; +| \x647265616d3b0a | | 0 | Yet | \x59657420 0 | if | \x696620 0 | hope | \x686f706520 0 | has | \x68617320 0 | flown | \x666c6f776e20 100 | away +| \x617761790a | | 0 | In | \x496e20 0 | a | \x6120 0 | night, | \x6e696768742c20 0 | or | \x6f7220 0 | in | \x696e20 0 | a | \x6120 0 | day, | \x6461792c ### icu_sentence_boundaries (`string` text, `locale` text) Split the string into sentences, according to the Unicode text segmentation rules defined in [UAX #29](http://unicode.org/reports/tr29/), and return them in a set of (tag, contents) tuples. `tag` has values from the [USentenceBreakTag enum](http://icu-project.org/apiref/icu4c/ubrk_8h_source.html) defined in ubrk.h indicating the nature of the break. 
The current values are: UBRK_SENTENCE_TERM = 0, UBRK_SENTENCE_SEP = 100, /* up to 200 */ (strictly speaking, any number between the lower and the upper bounds may be counted, as these numbers are meant to be intervals inside which new subdivisions may be added in future versions of ICU). Example: =# SELECT * FROM icu_sentence_boundaries('Mr. Barry Sheene was born in 1950. He was a motorcycle racer.', 'en-u-ss-standard'); tag | contents -----+------------------------------------- 0 | Mr. Barry Sheene was born in 1950. 0 | He was a motorcycle racer. Note: "Mr." followed by a space is recognized by virtue of the locale as an abbreviation of the english "Mister", rather than the end of a sentence. ### icu_number_spellout (`number` double precision, `locale` text) Return the spelled out text corresponding to the number expressed in the given locale. Example: =# SELECT loc, icu_number_spellout(1234, loc) FROM (values ('en'),('fr'),('de'),('ru'),('ja')) AS s(loc); loc | icu_number_spellout -----+------------------------------------------- en | one thousand two hundred thirty-four fr | mille deux cent trente-quatre de | ein­tausend­zwei­hundert­vier­und­dreißig ru | одна тысяча двести тридцать четыре ja | 千二百三十四 (Note: the german output uses U+00AD (SOFT HYPHEN) to separate words. Github's markdown to HTML conversion seems to remove them, so in the above text the spellout might appear like a single long word.) ### icu_char_name(`c` character) Return the Unicode character name corresponding to the first codepoint of the input. 
Example: =# SELECT c, to_hex(ascii(c)), icu_char_name(c) FROM regexp_split_to_table('El Niño', '') as c; c | to_hex | icu_char_name ---+--------+--------------------------------- E | 45 | LATIN CAPITAL LETTER E l | 6c | LATIN SMALL LETTER L | 20 | SPACE N | 4e | LATIN CAPITAL LETTER N i | 69 | LATIN SMALL LETTER I ñ | f1 | LATIN SMALL LETTER N WITH TILDE o | 6f | LATIN SMALL LETTER O ### icu_spoof_check (`string` text) Return a boolean indicating whether the argument is likely to be an attempt at confusing a reader. The implementation is based on Unicode Technical Reports [#36](http://unicode.org/reports/tr36) and [#39](http://unicode.org/reports/tr39) and uses the ICU default settings for spoof checks. Example: =# SELECT txt, icu_spoof_check(txt) FROM (VALUES ('paypal'), (E'p\u0430ypal')) AS s(txt); txt | icu_spoof_check --------+----------------- paypal | f pаypal | t (Note: The second character in the second row is U+0430 (CYRILLIC SMALL LETTER A) instead of the genuine ASCII U+0061 (LATIN SMALL LETTER A)) ### icu_confusable_strings_check(`string1` text, `string2` text) Return a boolean indicating whether the string arguments are visually confusable with each other, according to data described in [Unicode Technical Report #39](http://unicode.org/reports/tr39/#Confusable_Detection). The settings and comparison levels are ICU defaults. For strictly identical strings, it returns true. Example: =# SELECT txt, icu_confusable_strings_check('phil', txt) AS confusable FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l')) AS s(txt); txt | confusable ------+------------ phiL | f phiI | t phi1 | t phıl | t ### icu_transform (`string` text, `transformations` text) Return a string with some transformations applied. This function essentially calls ICU's [utrans_transUChars()](http://icu-project.org/apiref/icu4c/utrans_8h.html#af415d8aa51e79d4494ebb8ef8fc76ae2). 
The first argument is the string to transform, and the second is the transformation to apply, expressed as a sequence of transforms and filters (see the [ICU user guide on transforms](http://userguide.icu-project.org/transforms/general) and the output of `icu_transforms_list()` mentioned below). Examples: Transliterate: =# select icu_transform('Владимир Путин', 'Cyrl-Latn'); -- just 'Latin' would work here too icu_transform ---------------- Vladimir Putin Transform Unicode names into the corresponding characters: =# select icu_transform('10\N{SUPERSCRIPT MINUS}\N{SUPERSCRIPT FOUR}' '\N{MICRO SIGN}m = 1 \N{ANGSTROM SIGN}', 'Name-Any'); icu_transform --------------- 10⁻⁴µm = 1 Å Remove diacritics (generalized "unaccent") through Unicode decomposition. =# select icu_transform('1 Å', 'any-NFD; [:nonspacing mark:] any-remove; any-NFC'); icu_transform --------------- 1 A Generate hexadecimal codepoints for non-ASCII characters: =# select icu_transform('Ich muß essen.', '[:^ascii:]; Hex'); icu_transform --------------------- Ich mu\u00DF essen. ### icu_transforms_list () Return the list of built-in transliterations or transforms, as a set of text, corresponding to "Basic IDs" in [ICU documentation](http://userguide.icu-project.org/transforms/general). The initial set of transforms are transliterations between scripts (like `Katakana-Latin` or `Latin-Cyrillic`), but they're supplemented with functionalities related to accents, casing, Unicode composition and decomposition with combining characters and other conversions. Values from this list are meant to be used individually as the 2nd argument of `icu_transform()`, or assembled with semi-colon separators to form compound transforms, possibly with filters added to limit the set of characters to transform. 
### icu_strpos(`string` text, `substring` text [, `collator` text]) Like `strpos(text,text)` in Postgres core, except that it uses the linguistic rules of `collator` to search `substring` in `string`, and that it supports nondeterministic collations seamlessly. When the substring is not found, it returns 0. Otherwise, it returns the 1-based position of the first match of `substring` inside `string`, or 1 if `substring` is empty. When `collator` is not passed, the collation of the arguments is used. As with the other functions in this extension, the two-argument form is faster since it can keep the ICU collation open across function calls. Example: -- Search in names independently of punctuation, case and accents =# select name from addresses where icu_strpos(name, 'jeanrene', 'fr-u-ks-level1-ka-shifted') > 0 name ------------------ jean-rené dupont Jean-René Dupont jeanrenédupont ### icu_replace(`string` text, `from` text, `to` text [, `collator` text]) Like `replace(string text, from text, to text)` in Postgres core, except it uses the linguistic rules of `collator` to search `from` in `string` instead of a byte-wise comparison. It also supports nondeterministic collations to search `from` as a substring. It returns `string` with all substrings that match `from` replaced by `to`. When `collator` is not passed, the collation of the arguments is used, which is faster because the ICU collation can be kept open across function calls. 
Example: -- Collation comparing independently of punctuation, case and accents =# CREATE COLLATION ciaipi (provider = icu, locale = 'und-u-ks-level1-ka-shifted'); -- Replace names matching 'jeanrene' by a placeholder =# select s.n, icu_replace(n, 'jeanrene', '{firstname}' collate "ciaipi") from (values('jeanrenédupont'),('Jean-René Dupont')) as s(n) ; n | icu_replace -------------------+--------------------- jeanrenédupont | {firstname}dupont Jean-René Dupont | {firstname} Dupont ### icu_normalize(`string` text, `form` text) Return `string` transformed into the Unicode normalized `form`, which must be `nfc`, `nfkc`, `nfd`, or `nfkd` (upper case or mixed case variants are accepted). Returns NULL if any input argument is NULL. The database must use an Unicode encoding, which means UTF-8 in practice. See the Unicode Annex [UAX #15](http://unicode.org/reports/tr15/#Introduction) for an introduction on Unicode normal forms. Example: =# select icu_normalize('éte'||E'\u0301', 'nfc') = E'ét\u00E9'; ?column? ---------- t ### icu_is_normalized(`string` text, `form` text) Return true if `string` is in the Unicode normalized `form`, which must be `nfc`, `nfkc`, `nfd`, or `nfkd` (upper case or mixed case variants are accepted). Returns false otherwise, or NULL if any input argument is NULL. The database must use an Unicode encoding, which means UTF-8 in practice. Example: =# SELECT icu_is_normalized('ét'||E'\u0301', 'nfc'); icu_is_normalized ------------------- f =# SELECT icu_is_normalized('ét'||E'\u0301', 'nfd'); icu_is_normalized ------------------- t ## License This project is licensed under the PostgreSQL License -- see [LICENSE.md](LICENSE.md). 
icu_ext-1.6.2/expected/000077500000000000000000000000001376047124100147665ustar00rootroot00000000000000icu_ext-1.6.2/expected/tests-01.out000066400000000000000000000121271376047124100171020ustar00rootroot00000000000000-- regression tests for icu_ext CREATE EXTENSION icu_ext; -- Check that the database has the built-in ICU collations -- required by the tests SELECT collname FROM pg_collation WHERE collname IN ('und-x-icu', 'en-x-icu') ORDER BY collname; collname ----------- en-x-icu und-x-icu (2 rows) -- icu_char_name SELECT c, to_hex(ascii(c)), icu_char_name(c) FROM regexp_split_to_table('El Niño', '') as c; c | to_hex | icu_char_name ---+--------+--------------------------------- E | 45 | LATIN CAPITAL LETTER E l | 6c | LATIN SMALL LETTER L | 20 | SPACE N | 4e | LATIN CAPITAL LETTER N i | 69 | LATIN SMALL LETTER I ñ | f1 | LATIN SMALL LETTER N WITH TILDE o | 6f | LATIN SMALL LETTER O (7 rows) -- icu_character_boundaries SELECT * FROM icu_character_boundaries('Ete'||E'\u0301', 'fr') as chars; chars ------- E t é (3 rows) -- icu_collation_attributes SELECT * FROM icu_collation_attributes('en') WHERE attribute <> 'version'; attribute | value -------------+---------- displayname | English kn | false kb | false kk | false ka | noignore ks | level3 kf | false kc | false kv | punct (9 rows) -- icu_compare SELECT icu_compare('abcé', 'abce', 'en@colStrength=primary;colCaseLevel=yes'); icu_compare ------------- 0 (1 row) SELECT icu_compare('Abcé', 'abce' COLLATE "en-x-icu"); icu_compare ------------- 1 (1 row) -- icu_confusable_strings_check SELECT txt, icu_confusable_strings_check('phil', txt) AS confusable FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l')) AS s(txt); txt | confusable ------+------------ phiL | f phiI | t phi1 | t phıl | t (4 rows) -- icu_line_boundaries SELECT *,convert_to( contents, 'utf-8') FROM icu_line_boundaries( $$Thus much let me avow You are not wrong, who deem That my days have been a dream; Yet if hope has flown away In a night, or in a 
day,$$ , 'en'); tag | contents | convert_to -----+----------+------------------ 0 | Thus | \x5468757320 0 | much | \x6d75636820 0 | let | \x6c657420 0 | me | \x6d6520 100 | avow +| \x61766f770a | | 0 | You | \x596f7520 0 | are | \x61726520 0 | not | \x6e6f7420 0 | wrong, | \x77726f6e672c20 0 | who | \x77686f20 100 | deem +| \x6465656d0a | | 0 | That | \x5468617420 0 | my | \x6d7920 0 | days | \x6461797320 0 | have | \x6861766520 0 | been | \x6265656e20 0 | a | \x6120 100 | dream; +| \x647265616d3b0a | | 0 | Yet | \x59657420 0 | if | \x696620 0 | hope | \x686f706520 0 | has | \x68617320 0 | flown | \x666c6f776e20 100 | away +| \x617761790a | | 0 | In | \x496e20 0 | a | \x6120 0 | night, | \x6e696768742c20 0 | or | \x6f7220 0 | in | \x696e20 0 | a | \x6120 0 | day, | \x6461792c (31 rows) -- icu_number_spellout SELECT loc, icu_number_spellout(1234, loc) FROM (values ('en'),('fr'),('de'),('ru'),('ja')) AS s(loc); loc | icu_number_spellout -----+------------------------------------------- en | one thousand two hundred thirty-four fr | mille deux cent trente-quatre de | ein­tausend­zwei­hundert­vier­und­dreißig ru | одна тысяча двести тридцать четыре ja | 千二百三十四 (5 rows) -- icu_replace SELECT n, icu_replace( n, 'jeanrene', '{firstname}', 'und@colStrength=primary;colAlternate=shifted') FROM (values('jeanrenédupont'),('Jean-René Dupont')) as s(n) ORDER BY n COLLATE "C"; n | icu_replace -------------------+--------------------- Jean-René Dupont | {firstname} Dupont jeanrenédupont | {firstname}dupont (2 rows) -- icu_sentence_boundaries SELECT * FROM icu_sentence_boundaries('Call me Mr. Brown. It''s a movie.', 'en@ss=standard'); tag | contents -----+--------------------- 0 | Call me Mr. Brown. 0 | It's a movie. 
(2 rows) -- icu_strpos SELECT v,icu_strpos('hey rene', v, 'und@colStrength=primary;colAlternate=shifted') FROM (VALUES ('René'), ('rené'), ('Rene'), ('n'), ('në'), ('no'), (''), (null)) AS s(v) ORDER BY v COLLATE "C"; v | icu_strpos ------+------------ | 1 Rene | 5 René | 5 n | 7 no | 0 në | 7 rené | 5 | (8 rows) -- icu_transform SELECT icu_transform('10\N{SUPERSCRIPT MINUS}\N{SUPERSCRIPT FOUR}' '\N{MICRO SIGN}m = 1 \N{ANGSTROM SIGN}', 'Name-Any'); icu_transform --------------- 10⁻⁴µm = 1 Å (1 row) SELECT icu_transform('Ich muß essen.', '[:^ascii:]; Hex'); icu_transform --------------------- Ich mu\u00DF essen. (1 row) -- icu_word_boundaries SELECT * FROM icu_word_boundaries($$Do you like O'Reilly books?$$, 'en'); tag | contents -----+---------- 200 | Do 0 | 200 | you 0 | 200 | like 0 | 200 | O'Reilly 0 | 200 | books 0 | ? (10 rows) icu_ext-1.6.2/icu_break.c000066400000000000000000000143501376047124100152600ustar00rootroot00000000000000/* * icu_break.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. 
See LICENSE.md */ #include "postgres.h" #include "access/htup_details.h" #include "fmgr.h" #include "funcapi.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "mb/pg_wchar.h" #include "unicode/ubrk.h" #include "unicode/ucnv.h" #include "unicode/ucol.h" #include "unicode/uloc.h" #include "unicode/ustring.h" #include "unicode/utext.h" /* * PG set-returning functions exposing ICU's BreakIterator API for * characters, words, line-wrapping, sentences */ PG_FUNCTION_INFO_V1(icu_character_boundaries); PG_FUNCTION_INFO_V1(icu_word_boundaries); PG_FUNCTION_INFO_V1(icu_sentence_boundaries); PG_FUNCTION_INFO_V1(icu_line_boundaries); struct ubreak_ctxt { UBreakIterator *iter; UText* ut; char* source_text; UChar* cnv_text; /* unused and NULL if the database encoding is UTF-8 */ int32_t len; TupleDesc tupdesc; }; /* * Initialize the context to iterate on the input. * arg1=input string, arg2=locale * The main difference between break iterators is: * - UBRK_CHARACTER: return SETOF text * - others: return SETOF (int,text) */ static void init_srf_first_call(UBreakIteratorType break_type, PG_FUNCTION_ARGS) { MemoryContext oldcontext; const char *brk_locale; UErrorCode status = U_ZERO_ERROR; FuncCallContext *funcctx; struct ubreak_ctxt *ctxt; funcctx = SRF_FIRSTCALL_INIT(); /* * Switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); ctxt = palloc(sizeof(struct ubreak_ctxt)); if (break_type != UBRK_CHARACTER) { TupleDesc tupdesc; /* Construct tuple descriptor */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("function returning record called in context that cannot accept type record"))); ctxt->tupdesc = BlessTupleDesc(tupdesc); } else ctxt->tupdesc = NULL; /* Use the UTF-8 ICU functions if our string is in UTF-8 */ if (GetDatabaseEncoding() == PG_UTF8) { text *txt = 
PG_GETARG_TEXT_PP(0); ctxt->len = VARSIZE_ANY_EXHDR(txt); ctxt->source_text = (char*)palloc(ctxt->len); ctxt->cnv_text = NULL; memcpy(ctxt->source_text, VARDATA_ANY(txt), ctxt->len); ctxt->ut = utext_openUTF8(NULL, ctxt->source_text, ctxt->len, &status); if (U_FAILURE(status)) elog(ERROR, "utext_openUTF8() failed: %s", u_errorName(status)); } else { text *input = PG_GETARG_TEXT_PP(0); /* database encoding to UChar buffer */ ctxt->len = icu_to_uchar(&ctxt->cnv_text, text_to_cstring(input), VARSIZE_ANY_EXHDR(input)); ctxt->ut = utext_openUChars(NULL, ctxt->cnv_text, ctxt->len, &status); if (U_FAILURE(status)) elog(ERROR, "utext_openUChars() failed: %s", u_errorName(status)); } funcctx->user_fctx = (void *) ctxt; brk_locale = text_to_cstring(PG_GETARG_TEXT_PP(1)); MemoryContextSwitchTo(oldcontext); ctxt->iter = ubrk_open(break_type, brk_locale, NULL, 0, &status); if (U_FAILURE(status)) { utext_close(ctxt->ut); elog(ERROR, "ubrk_open failed: %s", u_errorName(status)); } ubrk_setUText(ctxt->iter, ctxt->ut, &status); if (U_FAILURE(status)) { ubrk_close(ctxt->iter); utext_close(ctxt->ut); elog(ERROR, "ubrk_setText() failed: %s", u_errorName(status)); } } /* * Return substrings (SETOF text). In general, they're are * one-character only but CRLF are returned in one piece, * and combining+base characters are also pieced together. 
* In this respect it differs from regexp_split_to_table(text, '') */ Datum icu_character_boundaries(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; int32_t pos0, pos; struct ubreak_ctxt *ctxt; if (SRF_IS_FIRSTCALL()) { init_srf_first_call(UBRK_CHARACTER, fcinfo); } funcctx = SRF_PERCALL_SETUP(); ctxt = (struct ubreak_ctxt*) funcctx->user_fctx; if (ctxt->len == 0) SRF_RETURN_DONE(funcctx); /* no result */ pos0 = ubrk_current(ctxt->iter); pos = ubrk_next(ctxt->iter); if (pos != UBRK_DONE) { text *item; if (ctxt->source_text != NULL) item = cstring_to_text_with_len(ctxt->source_text+pos0, pos-pos0); else { char *buf; /* convert UChar to a buffer in the database encoding */ int32_t len = icu_from_uchar(&buf, ctxt->cnv_text+pos0, pos-pos0); item = cstring_to_text_with_len(buf, len); } SRF_RETURN_NEXT(funcctx, PointerGetDatum(item)); } else /* end of SRF iteration */ { ubrk_close(ctxt->iter); utext_close(ctxt->ut); SRF_RETURN_DONE(funcctx); } } /* * Return (tag,content) tuples */ static Datum icu_boundaries_internal(UBreakIteratorType break_type, PG_FUNCTION_ARGS) { FuncCallContext *funcctx; int32_t pos0, pos1; struct ubreak_ctxt *ctxt; if (SRF_IS_FIRSTCALL()) { init_srf_first_call(break_type, fcinfo); } funcctx = SRF_PERCALL_SETUP(); ctxt = (struct ubreak_ctxt*) funcctx->user_fctx; if (ctxt->len == 0) SRF_RETURN_DONE(funcctx); /* no result */ pos0 = ubrk_current(ctxt->iter); do { pos1 = ubrk_next(ctxt->iter); if (pos1 != UBRK_DONE) { Datum values[2]; bool nulls[2]; HeapTuple tuple; text *item; if (ctxt->source_text != NULL) { item = cstring_to_text_with_len(ctxt->source_text + pos0, pos1-pos0); } else { char *buf; /* convert back UChar to a buffer in the database encoding */ int32_t len = icu_from_uchar(&buf, ctxt->cnv_text + pos0, pos1-pos0); item = cstring_to_text_with_len(buf, len); } values[0] = Int32GetDatum(ubrk_getRuleStatus(ctxt->iter)); nulls[0] = false; values[1] = PointerGetDatum(item); nulls[1] = false; tuple = heap_form_tuple(ctxt->tupdesc, values, nulls); 
SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); } } while (pos1 != UBRK_DONE); /* end of SRF iteration */ ubrk_close(ctxt->iter); utext_close(ctxt->ut); SRF_RETURN_DONE(funcctx); } Datum icu_word_boundaries(PG_FUNCTION_ARGS) { return icu_boundaries_internal(UBRK_WORD, fcinfo); } Datum icu_line_boundaries(PG_FUNCTION_ARGS) { return icu_boundaries_internal(UBRK_LINE, fcinfo); } Datum icu_sentence_boundaries(PG_FUNCTION_ARGS) { return icu_boundaries_internal(UBRK_SENTENCE, fcinfo); } icu_ext-1.6.2/icu_ext.c000066400000000000000000000546221376047124100150020ustar00rootroot00000000000000/* * icu_ext.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. See LICENSE.md */ #include "postgres.h" #include "catalog/pg_collation.h" #include "fmgr.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "miscadmin.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/tuplestore.h" #include "utils/pg_locale.h" #include "unicode/ucnv.h" #include "unicode/ucol.h" #include "unicode/uloc.h" #include "unicode/umachine.h" #include "unicode/uscript.h" #include "unicode/ustring.h" #include "unicode/utext.h" #include "unicode/uvernum.h" #include "icu_ext.h" PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(icu_version); PG_FUNCTION_INFO_V1(icu_unicode_version); PG_FUNCTION_INFO_V1(icu_collation_attributes); PG_FUNCTION_INFO_V1(icu_locales_list); PG_FUNCTION_INFO_V1(icu_default_locale); PG_FUNCTION_INFO_V1(icu_set_default_locale); PG_FUNCTION_INFO_V1(icu_compare); PG_FUNCTION_INFO_V1(icu_compare_coll); PG_FUNCTION_INFO_V1(icu_case_compare); PG_FUNCTION_INFO_V1(icu_sort_key); PG_FUNCTION_INFO_V1(icu_sort_key_coll); PG_FUNCTION_INFO_V1(icu_char_name); Datum icu_version(PG_FUNCTION_ARGS) { UVersionInfo version; char buf[U_MAX_VERSION_STRING_LENGTH+1]; u_getVersion(version); u_versionToString(version, buf); PG_RETURN_TEXT_P(cstring_to_text(buf)); } Datum icu_unicode_version(PG_FUNCTION_ARGS) 
{ UVersionInfo version; char buf[U_MAX_VERSION_STRING_LENGTH+1]; u_getUnicodeVersion(version); u_versionToString(version, buf); PG_RETURN_TEXT_P(cstring_to_text(buf)); } /* Get the value of a collation attribute, aborting on error. */ static UColAttributeValue get_attribute(const UCollator *coll, UColAttribute attr) { UColAttributeValue val; UErrorCode status = U_ZERO_ERROR; val = ucol_getAttribute(coll, attr, &status); if (status != U_ZERO_ERROR) { elog(ERROR, "ucol_getAttribute failed"); } return val; } /* * Return (attribute,value) tuples for all attributes of a collation, * with keys and values matching options defined at * http://unicode.org/reports/tr35/tr35-collation.html#Setting_Options * Optionally, the attributes kept at their default values are not * included in the results. */ Datum icu_collation_attributes(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; MemoryContext per_query_ctx; MemoryContext oldcontext; TupleDesc tupdesc; Tuplestorestate *tupstore; Datum values[2]; bool nulls[2]; char *txt; const char *locale; bool include_defaults = !(PG_GETARG_BOOL(1)); UCollator *collator = NULL; UErrorCode status = U_ZERO_ERROR; UColAttributeValue u_attr_val; if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); /* Switch into long-lived context to construct returned data structures */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Open ICU collation */ locale = text_to_cstring(PG_GETARG_TEXT_P(0)); collator = ucol_open(locale, &status); if (!collator) { elog(ERROR, "failed to open collation"); } tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode 
= SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); memset(nulls, 0, sizeof(nulls)); /* name (not a real attribute, added for convenience) */ if (include_defaults) { /* Use a large initial buffer to avoid bug ICU-21157 */ UChar dname_local[500]; UChar *dname = dname_local; char *buf; int32_t ulen; ulen = uloc_getDisplayName(locale, NULL, dname, sizeof(dname_local)/sizeof(UChar), &status); if (status == U_BUFFER_OVERFLOW_ERROR) { dname = palloc((ulen+1)*sizeof(UChar)); status = U_ZERO_ERROR; ulen = uloc_getDisplayName(locale, NULL, dname, ulen, &status); } if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayName failed: %s", u_errorName(status)); icu_from_uchar(&buf, dname, ulen); values[0] = CStringGetTextDatum("displayname"); values[1] = CStringGetTextDatum(buf); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_NUMERIC_COLLATION (key:kn) */ u_attr_val = get_attribute(collator, UCOL_NUMERIC_COLLATION); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? "false" : "true"; values[0] = CStringGetTextDatum("kn"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_FRENCH_COLLATION (key:kb, rule:[backwards 2]) */ u_attr_val = get_attribute(collator, UCOL_FRENCH_COLLATION); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? "false" : "true"; values[0] = CStringGetTextDatum("kb"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_NORMALIZATION_MODE (key:kk)*/ u_attr_val = get_attribute(collator, UCOL_NORMALIZATION_MODE); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? 
"false" : "true"; values[0] = CStringGetTextDatum("kk"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_ALTERNATE_HANDLING (key:ka) */ u_attr_val = get_attribute(collator, UCOL_ALTERNATE_HANDLING); if (include_defaults || u_attr_val != UCOL_NON_IGNORABLE) { switch (u_attr_val) { case UCOL_NON_IGNORABLE: txt = "noignore"; break; case UCOL_SHIFTED: txt = "shifted"; break; default: txt = ""; break; } values[0] = CStringGetTextDatum("ka"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_STRENGTH (key:ks) */ u_attr_val = get_attribute(collator, UCOL_STRENGTH); if (include_defaults || u_attr_val != UCOL_TERTIARY) { switch(u_attr_val) { case UCOL_PRIMARY: txt = "level1"; break; case UCOL_SECONDARY: txt = "level2"; break; case UCOL_TERTIARY: txt = "level3"; break; case UCOL_QUATERNARY: txt = "level4"; break; case UCOL_IDENTICAL: txt = "identic"; break; default: txt = ""; break; } values[0] = CStringGetTextDatum("ks"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_CASE_FIRST (key:kf) */ u_attr_val = get_attribute(collator, UCOL_CASE_FIRST); if (include_defaults || u_attr_val != UCOL_OFF) { switch(u_attr_val) { case UCOL_OFF: txt = "false"; break; case UCOL_LOWER_FIRST: txt = "lower"; break; case UCOL_UPPER_FIRST: txt = "upper"; break; default: txt = ""; break; } values[0] = CStringGetTextDatum("kf"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* UCOL_CASE_LEVEL (key:kc) */ u_attr_val = get_attribute(collator, UCOL_CASE_LEVEL); if (include_defaults || u_attr_val != UCOL_OFF) { txt = (u_attr_val == UCOL_OFF) ? 
"false" : "true"; values[0] = CStringGetTextDatum("kc"); values[1] = CStringGetTextDatum(txt); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } /* Max variable (key:kv) */ { UColReorderCode reorder_code = ucol_getMaxVariable(collator); const char *code_name = NULL; switch(reorder_code) { case UCOL_REORDER_CODE_SPACE: code_name = "space"; break; case UCOL_REORDER_CODE_PUNCTUATION: code_name = "punct"; break; case UCOL_REORDER_CODE_SYMBOL: code_name = "symbol"; break; case UCOL_REORDER_CODE_CURRENCY: code_name = "currency"; break; case UCOL_REORDER_CODE_DIGIT: code_name = "digit"; break; default: break; } /* "punct" is the default. Omit it unless include_defaults is set */ if (code_name != NULL && (include_defaults || reorder_code != UCOL_REORDER_CODE_PUNCTUATION)) { values[0] = CStringGetTextDatum("kv"); values[1] = CStringGetTextDatum(code_name); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } } /* Reorder codes (key:kr) */ { UErrorCode status = U_ZERO_ERROR; StringInfoData aggr_values; /* 4-letter codes separated by hyphens */ int32_t *reorder_codes = NULL; int32_t nb_reorderings = ucol_getReorderCodes(collator, NULL, 0, &status); if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) elog(ERROR, "uloc_getReorderCodes failed: %s", u_errorName(status)); initStringInfo(&aggr_values); if (nb_reorderings > 0) { reorder_codes = palloc(nb_reorderings*sizeof(int32_t)); status = U_ZERO_ERROR; nb_reorderings = ucol_getReorderCodes(collator, reorder_codes, nb_reorderings, &status); if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) elog(ERROR, "uloc_getReorderCodes failed: %s", u_errorName(status)); } for (uint32_t idx=0; idx < nb_reorderings; idx++) { const char *value = NULL; if (reorder_codes[idx] >= UCOL_REORDER_CODE_FIRST) { switch(reorder_codes[idx]) { case UCOL_REORDER_CODE_SPACE: value = "space"; break; case UCOL_REORDER_CODE_PUNCTUATION: value = "punct"; break; case UCOL_REORDER_CODE_SYMBOL: value = "symbol"; break; case 
UCOL_REORDER_CODE_CURRENCY: value = "currency"; break; case UCOL_REORDER_CODE_DIGIT: value = "digit"; break; } } else { value = uscript_getShortName((UScriptCode)reorder_codes[idx]); } if (value != NULL) { if (idx >= 1) appendStringInfoChar(&aggr_values, '-'); appendStringInfoString(&aggr_values, value); } } if (aggr_values.len > 0) { values[0] = CStringGetTextDatum("kr"); values[1] = CStringGetTextDatum(aggr_values.data); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } } /* version (not a real attribute, added for convenience) */ if (include_defaults) { UVersionInfo version; char buf[U_MAX_VERSION_STRING_LENGTH+1]; ucol_getVersion(collator, version); u_versionToString(version, buf); values[0] = CStringGetTextDatum("version"); values[1] = CStringGetTextDatum(buf); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } tuplestore_donestoring(tupstore); ucol_close(collator); return (Datum) 0; } /* * Add a piece of text as a new Datum value, setting it to NULL * if it's empty. */ static int add_string(const char* value, int column, Datum *values, bool *nulls) { if (*value) values[column] = CStringGetTextDatum(value); else values[column] = (Datum)0; nulls[column] = (*value == '\0'); return column+1; } /* * Interface to uloc_getAvailable() for all locales. * Return a table of available locales with their main properties. 
*/ Datum icu_locales_list(PG_FUNCTION_ARGS) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; MemoryContext per_query_ctx; MemoryContext oldcontext; TupleDesc tupdesc; Tuplestorestate *tupstore; int32_t loc_count = uloc_countAvailable(); int32_t i; Datum values[7]; bool nulls[7]; if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); /* Switch into long-lived context to construct returned data structures */ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; oldcontext = MemoryContextSwitchTo(per_query_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); for (i=0; i < loc_count; i++) { UErrorCode status = U_ZERO_ERROR; int col_num = 0; const char *p = uloc_getAvailable(i); /* Name */ col_num = add_string(p, col_num, values, nulls); /* Country */ { UChar country_buf[200]; char* country; /* with the database encoding */ uloc_getDisplayCountry(p, NULL /*ULOC_ENGLISH*/, country_buf, sizeof(country_buf)/sizeof(UChar), &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayCountry() failed on locale '%s': %s", p, u_errorName(status)); icu_from_uchar(&country, country_buf, u_strlen(country_buf)); col_num = add_string(country, col_num, values, nulls); } /* Country code */ col_num = add_string(uloc_getISO3Country(p), col_num, values, nulls); /* Language */ { UChar lang_buf[200]; char* language; uloc_getDisplayLanguage(p, NULL, lang_buf, sizeof(lang_buf)/sizeof(UChar), &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayLanguage() failed on locale '%s': %s", p, u_errorName(status)); icu_from_uchar(&language, 
lang_buf, u_strlen(lang_buf)); col_num = add_string(language, col_num, values, nulls); } /* Language code */ col_num = add_string(uloc_getISO3Language(p), col_num, values, nulls); /* Script */ { UChar script_buf[100]; char* script; uloc_getDisplayScript(p, NULL, script_buf, sizeof(script_buf)/sizeof(UChar), &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getDisplayScript() failed on locale '%s': %s", p, u_errorName(status)); icu_from_uchar(&script, script_buf, u_strlen(script_buf)); col_num = add_string(script, col_num, values, nulls); } /* Character orientation */ { const char* layout; ULayoutType t = uloc_getCharacterOrientation(p, &status); if (U_FAILURE(status)) elog(ERROR, "uloc_getCharacterOrientation() failed on locale '%s': %s", p, u_errorName(status)); switch (t) { case ULOC_LAYOUT_LTR: layout = "LTR"; break; case ULOC_LAYOUT_RTL: layout = "RTL"; break; case ULOC_LAYOUT_TTB: layout = "TTB"; break; case ULOC_LAYOUT_BTT: layout = "BTT"; break; default: layout = ""; break; } col_num = add_string(layout, col_num, values, nulls); } tuplestore_putvalues(tupstore, tupdesc, values, nulls); } tuplestore_donestoring(tupstore); return (Datum) 0; } /* * Return the default locale. */ Datum icu_default_locale(PG_FUNCTION_ARGS) { PG_RETURN_TEXT_P(cstring_to_text(uloc_getDefault())); } /* * Set the default locale to some name and return its canonicalized * name. * Warning: seen with ICU-52, passing a locale name with BCP-47 * extensions makes ICU never return from uloc_setDefault() (it seems * to wait for some internal lock). * Note that ICU documentation says about uloc_setDefault(): * "Do not use unless you know what you are doing." * This is useful in icu_ext to get translated versions of country * and language names from icu_locales_list(). 
*/
Datum
icu_set_default_locale(PG_FUNCTION_ARGS)
{
	UErrorCode	status = U_ZERO_ERROR;
	const char *locname = text_to_cstring(PG_GETARG_TEXT_P(0));
	char		buf[1024];
	int32_t		len;

	uloc_setDefault(locname, &status);
	if (U_FAILURE(status))
		elog(ERROR, "failed to set ICU locale: %s", u_errorName(status));

	len = uloc_canonicalize(locname, buf, sizeof(buf), &status);

	/* the default locale is set, but its canonical name is not available */
	if (U_FAILURE(status))
		PG_RETURN_NULL();

	/*
	 * Build the result from the returned length: ICU leaves the buffer
	 * without a terminating NUL when the name fills it exactly.
	 */
	PG_RETURN_TEXT_P(cstring_to_text_with_len(buf, len));
}

/*
 * Get the UCollator object corresponding to the collation in input.
 * This UCollator is kept open by the backend and pointed to by the
 * cached pg_locale_t object.
 * Raise an error if the collation is the database default, is invalid,
 * or is not provided by ICU.
 */
UCollator*
ucollator_from_coll_id(Oid collid)
{
	pg_locale_t pg_locale;

	if (collid == DEFAULT_COLLATION_OID || !OidIsValid(collid))
	{
		/*
		 * This will need to be changed when a db will be able to have
		 * an ICU collation by default (not possible as of PG11).
		 */
		ereport(ERROR,
				(errcode(ERRCODE_INDETERMINATE_COLLATION),
				 errmsg("could not determine which ICU collation to use"),
				 errhint("Use the COLLATE clause to set the collation explicitly.")));
	}

	pg_locale = pg_newlocale_from_collation(collid);

	/* 'i' is the provider code tested for an ICU collation */
	if (!pg_locale || pg_locale->provider != 'i')
	{
		ereport(ERROR,
				(errcode(ERRCODE_COLLATION_MISMATCH),
				 errmsg("the collation provider of the input string must be ICU")));
	}

	return pg_locale->info.icu.ucol;
}

/*
 * The actual collation-aware comparison happens here.
 * the UCollator comes either from a cached pg_locale_t
 * or is just opened and closed immediately by icu_ext.
*/ static UCollationResult our_strcoll(text *txt1, text *txt2, UCollator *collator ) { UCollationResult result; int32_t len1 = VARSIZE_ANY_EXHDR(txt1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); if (GetDatabaseEncoding() == PG_UTF8) { /* use the UTF-8 representation directly if possible */ UErrorCode status = U_ZERO_ERROR; result = ucol_strcollUTF8(collator, text_to_cstring(txt1), len1, text_to_cstring(txt2), len2, &status); if (U_FAILURE(status)) elog(ERROR, "ICU strcoll failed: %s", u_errorName(status)); } else { int32_t ulen1, ulen2; UChar *uchar1, *uchar2; ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); ulen2 = icu_to_uchar(&uchar2, text_to_cstring(txt2), len2); result = ucol_strcoll(collator, uchar1, ulen1, uchar2, ulen2); pfree(uchar1); pfree(uchar2); } return result; } /* * Compare two strings with the given collation. * Return the result as a signed integer, similarly to strcoll(). */ Datum icu_compare_coll(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); text *txt2 = PG_GETARG_TEXT_PP(1); const char *collname = text_to_cstring(PG_GETARG_TEXT_P(2)); UCollator *collator = NULL; UErrorCode status = U_ZERO_ERROR; UCollationResult result; collator = ucol_open(collname, &status); if (!collator || U_FAILURE(status)) { elog(ERROR, "failed to open collation: %s", u_errorName(status)); } result = our_strcoll(txt1, txt2, collator); ucol_close(collator); PG_RETURN_INT32(result == UCOL_EQUAL ? 0 : (result == UCOL_GREATER ? 1 : -1)); } /* * Compare two strings with the collation of the function, * which must be an ICU collation. * Return the result as a signed integer, similarly to strcoll(). */ Datum icu_compare(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); text *txt2 = PG_GETARG_TEXT_PP(1); UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION()); UCollationResult result; result = our_strcoll(txt1, txt2, collator); PG_RETURN_INT32(result == UCOL_EQUAL ? 0 : (result == UCOL_GREATER ? 
1 : -1)); } /* * Compare two strings with full case folding. */ Datum icu_case_compare(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); text *txt2 = PG_GETARG_TEXT_PP(1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); int32_t result; UChar *uchar1, *uchar2; (void)icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); (void)icu_to_uchar(&uchar2, text_to_cstring(txt2), len2); result = u_strcasecmp(uchar1, uchar2, 0); pfree(uchar1); pfree(uchar2); PG_RETURN_INT32(result); } /* * Return a binary sort key corresponding to the string and * its collation (through a COLLATE clause). */ Datum icu_sort_key(PG_FUNCTION_ARGS) { text *txt = PG_GETARG_TEXT_PP(0); UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION()); int32_t o_len = 1024; /* first attempt */ int32_t ulen; UChar *ustring; bytea *output; ulen = icu_to_uchar(&ustring, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); do { int32_t effective_len; output = (bytea*) palloc(o_len + VARHDRSZ); effective_len = ucol_getSortKey(collator, ustring, ulen, (uint8_t*)VARDATA(output), o_len); if (effective_len == 0) { elog(ERROR, "ucol_getSortKey() failed: internal error"); } if (effective_len > o_len) { pfree(output); output = NULL; } o_len = effective_len; } while (output == NULL); /* should loop at most once, if buffer too small */ SET_VARSIZE(output, o_len + VARHDRSZ - 1); /* -1 excludes the ending NUL byte */ PG_RETURN_BYTEA_P(output); } /* * Return a binary sort key corresponding to the string and * the given collation. 
*/ Datum icu_sort_key_coll(PG_FUNCTION_ARGS) { text *txt = PG_GETARG_TEXT_PP(0); const char *locname = text_to_cstring(PG_GETARG_TEXT_P(1)); UCollator *collator; UErrorCode status = U_ZERO_ERROR; int32_t o_len = 1024; /* first attempt */ int32_t ulen; UChar *ustring; bytea *output; ulen = icu_to_uchar(&ustring, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); collator = ucol_open(locname, &status); if (!collator) elog(ERROR, "failed to open collation"); do { int32_t effective_len; output = (bytea*) palloc(o_len + VARHDRSZ); effective_len = ucol_getSortKey(collator, ustring, ulen, (uint8_t*)VARDATA(output), o_len); if (effective_len == 0) { ucol_close(collator); elog(ERROR, "ucol_getSortKey() failed: internal error"); } if (effective_len > o_len) { pfree(output); output = NULL; } o_len = effective_len; } while (output == NULL); /* should loop at most once, if buffer too small */ ucol_close(collator); SET_VARSIZE(output, o_len + VARHDRSZ - 1); /* -1 excludes the ending NUL byte */ PG_RETURN_BYTEA_P(output); } /* Return the first UChar32 of the char(1) string */ static UChar32 first_char32(BpChar* source) { UChar32 c; UText *ut; int32_t ulen; UChar *ustring; UErrorCode status = U_ZERO_ERROR; ulen = icu_to_uchar(&ustring, VARDATA_ANY(source), VARSIZE_ANY_EXHDR(source)); ut = utext_openUChars(NULL, ustring, ulen, &status); if (U_FAILURE(status)) elog(ERROR, "utext_openUChars() failed: %s", u_errorName(status)); c = utext_current32(ut); utext_close(ut); return c; } /* * Return the Unicode name corresponding to the the input character. 
*/ Datum icu_char_name(PG_FUNCTION_ARGS) { BpChar *source = PG_GETARG_BPCHAR_PP(0); char local_buf[80]; char *buffer; int32_t buflen = sizeof(local_buf); UChar32 first_char; int32_t ulen; UErrorCode status = U_ZERO_ERROR; first_char = first_char32(source); ulen = u_charName(first_char, U_EXTENDED_CHAR_NAME, local_buf, buflen, &status); if (status == U_BUFFER_OVERFLOW_ERROR) /* buffer too small */ { buffer = palloc((ulen+1)*sizeof(char)); status = U_ZERO_ERROR; ulen = u_charName(first_char, U_EXTENDED_CHAR_NAME, buffer, ulen+1, &status); } else buffer = local_buf; if (U_FAILURE(status)) elog(ERROR, "u_charName failed: %s", u_errorName(status)); else PG_RETURN_TEXT_P(cstring_to_text(buffer)); } icu_ext-1.6.2/icu_ext.control000066400000000000000000000002061376047124100162250ustar00rootroot00000000000000# icu_ext extension comment = 'Access ICU functions' default_version = '1.6.2' module_pathname = '$libdir/icu_ext' relocatable = true icu_ext-1.6.2/icu_ext.h000066400000000000000000000004311376047124100147740ustar00rootroot00000000000000/* * icu_ext.h * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. See LICENSE.md */ #include "postgres.h" #include "unicode/ucol.h" UCollator* ucollator_from_coll_id(Oid collid); icu_ext-1.6.2/icu_normalize.c000066400000000000000000000074071376047124100162010ustar00rootroot00000000000000/* * icu_normalize.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. 
See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "mb/pg_wchar.h" #include "utils/builtins.h" #include "utils/pg_locale.h" /* ICU includes */ #include "unicode/unorm.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_is_normalized); PG_FUNCTION_INFO_V1(icu_normalize); typedef enum { UNICODE_NFC, UNICODE_NFD, UNICODE_NFKC, UNICODE_NFKD } norm_form_t; static norm_form_t name_to_norm(const char *formstr) { if (pg_strcasecmp(formstr, "NFC") == 0) return UNICODE_NFC; else if (pg_strcasecmp(formstr, "NFD") == 0) return UNICODE_NFD; else if (pg_strcasecmp(formstr, "NFKC") == 0) return UNICODE_NFKC; else if (pg_strcasecmp(formstr, "NFKD") == 0) return UNICODE_NFKD; else elog(ERROR, "invalid normalization form: %s", formstr); } static const UNormalizer2* norm_instance(norm_form_t form) { UErrorCode status = U_ZERO_ERROR; const UNormalizer2 *instance = NULL; switch (form) { case UNICODE_NFC: instance = unorm2_getNFCInstance(&status); break; case UNICODE_NFD: instance = unorm2_getNFDInstance(&status); break; case UNICODE_NFKC: instance = unorm2_getNFKCInstance(&status); break; case UNICODE_NFKD: instance = unorm2_getNFKDInstance(&status); break; } if (U_FAILURE(status)) elog(ERROR, "norm_instance failure: %s", u_errorName(status)); return instance; } /* * Return the string (1st arg) with the given Unicode normalization * (2nd arg). 
*/
Datum
icu_normalize(PG_FUNCTION_ARGS)
{
	text	   *src_text = PG_GETARG_TEXT_PP(0);
	const char *arg_form = text_to_cstring(PG_GETARG_TEXT_P(1));
	norm_form_t form = name_to_norm(arg_form);
	const UNormalizer2 *instance = norm_instance(form);
	int32_t		u_src_length, u_dest_length, effective_length, result_len;
	char	   *result;
	UChar	   *u_src, *u_dest;
	UErrorCode	status = U_ZERO_ERROR;

	if (GetDatabaseEncoding() != PG_UTF8)
		elog(ERROR, "non-Unicode database encoding");

	u_src_length = icu_to_uchar(&u_src, VARDATA_ANY(src_text), VARSIZE_ANY_EXHDR(src_text));

	/*
	 * First try with a destination as large as the source. Sizing the
	 * buffer for the worst-case expansion instead (up to 18 times the
	 * source for NFKC/NFKD, see
	 * https://unicode.org/faq/normalization.html#12) would overflow
	 * int32_t or exceed the allocation limit for large inputs.
	 */
	u_dest_length = u_src_length;
	u_dest = (UChar*) palloc((Size) u_dest_length * sizeof(UChar));

	effective_length = unorm2_normalize(instance,
										u_src, u_src_length,
										u_dest, u_dest_length,
										&status);

	if (status == U_BUFFER_OVERFLOW_ERROR)
	{
		/* the result expands: retry with the exact size reported by ICU */
		pfree(u_dest);
		u_dest_length = effective_length;
		u_dest = (UChar*) palloc((Size) u_dest_length * sizeof(UChar));
		status = U_ZERO_ERROR;
		effective_length = unorm2_normalize(instance,
											u_src, u_src_length,
											u_dest, u_dest_length,
											&status);
	}

	if (U_FAILURE(status))
		elog(ERROR, "unorm2_normalize failure: %s", u_errorName(status));

	/* back to the database encoding */
	result_len = icu_from_uchar(&result, u_dest, effective_length);

	PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len));
}

/*
 * Check if a string (1st arg) is in the given Unicode normal form
 * (2nd arg).
*/ Datum icu_is_normalized(PG_FUNCTION_ARGS) { text *src_text = PG_GETARG_TEXT_PP(0); const char* arg_form = text_to_cstring(PG_GETARG_TEXT_PP(1)); norm_form_t form = name_to_norm(arg_form); UErrorCode status = U_ZERO_ERROR; UChar *u_src; int32_t u_src_length; UBool is_norm; const UNormalizer2 *instance = norm_instance(form); if (GetDatabaseEncoding() != PG_UTF8) elog(ERROR, "non-Unicode database encoding"); u_src_length = icu_to_uchar(&u_src, VARDATA_ANY(src_text), VARSIZE_ANY_EXHDR(src_text)); is_norm = unorm2_isNormalized(instance, u_src, u_src_length, &status); if (U_FAILURE(status)) elog(ERROR, "unorm2_isNormalized failure: %s", u_errorName(status)); PG_RETURN_BOOL(is_norm == TRUE); } icu_ext-1.6.2/icu_num.c000066400000000000000000000031321376047124100147670ustar00rootroot00000000000000/* * icu_num.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. See LICENSE.md */ #include "postgres.h" #include "access/htup_details.h" #include "fmgr.h" #include "funcapi.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "mb/pg_wchar.h" #include "unicode/ucol.h" #include "unicode/uloc.h" #include "unicode/unum.h" #include "unicode/ustring.h" #include "unicode/utext.h" PG_FUNCTION_INFO_V1(icu_number_spellout); Datum icu_number_spellout(PG_FUNCTION_ARGS) { float8 number = PG_GETARG_FLOAT8(0); const char *locale = text_to_cstring(PG_GETARG_TEXT_PP(1)); UErrorCode status = U_ZERO_ERROR; UChar local_ubuf[256]; UChar *ubuf = local_ubuf; int32_t buf_len = sizeof(local_ubuf)/sizeof(UChar); UNumberFormat* nf; int32_t real_len; char *output; nf = unum_open(UNUM_SPELLOUT, NULL, /* pattern */ -1, /* pattern length */ locale, NULL, /* parseErr */ &status); if (U_FAILURE(status)) elog(ERROR, "unum_open failed: %s", u_errorName(status)); real_len = unum_formatDouble(nf, number, ubuf, buf_len, NULL, &status); if (status == U_BUFFER_OVERFLOW_ERROR) { /* buffer too small */ ubuf = 
palloc((real_len+1)*sizeof(UChar)); status = U_ZERO_ERROR; real_len = unum_formatDouble(nf, number, ubuf, real_len+1, NULL, &status); } if (U_FAILURE(status)) { unum_close(nf); elog(ERROR, "unum_formatDouble failed: %s", u_errorName(status)); } icu_from_uchar(&output, ubuf, real_len); unum_close(nf); PG_RETURN_TEXT_P(cstring_to_text(output)); } icu_ext-1.6.2/icu_search.c000066400000000000000000000226021376047124100154400ustar00rootroot00000000000000/* * icu_search.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. See LICENSE.md */ /* Postgres includes */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" #include "utils/pg_locale.h" /* ICU includes */ #include "unicode/ucol.h" #include "unicode/usearch.h" #include "icu_ext.h" PG_FUNCTION_INFO_V1(icu_strpos); PG_FUNCTION_INFO_V1(icu_strpos_coll); PG_FUNCTION_INFO_V1(icu_replace); PG_FUNCTION_INFO_V1(icu_replace_coll); /* * Given @str in the database encoding and @str_utf16 its UTF-16 * representation, translate the character position @u16_pos (expressed in * UTF-16 code units and 0-based) to a character position in @str. * It differs from @u16_pos if @str_utf16 contains surrogate pairs. 
*
 * If @p_str is not NULL, *p_str is set to point to the first byte of @str
 * that corresponds to @u16_pos.
 *
 * Returns the translated position, counted in characters of @str (0-based).
 */
static int32_t
translate_char_pos(const char* str,
                   int32_t str_len,
                   const UChar* str_utf16,
                   int32_t u16_len, /* in 16-bit code units */
                   int32_t u16_pos,
                   const char **p_str)
{
    UChar32 c;
    int32_t u16_idx = 0;
    int32_t out_pos = 0;

    if (GetDatabaseEncoding() == PG_UTF8)
    {
        int32_t u8_offset = 0;
        /* for UTF-8, use ICU macros instead of calling pg_mblen() */
        while (u16_idx < u16_pos)
        {
            /* both macros advance their own index past one code point */
            U16_NEXT(str_utf16, u16_idx, u16_len, c);
            U8_NEXT(str, u8_offset, str_len, c);
            out_pos++;
        }
        if (p_str != NULL)
            *p_str = str + u8_offset;
    }
    else if (pg_encoding_max_length(GetDatabaseEncoding()) == 1)
    {
        /*
         * for mono-byte encodings, assume a 1:1 mapping with UTF-16
         * code units, since they should not contain characters
         * outside of the BMP.
         */
        out_pos = u16_pos;
        if (p_str != NULL)
            *p_str = str + out_pos;
    }
    else
    {
        /* for non-UTF-8 multi-byte encodings, use pg_mblen() */
        while (u16_idx < u16_pos)
        {
            /* step one code point in UTF-16 and one character in @str */
            U16_NEXT(str_utf16, u16_idx, u16_len, c);
            str += pg_mblen(str);
            out_pos++;
        }
        if (p_str != NULL)
            *p_str = str;
    }

    return out_pos;
}

/*
 * Do the bulk of the work for icu_strpos and icu_strpos_coll.
 * Input:
 *  txt1: the string to search in (haystack)
 *  txt2: the substring to search for (needle)
 *  collator: the ICU collator used to compare the strings
 * Return values:
 *  0: not found
 *  >0: the 1-based position of txt2 into txt1
 * Raises an ERROR if the ICU search cannot be started or fails.
 */
static int32_t
internal_strpos(text *txt1, text *txt2, UCollator *collator)
{
    int32_t len1 = VARSIZE_ANY_EXHDR(txt1);
    int32_t len2 = VARSIZE_ANY_EXHDR(txt2);
    UErrorCode status = U_ZERO_ERROR;
    UStringSearch *usearch;
    UChar *uchar1, *uchar2;
    int32_t ulen1, ulen2;
    int32_t pos;

    /*
     * A non-empty substring is never contained by an empty string.
     */
    if (len1 == 0 && len2 != 0)
        return 0;

    /*
     * An empty substring is always found at the first character (even
     * inside an empty string), to be consistent with strpos() in
     * core.
     */
    if (len2 == 0)
        return 1;

    /* ICU searches UTF-16 text: convert both arguments */
    ulen1 = icu_to_uchar(&uchar1, VARDATA_ANY(txt1), len1);
    ulen2 = icu_to_uchar(&uchar2, VARDATA_ANY(txt2), len2);

    usearch = usearch_openFromCollator(uchar2, /* needle */
                                       ulen2,
                                       uchar1, /* haystack */
                                       ulen1,
                                       collator,
                                       NULL,
                                       &status);
    if (U_FAILURE(status))
        elog(ERROR, "failed to start search: %s", u_errorName(status));
    else
    {
        pos = usearch_first(usearch, &status);
        if (!U_FAILURE(status) && pos != USEARCH_DONE)
        {
            /*
             * pos is in UTF-16 code units, with surrogate pairs counting
             * as two, so we need a non-trivial translation to the corresponding
             * position in the original string.
             */
            pos = translate_char_pos(VARDATA_ANY(txt1), len1, uchar1, ulen1, pos, NULL);
        }
        else
            pos = -1; /* so that pos + 1 below yields 0 (not found) */
    }

    /* release the resources before possibly reporting a search failure */
    pfree(uchar1);
    pfree(uchar2);
    usearch_close(usearch);

    if (U_FAILURE(status))
        elog(ERROR, "failed to perform ICU search: %s", u_errorName(status));

    /* return 0 if not found or the 1-based position of txt2 inside txt1 */
    return pos + 1;
}

/*
 * Equivalent of strpos(haystack, needle) using ICU search.
 * The collator is obtained from the collation of the function call.
 */
Datum
icu_strpos(PG_FUNCTION_ARGS)
{
    UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION());

    PG_RETURN_INT32(internal_strpos(PG_GETARG_TEXT_PP(0), /* haystack */
                                    PG_GETARG_TEXT_PP(1), /* needle */
                                    collator));
}

/*
 * Equivalent of strpos(haystack, needle) using ICU search, with the
 * collator opened from the name given as 3rd argument.
 * Raises an ERROR if the collator cannot be opened.
 */
Datum
icu_strpos_coll(PG_FUNCTION_ARGS)
{
    const char *collname = text_to_cstring(PG_GETARG_TEXT_PP(2));
    UCollator *collator = NULL;
    UErrorCode status = U_ZERO_ERROR;
    int32_t pos;

    collator = ucol_open(collname, &status);
    if (!collator || U_FAILURE(status)) {
        elog(ERROR, "failed to open collation: %s", u_errorName(status));
    }

    pos = internal_strpos(PG_GETARG_TEXT_PP(0), /* haystack */
                          PG_GETARG_TEXT_PP(1), /* needle */
                          collator);
    /* the collator was opened here, so it is closed here */
    ucol_close(collator);

    PG_RETURN_INT32(pos);
}

/*
 * Search for @txt2 in @txt1 with the ICU @collator and replace the
 * matched substrings with @txt3.
* * The replacement text is always txt3, but the replaced text may not * be exactly txt2, and its length in bytes may differ too, depending on * the collation rules. For example in utf-8 with an accent-insensitive * collation, {LATIN SMALL LETTER E WITH ACUTE} (2 bytes) will match * {LATIN SMALL LETTER E} (1 byte). */ static text * internal_str_replace(text *txt1, /* not const because it may be returned */ const text *txt2, /* search for txt2 with collator */ const text *txt3, /* replace the matched substrings by txt3 */ UCollator *collator) { int32_t len1 = VARSIZE_ANY_EXHDR(txt1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); int32_t len3 = VARSIZE_ANY_EXHDR(txt3); UErrorCode status = U_ZERO_ERROR; UStringSearch *usearch; UChar *uchar1, *uchar2; int32_t ulen1, ulen2; /* in utf-16 units */ text *result; int32_t pos; StringInfoData resbuf; if (len1 == 0 || len2 == 0) return txt1; ulen1 = icu_to_uchar(&uchar1, VARDATA_ANY(txt1), len1); ulen2 = icu_to_uchar(&uchar2, VARDATA_ANY(txt2), len2); usearch = usearch_openFromCollator(uchar2, /* needle */ ulen2, uchar1, /* haystack */ ulen1, collator, NULL, &status); /* "nana" in "nananana" must be found 2 times, not 3 times. 
*/ usearch_setAttribute(usearch, USEARCH_OVERLAP, USEARCH_OFF, &status); pos = usearch_first(usearch, &status); if (U_FAILURE(status)) elog(ERROR, "failed to perform ICU search: %s", u_errorName(status)); if (pos != USEARCH_DONE) { const char *txt1_currptr; const char* txt1_startptr = VARDATA_ANY(txt1); initStringInfo(&resbuf); /* initialize the output string with the segment before the first match */ translate_char_pos(txt1_startptr, len1, uchar1, ulen1, pos, &txt1_currptr); appendBinaryStringInfo(&resbuf, txt1_startptr, txt1_currptr - txt1_startptr); /* append the replacement text */ appendBinaryStringInfo(&resbuf, VARDATA_ANY(txt3), len3); /* skip the replaced text in txt1 */ translate_char_pos( txt1_currptr, len1 - (txt1_currptr - txt1_startptr), uchar1 + pos, usearch_getMatchedLength(usearch), usearch_getMatchedLength(usearch), &txt1_currptr); do { int32_t previous_pos = pos + usearch_getMatchedLength(usearch); CHECK_FOR_INTERRUPTS(); pos = usearch_next(usearch, &status); if (U_FAILURE(status)) break; if (pos != USEARCH_DONE) { const char *txt1_nextptr; /* copy the segment before the match */ translate_char_pos( txt1_currptr, len1 - (txt1_currptr - txt1_startptr), uchar1 + previous_pos, len1 - previous_pos, pos - previous_pos, &txt1_nextptr); appendBinaryStringInfo(&resbuf, txt1_currptr, txt1_nextptr - txt1_currptr); /* compute the length of the replaced text in txt1 */ translate_char_pos( txt1_nextptr, len1 - (txt1_nextptr - txt1_startptr), uchar1 + pos, usearch_getMatchedLength(usearch), usearch_getMatchedLength(usearch), &txt1_currptr); /* append the replacement text */ appendBinaryStringInfo(&resbuf, VARDATA_ANY(txt3), len3); } } while (pos != USEARCH_DONE); /* copy the segment after the last match */ if (len1 - (txt1_currptr - txt1_startptr) > 0) { appendBinaryStringInfo(&resbuf, txt1_currptr, len1 - (txt1_currptr - txt1_startptr)); } result = cstring_to_text_with_len(resbuf.data, resbuf.len); pfree(resbuf.data); } else { /* * The substring is not found: 
return the original string */ result = txt1; } pfree(uchar1); pfree(uchar2); if (usearch != NULL) usearch_close(usearch); if (U_FAILURE(status)) elog(ERROR, "failed to perform ICU search: %s", u_errorName(status)); return result; } Datum icu_replace(PG_FUNCTION_ARGS) { UCollator *collator = ucollator_from_coll_id(PG_GET_COLLATION()); text *string; string = internal_str_replace( PG_GETARG_TEXT_PP(0), /* haystack */ PG_GETARG_TEXT_PP(1), /* needle */ PG_GETARG_TEXT_PP(2), /* replacement */ collator); PG_RETURN_TEXT_P(string); } Datum icu_replace_coll(PG_FUNCTION_ARGS) { const char *collname = text_to_cstring(PG_GETARG_TEXT_PP(3)); UCollator *collator = NULL; UErrorCode status = U_ZERO_ERROR; collator = ucol_open(collname, &status); if (!collator || U_FAILURE(status)) { elog(ERROR, "failed to open collation: %s", u_errorName(status)); } PG_RETURN_TEXT_P( internal_str_replace( PG_GETARG_TEXT_PP(0), /* haystack */ PG_GETARG_TEXT_PP(1), /* needle */ PG_GETARG_TEXT_PP(2), /* replacement */ collator) ); } icu_ext-1.6.2/icu_spoof.c000066400000000000000000000037111376047124100153210ustar00rootroot00000000000000/* * icu_spoof.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. See LICENSE.md */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "unicode/uspoof.h" PG_FUNCTION_INFO_V1(icu_spoof_check); PG_FUNCTION_INFO_V1(icu_confusable_strings_check); /* TODO: icu_confusable_string_skeleton() */ /* * Check whether the input string is likely to be an attempt at * confusing a reader. 
*/ Datum icu_spoof_check(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); UErrorCode status = U_ZERO_ERROR; USpoofChecker *sc; int32_t bitmask; int32_t ulen1; UChar *uchar1; sc = uspoof_open(&status); if (!sc) elog(ERROR, "ICU uspoof_open failed"); ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); bitmask = uspoof_check(sc, uchar1, ulen1, NULL, &status); uspoof_close(sc); if (U_FAILURE(status)) elog(ERROR, "ICU uspoof_areConfusable failed: %s", u_errorName(status)); PG_RETURN_BOOL(bitmask != 0); } /* * Check whether the two input strings are visually confusable with * each other. */ Datum icu_confusable_strings_check(PG_FUNCTION_ARGS) { text *txt1 = PG_GETARG_TEXT_PP(0); int32_t len1 = VARSIZE_ANY_EXHDR(txt1); text *txt2 = PG_GETARG_TEXT_PP(1); int32_t len2 = VARSIZE_ANY_EXHDR(txt2); int32_t ulen1, ulen2; UChar *uchar1, *uchar2; USpoofChecker *sc; UErrorCode status = U_ZERO_ERROR; int32_t bitmask; sc = uspoof_open(&status); if (!sc) elog(ERROR, "ICU uspoof_open failed"); ulen1 = icu_to_uchar(&uchar1, text_to_cstring(txt1), len1); ulen2 = icu_to_uchar(&uchar2, text_to_cstring(txt2), len2); bitmask = uspoof_areConfusable(sc, uchar1, ulen1, uchar2, ulen2, &status); uspoof_close(sc); if (U_FAILURE(status)) elog(ERROR, "ICU uspoof_areConfusable failed: %s", u_errorName(status)); PG_RETURN_BOOL(bitmask != 0); } icu_ext-1.6.2/icu_transform.c000066400000000000000000000103761376047124100162130ustar00rootroot00000000000000/* * icu_transform.c * * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU * (see http://icu-project.org) * * By Daniel Vérité, 2018-2020. 
See LICENSE.md */ #include "postgres.h" #include "fmgr.h" #include "funcapi.h" #include "utils/builtins.h" #include "utils/pg_locale.h" #include "utils/memutils.h" #include "unicode/uenum.h" #include "unicode/utrans.h" PG_FUNCTION_INFO_V1(icu_transforms_list); PG_FUNCTION_INFO_V1(icu_transform); /* * List the available pre-defined transforms/transliterations. */ Datum icu_transforms_list(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; UErrorCode status = U_ZERO_ERROR; UEnumeration *en; const char *elt; if (SRF_IS_FIRSTCALL()) { funcctx = SRF_FIRSTCALL_INIT(); en = utrans_openIDs(&status); if (U_FAILURE(status)) elog(ERROR, "utrans_openIDs failed: %s", u_errorName(status)); funcctx->user_fctx = (void *)en; } funcctx = SRF_PERCALL_SETUP(); en = (UEnumeration*) funcctx->user_fctx; elt = uenum_next(en, NULL, &status); if (U_FAILURE(status)) elog(ERROR, "uenum_next failed: %s", u_errorName(status)); if (elt) { text* item = cstring_to_text(elt); SRF_RETURN_NEXT(funcctx, PointerGetDatum(item)); } else { uenum_close(en); SRF_RETURN_DONE(funcctx); } } /* * Cache for the last transformation used. * This may come in handy in applications that use several times the same transformation */ static UTransliterator *utrans = NULL; static char *cached_utrans_id = NULL; /* * Main function to apply a tranformation based on UTransliterator. 
* Input: * 1st arg: string to transform * 2nd arg: name (system identifier) of the transliterator */ Datum icu_transform(PG_FUNCTION_ARGS) { text *arg1 = PG_GETARG_TEXT_PP(0); text *arg2 = PG_GETARG_TEXT_PP(1); int32_t len1 = VARSIZE_ANY_EXHDR(arg1); const char *input_id = text_to_cstring(arg2); UErrorCode status = U_ZERO_ERROR; int32_t ulen, limit, capacity, start, original_ulen; int32_t result_len, in_ulen; UChar* utext; UChar* trans_id; char* result; UChar* original; bool done = false; if (cached_utrans_id != NULL) { if (strcmp(cached_utrans_id, input_id) != 0) { pfree(cached_utrans_id); cached_utrans_id = NULL; utrans_close(utrans); utrans = NULL; } } if (utrans == NULL) { in_ulen = icu_to_uchar(&trans_id, input_id, strlen(input_id)); utrans = utrans_openU(trans_id, in_ulen, UTRANS_FORWARD, NULL, /* rules. NULL for system transliterators */ -1, NULL, /* pointer to parseError. Not used */ &status); if (U_FAILURE(status) || !utrans) { elog(ERROR, "utrans_open failed: %s", u_errorName(status)); } else { cached_utrans_id = MemoryContextStrdup(TopMemoryContext, input_id); } } ulen = icu_to_uchar(&utext, text_to_cstring(arg1), len1); /* utext is terminated by a zero UChar that we include in the copy. */ original = (UChar*) palloc((ulen+1)*sizeof(UChar)); original_ulen = ulen; memcpy(original, utext, (ulen+1)*sizeof(UChar)); limit = ulen; capacity = ulen + 1; start = 0; /* * utrans_transUChars() updates the string in-place, stopping if * it would go over `capacity`. * The following loop doubles the capacity and restarts from * scratch with a clean copy of the source if the buffer was * too small. * Although it looks like we could use `start` and `limit` * to reallocate and make the transliteration continue from * where it stopped, in practice this does not appear to work. * The documentation is quite unclear about this function. 
*/ do { status = U_ZERO_ERROR; utrans_transUChars(utrans, utext, &ulen, capacity, start, /* beginning index */ &limit, &status); if (U_FAILURE(status)) { if (status != U_BUFFER_OVERFLOW_ERROR) { elog(ERROR, "utrans_transUChars failed: %s", u_errorName(status)); } else { pfree(utext); capacity = capacity * 2; utext = (UChar*) palloc(capacity*sizeof(UChar)); /* restore the original text in the enlarged buffer */ ulen = original_ulen; limit = ulen; memcpy(utext, original, (ulen+1)*sizeof(UChar)); } } else done = true; } while (!done); result_len = icu_from_uchar(&result, utext, ulen); PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len)); } icu_ext-1.6.2/sql/000077500000000000000000000000001376047124100137645ustar00rootroot00000000000000icu_ext-1.6.2/sql/icu_ext--1.0--1.1.sql000066400000000000000000000022221376047124100170660ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.1'" to load this file. 
\quit

-- Functions added by the upgrade to version 1.1.
-- 'MODULE_PATHNAME' with no second string means that the C symbol has the
-- same name as the SQL function.

-- Name of a Unicode character.
CREATE FUNCTION icu_char_name(
 c character
) RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;

COMMENT ON FUNCTION icu_char_name(character)
IS 'Return the Unicode character name corresponding to the first codepoint of the input';

-- Number spelled out in words for a locale.
CREATE FUNCTION icu_number_spellout(
 num float8,
 locale text
) RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;

COMMENT ON FUNCTION icu_number_spellout(float8,text)
IS 'Spell out the number according to the given locale';

-- Spoof detection on a single string.
CREATE FUNCTION icu_spoof_check(
 str text
) RETURNS boolean
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_spoof_check(text)
IS 'Check whether the argument is likely to be an attempt at confusing a reader';

-- Confusability test between two strings.
CREATE FUNCTION icu_confusable_strings_check(
 str1 text,
 str2 text
) RETURNS boolean
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_confusable_strings_check(text,text)
IS 'Check whether the arguments are visually confusable with each other';

-- NOTE(review): the text at the start of the next line is the archive member
-- header of sql/icu_ext--1.1--1.2.sql, not SQL.
icu_ext-1.6.2/sql/icu_ext--1.1--1.2.sql000066400000000000000000000011251376047124100170710ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.2'" to load this file.
\quit

-- Functions added by the upgrade to version 1.2.

-- Set-returning function: one row per available transform.
CREATE FUNCTION icu_transforms_list(
) RETURNS SETOF text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_transforms_list()
IS 'List the basic transforms available to icu_transform';

-- Apply the transform named by "trans" to "string".
CREATE FUNCTION icu_transform(string text, trans text)
RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_transform(text,text)
IS 'Apply a transformation through basic or compound transliterators and filters';

-- NOTE(review): the text at the start of the next line is the archive member
-- header of sql/icu_ext--1.2--1.3.sql, not SQL.
icu_ext-1.6.2/sql/icu_ext--1.2--1.3.sql000066400000000000000000000024101376047124100170710ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.3'" to load this file.
\quit

-- Upgrade to version 1.3: redefine icu_sort_key and icu_compare with
-- CREATE OR REPLACE. All four are declared IMMUTABLE and PARALLEL SAFE.

-- Two-argument form: the C symbol is icu_sort_key_coll.
CREATE OR REPLACE FUNCTION icu_sort_key(
 str text,
 collator text
) RETURNS bytea
AS 'MODULE_PATHNAME', 'icu_sort_key_coll'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 10;

COMMENT ON FUNCTION icu_sort_key(text,text)
IS 'Compute the binary sort key for the string given the collation';

-- One-argument form: no collator name is passed.
CREATE OR REPLACE FUNCTION icu_sort_key(
 str text
) RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 10;

COMMENT ON FUNCTION icu_sort_key(text)
IS 'Compute the binary sort key with the collate of the string';

-- Two-argument form: no collator name is passed.
CREATE OR REPLACE FUNCTION icu_compare(
 str1 text,
 str2 text
) RETURNS int
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_compare(text,text)
IS 'Compare two strings with their ICU collation and return a signed int like strcoll';

-- Three-argument form: the C symbol is icu_compare_coll.
CREATE OR REPLACE FUNCTION icu_compare(
 str1 text,
 str2 text,
 collator text
) RETURNS int
AS 'MODULE_PATHNAME', 'icu_compare_coll'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_compare(text,text,text)
IS 'Compare two strings with the given collation and return a signed int like strcoll';

-- NOTE(review): the text at the start of the next line is the archive member
-- header of sql/icu_ext--1.3--1.4.sql, not SQL.
icu_ext-1.6.2/sql/icu_ext--1.3--1.4.sql000066400000000000000000000016201376047124100170750ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.4'" to load this file.
\quit

-- Upgrade to version 1.4: collation-aware search and replace.
-- The argument names "substring", "from" and "to" are double-quoted
-- because they are SQL keywords.
-- NOTE(review): unlike the other scripts, no COMMENT ON is attached to
-- these four functions.

-- Position of "substring" in string; the C symbol is icu_strpos.
CREATE OR REPLACE FUNCTION icu_strpos(
 string text,
 "substring" text
) RETURNS int4
AS 'MODULE_PATHNAME', 'icu_strpos'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 10;

-- Same, with the collator name as 3rd argument; the C symbol is icu_strpos_coll.
CREATE OR REPLACE FUNCTION icu_strpos(
 string text,
 "substring" text,
 collator text
) RETURNS int4
AS 'MODULE_PATHNAME', 'icu_strpos_coll'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 10;

-- Replace "from" by "to" in string; the C symbol is icu_replace.
CREATE OR REPLACE FUNCTION icu_replace(
 string text,
 "from" text,
 "to" text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_replace'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 100;

-- Same, with the collator name as 4th argument; the C symbol is icu_replace_coll.
CREATE OR REPLACE FUNCTION icu_replace(
 string text,
 "from" text,
 "to" text,
 collator text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_replace_coll'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 100;

-- NOTE(review): the text at the start of the next line is the archive member
-- header of sql/icu_ext--1.3.sql (the install script), not SQL.
icu_ext-1.6.2/sql/icu_ext--1.3.sql000066400000000000000000000131411376047124100165210ustar00rootroot00000000000000/* icu_ext.sql */
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION icu_ext" to load this file.
\quit

-- Install script: functions created by CREATE EXTENSION icu_ext.

-- Library and Unicode versions. These functions take no argument and are
-- not declared STRICT.
CREATE FUNCTION icu_version() RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C;

COMMENT ON FUNCTION icu_version()
IS 'Version of the ICU library currently in use';

CREATE FUNCTION icu_unicode_version() RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C;

COMMENT ON FUNCTION icu_unicode_version()
IS 'Version of the Unicode standard used by ICU';

-- One (attribute, value) row per collation attribute.
-- exclude_defaults is optional and defaults to false.
CREATE FUNCTION icu_collation_attributes(
 IN collator text,
 IN exclude_defaults bool default false,
 OUT attribute text,
 OUT value text
) RETURNS SETOF record
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_collation_attributes(text,bool)
IS 'List the attributes of an ICU collation';

-- One row per locale, with only OUT columns.
CREATE FUNCTION icu_locales_list (
 OUT name text,
 OUT country text,
 OUT country_code text,
 OUT language text,
 OUT language_code text,
 OUT script text,
 OUT direction text
) RETURNS SETOF record
AS 'MODULE_PATHNAME'
LANGUAGE C;

COMMENT ON FUNCTION icu_locales_list()
IS 'List the available ICU locales with their main properties';

CREATE FUNCTION icu_default_locale() RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C;

COMMENT ON FUNCTION icu_default_locale()
IS 'Return the ICU locale currently used by default';

/* Set the default locale to some name and return the canonicalized name.
*/
CREATE FUNCTION icu_set_default_locale(text) RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_set_default_locale(text)
IS 'Set the ICU locale used by default';

/* See http://userguide.icu-project.org/boundaryanalysis */
-- The four boundary functions below are set-returning. All but
-- icu_character_boundaries return (tag, contents) rows.
CREATE FUNCTION icu_character_boundaries(
 contents text,
 locale text
) RETURNS SETOF text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_character_boundaries(text,text)
IS 'Split text into characters, using boundary positions according to Unicode rules with the specified locale';

CREATE FUNCTION icu_word_boundaries(
 contents text,
 locale text,
 OUT tag int,
 OUT contents text
) RETURNS SETOF record
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_word_boundaries(text,text)
IS 'Split text into words, using boundary positions according to Unicode rules with the specified locale';

CREATE FUNCTION icu_line_boundaries(
 contents text,
 locale text,
 OUT tag int,
 OUT contents text
) RETURNS SETOF record
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_line_boundaries(text,text)
IS 'Split text into parts between which line breaks may occur, using rules of the specified locale';

CREATE FUNCTION icu_sentence_boundaries(
 contents text,
 locale text,
 OUT tag int,
 OUT contents text
) RETURNS SETOF record
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_sentence_boundaries(text,text)
IS 'Split text into sentences, according to Unicode rules with the specified locale';

-- Comparison functions. The three-argument icu_compare maps to the C symbol
-- icu_compare_coll.
CREATE FUNCTION icu_compare(
 str1 text,
 str2 text
) RETURNS int
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_compare(text,text)
IS 'Compare two strings with their ICU collation and return a signed int like strcoll';

CREATE FUNCTION icu_compare(
 str1 text,
 str2 text,
 collator text
) RETURNS int
AS 'MODULE_PATHNAME', 'icu_compare_coll'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_compare(text,text,text)
IS 'Compare two strings with the given collation 
and return a signed int like strcoll';

CREATE FUNCTION icu_case_compare(
 str1 text,
 str2 text
) RETURNS int
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_case_compare(text,text)
IS 'Compare two strings case-insensitively using full case folding';

-- Sort keys. The two-argument form maps to the C symbol icu_sort_key_coll.
CREATE FUNCTION icu_sort_key(
 str text,
 collator text
) RETURNS bytea
AS 'MODULE_PATHNAME', 'icu_sort_key_coll'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 10;

COMMENT ON FUNCTION icu_sort_key(text,text)
IS 'Compute the binary sort key for the string given the collation';

CREATE FUNCTION icu_sort_key(
 str text
) RETURNS bytea
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE
COST 10;

COMMENT ON FUNCTION icu_sort_key(text)
IS 'Compute the binary sort key with the collate of the string';

-- The statements from here to icu_transform repeat the ones of the 1.0--1.1
-- and 1.1--1.2 upgrade scripts.
CREATE FUNCTION icu_char_name(
 c character
) RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;

COMMENT ON FUNCTION icu_char_name(character)
IS 'Return the Unicode character name corresponding to the first codepoint of the input';

CREATE FUNCTION icu_number_spellout(
 num float8,
 locale text
) RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE;

COMMENT ON FUNCTION icu_number_spellout(float8,text)
IS 'Spell out the number according to the given locale';

CREATE FUNCTION icu_spoof_check(
 str text
) RETURNS boolean
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_spoof_check(text)
IS 'Check whether the argument is likely to be an attempt at confusing a reader';

CREATE FUNCTION icu_confusable_strings_check(
 str1 text,
 str2 text
) RETURNS boolean
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT STABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_confusable_strings_check(text,text)
IS 'Check whether the arguments are visually confusable with each other';

CREATE FUNCTION icu_transforms_list(
) RETURNS SETOF text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_transforms_list()
IS 'List the basic transforms available to icu_transform';

CREATE FUNCTION icu_transform(string text, trans text)
RETURNS text
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;

COMMENT ON FUNCTION icu_transform(text,text)
IS 'Apply a transformation through basic or compound transliterators and filters';

-- NOTE(review): each line below that starts with "icu_ext-1.6.2/sql/" begins
-- with the archive member header of the named script, not SQL.

-- Upgrade to 1.5: the script contains only the psql guard, no SQL statement.
icu_ext-1.6.2/sql/icu_ext--1.4--1.5.sql000066400000000000000000000002361376047124100171010ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.5'" to load this file.
\quit

-- Upgrade to 1.6: Unicode normalization functions.
icu_ext-1.6.2/sql/icu_ext--1.5--1.6.sql000066400000000000000000000013611376047124100171030ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.6'" to load this file.
\quit

CREATE OR REPLACE FUNCTION icu_normalize(
 string text,
 form text
) RETURNS text
AS 'MODULE_PATHNAME', 'icu_normalize'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_normalize(text,text)
IS 'Normalize the string into one of NFC, NFD, NFKC or NFKD Unicode forms';

CREATE OR REPLACE FUNCTION icu_is_normalized(
 string text,
 form text
) RETURNS bool
AS 'MODULE_PATHNAME', 'icu_is_normalized'
LANGUAGE C STRICT IMMUTABLE PARALLEL SAFE;

COMMENT ON FUNCTION icu_is_normalized(text,text)
IS 'Test if the string is normalized in one of NFC, NFD, NFKC or NFKD Unicode forms';

-- Upgrade to 1.6.1: the script contains only the psql guard, no SQL statement.
icu_ext-1.6.2/sql/icu_ext--1.6--1.6.1.sql000066400000000000000000000002401376047124100172360ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.6.1'" to load this file.
\quit

-- Upgrade to 1.6.2: psql guard of the script.
icu_ext-1.6.2/sql/icu_ext--1.6.1--1.6.2.sql000066400000000000000000000002401376047124100173760ustar00rootroot00000000000000-- complain if script is sourced in psql, rather than via CREATE/ALTER EXTENSION
\echo Use "ALTER EXTENSION icu_ext UPDATE TO '1.6.2'" to load this file.
\quit

-- NOTE(review): the text at the start of the next line is the archive member
-- header of sql/tests-01.sql, not SQL.
icu_ext-1.6.2/sql/tests-01.sql000066400000000000000000000041221376047124100160640ustar00rootroot00000000000000-- regression tests for icu_ext

CREATE EXTENSION icu_ext;

-- Check that the database has the built-in ICU collations
-- required by the tests
SELECT collname FROM pg_collation WHERE collname IN ('und-x-icu', 'en-x-icu')
ORDER BY collname;

-- icu_char_name
-- (one row per character of the input, with its code point in hexadecimal)
SELECT c, to_hex(ascii(c)), icu_char_name(c)
FROM regexp_split_to_table('El Niño', '') as c;

-- icu_character_boundaries
-- (the input ends with 'e' followed by U+0301)
SELECT * FROM icu_character_boundaries('Ete'||E'\u0301', 'fr') as chars;

-- icu_collation_attributes
-- (the 'version' attribute is filtered out of the output)
SELECT * FROM icu_collation_attributes('en') WHERE attribute <> 'version';

-- icu_compare
-- (first with a collator name as argument, then with a COLLATE clause)
SELECT icu_compare('abcé', 'abce', 'en@colStrength=primary;colCaseLevel=yes');
SELECT icu_compare('Abcé', 'abce' COLLATE "en-x-icu");

-- icu_confusable_strings_check
-- (variants of 'phil' differing only by the last or third character)
SELECT txt, icu_confusable_strings_check('phil', txt) AS confusable
FROM (VALUES ('phiL'), ('phiI'), ('phi1'), (E'ph\u0131l')) AS s(txt);

-- icu_line_boundaries
-- (convert_to() shows the bytes of each returned part)
SELECT *,convert_to( contents, 'utf-8')
FROM icu_line_boundaries(
$$Thus much let me avow You are not wrong, who deem That my days have been a dream; Yet if hope has flown away In a night, or in a day,$$
, 'en');

-- icu_number_spellout
-- (the same number in five locales)
SELECT loc, icu_number_spellout(1234, loc)
FROM (values ('en'),('fr'),('de'),('ru'),('ja')) AS s(loc);

-- icu_replace
-- (ORDER BY with the "C" collation keeps the row order independent of the locale)
SELECT n,
 icu_replace(
 n,
 'jeanrene',
 '{firstname}',
 'und@colStrength=primary;colAlternate=shifted')
FROM (values('jeanrenédupont'),('Jean-René Dupont')) as s(n)
ORDER BY n COLLATE "C";

-- icu_sentence_boundaries
SELECT * FROM icu_sentence_boundaries('Call me Mr. Brown. 
It''s a movie.',
 'en@ss=standard');

-- icu_strpos
-- (the needles include the empty string and NULL)
SELECT v,icu_strpos('hey rene', v, 'und@colStrength=primary;colAlternate=shifted')
FROM (VALUES ('René'), ('rené'), ('Rene'), ('n'), ('në'), ('no'), (''), (null))
 AS s(v)
ORDER BY v COLLATE "C";

-- icu_transform
-- (the first argument is made of two adjacent string literals: they must stay
-- on separate lines to be concatenated by the SQL parser)
SELECT icu_transform('10\N{SUPERSCRIPT MINUS}\N{SUPERSCRIPT FOUR}'
 '\N{MICRO SIGN}m = 1 \N{ANGSTROM SIGN}',
 'Name-Any');

SELECT icu_transform('Ich muß essen.', '[:^ascii:]; Hex');

-- icu_word_boundaries
SELECT * FROM icu_word_boundaries($$Do you like O'Reilly books?$$, 'en');