phosh-osk-data-0.42.0/
phosh-osk-data-0.42.0/.gitignore
input/
out/
data/
output/
debian/.debhelper/
debian/debhelper-build-stamp
debian/files
debian/phosh-osk-data-packager/
debian/phosh-osk-data-packager.substvars
doc/*.1
phosh-osk-data-0.42.0/.gitlab-ci.yml
include:
 - remote: 'https://gitlab.freedesktop.org/freedesktop/ci-templates/-/raw/34039cd573a2df832d465bc9e4c5f543571f5241/templates/ci-fairy.yml'

stages:
  - build
  - deploy

default:
  # Protect CI infra from rogue jobs
  timeout: 15 minutes
  # Allow jobs to be canceled on new commits
  interruptible: true
  # Retry on infra hiccups automatically
  retry:
    max: 1
    when:
      - 'api_failure'
      - 'runner_system_failure'
      - 'scheduler_failure'
      - 'stuck_or_timeout_failure'

variables:
  # For ci-fairy
  FDO_UPSTREAM_REPO: guidog/phosh-osk-data
  DEBIAN_IMAGE: $CI_REGISTRY/guidog/phosh-osk-stub/debian:v0.0.2024-06-19

.prep: &prep
  before_script:
    - apt-get -y update
    - apt-get -y build-dep .

.step: &build_step
  script:
    - mkdir -p out/
    - make check
    - dpkg-buildpackage -uc -us -rfakeroot -A
    - 'cp ../phosh-osk-data-*_$(dpkg-parsechangelog -SVersion)_all.deb out/'

# Sanity checks of MR settings and commit logs
sanity:
  extends:
    - .fdo.ci-fairy
  stage: build
  script: |
    if [ -n "$CI_OPEN_MERGE_REQUESTS" ]; then
      ci-fairy check-commits --junit-xml=commit-message-junit-report.xml cifairy/main..HEAD ;
    else
      echo "Not a merge request" ;
    fi
  artifacts:
    reports:
      junit: commit-message-junit-report.xml
  only:
    variables:
      - $CI_OPEN_MERGE_REQUESTS && $PKG_ONLY != "1"

build:native-debian-trixie:
  stage: build
  image: ${DEBIAN_IMAGE}
  <<: *prep
  <<: *build_step
  artifacts:
    paths:
      - out/
phosh-osk-data-0.42.0/Makefile
PY_SCRIPTS = \
	pod-db-from-wiki-dump \
	phosh-osk-data-packager \
	$(NULL)

check:
	flake8 --format=pylint $(PY_SCRIPTS)

man:
	$(MAKE) -C doc

clean:
	$(MAKE) -C doc clean
phosh-osk-data-0.42.0/NEWS
phosh-osk-data 0.42.0
---------------------
Released: October 2024
* Move to databases built from Wikipedia dumps
* Allow building packages from the db files for local installation

phosh-osk-data 0.25.0
---------------------
Released: February 2023
* Initial release with data for de, it and sv
phosh-osk-data-0.42.0/README.md
# phosh osk data

Scripts to build word prediction data for [phosh-osk-stub][] and other
presage based completers.

The aim here is to have models that are distributable without licensing
issues and that use modern language, so we build them from Wikipedia dumps.

## Building your own dictionaries based on Wikipedia data

Get a host with enough disk space (~40G); more cores make the first steps
(extraction and parsing into sentences) significantly faster.

You can then provision it with the provided ansible playbook on your cloud
provider of choice:

```sh
ansible-playbook -v -i "${BUILDER}", -u root builder/setup.yml
```

`${BUILDER}` is the IP or hostname of the host to provision.
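For example, with a hypothetical builder at `203.0.113.10` (a placeholder
address, substitute your own host) the invocation looks like this; the
trailing comma after the host makes ansible treat the value as an inline
inventory rather than an inventory file:

```sh
# Placeholder address - replace with your builder's IP or hostname
BUILDER=203.0.113.10
ansible-playbook -v -i "${BUILDER}", -u root builder/setup.yml
```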
Once there, get the Wikipedia dump:

```sh
ssh ${BUILDER}
cd output/
export LANG=es
wget "https://dumps.wikimedia.org/${LANG}wiki/latest/${LANG}wiki-latest-pages-articles.xml.bz2"
```

Import some nltk data:

```
python3 -c "import nltk; nltk.download('punkt')"
```

Process the dump:

```
./pod-db-from-wiki-dump --processes 4 --language "${LANG}" --dump "output/${LANG}wiki-latest-pages-articles.xml.bz2" --output "output/${LANG}"
```

You'll then get a database usable by presage based completers in
`output/${LANG}/database_${LANG}.db`.

The processing happens in steps, so should a step fail you can skip the
already completed steps in subsequent runs. See the `--skip-*` options.
The extraction and parsing steps run in parallel and can be spread over
multiple cores (default `8`).

## Installing the data

See the [phosh-osk-data-packager manpage](doc/phosh-osk-data-packager.rst).

## Related projects

- presage:
- sfos presage databases:
- phosh-osk-stub:

[phosh-osk-stub]: https://gitlab.gnome.org/guidog/phosh-osk-stub
phosh-osk-data-0.42.0/builder/
phosh-osk-data-0.42.0/builder/setup.yml
- name: Setup Phosh OSK Data wiki builder
  gather_facts: false
  hosts: all
  vars:
    pod_user: pod-builder
    pod_home: "/home/pod-builder"
    wikiextractorpkg: "wikiextractor_3.0.7-1_all.deb"
  tasks:
    - name: Add packages
      ansible.builtin.apt:
        pkg:
          - htop
          - kitty-terminfo
          - python3-nltk
          - python3-tqdm
          - screen
          - sqlite3
          - vim-nox
          - wget
    - name: Add user
      ansible.builtin.user:
        name: "{{ pod_user }}"
        system: true
        create_home: true
        shell: /usr/sbin/nologin
        home: "{{ pod_home }}"
    - name: Download wikiextractor until it is packaged in Debian
      ansible.builtin.get_url:
        url: "https://people.debian.org/~agx/wikiextractor/{{ wikiextractorpkg }}"
        dest: "{{ pod_home }}/{{ wikiextractorpkg }}"
    - name: Install wikiextractor
      ansible.builtin.command: dpkg -i "{{ pod_home }}/{{ wikiextractorpkg }}"
    - name: Copy script
      ansible.builtin.copy:
        src: ../pod-db-from-wiki-dump
        dest: "{{ pod_home }}/pod-db-from-wiki-dump"
        owner: pod-builder
        mode: '0755'
    - name: Create output directory
      ansible.builtin.file:
        path: "{{ pod_home }}/output"
        state: directory
        owner: "{{ pod_user }}"
    - name: Check for wiki data volume
      ansible.builtin.stat:
        path: "/dev/disk/by-label/wiki-data"
      register: d
    - name: Mount data volume
      ansible.posix.mount:
        path: "{{ pod_home }}/output"
        src: "/dev/disk/by-label/wiki-data"
        state: mounted
        fstype: ext4
      when: d.stat.islnk is defined and d.stat.islnk
    - name: Allow access to output dir
      ansible.builtin.file:
        path: "{{ pod_home }}/output"
        state: directory
        owner: "{{ pod_user }}"
phosh-osk-data-0.42.0/data/
phosh-osk-data-0.42.0/data/.gitkeep
phosh-osk-data-0.42.0/debian/
phosh-osk-data-0.42.0/debian/changelog
phosh-osk-data (0.42.0) experimental; urgency=medium

  * Release 0.42.0

 -- Guido Günther  Mon, 21 Oct 2024 23:00:26 +0200

phosh-osk-data (0.42.0~rc1) experimental; urgency=medium

  * build: Clean tmp/ too
  * ci: Use a more recent image
  * treewide: Allow using Wikipedia dumps to build the corpus. This allows
    us to use DFSG free and modern CC BY-SA 4.0 data. We also provide a
    small ansible role to set up a host for that and a script that
    downloads and packages the corpus into a distribution package
    (currently deb only).

 -- Guido Günther  Wed, 09 Oct 2024 15:53:28 +0200

phosh-osk-data (0.25.0) experimental; urgency=medium

  * Initial release.

 -- Guido Günther  Tue, 28 Feb 2023 15:57:15 +0100
phosh-osk-data-0.42.0/debian/control
Source: phosh-osk-data
Section: contrib/text
Priority: optional
Maintainer: Guido Günther
Rules-Requires-Root: no
Build-Depends:
 debhelper-compat (= 13),
 flake8,
 python3-docutils,
 python3-nltk,
 wget,
Standards-Version: 4.7.0

Package: phosh-osk-data-packager
Architecture: all
Depends:
 python3-requests,
 python3-tqdm,
 ${shlibs:Depends},
 ${misc:Depends},
Recommends:
 pkexec,
Suggests:
 phosh-osk-stub,
Description: OSK completion data packager for Phosh
 Data to enable text completion in phosh-osk-stub via presage.
 .
 This package contains a script to download and package data files.
phosh-osk-data-0.42.0/debian/copyright
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Source: https://gitlab.gnome.org/guidog/phosh-osk-data
Upstream-Name: phosh-osk-data
Upstream-Contact: Guido Günther

Files: *
Copyright: 2024 Guido Günther
License: GPL-2+
 This package is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation; either version 2 of the License, or
 (at your option) any later version.
 .
 This package is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.
 .
 You should have received a copy of the GNU General Public License
 along with this program. If not, see
Comment: On Debian systems, the complete text of the GNU General Public
 License version 2 can be found in "/usr/share/common-licenses/GPL-2".
phosh-osk-data-0.42.0/debian/gbp.conf
[DEFAULT]
debian-branch = main
debian-tag = v%(version)s
debian-tag-msg = %(pkg)s v%(version)s

[tag]
sign-tags = true
phosh-osk-data-0.42.0/debian/phosh-osk-data-packager.install
./phosh-osk-data-packager /usr/bin/
phosh-osk-data-0.42.0/debian/phosh-osk-data-packager.manpages
doc/*.1
phosh-osk-data-0.42.0/debian/rules
#!/usr/bin/make -f

%:
	dh $@

override_dh_auto_build:
	make man
phosh-osk-data-0.42.0/debian/source/
phosh-osk-data-0.42.0/debian/source/format
3.0 (native)
phosh-osk-data-0.42.0/doc/
phosh-osk-data-0.42.0/doc/Makefile
MANPAGES = \
	phosh-osk-data-packager.1 \
	$(NULL)

%.1: %.rst
	rst2man --syntax-highlight=none $< > $@.tmp
	mv $@.tmp $@

doc: $(MANPAGES)

clean:
	rm -f *.1
phosh-osk-data-0.42.0/doc/phosh-osk-data-packager.rst
.. _phosh-osk-data-packager(1):

=======================
phosh-osk-data-packager
=======================

-----------------------------
Download and package OSK data
-----------------------------

SYNOPSIS
--------
| **phosh-osk-data-packager** [OPTIONS...]

DESCRIPTION
-----------

``phosh-osk-data-packager`` downloads and packages data for Phosh's on screen
keyboards (OSKs). It currently supports downloading and packaging data for
completion using the presage library.

OPTIONS
-------

``-h``, ``--help``

   Print help and exit

``--language=LANGUAGE``

   The language code of the language to process. E.g. `de` or `se`.

``--engine=ENGINE``

   The completion engine to download data for. The only currently supported
   engine is `presage` using its SQLite backend.

``--pkg=FORMAT``

   The packaging format to build a package for. The only currently supported
   format is `deb`, building a package for Debian based distributions.

``--install``

   Whether to install the package

EXAMPLE
-------

This downloads and packages data for Swedish and puts the resulting package
into the current directory:

::

   ./phosh-osk-data-packager --language=se

SEE ALSO
--------

``phosh-osk-stub(1)``
phosh-osk-data-0.42.0/phosh-osk-data-packager
#!/usr/bin/python3
#
# Copyright (C) The Phosh Developers
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Author: Guido Günther
#
# Build a package for the given OSK data

import sys
import argparse
import subprocess
import requests
import shutil
from tempfile import TemporaryDirectory
from tqdm import tqdm
from pathlib import Path

VERSION = '0.42.0'
URL = 'https://data.phosh.mobi/osk-data'


def build_deb(dbfile, dir, engine, lang):
    pkgname = f'phosh-osk-data-{lang}'
    pkgdir = Path(dir) / pkgname
    dbdir = pkgdir / 'usr' / 'share' / 'phosh' / 'osk' / engine
    debian = pkgdir / 'DEBIAN'
    control = debian / 'control'

    dbdir.mkdir(parents=True)
    debian.mkdir(parents=True)
    shutil.move(dbfile, dbdir)

    with open(control, 'w') as f:
        f.write(f"""Package: {pkgname}
Source: phosh-osk-data
Version: {VERSION}-1
Architecture: all
Maintainer: Guido Günther
Suggests: phosh-osk-stub
Breaks: phosh-osk-data-eu (<< 0.42)
Replaces: phosh-osk-data-eu (<< 0.42)
Section: text
Priority: optional
Description: OSK completion data for phosh - {lang}
 Data to enable text completion in phosh-osk-stub via {engine}.
 .
 This package contains the data files for {lang}.
 .
 It was generated via pod-build-pkg.
""")

    subprocess.check_call(["dpkg-deb", '-b', pkgdir, dir])
    deb = Path(dir) / f'{pkgname}_{VERSION}-1_all.deb'
    if not deb.exists():
        raise Exception(f"Deb {deb} not created")
    return deb


def download_db(dir, engine, lang):
    dbname = f'database_{lang}.db'
    dbfile = Path(dir) / dbname
    url = URL + f'/{VERSION}/{engine}/{dbname}'

    response = requests.get(url, stream=True)
    if response.status_code == 404:
        print(f"No database found for '{lang}'", file=sys.stderr)
        return None
    response.raise_for_status()

    total = int(response.headers.get("content-length", 0))
    with tqdm(total=total, unit="B", unit_scale=True) as progress_bar:
        with open(dbfile, "wb") as f:
            for data in response.iter_content(4096):
                progress_bar.update(len(data))
                f.write(data)
    return dbfile


def install_deb(pkg):
    try:
        subprocess.check_call(["pkexec", "dpkg", "-i", str(pkg)])
    except Exception as e:
        print(f"Failed to install package: {e}", file=sys.stderr)


def main():
    parser = argparse.ArgumentParser(
        description="Download and build a package for the given OSK data"
    )
    parser.add_argument(
        "--language", type=str, default="en", help="Language to use"
    )
    parser.add_argument(
        "--engine", type=str, default='presage', choices=['presage'],
        help="Completion engine"
    )
    parser.add_argument(
        "--pkg", type=str, default='deb', choices=['deb'], help="Package format"
    )
    parser.add_argument(
        "--install", action="store_true", help="Install the built package"
    )
    args = parser.parse_args()

    with TemporaryDirectory(prefix="pod-build-pkg") as dir:
        dbfile = download_db(dir, args.engine, args.language)
        if not dbfile:
            return 1

        if args.pkg == 'deb':
            pkg = build_deb(dbfile, dir, args.engine, args.language)
        else:
            print(f"Unsupported packaging format '{args.pkg}'", file=sys.stderr)
            return 1

        if args.install:
            if args.pkg == 'deb':
                pkg = install_deb(pkg)
            else:
                print(f"Don't know how to install '{args.pkg}'", file=sys.stderr)
                return 1
        else:
            shutil.move(pkg, '.')

    return 0


if __name__ == "__main__":
    sys.exit(main())
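A quick way to sanity-check a package produced by the script above is to
inspect it with dpkg-deb; the file name and install path below follow what
build_deb generates, shown here for a hypothetical `de` build:

```sh
# Inspect the generated package metadata and contents
dpkg-deb --info phosh-osk-data-de_0.42.0-1_all.deb
dpkg-deb --contents phosh-osk-data-de_0.42.0-1_all.deb
# The database should show up as /usr/share/phosh/osk/presage/database_de.db
```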
phosh-osk-data-0.42.0/pod-db-from-wiki-dump
#!/usr/bin/python3
#
# Copyright (C) The Phosh Developers
#
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Author: Guido Günther
#
# The extraction is based on the MIT licensed
# https://github.com/mpoyraz/ngram-lm-wiki

import fileinput
import os
import sys
import re
import json
import argparse
import random
import subprocess
import sqlite3
from tqdm import tqdm
from multiprocessing import Pool
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import ngrams
from pathlib import Path
from collections import defaultdict

# Tokenize to sentences
tokenize_fn = None
# Lowercase a sentence
lower_fn = None
# Drop an unwanted word when counting n-grams
drop_word_fn = None

# Chars to remove from wiki data
chars_to_remove_regex = r"[#$%&()*+,-./:;<=>?@\[\]^_{|}~!\"\\]"
apostrophes = "[’`´ʹʻʼʽʿˈ‘]"

# Maximum n-gram count
max_ngrams = 3


def parse_sentences_from_wiki_json_file(fpath):
    with open(fpath) as fp:
        texts = [json.loads(line.strip())["text"] for line in fp]

    # Sentences from paragraphs
    sentences = []
    for text in texts:
        for sent in tokenize_fn(text):
            # Lower the sentence
            sent = lower_fn(sent)
            # Remove pre-defined chars
            sent = re.sub(chars_to_remove_regex, "", sent)
            # Unify apostrophes
            sent = re.sub(apostrophes, "'", sent)
            # Remove multiple spaces
            sent = re.sub(r"\s+", " ", sent)
            # Append
            if len(sent) > 0:
                sentences.append(sent)
    return sentences


def extract_wiki_dump(extract_dir, wiki_dump, n_procs):
    extractor = [
        "wikiextractor",
        wiki_dump,
        "-o",
        extract_dir,
        "--no-templates",
        "--json",
        "--processes",
        str(n_procs),
    ]
    subprocess.check_call(extractor)


def parse_sentences(sentence_file, extract_dir, n_files, n_procs):
    # Paths of the extracted wiki files
    dirs = list(extract_dir.glob("[A-Z][A-Z]"))
    filepaths = []
    for i in range(n_files):
        dir = random.choice(dirs)
        subdir = list(dir.glob("wiki_??"))
        f = random.choice(subdir)
        print(f)
        filepaths.append(f)

    with open(sentence_file, "w") as f:
        # Load each wiki file and parse sentences
        with Pool(n_procs) as pool:
            n_sentences = 0
            for sentences in tqdm(
                    pool.imap(parse_sentences_from_wiki_json_file, filepaths),
                    total=n_files):
                for sent in sentences:
                    f.write(f"{sent}\n")
                n_sentences += len(sentences)
    print("Number of extracted sentences: {}".format(n_sentences))


def build_where_clause(words):
    where_clause = "WHERE"
    for i in range(len(words) - 1):
        where_clause += f" word_{len(words) - i - 1} = '{words[i]}' AND"
    where_clause += f" word = '{words[-1]}'"
    return where_clause


def build_ngrams(sentences, ngram_file, db_file):
    if os.path.exists(db_file):
        os.remove(db_file)

    print("Creating database:")
    con = sqlite3.connect(db_file)
    # con.set_trace_callback(print)
    cur = con.cursor()
    for n in range(0, max_ngrams):
        cols = ", ".join([f"word_{i} TEXT" for i in reversed(range(n + 1))]).replace(
            "_0", ""
        )
        constraints = "UNIQUE({})".format(
            ", ".join([f"word_{i}" for i in range(n + 1)])).replace("_0", "")
        table = f"_{n + 1}_gram"
        cur.execute(f"CREATE TABLE {table}({cols}, count INTEGER, {constraints})")

    print("Filling database tables:")
    i = 0
    for sentence in tqdm(sentences):
        counts = defaultdict(int)
        tokens = word_tokenize(sentence)
        for n in range(1, max_ngrams + 1):
            n_grams = ngrams(tokens, n)
            for n_gram in n_grams:
                for word in n_gram:
                    if drop_word_fn(word):
                        break
                else:
                    counts[n_gram] += 1

        # Insert after each sentence to keep memory usage under control
        for key, count in counts.items():
            table = f"_{len(key)}_gram"
            words = ",".join([f"'{word}'" for word in key])
            where = build_where_clause(key)
            query = f"SELECT count FROM {table} {where}"
            try:
                res = cur.execute(query).fetchone()
            except Exception:
                print("Statement failed: %s", query)
                raise
            if res:
                count = res[-1] + 1
                stmt = f"UPDATE {table} SET count = {count} {where}"
            else:
                stmt = f"INSERT INTO {table} VALUES ({words}, {count})"
            try:
                cur.execute(stmt)
            except Exception:
                print("Statement failed: %s", stmt)
                raise
        if (i % 100000 == 0):
            con.commit()
        i += 1
    con.commit()

    # Create index
    for n in range(0, max_ngrams):
        word_cols = "({})".format(", ".join(
            [f"word_{i}" for i in reversed(range(n + 1))])).replace("_0", "")
        table = f"_{n + 1}_gram"
        index = f"_{n + 1}_index"
        stmt = f"CREATE UNIQUE INDEX {index} ON {table}{word_cols}"
        cur.execute(stmt)
    con.commit()

    cur.execute("pragma optimize")
    con.commit()
    # TODO: drop rare items from tables
    con.execute("VACUUM")
    con.commit()
    con.close()


def main():
    parser = argparse.ArgumentParser(
        description="Build an n-gram database from Wikipedia dumps"
    )
    parser.add_argument(
        "--dump", type=str, required=True, help="Path to a Wikipedia dump"
    )
    parser.add_argument("--output", type=str, required=True, help="Output directory")
    parser.add_argument(
        "--language", type=str, default="de", help="Language of the Wikipedia dump"
    )
    parser.add_argument(
        "--processes", type=int, default=8, help="Number of processes to use"
    )
    parser.add_argument(
        "--skip-extract", default=False, action="store_true",
        help="Skip extracting the wiki data"
    )
    parser.add_argument(
        "--files", type=int, default=10,
        help="Number of wiki files to use to build the DB"
    )
    parser.add_argument(
        "--skip-parse",
        default=False,
        action="store_true",
        help="Skip parsing the extracted wiki data into sentences",
    )
    parser.add_argument(
        "--skip-presage-ngrams",
        default=False,
        action="store_true",
        help="Skip building n-grams from the parsed sentences for presage",
    )
    args = parser.parse_args()

    global tokenize_fn, lower_fn, drop_word_fn
    # Defaults for all languages
    tokenize_fn = sent_tokenize
    lower_fn = lambda x: x.lower()
    drop_word_fn = lambda x: "'" in x
    if args.language in ["de"]:
        tokenize_fn = lambda x: sent_tokenize(x, language="german")

    output_path = Path(args.output)
    extract_dir = output_path / "extract"
    sentence_file = output_path / "sentences.txt"
    ngram_file = output_path / f"n-gram-{args.language}.txt"
    db_file = output_path / f"database_{args.language}.db"

    if not args.skip_extract:
        print("Extracting Wiki source")
        extract_wiki_dump(extract_dir, args.dump, args.processes)

    if not args.skip_parse:
        print("Parsing sentences")
        sentences = parse_sentences(sentence_file, extract_dir, args.files,
                                    args.processes)

    sentences = fileinput.input(sentence_file, encoding="utf-8")

    if not args.skip_presage_ngrams:
        print("Building N-grams")
        build_ngrams(sentences, ngram_file, db_file)

    return 0


if __name__ == "__main__":
    sys.exit(main())
phosh-osk-data-0.42.0/setup.cfg
[flake8]
# E501: ignore line length
# E731: do not use lambda
ignore=E501,E731