pyglossary-4.5.0/.github/ISSUE_TEMPLATE/feature-request.md
---
name: Feature request
about: Suggest/request a feature (new format, option, parameter etc)
title: ''
labels:

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is.

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Provide links and sample file(s)**
Provide links to the official website and/or download page of the related software or format.
Provide sample file(s) for the format/feature you want to be supported. Attach the file(s) if you can.
If no sample file is publicly downloadable due to copyright, please mention and explain.

pyglossary-4.5.0/.github/workflows/codeql-analysis.yml
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ master ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ master ]
  schedule:
    - cron: '33 1 * * 3'

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest

    strategy:
      fail-fast: false
      matrix:
        language: [ 'python' ]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
        # Learn more:
        # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed

    steps:
    - name: Checkout repository
      uses: actions/checkout@v2

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v1
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.
        # queries: ./path/to/local/query, your-org/your-repo/queries@main

    # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
      uses: github/codeql-action/autobuild@v1

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl

    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
    #    and modify them (or add more) to build your code if your project
    #    uses a compiled language

    #- run: |
    #   make bootstrap
    #   make release

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v1

pyglossary-4.5.0/.gitignore
*~
*.py[oc]
/build
/dist
/bin
/pyglossary.egg-info
/.mypy_cache/
/plugins
/ui
.coverage
*,cover
htmlcov
vulture.*

pyglossary-4.5.0/AUTHORS
⚫︎ Saeed Rasooli (ilius)

Thanks to:

⚫︎ Kubtek for contributions to the BGL plugin
⚫︎ Xiaoqiang Wang for code contributions in AppleDict, MDict and DSL
⚫︎ Thomas Vogt for fixing several bugs
⚫︎ Raul Fernandes and Karl Grill for reverse engineering the BGL format
⚫︎ Nilton Volpato for https://github.com/niltonvolpato/python-progressbar
⚫︎ Jeff Quast for https://github.com/jquast/wcwidth

The PyGlossary logo is based on the QStarDict logo and the Python logo

pyglossary-4.5.0/CODE_OF_CONDUCT.md
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone.

## Our Standards

Examples of behavior that contributes to creating a positive environment include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an
appointed representative at an online or offline event. Representation of a
project may be further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at saeed.gnu@gmail.com. All complaints
will be reviewed and investigated and will result in a response that is deemed
necessary and appropriate to the circumstances.
The project team is obligated to maintain confidentiality with regard to the
reporter of an incident. Further details of specific enforcement policies may
be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq

pyglossary-4.5.0/Dockerfile
FROM bitnami/minideb
MAINTAINER Saeed Rasooli saeed.gnu@gmail.com
LABEL Description="Dockerfile to run PyGlossary inside a Debian-based Docker image"

COPY . /opt/pyglossary

RUN apt-get update
RUN apt-get install --yes python3
RUN apt-get install --yes python3-pip
RUN apt-get install --yes python3-lxml
RUN apt-get install --yes python3-lzo
RUN apt-get install --yes python3-icu
RUN apt-get install --yes pkg-config
RUN pip3 install prompt_toolkit
RUN pip3 install beautifulsoup4
RUN pip3 install marisa-trie
RUN pip3 install libzim

WORKDIR /root

CMD python3 /opt/pyglossary/main.py --cmd

pyglossary-4.5.0/README.md
# PyGlossary

A tool for converting dictionary files aka glossaries.

The primary purpose is to be able to use our offline glossaries in any Open
Source dictionary we like on any OS/device.

There are countless formats, and my time is limited, so I implement formats
that seem more useful for myself, or for the Open Source community. Also,
diversity of languages is taken into account. Pull requests are welcome.
## Screenshots Linux - Gtk3-based interface ______________________________________________________________________ Windows - Tkinter-based interface ______________________________________________________________________ Linux - command-line interface ______________________________________________________________________ Android Termux - interactive command-line interface ## Supported formats | Format | | Extension | Read | Write | | ------------------------------------------------------- | :-: | :-------------: | :--: | :---: | | [Aard 2 (slob)](./doc/p/aard2_slob.md) | 🔢 | .slob | ✔ | ✔ | | [ABBYY Lingvo DSL](./doc/p/dsl.md) | 📝 | .dsl | ✔ | | | [Almaany.com](./doc/p/almaany.md) (SQLite3, Arabic) | 🔢 | .db | ✔ | | | [AppleDict Binary](./doc/p/appledict_bin.md) | 🔢 | .dictionary | ✔ | ❌ | | [AppleDict Source](./doc/p/appledict.md) | 📁 | | | ✔ | | [Babylon BGL](./doc/p/babylon_bgl.md) | 🔢 | .bgl | ✔ | ❌ | | [CC-CEDICT](./doc/p/cc_cedict.md) (Chinese) | 📝 | | ✔ | ❌ | | [cc-kedict](./doc/p/cc_kedict.md) (Korean) | 📝 | | ✔ | ❌ | | [CSV](./doc/p/csv.md) | 📝 | .csv | ✔ | ✔ | | [Dict.cc](./doc/p/dict_cc.md) (SQLite3, German) | 🔢 | .db | ✔ | | | [DICT.org / Dictd server](./doc/p/dict_org.md) | 📁 | (📝.index) | ✔ | ✔ | | [DICT.org / dictfmt source](./doc/p/dict_org_source.md) | 📝 | (.dtxt) | | ✔ | | [dictunformat output file](./doc/p/dictunformat.md) | 📝 | (.dictunformat) | ✔ | | | [DictionaryForMIDs](./doc/p/dicformids.md) | 📁 | (📁.mids) | ✔ | ✔ | | [DigitalNK](./doc/p/digitalnk.md) (SQLite3, N-Korean) | 🔢 | .db | ✔ | | | [EDLIN](./doc/p/edlin.md) | 📁 | .edlin | ✔ | ✔ | | [EPUB-2 E-Book](./doc/p/epub2.md) | 📦 | .epub | ❌ | ✔ | | [FreeDict](./doc/p/freedict.md) | 📝 | .tei | ✔ | ❌ | | [Gettext Source](./doc/p/gettext_po.md) | 📝 | .po | ✔ | ✔ | | [HTML Directory (by file size)](./doc/p/html_dir.md) | 📁 | | ❌ | ✔ | | [JMDict](./doc/p/jmdict.md) (Japanese) | 📝 | | ✔ | ❌ | | [JSON](./doc/p/json.md) | 📝 | .json | | ✔ | | [Kobo E-Reader Dictionary](./doc/p/kobo.md) | 📦 | .kobo.zip | ❌ | ✔ | | [Kobo E-Reader Dictfile](./doc/p/kobo_dictfile.md) | 📝 | .df | ✔ | ✔ | | [Lingoes Source](./doc/p/lingoes_ldf.md) | 📝 | .ldf | ✔ | ✔ | | [Mobipocket E-Book](./doc/p/mobi.md) | 🔢 | .mobi | ❌ | ✔ | | [Octopus MDict](./doc/p/octopus_mdict.md) | 🔢 | .mdx | ✔ | ❌ | | [Sdictionary Binary](./doc/p/sdict.md) | 🔢 | .dct | ✔ | | | [Sdictionary Source](./doc/p/sdict_source.md) | 📝 | .sdct | | ✔ | | [SQL](./doc/p/sql.md) | 📝 | .sql | ❌ | ✔ | | [StarDict](./doc/p/stardict.md) | 📁 | (📝.ifo) | ✔ | ✔ | | [Tabfile](./doc/p/tabfile.md) | 📝 | .txt, .tab | ✔ | ✔ | | [Wiktionary Dump](./doc/p/wiktionary_dump.md) | 📝 | .xml | ✔ | ❌ | | [Wordset.org](./doc/p/wordset.md) | 📁 | | ✔ | | | [XDXF](./doc/p/xdxf.md) | 📝 | .xdxf | ✔ | ❌ | | [Zim (Kiwix)](./doc/p/zim.md) | 🔢 | .zim | ✔ | | Legend: - 📁 Directory - 📝 Text file - 📦 Package/archive file - 🔢 Binary file - ✔ Supported - ❌ Will not be supported **Note**: SQLite-based formats are not detected by extension (`.db`); So you need to select the format (with UI or `--read-format` flag). **Also don't confuse SQLite-based formats with [SQLite mode](#sqlite-mode).** ## Requirements PyGlossary requires **Python 3.8 or higher**, and works in practically all modern operating systems. While primarily designed for *GNU/Linux*, it works on *Windows*, *Mac OS X* and other Unix-based operating systems as well. As shown in the screenshots, there are multiple User Interface types (multiple ways to use the program). 
- **Gtk3-based interface**, uses [PyGI (Python Gobject Introspection)](http://pygobject.readthedocs.io/en/latest/getting_started.html)
  You can install it on:

  - Debian/Ubuntu: `apt install python3-gi python3-gi-cairo gir1.2-gtk-3.0`
  - openSUSE: `zypper install python3-gobject gtk3`
  - Fedora: `dnf install pygobject3 python3-gobject gtk3`
  - ArchLinux:
    - `pacman -S python-gobject gtk3`
    - https://aur.archlinux.org/packages/pyglossary/
  - Mac OS X: `brew install pygobject3 gtk+3`
  - Nix / NixOS: `nix-shell -p gnome3.gobjectIntrospection python38Packages.pygobject3 python38Packages.pycairo`

- **Tkinter-based interface**, works in the absence of Gtk. Especially useful on Windows, where the Tkinter library is installed along with Python itself. You can also install it on:

  - Debian/Ubuntu: `apt-get install python3-tk tix`
  - openSUSE: `zypper install python3-tk tix`
  - Fedora: `yum install python3-tkinter tix`
  - Mac OS X: read https://www.python.org/download/mac/tcltk/
  - Nix / NixOS: `nix-shell -p python38Packages.tkinter tix`

- **Command-line interface**, works in all operating systems without any specific requirements, just type: `python3 main.py --help`

- **Interactive command-line interface**

  - Requires: `pip3 install prompt_toolkit`
  - Perfect for mobile devices (like Termux on Android) where no GUI is available
  - Automatically selected if the output file argument is not passed **and** one of these holds:
    - On Linux, the `$DISPLAY` environment variable is empty or not set
      - For example when you are using a remote Linux machine over SSH
    - On Mac, no `tkinter` module is found
  - Manually select it with `--cmd` or `--ui=cmd`
    - Minimally: `python3 main.py --cmd`
    - You can still pass an input file, or any flag/option
  - If both input and output files are passed, the non-interactive cmd ui will be the default
  - If you are writing a script, you can pass `--no-interactive` to force-disable the interactive ui
    - Then you have to pass both input and output file arguments
  - Don't forget to use the *Up/Down* or *Tab* keys in prompts!
    - The Up/Down keys show you recent values you have used
    - The Tab key shows available values/options
  - You can press Control+C (on Linux/Windows) at any prompt to exit

## UI (User Interface) selection

When you run PyGlossary without any command-line arguments or options/flags,
PyGlossary tries to find PyGI and open the Gtk3-based interface. If that
fails, it tries to find Tkinter and open the Tkinter-based interface. If that
fails too, it tries to find `prompt_toolkit` and run the interactive
command-line interface. And if none of these libraries are found, it exits
with an error.

But you can explicitly determine the user interface type using `--ui`:

- `python3 main.py --ui=gtk`
- `python3 main.py --ui=tk`
- `python3 main.py --ui=cmd`

## Installation on Windows

- [Download and install Python](https://www.python.org/downloads/windows/) (3.8 or above is recommended)
- Open Start -> type Command -> right-click on Command Prompt -> Run as administrator
- To ensure you have `pip`, run: `python -m ensurepip --upgrade`
- To install, run: `pip install --upgrade pyglossary`
- Now you should be able to run the `pyglossary` command
- If the command is not found, make sure the Python environment variables are set up

## Feature-specific requirements

- **Using `--remove-html-all` flag**
  `sudo pip3 install lxml beautifulsoup4`

Some formats have additional requirements.
If you have trouble with any format, please check the
[link given for that format](#supported-formats) to see its documentation.

**Using Termux on Android?** See [doc/termux.md](./doc/termux.md)

## Configuration

See [doc/config.rst](./doc/config.rst).
## Direct and indirect modes

Indirect mode means the input glossary is completely read and loaded into RAM,
then converted into the output format. This was the only method available in
old versions (before [3.0.0](https://github.com/ilius/pyglossary/releases/tag/3.0.0)).

Direct mode means entries are read, processed and written into the output
glossary one at a time. Direct mode was added to limit the memory usage for
large glossaries; but it may reduce the conversion time in most cases as well.

Converting glossaries into these formats requires [sorting](#sorting) entries:

- [StarDict](./doc/p/stardict.md)
- [EPUB-2](./doc/p/epub2.md)
- [Mobipocket E-Book](./doc/p/mobi.md)

That's why direct mode will not work for these formats, and PyGlossary has to
switch to indirect mode (or it previously had to, see [SQLite mode](#sqlite-mode)).

For other formats, direct mode will be the default. You may override this with
the `--indirect` flag.

## SQLite mode

As mentioned above, converting glossaries to some specific formats requires
them to be loaded into RAM. This can be problematic if the glossary is too big
to fit into RAM. That's when you should try adding the `--sqlite` flag to your
command. PyGlossary then uses SQLite3 as intermediate storage for storing,
sorting and then fetching entries. This fixes the memory issue, and may even
reduce the running time of the conversion (depending on your home directory
storage).

The temporary SQLite file is stored in the [cache directory](#cache-directory),
then deleted after conversion (unless you pass the `--no-cleanup` flag).

SQLite mode is automatically enabled for writing these formats if the
`auto_sqlite` [config parameter](./doc/config.rst) is `true` (which is the
default). This also applies when you pass the `--sort` flag for any format.
You may use `--no-sqlite` to override this and switch to indirect mode.

Currently you cannot disable alternates in SQLite mode (`--no-alts` is ignored).

## Sorting

There are two things that can activate sorting of entries:

- The output format requires sorting (as explained [above](#direct-and-indirect-modes))
- You pass the `--sort` flag on the command line.

In the case of passing `--sort`, you can also pass:

- `--sort-key` to select the sort key aka sorting order, see [doc/sort-key.md](./doc/sort-key.md)
- `--sort-encoding` to change the encoding used for sorting
  - UTF-8 is the default encoding for all sort keys and all output formats (unless mentioned otherwise)
  - This will only affect the order of entries, and will not corrupt words/definitions
  - Non-encodable characters are replaced with the `?` byte (*only for sorting*)

## Cache directory

The cache directory is used for storing temporary files which are either moved
or deleted after conversion. You can pass the `--no-cleanup` flag in order to
keep them.

The path of the cache directory:

- Linux or BSD: `~/.cache/pyglossary/`
- Mac: `~/Library/Caches/PyGlossary/`
- Windows: `C:\Users\USERNAME\AppData\Local\PyGlossary\Cache\`

## User plugins

If you want to add your own plugin without adding it to the source code
directory, or you want to use a plugin that has been removed from the
repository, you can place it in this directory:

- Linux or BSD: `~/.pyglossary/plugins/`
- Mac: `~/Library/Preferences/PyGlossary/plugins/`
- Windows: `C:\Users\USERNAME\AppData\Roaming\PyGlossary\plugins\`

## Using PyGlossary as a Python library

There are a few examples in the [doc/lib-examples](./doc/lib-examples) directory.
Here is a basic script that converts any supported glossary format to [Tabfile](./doc/p/tabfile.md):

```python
import sys

from pyglossary import Glossary

# Glossary.init() should be called only once, so make sure you put it
# in the right place
Glossary.init()

glos = Glossary()
glos.convert(
	inputFilename=sys.argv[1],
	outputFilename=f"{sys.argv[1]}.txt",
	# although it can detect format for *.txt, you can still pass outputFormat
	outputFormat="Tabfile",
	# you can pass readOptions or writeOptions as a dict
	# writeOptions={"encoding": "utf-8"},
)
```

You may look at the docstring of `Glossary.convert` for the full list of keyword arguments.

If you need to add entries inside your Python program (rather than converting
one glossary into another), then you can use `write` instead of `convert`;
here is an example:

```python
from pyglossary.glossary import Glossary

Glossary.init()

glos = Glossary()
mydict = {
	"a": "test1",
	"b": "test2",
	"c": "test3",
}
for word, defi in mydict.items():
	glos.addEntryObj(glos.newEntry(
		word,
		defi,
		defiFormat="m",  # "m" for plain text, "h" for HTML
	))

glos.setInfo("title", "My Test StarDict")
glos.setInfo("author", "John Doe")
glos.write("test.ifo", format="Stardict")
```

And if you need to read a glossary from a file into a `Glossary` object in RAM
(without immediately converting it), you can use
`glos.read(filename, format=inputFormat)`. Be wary of RAM usage in this case.

## Internal glossary structure

A glossary contains a number of entries. Each entry contains:

- Headword (title or main phrase for lookup)
- Alternates (some alternative phrases for lookup)
- Definition

In PyGlossary, the headword and alternates together are accessible as a single
Python list `entry.l_word`.

`entry.defi` is the definition as a Python Unicode `str`. Also `entry.b_defi`
is the definition as a UTF-8 byte array.

`entry.defiFormat` is the definition format. If the definition is plaintext
(not rich text), the value is `m`. And if it's in HTML (contains any HTML tag),
then `defiFormat` is `h`. The value `x` is also allowed for XDXF, but XDXF is
not widely supported in dictionary applications.

There is another type of `Entry` which is called **Data Entry**, and generally
contains image files, audio files, or any other file that was included in the
input glossary. For data entries:

- `entry.s_word` is the file name (and `l_word` is still a list containing this string),
- `entry.defiFormat` is `b`
- `entry.data` gives the content of the file in `bytes`.

## Entry filters

Entry filters are internal objects that modify words/definitions of entries,
or remove entries (in some special cases). They work like several filters in a
pipe that connects a `reader` object to a `writer` object (with both of their
classes defined in plugins and instantiated in the `Glossary` class).

You can enable/disable some of these filters using config parameters /
command-line flags, which are documented in [doc/config.rst](./doc/config.rst).

The full list of entry filters is also documented in
[doc/entry-filters.md](./doc/entry-filters.md).
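To put the pieces above together, here is a minimal sketch that reads a
glossary into RAM and inspects its entries, including data entries. It assumes
a Tabfile input named `test.txt`, which is a made-up example file name:

```python
from pyglossary import Glossary

Glossary.init()

glos = Glossary()
# read the whole glossary into RAM (mind the memory usage for large files)
glos.read("test.txt", format="Tabfile")

for entry in glos:
	if entry.defiFormat == "b":
		# data entry: s_word is the file name, data is the file content
		print("resource:", entry.s_word, len(entry.data), "bytes")
		continue
	# regular entry: l_word holds the headword plus alternates
	print(entry.l_word, "->", entry.defi[:40])
```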
pyglossary-4.5.0/__init__.py

pyglossary-4.5.0/about
PyGlossary is a tool for converting dictionary files aka glossaries,
from/to various formats used by different dictionary applications

Copyleft © 2008-2022 Saeed Rasooli

PyGlossary is licensed by the GNU General Public License version 3 (or later)

pyglossary-4.5.0/config.json
{
	"log_time": false,
	"cleanup": true,
	"auto_sqlite": true,
	"lower": false,
	"utf8_check": false,
	"enable_alts": true,
	"skip_resources": false,
	"rtl": false,
	"remove_html": "",
	"remove_html_all": false,
	"normalize_html": false,
	"save_info_json": false,
	"color.enable.cmd.unix": true,
	"color.enable.cmd.windows": false,
	"color.cmd.critical": 196,
	"color.cmd.error": 1,
	"color.cmd.warning": 208,
	"cmdi.prompt.indent.str": ">",
	"cmdi.prompt.indent.color": 2,
	"cmdi.prompt.msg.color": -1,
	"cmdi.msg.color": -1,
	"ui_autoSetFormat": true,
	"reverse_matchWord": true,
	"reverse_showRel": "Percent",
	"reverse_saveStep": 1000,
	"reverse_minRel": 0.3,
	"reverse_maxNum": -1,
	"reverse_includeDefs": false
}

pyglossary-4.5.0/doc/apple.md
### Required Python libraries for AppleDict

- **Reading from AppleDict Binary (.dictionary)**
  `sudo pip3 install lxml`
- **Writing to AppleDict**
  `sudo pip3 install lxml beautifulsoup4 html5lib`

### Requirements for AppleDict on Mac OS X

If you want to convert glossaries into AppleDict format on Mac OS X, you also need:

- GNU make as part of [Command Line Tools for Xcode](http://developer.apple.com/downloads).
- Dictionary Development Kit as part of [Additional Tools for Xcode](http://developer.apple.com/downloads).
  Extract to `/Applications/Utilities/Dictionary Development Kit`

### Convert Babylon (bgl) to Mac OS X dictionary

Let's assume the Babylon dict is at `~/Documents/Duden_Synonym/Duden_Synonym.BGL`:

```sh
cd ~/Documents/Duden_Synonym/
python3 ~/Software/pyglossary/main.py --write-format=AppleDict Duden_Synonym.BGL Duden_Synonym-apple
cd Duden_Synonym-apple
make
make install
```

Launch Dictionary.app and test.

### Convert Octopus Mdict to Mac OS X dictionary

Let's assume the MDict dict is at `~/Documents/Duden-Oxford/Duden-Oxford DEED ver.20110408.mdx`.

Run the following command:

```sh
cd ~/Documents/Duden-Oxford/
python3 ~/Software/pyglossary/main.py --write-format=AppleDict "Duden-Oxford DEED ver.20110408.mdx" "Duden-Oxford DEED ver.20110408-apple"
cd "Duden-Oxford DEED ver.20110408-apple"
make
make install
```

Launch Dictionary.app and test.

Let's assume the MDict dict is at `~/Downloads/oald8/oald8.mdx`, along with the
image/audio resources file `oald8.mdd`. Run the following commands:

```sh
cd ~/Downloads/oald8/
python3 ~/Software/pyglossary/main.py --write-format=AppleDict oald8.mdx oald8-apple
cd oald8-apple
```

This extracts the dictionary into `oald8.xml` and the data resources into the
folder `OtherResources`. Hyperlinks use relative paths.

```sh
sed -i "" 's:src="/:src=":g' oald8.xml
```

Convert the audio files from SPX format to WAV format.
You need the `speex` package from [MacPorts](https://www.macports.org):

```sh
find OtherResources -name "*.spx" -execdir sh -c 'spx={};speexdec $spx ${spx%.*}.wav' \;
sed -i "" 's|sound://\([/_a-zA-Z0-9]*\).spx|\1.wav|g' oald8.xml
```

But be warned that the decoded WAVE audio can consume ~5 times more disk space!

Compile and install:

```sh
make
make install
```

Launch Dictionary.app and test.

pyglossary-4.5.0/doc/babylon/BGL.svgz
[binary file: gzip-compressed SVG diagram of the BGL file format; contents omitted]
pyglossary-4.5.0/doc/config.rst
Configuration Parameters
------------------------

+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| Name                         | Command Flags         | Type  | Default       | Comment                                                 |
+==============================+=======================+=======+===============+=========================================================+
| ``log_time``                 | | ``--log-time``      | bool  | ``false``     | Show date and time in logs                              |
|                              | | ``--no-log-time``   |       |               |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``cleanup``                  | | ``--cleanup``       | bool  | ``true``      | Cleanup cache or temporary files after conversion       |
|                              | | ``--no-cleanup``    |       |               |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``auto_sqlite``              |                       | bool  | ``true``      | Auto-enable ``--sqlite`` to limit RAM usage when direct |
|                              |                       |       |               | mode is not possible. Can override with ``--no-sqlite`` |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``lower``                    | | ``--lower``         | bool  | ``false``     | Lowercase words before writing                          |
|                              | | ``--no-lower``      |       |               |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``utf8_check``               | | ``--utf8-check``    | bool  | ``false``     | Ensure entries contain valid UTF-8 strings              |
|                              | | ``--no-utf8-check`` |       |               |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``enable_alts``              | | ``--alts``          | bool  | ``true``      | Enable alternates                                       |
|                              | | ``--no-alts``       |       |               |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``skip_resources``           | ``--skip-resources``  | bool  | ``false``     | Skip resources (images, audio, css, etc)                |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``rtl``                      | ``--rtl``             | bool  | ``false``     | Right-To-Left all (HTML) definitions                    |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``remove_html``              | ``--remove-html``     | str   | ``""``        | Remove given HTML tags (comma-separated)                |
|                              |                       |       |               | from definitions                                        |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``remove_html_all``          | ``--remove-html-all`` | bool  | ``false``     | Remove all HTML tags from definitions                   |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``normalize_html``           | ``--normalize-html``  | bool  | ``false``     | Lowercase and normalize HTML tags in definitions        |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``save_info_json``           | ``--info``            | bool  | ``false``     | Save .info file alongside output file(s)                |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``color.enable.cmd.unix``    | ``--no-color``        | bool  | ``true``      | Enable colors in Linux/Unix command line                |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``color.enable.cmd.windows`` | ``--no-color``        | bool  | ``false``     | Enable colors in Windows command line                   |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``color.cmd.critical``       |                       | int   | ``196``       | | Color code for critical errors in command line        |
|                              |                       |       | |image0|      | | See `term-colors.md <./term-colors.md/>`_             |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``color.cmd.error``          |                       | int   | ``1``         | | Color code for errors in command line                 |
|                              |                       |       | |image1|      | | See `term-colors.md <./term-colors.md/>`_             |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``color.cmd.warning``        |                       | int   | ``208``       | | Color code for warnings in command line               |
|                              |                       |       | |image2|      | | See `term-colors.md <./term-colors.md/>`_             |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``cmdi.prompt.indent.str``   |                       | str   | ``">"``       |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``cmdi.prompt.indent.color`` |                       | int   | ``2``         |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``cmdi.prompt.msg.color``    |                       | int   | ``-1``        |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``cmdi.msg.color``           |                       | int   | ``-1``        |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``ui_autoSetFormat``         |                       | bool  | ``true``      |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``reverse_matchWord``        |                       | bool  | ``true``      |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``reverse_showRel``          |                       | str   | ``"Percent"`` |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``reverse_saveStep``         |                       | int   | ``1000``      |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``reverse_minRel``           |                       | float | ``0.3``       |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``reverse_maxNum``           |                       | int   | ``-1``        |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+
| ``reverse_includeDefs``      |                       | bool  | ``false``     |                                                         |
+------------------------------+-----------------------+-------+---------------+---------------------------------------------------------+

Configuration Files
-------------------

The default configuration values are stored in the
`config.json <./../config.json/>`_ file in the source/installation directory.

The user configuration file - if it exists - will override the default
configuration values. The location of this file depends on the operating
system:

- Linux or BSD: ``~/.pyglossary/config.json``
- Mac: ``~/Library/Preferences/PyGlossary/config.json``
- Windows: ``C:\Users\USERNAME\AppData\Roaming\PyGlossary\config.json``

Using as library
----------------

When you use PyGlossary as a library, neither of the ``config.json`` files is
loaded. So if you want to change the config, you should set the
``glos.config`` property (which you can do only once for each instance of
``Glossary``). For example:

.. code:: python

   glos = Glossary()
   glos.config = {
       "lower": True,
   }

.. |image0| image:: https://via.placeholder.com/20/ff0000/000000?text=+
.. |image1| image:: https://via.placeholder.com/20/aa0000/000000?text=+
.. |image2| image:: https://via.placeholder.com/20/ff8700/000000?text=+

pyglossary-4.5.0/doc/dsl/README.rst
{{COMMENT}}..{{/COMMENT}}

MAINENTRY:        entry word
MULTIWORD:        entry words
STYLE-LEVEL:      spoken
DEFINITION:       definition
PRON:             pronunciation
PART-OF-SPEECH:   word class
INFLECTION:       list of inflection types
INFLECTION-TYPE:  singular/plural for noun, comparative/superlative for adj
INFLECTION-ENTRY:
SENSE-NUM:        meaning number
HOMO-NUM:         meaning number
SYNTAX-CODING:
Thesaurus:
PATTERNS-COLLOCATIONS:
EXAMPLE:
Main entry:
DIALECT:
See also:
Phrase:
Phrasal Verb:

[m#]          Indent Level
[*]...[/*]    Optional Text. Shown only in full mode
[p]...[/p]    Label defined in abbrev.dsl
[s]...[/s]    Sound/Picture File

Text Format
===========
[c color_name]...[/c]  Color Name, e.g. red, orange
[b]...[/b]      Bold
[']...[/']
[u]...[/u]      Underline
[i]...[/i]      Italic
[sup]...[/sup]  Superscript
[sub]...[/sub]  Subscript

Text Zone
=========
[trn]...[/trn]    Translation
[ex]...[/ex]      Example
[com]...[/com]    Comment
[!trs]...[/!trs]  text between these tags will not be indexed
[t]...[/t]        Unknown
[url]...[/url]    URL Link
<<...>>           Reference

pyglossary-4.5.0/doc/entry-filters.md
## Entry Filters

| Name                         | Default Enabled | Command Flags                       | Description                                 |
| ---------------------------- | --------------- | ----------------------------------- | ------------------------------------------- |
| `strip`                      | Yes             |                                     | Strip whitespaces in word(s) and definition |
| `non_empty_word`             | Yes             |                                     | Skip entries with empty word                |
| `skip_resources`             | No              | `--skip-resources`                  | Skip resources / data files                 |
| `utf8_check`                 | No              | `--utf8-check`<br>`--no-utf8-check` | Fix Unicode in word(s) and definition       |
| `lower`                      | No              | `--lower`<br>`--no-lower`           | Lowercase word(s)                           |
| `rtl`                        | No              | `--rtl`                             | Make definition right-to-left               |
| `remove_html_all`            | No              | `--remove-html-all`                 | Remove all HTML tags from definition        |
| `remove_html`                | No              | `--remove-html`                     | Remove specific HTML tags from definition   |
| `normalize_html`             | No              | `--normalize-html`                  | Normalize HTML tags in definition (WIP)     |
| `lang`                       | Yes             |                                     | Language-specific cleanup/fixes             |
| `non_empty_word`             | Yes             |                                     | Skip entries with empty word                |
| `non_empty_defi`             | Yes             |                                     | Skip entries with empty definition          |
| `remove_empty_dup_alt_words` | Yes             |                                     | Remove empty and duplicate alternate words  |

pyglossary-4.5.0/doc/lib-examples/any_to_txt.py
#!/usr/bin/env python3
import sys

from pyglossary import Glossary

# Glossary.init() must be called only once, so make sure you put it
# in the right place
Glossary.init()

glos = Glossary()
glos.convert(
	inputFilename=sys.argv[1],
	outputFilename=f"{sys.argv[1]}.txt",
	# although it can detect format for *.txt, you can still pass outputFormat
	outputFormat="Tabfile",
	# you can pass readOptions or writeOptions as a dict
	# writeOptions={"encoding": "utf-8"},
)

pyglossary-4.5.0/doc/lib-examples/oxford.py
from pyglossary.glossary import Glossary


def takePhonetic_oxford_gb(glos):
	# build a new glossary containing only the phonetic transcriptions
	phonGlos = Glossary()  # phonetic glossary
	phonGlos.setInfo("name", glos.getInfo("name") + "_phonetic")
	for entry in glos:
		word = entry.s_word
		defi = entry.defi
		if not defi.startswith("/"):
			continue
		# Now set the phonetic to the `ph` variable.
		ph = ""
		for s in (
			"/ adj", "/ v", "/ n", "/ adv",
			"/adj", "/v", "/n", "/adv",
			"/ n", "/ the",
		):
			i = defi.find(s, 2, 85)
			if i == -1:
				continue
			else:
				ph = defi[:i + 1]
				break
		ph = ph.replace(";", "\t")\
			.replace(",", "\t")\
			.replace(" ", "\t")\
			.replace(" ", "\t")\
			.replace(" ", "\t")\
			.replace("//", "/")\
			.replace("\t/\t", "\t")\
			.replace("US\t", "\tUS: ")\
			.replace("US", "\tUS: ")\
			.replace("\t\t\t", "\t")\
			.replace("\t\t", "\t")
		# .replace("/", "")
		# .replace("\\n ", "\\n")
		# .replace("\\n ", "\\n")
		if ph != "":
			phonGlos.addEntryObj(phonGlos.newEntry(word, ph))
	return phonGlos

pyglossary-4.5.0/doc/lib-examples/py-to-stardict.py
from pyglossary.glossary import Glossary

Glossary.init()

glos = Glossary()
defiFormat = "m"  # "m" for plain text, "h" for HTML

mydict = {
	"a": "test1",
	"b": "test2",
	"c": "test3",
	"d": "test4",
	"e": "test5",
	"f": "test6",
}
for word, defi in mydict.items():
	glos.addEntryObj(glos.newEntry(word, defi, defiFormat=defiFormat))

glos.setInfo("title", "My Test StarDict")
glos.setInfo("author", "John Doe")
glos.write("test.ifo", format="Stardict")

pyglossary-4.5.0/doc/lzo.md
## Install `python-lzo`

- **On Linux**
  - Make sure `liblzo2-dev` or `liblzo2-devel` is installed.
  - Run `sudo pip3 install python-lzo`
- **On Android with Termux**
  - `apt install liblzo`
  - `pip install python-lzo`
- **On Windows**:
  - Open this page: https://www.lfd.uci.edu/~gohlke/pythonlibs/#python-lzo
  - If you are using Python 3.7 (32 bit) for example, click on `python_lzo‑1.12‑cp37‑cp37m‑win32.whl`
  - Open Start -> type Command -> right-click on Command Prompt -> Run as administrator
  - Run the `pip install C:\....\python_lzo‑1.12‑cp37‑cp37m‑win32.whl` command, giving the path of the downloaded file

pyglossary-4.5.0/doc/octopus_mdict/MDD.svgz
[binary file: gzip-compressed SVG diagram of the MDX/MDD file structure; contents omitted]

pyglossary-4.5.0/doc/octopus_mdict/README.md
# An Analysis of MDX/MDD File Format

> MDict is advertised as a multi-platform, open dictionary, but both claims
> are questionable. It is not available for every platform, e.g. OS X, Linux.
> Its dictionary file format is not open.

But this has not hindered its popularity, and many dictionaries have been
created for it. This is an attempt to reveal the MDX/MDD file format, so that
my favorite dictionaries, created by MDict users, could be used elsewhere.

# MDict Files

MDict stores the dictionary definitions, i.e. (key word, explanation), in the
MDX file and the dictionary reference data, e.g. images, pronunciations and
stylesheets, in the MDD file. Although holding different contents, these two
file formats share the same structure.

# MDX and MDD File Formats

See [MDX.svgz](./MDX.svgz) and [MDD.svgz](./MDD.svgz)

# Example Programs

## readmdict.py

readmdict.py is an example implementation in Python. This program can
read/extract mdx/mdd files.

**NOTE:** python-lzo is required to read mdx files created with engine 1.2.
Get the Windows version from http://www.lfd.uci.edu/~gohlke/pythonlibs/#python-lzo

It can be used as a command line tool. Suppose one has oald8.mdx and oald8.mdd:

```
$ python readmdict.py -x oald8.mdx
```

This creates the *oald8.txt* dictionary file and a folder *data* for images
and pronunciation audio files. On Windows, one can also double-click it and
select the file in the popup dialog.

Or use it as a module:

```
In [1]: from readmdict import MDX, MDD
```

Read an MDX file and print the first entry:

```
In [2]: mdx = MDX('oald8.mdx')
In [3]: items = mdx.items()
In [4]: items.next()
Out[4]: ('A', '.........')
```

`mdx` is an object having all info from an MDX file. `items` is an iterator
producing 2-item tuples. Of each tuple, the first element is the entry text
and the second is the explanation. Both are UTF-8 encoded strings.

Read an MDD file and print the first entry:

```
In [5]: mdd = MDD('oald8.mdd')
In [6]: items = mdd.items()
In [7]: items.next()
Out[7]: ('\\pic\\accordion_concertina.jpg', '\xff\xd8\xff\xe0\x00\x10JFIF...........')
```

`mdd` is an object having all info from an MDD file. `items` is an iterator
producing 2-item tuples. Of each tuple, the first element is the file name and
the second element is the corresponding file content. The file name is encoded
in UTF-8. The file content is a plain bytes array.
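Building on the interactive examples above, here is a minimal sketch of a
script that extracts every file stored in an MDD container to disk. The `data`
output folder name is arbitrary, and the bytes/str handling of keys is a
guess to cover different Python versions of readmdict:

```python
import os

from readmdict import MDD

mdd = MDD('oald8.mdd')
for filename, content in mdd.items():
	if isinstance(filename, bytes):
		# file names are UTF-8 encoded
		filename = filename.decode('utf-8')
	# keys use backslash separators, e.g. '\pic\accordion_concertina.jpg'
	relpath = filename.lstrip('\\').replace('\\', os.sep)
	outpath = os.path.join('data', relpath)
	os.makedirs(os.path.dirname(outpath), exist_ok=True)
	with open(outpath, 'wb') as f:
		f.write(content)
```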
pyglossary-4.5.0/doc/p/000077500000000000000000000000001417733132500147465ustar00rootroot00000000000000pyglossary-4.5.0/doc/p/__index__.md000066400000000000000000000102221417733132500171700ustar00rootroot00000000000000| Description | Name | Doc Link | | ----------------------------- | --------------- | ---------------------------------------------- | | Aard 2 (.slob) | Aard2Slob | [aard2_slob.md](./aard2_slob.md) | | ABC Medical Notes (SQLite3) | ABCMedicalNotes | [abc_medical_notes.md](./abc_medical_notes.md) | | Almaany.com (SQLite3) | Almaany | [almaany.md](./almaany.md) | | AppleDict Source | AppleDict | [appledict.md](./appledict.md) | | AppleDict Binary | AppleDictBin | [appledict_bin.md](./appledict_bin.md) | | Babylon (.BGL) | BabylonBgl | [babylon_bgl.md](./babylon_bgl.md) | | CC-CEDICT | CC-CEDICT | [cc_cedict.md](./cc_cedict.md) | | cc-kedict | cc-kedict | [cc_kedict.md](./cc_kedict.md) | | Crawler Directory | CrawlerDir | [crawler_dir.md](./crawler_dir.md) | | CSV (.csv) | Csv | [csv.md](./csv.md) | | DictionaryForMIDs | Dicformids | [dicformids.md](./dicformids.md) | | Dict.cc (SQLite3) | Dictcc | [dict_cc.md](./dict_cc.md) | | Dict.cc (SQLite3) - Split | Dictcc_split | [dict_cc_split.md](./dict_cc_split.md) | | DICT.org file format (.index) | DictOrg | [dict_org.md](./dict_org.md) | | DICT.org dictfmt source file | DictOrgSource | [dict_org_source.md](./dict_org_source.md) | | dictunformat output file | Dictunformat | [dictunformat.md](./dictunformat.md) | | DigitalNK (SQLite3, N-Korean) | DigitalNK | [digitalnk.md](./digitalnk.md) | | ABBYY Lingvo DSL (.dsl) | ABBYYLingvoDSL | [dsl.md](./dsl.md) | | EPUB-2 E-Book | Epub2 | [epub2.md](./epub2.md) | | Kobo E-Reader Dictionary | Kobo | [kobo.md](./kobo.md) | | Kobo E-Reader Dictfile (.df) | Dictfile | [kobo_dictfile.md](./kobo_dictfile.md) | | Mobipocket (.mobi) E-Book | Mobi | [mobi.md](./mobi.md) | | EDLIN | Edlin | [edlin.md](./edlin.md) | | FreeDict (.tei) | FreeDict | [freedict.md](./freedict.md) | | Gettext Source (.po) | GettextPo | [gettext_po.md](./gettext_po.md) | | HTML Directory | HtmlDir | [html_dir.md](./html_dir.md) | | Glossary Info (.info) | Info | [info.md](./info.md) | | JMDict | JMDict | [jmdict.md](./jmdict.md) | | JSON (.json) | Json | [json.md](./json.md) | | Lingoes Source (.ldf) | LingoesLDF | [lingoes_ldf.md](./lingoes_ldf.md) | | Octopus MDict (.mdx) | OctopusMdict | [octopus_mdict.md](./octopus_mdict.md) | | Sdictionary Binary(dct) | Sdict | [sdict.md](./sdict.md) | | Sdictionary Source (.sdct) | SdictSource | [sdict_source.md](./sdict_source.md) | | SQL (.sql) | Sql | [sql.md](./sql.md) | | StarDict (.ifo) | Stardict | [stardict.md](./stardict.md) | | Tabfile (.txt, .dic) | Tabfile | [tabfile.md](./tabfile.md) | | Wiktionary Dump (.xml) | WiktionaryDump | [wiktionary_dump.md](./wiktionary_dump.md) | | Wordset.org JSON directory | Wordset | [wordset.md](./wordset.md) | | XDXF (.xdxf) | Xdxf | [xdxf.md](./xdxf.md) | | Zim (.zim, for Kiwix) | Zim | [zim.md](./zim.md) | pyglossary-4.5.0/doc/p/aard2_slob.md000066400000000000000000000046621417733132500173100ustar00rootroot00000000000000## Aard 2 (.slob) ### General Information | Attribute | Value | | --------------- | -------------------------------------------------------- | | Name | Aard2Slob | | snake_case_name | aard2_slob | | Description | Aard 2 (.slob) | | Extensions | `.slob` | | Read support | Yes | | Write support | Yes | | Single-file | Yes | | Kind | 🔢 binary | | Sort-on-write | default_no | | Sort key | (`headword_lower`) | | Wiki | 
[@itkach/slob/wiki](https://github.com/itkach/slob/wiki) | | Website | [aarddict.org](http://aarddict.org/) | ### Write options | Name | Default | Type | Comment | | ------------------- | ------- | ---- | --------------------------------------------------------------- | | compression | `zlib` | str | Compression Algorithm | | content_type | | str | Content Type | | file_size_approx | `0` | int | split up by given approximate file size
examples: 100m, 1g | | separate_alternates | `False` | bool | add alternate headwords as separate entries to slob | | word_title | `False` | bool | add headwords title to begining of definition | ### Dependencies for reading and writing PyPI Links: [PyICU](https://pypi.org/project/PyICU) To install, run: ```sh pip3 install PyICU ``` ### PyICU See [doc/pyicu.md](../pyicu.md) file for more detailed instructions on how to install PyICU. ### Dictionary Applications/Tools | Name & Website | License | Platforms | | ------------------------------------------ | ------- | --------- | | [Aard 2 for Android](http://aarddict.org/) | GPL | Android | | [Aard2 for Web](http://aarddict.org/) | MPL | Web | pyglossary-4.5.0/doc/p/abc_medical_notes.md000066400000000000000000000046431417733132500207120ustar00rootroot00000000000000## ABC Medical Notes (SQLite3) ### General Information | Attribute | Value | | --------------- | ---------------------------------------------------------------------------------------------------------------------- | | Name | ABCMedicalNotes | | snake_case_name | abc_medical_notes | | Description | ABC Medical Notes (SQLite3) | | Extensions | | | Read support | Yes | | Write support | No | | Single-file | No | | Kind | 🔢 binary | | Sort-on-write | default_no | | Sort key | (`headword_lower`) | | Wiki | ― | | Website | [ABC Medical Notes 2021 - Google Play](https://play.google.com/store/apps/details?id=com.pocketmednotes2014.secondapp) | ### Dictionary Applications/Tools | Name & Website | License | Platforms | | -------------------------------------------------------------------------------------------------------- | ------- | --------- | | [ABC Medical Notes 2020](https://play.google.com/store/apps/details?id=com.pocketmednotes2014.secondapp) | Unknown | Android | pyglossary-4.5.0/doc/p/almaany.md000066400000000000000000000044041417733132500167140ustar00rootroot00000000000000## Almaany.com (SQLite3) ### General Information | Attribute | Value | | --------------- | ------------------------------------------------------------------------------------------------------------- | | Name | Almaany | | snake_case_name | almaany | | Description | Almaany.com (SQLite3) | | Extensions | | | Read support | Yes | | Write support | No | | Single-file | No | | Kind | 🔢 binary | | Sort-on-write | default_no | | Sort key | (`headword_lower`) | | Wiki | ― | | Website | [Almaany.com Arabic Dictionary - Google Play](https://play.google.com/store/apps/details?id=com.almaany.arar) | ### Dictionary Applications/Tools | Name & Website | License | Platforms | | ----------------------------------------------------------------------------------------------- | ------- | --------- | | [Almaany.com Arabic Dictionary](https://play.google.com/store/apps/details?id=com.almaany.arar) | Unknown | Android | pyglossary-4.5.0/doc/p/appledict.md000066400000000000000000000062611417733132500172420ustar00rootroot00000000000000## AppleDict Source ### General Information | Attribute | Value | | --------------- | --------------------------------------------------------------------------------------------- | | Name | AppleDict | | snake_case_name | appledict | | Description | AppleDict Source | | Extensions | `.apple` | | Read support | No | | Write support | Yes | | Single-file | No | | Kind | 📁 directory | | Sort-on-write | default_no | | Sort key | (`headword_lower`) | | Wiki | ― | | Website | [Dictionary User Guide for Mac](https://support.apple.com/en-gu/guide/dictionary/welcome/mac) | ### Write options | Name | Default | 
Type | Comment | | ----------------- | ------- | ---- | ---------------------------------------- | | clean_html | `True` | bool | use BeautifulSoup parser | | css | | str | custom .css file path | | xsl | | str | custom XSL transformations file path | | default_prefs | `None` | dict | default prefs in python dict format | | prefs_html | | str | preferences XHTML file path | | front_back_matter | | str | XML file path with top-level tag | | jing | `False` | bool | run Jing check on generated XML | | indexes | | str | Additional indexes to dictionary entries | ### Dependencies for writing PyPI Links: [lxml](https://pypi.org/project/lxml), [beautifulsoup4](https://pypi.org/project/beautifulsoup4), [html5lib](https://pypi.org/project/html5lib) To install, run ```sh pip3 install lxml beautifulsoup4 html5lib ``` ### Also see: See [doc/apple.md](../apple.md) for additional AppleDict instructions. ### Dictionary Applications/Tools | Name & Website | License | Platforms | | ------------------------------------------------------------------------------------------- | ------- | --------- | | [Dictionary Development Kit](https://github.com/SebastianSzturo/Dictionary-Development-Kit) | Unknown | Mac | pyglossary-4.5.0/doc/p/appledict_bin.md000066400000000000000000000047441417733132500200760ustar00rootroot00000000000000## AppleDict Binary ### General Information | Attribute | Value | | --------------- | --------------------------------------------------------------------------------------------- | | Name | AppleDictBin | | snake_case_name | appledict_bin | | Description | AppleDict Binary | | Extensions | `.dictionary`, `.data` | | Read support | Yes | | Write support | No | | Single-file | Yes | | Kind | 🔢 binary | | Sort-on-write | default_no | | Sort key | (`headword_lower`) | | Wiki | ― | | Website | [Dictionary User Guide for Mac](https://support.apple.com/en-gu/guide/dictionary/welcome/mac) | ### Read options | Name | Default | Type | Comment | | --------- | ------- | ---- | --------------------------------------------------- | | html | `True` | bool | Entries are HTML | | html_full | `False` | bool | Turn every entry's definition into an HTML document | ### Dependencies for reading PyPI Links: [lxml](https://pypi.org/project/lxml) To install, run: ```sh pip3 install lxml ``` ### Dictionary Applications/Tools | Name & Website | License | Platforms | | -------------------------------------------------------------------------------- | ----------- | --------- | | [Apple Dictionary](https://support.apple.com/en-gu/guide/dictionary/welcome/mac) | Proprietary | Mac | pyglossary-4.5.0/doc/p/babylon_bgl.md000066400000000000000000000042141417733132500175430ustar00rootroot00000000000000## Babylon (.BGL) ### General Information | Attribute | Value | | --------------- | ------------------ | | Name | BabylonBgl | | snake_case_name | babylon_bgl | | Description | Babylon (.BGL) | | Extensions | `.bgl` | | Read support | Yes | | Write support | No | | Single-file | Yes | | Kind | 🔢 binary | | Sort-on-write | default_no | | Sort key | (`headword_lower`) | | Wiki | ― | | Website | ― | ### Read options | Name | Default | Type | Comment | | --------------------------- | -------- | ---- | ------------------------------------------- | | default_encoding_overwrite | | str | Default encoding (overwrite) | | source_encoding_overwrite | | str | Source encoding (overwrite) | | target_encoding_overwrite | | str | Target encoding (overwrite) | | part_of_speech_color | `007000` | str | Color for Part of Speech | | 
| no_control_sequence_in_defi | `False` | bool | No control sequence in definitions |
| strict_string_convertion | `False` | bool | Strict string conversion |
| process_html_in_key | `False` | bool | Process HTML in (entry or info) key |
| key_rstrip_chars | | str | Characters to strip from right-side of keys |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Babylon Translator](https://www.babylon-software.com/) | Freemium | Windows |
| [GoldenDict](http://goldendict.org/) | GPL | Linux, Windows |
| [GoldenDict Mobile (Free)](http://goldendict.mobi/) | Freeware | Android |
| [GoldenDict Mobile (Full)](http://goldendict.mobi/) | Proprietary | Android |

pyglossary-4.5.0/doc/p/cc_cedict.md000066400000000000000000000032301417733132500171660ustar00rootroot00000000000000

## CC-CEDICT

### General Information

| Attribute | Value |
| --- | --- |
| Name | CC-CEDICT |
| snake_case_name | cc_cedict |
| Description | CC-CEDICT |
| Extensions | `.u8` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [CEDICT](https://en.wikipedia.org/wiki/CEDICT) |
| Website | [CC-CEDICT Editor](https://cc-cedict.org/editor/editor.php) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| traditional_title | `False` | bool | Use traditional Chinese for entry titles/keys |

### Dependencies for reading

PyPI Links: [lxml](https://pypi.org/project/lxml)

To install, run:

```sh
pip3 install lxml
```

pyglossary-4.5.0/doc/p/cc_kedict.md000066400000000000000000000026071417733132500172050ustar00rootroot00000000000000

## cc-kedict

### General Information

| Attribute | Value |
| --- | --- |
| Name | cc-kedict |
| snake_case_name | cc_kedict |
| Description | cc-kedict |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [@mhagiwara/cc-kedict](https://github.com/mhagiwara/cc-kedict) |

### Dependencies for reading

PyPI Links: [PyYAML](https://pypi.org/project/PyYAML), [lxml](https://pypi.org/project/lxml)

To install, run:

```sh
pip3 install PyYAML lxml
```

pyglossary-4.5.0/doc/p/crawler_dir.md000066400000000000000000000014671417733132500175730ustar00rootroot00000000000000

## Crawler Directory

### General Information

| Attribute | Value |
| --- | --- |
| Name | CrawlerDir |
| snake_case_name | crawler_dir |
| Description | Crawler Directory |
| Extensions | `.crawler` |
| Read support | Yes |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📁 directory |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | ― |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| compression | | str | Compression Algorithm |

pyglossary-4.5.0/doc/p/csv.md000066400000000000000000000054541417733132500160710ustar00rootroot00000000000000

## CSV (.csv)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Csv |
| snake_case_name | csv |
| Description | CSV (.csv) |
| Extensions | `.csv` |
| Read support | Yes |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Comma-separated values](https://en.wikipedia.org/wiki/Comma-separated_values) |
| Website | ― |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| delimiter | `,` | str | Column delimiter |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| resources | `True` | bool | Enable resources / data files |
| delimiter | `,` | str | Column delimiter |
| add_defi_format | `False` | bool | enable adding defiFormat (m/h/x) |
| enable_info | `True` | bool | Enable glossary info / metadata |
| word_title | `False` | bool | add headwords title to beginning of definition |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [LibreOffice Calc](https://www.libreoffice.org/discover/calc/) | MPL/GPL | Linux, Windows, Mac |
| [Microsoft Excel](https://www.microsoft.com/en-us/microsoft-365/excel) | Proprietary | Windows |

pyglossary-4.5.0/doc/p/dicformids.md000066400000000000000000000033231417733132500174140ustar00rootroot00000000000000

## DictionaryForMIDs

### General Information

| Attribute | Value |
| --- | --- |
| Name | Dicformids |
| snake_case_name | dicformids |
| Description | DictionaryForMIDs |
| Extensions | `.mids` |
| Read support | Yes |
| Write support | Yes |
| Single-file | No |
| Kind | 📁 directory |
| Sort-on-write | always |
| Sort key | `dicformids` |
| Wiki | ― |
| Website | [DictionaryForMIDs - SourceForge](http://dictionarymid.sourceforge.net/) |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [DictionaryForMIDs](http://dictionarymid.sourceforge.net/) | GPL | Android, Web, Windows, Linux, Mac |

pyglossary-4.5.0/doc/p/dict_cc.md000066400000000000000000000040551417733132500166640ustar00rootroot00000000000000

## Dict.cc (SQLite3)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Dictcc |
| snake_case_name | dict_cc |
| Description | Dict.cc (SQLite3) |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | No |
| Kind | 🔢 binary |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Dict.cc](https://en.wikipedia.org/wiki/Dict.cc) |
| Website | [dict.cc dictionary - Google Play](https://play.google.com/store/apps/details?id=cc.dict.dictcc) |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [dict.cc dictionary](https://play.google.com/store/apps/details?id=cc.dict.dictcc) | Proprietary | Android |

pyglossary-4.5.0/doc/p/dict_cc_split.md000066400000000000000000000040651417733132500201000ustar00rootroot00000000000000

## Dict.cc (SQLite3) - Split
### General Information

| Attribute | Value |
| --- | --- |
| Name | Dictcc_split |
| snake_case_name | dict_cc_split |
| Description | Dict.cc (SQLite3) - Split |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | No |
| Kind | 🔢 binary |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Dict.cc](https://en.wikipedia.org/wiki/Dict.cc) |
| Website | [dict.cc dictionary - Google Play](https://play.google.com/store/apps/details?id=cc.dict.dictcc) |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [dict.cc dictionary](https://play.google.com/store/apps/details?id=cc.dict.dictcc) | Proprietary | Android |

pyglossary-4.5.0/doc/p/dict_org.md000066400000000000000000000044321417733132500170650ustar00rootroot00000000000000

## DICT.org file format (.index)

### General Information

| Attribute | Value |
| --- | --- |
| Name | DictOrg |
| snake_case_name | dict_org |
| Description | DICT.org file format (.index) |
| Extensions | `.index` |
| Read support | Yes |
| Write support | Yes |
| Single-file | No |
| Kind | 📁 directory |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [DICT#DICT file format](https://en.wikipedia.org/wiki/DICT#DICT_file_format) |
| Website | [The DICT Development Group](http://dict.org/bin/Dict) |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| dictzip | `False` | bool | Compress .dict file to .dict.dz |
| install | `True` | bool | Install dictionary to /usr/share/dictd/ |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Dictd](https://directory.fsf.org/wiki/Dictd) | GPL | Linux |
| [GNOME Dictionary](https://wiki.gnome.org/Apps/Dictionary) | GPL | Linux |
| [Xfce4 Dictionary](https://docs.xfce.org/apps/xfce4-dict/start) | GPL | Linux |
| [Ding](https://www-user.tu-chemnitz.de/~fri/ding/) | GPL | Linux |

pyglossary-4.5.0/doc/p/dict_org_source.md000066400000000000000000000030041417733132500204370ustar00rootroot00000000000000

## DICT.org dictfmt source file

### General Information

| Attribute | Value |
| --- | --- |
| Name | DictOrgSource |
| snake_case_name | dict_org_source |
| Description | DICT.org dictfmt source file |
| Extensions | `.dtxt` |
| Read support | No |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [DICT](https://en.wikipedia.org/wiki/DICT) |
| Website | [@cheusov/dictd](https://github.com/cheusov/dictd) |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| remove_html_all | `True` | bool | Remove all HTML tags |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [dictfmt](https://linux.die.net/man/1/dictfmt) | GPL | Linux |

pyglossary-4.5.0/doc/p/dictunformat.md000066400000000000000000000044141417733132500177720ustar00rootroot00000000000000

## dictunformat output file
### General Information

| Attribute | Value |
| --- | --- |
| Name | Dictunformat |
| snake_case_name | dictunformat |
| Description | dictunformat output file |
| Extensions | `.dictunformat` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Dictd](https://directory.fsf.org/wiki/Dictd) |
| Website | [dictd/dictunformat.1.in - @cheusov/dictd](https://github.com/cheusov/dictd/blob/master/dictunformat.1.in) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [dictunformat](https://linux.die.net/man/1/dictunformat) | GPL | Linux |

pyglossary-4.5.0/doc/p/digitalnk.md000066400000000000000000000026371417733132500172460ustar00rootroot00000000000000

## DigitalNK (SQLite3, N-Korean)

### General Information

| Attribute | Value |
| --- | --- |
| Name | DigitalNK |
| snake_case_name | digitalnk |
| Description | DigitalNK (SQLite3, N-Korean) |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | No |
| Kind | 🔢 binary |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [@digitalprk/dicrs](https://github.com/digitalprk/dicrs) |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Dic.rs](https://github.com/digitalprk/dicrs) | BSD-2-Clause | Linux |

pyglossary-4.5.0/doc/p/dsl.md000066400000000000000000000036671417733132500160660ustar00rootroot00000000000000

## ABBYY Lingvo DSL (.dsl)

### General Information

| Attribute | Value |
| --- | --- |
| Name | ABBYYLingvoDSL |
| snake_case_name | dsl |
| Description | ABBYY Lingvo DSL (.dsl) |
| Extensions | `.dsl` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [ABBYY Lingvo](https://ru.wikipedia.org/wiki/ABBYY_Lingvo) |
| Website | [www.lingvo.ru](https://www.lingvo.ru/) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | | str | Encoding/charset |
| audio | `False` | bool | Enable audio objects |
| only_fix_markup | `False` | bool | Only fix markup, without tag conversion |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [ABBYY Lingvo](https://www.lingvo.ru/) | Proprietary | Windows, Mac, Android, iOS, Windows Mobile, Symbian |

pyglossary-4.5.0/doc/p/edlin.md000066400000000000000000000020421417733132500163610ustar00rootroot00000000000000

## EDLIN

### General Information

| Attribute | Value |
| --- | --- |
| Name | Edlin |
| snake_case_name | edlin |
| Description | EDLIN |
| Extensions | `.edlin` |
| Read support | Yes |
| Write support | Yes |
| Single-file | No |
| Kind | 📁 directory |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | ― |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| prev_link | `True` | bool | Enable link to previous entry |

pyglossary-4.5.0/doc/p/epub2.md000066400000000000000000000047301417733132500163110ustar00rootroot00000000000000

## EPUB-2 E-Book

### General Information

| Attribute | Value |
| --- | --- |
| Name | Epub2 |
| snake_case_name | epub2 |
| Description | EPUB-2 E-Book |
| Extensions | `.epub` |
| Read support | No |
| Write support | Yes |
| Single-file | No |
| Kind | 📦 package |
| Sort-on-write | always |
| Sort key | `ebook` |
| Wiki | [EPUB](https://en.wikipedia.org/wiki/EPUB) |
| Website | ― |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| keep | `False` | bool | Keep temp files |
| group_by_prefix_length | `2` | int | Prefix length for grouping |
| include_index_page | `False` | bool | Include index page |
| compress | `True` | bool | Enable compression |
| css | | str | Path to css file |
| cover_path | | str | Path to cover file |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [calibre](https://calibre-ebook.com/) | GPL | Linux, Windows, Mac |
| [Okular](https://okular.kde.org/) | GPL | Linux, Windows, Mac |
| [Book Reader](https://f-droid.org/en/packages/com.github.axet.bookreader/) | GPL | Android |
| [Kobo eReader](https://www.kobo.com) | Proprietary | Kobo eReader |
| [Icecream Ebook Reader](https://icecreamapps.com/Ebook-Reader/) | Proprietary | Windows |
| [Aldiko](https://www.demarque.com/aldiko) | Proprietary | Android, iOS |

pyglossary-4.5.0/doc/p/freedict.md000066400000000000000000000044541417733132500170640ustar00rootroot00000000000000

## FreeDict (.tei)

### General Information

| Attribute | Value |
| --- | --- |
| Name | FreeDict |
| snake_case_name | freedict |
| Description | FreeDict (.tei) |
| Extensions | `.tei` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [@freedict/fd-dictionaries/wiki](https://github.com/freedict/fd-dictionaries/wiki) |
| Website | [FreeDict.org](https://freedict.org/) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| discover | `False` | bool | Find and show unsupported tags |
| auto_rtl | `None` | bool | Auto-detect and mark Right-to-Left text |
| word_title | `False` | bool | Add headwords title to beginning of definition |
| pron_color | `gray` | str | Pronunciation color |
| gram_color | `green` | str | Grammar color |
| example_padding | `10` | int | Padding for examples (in px) |

### Dependencies for reading

PyPI Links: [lxml](https://pypi.org/project/lxml)

To install, run:

```sh
pip3 install lxml
```

pyglossary-4.5.0/doc/p/gettext_po.md000066400000000000000000000037051417733132500174570ustar00rootroot00000000000000

## Gettext Source (.po)
### General Information

| Attribute | Value |
| --- | --- |
| Name | GettextPo |
| snake_case_name | gettext_po |
| Description | Gettext Source (.po) |
| Extensions | `.po` |
| Read support | Yes |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Gettext](https://en.wikipedia.org/wiki/Gettext) |
| Website | [gettext - GNU Project](https://www.gnu.org/software/gettext) |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| resources | `True` | bool | Enable resources / data files |

### Dependencies for reading and writing

PyPI Links: [polib](https://pypi.org/project/polib)

To install, run:

```sh
pip3 install polib
```

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [gettext](https://www.gnu.org/software/gettext/) | GPL | Linux, Windows |
| [poEdit](https://github.com/vslavik/poedit) | MIT / Shareware | Linux, Windows, Mac |

pyglossary-4.5.0/doc/p/html_dir.md000066400000000000000000000032601417733132500170730ustar00rootroot00000000000000

## HTML Directory

### General Information

| Attribute | Value |
| --- | --- |
| Name | HtmlDir |
| snake_case_name | html_dir |
| Description | HTML Directory |
| Extensions | `.hdir` |
| Read support | No |
| Write support | Yes |
| Single-file | No |
| Kind | 📁 directory |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | ― |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| resources | `True` | bool | Enable resources / data files |
| max_file_size | `102400` | int | Maximum file size in bytes |
| filename_format | `{n:05d}.html` | str | Filename format, default: {n:05d}.html |
| escape_defi | `False` | bool | Escape definitions |
| dark | `True` | bool | Use dark style |
| css | | str | Path to css file |
| word_title | `True` | bool | Add headwords title to beginning of definition |

### Dependencies for writing

PyPI Links: [cachetools](https://pypi.org/project/cachetools)

To install, run

```sh
pip3 install cachetools
```

pyglossary-4.5.0/doc/p/info.md000066400000000000000000000012461417733132500162260ustar00rootroot00000000000000

## Glossary Info (.info)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Info |
| snake_case_name | info |
| Description | Glossary Info (.info) |
| Extensions | `.info` |
| Read support | Yes |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | ― |

pyglossary-4.5.0/doc/p/jmdict.md000066400000000000000000000025541417733132500165500ustar00rootroot00000000000000

## JMDict

### General Information

| Attribute | Value |
| --- | --- |
| Name | JMDict |
| snake_case_name | jmdict |
| Description | JMDict |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [JMdict](https://en.wikipedia.org/wiki/JMdict) |
| Website | [The JMDict Project](https://www.edrdg.org/jmdict/j_jmdict.html) |

### Dependencies for reading

PyPI Links: [lxml](https://pypi.org/project/lxml)

To install, run:

```sh
pip3 install lxml
```

pyglossary-4.5.0/doc/p/json.md000066400000000000000000000030331417733132500162400ustar00rootroot00000000000000

## JSON (.json)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Json |
| snake_case_name | json |
| Description | JSON (.json) |
| Extensions | `.json` |
| Read support | No |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [JSON](https://en.wikipedia.org/wiki/JSON) |
| Website | [www.json.org](https://www.json.org/json-en.html) |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| enable_info | `True` | bool | Enable glossary info / metadata |
| resources | `True` | bool | Enable resources / data files |
| word_title | `False` | bool | add headwords title to beginning of definition |

pyglossary-4.5.0/doc/p/kobo.md000066400000000000000000000030611417733132500162220ustar00rootroot00000000000000

## Kobo E-Reader Dictionary

### General Information

| Attribute | Value |
| --- | --- |
| Name | Kobo |
| snake_case_name | kobo |
| Description | Kobo E-Reader Dictionary |
| Extensions | `.kobo` |
| Read support | No |
| Write support | Yes |
| Single-file | No |
| Kind | 📦 package |
| Sort-on-write | never |
| Sort key | (`headword_lower`) |
| Wiki | [Kobo eReader](https://en.wikipedia.org/wiki/Kobo_eReader) |
| Website | [www.kobo.com](https://www.kobo.com) |

### Dependencies for writing

PyPI Links: [marisa-trie](https://pypi.org/project/marisa-trie)

To install, run

```sh
pip3 install marisa-trie
```

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Kobo eReader](https://www.kobo.com) | Proprietary | Kobo eReader |

pyglossary-4.5.0/doc/p/kobo_dictfile.md000066400000000000000000000044471417733132500200760ustar00rootroot00000000000000

## Kobo E-Reader Dictfile (.df)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Dictfile |
| snake_case_name | kobo_dictfile |
| Description | Kobo E-Reader Dictfile (.df) |
| Extensions | `.df` |
| Read support | Yes |
| Write support | Yes |
| Single-file | No |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [dictgen - dictutil](https://pgaskin.net/dictutil/dictgen/#dictfile-format) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| extract_inline_images | `True` | bool | Extract inline images |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |

### Dependencies for reading

PyPI Links: [mistune 2.0.0a5](https://pypi.org/project/mistune/2.0.0a5)

To install, run:

```sh
pip3 install mistune==2.0.0a5
```

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [dictgen](https://pgaskin.net/dictutil/dictgen/) | MIT | Linux, Windows, Mac |

pyglossary-4.5.0/doc/p/lingoes_ldf.md000066400000000000000000000041051417733132500175550ustar00rootroot00000000000000

## Lingoes Source (.ldf)

### General Information

| Attribute | Value |
| --- | --- |
| Name | LingoesLDF |
| snake_case_name | lingoes_ldf |
| Description | Lingoes Source (.ldf) |
| Extensions | `.ldf` |
| Read support | Yes |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Lingoes](https://en.wikipedia.org/wiki/Lingoes) |
| Website | [Lingoes.net](http://www.lingoes.net/en/dictionary/dict_format.php) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| newline | `\n` | str | Newline string |
| resources | `True` | bool | Enable resources / data files |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Lingoes Dictionary Creator](http://www.lingoes.net/en/dictionary/dict_format.php) | Unknown | |

pyglossary-4.5.0/doc/p/mobi.md000066400000000000000000000062221417733132500162200ustar00rootroot00000000000000

## Mobipocket (.mobi) E-Book

### General Information

| Attribute | Value |
| --- | --- |
| Name | Mobi |
| snake_case_name | mobi |
| Description | Mobipocket (.mobi) E-Book |
| Extensions | `.mobi` |
| Read support | No |
| Write support | Yes |
| Single-file | No |
| Kind | 📦 package |
| Sort-on-write | default_yes |
| Sort key | `ebook` |
| Wiki | [Mobipocket](https://en.wikipedia.org/wiki/Mobipocket) |
| Website | ― |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| keep | `False` | bool | Keep temp files |
| group_by_prefix_length | `2` | int | Prefix length for grouping |
| css | | str | Path to css file |
| cover_path | | str | Path to cover file |
| kindlegen_path | | str | Path to kindlegen executable |
| file_size_approx | `271360` | int | Approximate size of each xhtml file (example: 200kb) |
| hide_word_index | `False` | bool | Hide headword in tap-to-check interface |
| spellcheck | `True` | bool | Enable wildcard search and spell correction during word lookup |
| exact | `False` | bool | Exact-match Parameter |

### Other Requirements

Install [KindleGen](https://wiki.mobileread.com/wiki/KindleGen) for creating Mobipocket e-books.
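As a usage sketch (the file names below are hypothetical; `kindlegen_path` is the write option documented above, passed through PyGlossary's generic `--write-options` flag):

```sh
# Hypothetical file names; kindlegen_path is the write option listed above.
pyglossary mydict.txt mydict.mobi --write-options=kindlegen_path=/usr/local/bin/kindlegen
```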
### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Amazon Kindle](https://www.amazon.com/kindle) | Proprietary | Amazon Kindle |
| [calibre](https://calibre-ebook.com/) | GPL | Linux, Windows, Mac |
| [Okular](https://okular.kde.org/) | GPL | Linux, Windows, Mac |
| [Book Reader](https://f-droid.org/en/packages/com.github.axet.bookreader/) | GPL | Android |

pyglossary-4.5.0/doc/p/octopus_mdict.md000066400000000000000000000044041417733132500201460ustar00rootroot00000000000000

## Octopus MDict (.mdx)

### General Information

| Attribute | Value |
| --- | --- |
| Name | OctopusMdict |
| snake_case_name | octopus_mdict |
| Description | Octopus MDict (.mdx) |
| Extensions | `.mdx` |
| Read support | Yes |
| Write support | No |
| Single-file | No |
| Kind | 🔢 binary |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [Download \| MDict.cn](https://www.mdict.cn/wp/?page_id=5325&lang=en) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | | str | Encoding/charset |
| substyle | `True` | bool | Enable substyle |
| same_dir_data_files | `False` | bool | Read data files from same directory |
| audio | `False` | bool | Enable audio objects |

### `python-lzo` is required for **some** MDX glossaries.

First try converting your MDX file, if failed (`AssertionError` probably), then try to install [LZO library and Python binding](../lzo.md).

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [MDict](https://www.mdict.cn/) | Proprietary | Android, iOS, Windows, Mac |
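A usage sketch for this reader (hypothetical file names; `audio` is one of the read options listed above):

```sh
# Hypothetical file names; audio is the read option documented above.
pyglossary mydict.mdx mydict.txt --read-options=audio=True
```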
pyglossary-4.5.0/doc/p/sdict.md000066400000000000000000000026641417733132500164040ustar00rootroot00000000000000

## Sdictionary Binary(dct)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Sdict |
| snake_case_name | sdict |
| Description | Sdictionary Binary(dct) |
| Extensions | `.dct` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 🔢 binary |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [Sdictionary Project](http://swaj.net/sdict/) |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [PTkSdict](http://swaj.net/sdict/) | GPL | Linux, Windows, Mac |
| [Sdictionary for Symbian](http://swaj.net/epoc/symbian/index.html) | Unknown | Symbian |

pyglossary-4.5.0/doc/p/sdict_source.md000066400000000000000000000032061417733132500177570ustar00rootroot00000000000000

## Sdictionary Source (.sdct)

### General Information

| Attribute | Value |
| --- | --- |
| Name | SdictSource |
| snake_case_name | sdict_source |
| Description | Sdictionary Source (.sdct) |
| Extensions | `.sdct` |
| Read support | No |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [Sdictionary Project](http://swaj.net/sdict/) |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| enable_info | `True` | bool | Enable glossary info / metadata |
| newline | `\n` | str | Newline string |
| resources | `True` | bool | Enable resources / data files |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [PTkSdict](http://swaj.net/sdict/create-dicts.html) | GPL | Linux, Windows, Mac |

pyglossary-4.5.0/doc/p/sql.md000066400000000000000000000026141417733132500160720ustar00rootroot00000000000000

## SQL (.sql)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Sql |
| snake_case_name | sql |
| Description | SQL (.sql) |
| Extensions | `.sql` |
| Read support | No |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [SQL](https://en.wikipedia.org/wiki/SQL) |
| Website | ― |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| info_keys | `None` | list | List of dbinfo table columns |
| add_extra_info | `True` | bool | Create dbinfo_extra table |
| newline | `<br>` | str | Newline string |
| transaction | `False` | bool | Use TRANSACTION |
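A minimal usage sketch (hypothetical file names; `transaction` is the write option documented above):

```sh
# Hypothetical file names; transaction is the write option listed above.
pyglossary mydict.txt mydict.sql --write-options=transaction=True
```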
pyglossary-4.5.0/doc/p/stardict.md000066400000000000000000000102031417733132500171030ustar00rootroot00000000000000

## StarDict (.ifo)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Stardict |
| snake_case_name | stardict |
| Description | StarDict (.ifo) |
| Extensions | `.ifo` |
| Read support | Yes |
| Write support | Yes |
| Single-file | No |
| Kind | 📁 directory |
| Sort-on-write | always |
| Sort key | `stardict` |
| Wiki | [StarDict](https://en.wikipedia.org/wiki/StarDict) |
| Website | [huzheng.org/stardict](http://huzheng.org/stardict/) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| xdxf_to_html | `True` | bool | Convert XDXF entries to HTML |
| unicode_errors | `strict` | str | What to do with Unicode decoding errors |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [GoldenDict](http://goldendict.org/) | GPL | Linux, Windows |
| [StarDict](http://huzheng.org/stardict/) | GPL | Linux, Windows, Mac |
| [GoldenDict Mobile (Free)](http://goldendict.mobi/) | Freemium | Android |
| [GoldenDict Mobile (Full)](http://goldendict.mobi/) | Proprietary | Android |
| [Twinkle Star Dictionary](https://play.google.com/store/apps/details?id=com.qtier.dict) | Unknown | Android |
| [WordMateX](https://apkcombo.com/wordmatex/org.d1scw0rld.wordmatex/) | Proprietary | Android |
| [QDict](https://play.google.com/store/apps/details?id=com.annie.dictionary) | Apache 2.0 | Android |
| [Fora Dictionary](https://play.google.com/store/apps/details?id=com.ngc.fora) | Freemium | Android |
| [Fora Dictionary Pro](https://play.google.com/store/apps/details?id=com.ngc.fora.android) | Proprietary | Android |
| [KOReader](http://koreader.rocks/) | AGPLv3 | Android, Amazon Kindle, Kobo eReader, PocketBook, Cervantes |
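A minimal usage sketch (hypothetical file names; `sametypesequence` is the write option documented above):

```sh
# Hypothetical file names; sametypesequence=h marks definitions as HTML.
pyglossary mydict.txt mydict.ifo --write-options=sametypesequence=h
```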
pyglossary-4.5.0/doc/p/tabfile.md000066400000000000000000000044331417733132500167020ustar00rootroot00000000000000

## Tabfile (.txt, .dic)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Tabfile |
| snake_case_name | tabfile |
| Description | Tabfile (.txt, .dic) |
| Extensions | `.txt`, `.tab`, `.tsv` |
| Read support | Yes |
| Write support | Yes |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Tab-separated values](https://en.wikipedia.org/wiki/Tab-separated_values) |
| Website | ― |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |

### Write options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |
| enable_info | `True` | bool | Enable glossary info / metadata |
| resources | `True` | bool | Enable resources / data files |
| file_size_approx | `0` | int | Split up by given approximate file size<br>examples: 100m, 1g |
| word_title | `False` | bool | Add headwords title to beginning of definition |
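To illustrate the format itself, here is a minimal (hypothetical) Tabfile: one entry per line, headword and definition separated by a tab character:

```
hello	a greeting used when meeting someone
world	the earth, together with all of its countries and peoples
```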
pyglossary-4.5.0/doc/p/wiktionary_dump.md000066400000000000000000000026271417733132500205220ustar00rootroot00000000000000

## Wiktionary Dump (.xml)

### General Information

| Attribute | Value |
| --- | --- |
| Name | WiktionaryDump |
| snake_case_name | wiktionary_dump |
| Description | Wiktionary Dump (.xml) |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | No |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [Wiktionary:Main Page](https://en.wiktionary.org/wiki/Wiktionary:Main_Page) |
| Website | [dumps.wikimedia.org](https://dumps.wikimedia.org/mirrors.html) |

pyglossary-4.5.0/doc/p/wordset.md000066400000000000000000000031211417733132500167540ustar00rootroot00000000000000

## Wordset.org JSON directory

### General Information

| Attribute | Value |
| --- | --- |
| Name | Wordset |
| snake_case_name | wordset |
| Description | Wordset.org JSON directory |
| Extensions | |
| Read support | Yes |
| Write support | No |
| Single-file | No |
| Kind | 📁 directory |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | ― |
| Website | [@wordset/wordset-dictionary](https://github.com/wordset/wordset-dictionary) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| encoding | `utf-8` | str | Encoding/charset |

pyglossary-4.5.0/doc/p/xdxf.md000066400000000000000000000057641417733132500162550ustar00rootroot00000000000000

## XDXF (.xdxf)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Xdxf |
| snake_case_name | xdxf |
| Description | XDXF (.xdxf) |
| Extensions | `.xdxf` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 📝 text |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [XDXF](https://en.wikipedia.org/wiki/XDXF) |
| Website | [xdxf_description.md - @soshial/xdxf_makedict](https://github.com/soshial/xdxf_makedict/blob/master/format_standard/xdxf_description.md) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| html | `True` | bool | Entries are HTML |

### Dependencies for reading

PyPI Links: [lxml](https://pypi.org/project/lxml)

To install, run:

```sh
pip3 install lxml
```

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [GoldenDict](http://goldendict.org/) | GPL | Linux, Windows |
| [QTranslate](https://quest-app.appspot.com/) | Proprietary | Windows |
| [Alpus](https://alpusapp.com/) | Freeware | Windows, Mac, Linux, Android |

pyglossary-4.5.0/doc/p/zim.md000066400000000000000000000045701417733132500160750ustar00rootroot00000000000000

## Zim (.zim, for Kiwix)

### General Information

| Attribute | Value |
| --- | --- |
| Name | Zim |
| snake_case_name | zim |
| Description | Zim (.zim, for Kiwix) |
| Extensions | `.zim` |
| Read support | Yes |
| Write support | No |
| Single-file | Yes |
| Kind | 🔢 binary |
| Sort-on-write | default_no |
| Sort key | (`headword_lower`) |
| Wiki | [ZIM (file format)](https://en.wikipedia.org/wiki/ZIM_%28file_format%29) |
| Website | [OpenZIM](https://wiki.openzim.org/wiki/OpenZIM) |

### Read options

| Name | Default | Type | Comment |
| --- | --- | --- | --- |
| skip_duplicate_words | `False` | bool | Detect and skip duplicate words |

### Dependencies for reading

PyPI Links: [libzim 1.0](https://pypi.org/project/libzim/1.0)

To install, run:

```sh
pip3 install libzim==1.0
```

### Dictionary Applications/Tools

| Name & Website | License | Platforms |
| --- | --- | --- |
| [Kiwix Desktop](https://github.com/kiwix/kiwix-desktop) | GPL | Linux, Windows |
| [Kiwix JS](https://github.com/kiwix/kiwix-js) | GPL | Windows |
| [Kiwix Serve](https://github.com/kiwix/kiwix-tools) | GPL | Linux, Windows |
| [Kiwix for Apple Mac OS X](https://macos.kiwix.org) | | Mac |
| [Kiwix for Android](https://github.com/kiwix/kiwix-android) | GPL | Android |

pyglossary-4.5.0/doc/pyicu.md000066400000000000000000000020001417733132500161620ustar00rootroot00000000000000

# [PyICU](https://pyicu.org)

## Installation on Linux

- Debian/Ubuntu: `sudo apt install pyicu`
- openSUSE: `sudo zypper install python3-PyICU`
- Fedora: `sudo dnf install python3-pyicu`
- Other distros:
  - Install [ICU](http://site.icu-project.org/) >= 4.8
  - Run `sudo pip3 install PyICU` or `pip3 install PyICU --user`

## Installation on Android with Termux

- Run `pkg install libicu`
- Run `pip install PyICU`

## Installation on Windows

- Open https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyicu
- Download latest file that matches your system:
  - `cp39` for Python 3.9, `cp38` for Python 3.8, etc.
  - `win_amd64` for Windows 64-bit, `win32` for Windows 32-bit. For example:
    - `PyICU‑2.6‑cp39‑cp39‑win_amd64.whl` for 64-bit with Python 3.9
    - `PyICU‑2.6‑cp39‑cp39‑win32.whl` for 32-bit with Python 3.9
- Open Start -> type Command -> right-click on Command Prompt -> Run as administrator
- Type `pip install ` then drag-and-drop downloaded file into Command Prompt and press Enter.

pyglossary-4.5.0/doc/releases/000077500000000000000000000000001417733132500163125ustar00rootroot00000000000000pyglossary-4.5.0/doc/releases/3.0.0.md000066400000000000000000000144721417733132500173020ustar00rootroot00000000000000

# Changes since version 2016.03.18 #

New versioning
--------------

* Using *date* as the version was a mistake I made 7 years ago
* From now on, versions are in **X.Y.Z** format (*major.minor.patch*)
* While X, Y and Z are digits (0-9) for simplicity (version strings can be compared alphabetically)
* Starting from 3.0.0
  + Take it for migrating to Python 3.x, or Gtk 3.x, or being alphabetically larger than previous versions (date string)

Since I believe this is the first *standard version*, I'm not sure which code revision I should compare it with. So I just write the most important recent changes, in both application-view and library-view.
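A quick illustration of the digit-only rule above (plain Python, for demonstration only):

```python
# With single-digit X.Y.Z components, plain string comparison
# orders versions correctly:
assert "3.0.0" < "3.0.4" < "3.1.0" < "4.0.0"
# A multi-digit component would break this: "3.10.0" < "3.9.0"
# alphabetically -- which is why X, Y and Z stay single digits.
```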
Breaking Compatibility
----------------------

* **Config migration**
  + Config file becomes a **config directory** containing config file
  + Config file format changes from Python (loaded by `exec`) to **JSON**
  + Remove some obsolete / unused config parameters, and rename some
  + Remove permanent `sort` boolean flag
    * Must give `--sort` in command line to enable sorting for most of output formats
  + Load user-defined plugins from a directory named `plugins` inside config directory
* **Glossary class**
  + Remove some obsolete / unused methods
    * `copy`, `attach`, `merge`, `deepMerge`, `takeWords`, `getInputList`, `getOutputList`
  + Rename some methods:
    * `reverseDic` -> `reverse`
  + Make some public attributes private:
    * `data` -> `_data`
    * `info` -> `_info`
    * `filename` -> `_filename`
  + Clear (reset) the Glossary instance (data, info, etc) after `write` operation
    * Glossary class is for converting from file(s) to file, not keeping data in memory
  + New methods:
    * `convert`:
      + `convert` method is added to be used instead of `read` and then `write`
      + Not just for convenience, but it's also recommended,
        * and lets the Glossary class have a better default behavior
        * for example it enables *direct* mode by default (stay tuned) if sorting is not enabled (by user or plugin)
      + all UI modules (Command line, Gtk3, Tkinter) use Glossary.convert method now
  + Sorting policy
    * `sort` boolean flag is now an argument to `write` method
      + sort=True if user gives `--sort` in command line
      + sort=False if user gives `--no-sort` in command line
      + sort=None if user does not give either, so `write` method itself decides what to do
    * Now we allow plugins to specify sorting policy based on output format
      + By `sortOnWrite` variable in plugin, with allowed values:
        * `ALWAYS`: force sorting even if sort=False (user gives `--no-sort`), used only for writing StarDict
        * `DEFAULT_YES`: enable sorting unless sort=False (user gives `--no-sort`)
        * `DEFAULT_NO`: disable sorting unless sort=True (user gives `--sort`)
        * `NEVER`: disable sorting even if sort=True (user gives `--sort`)
      + The default and common value is: `sortOnWrite = DEFAULT_NO`
      + Plugin can also have a global `sortKey` function to be used for sorting
        + (like the `key` argument to `list.sort` method, see `pydoc list.sort`)
  + New way of interacting with Glossary instance in plugins (see the sketch after this list):
    * `glos.data.append((word, defi))` -> `glos.addEntry(word, defi)`
    * `for item in glos.data:` -> `for entry in glos:`
    * `for key, value in glos.info.items():` -> `for key, value in glos.iterInfo():`
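To make the plugin-facing calls above concrete, here is a minimal sketch of a plugin-style `read` function using the 3.0.0 API (the tab-separated layout and file handling are hypothetical, purely for illustration):

```python
# A minimal sketch of a plugin-style read function using the 3.0.0 API.
# The tab-separated file layout here is a hypothetical example format.
def read(glos, filename):
	with open(filename, encoding="utf-8") as fp:
		for line in fp:
			line = line.rstrip("\n")
			if not line:
				continue
			word, _, defi = line.partition("\t")
			# replaces the old glos.data.append((word, defi))
			glos.addEntry(word, defi)
```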
Gtk2 to Gtk3
------------

* Replace obsolete PyGTK-based interface with a simpler PyGI-based (Gtk3) interface

Migrating to Python 3
---------------------

* Even though `master` branch was based on Python 3 since 2016 Apr 29, there were some problems that are fixed in this release
* If you are still forced to use Python 2.7, you can use branch `python2.7`

Introducing Direct mode
-----------------------

* `--direct` command line option
* reads and writes at the same time, without loading the whole data into memory
* Partial sorting is supported
  + `--sort` in command line
  + `--sort-cache-size=1000` is optional
* If plugin defines sortOnWrite=ALWAYS, it means output format requires full sorting, so direct mode will be disabled
* As mentioned above (using `Glossary.convert` method), direct mode is enabled by default if sorting is not enabled (by user or plugin)
* Of course user can manually disable direct mode by giving `--indirect` option in command line (see the example commands at the end of these notes)

Progress Bar
------------

Automatic command line Progress Bar for all input / output formats is now supported

* Implemented based on plugins' Reader classes
* Works both for direct mode and indirect mode
  + Only one progress bar for direct mode
  + Two progress bars for indirect mode (one while reading, one while writing)
* Plugins must not update the progress bar anymore
* Still no progress bar when both `--direct` and `--sort` flags are given, will be fixed later
* User can disable progress bar by giving `--no-progress-bar` option (recommended for Windows users)

Other Changes
-------------

**Feature**: Add `encoding` option to read and write drivers of some plain-text formats

**Feature**: SQL and SQLite: read/write extra information from/to a new table dbinfo_extra, backward compatible

**New format** invented and implemented for *later implementation of a Glossary Editor*

* `edlin.py` (*Editable Linked List of Entries*) is optimized for adding/modifying/removing one entry at a time
* while we can save the changes instantly after each modification
* Using the ideas of Doubly Linked List, and Git's hash-based object database

Rewrite non-working **Reverse** functionality

* The old code was messy, not working by default, slow, and language-dependent
* It's much faster and cleaner now

Improve and complete command line help (`-h` or `--help`)
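Putting the flags above together, a couple of example commands (the file names are hypothetical; the flags are the ones introduced in this release):

```sh
# Convert without loading everything into memory (direct mode):
pyglossary --direct input.bgl output.txt
# Partial sorting in direct mode, with an optional cache size:
pyglossary --direct --sort --sort-cache-size=1000 input.bgl output.txt
```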
pyglossary-4.5.0/doc/releases/3.0.1.md000066400000000000000000000002551417733132500172750ustar00rootroot00000000000000

# Changes since [3.0.0](./3.0.0.md) #

- Fix some minor bugs in Glossary class
- Fix wrong exit status in command line from `pyglossary.pyw`
- Fix exception in BGL plugin

pyglossary-4.5.0/doc/releases/3.0.2.md000066400000000000000000000004171417733132500172760ustar00rootroot00000000000000

# Changes since [3.0.1](./3.0.1.md) #

- Fix a bug in `setup.py`, making it not work
- Fix a bug in logger class, occurring when pyglossary is imported as a library
- Fix a few bugs in Octopus MDict reader
- Fix a minor bug in BGL reader
- Update README.md

pyglossary-4.5.0/doc/releases/3.0.3.md000066400000000000000000000005611417733132500172770ustar00rootroot00000000000000

# Changes since [3.0.2](./3.0.2.md) #

- Fixes in AppleDict plugin
- Improve Tkinter interface: fix Not Responding bug, make window icon colorful
- Fix visual bug in command line Progress Bar (percentage did not become 100.0%)
- BGL reader: add support for `Python < 3.5`, with a warning to install Python 3.5
- Fixes in Reverse feature
- Update README.md

pyglossary-4.5.0/doc/releases/3.0.4.md000066400000000000000000000043561417733132500173060ustar00rootroot00000000000000

# Changes since [3.0.3](./3.0.3.md) #

## Changes in `Glossary` code base ##

- Fix critical bug in Glossary: `ZeroDivisionError` if `wordCount < 500`, [#61](https://github.com/ilius/pyglossary/issues/61)
- Bug fix in Glossary.progress: make sure ui.progress is not called with a number more than 1.0
- Fix non-working write to SQL, [#67](https://github.com/ilius/pyglossary/issues/67)
- Bug fix & Feature: add newline argument to `Glossary.writeTxt`
  Because Python's `open` converts (modifies) newlines automatically, [#66](https://github.com/ilius/pyglossary/issues/66)
- Break compatibility about using `Glossary.writeTxt` method
  Replace argument `sep` which was a tuple of length two, with two mandatory arguments: `sep1` and `sep2`

## Changes in plugins ##

- Fix in StarDict plugin: fix some Python3-related errors, [#71](https://github.com/ilius/pyglossary/issues/71)
- Fix in Dict.org plugin: `install` was not working
- Fix in DSL plugin: replace backslash at the end of line with `<br/>`, [#61](https://github.com/ilius/pyglossary/issues/61)
- Fix in SQL plugin: specify `encoding='utf-8'` while opening file for write, [#67](https://github.com/ilius/pyglossary/issues/67)
- Fix in Octopus Mdict Source plugin: specify `encoding='utf-8'` while opening file for read, [#78](https://github.com/ilius/pyglossary/issues/78)
- Fix (probable) bugs of bad newlines in 4 plugins (use `newline` argument to `Glossary.writeTxt`), [#66](https://github.com/ilius/pyglossary/issues/66)
  - Octopus MDict Source
  - Babylon Source (gls)
  - Lingoes Source (LDF)
  - Sdictionary Source (sdct)
- Feature in Lingoes Source plugin: add `newline` write option
- Minor fix in AppleDict plugin: fix beautifulsoup4 error message, [#72](https://github.com/ilius/pyglossary/issues/72)
- BGL plugin: better compatibility with Python 3.4
  Fix `CRC check failed` error for some (rare) glossaries with Python 3.4

## Other Changes ##

- Bug fix in parsing command line read options `--read-options` and `--write-options` (happened in very rare cases)
- Fix wrong shebang line in setup.py: must run with python3, fix [#75](https://github.com/ilius/pyglossary/issues/75)
- Update `pyglossary.spec`
- Change Categories for `pyglossary.desktop`

pyglossary-4.5.0/doc/releases/3.1.0.md000066400000000000000000000042031417733132500172720ustar00rootroot00000000000000

# Changes since [3.0.4](./3.0.4.md) #

- Refactor StarDict plugin, and improve the performance
- Detect HTML definitions when reading, and mark them as HTML when converting to StarDict
- Fix [#135](https://github.com/ilius/pyglossary/issues/135) in StarDict writer:
  - Alternates were pointing at a wrong word in case there are resource/image files
- Refactor AppleDict plugin
- Refactor and improve BGL plugin
- Style fixes including pep-8 fixes
  - Change indentations to tabs, and single quote to double quotes
- Allow `--ui=none` flag
- Allow `--skip-resources` flag
- SQL plugin: add `encoding` write option
- Octopus MDict Source plugin: add `encoding` read option
- Drop sqlite3 support, xFarDic support, and read support for Omnidic
- Improvement and cleaning in the code base and different plugins
- Introduce DataEntry
  - Allowing to access resource files when iterating over entries (words) of Glossary
- Glossary: `write` and `convert` methods return absolute path of output file, or None
- Changes in master branch since [3.0.4](./3.0.4.md):
  - Update README.md
  - Update pyglossary.spec
  - Fixes in setup.py
  - BGL: add `gzip_no_crc.py` for Python 3.6 (required for some non-standard BGL files)
  - AppleDict: give `encoding='utf8'` while opening xml file, fix for [#84](https://github.com/ilius/pyglossary/issues/84)
  - Avoid lines that require trailing backslash, to avoid bugs like [#67](https://github.com/ilius/pyglossary/issues/67)
  - babylon_source.py: remove extra %s, fix [#92](https://github.com/ilius/pyglossary/issues/92)
  - AppleDict: force encoding="utf-8" for plist file, fix [#94](https://github.com/ilius/pyglossary/issues/94)
  - Fix str/bytes bug in stardict.py (fix [#98](https://github.com/ilius/pyglossary/issues/98)) and some renames for clarification
  - Fix [#102](https://github.com/ilius/pyglossary/issues/102): exception in dict_org.py
  - Fix wrong path of static files when running from dist-packages
  - readmdict.py: change by Xiaoqiang Wang: no encryption if Encrypted is not in header
  - Fix [#118](https://github.com/ilius/pyglossary/issues/118), SyntaxError (`return` with argument inside generator) in Glossary.reverse with Python 3.6
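As a rough sketch of the library-level API described in these notes — `convert` returning the absolute output path or `None` is the behavior stated above, but the keyword argument names here are assumptions and may differ between versions:

```python
# A rough sketch; keyword argument names are assumptions for illustration.
from pyglossary.glossary import Glossary

glos = Glossary()
outPath = glos.convert("input.bgl", outputFilename="output.txt")
if outPath is None:
	print("conversion failed")
else:
	print("wrote", outPath)
```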
pyglossary-4.5.0/doc/releases/3.2.0.md000066400000000000000000000015151417733132500172760ustar00rootroot00000000000000

## Changes since [3.1.0](./3.1.0.md) ##

- Add read support for CC-CEDICT plugin
  * Pull request [#140](https://github.com/ilius/pyglossary/pull/140), with some fixes and improvements by me
- Fixes in DSL (ABBYY Lingvo) plugin:
  * Fix [#136](https://github.com/ilius/pyglossary/issues/136), removing one extra character after `#CONTENTS_LANGUAGE:`
  * Fix [#137](https://github.com/ilius/pyglossary/issues/137), regexp for re_lang_open
- Improvement in Gtk interface:
  * Avoid changing Format combobox based on file extension if a format is already selected, [#141](https://github.com/ilius/pyglossary/issues/141)
- Fix encoding problem with non-UTF-8 system locales
  * Fix [#147](https://github.com/ilius/pyglossary/issues/147), give encoding="utf-8" when opening text files, for non-UTF-8 system locales
- Improvements in `Glossary` class

pyglossary-4.5.0/doc/releases/3.2.1.md000066400000000000000000000016541417733132500173030ustar00rootroot00000000000000

# Changes since [3.2.0](./3.2.0.md) #

- Changes in StarDict plugin:
  + Add sametypesequence write option (PR [#162](https://github.com/ilius/pyglossary/pull/162))
  + Fix some bugs
  + Cleaning
- Disable gzip CRC check for BGL files with Python 3.7
- Fix a bug in octopus_mdict.py
- Fix Gtk warnings in ui_gtk
- Allow seeing/customizing warnings by setting environment variable WARNINGS
- Fix not being able to run the program when installed inside virtualenv ([#168](https://github.com/ilius/pyglossary/issues/168))
- Show a tip about -h when no UI was found, [#169](https://github.com/ilius/pyglossary/issues/169)
- octopus_mdict_source.py: fix [#68](https://github.com/ilius/pyglossary/issues/68), add support for inconsecutive links with --read-options=links=True
- Auto-detect UTF-16 encoding of DSL files
- Update README.md (fix Archlinux pkg name, add AUR, add instructions for installing python-lzo on Windows, etc)
- Some clean up

pyglossary-4.5.0/doc/releases/3.3.0.md000066400000000000000000000106151417733132500173000ustar00rootroot00000000000000

# Changes since [3.2.1](./3.2.1.md) #

- Require Python 3.6 or higher (mainly because of f-strings)
- New format support
  - Add support to write Kobo dictionary, [#205](https://github.com/ilius/pyglossary/issues/205)
  - Add support to write EPUB-2
  - Add support to read AppleDict Binary (.dictionary)
  - Add support to read and write Aard 2 (slob), [#116](https://github.com/ilius/pyglossary/issues/116)
- Glossary: detect and load Writer class from plugins
  - Remove write function from plugin if it has Writer class
- Glossary: call `gc.collect()` on indirect mode after reading/writing each 128 entries
  - To free up memory and avoid running out of RAM for large glossaries
- Glossary: remove empty and duplicate alternate words when converting, using Entry Filter, [#188](https://github.com/ilius/pyglossary/issues/188)
- Add command line options to remove html tags:
  - `--remove-html=tag1,tag2,tag3`
  - `--remove-html-all`
- Re-design format-specific options
  - Allow specifying format-specific read/write options in ui_gtk and ui_tk
  - Add much better and cleaner codebase for handling options in `option.py`
  - Implement validation of options in command line, GTK and Tkinter interfaces
  - Add tests for `option.py` in `option_test.py`
  - Avoid using None as default value of option argument
  - Check default value of plugin options and show warning if invalid
  - Add IntOption class, use it in Omnidic plugin
- Add `optionsProp` to all plugins
	- Containing value type, allowed values and optional comment
- Remove `readOptions` and `writeOptions` from all plugins
	- Detect options from functions' signature and `optionsProp` variables
- Avoid using `**kwargs` in plugin `read`, `Reader.open` or `write` functions
- Add `depends` variable to plugins
	- To let GUI install plugin dependencies
	- Type: `dict`, keys are module names, values are pip package names
	- Add `Glossary.formatsDepends`
- Minor fixes and improvements in Glossary class:
	- Return with error if output file path is an existing directory
	- Fix empty zip when creating `DIRECTORY.zip` as output glossary
	- Do not uncompress gz/bz2/zip input files automatically
	- Ignore "read" function of plugin if "Reader" class is present
	- Cleaning: Add Glossary.init() classmethod to initialize the class, can be called multiple times
	- Some refactoring and cleaning, and add some logs
	- Small optimization: `index % 100` -> `index & 0x7f`
	- Allow having progressbar by position in file and size of file
		- use for `appledict_bin.py`
	- Do not write resource file names as entries to text file in `Glossary.writeTxt`
- StarDict plugin
	- Always open `.ifo` file as UTF-8
	- Fix output filenames without .ifo extension creating hidden files, [#187](https://github.com/ilius/pyglossary/issues/187)
- Babylon BGL plugin
	- Fix bytes metadata values `b'...'` and some refactoring in readType3
	- Skip empty info values
	- Fix non-string info values written as empty
	- Prefix 3 info keys with `bgl_`
	- Fix NameError in debug mode in `stripHtmlTags`
	- Some refactoring
- Octopus MDict plugin
	- Fix Python 3 bug in `readmdict.py`: https://bitbucket.org/xwang/mdict-analysis/commits/8f66c30
	- Support multiple mdd files ([#203](https://github.com/ilius/pyglossary/issues/203))
- Change yes/no options in AppleDict and ABBYY Lingvo DSL plugins to boolean
	- To keep compatibility of command line flags, fix yes/no manually in ui_cmd.py
- AppleDict plugin:
	- Fix `echo` problem in `Makefile` ([#177](https://github.com/ilius/pyglossary/issues/177))
	- Add dark mode support for AppleDict output ([#177](https://github.com/ilius/pyglossary/issues/177))
	- Add comments for `optionsProp`
	- Use keyword argument `features=` and fix a warning about from_encoding=
- Fix misspelled "extension" (as "extention") in plugins
- Detect entries with `span` tag as html, [#193](https://github.com/ilius/pyglossary/issues/193)
- Refactoring in ui_gtk and ui_tk
	- Fix some deprecated API in ui_gtk
	- Fix minor bugs and improvements in ui_tk and ui_gtk
- Update setup.py to adapt packaging with wheel, [#189](https://github.com/ilius/pyglossary/issues/189)
- Add type hints to codebase and plugins
- Refactoring and style changes:
	- rename `pyglossary.pyw` to main.py, add a small `pyglossary.pyw` for compatibility
	- Switch to f-strings in glossary.py and freedict.py
	- main.py: replace single quotes with double quotes
	- PEP-8 style fixes
pyglossary-4.5.0/doc/releases/4.0.0.md000066400000000000000000000254531417733132500173020ustar00rootroot00000000000000# Changes since [3.3.0](./3.3.0.md) #

- Require Python 3.7 or 3.8, drop support for Python 3.4, 3.5 and 3.6
- Fix / rewrite `setup.py`
	- Fix `python3 setup.py sdist bdist_wheel`, and pypi package
		- Had to move `ui/` directory into `pyglossary/`
	- Switch from `distutils` to `setuptools`
	- Remove `py2exe`
- Add interactive command line user interface
	- Automatically selected if input & output file arguments are not passed **and** one of these:
		- On Linux and `$DISPLAY` is not set
		- On Mac and no `tkinter` module is found
		- `--ui=cmd` flag is passed
- New format support:
	- Add read support for FreeDict, [#206](https://github.com/ilius/pyglossary/issues/206)
	- Add read support for Zim (Kiwix)
	- Add read and write support for Kobo E-Reader Dictfile (.df)
	- Add write support for DICT.org `dictfmt` source file
	- Add read support for [dictunformat](https://linux.die.net/man/1/dictunformat) output file
	- Add write support for JSON
	- Add read support for Dict.cc (SQLite3)
	- Add read support for [JMDict](https://www.edrdg.org/jmdict/j_jmdict.html), [#239](https://github.com/ilius/pyglossary/issues/239)
	- Add basic read support for Wiktionary Dump (.xml)
	- Add read support for [cc-kedict](https://github.com/mhagiwara/cc-kedict)
	- Add read support for [DigitalNK](https://github.com/digitalprk/dicrs) (SQLite3)
	- Add read support for [Wordset.org](https://github.com/wordset/wordset-dictionary) JSON directory
- Remove Omnidic write support (Unmaintained J2ME dictionary)
- Remove Octopus MDict Source plugin
- Remove Babylon Source plugin
- BGL Reader: improvements
- DictionaryForMIDs Writer: fix non-working code
- Gettext Source (po) Writer: fix info header
- MOBI E-Book Writer: fix sort order, fix and test kindlegen codes, add `kindlegen_path` option, [#112](https://github.com/ilius/pyglossary/issues/112)
- EPUB-2 E-Book Writer: fix sort order
- XDXF Reader: rewrite with `etree.iterparse` to avoid using too much RAM
- Lingoes Source (LDF) Reader: fix ignoring info/metadata header
- dict_org.py: rewrite broken plugin (Reader and Writer)
- DSL Reader: fix losing metadata/info
- Aard 2 (slob) Reader:
	- Fix adding css/js files as normal entries
	- Add `bword://` prefix to entry links
	- Fix duplicate entries issue by keeping a set of blob IDs, [#224](https://github.com/ilius/pyglossary/issues/224)
	- Detect and pass defiFormat
- Aard 2 (slob) Writer:
	- Fix content_type detection
	- Remove `bword://` prefix from entry links
	- Add resource files / data entries, [#243](https://github.com/ilius/pyglossary/issues/243)
	- Fix replacing image paths
	- Show log events from `slob.py` in debug mode
	- Change default `compression` to `zlib`
	- Allow passing empty `compression`
- Octopus MDict Reader:
	- Read MDX file twice to load links
	- Count data entries as part of `len(reader)` for progressbar
- StarDict Writer:
	- Copy "copyright" and "publisher" values to "description"
	- Add source and target language codes to the end of bookname
	- Add write-option `stardict_client: bool`; set `True` to make glossary more compatible with StarDict 3.x
	- Fix broken result when `sametypesequence` option is given and a definition contains `|`
	- Allow `sametypesequence=x` for xdxf
	- Add `merge_syns` option
	- Allow `sametypesequence=None` option
- XDXF Reader:
	- Fix/improve xdxf to html transformation
- Kobo Writer:
	- Fix get_prefix algorithm and sorting order, with tests, [#219](https://github.com/ilius/pyglossary/issues/219)
- Plugins: replace the `write` function with a `Writer` class (a minimal sketch follows this list)
	- `write(self) -> "Generator[None, BaseEntry, None]"`
		- Entries must be fetched with `entry = yield` in a `while True` loop:

		```python
		while True:
			entry = yield
			if entry is None:
				break
			# process and write entry into file(s)
		```
	- `finish(self)`
	- Read options and write options must be set to their default values as class attributes
		- See `pyglossary/plugins/csv_pyg.py` plugin for example
	- `sortKey` must be an instance method of Writer, instead of a function outside any class
		- Only for plugins that need sorting before write
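Putting the above together, a minimal sketch of such a `Writer` class could look like this (the tab-separated output and the `_encoding` option name are only illustrative here, not part of the API):

```python
class Writer(object):
	_encoding = "utf-8"  # a write option with its default value (illustrative name)

	def __init__(self, glos):
		self._glos = glos
		self._file = None

	def open(self, filename):
		self._file = open(filename, mode="w", encoding=self._encoding)

	def write(self):  # -> "Generator[None, BaseEntry, None]"
		while True:
			entry = yield
			if entry is None:
				break
			# process and write entry into file(s);
			# tab-separated output is just an example
			self._file.write(f"{entry.word}\t{entry.defi}\n")

	def finish(self):
		self._file.close()
		self._file = None
```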
- Refactor and cleanup `Glossary` class
	- Removed or replaced most of class/static attributes of `Glossary`
		- To see the diff, run `git diff 3.3.0..master -- pyglossary/glossary.py`
	- Removed `glos.addEntry` method
		- If you use it in your program, replace with `glos.addEntryObj(glos.newEntry(word, defi, defiFormat))`
	- Removed instance methods:
		- `getMostUsedDefiFormats`
		- `iterEntryBuckets`
		- `zipOutDir` and `archiveOutDir`
			- Moved to `pyglossary/glossary_utils.py`
			- `archiveOutDir` renamed to `compressOutDir`
		- `writeDict`
		- `iterSqlLines` -> moved to `pyglossary/plugins/sql.py`
		- `reverse`, `takeOutputWords`, `searchWordInDef` -> moved to `pyglossary/reverse.py`
	- Values of `Glossary.plugins` are changed to `plugin_prop.PluginProp` instances
	- Change `glos.writeTxt` arguments
		- Replace `sep1` and `sep2` with `entryFmt`
		- Replace `rplList` with `defiEscapeFunc`, `wordEscapeFunc` and `tail`
		- Remove `iterEntries`, `entryFilterFunc`
		- Method returns `Generator[None, BaseEntry, None]` instead of `bool`
		- See for usage example:
			- `pyglossary/glossary.py` -> `def writeTabfile`
			- `pyglossary/plugins/dict_org_source.py`
			- `pyglossary/plugins/json_plugin.py`
			- `pyglossary/plugins/lingoes_ldf.py`
			- `pyglossary/plugins/sdict_source.py`
- Refactor, cleanup and fixes in `Entry` and `DataEntry` classes
	- Replace `entry.getWord()` with `entry.word`
	- Replace `entry.getWords()` with `entry.l_word`
	- Replace `entry.getDefi()` with `entry.defi`
	- Remove `entry.getDefis()`
		- Drop handling alternate definitions in `Entry` objects
	- Replace `entry.getDefiFormat()` with `entry.defiFormat`
	- Add `entry.b_word` and `entry.b_defi` shortcuts that give `bytes` (UTF-8)
	- Replace `dataEntry.getData()` with `dataEntry.data`
	- Add `__slots__` to Entry and DataEntry classes
	- Fix `DataEntry` in indirect mode
		- Mistaken for Entry with defi=DATA, and file content discarded
		- Save resource files in user's cache directory when loading input glossary into memory
		- Move file to output glossary on `dataEntry.save(...)`
	- Fix `Entry.getRawEntrySortKey` not being alternates-aware, which broke StarDict Writer
	- `DataEntry`: save: use `shutil.copy` if it has `_tmpPath`, and set `_tmpPath`
	- New features of `Entry`
		- `entry.stripFullHtml()`, remove `......`
			- Used in Kobo and Kobo Dictfile writers
			- Add tests
- Fix `glos.writeTabfile`:
	- Remove `\r` from definitions and info values
	- Fix not escaping word
- Fix/improve html detection in definitions
- Switch to lazy imports of non-standard modules in plugins
- Optimize RAM usage of indirect conversion
	- To write StarDict, EPUB and DictionaryForMIDs glossaries, we need to load all entries into RAM to sort them
- Other new features of Glossary class
	- `glos.getAuthor()` to get "author", or "publisher" (as fallback)
	- `glos.removeHtmlTagsAll()` method, can be called by plugins' writer
	- `glos.collectDefiFormat(maxCount)` extract defiFormat counts by reading first `maxCount` entries (then the iterator will be reset)
		- Used in StarDict Writer
- Show memory usage in trace mode
- Bug fixes and improvements in code base
	- Apply entry filter when iterating over reader, fix [#251](https://github.com/ilius/pyglossary/issues/251)
		- Fixes wrong sort order for some glossaries (converting to StarDict or other formats that need sort)
	- Fixes and improvements in `TextGlossaryReader` class
		- Fix ignoring glossary defaultDefiFormat
		- Fix evaluating `None` value in read/write options
	- Support reading multi-file Tabfile or other text formats
		- Example: `file.txt`, `file.txt.1`, `file.txt.2`
		- Need to add `file_count` info key, for example: `##file_count 3`
	- Fixes in Tabfile Writer
		- Fix not escaping "\"
- Add/update documentation
	- Update README.md
	- Add Termux guides in `doc/termux.md`
	- Move AppleDict guides to `doc/apple.md`
	- Move LZO notes to `doc/lzo.md`
	- Minify and compress `.svg` files in `doc/` folder
- Switch to f-strings, pep8 fixes, add types, style changes and refactoring
- New command line flags:
	- `--log-time` to show datetime in logs (override `log_time` in config.json)
	- `--no-alts` to disable alternates handling
	- `--normalize-html` to lowercase tags (for now)
	- `--cleanup` and `--no-cleanup`
	- `--info` to save `.info` file alongside output file
pyglossary-4.5.0/doc/releases/4.1.0.md000066400000000000000000000302541417733132500173000ustar00rootroot00000000000000# Changes since [4.0.0](./4.0.0.md) #

There are a lot of changes since the last release, but here is what I could gather and organize! Please see the commit list for more!

- Improvements in ui_gtk
- Improvements in ui_tk
- Improvements in ui_cmd_interactive
- Refactoring and improvements in ui-related codebase
- Fix not loading config with `--ui=none`
- Code style fixes and cleanup
- Documentation
	+ Update most of the documentation.
	+ Add comments for read/write options.
	+ Generate documentation for all formats
		- Placed in [doc/p](../p), linked to in `README.md`
		- Generating with `scripts/plugin-doc-gen.py` script
	+ Read list of dictionary tools/applications from TOML files in [plugins-meta/tools](../../plugins-meta/tools)
- Add `Dockerfile` and `run-with-docker.sh` script
- New command-line flags:
	- `--json-read-options` and `--json-write-options`
		- To allow using `;` in option values
		- Example: `'--json-write-options={"delimiter": ";"}'`
	- `--gtk`, `--tk` and `--cmd` as shortcuts for `--ui=gtk` etc
	- `--rtl` to change direction of definitions, [#268](https://github.com/ilius/pyglossary/issues/268), also added to `config.json`
- Fix non-working `--remove-html` flag
- Changes in `Glossary` class
	- Rename `glos.getPref` to `glos.getConfig`
	- Change `formatsReadOptions` and `formatsWriteOptions` to `Dict[str, OrderedDict[str, Any]]`
		+ to include default values
	- remove `glos.writeTabfile`, replace with a func in `pyglossary/text_writer.py`
	- `Glossary.init`: avoid showing error if user plugin directory does not exist
- Fixes and improvements in code base
	- Prevent `dataEntry.save()` from raising exception because of invalid filename or permission
	- Avoid exception if removing temp file/folder failed
	- Avoid `mktemp` and more improvements
		+ use `~/.cache/pyglossary/` directory instead of `/tmp/`
	- Fixes and improvements in `runDictzip`
	- Raise `RuntimeError` instead of `StopIteration` when iterating over a non-open reader
	- Avoid exception if no zip command was found, fix [#294](https://github.com/ilius/pyglossary/issues/294)
	- Remove directory after creating .zip, and some refactoring, [#294](https://github.com/ilius/pyglossary/issues/294)
	- `DataEntry`: replace `inTmp` argument with `tmpPath` argument
	- `Entry`: fix html pattern for hyperlinks, [#330](https://github.com/ilius/pyglossary/issues/330)
	- Fix incorrect virtual env directory detection
	- Refactor `dataDir` detection, [#307](https://github.com/ilius/pyglossary/issues/307) [#316](https://github.com/ilius/pyglossary/issues/316)
	- Show warning if failed to create user plugins directory
	- fix possible exception in `log.emit`
	- Add support for Conda in `dataDir` detection, [#321](https://github.com/ilius/pyglossary/issues/321)
	- Fix f-string in `StdLogHandler.emit`
- Fixes and improvements in Windows
	+ Fix bad `dataDir` on Windows, [#307](https://github.com/ilius/pyglossary/issues/307)
	+ Fix `shutil.rmtree` exception on Windows
	+ Support creating .zip on Windows 10, [#294](https://github.com/ilius/pyglossary/issues/294)
	+ Check zip command before tar on Windows, [#294](https://github.com/ilius/pyglossary/issues/294)
	+ Show graphical error on exceptions on Windows
	+ Fix dataDir detection on Windows, [#323](https://github.com/ilius/pyglossary/issues/323) [#324](https://github.com/ilius/pyglossary/issues/324)
- Changes in Config:
	- Rename config key `skipResources` to `skip_resources`
		+ Add it to config.json and configDefDict
	- Rename config key `utf8Check` to `utf8_check`
		+ User should edit ~/.pyglossary/config.json manually
- Implement direct compression and uncompression, and some refactoring
	+ change glos.detectInputFormat to return (filename, format, compression) or None
	+ remove Glossary.formatsReadFileObj and Glossary.formatsWriteFileObj
	+ remove `fileObj=` argument from `glos.writeTxt`
	+ use optional 'compressions' list/tuple from Writer or Reader classes for direct compression/uncompression
	+ refactoring in glossary_utils.py
- Update `setup.py`
- Show version from 'git describe --always' on `--version`
- `FileSize` option (used in many formats):
	+ Switch to metric (powers of 1000) for `K`, `M`, `G` units
	+ Add `KiB`, `MiB`, `GiB` for powers of 1024
- Add `extensionCreate` variable (str) to plugins and plugin API
	+ Use it to improve ui_tk
- Text-based glossary code-base (affecting Tabfile, Kobo Dictfile, LDF)
	- Optimize TextGlossaryReader
	- Change multi-file text glossary file names from `.N.txt` to `.txt.N` (where `N>=1`)
	- Enable reading pyglossary-written multi-file text glossary by adding `file_count=-1` to metadata
		+ because the number of files is not known when creating the first txt file
- Tabfile
	- Rename option `writeInfo` to `enable_info`
	- Reader: read resource files from `*.txt_res` directory if it exists
	- Add `*.txt_res` directory to *.zip file
- Zim Reader:
	- Migrate to libzim 1.0
	- Add mimetype `image/webp`, fix [#329](https://github.com/ilius/pyglossary/issues/329)
- Slob and Tabfile Writer: add `file_size_approx` option to allow writing multi-part output
	+ support values like: `5500k`, `100m`, `1.2g`
- Add `word_title=False` option to some writers
	+ Slob Writer: add `word_title=False` option
	+ Tabfile Writer: add `word_title=False` option
	+ CSV Writer: add `word_title=False` option
	+ JSON Writer: add `word_title=False` option
	+ Dict.cc Reader: do not add word title
	+ FreeDict Reader: rename `keywords_header` option to `word_title`
	+ Add `glos.wordTitleStr`, used in plugins with `word_title` option
	+ Add `definition_has_headwords=True` info key to avoid adding the title next time we read the glossary
- Aard2 (slob)
	- Writer: add option `separate_alternates=False`, [#270](https://github.com/ilius/pyglossary/issues/270)
	- Writer: fix handling `content_type` option
	- Writer: use `~/.cache/pyglossary/` instead of `/tmp`
	- Writer: add mp3 to mime types, [#289](https://github.com/ilius/pyglossary/issues/289)
	- Writer: add support for .ini data file, [#289](https://github.com/ilius/pyglossary/issues/289)
	- Writer: support .webp files, [#329](https://github.com/ilius/pyglossary/issues/329)
	- Writer: support .tiff and .tif files
	- Reader: read glossary name/title and creation time from tags
	- Reader: extract all metadata / tags
	- `slob.py` library: Refactoring and cleanup
- StarDict:
	- Reader: add option unicode_errors for invalid UTF-8 data, [#309](https://github.com/ilius/pyglossary/issues/309)
	- Writer: add bool write-option `audio_goldendict`, [#327](https://github.com/ilius/pyglossary/issues/327)
	- Writer: add option `audio_icon=True`, and add option comment, [#327](https://github.com/ilius/pyglossary/issues/327)
- FreeDict Reader
	- Fix two slashes before and after `pron`
	- Avoid running `unescape_unicode` by passing `encoding="utf-8"` arg to `ET.htmlfile`
	- Fix exception if `edition` is missing in header, and a few other fixes
	- Support `` with `` inside it
	- Support `` inside nested second-level (nested) ``
	- Add `"lang"` attribute to html elements
	- Add option "example_padding"
	- Fix rendering ``, refactoring and improvement
	- Handle `` inside ``
	- Support `` in ``
	- Mark external refs with ``
	- Support comment in ``
	- Support `` inside ``
	- Implement many tags under ``
	- Improvements and refactoring
- XDXF
	+ Fix not finding `xdxf.xsl` in installed mode
		- Affecting XDXF and StarDict formats
	+ `xdxf.xsl`: generate `` instead of ``
	+ StarDict Reader: Add `xdxf_to_html=True` option, [#258](https://github.com/ilius/pyglossary/issues/258)
	+ StarDict Reader: Import `xdxf_transform` lazily
		- Remove forced dependency on `lxml`, [#261](https://github.com/ilius/pyglossary/issues/261)
	+ XDXF plugin: fix glos.setDefaultDefiFormat call
	+ `xdxf_transform.py`: remove warnings, [#322](https://github.com/ilius/pyglossary/issues/322)
	+ Merge PR [#317](https://github.com/ilius/pyglossary/pull/317)
		+ Parse `sr`, `gr`, `ex_orig`, `ex_transl` tags and `audio`
		+ Remove `None` attribute from `audio` tag
		+ Use unicode symbols for audio and external link
		+ Use another speaker symbol for audio
		+ Add audio controls
		+ Use plain link without an audio tag
- Mobi
	- Update ebook_mobi.py and README.md, [#299](https://github.com/ilius/pyglossary/issues/299)
	- Add PR [#335](https://github.com/ilius/pyglossary/pull/335) with some modifications
- Changes in `ebook_base.py` (Mobi and EPUB)
	- Avoid exception if removing tmpDir failed
	- Use `style.css` dataEntry, [#299](https://github.com/ilius/pyglossary/issues/299)
- DSL Reader:
	- Strip whitespaces around language names, [#264](https://github.com/ilius/pyglossary/issues/264)
	- Add progressbar support, [#264](https://github.com/ilius/pyglossary/issues/264)
	- Run `html.escape` on text before adding html tags, [#265](https://github.com/ilius/pyglossary/issues/265)
	- Strip and unquote glossary name
	- Generate `` and `` instead of ``
	- Avoid adding html comment
	- Remove `\ufeff` from header lines, [#306](https://github.com/ilius/pyglossary/issues/306)
- AppleDict Source
	+ Change path of Dictionary Development Kit, [#300](https://github.com/ilius/pyglossary/issues/300)
	+ Open all text files with `encoding="utf-8"`
	+ Some refactoring
	+ Rename 4 options:
		- cleanHTML -> clean_html
		- defaultPrefs -> default_prefs
		- prefsHTML -> prefs_html
		- frontBackMatter -> front_back_matter
- AppleDict Binary
	+ Improvements, [#299](https://github.com/ilius/pyglossary/issues/299)
	+ Read `DefaultStyle.css` file, add as `style.css`, [#299](https://github.com/ilius/pyglossary/issues/299)
	+ Change default value of option: `html=True`
- Octopus MDict (MDX)
	+ Fix image links
	+ Do not set empty title
	+ Minor improvement in `readmdict.py`
	+ Handle exception when reading from a corrupt MDD file
	+ Add bool flag same_dir_data_files, [#289](https://github.com/ilius/pyglossary/issues/289)
	+ Add read-option: `audio=True` (default: `False`), [#327](https://github.com/ilius/pyglossary/issues/327)
	+ `audio`: remove extra attrs and add comments
- DICT.org plugin:
	- `installToDictd`: skip if target directory does not exist
	- Make rendering dictd files a bit clearer in pure txt
	- Fix indentation issue and add bword prefix as url
- Fixes and improvements in Dict.cc (SQLite3) plugin:
	- Fix typo, and avoid iterating over cur, use `fetchall()`, [#296](https://github.com/ilius/pyglossary/issues/296)
	- Remove gender from headword, add it to definition, [#296](https://github.com/ilius/pyglossary/issues/296)
	- Avoid running `unescape_unicode`
- JMDict
	- Support reading compressed file directly
	- Show pos before gloss (translations)
	- Avoid running `unescape_unicode`
- DigitalNK: work around Python's sqlite bug, [#282](https://github.com/ilius/pyglossary/issues/282)
- Changes in `dict_org.py` plugin, by Justin Yang
	- Use `<br/>` to replace newline
	- Replace words wrapped in `{}` with true web links
- CC-CEDICT Reader:
	+ Fix import error in `conv.py`
	+ Switch from jinja2 to lxml
		- Fix not escaping `<`, `>` and `&`
		- Note: lxml inserts `&#160;` instead of `&nbsp;`
	+ Use `` instead of ``
	+ add option to use Traditional Chinese for entry name
	- Avoid colorizing if tones count does not match `len(syllables)`, [#328](https://github.com/ilius/pyglossary/issues/328)
		- Add `` for each syllable in case of mismatched tones, [#328](https://github.com/ilius/pyglossary/issues/328)
- Rename read/write options:
	- DSL: rename option onlyFixMarkUp to only_fix_markup
	- SQL: rename 2 options:
		- `infoKeys` -> `info_keys`
		- `addExtraInfo` -> `add_extra_info`
	- EDLIN: rename option `havePrevLink` to `prev_link`
	- CSV: rename option `writeInfo` to `enable_info`
	- JSON: rename option `writeInfo` to `enable_info`
	- BGL: rename all read/write options (from camelCase to snake_case)
- New formats:
	- Read "ABC Medical Notes (SQLite3)", `plugins/abc_medical_notes.py`, [#267](https://github.com/ilius/pyglossary/issues/267)
	- Read "Almaany.com (SQLite3)", `plugins/almaany.py`, [#267](https://github.com/ilius/pyglossary/issues/267) [#268](https://github.com/ilius/pyglossary/issues/268)
- Remove TreeDict plugin, `plugins/treedict.py`
- Remove FreeDict writer
pyglossary-4.5.0/doc/releases/4.2.0.md000066400000000000000000000025431417733132500173010ustar00rootroot00000000000000# Changes since [4.1.0](./4.1.0.md) #

- Breaking changes:
	- Replace `glos.getAuthor()` with `glos.author`
		- This looks for "author" and then "publisher" keys in info/metadata
	- Rename option `apply_css` to `css` for mobi and epub2
	- `glos.getInfo` and `glos.setInfo` only accept `str` as key (or a subclass of `str`)
- Bug fixes:
	- Indirect mode: Fix handling '|' character in words.
		- Escape/unescape `|` in words when converting `entry` <-> `rawEntry`
		- Escape/unescape `|` in words when writing/reading text-based file formats
	- JSON: Prevent duplicate keys in json output, [#344](https://github.com/ilius/pyglossary/issues/344)
		- Add new method `glos.preventDuplicateWords()`
- Features and improvements
	- Add SQLite mode with `--sqlite` flag for converting to StarDict.
		- Eliminates the need to load all entries into RAM, limiting RAM usage.
		- You can add `--sqlite` to your command, even for running GUI.
			- For example: `python3 main.py --tk --sqlite`
		- See [README.md](../../README.md#sqlite-mode) for more details.
	- Add `--source-lang` and `--target-lang` flags
	- XDXF: support more tags and improvements
	- Add unit tests for `Glossary` class, and some functions in `text_utils.py`
	- Windows: change cache directory to `%LOCALAPPDATA%`
	- Some refactoring and optimization
	- Update, improve and re-format documentation
pyglossary-4.5.0/doc/releases/4.2.1.md000066400000000000000000000016711417733132500173030ustar00rootroot00000000000000# Changes since version [4.2.0](./4.2.0.md) #

### Minor bug fixes and improvements:

- `text_utils.py`
	- Minor bug: fix legacy function `urlToPath` using `urllib.parse.unquote`
	- Minor bug: `replacePostSpaceChar`: remove trailing space from the output str
	- Cleanup:
		- Remove unused function `isControlChar`
		- Remove unused function `formatByteStr`
		- Remove argument `exclude` from function `isASCII`
	- Add unit tests
- `ui_cmd_interactive.py`: fix a minor bug and some small refactoring
- Command line: Override input glossary info with `--source-lang` and `--target-lang` flags
- Add unit tests for CSV -> Tabfile conversion
- CSV plugin: some refactoring, and rename the module to `csv_plugin.py`
- Update `setup.py`: add `python_requires=">=3.7.0"`, update `extras_require`
- Update README.md

### Features:

- Command line: Add `--name` flag for changing glossary name
- `Glossary`: `convert`: add `infoOverride` optional argument
pyglossary-4.5.0/doc/releases/4.3.0.md000066400000000000000000000071341417733132500173030ustar00rootroot00000000000000# Changes since [4.2.1](./4.2.1.md)

## Bug fixes

- Tabfile writer: fix replacing `\` with `\\`
- `--remove-html` flag: fix bad regex
- ui_cmd_interactive: fix a few bugs
- Lowercase word/entry links (`
The data is NOT a utf-8 string, but is instead a string in locale encoding, ending with `\0`. Sometimes using this type will save disk space, but its use is discouraged. This is only an idea.
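As a rough sketch (not from the StarDict docs), building such an `l` field in Python could look like:

```python
import locale

# An "l" definition is in the system locale encoding (NOT utf-8),
# terminated with "\0"; the sample text is arbitrary.
text = "definition text"
data = text.encode(locale.getpreferredencoding()) + b"\x00"
```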
## `g`

A utf-8 string which is marked up with the Pango text markup language.
For more information about this markup language, see the [Pango Reference Manual](http://library.gnome.org/devel/pango/stable/PangoMarkupFormat.html).
You might have it installed locally [here](file:///usr/share/gtk-doc/html/pango/PangoMarkupFormat.html).

## `t`

English phonetic string.
The data should be a utf-8 string ending with `\0`. Here are some utf-8 phonetic characters:
`θʃŋʧðʒæıʌʊɒɛəɑɜɔˌˈːˑṃṇḷ`
`æɑɒʌәєŋvθðʃʒɚːɡˏˊˋ`

## `x`

A utf-8 string which is marked up with the [xdxf language](https://github.com/soshial/xdxf_makedict).
StarDict has these extensions:

- `<rref>` can have "type" attribute, it can be "image", "sound", "video" or "attach".
- `<kref>` can have "k" attribute.
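For illustration only, a hypothetical `x` definition using these extensions could be built like this (the words and file name are made up):

```python
# A hypothetical xdxf ("x") definition as a Python string;
# <rref> and <kref> are the StarDict extensions described above.
defi = (
	"<k>apple</k>\n"
	"A common fruit.\n"
	'<rref type="image">apple.png</rref>\n'
	"See also: <kref>pear</kref>"
)
```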
The data should be a utf-8 string ending with `\0`. ## `k` [KingSoft](https://en.wikipedia.org/wiki/Kingsoft) [PowerWord](https://en.wikipedia.org/wiki/PowerWord)'s data. The data is a utf-8 string ending with `\0`. And it's in XML format. ## `w` [MediaWiki markup language](https://www.mediawiki.org/wiki/Help:Formatting). ## `h` Html codes. ## `n` WordNet data. ## `r` Resource file list.
The content can be: - `img:pic/example.jpg` Image file - `snd:apple.wav` Sound file - `vdo:film.avi` Video file - `att:file.bin` Attachment file More than one line is supported as a list of available files.
StarDict will find the files in the Resource Storage.
The image will be shown, the sound file will have a play button.
You can "save as" the attachment file and so on.
The file list must be a utf-8 string ending with `\0`.
Use `\n` for separating new lines.
Use `/` character as directory separator.
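A minimal sketch, following the layout just described (the file names are the examples above):

```python
# An "r" definition: a utf-8 file list, one file per line,
# separated with "\n" and terminated with "\0".
files = ["img:pic/example.jpg", "snd:apple.wav"]
data = "\n".join(files).encode("utf-8") + b"\x00"
```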
## `W`

`.wav` audio file.
The data begins with a network byte-ordered guint32 to identify the wav file's size, immediately followed by the file's content. This is only an idea; it is better to use the `r` Resource file list in most cases.

## `P`

Picture file.
The data begins with a network byte-ordered guint32 to identify the picture file's size, immediately followed by the file's content.
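Both `W` and `P` use this size-prefixed layout; a minimal sketch of packing it in Python (the file name is just an example):

```python
import struct

# A network byte-ordered (big-endian) guint32 with the file size,
# immediately followed by the raw file content.
with open("apple.wav", "rb") as f:
	content = f.read()
field = struct.pack(">I", len(content)) + content
```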
This feature is implemented, as stardict-advertisement-plugin needs it. Anyway, it is better to use the `r` Resource file list in most cases.

## `X`

This type identifier is reserved for experimental extensions.

# For more information

Refer to the StarDict documentation at: [https://github.com/huzheng001/stardict-3/blob/master/dict/doc/StarDictFileFormat](https://github.com/huzheng001/stardict-3/blob/master/dict/doc/StarDictFileFormat)
pyglossary-4.5.0/doc/term-colors.md000066400000000000000000000740111417733132500173020ustar00rootroot00000000000000## Terminal / ANSI Colors

| Sample | Code | Hex | RGB | HSL |
| ----------------------------------------------------------- | ---- | --------- | ------------- | ----------------- |
| ![](https://via.placeholder.com/60x30/000000/000000?text=+) | 0 | `#000000` | 0, 0, 0 | 0, 0, 0 |
| ![](https://via.placeholder.com/60x30/aa0000/000000?text=+) | 1 | `#aa0000` | 170, 0, 0 | 0, 1, 0.333 |
| ![](https://via.placeholder.com/60x30/00aa00/000000?text=+) | 2 | `#00aa00` | 0, 170, 0 | 120, 1, 0.333 |
| ![](https://via.placeholder.com/60x30/aa5500/000000?text=+) | 3 | `#aa5500` | 170, 85, 0 | 30, 1, 0.333 |
| ![](https://via.placeholder.com/60x30/0000aa/000000?text=+) | 4 | `#0000aa` | 0, 0, 170 | 240, 1, 0.333 |
| ![](https://via.placeholder.com/60x30/aa00aa/000000?text=+) | 5 | `#aa00aa` | 170, 0, 170 | 300, 1, 0.333 |
| ![](https://via.placeholder.com/60x30/00aaaa/000000?text=+) | 6 | `#00aaaa` | 0, 170, 170 | 180, 1, 0.333 |
| ![](https://via.placeholder.com/60x30/b9b9b9/000000?text=+) | 7 | `#b9b9b9` | 185, 185, 185 | 0, 0, 0.725 |
| ![](https://via.placeholder.com/60x30/555555/000000?text=+) | 8 | `#555555` | 85, 85, 85 | 0, 0, 0.333 |
| ![](https://via.placeholder.com/60x30/ff5555/000000?text=+) | 9 | `#ff5555` | 255, 85, 85 | 0, 1, 0.667 |
| ![](https://via.placeholder.com/60x30/55ff55/000000?text=+) | 10 | `#55ff55` | 85, 255, 85 | 120, 1, 0.667 |
| ![](https://via.placeholder.com/60x30/ffff55/000000?text=+) | 11 | `#ffff55` | 255, 255, 85 | 60, 1, 0.667 |
| ![](https://via.placeholder.com/60x30/5555ff/000000?text=+) | 12 | `#5555ff` | 85, 85, 255 | 240, 1, 0.667 |
| ![](https://via.placeholder.com/60x30/ff55ff/000000?text=+) | 13 | `#ff55ff` | 255, 85, 255 | 300, 1, 0.667 |
| ![](https://via.placeholder.com/60x30/55ffff/000000?text=+) | 14 | `#55ffff` | 85, 255, 255 | 180, 1, 0.667 |
| ![](https://via.placeholder.com/60x30/ffffff/000000?text=+) | 15 | `#ffffff` | 255, 255, 255 | 0, 0, 1 |
| ![](https://via.placeholder.com/60x30/000000/000000?text=+) | 16 | `#000000` | 0, 0, 0 | 0, 0, 0 |
| ![](https://via.placeholder.com/60x30/00005f/000000?text=+) | 17 | `#00005f` | 0, 0, 95 | 240, 1, 0.186 |
| ![](https://via.placeholder.com/60x30/000087/000000?text=+) | 18 | `#000087` | 0, 0, 135 | 240, 1, 0.265 |
| ![](https://via.placeholder.com/60x30/0000af/000000?text=+) | 19 | `#0000af` | 0, 0, 175 | 240, 1, 0.343 |
| ![](https://via.placeholder.com/60x30/0000d7/000000?text=+) | 20 | `#0000d7` | 0, 0, 215 | 240, 1, 0.422 |
| ![](https://via.placeholder.com/60x30/0000ff/000000?text=+) | 21 | `#0000ff` | 0, 0, 255 | 240, 1, 0.5 |
| ![](https://via.placeholder.com/60x30/005f00/000000?text=+) | 22 | `#005f00` | 0, 95, 0 | 120, 1, 0.186 |
| ![](https://via.placeholder.com/60x30/005f5f/000000?text=+) | 23 | `#005f5f` | 0, 95, 95 | 180, 1, 0.186 |
| ![](https://via.placeholder.com/60x30/005f87/000000?text=+) | 24 | `#005f87` | 0, 95, 135 | 197.778, 1, 0.265 |
| ![](https://via.placeholder.com/60x30/005faf/000000?text=+) | 25 | `#005faf` | 0, 95, 175 | 207.429, 1, 0.343 |
| ![](https://via.placeholder.com/60x30/005fd7/000000?text=+) | 26 | `#005fd7` | 0, 95, 215 | 213.488, 1, 0.422 | | ![](https://via.placeholder.com/60x30/005fff/000000?text=+) | 27 | `#005fff` | 0, 95, 255 | 217.647, 1, 0.5 | | ![](https://via.placeholder.com/60x30/008700/000000?text=+) | 28 | `#008700` | 0, 135, 0 | 120, 1, 0.265 | | ![](https://via.placeholder.com/60x30/00875f/000000?text=+) | 29 | `#00875f` | 0, 135, 95 | 162.222, 1, 0.265 | | ![](https://via.placeholder.com/60x30/008787/000000?text=+) | 30 | `#008787` | 0, 135, 135 | 180, 1, 0.265 | | ![](https://via.placeholder.com/60x30/0087af/000000?text=+) | 31 | `#0087af` | 0, 135, 175 | 193.714, 1, 0.343 | | ![](https://via.placeholder.com/60x30/0087d7/000000?text=+) | 32 | `#0087d7` | 0, 135, 215 | 202.326, 1, 0.422 | | ![](https://via.placeholder.com/60x30/0087ff/000000?text=+) | 33 | `#0087ff` | 0, 135, 255 | 208.235, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00af00/000000?text=+) | 34 | `#00af00` | 0, 175, 0 | 120, 1, 0.343 | | ![](https://via.placeholder.com/60x30/00af5f/000000?text=+) | 35 | `#00af5f` | 0, 175, 95 | 152.571, 1, 0.343 | | ![](https://via.placeholder.com/60x30/00af87/000000?text=+) | 36 | `#00af87` | 0, 175, 135 | 166.286, 1, 0.343 | | ![](https://via.placeholder.com/60x30/00afaf/000000?text=+) | 37 | `#00afaf` | 0, 175, 175 | 180, 1, 0.343 | | ![](https://via.placeholder.com/60x30/00afd7/000000?text=+) | 38 | `#00afd7` | 0, 175, 215 | 191.163, 1, 0.422 | | ![](https://via.placeholder.com/60x30/00afff/000000?text=+) | 39 | `#00afff` | 0, 175, 255 | 198.824, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00d700/000000?text=+) | 40 | `#00d700` | 0, 215, 0 | 120, 1, 0.422 | | ![](https://via.placeholder.com/60x30/00d75f/000000?text=+) | 41 | `#00d75f` | 0, 215, 95 | 146.512, 1, 0.422 | | ![](https://via.placeholder.com/60x30/00d787/000000?text=+) | 42 | `#00d787` | 0, 215, 135 | 157.674, 1, 0.422 | | ![](https://via.placeholder.com/60x30/00d7af/000000?text=+) | 43 | `#00d7af` | 0, 215, 175 | 168.837, 1, 0.422 | | ![](https://via.placeholder.com/60x30/00d7d7/000000?text=+) | 44 | `#00d7d7` | 0, 215, 215 | 180, 1, 0.422 | | ![](https://via.placeholder.com/60x30/00d7ff/000000?text=+) | 45 | `#00d7ff` | 0, 215, 255 | 189.412, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00ff00/000000?text=+) | 46 | `#00ff00` | 0, 255, 0 | 120, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00ff5f/000000?text=+) | 47 | `#00ff5f` | 0, 255, 95 | 142.353, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00ff87/000000?text=+) | 48 | `#00ff87` | 0, 255, 135 | 151.765, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00ffaf/000000?text=+) | 49 | `#00ffaf` | 0, 255, 175 | 161.176, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00ffd7/000000?text=+) | 50 | `#00ffd7` | 0, 255, 215 | 170.588, 1, 0.5 | | ![](https://via.placeholder.com/60x30/00ffff/000000?text=+) | 51 | `#00ffff` | 0, 255, 255 | 180, 1, 0.5 | | ![](https://via.placeholder.com/60x30/5f0000/000000?text=+) | 52 | `#5f0000` | 95, 0, 0 | 0, 1, 0.186 | | ![](https://via.placeholder.com/60x30/5f005f/000000?text=+) | 53 | `#5f005f` | 95, 0, 95 | 300, 1, 0.186 | | ![](https://via.placeholder.com/60x30/5f0087/000000?text=+) | 54 | `#5f0087` | 95, 0, 135 | 282.222, 1, 0.265 | | ![](https://via.placeholder.com/60x30/5f00af/000000?text=+) | 55 | `#5f00af` | 95, 0, 175 | 272.571, 1, 0.343 | | ![](https://via.placeholder.com/60x30/5f00d7/000000?text=+) | 56 | `#5f00d7` | 95, 0, 215 | 266.512, 1, 0.422 | | ![](https://via.placeholder.com/60x30/5f00ff/000000?text=+) | 57 | 
`#5f00ff` | 95, 0, 255 | 262.353, 1, 0.5 | | ![](https://via.placeholder.com/60x30/5f5f00/000000?text=+) | 58 | `#5f5f00` | 95, 95, 0 | 60, 1, 0.186 | | ![](https://via.placeholder.com/60x30/5f5f5f/000000?text=+) | 59 | `#5f5f5f` | 95, 95, 95 | 0, 0, 0.373 | | ![](https://via.placeholder.com/60x30/5f5f87/000000?text=+) | 60 | `#5f5f87` | 95, 95, 135 | 240, 0.174, 0.451 | | ![](https://via.placeholder.com/60x30/5f5faf/000000?text=+) | 61 | `#5f5faf` | 95, 95, 175 | 240, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/5f5fd7/000000?text=+) | 62 | `#5f5fd7` | 95, 95, 215 | 240, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5f5fff/000000?text=+) | 63 | `#5f5fff` | 95, 95, 255 | 240, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5f8700/000000?text=+) | 64 | `#5f8700` | 95, 135, 0 | 77.778, 1, 0.265 | | ![](https://via.placeholder.com/60x30/5f875f/000000?text=+) | 65 | `#5f875f` | 95, 135, 95 | 120, 0.174, 0.451 | | ![](https://via.placeholder.com/60x30/5f8787/000000?text=+) | 66 | `#5f8787` | 95, 135, 135 | 180, 0.174, 0.451 | | ![](https://via.placeholder.com/60x30/5f87af/000000?text=+) | 67 | `#5f87af` | 95, 135, 175 | 210, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/5f87d7/000000?text=+) | 68 | `#5f87d7` | 95, 135, 215 | 220, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5f87ff/000000?text=+) | 69 | `#5f87ff` | 95, 135, 255 | 225, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5faf00/000000?text=+) | 70 | `#5faf00` | 95, 175, 0 | 87.429, 1, 0.343 | | ![](https://via.placeholder.com/60x30/5faf5f/000000?text=+) | 71 | `#5faf5f` | 95, 175, 95 | 120, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/5faf87/000000?text=+) | 72 | `#5faf87` | 95, 175, 135 | 150, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/5fafaf/000000?text=+) | 73 | `#5fafaf` | 95, 175, 175 | 180, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/5fafd7/000000?text=+) | 74 | `#5fafd7` | 95, 175, 215 | 200, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5fafff/000000?text=+) | 75 | `#5fafff` | 95, 175, 255 | 210, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5fd700/000000?text=+) | 76 | `#5fd700` | 95, 215, 0 | 93.488, 1, 0.422 | | ![](https://via.placeholder.com/60x30/5fd75f/000000?text=+) | 77 | `#5fd75f` | 95, 215, 95 | 120, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5fd787/000000?text=+) | 78 | `#5fd787` | 95, 215, 135 | 140, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5fd7af/000000?text=+) | 79 | `#5fd7af` | 95, 215, 175 | 160, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5fd7d7/000000?text=+) | 80 | `#5fd7d7` | 95, 215, 215 | 180, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/5fd7ff/000000?text=+) | 81 | `#5fd7ff` | 95, 215, 255 | 195, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5fff00/000000?text=+) | 82 | `#5fff00` | 95, 255, 0 | 97.647, 1, 0.5 | | ![](https://via.placeholder.com/60x30/5fff5f/000000?text=+) | 83 | `#5fff5f` | 95, 255, 95 | 120, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5fff87/000000?text=+) | 84 | `#5fff87` | 95, 255, 135 | 135, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5fffaf/000000?text=+) | 85 | `#5fffaf` | 95, 255, 175 | 150, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5fffd7/000000?text=+) | 86 | `#5fffd7` | 95, 255, 215 | 165, 1, 0.686 | | ![](https://via.placeholder.com/60x30/5fffff/000000?text=+) | 87 | `#5fffff` | 95, 255, 255 | 180, 1, 0.686 | | ![](https://via.placeholder.com/60x30/870000/000000?text=+) | 88 | `#870000` | 135, 0, 0 | 0, 1, 0.265 | | 
![](https://via.placeholder.com/60x30/87005f/000000?text=+) | 89 | `#87005f` | 135, 0, 95 | 317.778, 1, 0.265 | | ![](https://via.placeholder.com/60x30/870087/000000?text=+) | 90 | `#870087` | 135, 0, 135 | 300, 1, 0.265 | | ![](https://via.placeholder.com/60x30/8700af/000000?text=+) | 91 | `#8700af` | 135, 0, 175 | 286.286, 1, 0.343 | | ![](https://via.placeholder.com/60x30/8700d7/000000?text=+) | 92 | `#8700d7` | 135, 0, 215 | 277.674, 1, 0.422 | | ![](https://via.placeholder.com/60x30/8700ff/000000?text=+) | 93 | `#8700ff` | 135, 0, 255 | 271.765, 1, 0.5 | | ![](https://via.placeholder.com/60x30/875f00/000000?text=+) | 94 | `#875f00` | 135, 95, 0 | 42.222, 1, 0.265 | | ![](https://via.placeholder.com/60x30/875f5f/000000?text=+) | 95 | `#875f5f` | 135, 95, 95 | 0, 0.174, 0.451 | | ![](https://via.placeholder.com/60x30/875f87/000000?text=+) | 96 | `#875f87` | 135, 95, 135 | 300, 0.174, 0.451 | | ![](https://via.placeholder.com/60x30/875faf/000000?text=+) | 97 | `#875faf` | 135, 95, 175 | 270, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/875fd7/000000?text=+) | 98 | `#875fd7` | 135, 95, 215 | 260, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/875fff/000000?text=+) | 99 | `#875fff` | 135, 95, 255 | 255, 1, 0.686 | | ![](https://via.placeholder.com/60x30/878700/000000?text=+) | 100 | `#878700` | 135, 135, 0 | 60, 1, 0.265 | | ![](https://via.placeholder.com/60x30/87875f/000000?text=+) | 101 | `#87875f` | 135, 135, 95 | 60, 0.174, 0.451 | | ![](https://via.placeholder.com/60x30/878787/000000?text=+) | 102 | `#878787` | 135, 135, 135 | 0, 0, 0.529 | | ![](https://via.placeholder.com/60x30/8787af/000000?text=+) | 103 | `#8787af` | 135, 135, 175 | 240, 0.2, 0.608 | | ![](https://via.placeholder.com/60x30/8787d7/000000?text=+) | 104 | `#8787d7` | 135, 135, 215 | 240, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/8787ff/000000?text=+) | 105 | `#8787ff` | 135, 135, 255 | 240, 1, 0.765 | | ![](https://via.placeholder.com/60x30/87af00/000000?text=+) | 106 | `#87af00` | 135, 175, 0 | 73.714, 1, 0.343 | | ![](https://via.placeholder.com/60x30/87af5f/000000?text=+) | 107 | `#87af5f` | 135, 175, 95 | 90, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/87af87/000000?text=+) | 108 | `#87af87` | 135, 175, 135 | 120, 0.2, 0.608 | | ![](https://via.placeholder.com/60x30/87afaf/000000?text=+) | 109 | `#87afaf` | 135, 175, 175 | 180, 0.2, 0.608 | | ![](https://via.placeholder.com/60x30/87afd7/000000?text=+) | 110 | `#87afd7` | 135, 175, 215 | 210, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/87afff/000000?text=+) | 111 | `#87afff` | 135, 175, 255 | 220, 1, 0.765 | | ![](https://via.placeholder.com/60x30/87d700/000000?text=+) | 112 | `#87d700` | 135, 215, 0 | 82.326, 1, 0.422 | | ![](https://via.placeholder.com/60x30/87d75f/000000?text=+) | 113 | `#87d75f` | 135, 215, 95 | 100, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/87d787/000000?text=+) | 114 | `#87d787` | 135, 215, 135 | 120, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/87d7af/000000?text=+) | 115 | `#87d7af` | 135, 215, 175 | 150, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/87d7d7/000000?text=+) | 116 | `#87d7d7` | 135, 215, 215 | 180, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/87d7ff/000000?text=+) | 117 | `#87d7ff` | 135, 215, 255 | 200, 1, 0.765 | | ![](https://via.placeholder.com/60x30/87ff00/000000?text=+) | 118 | `#87ff00` | 135, 255, 0 | 88.235, 1, 0.5 | | ![](https://via.placeholder.com/60x30/87ff5f/000000?text=+) | 119 | `#87ff5f` | 135, 255, 95 | 105, 1, 0.686 | | 
![](https://via.placeholder.com/60x30/87ff87/000000?text=+) | 120 | `#87ff87` | 135, 255, 135 | 120, 1, 0.765 | | ![](https://via.placeholder.com/60x30/87ffaf/000000?text=+) | 121 | `#87ffaf` | 135, 255, 175 | 140, 1, 0.765 | | ![](https://via.placeholder.com/60x30/87ffd7/000000?text=+) | 122 | `#87ffd7` | 135, 255, 215 | 160, 1, 0.765 | | ![](https://via.placeholder.com/60x30/87ffff/000000?text=+) | 123 | `#87ffff` | 135, 255, 255 | 180, 1, 0.765 | | ![](https://via.placeholder.com/60x30/af0000/000000?text=+) | 124 | `#af0000` | 175, 0, 0 | 0, 1, 0.343 | | ![](https://via.placeholder.com/60x30/af005f/000000?text=+) | 125 | `#af005f` | 175, 0, 95 | 327.429, 1, 0.343 | | ![](https://via.placeholder.com/60x30/af0087/000000?text=+) | 126 | `#af0087` | 175, 0, 135 | 313.714, 1, 0.343 | | ![](https://via.placeholder.com/60x30/af00af/000000?text=+) | 127 | `#af00af` | 175, 0, 175 | 300, 1, 0.343 | | ![](https://via.placeholder.com/60x30/af00d7/000000?text=+) | 128 | `#af00d7` | 175, 0, 215 | 288.837, 1, 0.422 | | ![](https://via.placeholder.com/60x30/af00ff/000000?text=+) | 129 | `#af00ff` | 175, 0, 255 | 281.176, 1, 0.5 | | ![](https://via.placeholder.com/60x30/af5f00/000000?text=+) | 130 | `#af5f00` | 175, 95, 0 | 32.571, 1, 0.343 | | ![](https://via.placeholder.com/60x30/af5f5f/000000?text=+) | 131 | `#af5f5f` | 175, 95, 95 | 0, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/af5f87/000000?text=+) | 132 | `#af5f87` | 175, 95, 135 | 330, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/af5faf/000000?text=+) | 133 | `#af5faf` | 175, 95, 175 | 300, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/af5fd7/000000?text=+) | 134 | `#af5fd7` | 175, 95, 215 | 280, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/af5fff/000000?text=+) | 135 | `#af5fff` | 175, 95, 255 | 270, 1, 0.686 | | ![](https://via.placeholder.com/60x30/af8700/000000?text=+) | 136 | `#af8700` | 175, 135, 0 | 46.286, 1, 0.343 | | ![](https://via.placeholder.com/60x30/af875f/000000?text=+) | 137 | `#af875f` | 175, 135, 95 | 30, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/af8787/000000?text=+) | 138 | `#af8787` | 175, 135, 135 | 0, 0.2, 0.608 | | ![](https://via.placeholder.com/60x30/af87af/000000?text=+) | 139 | `#af87af` | 175, 135, 175 | 300, 0.2, 0.608 | | ![](https://via.placeholder.com/60x30/af87d7/000000?text=+) | 140 | `#af87d7` | 175, 135, 215 | 270, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/af87ff/000000?text=+) | 141 | `#af87ff` | 175, 135, 255 | 260, 1, 0.765 | | ![](https://via.placeholder.com/60x30/afaf00/000000?text=+) | 142 | `#afaf00` | 175, 175, 0 | 60, 1, 0.343 | | ![](https://via.placeholder.com/60x30/afaf5f/000000?text=+) | 143 | `#afaf5f` | 175, 175, 95 | 60, 0.333, 0.529 | | ![](https://via.placeholder.com/60x30/afaf87/000000?text=+) | 144 | `#afaf87` | 175, 175, 135 | 60, 0.2, 0.608 | | ![](https://via.placeholder.com/60x30/afafaf/000000?text=+) | 145 | `#afafaf` | 175, 175, 175 | 0, 0, 0.686 | | ![](https://via.placeholder.com/60x30/afafd7/000000?text=+) | 146 | `#afafd7` | 175, 175, 215 | 240, 0.333, 0.765 | | ![](https://via.placeholder.com/60x30/afafff/000000?text=+) | 147 | `#afafff` | 175, 175, 255 | 240, 1, 0.843 | | ![](https://via.placeholder.com/60x30/afd700/000000?text=+) | 148 | `#afd700` | 175, 215, 0 | 71.163, 1, 0.422 | | ![](https://via.placeholder.com/60x30/afd75f/000000?text=+) | 149 | `#afd75f` | 175, 215, 95 | 80, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/afd787/000000?text=+) | 150 | `#afd787` | 175, 215, 135 | 90, 0.5, 0.686 | | 
![](https://via.placeholder.com/60x30/afd7af/000000?text=+) | 151 | `#afd7af` | 175, 215, 175 | 120, 0.333, 0.765 | | ![](https://via.placeholder.com/60x30/afd7d7/000000?text=+) | 152 | `#afd7d7` | 175, 215, 215 | 180, 0.333, 0.765 | | ![](https://via.placeholder.com/60x30/afd7ff/000000?text=+) | 153 | `#afd7ff` | 175, 215, 255 | 210, 1, 0.843 | | ![](https://via.placeholder.com/60x30/afff00/000000?text=+) | 154 | `#afff00` | 175, 255, 0 | 78.824, 1, 0.5 | | ![](https://via.placeholder.com/60x30/afff5f/000000?text=+) | 155 | `#afff5f` | 175, 255, 95 | 90, 1, 0.686 | | ![](https://via.placeholder.com/60x30/afff87/000000?text=+) | 156 | `#afff87` | 175, 255, 135 | 100, 1, 0.765 | | ![](https://via.placeholder.com/60x30/afffaf/000000?text=+) | 157 | `#afffaf` | 175, 255, 175 | 120, 1, 0.843 | | ![](https://via.placeholder.com/60x30/afffd7/000000?text=+) | 158 | `#afffd7` | 175, 255, 215 | 150, 1, 0.843 | | ![](https://via.placeholder.com/60x30/afffff/000000?text=+) | 159 | `#afffff` | 175, 255, 255 | 180, 1, 0.843 | | ![](https://via.placeholder.com/60x30/d70000/000000?text=+) | 160 | `#d70000` | 215, 0, 0 | 0, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d7005f/000000?text=+) | 161 | `#d7005f` | 215, 0, 95 | 333.488, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d70087/000000?text=+) | 162 | `#d70087` | 215, 0, 135 | 322.326, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d700af/000000?text=+) | 163 | `#d700af` | 215, 0, 175 | 311.163, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d700d7/000000?text=+) | 164 | `#d700d7` | 215, 0, 215 | 300, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d700ff/000000?text=+) | 165 | `#d700ff` | 215, 0, 255 | 290.588, 1, 0.5 | | ![](https://via.placeholder.com/60x30/d75f00/000000?text=+) | 166 | `#d75f00` | 215, 95, 0 | 26.512, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d75f5f/000000?text=+) | 167 | `#d75f5f` | 215, 95, 95 | 0, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d75f87/000000?text=+) | 168 | `#d75f87` | 215, 95, 135 | 340, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d75faf/000000?text=+) | 169 | `#d75faf` | 215, 95, 175 | 320, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d75fd7/000000?text=+) | 170 | `#d75fd7` | 215, 95, 215 | 300, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d75fff/000000?text=+) | 171 | `#d75fff` | 215, 95, 255 | 285, 1, 0.686 | | ![](https://via.placeholder.com/60x30/d78700/000000?text=+) | 172 | `#d78700` | 215, 135, 0 | 37.674, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d7875f/000000?text=+) | 173 | `#d7875f` | 215, 135, 95 | 20, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d78787/000000?text=+) | 174 | `#d78787` | 215, 135, 135 | 0, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/d787af/000000?text=+) | 175 | `#d787af` | 215, 135, 175 | 330, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/d787d7/000000?text=+) | 176 | `#d787d7` | 215, 135, 215 | 300, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/d787ff/000000?text=+) | 177 | `#d787ff` | 215, 135, 255 | 280, 1, 0.765 | | ![](https://via.placeholder.com/60x30/d7af00/000000?text=+) | 178 | `#d7af00` | 215, 175, 0 | 48.837, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d7af5f/000000?text=+) | 179 | `#d7af5f` | 215, 175, 95 | 40, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d7af87/000000?text=+) | 180 | `#d7af87` | 215, 175, 135 | 30, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/d7afaf/000000?text=+) | 181 | `#d7afaf` | 215, 175, 175 | 0, 0.333, 0.765 | | 
![](https://via.placeholder.com/60x30/d7afd7/000000?text=+) | 182 | `#d7afd7` | 215, 175, 215 | 300, 0.333, 0.765 | | ![](https://via.placeholder.com/60x30/d7afff/000000?text=+) | 183 | `#d7afff` | 215, 175, 255 | 270, 1, 0.843 | | ![](https://via.placeholder.com/60x30/d7d700/000000?text=+) | 184 | `#d7d700` | 215, 215, 0 | 60, 1, 0.422 | | ![](https://via.placeholder.com/60x30/d7d75f/000000?text=+) | 185 | `#d7d75f` | 215, 215, 95 | 60, 0.6, 0.608 | | ![](https://via.placeholder.com/60x30/d7d787/000000?text=+) | 186 | `#d7d787` | 215, 215, 135 | 60, 0.5, 0.686 | | ![](https://via.placeholder.com/60x30/d7d7af/000000?text=+) | 187 | `#d7d7af` | 215, 215, 175 | 60, 0.333, 0.765 | | ![](https://via.placeholder.com/60x30/d7d7d7/000000?text=+) | 188 | `#d7d7d7` | 215, 215, 215 | 0, 0, 0.843 | | ![](https://via.placeholder.com/60x30/d7d7ff/000000?text=+) | 189 | `#d7d7ff` | 215, 215, 255 | 240, 1, 0.922 | | ![](https://via.placeholder.com/60x30/d7ff00/000000?text=+) | 190 | `#d7ff00` | 215, 255, 0 | 69.412, 1, 0.5 | | ![](https://via.placeholder.com/60x30/d7ff5f/000000?text=+) | 191 | `#d7ff5f` | 215, 255, 95 | 75, 1, 0.686 | | ![](https://via.placeholder.com/60x30/d7ff87/000000?text=+) | 192 | `#d7ff87` | 215, 255, 135 | 80, 1, 0.765 | | ![](https://via.placeholder.com/60x30/d7ffaf/000000?text=+) | 193 | `#d7ffaf` | 215, 255, 175 | 90, 1, 0.843 | | ![](https://via.placeholder.com/60x30/d7ffd7/000000?text=+) | 194 | `#d7ffd7` | 215, 255, 215 | 120, 1, 0.922 | | ![](https://via.placeholder.com/60x30/d7ffff/000000?text=+) | 195 | `#d7ffff` | 215, 255, 255 | 180, 1, 0.922 | | ![](https://via.placeholder.com/60x30/ff0000/000000?text=+) | 196 | `#ff0000` | 255, 0, 0 | 0, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff005f/000000?text=+) | 197 | `#ff005f` | 255, 0, 95 | 337.647, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff0087/000000?text=+) | 198 | `#ff0087` | 255, 0, 135 | 328.235, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff00af/000000?text=+) | 199 | `#ff00af` | 255, 0, 175 | 318.824, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff00d7/000000?text=+) | 200 | `#ff00d7` | 255, 0, 215 | 309.412, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff00ff/000000?text=+) | 201 | `#ff00ff` | 255, 0, 255 | 300, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff5f00/000000?text=+) | 202 | `#ff5f00` | 255, 95, 0 | 22.353, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff5f5f/000000?text=+) | 203 | `#ff5f5f` | 255, 95, 95 | 0, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ff5f87/000000?text=+) | 204 | `#ff5f87` | 255, 95, 135 | 345, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ff5faf/000000?text=+) | 205 | `#ff5faf` | 255, 95, 175 | 330, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ff5fd7/000000?text=+) | 206 | `#ff5fd7` | 255, 95, 215 | 315, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ff5fff/000000?text=+) | 207 | `#ff5fff` | 255, 95, 255 | 300, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ff8700/000000?text=+) | 208 | `#ff8700` | 255, 135, 0 | 31.765, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ff875f/000000?text=+) | 209 | `#ff875f` | 255, 135, 95 | 15, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ff8787/000000?text=+) | 210 | `#ff8787` | 255, 135, 135 | 0, 1, 0.765 | | ![](https://via.placeholder.com/60x30/ff87af/000000?text=+) | 211 | `#ff87af` | 255, 135, 175 | 340, 1, 0.765 | | ![](https://via.placeholder.com/60x30/ff87d7/000000?text=+) | 212 | `#ff87d7` | 255, 135, 215 | 320, 1, 0.765 | | 
![](https://via.placeholder.com/60x30/ff87ff/000000?text=+) | 213 | `#ff87ff` | 255, 135, 255 | 300, 1, 0.765 | | ![](https://via.placeholder.com/60x30/ffaf00/000000?text=+) | 214 | `#ffaf00` | 255, 175, 0 | 41.176, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ffaf5f/000000?text=+) | 215 | `#ffaf5f` | 255, 175, 95 | 30, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ffaf87/000000?text=+) | 216 | `#ffaf87` | 255, 175, 135 | 20, 1, 0.765 | | ![](https://via.placeholder.com/60x30/ffafaf/000000?text=+) | 217 | `#ffafaf` | 255, 175, 175 | 0, 1, 0.843 | | ![](https://via.placeholder.com/60x30/ffafd7/000000?text=+) | 218 | `#ffafd7` | 255, 175, 215 | 330, 1, 0.843 | | ![](https://via.placeholder.com/60x30/ffafff/000000?text=+) | 219 | `#ffafff` | 255, 175, 255 | 300, 1, 0.843 | | ![](https://via.placeholder.com/60x30/ffd700/000000?text=+) | 220 | `#ffd700` | 255, 215, 0 | 50.588, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ffd75f/000000?text=+) | 221 | `#ffd75f` | 255, 215, 95 | 45, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ffd787/000000?text=+) | 222 | `#ffd787` | 255, 215, 135 | 40, 1, 0.765 | | ![](https://via.placeholder.com/60x30/ffd7af/000000?text=+) | 223 | `#ffd7af` | 255, 215, 175 | 30, 1, 0.843 | | ![](https://via.placeholder.com/60x30/ffd7d7/000000?text=+) | 224 | `#ffd7d7` | 255, 215, 215 | 0, 1, 0.922 | | ![](https://via.placeholder.com/60x30/ffd7ff/000000?text=+) | 225 | `#ffd7ff` | 255, 215, 255 | 300, 1, 0.922 | | ![](https://via.placeholder.com/60x30/ffff00/000000?text=+) | 226 | `#ffff00` | 255, 255, 0 | 60, 1, 0.5 | | ![](https://via.placeholder.com/60x30/ffff5f/000000?text=+) | 227 | `#ffff5f` | 255, 255, 95 | 60, 1, 0.686 | | ![](https://via.placeholder.com/60x30/ffff87/000000?text=+) | 228 | `#ffff87` | 255, 255, 135 | 60, 1, 0.765 | | ![](https://via.placeholder.com/60x30/ffffaf/000000?text=+) | 229 | `#ffffaf` | 255, 255, 175 | 60, 1, 0.843 | | ![](https://via.placeholder.com/60x30/ffffd7/000000?text=+) | 230 | `#ffffd7` | 255, 255, 215 | 60, 1, 0.922 | | ![](https://via.placeholder.com/60x30/ffffff/000000?text=+) | 231 | `#ffffff` | 255, 255, 255 | 0, 0, 1 | | ![](https://via.placeholder.com/60x30/080808/000000?text=+) | 232 | `#080808` | 8, 8, 8 | 0, 0, 0.031 | | ![](https://via.placeholder.com/60x30/121212/000000?text=+) | 233 | `#121212` | 18, 18, 18 | 0, 0, 0.071 | | ![](https://via.placeholder.com/60x30/1c1c1c/000000?text=+) | 234 | `#1c1c1c` | 28, 28, 28 | 0, 0, 0.11 | | ![](https://via.placeholder.com/60x30/262626/000000?text=+) | 235 | `#262626` | 38, 38, 38 | 0, 0, 0.149 | | ![](https://via.placeholder.com/60x30/303030/000000?text=+) | 236 | `#303030` | 48, 48, 48 | 0, 0, 0.188 | | ![](https://via.placeholder.com/60x30/3a3a3a/000000?text=+) | 237 | `#3a3a3a` | 58, 58, 58 | 0, 0, 0.227 | | ![](https://via.placeholder.com/60x30/444444/000000?text=+) | 238 | `#444444` | 68, 68, 68 | 0, 0, 0.267 | | ![](https://via.placeholder.com/60x30/4e4e4e/000000?text=+) | 239 | `#4e4e4e` | 78, 78, 78 | 0, 0, 0.306 | | ![](https://via.placeholder.com/60x30/585858/000000?text=+) | 240 | `#585858` | 88, 88, 88 | 0, 0, 0.345 | | ![](https://via.placeholder.com/60x30/626262/000000?text=+) | 241 | `#626262` | 98, 98, 98 | 0, 0, 0.384 | | ![](https://via.placeholder.com/60x30/6c6c6c/000000?text=+) | 242 | `#6c6c6c` | 108, 108, 108 | 0, 0, 0.424 | | ![](https://via.placeholder.com/60x30/767676/000000?text=+) | 243 | `#767676` | 118, 118, 118 | 0, 0, 0.463 | | ![](https://via.placeholder.com/60x30/808080/000000?text=+) | 244 | `#808080` | 128, 128, 128 | 0, 0, 0.502 
|
| ![](https://via.placeholder.com/60x30/8a8a8a/000000?text=+) | 245 | `#8a8a8a` | 138, 138, 138 | 0, 0, 0.541 |
| ![](https://via.placeholder.com/60x30/949494/000000?text=+) | 246 | `#949494` | 148, 148, 148 | 0, 0, 0.58 |
| ![](https://via.placeholder.com/60x30/9e9e9e/000000?text=+) | 247 | `#9e9e9e` | 158, 158, 158 | 0, 0, 0.62 |
| ![](https://via.placeholder.com/60x30/a8a8a8/000000?text=+) | 248 | `#a8a8a8` | 168, 168, 168 | 0, 0, 0.659 |
| ![](https://via.placeholder.com/60x30/b2b2b2/000000?text=+) | 249 | `#b2b2b2` | 178, 178, 178 | 0, 0, 0.698 |
| ![](https://via.placeholder.com/60x30/bcbcbc/000000?text=+) | 250 | `#bcbcbc` | 188, 188, 188 | 0, 0, 0.737 |
| ![](https://via.placeholder.com/60x30/c6c6c6/000000?text=+) | 251 | `#c6c6c6` | 198, 198, 198 | 0, 0, 0.776 |
| ![](https://via.placeholder.com/60x30/d0d0d0/000000?text=+) | 252 | `#d0d0d0` | 208, 208, 208 | 0, 0, 0.816 |
| ![](https://via.placeholder.com/60x30/dadada/000000?text=+) | 253 | `#dadada` | 218, 218, 218 | 0, 0, 0.855 |
| ![](https://via.placeholder.com/60x30/e4e4e4/000000?text=+) | 254 | `#e4e4e4` | 228, 228, 228 | 0, 0, 0.894 |
| ![](https://via.placeholder.com/60x30/eeeeee/000000?text=+) | 255 | `#eeeeee` | 238, 238, 238 | 0, 0, 0.933 |
pyglossary-4.5.0/doc/termux.md000066400000000000000000000014211417733132500163530ustar00rootroot00000000000000## Feature-specific Requirements on [Termux](https://github.com/termux/termux-app)

- **Using `--remove-html-all` flag**
	- `apt install libxml2 libxslt`
	- `pip install lxml beautifulsoup4`
- **Reading from FreeDict, XDXF, JMDict, AppleDict Binary (.dictionary) or CC-CEDICT**
	- `apt install libxml2 libxslt`
	- `pip install lxml`
- **Reading from cc-kedict**
	- `apt install libxml2 libxslt`
	- `pip install lxml PyYAML`
- **Reading or writing Aard 2 (.slob)**
	- `pkg install libicu`
	- `pip install PyICU`
- **Writing to Kobo E-Reader Dictionary**
	- `pip install marisa-trie`
- **Reading from Zim**
	- `apt install libzim`
	- `pip install libzim`
- **Writing to AppleDict**
	- `apt install libxml2 libxslt`
	- `pip install lxml beautifulsoup4 html5lib`
pyglossary-4.5.0/help000066400000000000000000000102461417733132500146200ustar00rootroot00000000000000PyGlossary is a tool for working with dictionary databases (glossaries)

Basic Usage:
	PyGI (Gtk3) Interface:
		To open PyGlossary window:
			${CMD}
		PyGI is the default interface (so you never need to use "--ui=gtk" or --gtk option)
		If PyGI was not found (not installed), then PyGlossary will fall back to Tkinter.

	Tkinter Interface:
		To open PyGlossary window:
			${CMD} --tk
		Or
			${CMD} --ui=tk
		Usually good for Windows and Mac OS X

	Command-line interface:
		To show this help:
			${CMD} --help
		To show program version:
			${CMD} --version
		To Convert:
			${CMD} INPUT_FILE OUTPUT_FILE
		To Reverse:
			${CMD} INPUT_FILE OUTPUT_FILE.txt --reverse

	Input and output formats will be detected from extensions if possible.
        If not, you need to specify input or output format, for example:
            ${CMD} test.utf8 test.ifo --read-format=tabfile
            ${CMD} test.utf8 test.ifo --read-format tabfile
            ${CMD} test.ifo test.utf8 --write-format=tabfile
            ${CMD} test.ifo test.utf8 --write-format tabfile

    Interactive command-line interface:
        Minimal command:
            ${CMD} --cmd
        Or
            ${CMD} --ui=cmd
        Additionally you can pass any flag to act as default

General Options:
    Verbosity:
        -v0 or '--verbosity 0' for critical errors only
        -v1 or '--verbosity 1' for errors only
        -v2 or '--verbosity 2' for errors and warnings
        -v3 or '--verbosity 3' for errors, warnings and info
        -v4 or '--verbosity 4' for debug mode
        -v5 or '--verbosity 5' for trace mode
    Appearance:
        --no-progress-bar and --no-color, useful for scripts

Full Convert Usage:
    ${CMD} INPUT_FILE OUTPUT_FILE [-vN]
        [--read-format=FORMAT] [--write-format=FORMAT]
        [--direct|--indirect|--sqlite] [--no-alts]
        [--sort|--no-sort] [--sort-cache-size=2000]
        [--utf8-check|--no-utf8-check] [--lower|--no-lower]
        [--read-options=READ_OPTIONS] [--write-options=WRITE_OPTIONS]
        [--source-lang=LANGUAGE] [--target-lang=LANGUAGE]
        ['--name=GLOSSARY NAME']

Direct and indirect modes:
    Indirect mode means the input glossary is completely read and loaded into RAM,
    then converted into the output format. This was the only method available in
    old versions (before 3.0.0).

    Direct mode means entries are read, processed and written into the output
    glossary one at a time.

    Direct mode was added to limit the memory usage for large glossaries; but it
    may reduce the conversion time in most cases as well.

    Converting glossaries into some formats like StarDict and EPUB-2 requires
    sorting entries. That's why direct mode will not work for these formats, and
    PyGlossary will use indirect mode instead. Otherwise direct mode will be the
    default. You may override this with the --indirect flag.

SQLite mode:
    As mentioned above, converting glossaries into some formats like StarDict
    requires them to be loaded into RAM. This can be problematic if the glossary
    is too big to fit into RAM. That's when you should try adding the --sqlite
    flag to your command. Then PyGlossary uses SQLite as intermediate storage for
    sorting and then fetching entries. This fixes the memory issue, and may even
    reduce the running time of the conversion (depending on your home directory
    storage).

    The temporary SQLite file is stored in ~/.cache/pyglossary/ and deleted after
    conversion, unless you pass the --no-cleanup flag.

    Currently you can not disable alternates in SQLite mode (--no-alts is ignored).

Command-line arguments and options (and arguments for options) are parsed with
the GNU getopt method.

Compressing with gz, bz2 and zip is supported. Just append one of these
extensions to the file name, for example:
    ${CMD} mydic.ifo mydic.txt.gz
And if the input file has one of these extensions (gz, bz2 or zip), it will be
extracted before loading.

pyglossary-4.5.0/license-dialog

PyGlossary - A tool for converting dictionary files
Copyright © 2008-2022 Saeed Rasooli

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. Or on Debian systems, from /usr/share/common-licenses/GPL. If not, see http://www.gnu.org/licenses/gpl.txt pyglossary-4.5.0/license.txt000066400000000000000000001045131417733132500161310ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. 
States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 
The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. 
c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). 
The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. 
For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. 
You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: <program> Copyright (C) <year> <name of author> This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see <http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read <http://www.gnu.org/philosophy/why-not-lgpl.html>.

pyglossary-4.5.0/main.py

#!/usr/bin/env python3
import sys
from os.path import dirname

sys.path.insert(0, dirname(__file__))

from pyglossary.ui.main import main

main()

pyglossary-4.5.0/pkg/fedora/pyglossary.spec

%global __python /usr/bin/python3
%{!?python_sitelib: %global python_sitelib %(%{__python} -c "import sys; from distutils.sysconfig import get_python_lib; sys.stdout.write(get_python_lib())")}

Name: pyglossary
Version: master
Release: 1%{?dist}
Summary: Working on glossaries (dictionary files)
Group: Applications/Productivity
License: GPLv3
URL: https://github.com/ilius/pyglossary
Source0: pyglossary-%{version}.tar.gz
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
BuildArch: noarch

%description
A tool for converting dictionary files aka glossaries.

%prep
%setup -q

%install
python3 setup.py install --root=%{buildroot} --prefix=%{_prefix}
desktop-file-install --vendor fedora \
	--dir %{buildroot}%{_datadir}/applications \
	--delete-original \
	%{buildroot}%{_datadir}/applications/pyglossary.desktop

%clean
rm -rf %{buildroot}

%files
%{_bindir}/pyglossary
%{_datadir}/applications/fedora-pyglossary.desktop
%{_datadir}/pixmaps/pyglossary.png
%{_datadir}/pyglossary/
%{_datadir}/doc/pyglossary/
%{python_sitelib}/pyglossary/*

pyglossary-4.5.0/pkg/pyglossary.desktop

#!/usr/bin/env xdg-open
[Desktop Entry]
Name=PyGlossary
GenericName=Glossary Converter
Comment=Working on glossaries
Exec=pyglossary
Terminal=false
Type=Application
StartupNotify=true
Icon=pyglossary
Categories=Education;
X-GNOME-FullName=Glossary Converter

pyglossary-4.5.0/plugins-meta/index.json

[ { "module": "aard2_slob", "lname": "aard2_slob", "name": "Aard2Slob", "description": "Aard 2 (.slob)", "extensions": [ ".slob" ], "singleFile": true, "optionsProp": { "compression": { "class": "StrOption", "type": "str", "customValue": false, "values": [ "", "bz2", "zlib", "lzma2" ], "comment": "Compression Algorithm" }, "content_type": { "class": "StrOption", "type": "str", "customValue": true, "values": [ "text/plain; charset=utf-8", "text/html; charset=utf-8" ], "comment": "Content Type" }, "file_size_approx": { "class": "FileSizeOption", "type": "int", "customValue": true, "comment": "split up by given approximate file size\nexamples: 100m, 1g" }, "separate_alternates": { "class": "BoolOption", "type": "bool", "comment": "add alternate headwords as separate entries to slob" }, "word_title": { "class": "BoolOption", "type": "bool", "comment": "add headwords title to beginning of definition" } }, "canRead": true, "canWrite": true,
"readOptions": {}, "writeOptions": { "compression": "zlib", "content_type": "", "file_size_approx": 0, "separate_alternates": false, "word_title": false }, "readDepends": { "icu": "PyICU" }, "writeDepends": { "icu": "PyICU" } }, { "module": "abc_medical_notes", "lname": "abc_medical_notes", "name": "ABCMedicalNotes", "description": "ABC Medical Notes (SQLite3)", "extensions": [], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "almaany", "lname": "almaany", "name": "Almaany", "description": "Almaany.com (SQLite3)", "extensions": [], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "appledict", "lname": "appledict", "name": "AppleDict", "description": "AppleDict Source", "extensions": [ ".apple" ], "singleFile": false, "optionsProp": { "clean_html": { "class": "BoolOption", "type": "bool", "comment": "use BeautifulSoup parser" }, "css": { "class": "StrOption", "type": "str", "customValue": true, "comment": "custom .css file path" }, "xsl": { "class": "StrOption", "type": "str", "customValue": true, "comment": "custom XSL transformations file path" }, "default_prefs": { "class": "DictOption", "type": "dict", "comment": "default prefs in python dict format" }, "prefs_html": { "class": "StrOption", "type": "str", "customValue": true, "comment": "preferences XHTML file path" }, "front_back_matter": { "class": "StrOption", "type": "str", "customValue": true, "comment": "XML file path with top-level tag" }, "jing": { "class": "BoolOption", "type": "bool", "comment": "run Jing check on generated XML" }, "indexes": { "class": "StrOption", "type": "str", "customValue": false, "values": [ "", "ru", "zh" ], "comment": "Additional indexes to dictionary entries" } }, "canRead": false, "canWrite": true, "writeOptions": { "clean_html": true, "css": "", "xsl": "", "default_prefs": null, "prefs_html": "", "front_back_matter": "", "jing": false, "indexes": "" }, "writeDepends": { "lxml": "lxml", "bs4": "beautifulsoup4", "html5lib": "html5lib" } }, { "module": "appledict_bin", "lname": "appledict_bin", "name": "AppleDictBin", "description": "AppleDict Binary", "extensions": [ ".dictionary", ".data" ], "singleFile": true, "optionsProp": { "html": { "class": "BoolOption", "type": "bool", "comment": "Entries are HTML" }, "html_full": { "class": "BoolOption", "type": "bool", "comment": "Turn every entry's definition into an HTML document" } }, "canRead": true, "canWrite": false, "readOptions": { "html": true, "html_full": false }, "readDepends": { "lxml": "lxml" } }, { "module": "babylon_bdc", "lname": "babylon_bdc", "name": "BabylonBdc", "description": "Babylon (bdc)", "extensions": [ ".bdc" ], "singleFile": true, "optionsProp": {}, "canRead": false, "canWrite": false, "enable": false }, { "module": "babylon_bgl", "lname": "babylon_bgl", "name": "BabylonBgl", "description": "Babylon (.BGL)", "extensions": [ ".bgl" ], "singleFile": true, "optionsProp": { "default_encoding_overwrite": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Default encoding (overwrite)" }, "source_encoding_overwrite": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Source encoding (overwrite)" }, "target_encoding_overwrite": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Target encoding (overwrite)" }, "part_of_speech_color": { "class": "HtmlColorOption", "type": "str", "comment": "Color for Part of Speech" }, 
"no_control_sequence_in_defi": { "class": "BoolOption", "type": "bool", "comment": "No control sequence in definitions" }, "strict_string_convertion": { "class": "BoolOption", "type": "bool", "comment": "Strict string convertion" }, "process_html_in_key": { "class": "BoolOption", "type": "bool", "comment": "Process HTML in (entry or info) key" }, "key_rstrip_chars": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Characters to strip from right-side of keys" }, "search_char_samples": { "class": "BoolOption", "type": "bool", "comment": "(debug) Search character samples" }, "collect_metadata2": { "class": "BoolOption", "type": "bool", "comment": "(debug) Collect second pass metadata from definitions" }, "write_gz": { "class": "BoolOption", "type": "bool", "comment": "(debug) Create a file named *-data.gz" }, "char_samples_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "(debug) File path for character samples" }, "msg_log_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "(debug) File path for message log" }, "raw_dump_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "(debug) File path for writing raw blocks" }, "unpacked_gzip_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "(debug) Path to create unzipped file" } }, "canRead": true, "canWrite": false, "readOptions": { "default_encoding_overwrite": "", "source_encoding_overwrite": "", "target_encoding_overwrite": "", "part_of_speech_color": "007000", "no_control_sequence_in_defi": false, "strict_string_convertion": false, "process_html_in_key": false, "key_rstrip_chars": "" } }, { "module": "cc_cedict", "lname": "cc_cedict", "name": "CC-CEDICT", "description": "CC-CEDICT", "extensions": [ ".u8" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "traditional_title": { "class": "BoolOption", "type": "bool", "comment": "Use traditional Chinese for entry titles/keys" } }, "canRead": true, "canWrite": false, "readOptions": { "encoding": "utf-8", "traditional_title": false }, "readDepends": { "lxml": "lxml" } }, { "module": "cc_kedict", "lname": "cc_kedict", "name": "cc-kedict", "description": "cc-kedict", "extensions": [], "singleFile": true, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {}, "readDepends": { "yaml": "PyYAML", "lxml": "lxml" } }, { "module": "crawler_dir", "lname": "crawler_dir", "name": "CrawlerDir", "description": "Crawler Directory", "extensions": [ ".crawler" ], "singleFile": true, "optionsProp": { "compression": { "class": "StrOption", "type": "str", "customValue": false, "values": [ "", "gz", "bz2", "lzma" ], "comment": "Compression Algorithm" } }, "canRead": true, "canWrite": true, "readOptions": {}, "writeOptions": { "compression": "" } }, { "module": "csv_plugin", "lname": "csv", "name": "Csv", "description": "CSV (.csv)", "extensions": [ ".csv" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" }, "delimiter": { "class": "Option", "type": "str", "customValue": true, "values": [ ",", ";", "@" ], "comment": "Column delimiter" }, "add_defi_format": { "class": "BoolOption", "type": "bool", "comment": "enable adding defiFormat (m/h/x)" }, "enable_info": { "class": 
"BoolOption", "type": "bool", "comment": "Enable glossary info / metedata" }, "word_title": { "class": "BoolOption", "type": "bool", "comment": "add headwords title to begining of definition" } }, "canRead": true, "canWrite": true, "readOptions": { "encoding": "utf-8", "delimiter": "," }, "writeOptions": { "encoding": "utf-8", "resources": true, "delimiter": ",", "add_defi_format": false, "enable_info": true, "word_title": false }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "dicformids", "lname": "dicformids", "name": "Dicformids", "description": "DictionaryForMIDs", "extensions": [ ".mids" ], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": true, "sortOnWrite": "always", "sortKeyName": "dicformids", "readOptions": {}, "writeOptions": {} }, { "module": "dict_cc", "lname": "dict_cc", "name": "Dictcc", "description": "Dict.cc (SQLite3)", "extensions": [], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "dict_cc_split", "lname": "dict_cc_split", "name": "Dictcc_split", "description": "Dict.cc (SQLite3) - Split", "extensions": [], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "dict_org", "lname": "dict_org", "name": "DictOrg", "description": "DICT.org file format (.index)", "extensions": [ ".index" ], "singleFile": false, "optionsProp": { "dictzip": { "class": "BoolOption", "type": "bool", "comment": "Compress .dict file to .dict.dz" }, "install": { "class": "BoolOption", "type": "bool", "comment": "Install dictionary to /usr/share/dictd/" } }, "canRead": true, "canWrite": true, "readOptions": {}, "writeOptions": { "dictzip": false, "install": true } }, { "module": "dict_org_source", "lname": "dict_org_source", "name": "DictOrgSource", "description": "DICT.org dictfmt source file", "extensions": [ ".dtxt" ], "singleFile": true, "optionsProp": { "remove_html_all": { "class": "BoolOption", "type": "bool", "comment": "Remove all HTML tags" } }, "canRead": false, "canWrite": true, "writeOptions": { "remove_html_all": true } }, { "module": "dictunformat", "lname": "dictunformat", "name": "Dictunformat", "description": "dictunformat output file", "extensions": [ ".dictunformat" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" } }, "canRead": true, "canWrite": false, "readOptions": { "encoding": "utf-8" }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "digitalnk", "lname": "digitalnk", "name": "DigitalNK", "description": "DigitalNK (SQLite3, N-Korean)", "extensions": [], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "dsl", "lname": "dsl", "name": "ABBYYLingvoDSL", "description": "ABBYY Lingvo DSL (.dsl)", "extensions": [ ".dsl" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "audio": { "class": "BoolOption", "type": "bool", "comment": "Enable audio objects" }, "only_fix_markup": { "class": "BoolOption", "type": "bool", "comment": "Only fix markup, without tag conversion" } }, "canRead": true, "canWrite": false, "readOptions": { "encoding": "", "audio": false, "only_fix_markup": false }, "readCompressions": [ "gz", "bz2", "lzma", "dz" ] }, { "module": "ebook_epub2", "lname": "epub2", "name": "Epub2", "description": "EPUB-2 E-Book", "extensions": [ ".epub" ], "singleFile": 
false, "optionsProp": { "group_by_prefix_length": { "class": "IntOption", "type": "int", "customValue": true, "comment": "Prefix length for grouping" }, "compress": { "class": "BoolOption", "type": "bool", "comment": "Enable compression" }, "keep": { "class": "BoolOption", "type": "bool", "comment": "Keep temp files" }, "include_index_page": { "class": "BoolOption", "type": "bool", "comment": "Include index page" }, "css": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Path to css file" }, "cover_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Path to cover file" } }, "canRead": false, "canWrite": true, "sortOnWrite": "always", "sortKeyName": "ebook", "writeOptions": { "keep": false, "group_by_prefix_length": 2, "include_index_page": false, "compress": true, "css": "", "cover_path": "" } }, { "module": "ebook_kobo", "lname": "kobo", "name": "Kobo", "description": "Kobo E-Reader Dictionary", "extensions": [ ".kobo" ], "singleFile": false, "optionsProp": {}, "canRead": false, "canWrite": true, "sortOnWrite": "never", "writeOptions": {}, "writeDepends": { "marisa_trie": "marisa-trie" } }, { "module": "ebook_kobo_dictfile", "lname": "kobo_dictfile", "name": "Dictfile", "description": "Kobo E-Reader Dictfile (.df)", "extensions": [ ".df" ], "singleFile": false, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "extract_inline_images": { "class": "BoolOption", "type": "bool", "comment": "Extract inline images" } }, "canRead": true, "canWrite": true, "readOptions": { "encoding": "utf-8", "extract_inline_images": true }, "writeOptions": { "encoding": "utf-8" }, "readDepends": { "mistune": "mistune==2.0.0a5" }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "ebook_mobi", "lname": "mobi", "name": "Mobi", "description": "Mobipocket (.mobi) E-Book", "extensions": [ ".mobi" ], "singleFile": false, "optionsProp": { "group_by_prefix_length": { "class": "IntOption", "type": "int", "customValue": true, "comment": "Prefix length for grouping" }, "kindlegen_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Path to kindlegen executable" }, "compress": { "class": "BoolOption", "type": "bool", "comment": "Enable compression", "disabled": true }, "keep": { "class": "BoolOption", "type": "bool", "comment": "Keep temp files" }, "include_index_page": { "class": "BoolOption", "type": "bool", "comment": "Include index page", "disabled": true }, "css": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Path to css file" }, "cover_path": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Path to cover file" }, "file_size_approx": { "class": "FileSizeOption", "type": "int", "customValue": true, "comment": "Approximate size of each xhtml file (example: 200kb)" }, "hide_word_index": { "class": "BoolOption", "type": "bool", "comment": "Hide headword in tap-to-check interface" }, "spellcheck": { "class": "BoolOption", "type": "bool", "comment": "Enable wildcard search and spell correction during word lookup" }, "exact": { "class": "BoolOption", "type": "bool", "comment": "Exact-match Parameter" } }, "canRead": false, "canWrite": true, "sortOnWrite": "default_yes", "sortKeyName": "ebook", "writeOptions": { "keep": false, "group_by_prefix_length": 2, "css": "", "cover_path": "", "kindlegen_path": "", "file_size_approx": 271360, "hide_word_index": false, "spellcheck": true, "exact": false } }, { "module": "edlin", 
"lname": "edlin", "name": "Edlin", "description": "EDLIN", "extensions": [ ".edlin" ], "singleFile": false, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "prev_link": { "class": "BoolOption", "type": "bool", "comment": "Enable link to previous entry" } }, "canRead": true, "canWrite": true, "readOptions": { "encoding": "utf-8" }, "writeOptions": { "encoding": "utf-8", "prev_link": true } }, { "module": "freedict", "lname": "freedict", "name": "FreeDict", "description": "FreeDict (.tei)", "extensions": [ ".tei" ], "singleFile": true, "optionsProp": { "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" }, "discover": { "class": "BoolOption", "type": "bool", "comment": "Find and show unsupported tags" }, "auto_rtl": { "class": "BoolOption", "type": "bool", "comment": "Auto-detect and mark Right-to-Left text" }, "word_title": { "class": "BoolOption", "type": "bool", "comment": "Add headwords title to begining of definition" }, "pron_color": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Pronunciation color" }, "gram_color": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Grammar color" }, "example_padding": { "class": "IntOption", "type": "int", "customValue": true, "comment": "Padding for examples (in px)" } }, "canRead": true, "canWrite": false, "readOptions": { "discover": false, "auto_rtl": null, "word_title": false, "pron_color": "gray", "gram_color": "green", "example_padding": 10 }, "readDepends": { "lxml": "lxml" }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "gettext_mo", "lname": "gettext_mo", "name": "GettextMo", "description": "Gettext Binary (mo)", "extensions": [ ".mo" ], "singleFile": true, "optionsProp": {}, "canRead": false, "canWrite": false, "enable": false }, { "module": "gettext_po", "lname": "gettext_po", "name": "GettextPo", "description": "Gettext Source (.po)", "extensions": [ ".po" ], "singleFile": true, "optionsProp": { "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" } }, "canRead": true, "canWrite": true, "readOptions": {}, "writeOptions": { "resources": true }, "readDepends": { "polib": "polib" }, "writeDepends": { "polib": "polib" } }, { "module": "html_dir", "lname": "html_dir", "name": "HtmlDir", "description": "HTML Directory", "extensions": [ ".hdir" ], "singleFile": false, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" }, "max_file_size": { "class": "IntOption", "type": "int", "customValue": true, "comment": "Maximum file size in bytes" }, "filename_format": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Filename format, default: {n:05d}.html" }, "escape_defi": { "class": "BoolOption", "type": "bool", "comment": "Escape definitions" }, "dark": { "class": "BoolOption", "type": "bool", "comment": "Use dark style" }, "css": { "class": "StrOption", "type": "str", "customValue": true, "comment": "Path to css file" }, "word_title": { "class": "BoolOption", "type": "bool", "comment": "Add headwords title to begining of definition" } }, "canRead": false, "canWrite": true, "writeOptions": { "encoding": "utf-8", "resources": true, "max_file_size": 102400, "filename_format": "{n:05d}.html", "escape_defi": false, "dark": true, "css": "", 
"word_title": true }, "writeDepends": { "cachetools": "cachetools" } }, { "module": "info_plugin", "lname": "info", "name": "Info", "description": "Glossary Info (.info)", "extensions": [ ".info" ], "singleFile": true, "optionsProp": {}, "canRead": true, "canWrite": true, "readOptions": {}, "writeOptions": {} }, { "module": "jmdict", "lname": "jmdict", "name": "JMDict", "description": "JMDict", "extensions": [], "singleFile": true, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {}, "readDepends": { "lxml": "lxml" }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "json_plugin", "lname": "json", "name": "Json", "description": "JSON (.json)", "extensions": [ ".json" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "enable_info": { "class": "BoolOption", "type": "bool", "comment": "Enable glossary info / metedata" }, "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" }, "word_title": { "class": "BoolOption", "type": "bool", "comment": "add headwords title to begining of definition" } }, "canRead": false, "canWrite": true, "writeOptions": { "encoding": "utf-8", "enable_info": true, "resources": true, "word_title": false } }, { "module": "lingoes_ldf", "lname": "lingoes_ldf", "name": "LingoesLDF", "description": "Lingoes Source (.ldf)", "extensions": [ ".ldf" ], "singleFile": true, "optionsProp": { "newline": { "class": "NewlineOption", "type": "str", "customValue": true, "values": [ "\r\n", "\n", "\r" ], "comment": "Newline string" }, "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" }, "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" } }, "canRead": true, "canWrite": true, "readOptions": { "encoding": "utf-8" }, "writeOptions": { "newline": "\n", "resources": true }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "octopus_mdict_new", "lname": "octopus_mdict", "name": "OctopusMdict", "description": "Octopus MDict (.mdx)", "extensions": [ ".mdx" ], "singleFile": false, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "substyle": { "class": "BoolOption", "type": "bool", "comment": "Enable substyle" }, "same_dir_data_files": { "class": "BoolOption", "type": "bool", "comment": "Read data files from same directory" }, "audio": { "class": "BoolOption", "type": "bool", "comment": "Enable audio objects" } }, "canRead": true, "canWrite": false, "readOptions": { "encoding": "", "substyle": true, "same_dir_data_files": false, "audio": false } }, { "module": "sdict", "lname": "sdict", "name": "Sdict", "description": "Sdictionary Binary(dct)", "extensions": [ ".dct" ], "singleFile": true, "optionsProp": {}, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "sdict_source", "lname": "sdict_source", "name": "SdictSource", "description": "Sdictionary Source (.sdct)", "extensions": [ ".sdct" ], "singleFile": true, "optionsProp": { "enable_info": { "class": "BoolOption", "type": "bool", "comment": "Enable glossary info / metedata" }, "newline": { "class": "NewlineOption", "type": "str", "customValue": true, "values": [ "\r\n", "\n", "\r" ], "comment": "Newline string" }, "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" } }, "canRead": false, "canWrite": true, "writeOptions": { 
"enable_info": true, "newline": "\n", "resources": true } }, { "module": "sql", "lname": "sql", "name": "Sql", "description": "SQL (.sql)", "extensions": [ ".sql" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "info_keys": { "class": "ListOption", "type": "list", "comment": "List of dbinfo table columns" }, "add_extra_info": { "class": "BoolOption", "type": "bool", "comment": "Create dbinfo_extra table" }, "newline": { "class": "NewlineOption", "type": "str", "customValue": true, "values": [ "\r\n", "\n", "\r" ], "comment": "Newline string" }, "transaction": { "class": "BoolOption", "type": "bool", "comment": "Use TRANSACTION" } }, "canRead": false, "canWrite": true, "writeOptions": { "encoding": "utf-8", "info_keys": null, "add_extra_info": true, "newline": "
", "transaction": false } }, { "module": "stardict", "lname": "stardict", "name": "Stardict", "description": "StarDict (.ifo)", "extensions": [ ".ifo" ], "singleFile": false, "optionsProp": { "stardict_client": { "class": "BoolOption", "type": "bool", "comment": "Modify html entries for StarDict 3.0" }, "dictzip": { "class": "BoolOption", "type": "bool", "comment": "Compress .dict file to .dict.dz" }, "sametypesequence": { "class": "StrOption", "type": "str", "customValue": false, "values": [ "", "h", "m", "x", null ], "comment": "Definition format: h=html, m=plaintext, x=xdxf" }, "merge_syns": { "class": "BoolOption", "type": "bool", "comment": "Write alternates to .idx instead of .syn" }, "xdxf_to_html": { "class": "BoolOption", "type": "bool", "comment": "Convert XDXF entries to HTML" }, "unicode_errors": { "class": "StrOption", "type": "str", "customValue": false, "values": [ "strict", "ignore", "replace", "backslashreplace" ], "comment": "What to do with Unicode decoding errors" }, "audio_goldendict": { "class": "BoolOption", "type": "bool", "comment": "Convert audio links for GoldenDict (desktop)" }, "audio_icon": { "class": "BoolOption", "type": "bool", "comment": "Add glossary's audio icon" } }, "canRead": true, "canWrite": true, "sortOnWrite": "always", "sortKeyName": "stardict", "readOptions": { "xdxf_to_html": true, "unicode_errors": "strict" }, "writeOptions": { "dictzip": true, "sametypesequence": "", "stardict_client": false, "merge_syns": false, "audio_goldendict": false, "audio_icon": true } }, { "module": "tabfile", "lname": "tabfile", "name": "Tabfile", "description": "Tabfile (.txt, .dic)", "extensions": [ ".txt", ".tab", ".tsv" ], "singleFile": true, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" }, "enable_info": { "class": "BoolOption", "type": "bool", "comment": "Enable glossary info / metedata" }, "resources": { "class": "BoolOption", "type": "bool", "comment": "Enable resources / data files" }, "file_size_approx": { "class": "FileSizeOption", "type": "int", "customValue": true, "comment": "Split up by given approximate file size\nexamples: 100m, 1g" }, "word_title": { "class": "BoolOption", "type": "bool", "comment": "Add headwords title to begining of definition" } }, "canRead": true, "canWrite": true, "readOptions": { "encoding": "utf-8" }, "writeOptions": { "encoding": "utf-8", "enable_info": true, "resources": true, "file_size_approx": 0, "word_title": false }, "readCompressions": [ "gz", "bz2", "lzma" ] }, { "module": "testformat", "lname": "testformat", "name": "Test", "description": "Test Format File(.test)", "extensions": [ ".test", ".tst" ], "singleFile": false, "optionsProp": {}, "canRead": true, "canWrite": true, "readOptions": {}, "writeOptions": {}, "enable": false }, { "module": "wiktionary_dump", "lname": "wiktionary_dump", "name": "WiktionaryDump", "description": "Wiktionary Dump (.xml)", "extensions": [], "singleFile": false, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" } }, "canRead": true, "canWrite": false, "readOptions": {} }, { "module": "wordset", "lname": "wordset", "name": "Wordset", "description": "Wordset.org JSON directory", "extensions": [], "singleFile": false, "optionsProp": { "encoding": { "class": "EncodingOption", "type": "str", "customValue": true, "comment": "Encoding/charset" } }, "canRead": true, "canWrite": false, "readOptions": { "encoding": "utf-8" } }, { "module": 
"xdxf", "lname": "xdxf", "name": "Xdxf", "description": "XDXF (.xdxf)", "extensions": [ ".xdxf" ], "singleFile": true, "optionsProp": { "html": { "class": "BoolOption", "type": "bool", "comment": "Entries are HTML" } }, "canRead": true, "canWrite": false, "readOptions": { "html": true }, "readDepends": { "lxml": "lxml" } }, { "module": "zimfile", "lname": "zim", "name": "Zim", "description": "Zim (.zim, for Kiwix)", "extensions": [ ".zim" ], "singleFile": true, "optionsProp": { "skip_duplicate_words": { "class": "BoolOption", "type": "bool", "comment": "Detect and skip duplicate words" } }, "canRead": true, "canWrite": false, "readOptions": { "skip_duplicate_words": false }, "readDepends": { "libzim": "libzim==1.0" } } ]pyglossary-4.5.0/plugins-meta/tools/000077500000000000000000000000001417733132500175075ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/aard2_slob.toml000066400000000000000000000004501417733132500224130ustar00rootroot00000000000000["Aard 2 for Android"] web = "http://aarddict.org/" platforms = [ "Android",] license = "GPL" plang = "Java" # no auto-RTL (in plaintext or html) ["Aard2 for Web"] web = "http://aarddict.org/" platforms = [ "Web",] license = "MPL" plang = "Java" # auto-RTL works in plaintext mode, but not html pyglossary-4.5.0/plugins-meta/tools/abc_medical_notes.toml000066400000000000000000000002401417733132500240130ustar00rootroot00000000000000["ABC Medical Notes 2020"] web = "https://play.google.com/store/apps/details?id=com.pocketmednotes2014.secondapp" platforms = [ "Android",] license = "Unknown" pyglossary-4.5.0/plugins-meta/tools/almaany.toml000066400000000000000000000002271417733132500220270ustar00rootroot00000000000000["Almaany.com Arabic Dictionary"] web = "https://play.google.com/store/apps/details?id=com.almaany.arar" platforms = [ "Android",] license = "Unknown" pyglossary-4.5.0/plugins-meta/tools/appledict.toml000066400000000000000000000002171417733132500223510ustar00rootroot00000000000000["Dictionary Development Kit"] web = "https://github.com/SebastianSzturo/Dictionary-Development-Kit" platforms = [ "Mac",] license = "Unknown" pyglossary-4.5.0/plugins-meta/tools/appledict_bin.toml000066400000000000000000000002101417733132500231720ustar00rootroot00000000000000["Apple Dictionary"] web = "https://support.apple.com/en-gu/guide/dictionary/welcome/mac" platforms = [ "Mac",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/babylon_bgl.toml000066400000000000000000000012101417733132500226500ustar00rootroot00000000000000["Babylon Translator"] web = "https://www.babylon-software.com/" wiki = "https://en.wikipedia.org/wiki/Babylon_Software" platforms = [ "Windows",] license = "Freemium" [GoldenDict] web = "http://goldendict.org/" platforms = [ "Linux", "Windows",] license = "GPL" ["GoldenDict Mobile (Free)"] web = "http://goldendict.mobi/" web2 = "https://play.google.com/store/apps/details?id=mobi.goldendict.android.free" platforms = [ "Android",] license = "Freeware" ["GoldenDict Mobile (Full)"] web = "http://goldendict.mobi/" web2 = "https://play.google.com/store/apps/details?id=mobi.goldendict.android" platforms = [ "Android",] license = "Proprietary" 
pyglossary-4.5.0/plugins-meta/tools/cc_cedict.toml000066400000000000000000000000001417733132500222720ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/cc_kedict.toml000066400000000000000000000000001417733132500223020ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/crawler_dir.toml000066400000000000000000000000001417733132500226670ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/csv.toml000066400000000000000000000004111417733132500211730ustar00rootroot00000000000000["LibreOffice Calc"] web = "https://www.libreoffice.org/discover/calc/" platforms = [ "Linux", "Windows", "Mac",] license = "MPL/GPL" ["Microsoft Excel"] web = "https://www.microsoft.com/en-us/microsoft-365/excel" platforms = [ "Windows",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/dicformids.toml000066400000000000000000000005041417733132500225260ustar00rootroot00000000000000[DictionaryForMIDs] web = "http://dictionarymid.sourceforge.net/" # https://sourceforge.net/projects/dictionarymid/ platforms = [ "Android", "Web", "Windows", "Linux", "Mac",] plang = "Java" # PC version is also Java-based license = "GPL" # android last commit: 2015/02/09 # android last release: 2015/02/09 - version 1.0.1 pyglossary-4.5.0/plugins-meta/tools/dict_cc.toml000066400000000000000000000002161417733132500217730ustar00rootroot00000000000000["dict.cc dictionary"] web = "https://play.google.com/store/apps/details?id=cc.dict.dictcc" platforms = [ "Android",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/dict_cc_split.toml000066400000000000000000000002161417733132500232060ustar00rootroot00000000000000["dict.cc dictionary"] web = "https://play.google.com/store/apps/details?id=cc.dict.dictcc" platforms = [ "Android",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/dict_org.toml000066400000000000000000000010201417733132500221670ustar00rootroot00000000000000[Dictd] web = "https://directory.fsf.org/wiki/Dictd" platforms = [ "Linux",] license = "GPL" ["GNOME Dictionary"] web = "https://wiki.gnome.org/Apps/Dictionary" platforms = [ "Linux",] license = "GPL" ["Xfce4 Dictionary"] web = "https://docs.xfce.org/apps/xfce4-dict/start" platforms = [ "linux",] license = "GPL" [Ding] desc = "Graphical dictionary lookup program for Unix (Tk)" web = "https://www-user.tu-chemnitz.de/~fri/ding/" platforms = [ "linux",] license = "GPL" copyright = "Copyright (c) 1999 - 2016 Frank Richter" pyglossary-4.5.0/plugins-meta/tools/dict_org_source.toml000066400000000000000000000001361417733132500235560ustar00rootroot00000000000000[dictfmt] web = "https://linux.die.net/man/1/dictfmt" platforms = [ "Linux",] license = "GPL" pyglossary-4.5.0/plugins-meta/tools/dictunformat.toml000066400000000000000000000001501417733132500230770ustar00rootroot00000000000000[dictunformat] web = "https://linux.die.net/man/1/dictunformat" platforms = [ "Linux",] license = "GPL" pyglossary-4.5.0/plugins-meta/tools/digitalnk.toml000066400000000000000000000001501417733132500223460ustar00rootroot00000000000000["Dic.rs"] web = "https://github.com/digitalprk/dicrs" platforms = [ "Linux",] license = "BSD-2-Clause" pyglossary-4.5.0/plugins-meta/tools/dsl.toml000066400000000000000000000003171417733132500211670ustar00rootroot00000000000000["ABBYY Lingvo"] web = "https://www.lingvo.ru/" wiki_ru = "https://ru.wikipedia.org/wiki/ABBYY_Lingvo" platforms = [ "Windows", "Mac", "Android", "iOS", "Windows Mobile", "Symbian",] license = "Proprietary" 
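# Each tools/*.toml file in this directory maps a tool name to keys like
# "web", "platforms" and "license" (plus optional "plang", "repo" or
# "comment"). A minimal sketch of loading one such record -- hedged:
# tomllib assumes Python >= 3.11 (older interpreters would need the
# third-party tomli package), and the path is relative to the repo root:
import tomllib  # assumption: Python 3.11+

with open("plugins-meta/tools/dsl.toml", "rb") as toolsFile:
	tools = tomllib.load(toolsFile)
for name, props in tools.items():
	print(name, props["platforms"], props.get("license", "Unknown"))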
pyglossary-4.5.0/plugins-meta/tools/edlin.toml000066400000000000000000000000001417733132500214650ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/epub2.toml000066400000000000000000000016231417733132500214230ustar00rootroot00000000000000[calibre] web = "https://calibre-ebook.com/" wiki = "https://en.wikipedia.org/wiki/Calibre_(software)" repo = "https://github.com/kovidgoyal/calibre" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" [Okular] web = "https://okular.kde.org/" wiki = "https://en.wikipedia.org/wiki/Okular" repo = "https://invent.kde.org/graphics/okular" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" ["Book Reader"] web = "https://f-droid.org/en/packages/com.github.axet.bookreader/" repo = "https://gitlab.com/axet/android-book-reader" platforms = [ "Android",] license = "GPL" ["Kobo eReader"] web = "https://www.kobo.com" platforms = [ "Kobo eReader",] license = "Proprietary" ["Icecream Ebook Reader"] web = "https://icecreamapps.com/Ebook-Reader/" platforms = [ "Windows",] license = "Proprietary" [Aldiko] web = "https://www.demarque.com/aldiko" platforms = [ "Android", "iOS",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/freedict.toml000066400000000000000000000000001417733132500221570ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/gettext_po.toml000066400000000000000000000003451417733132500225700ustar00rootroot00000000000000[gettext] web = "https://www.gnu.org/software/gettext/" platforms = [ "Linux", "Windows",] license = "GPL" [poEdit] web = "https://github.com/vslavik/poedit" platforms = [ "Linux", "Windows", "Mac",] license = "MIT / Shareware" pyglossary-4.5.0/plugins-meta/tools/html_dir.toml000066400000000000000000000000001417733132500221740ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/info.toml000066400000000000000000000000001417733132500213250ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/jmdict.toml000066400000000000000000000000001417733132500216440ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/json.toml000066400000000000000000000000001417733132500213430ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/kobo.toml000066400000000000000000000001451417733132500213360ustar00rootroot00000000000000["Kobo eReader"] web = "https://www.kobo.com" platforms = [ "Kobo eReader",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/kobo_dictfile.toml000066400000000000000000000001621417733132500232000ustar00rootroot00000000000000[dictgen] web = "https://pgaskin.net/dictutil/dictgen/" platforms = [ "Linux", "Windows", "Mac",] license = "MIT" pyglossary-4.5.0/plugins-meta/tools/lingoes_ldf.toml000066400000000000000000000005441417733132500226740ustar00rootroot00000000000000["Lingoes Dictionary Creator"] web = "http://www.lingoes.net/en/dictionary/dict_format.php" platforms = [] license = "Unknown" comment = "Lingoes Dictionary Creator is developing now.\nPlease send your finished dictionary source file to kevin-yau@msn.com\nLingoes will compile it into .ld2 for you.\nYou will can do it yourself after the creator release." 
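# The mobi plugin registered in the JSON index exposes writeOptions such
# as "kindlegen_path" and "file_size_approx" (a FileSizeOption, so values
# like "200kb" are accepted on the command line). A minimal command-line
# sketch -- hedged: the "name=value;name=value" syntax for --write-options
# is assumed from the project docs, and the kindlegen path is a placeholder:
#
#   pyglossary mydict.txt mydict.mobi \
#       --write-options='kindlegen_path=/usr/bin/kindlegen;file_size_approx=200kb'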
pyglossary-4.5.0/plugins-meta/tools/mobi.toml000066400000000000000000000012671417733132500213400ustar00rootroot00000000000000["Amazon Kindle"] web = "https://www.amazon.com/kindle" platforms = [ "Amazon Kindle",] license = "Proprietary" [calibre] web = "https://calibre-ebook.com/" wiki = "https://en.wikipedia.org/wiki/Calibre_(software)" repo = "https://github.com/kovidgoyal/calibre" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" [Okular] web = "https://okular.kde.org/" wiki = "https://en.wikipedia.org/wiki/Okular" repo = "https://invent.kde.org/graphics/okular" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" ["Book Reader"] web = "https://f-droid.org/en/packages/com.github.axet.bookreader/" repo = "https://gitlab.com/axet/android-book-reader" platforms = [ "Android",] license = "GPL" pyglossary-4.5.0/plugins-meta/tools/octopus_mdict.toml000066400000000000000000000001611417733132500232560ustar00rootroot00000000000000[MDict] web = "https://www.mdict.cn/" platforms = [ "Android", "iOS", "Windows", "Mac",] license = "Proprietary" pyglossary-4.5.0/plugins-meta/tools/sdict.toml000066400000000000000000000006161417733132500215150ustar00rootroot00000000000000[PTkSdict] web = "http://swaj.net/sdict/" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" last_modified = "2013-05-06" last_release = "2.0.0rc5" ["Sdictionary for Symbian"] web = "http://swaj.net/epoc/symbian/index.html" platforms = [ "Symbian",] license = "Unknown" # [["Sdict-3.0"] # last_modified = "2007-12-24" # there is a Perl/Tk-based GUI, and command line tool, both in Perl 5 pyglossary-4.5.0/plugins-meta/tools/sdict_source.toml000066400000000000000000000001651417733132500230740ustar00rootroot00000000000000[PTkSdict] web = "http://swaj.net/sdict/create-dicts.html" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" pyglossary-4.5.0/plugins-meta/tools/sql.toml000066400000000000000000000000001417733132500211710ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/stardict.toml000066400000000000000000000041201417733132500222160ustar00rootroot00000000000000[GoldenDict] web = "http://goldendict.org/" platforms = [ "Linux", "Windows",] license = "GPL" plang = "C++" [StarDict] web = "http://huzheng.org/stardict/" platforms = [ "Linux", "Windows", "Mac",] license = "GPL" plang = "C" ["GoldenDict Mobile (Free)"] web = "http://goldendict.mobi/" web2 = "https://play.google.com/store/apps/details?id=mobi.goldendict.android.free" platforms = [ "Android",] license = "Freemium" ["GoldenDict Mobile (Full)"] web = "http://goldendict.mobi/" web2 = "https://play.google.com/store/apps/details?id=mobi.goldendict.android" platforms = [ "Android",] license = "Proprietary" ["Twinkle Star Dictionary"] web = "https://play.google.com/store/apps/details?id=com.qtier.dict" platforms = [ "Android",] license = "Unknown" # last release: 2015/10/19 # could not find the source code, license or website [WordMateX] web = "https://apkcombo.com/wordmatex/org.d1scw0rld.wordmatex/" platforms = [ "Android",] license = "Proprietary" # last release: 2020/01/01, version 2.1.1 # Google Play says "not compatible with your devices", not letting me # download and install, so I downloaded apk from apkcombo.com # This is the only Android app (not just for StarDict format) I found # that supports auto-RTL [QDict] web = "https://play.google.com/store/apps/details?id=com.annie.dictionary" source = "https://github.com/namndev/QDict" platforms = [ "Android",] license = "Apache 2.0" # last release: 2017/04/16 (keeps crashing on my device, 
unusable) # last commit: 2020/06/24 ["Fora Dictionary"] web = "https://play.google.com/store/apps/details?id=com.ngc.fora" platforms = [ "Android",] license = "Freemium" # no dark mode # some options show "Premium Feature" # has prefix-search-on-type but it's a little slow # supports RTL (haven't tested auto-RTL) ["Fora Dictionary Pro"] web = "https://play.google.com/store/apps/details?id=com.ngc.fora.android" platforms = [ "Android",] license = "Proprietary" [KOReader] web = "http://koreader.rocks/" source = "https://github.com/koreader/koreader" platforms = [ "Android", "Amazon Kindle", "Kobo eReader", "PocketBook", "Cervantes",] license = "AGPLv3" pyglossary-4.5.0/plugins-meta/tools/tabfile.toml000066400000000000000000000000001417733132500220000ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/wiktionary_dump.toml000066400000000000000000000000001417733132500236170ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/wordset.toml000066400000000000000000000000001417733132500220610ustar00rootroot00000000000000pyglossary-4.5.0/plugins-meta/tools/xdxf.toml000066400000000000000000000005421417733132500213560ustar00rootroot00000000000000[GoldenDict] web = "http://goldendict.org/" platforms = [ "Linux", "Windows",] license = "GPL" plang = "C++" [QTranslate] web = "https://quest-app.appspot.com/" platforms = [ "Windows",] license = "Proprietary" plang = "C++" [Alpus] web = "https://alpusapp.com/" platforms = [ "Windows", "Mac", "Linux", "Android",] license = "Freeware" plang = "Java" pyglossary-4.5.0/plugins-meta/tools/zim.toml000066400000000000000000000010201417733132500211740ustar00rootroot00000000000000["Kiwix Desktop"] web = "https://github.com/kiwix/kiwix-desktop" platforms = [ "Linux", "Windows",] license = "GPL" ["Kiwix JS"] web = "https://github.com/kiwix/kiwix-js" platforms = [ "Windows",] license = "GPL" ["Kiwix Serve"] web = "https://github.com/kiwix/kiwix-tools" platforms = [ "Linux", "Windows",] license = "GPL" ["Kiwix for Apple Mac OS X"] web = "macos.kiwix.org" platforms = [ "Mac",] license = "" ["Kiwix for Android"] web = "https://github.com/kiwix/kiwix-android" platforms = [ "Android",] license = "GPL" pyglossary-4.5.0/pyglossary.pyw000077500000000000000000000002231417733132500167170ustar00rootroot00000000000000#!/usr/bin/env python3 import sys from os.path import dirname sys.path.insert(0, dirname(__file__)) from pyglossary.ui.main import main main() pyglossary-4.5.0/pyglossary/000077500000000000000000000000001417733132500161565ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/__init__.py000066400000000000000000000001251417733132500202650ustar00rootroot00000000000000from .core import log, VERSION from .glossary import Glossary __version__ = VERSION pyglossary-4.5.0/pyglossary/arabic_utils.py000066400000000000000000000000301417733132500211620ustar00rootroot00000000000000# -*- coding: utf-8 -*- pyglossary-4.5.0/pyglossary/compression.py000066400000000000000000000061401417733132500210720ustar00rootroot00000000000000# -*- coding: utf-8 -*- import os import logging stdCompressions = ("gz", "bz2", "lzma") log = logging.getLogger("pyglossary") def compressionOpenFunc(c: str): if not c: return open if c == "gz": import gzip return gzip.open if c == "bz2": import bz2 return bz2.open if c == "lzma": import lzma return lzma.open if c == "dz": import gzip return gzip.open return None def compressionOpen(filename, dz=False, **kwargs): from os.path import splitext filenameNoExt, ext = splitext(filename) ext = ext.lower().lstrip(".") try: int(ext) except 
ValueError: pass else: _, ext = splitext(filenameNoExt) ext = ext.lower().lstrip(".") if ext in stdCompressions or (dz and ext == "dz"): _file = compressionOpenFunc(ext)(filename, **kwargs) _file.compression = ext return _file return open(filename, **kwargs) def zipFileOrDir(glos: "GlossaryType", filename: str) -> "Optional[str]": import zipfile import shutil from os.path import ( isfile, isdir, split, ) from .os_utils import indir def _zipFileAdd(zf, filename): if isfile(filename): zf.write(filename) return if not isdir(filename): raise OSError(f"Not a file or directory: {filename}") for subFname in os.listdir(filename): _zipFileAdd(zf, join(filename, subFname)) zf = zipfile.ZipFile(f"{filename}.zip", mode="w") if isdir(filename): dirn, name = split(filename) with indir(filename): for subFname in os.listdir(filename): _zipFileAdd(zf, subFname) shutil.rmtree(filename) return dirn, name = split(filename) files = [name] if isdir(f"{filename}_res"): files.append(f"{name}_res") with indir(dirn): for fname in files: _zipFileAdd(zf, fname) def compress(glos: "GlossaryType", filename: str, compression: str) -> str: """ filename is the existing file path supported compressions: "gz", "bz2", "lzma", "zip" """ import shutil from os.path import isfile log.info(f"Compressing {filename!r} with {compression!r}") compFilename = f"{filename}.{compression}" if compression in stdCompressions: with compressionOpenFunc(compression)(compFilename, mode="wb") as dest: with open(filename, mode="rb") as source: shutil.copyfileobj(source, dest) return compFilename if compression == "zip": try: os.remove(compFilename) except OSError: pass try: error = zipFileOrDir(glos, filename) except Exception as e: log.error( f"{e}\nFailed to compress file \"{filename}\"" ) else: raise ValueError(f"unexpected compression={compression!r}") if isfile(compFilename): return compFilename else: return filename def uncompress(srcFilename: str, dstFilename: str, compression: str) -> None: """ filename is the existing file path supported compressions: "gz", "bz2", "lzma" """ import shutil log.info(f"Uncompressing {srcFilename!r} to {dstFilename!r}") if compression in stdCompressions: with compressionOpenFunc(compression)(srcFilename, mode="rb") as source: with open(dstFilename, mode="wb") as dest: shutil.copyfileobj(source, dest) return # TODO: if compression == "zip": raise ValueError(f"unexpected compression={compression!r}") pyglossary-4.5.0/pyglossary/core.py000066400000000000000000000222001417733132500174540ustar00rootroot00000000000000import logging import traceback import inspect import sys import os from os.path import ( join, isfile, isdir, exists, realpath, dirname, ) import platform VERSION = "4.5.0" homePage = "https://github.com/ilius/pyglossary" TRACE = 5 logging.addLevelName(TRACE, "TRACE") noColor = False class Formatter(logging.Formatter): def __init__(self, *args, **kwargs): logging.Formatter.__init__(self, *args, **kwargs) self.fill = None # type: Optional[Callable[[str], str]] def formatMessage(self, record): msg = logging.Formatter.formatMessage(self, record) if self.fill is not None: msg = self.fill(msg) return msg class MyLogger(logging.Logger): levelsByVerbosity = ( logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG, TRACE, logging.NOTSET, ) levelNamesCap = [ "Critical", "Error", "Warning", "Info", "Debug", "Trace", "All", # "Not-Set", ] def __init__(self, *args): logging.Logger.__init__(self, *args) self._verbosity = 3 self._timeEnable = False def setVerbosity(self, verbosity: int) -> 
None: self.setLevel(self.levelsByVerbosity[verbosity]) self._verbosity = verbosity def getVerbosity(self) -> int: return self._verbosity def trace(self, msg: str): self.log(TRACE, msg) def pretty(self, data: "Any", header: str = "") -> None: from pprint import pformat self.debug(header + pformat(data)) def isDebug(self) -> bool: return self.getVerbosity() >= 4 def newFormatter(self): timeEnable = self._timeEnable if timeEnable: fmt = "%(asctime)s [%(levelname)s] %(message)s" else: fmt = "[%(levelname)s] %(message)s" return Formatter(fmt) def setTimeEnable(self, timeEnable: bool): self._timeEnable = timeEnable formatter = self.newFormatter() for handler in self.handlers: handler.setFormatter(formatter) def addHandler(self, handler: "logging.Handler"): # if want to add separate format (new config keys and flags) for ui_gtk # and ui_tk, you need to remove this function and run handler.setFormatter # in ui_gtk and ui_tk logging.Logger.addHandler(self, handler) handler.setFormatter(self.newFormatter()) def formatVarDict( dct: "Dict[str, Any]", indent: int = 4, max_width: int = 80, ) -> str: lines = [] pre = " " * indent for key, value in dct.items(): line = pre + key + " = " + repr(value) if len(line) > max_width: line = line[:max_width - 3] + "..." try: value_len = len(value) except TypeError: pass else: line += f"\n{pre}len({key}) = {value_len}" lines.append(line) return "\n".join(lines) def format_exception( exc_info: "Optional[Tuple[Type, Exception, types.TracebackType]]" = None, add_locals: bool = False, add_globals: bool = False, ) -> str: if not exc_info: exc_info = sys.exc_info() _type, value, tback = exc_info text = "".join(traceback.format_exception(_type, value, tback)) if add_locals or add_globals: try: frame = inspect.getinnerframes(tback, context=0)[-1][0] except IndexError: pass else: if add_locals: text += f"Traceback locals:\n{formatVarDict(frame.f_locals)}\n" if add_globals: text += f"Traceback globals:\n{formatVarDict(frame.f_globals)}\n" return text class StdLogHandler(logging.Handler): colorsConfig = { "CRITICAL": ("color.cmd.critical", 196), "ERROR": ("color.cmd.error", 1), "WARNING": ("color.cmd.warning", 208), } # 1: dark red (like 31m), 196: real red, 9: light red # 15: white, 229: light yellow (#ffffaf), 226: real yellow (#ffff00) def __init__(self, noColor: bool = False): logging.Handler.__init__(self) self.set_name("std") self.noColor = noColor self.config = {} @property def endFormat(self): if self.noColor: return "" return "\x1b[0;0;0m" def emit(self, record: logging.LogRecord) -> None: msg = "" if record.getMessage(): msg = self.format(record) ### if record.exc_info: _type, value, tback = record.exc_info tback_text = format_exception( exc_info=record.exc_info, add_locals=(log.level <= logging.DEBUG), add_globals=False, ) if not msg: msg = "unhandled exception:" msg += "\n" msg += tback_text ### levelname = record.levelname if levelname in ("CRITICAL", "ERROR"): fp = sys.stderr else: fp = sys.stdout if not self.noColor and levelname in self.colorsConfig: key, default = self.colorsConfig[levelname] colorCode = self.config.get(key, default) startColor = f"\x1b[38;5;{colorCode}m" msg = startColor + msg + self.endFormat ### if fp is None: print(f"fp=None, levelname={record.levelname}") print(msg) return fp.write(msg + "\n") fp.flush() def checkCreateConfDir() -> None: if not isdir(confDir): if exists(confDir): # file, or anything other than directory os.rename(confDir, confDir + ".bak") # we do not import old config os.mkdir(confDir) if not exists(userPluginsDir): try: 
os.mkdir(userPluginsDir) except Exception as e: log.warning(f"failed to create user plugins directory: {e}") if not isfile(confJsonFile): with open(rootConfJsonFile) as srcF, open(confJsonFile, "w") as usrF: usrF.write(srcF.read()) def in_virtualenv(): if hasattr(sys, 'real_prefix'): return True if hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix: return True return False def getDataDir(): if in_virtualenv(): pass # TODO # print(f"prefix={sys.prefix}, base_prefix={sys.base_prefix}") # return join( # dirname(dirname(dirname(rootDir))), # os.getenv("VIRTUAL_ENV"), "share", "pyglossary", # ) if not ( rootDir.endswith("dist-packages") or rootDir.endswith("site-packages") ): return rootDir parent3 = dirname(dirname(dirname(rootDir))) if os.sep == "/": return join(parent3, "share", "pyglossary") _dir = join( parent3, f"Python{sys.version_info.major}{sys.version_info.minor}", "share", "pyglossary", ) if isdir(_dir): return _dir _dir = join(parent3, "Python3", "share", "pyglossary") if isdir(_dir): return _dir _dir = join(parent3, "Python", "share", "pyglossary") if isdir(_dir): return _dir _dir = join(sys.prefix, "share", "pyglossary") if isdir(_dir): return _dir if os.getenv("CONDA_PREFIX"): _dir = join(os.getenv("CONDA_PREFIX"), "share", "pyglossary") if isdir(_dir): return _dir raise OSError("failed to detect dataDir") def windows_show_exception(*exc_info): import ctypes msg = format_exception( exc_info=exc_info, add_locals=(log.level <= logging.DEBUG), add_globals=False, ) log.critical(msg) ctypes.windll.user32.MessageBoxW(0, msg, "PyGlossary Error", 0) # __________________________________________________________________________ # logging.setLoggerClass(MyLogger) log = logging.getLogger("pyglossary") if os.sep == "\\": sys.excepthook = windows_show_exception else: sys.excepthook = lambda *exc_info: log.critical( format_exception( exc_info=exc_info, add_locals=(log.level <= logging.DEBUG), add_globals=False, ) ) sysName = platform.system().lower() # platform.system() is in ["Linux", "Windows", "Darwin", "FreeBSD"] # sysName is in ["linux", "windows", "darwin', "freebsd"] # can set env var WARNINGS to: # "error", "ignore", "always", "default", "module", "once" if os.getenv("WARNINGS"): import warnings warnings.filterwarnings(os.getenv("WARNINGS")) if getattr(sys, "frozen", False): # PyInstaller frozen executable log.info(f"sys.frozen = {sys.frozen}") rootDir = dirname(sys.executable) uiDir = join(rootDir, "pyglossary", "ui") else: _srcDir = dirname(realpath(__file__)) uiDir = join(_srcDir, "ui") rootDir = dirname(_srcDir) dataDir = getDataDir() appResDir = join(dataDir, "res") if os.sep == "/": # Operating system is Unix-Like homeDir = os.getenv("HOME") user = os.getenv("USER") tmpDir = os.getenv("TMPDIR", "/tmp") if sysName == "darwin": # MacOS X _libDir = join(homeDir, "Library") confDir = join(_libDir, "Preferences", "PyGlossary") # or maybe: join(_libDir, "PyGlossary") # os.environ["OSTYPE"] == "darwin10.0" # os.environ["MACHTYPE"] == "x86_64-apple-darwin10.0" # platform.dist() == ("", "", "") # platform.release() == "10.3.0" cacheDir = join(_libDir, "Caches", "PyGlossary") pip = "pip3" else: # GNU/Linux, Termux, FreeBSD, etc # should switch to "$XDG_CONFIG_HOME/pyglossary" in version 5.0.0 # which generally means ~/.config/pyglossary confDir = join(homeDir, ".pyglossary") cacheDir = join(homeDir, ".cache", "pyglossary") if "/com.termux/" in homeDir: pip = "pip3" else: pip = "sudo pip3" elif os.sep == "\\": # Operating system is Windows homeDir = join(os.getenv("HOMEDRIVE"), 
os.getenv("HOMEPATH")) user = os.getenv("USERNAME") tmpDir = os.getenv("TEMP") _appData = os.getenv("APPDATA") confDir = join(_appData, "PyGlossary") _localAppData = os.getenv("LOCALAPPDATA") if not _localAppData: # Windows Vista or older _localAppData = abspath(join(_appData, "..", "Local")) cacheDir = join(_localAppData, "PyGlossary", "Cache") pip = "pip3" else: raise RuntimeError( f"Unknown path seperator(os.sep=={os.sep!r})" f", unknown operating system!" ) pluginsDir = join(rootDir, "pyglossary", "plugins") confJsonFile = join(confDir, "config.json") rootConfJsonFile = join(dataDir, "config.json") userPluginsDir = join(confDir, "plugins") pyglossary-4.5.0/pyglossary/core_test.py000066400000000000000000000023541417733132500205230ustar00rootroot00000000000000#!/usr/bin/python3 import logging class MockLogHandler(logging.Handler): def __init__(self): logging.Handler.__init__(self) self.clear() def clear(self): self.recordsByLevel = {} def emit(self, record): level = record.levelno if level in self.recordsByLevel: self.recordsByLevel[level].append(record) else: self.recordsByLevel[level] = [record] def popLog(self, level: int, msg: str) -> "Optional[logging.Record]": if level not in self.recordsByLevel: return None records = self.recordsByLevel[level] for index, record in enumerate(records): if record.getMessage() == msg: return records.pop(index) return None def printRemainingErrors(self) -> int: count = 0 for level in ( logging.CRITICAL, logging.ERROR, logging.WARNING, ): if level not in self.recordsByLevel: continue for record in self.recordsByLevel[level]: count += 1 print(repr(self.format(record))) return count mockLog = None def getMockLogger(): global mockLog if mockLog is not None: return mockLog log = logging.getLogger("pyglossary") for handler in log.handlers: log.removeHandler(handler) mockLog = MockLogHandler() mockLog.setLevel(logging.WARNING) log.addHandler(mockLog) return mockLog pyglossary-4.5.0/pyglossary/ebook_base.py000066400000000000000000000270661417733132500206340ustar00rootroot00000000000000# -*- coding: utf-8 -*- # The MIT License (MIT) # Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) # Copyright © 2016-2019 Saeed Rasooli # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
from itertools import groupby import os from os.path import join import zipfile import tempfile from datetime import datetime import shutil from pyglossary.text_utils import toStr, toBytes from pyglossary.os_utils import indir, rmtree import logging log = logging.getLogger("pyglossary") class GroupState(object): def __init__(self, writer) -> None: self.writer = writer self.last_prefix = "" self.group_index = -1 self.reset() def reset(self) -> None: self.first_word = "" self.last_word = "" self.group_contents = [] def is_new(self, prefix: str) -> bool: return self.last_prefix and prefix != self.last_prefix def add(self, entry: "BaseEntry", prefix: str) -> None: word = entry.s_word defi = entry.defi if not self.first_word: self.first_word = word self.last_word = word self.last_prefix = prefix self.group_contents.append(self.writer.format_group_content(word, defi)) class EbookWriter(object): """ A class representing a generic ebook containing a dictionary. It can be used to output a MOBI or an EPUB 2 container. The ebook must have an OPF, and one or more group XHTML files. Optionally, it can have a cover image, an NCX TOC, an index XHTML file. The actual file templates are provided by the caller. """ _keep: bool = False _group_by_prefix_length: int = 2 _include_index_page: bool = False _compress: bool = True _css: str = "" # path to css file, or "" _cover_path: str = "" # path to cover file, or "" CSS_CONTENTS = "" GROUP_XHTML_TEMPLATE = "" GROUP_XHTML_INDEX_LINK = "" GROUP_XHTML_WORD_DEFINITION_TEMPLATE = "" GROUP_XHTML_WORD_DEFINITION_JOINER = "\n" MIMETYPE_CONTENTS = "" CONTAINER_XML_CONTENTS = "" GROUP_START_INDEX = 2 COVER_TEMPLATE = "{cover}" INDEX_XHTML_TEMPLATE = """ {title}

</head>
 <body>
  <h1>{indexTitle}</h1>
  <p>
{links}
  </p>
 </body>
</html>
""" INDEX_XHTML_LINK_TEMPLATE = " " \ "
<a href=\"{ref}\">{label}</a>
" INDEX_XHTML_LINK_JOINER = " •\n" OPF_MANIFEST_ITEM_TEMPLATE = " " OPF_SPINE_ITEMREF_TEMPLATE = " " def get_opf_contents(self, manifest_contents, spine_contents): raise NotImplementedError def __init__( self, glos, escape_strings=False, # ignore_synonyms=False, # flatten_synonyms=False, ): self._glos = glos self._filename = None self._escape_strings = escape_strings # self._ignore_synonyms = ignore_synonyms # self._flatten_synonyms = flatten_synonyms # Penelope's extra options: # "bookeen_collation_function": None, # bookeen format # "bookeen_install_file": False, # bookeen format # "group_by_prefix_merge_across_first": False, # "group_by_prefix_merge_min_size": 0, self._tmpDir = None self.cover = None self.files = [] self.manifest_files = [] self._group_labels = [] def finish(self): self._filename = None def myOpen(self, fname, mode): return open(join(self._tmpDir, fname), mode) def add_file(self, relative_path, contents, mode=None): if mode is None: mode = zipfile.ZIP_DEFLATED file_path = os.path.join(self._tmpDir, relative_path) contents = toBytes(contents) with self.myOpen(file_path, "wb") as file_obj: file_obj.write(contents) self.files.append({ "path": relative_path, "mode": mode, }) def write_cover(self, cover_path): basename = os.path.basename(cover_path) with self.myOpen(cover_path, "rb") as cover_obj: cover = cover_obj.read() b = basename.lower() mimetype = "image/jpeg" if b.endswith(".png"): mimetype = "image/png" elif b.endswith(".gif"): mimetype = "image/gif" self.add_file_manifest("OEBPS/" + basename, basename, cover, mimetype) self.cover = basename def write_css(self, custom_css_path_absolute): css = self.CSS_CONTENTS if custom_css_path_absolute is not None: try: with self.myOpen(custom_css_path_absolute, "rb") as css_obj: css = css_obj.read() except Exception: log.exception("") self.add_file_manifest("OEBPS/style.css", "style.css", css, "text/css") def add_file_manifest(self, relative_path, id, contents, mimetype): self.add_file(relative_path, contents) self.manifest_files.append({ "path": relative_path, "id": id, "mimetype": mimetype, }) def get_group_xhtml_file_name_from_index(self, index): if index < self.GROUP_START_INDEX: # or index >= groupCount + self.GROUP_START_INDEX: # number of groups are not known, FIXME # so we can not say if the current group is the last or not return "#groupPage" return f"g{index:06d}.xhtml" def get_prefix(self, word: str) -> str: raise NotImplementedError def sortKey(self, words: "List[str]") -> "Any": raise NotImplementedError def write_groups(self): # TODO: rtl=False option # TODO: handle alternates better (now shows word1|word2... 
in title) group_labels = [] def add_group(state): if not state.last_prefix: return state.group_index += 1 index = state.group_index + self.GROUP_START_INDEX group_label = state.last_prefix if group_label != "SPECIAL": group_label = state.first_word + "–" + state.last_word log.debug(f"add_group: {state.group_index}, {state.last_prefix!r}") group_labels.append(group_label) previous_link = self.get_group_xhtml_file_name_from_index(index - 1) next_link = self.get_group_xhtml_file_name_from_index(index + 1) group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) self.add_file_manifest( "OEBPS/" + group_xhtml_path, group_xhtml_path, self.GROUP_XHTML_TEMPLATE.format( title=group_label, group_title=group_label, previous_link=previous_link, index_link=( self.GROUP_XHTML_INDEX_LINK if self._include_index_page else "" ), next_link=next_link, group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join( state.group_contents, ), ), "application/xhtml+xml", ) state = GroupState(self) while True: entry = yield if entry is None: break if entry.isData(): if entry.getFileName() == "style.css": self.add_file_manifest( "OEBPS/style.css", "style.css", entry.data.decode("utf-8"), "text/css", ) continue prefix = self.get_prefix(entry.s_word) if state.is_new(prefix): add_group(state) state.reset() state.add(entry, prefix) add_group(state) self._group_labels = group_labels def format_group_content(self, word: str, defi: str) -> str: return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format( headword=self.escape_if_needed(word), definition=self.escape_if_needed(defi), ) def escape_if_needed(self, string): if self._escape_strings: string = string.replace("&", "&")\ .replace('"', """)\ .replace("'", "'")\ .replace(">", ">")\ .replace("<", "<") return string def write_index(self, group_labels): """ group_labels: a list of labels """ links = [] for label_i, label in enumerate(group_labels): links.append(self.INDEX_XHTML_LINK_TEMPLATE.format( ref=self.get_group_xhtml_file_name_from_index( self.GROUP_START_INDEX + label_i ), label=label, )) links = self.INDEX_XHTML_LINK_JOINER.join(links) title = self._glos.getInfo("name") contents = self.INDEX_XHTML_TEMPLATE.format( title=title, indexTitle=title, links=links, ) self.add_file_manifest( "OEBPS/index.xhtml", "index.xhtml", contents, "application/xhtml+xml", ) def get_opf_contents(self, manifest_contents, spine_contents): cover = "" if self.cover: cover = self.COVER_TEMPLATE.format(cover=self.cover) creationDate = datetime.now().strftime("%Y-%m-%d") return self.OPF_TEMPLATE.format( identifier=self._glos.getInfo("uuid"), sourceLang=self._glos.sourceLangName, targetLang=self._glos.targetLangName, title=self._glos.getInfo("name"), creator=self._glos.author, copyright=self._glos.getInfo("copyright"), creationDate=creationDate, cover=cover, manifest=manifest_contents, spine=spine_contents, ) def write_opf(self): manifest_lines = [] spine_lines = [] for mi in self.manifest_files: manifest_lines.append(self.OPF_MANIFEST_ITEM_TEMPLATE.format( ref=mi["id"], id=mi["id"], mediaType=mi["mimetype"] )) if mi["mimetype"] == "application/xhtml+xml": spine_lines.append(self.OPF_SPINE_ITEMREF_TEMPLATE.format( id=mi["id"], )) manifest_contents = "\n".join(manifest_lines) spine_contents = "\n".join(spine_lines) opf_contents = self.get_opf_contents( manifest_contents, spine_contents, ) self.add_file("OEBPS/content.opf", opf_contents) def write_ncx(self, group_labels): """ write_ncx only for epub """ pass def open(self, filename: str): self._filename = filename self._tmpDir = 
tempfile.mkdtemp() def write(self): filename = self._filename # self._group_by_prefix_length # self._include_index_page css = self._css cover_path = self._cover_path with indir(self._tmpDir): if cover_path: cover_path = os.path.abspath(cover_path) if css: css = os.path.abspath(css) os.makedirs("META-INF") os.makedirs("OEBPS") if self.MIMETYPE_CONTENTS: self.add_file("mimetype", self.MIMETYPE_CONTENTS, mode=zipfile.ZIP_STORED) if self.CONTAINER_XML_CONTENTS: self.add_file("META-INF/container.xml", self.CONTAINER_XML_CONTENTS) if cover_path: try: self.write_cover(cover_path) except Exception: log.exception("") if css: self.write_css(css) yield from self.write_groups() group_labels = self._group_labels if self._include_index_page: self.write_index() self.write_ncx(group_labels) self.write_opf() if self._compress: zipFp = zipfile.ZipFile( filename, "w", compression=zipfile.ZIP_DEFLATED, ) for fileDict in self.files: zipFp.write( fileDict["path"], compress_type=fileDict["mode"], ) zipFp.close() if not self._keep: rmtree(self._tmpDir) else: if self._keep: shutil.copytree(self._tmpDir, filename) else: shutil.move(self._tmpDir, filename) pyglossary-4.5.0/pyglossary/entry.py000066400000000000000000000243661417733132500177040ustar00rootroot00000000000000# -*- coding: utf-8 -*- import re import shutil import os from os.path import ( join, exists, dirname, getsize, ) from .entry_base import BaseEntry, MultiStr, RawEntryType from .iter_utils import unique_everseen from .text_utils import ( joinByBar, ) from pickle import dumps, loads from zlib import compress, decompress import logging log = logging.getLogger("pyglossary") # aka Resource class DataEntry(BaseEntry): __slots__ = [ "_fname", "_data", "_tmpPath", "_byteProgress", ] def isData(self) -> bool: return True def __init__( self, fname: str, data: bytes = b"", tmpPath: "Optional[str]" = None, byteProgress: "Optional[Tuple[int, int]]" = None, ) -> None: assert isinstance(fname, str) assert isinstance(data, bytes) if data and tmpPath: with open(tmpPath, "wb") as toFile: toFile.write(data) data = b"" self._fname = fname self._data = data # bytes instance self._tmpPath = tmpPath self._byteProgress = byteProgress # Optional[Tuple[int, int]] def getFileName(self) -> str: return self._fname @property def data(self) -> bytes: if self._tmpPath: with open(self._tmpPath, "rb") as _file: return _file.read() else: return self._data def size(self): if self._tmpPath: return getsize(self._tmpPath) else: return len(self._data) def save(self, directory: str) -> str: fname = self._fname # fix filename depending on operating system? FIXME fpath = join(directory, fname) fdir = dirname(fpath) try: os.makedirs(fdir, mode=0o755, exist_ok=True) if self._tmpPath: shutil.move(self._tmpPath, fpath) self._tmpPath = fpath else: with open(fpath, "wb") as toFile: toFile.write(self._data) except FileNotFoundError as e: log.error(f"error in DataEntry.save: {e}") except Exception: log.exception(f"error while saving {fpath}") return "" return fpath @property def s_word(self) -> str: return self._fname @property def l_word(self) -> "List[str]": return [self._fname] @property def defi(self) -> str: return f"File: {self._fname}" def byteProgress(self): return self._byteProgress @property def defiFormat(self) -> 'Literal["b"]': return "b" @defiFormat.setter def defiFormat(self, defiFormat: str) -> None: pass def detectDefiFormat(self) -> None: pass def addAlt(self, alt: str) -> None: pass def editFuncWord(self, func: "Callable[[str], str]") -> None: pass # modify fname? 
# FIXME def editFuncDefi(self, func: "Callable[[str], str]") -> None: pass def strip(self) -> None: pass def replaceInWord(self, source: str, target: str) -> None: pass def replaceInDefi(self, source: str, target: str) -> None: pass def replace(self, source: str, target: str) -> None: pass def removeEmptyAndDuplicateAltWords(self): pass def getRaw(self, glos: "GlossaryType") -> "RawEntryType": b_fpath = b"" if glos.tmpDataDir: b_fpath = self.save(glos.tmpDataDir).encode("utf-8") tpl = ( [self._fname], b_fpath, "b", ) if glos.rawEntryCompress: return compress(dumps(tpl), level=9) return tpl class Entry(BaseEntry): xdxfPattern = re.compile("^[^<>]*", re.S | re.I) htmlPattern = re.compile( ".*(?:" + "|".join([ r"]", r"", r"]", r"]", r"]", r"", r"]*href=" r"]", r"]", r"]", r"]", r"]", r"]", r"]", r"]", r"]", r"]", ]) + "|&[a-z]{2,8};|&#x?[0-9]{2,5};)", re.S | re.I, ) __slots__ = [ "_word", "_defi", "_defiFormat", "_byteProgress", ] def isData(self) -> bool: return False @staticmethod def getRawEntrySortKey( glos: "GlossaryType", key: "Callable[[bytes], Any]", ) -> "Callable[[Tuple], Any]": # here `x` is raw entity, meaning a tuple of form (word, defi) or # (word, defi, defiFormat) # so x[0] is word(s) in bytes, that can be a str (one word), # or a list or tuple (one word with or more alternaties) if glos.rawEntryCompress: return lambda x: key(loads(decompress(x))[0]) else: # x is rawEntry, so x[0] is list of words (entry.l_word) return lambda x: key(x[0]) def __init__( self, word: MultiStr, defi: MultiStr, defiFormat: str = "m", byteProgress: "Optional[Tuple[int, int]]" = None, ) -> None: """ word: string or a list of strings (including alternate words) defi: string or a list of strings (including alternate definitions) defiFormat (optional): definition format: "m": plain text "h": html "x": xdxf """ # memory optimization: if isinstance(word, list): if len(word) == 1: word = word[0] elif not isinstance(word, str): raise TypeError(f"invalid word type {type(word)}") if isinstance(defi, list): if len(defi) == 1: defi = defi[0] elif not isinstance(defi, str): raise TypeError(f"invalid defi type {type(defi)}") if defiFormat not in ("m", "h", "x"): raise ValueError(f"invalid defiFormat {defiFormat!r}") self._word = word self._defi = defi self._defiFormat = defiFormat self._byteProgress = byteProgress # Optional[Tuple[int, int]] def __repr__(self): return ( f"Entry({self._word!r}, {self._defi!r}, " f"defiFormat={self._defiFormat!r})" ) @property def s_word(self): """ returns string of word, and all the alternate words seperated by "|" """ if isinstance(self._word, str): return self._word else: return joinByBar(self._word) @property def l_word(self) -> "List[str]": """ returns list of the word and all the alternate words """ if isinstance(self._word, str): return [self._word] else: return self._word @property def defi(self) -> str: """ returns string of definition """ return self._defi @property def defiFormat(self) -> str: """ returns definition format: "m": plain text "h": html "x": xdxf """ # TODO: type: Literal["m", "h", "x"] return self._defiFormat @defiFormat.setter def defiFormat(self, defiFormat: str) -> None: """ defiFormat: "m": plain text "h": html "x": xdxf """ self._defiFormat = defiFormat def detectDefiFormat(self) -> None: if self._defiFormat != "m": return if Entry.xdxfPattern.match(self.defi): self._defiFormat = "x" return if Entry.htmlPattern.match(self.defi): self._defiFormat = "h" return def byteProgress(self): return self._byteProgress def addAlt(self, alt: str) -> None: l_word = 
self.l_word l_word.append(alt) self._word = l_word def editFuncWord(self, func: "Callable[[str], str]") -> None: """ run function `func` on all the words `func` must accept only one string as argument and return the modified string """ if isinstance(self._word, str): self._word = func(self._word) else: self._word = tuple( func(st) for st in self._word ) def editFuncDefi(self, func: "Callable[[str], str]") -> None: """ run function `func` on all the definitions `func` must accept only one string as argument and return the modified string """ self._defi = func(self._defi) def _stripTrailingBR(self, s: str) -> str: while s.endswith('
<BR>') or s.endswith('<br>
'): s = s[:-4] return s def strip(self) -> None: """ strip whitespaces from all words and definitions """ self.editFuncWord(str.strip) self.editFuncDefi(str.strip) self.editFuncDefi(self._stripTrailingBR) def replaceInWord(self, source: str, target: str) -> None: """ replace string `source` with `target` in all words """ if isinstance(self._word, str): self._word = self._word.replace(source, target) else: self._word = tuple( st.replace(source, target) for st in self._word ) def replaceInDefi(self, source: str, target: str) -> None: """ replace string `source` with `target` in all definitions """ self._defi = self._defi.replace(source, target) def replace(self, source: str, target: str) -> None: """ replace string `source` with `target` in all words and definitions """ self.replaceInWord(source, target) self.replaceInDefi(source, target) def removeEmptyAndDuplicateAltWords(self): l_word = self.l_word if len(l_word) == 1: return l_word = [word for word in l_word if word] l_word = list(unique_everseen(l_word)) self._word = l_word def stripFullHtml(self) -> None: defi = self._defi if not defi.startswith('<'): return if defi.startswith(''): defi = defi[len(''):].strip() if not defi.startswith(' not found: word={self.s_word}") log.error(f"defi={defi[:100]}...") else: if not defi.startswith(''): return word = self.s_word i = defi.find('') if i == -1: log.error(f"'>' after RawEntryType: """ returns a tuple (word, defi) or (word, defi, defiFormat) where both word and defi might be string or list of strings """ if self._defiFormat and self._defiFormat != glos.getDefaultDefiFormat(): tpl = ( self.l_word, self.b_defi, self._defiFormat, ) else: tpl = ( self.l_word, self.b_defi, ) if glos.rawEntryCompress: return compress(dumps(tpl), level=9) return tpl @classmethod def fromRaw( cls, glos: "GlossaryType", rawEntry: RawEntryType, defaultDefiFormat: str = "m", ): """ rawEntry can be (word, defi) or (word, defi, defiFormat) where both word and defi can be string or list of strings if defiFormat is missing, defaultDefiFormat will be used creates and return an Entry object from `rawEntry` tuple """ if isinstance(rawEntry, bytes): rawEntry = loads(decompress(rawEntry)) word = rawEntry[0] defi = rawEntry[1].decode("utf-8") if len(rawEntry) > 2: defiFormat = rawEntry[2] if defiFormat == "b": fname = word if isinstance(fname, list): fname = fname[0] return DataEntry(fname, tmpPath=defi) else: defiFormat = defaultDefiFormat return cls( word, defi, defiFormat=defiFormat, ) pyglossary-4.5.0/pyglossary/entry_base.py000066400000000000000000000042241417733132500206650ustar00rootroot00000000000000# -*- coding: utf-8 -*- MultiStr = "Union[str, List[str]]" RawEntryType = """Union[ bytes, # compressed Tuple[List[str], bytes], # uncompressed, without defiFormat Tuple[List[str], bytes, str], # uncompressed, with defiFormat ]""" class BaseEntry(object): __slots__ = [] def isData(self) -> bool: raise NotImplementedError def getFileName(self) -> str: raise NotImplementedError @property def data(self) -> bytes: raise NotImplementedError def save(self, directory: str) -> str: raise NotImplementedError @property def s_word(self) -> str: raise NotImplementedError @property def l_word(self) -> "List[str]": raise NotImplementedError @property def defi(self) -> str: raise NotImplementedError @property def b_word(self): """ returns bytes of word, and all the alternate words seperated by b"|" """ return self.s_word.encode("utf-8") @property def b_defi(self): """ returns bytes of definition, and all the alternate definitions seperated 
by b"|" """ return self.defi.encode("utf-8") @property def defiFormat(self) -> str: # TODO: type: Literal["m", "h", "x", "b"] raise NotImplementedError @defiFormat.setter def defiFormat(self, defiFormat: str) -> None: # TODO: type: Literal["m", "h", "x", "b"] raise NotImplementedError def detectDefiFormat(self) -> None: raise NotImplementedError def addAlt(self, alt: str) -> None: raise NotImplementedError def editFuncWord(self, func: "Callable[[str], str]") -> None: raise NotImplementedError def editFuncDefi(self, func: "Callable[[str], str]") -> None: raise NotImplementedError def strip(self) -> None: raise NotImplementedError def replaceInWord(self, source: str, target: str) -> None: raise NotImplementedError def replaceInDefi(self, source: str, target: str) -> None: raise NotImplementedError def replace(self, source: str, target: str) -> None: raise NotImplementedError def getRaw(self, glos: "GlossaryType") -> RawEntryType: raise NotImplementedError @staticmethod def getRawEntrySortKey( glos: "GlossaryType", key: "Callable[[str], Any]", ) -> "Callable[[Tuple], str]": raise NotImplementedError pyglossary-4.5.0/pyglossary/entry_filters.py000066400000000000000000000207221417733132500214240ustar00rootroot00000000000000# -*- coding: utf-8 -*- import re import logging from .text_utils import ( fixUtf8, ) from .entry_base import BaseEntry log = logging.getLogger("pyglossary") class EntryFilter(object): name = "" desc = "" def __init__(self, glos: "GlossaryType"): self.glos = glos def prepare(self) -> None: """ run this after glossary info is set and ready """ pass def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": """ returns an Entry object, or None to skip may return the same `entry`, or modify and return it, or return a new Entry object """ return entry class StripWhitespaces(EntryFilter): name = "strip" desc = "Strip whitespaces in word(s) and definition" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": entry.strip() entry.replace("\r", "") return entry class NonEmptyWordFilter(EntryFilter): name = "non_empty_word" desc = "Skip entries with empty word" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": if not entry.s_word: return return entry class NonEmptyDefiFilter(EntryFilter): name = "non_empty_defi" desc = "Skip entries with empty definition" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": if not entry.defi: return return entry class RemoveEmptyAndDuplicateAltWords(EntryFilter): name = "remove_empty_dup_alt_words" desc = "Remove empty and duplicate alternate words" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": entry.removeEmptyAndDuplicateAltWords() if not entry.l_word: return return entry class FixUnicode(EntryFilter): name = "utf8_check" desc = "Fix Unicode in word(s) and definition" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": entry.editFuncWord(fixUtf8) entry.editFuncDefi(fixUtf8) return entry class LowerWord(EntryFilter): name = "lower" desc = "Lowercase word(s)" def __init__(self, glos: "GlossaryType"): EntryFilter.__init__(self, glos) self._re_word_ref = re.compile('href=["\'](bword://[^"\']+)["\']') def lowerWordRefs(self, defi): return self._re_word_ref.sub( lambda m: m.group(0).lower(), defi, ) def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": entry.editFuncWord(str.lower) entry.editFuncDefi(self.lowerWordRefs) return entry class RTLDefi(EntryFilter): name = "rtl" desc = "Make definition right-to-left" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": entry.editFuncDefi(lambda defi: f'
<div dir="rtl">{defi}</div>
') return entry class RemoveHtmlTagsAll(EntryFilter): name = "remove_html_all" desc = "Remove all HTML tags from definition" def __init__(self, glos: "GlossaryType"): self._p_pattern = re.compile( '<p( [^<>]*?)?>(.*?)</p>', re.DOTALL, ) self._br_pattern = re.compile( "<br[ /]*>", re.IGNORECASE, ) def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": from bs4 import BeautifulSoup def fixStr(st: str) -> str: st = self._p_pattern.sub("\\2\n", st) # if there is </p> left without opening, replace with <br> st = st.replace("</p>", "\n") st = self._br_pattern.sub("\n", st) return BeautifulSoup(st, "lxml").text entry.editFuncDefi(fixStr) return entry class RemoveHtmlTags(EntryFilter): name = "remove_html" desc = "Remove specific HTML tags from definition" def __init__(self, glos: "GlossaryType", tagsStr: str): import re tags = tagsStr.split(",") self.glos = glos self.tags = tags tagsRE = "|".join(self.tags) self.pattern = re.compile(f"</?({tagsRE})( [^<>]*)?>") def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": def fixStr(st: str) -> str: return self.pattern.sub("", st) entry.editFuncDefi(fixStr) return entry # FIXME: It's not safe to lowercase everything between < and > # including class name, element ids/names, scripts,
# etc. How can we fix that? class NormalizeHtml(EntryFilter): name = "normalize_html" desc = "Normalize HTML tags in definition (WIP)" def __init__(self, glos: "GlossaryType"): log.info("Normalizing HTML tags") self._pattern = re.compile( "(" + "|".join([ fr"]*?>" for tag in ( "a", "font", "i", "b", "u", "p", "sup", "div", "span", "table", "tr", "th", "td", "ul", "ol", "li", "img", "br", "hr", ) ]) + ")", re.S | re.I, ) def _subLower(self, m) -> str: return m.group(0).lower() def _fixDefi(self, st: str) -> str: st = self._pattern.sub(self._subLower, st) return st def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": entry.editFuncDefi(self._fixDefi) return entry class SkipDataEntry(EntryFilter): name = "skip_resources" desc = "Skip resources / data files" def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": if entry.isData(): return return entry class LanguageCleanup(EntryFilter): name = "lang" desc = "Language-specific cleanup/fixes" def __init__(self, glos: "GlossaryType"): EntryFilter.__init__(self, glos) self._run_func = None # type: Callable[[BaseEntry], [Optional[BaseEntry]]] def prepare(self) -> None: langCodes = { lang.code for lang in (self.glos.sourceLang, self.glos.targetLang) if lang is not None } if "fa" in langCodes: self._run_func = self.run_fa log.info("Using Persian filter") def run_fa(self, entry: BaseEntry) -> "Optional[BaseEntry]": from pyglossary.persian_utils import faEditStr entry.editFuncWord(faEditStr) entry.editFuncDefi(faEditStr) # RLM = "\xe2\x80\x8f" # defi = "\n".join([RLM+line for line in defi.split("\n")]) # for GoldenDict ^^ FIXME return entry def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": if self._run_func: entry = self._run_func(entry) return entry class TextListSymbolCleanup(EntryFilter): """ Symbols like ♦ (diamond) ● (black circle) or * (star) are used in some plaintext or even html glossaries to represent items of a list (like
  • in proper html). This EntryFilter cleans up spaces/newlines issues around them. """ name = "text_list_symbol_cleanup" desc = "Text List Symbol Cleanup" winNewlinePattern = re.compile("[\r\n]+") spacesNewlinePattern = re.compile(" *\n *") blocksNewlinePattern = re.compile("♦\n+♦") def cleanDefi(self, st: str) -> str: st = st.replace("♦ ", "♦ ") st = self.winNewlinePattern.sub("\n", st) st = self.spacesNewlinePattern.sub("\n", st) st = self.blocksNewlinePattern.sub("♦", st) if st.endswith(" "Optional[BaseEntry]": entry.editFuncDefi(self.cleanDefi) return entry class PreventDuplicateWords(EntryFilter): name = "prevent_duplicate_words" desc = "Prevent duplicate words" def __init__(self, glos: "GlossaryType"): EntryFilter.__init__(self, glos) self._wordSet = set() def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": if entry.isData(): return entry wordSet = self._wordSet word = entry.s_word if word not in wordSet: wordSet.add(word) return entry n = 2 while f"{word} ({n})" in wordSet: n += 1 word = f"{word} ({n})" wordSet.add(word) entry._word = word # use entry.editFuncWord? return entry class ShowProgressBar(EntryFilter): name = "progressbar" desc = "Progress Bar" def __init__(self, glos: "GlossaryType"): EntryFilter.__init__(self, glos) self._wordCount = -1 self._wordCountThreshold = 0 self._lastPos = 0 self._index = 0 def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": index = self._index self._index = index + 1 if entry is not None: bp = entry.byteProgress() if bp: if bp[0] > self._lastPos + 20000: self.glos.progress(bp[0], bp[1], unit="bytes") self._lastPos = bp[0] return entry if self._wordCount == -1: self._wordCount = len(self.glos) self._wordCountThreshold = max(1, min( 500, self._wordCount // 200, )) if self._wordCount > 1: if index % self._wordCountThreshold == 0: self.glos.progress(index, self._wordCount) return entry class ShowMaxMemoryUsage(EntryFilter): name = "max_memory_usage" desc = "Show Max Memory Usage" def __init__(self, glos: "GlossaryType"): EntryFilter.__init__(self, glos) self._max_mem_usage = 0 def run(self, entry: BaseEntry) -> "Optional[BaseEntry]": import os import psutil usage = psutil.Process(os.getpid()).memory_info().rss // 1024 if usage > self._max_mem_usage: self._max_mem_usage = usage word = entry.s_word if len(word) > 30: word = word[:37] + "..." log.trace(f"MaxMemUsage: {usage}, word={word}") return entry pyglossary-4.5.0/pyglossary/file_utils.py000066400000000000000000000007221417733132500206700ustar00rootroot00000000000000from itertools import ( takewhile, repeat, ) from io import IOBase def toBytes(s: "AnyStr") -> bytes: return bytes(s, "utf-8") if isinstance(s, str) else bytes(s) def fileCountLines(filename: str, newline: str = "\n"): newline = toBytes(newline) # required? 
FIXME with open(filename, "rb") as _file: bufgen = takewhile( lambda x: x, (_file.read(1024 * 1024) for _ in repeat(None)) ) return sum( buf.count(newline) for buf in bufgen if buf ) pyglossary-4.5.0/pyglossary/flags.py000066400000000000000000000010161417733132500176220ustar00rootroot00000000000000flagsByName = {} class StrWithDesc(str): def __new__(cls, name: str, desc: str): s = str.__new__(cls, name) s.desc = desc flagsByName[name] = s return s ALWAYS = StrWithDesc("always", "Always") DEFAULT_YES = StrWithDesc("default_yes", "Yes (by default)") DEFAULT_NO = StrWithDesc("default_no", "No (by default)") NEVER = StrWithDesc("never", "Never") # typing.Literal is added in Python 3.8 YesNoAlwaysNever = """Union[ "Literal[ALWAYS]", "Literal[DEFAULT_YES]", "Literal[DEFAULT_NO]", "Literal[NEVER]", ]""" pyglossary-4.5.0/pyglossary/glossary.py000066400000000000000000000705251417733132500204040ustar00rootroot00000000000000# -*- coding: utf-8 -*- # glossary.py # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . import logging import sys import os import os.path from os.path import ( split, join, splitext, isfile, isdir, dirname, basename, abspath, ) from time import time as now from collections import OrderedDict as odict from .flags import * from . 
import core from .core import ( dataDir, pluginsDir, userPluginsDir, cacheDir, ) from .entry import Entry, DataEntry from .entry_filters import * from .glossary_utils import ( splitFilenameExt, EntryList, ) from .sort_keys import namedSortKeyByName, NamedSortKey from .os_utils import showMemoryUsage, rmtree from .glossary_info import GlossaryInfo from .plugin_manager import PluginManager from .glossary_type import GlossaryType from .info import * log = logging.getLogger("pyglossary") """ sortKeyType = Callable[ [[List[str]], Any, ] """ defaultSortKeyName = "headword_lower" class Glossary(GlossaryInfo, PluginManager, GlossaryType): """ Direct access to glos.data is droped Use `glos.addEntryObj(glos.newEntry(word, defi, [defiFormat]))` where both word and defi can be list (including alternates) or string See help(glos.addEntryObj) Use `for entry in glos:` to iterate over entries (glossary data) See help(pyglossary.entry.Entry) for details """ entryFiltersRules = [ (None, StripWhitespaces), (None, NonEmptyWordFilter), (("skip_resources", False), SkipDataEntry), (("utf8_check", False), FixUnicode), (("lower", False), LowerWord), (("rtl", False), RTLDefi), (("remove_html_all", False), RemoveHtmlTagsAll), (("remove_html", ""), RemoveHtmlTags), (("normalize_html", False), NormalizeHtml), (None, LanguageCleanup), # TODO # (("text_list_symbol_cleanup", False), TextListSymbolCleanup), (None, NonEmptyWordFilter), (None, NonEmptyDefiFilter), (None, RemoveEmptyAndDuplicateAltWords), ] def _closeReaders(self): for reader in self._readers: try: reader.close() except Exception: log.exception("") def clear(self) -> None: self._info = odict() self._data.clear() # type: List[RawEntryType] readers = getattr(self, "_readers", []) for reader in readers: try: reader.close() except Exception: log.exception("") self._readers = [] self._readersOpenArgs = {} self._defiHasWordTitle = False self._iter = None self._entryFilters = [] self._entryFiltersName = set() self._sort = False self._filename = "" self._defaultDefiFormat = "m" self._progressbar = True self.tmpDataDir = "" def __init__( self, info: "Optional[Dict[str, str]]" = None, ui: "Optional[UIBase]" = None, ) -> None: """ info: OrderedDict or dict instance, or None no need to copy OrderedDict instance before passing here we will not reference to it """ GlossaryInfo.__init__(self) self._config = {} self._data = EntryList(self) self._sqlite = False self._rawEntryCompress = True self._cleanupPathList = set() self.clear() if info: if not isinstance(info, (dict, odict)): raise TypeError( "Glossary: `info` has invalid type" ", dict or OrderedDict expected" ) for key, value in info.items(): self.setInfo(key, value) self.ui = ui def cleanup(self): if not self._cleanupPathList: return if not self._config.get("cleanup", True): log.info("Not cleaning up files:") log.info("\n".join(self._cleanupPathList)) return for cleanupPath in self._cleanupPathList: if isfile(cleanupPath): log.debug(f"Removing file {cleanupPath}") try: os.remove(cleanupPath) except Exception: log.exception(f"error removing {cleanupPath}") elif isdir(cleanupPath): log.debug(f"Removing directory {cleanupPath}") rmtree(cleanupPath) else: log.error(f"no such file or directory: {cleanupPath}") self._cleanupPathList = set() @property def rawEntryCompress(self) -> bool: return self._rawEntryCompress def setRawEntryCompress(self, enable: bool) -> bool: self._rawEntryCompress = enable def updateEntryFilters(self) -> None: entryFilters = [] config = self._config for configRule, filterClass in 
self.entryFiltersRules: args = () if configRule is not None: param, default = configRule value = config.get(param, default) if not value: continue if not isinstance(default, bool): args = (value,) entryFilters.append(filterClass(self, *args)) if self.ui and self._progressbar: entryFilters.append(ShowProgressBar(self)) if log.level <= core.TRACE: try: import psutil except ModuleNotFoundError: pass else: entryFilters.append(ShowMaxMemoryUsage(self)) self._entryFilters = entryFilters self._entryFiltersName = { entryFilter.name for entryFilter in entryFilters } def prepareEntryFilters(self) -> None: """ call .prepare() method on all _entryFilters run this after glossary info is set and ready for most entry filters, it won't do anything """ for entryFilter in self._entryFilters: entryFilter.prepare() def _addExtraEntryFilter(self, cls): if cls.name in self._entryFiltersName: return self._entryFilters.append(cls(self)) self._entryFiltersName.add(cls.name) def removeHtmlTagsAll(self) -> None: """ Remove all HTML tags from definition This should only be called from a plugin's Writer.__init__ method. Does not apply on entries added with glos.addEntryObj """ self._addExtraEntryFilter(RemoveHtmlTagsAll) def preventDuplicateWords(self): """ Adds entry filter to prevent duplicate `entry.s_word` This should only be called from a plugin's Writer.__init__ method. Does not apply on entries added with glos.addEntryObj Note: there may be still duplicate headwords or alternate words but we only care about making the whole `entry.s_word` (aka entry key) unique """ self._addExtraEntryFilter(PreventDuplicateWords) def __str__(self) -> str: return ( "Glossary{" f"filename: {self._filename!r}" f", name: {self._info.get('name')!r}" "}" ) def _loadedEntryGen(self) -> "Iterator[BaseEntry]": if not (self.ui and self._progressbar): yield from self._data return pbFilter = ShowProgressBar(self) self.progressInit("Writing") for entry in self._data: pbFilter.run(entry) yield entry self.progressEnd() def _readersEntryGen(self) -> "Iterator[BaseEntry]": for reader in self._readers: self.progressInit("Converting") try: yield from self._applyEntryFiltersGen(reader) finally: reader.close() self.progressEnd() # This iterator/generator does not give None entries. # And Entry is not falsable, so bool(entry) is always True. # Since ProgressBar is already handled with an EntryFilter, there is # no point of returning None entries anymore. 
def _applyEntryFiltersGen( self, gen: "Iterator[BaseEntry]", ) -> "Iterator[BaseEntry]": for entry in gen: if entry is None: continue for entryFilter in self._entryFilters: entry = entryFilter.run(entry) if entry is None: break else: yield entry def __iter__(self) -> "Iterator[BaseEntry]": if self._iter is None: log.error( "Trying to iterate over a blank Glossary" ", must call `glos.read` first" ) return iter([]) return self._iter # TODO: switch to @property defaultDefiFormat def setDefaultDefiFormat(self, defiFormat: str) -> None: """ defiFormat must be empty or one of these: "m": plain text "h": html "x": xdxf """ self._defaultDefiFormat = defiFormat def getDefaultDefiFormat(self) -> str: return self._defaultDefiFormat # TODO # def _reopenReader(self, reader): # log.info(f"re-opening {reader.__class__}") # filename, options = self._readersOpenArgs[reader] # reader.close() # reader.open(filename, **options) def collectDefiFormat( self, maxCount: int, ) -> "Optional[Dict[str, float]]": """ example return value: [("h", 0.91), ("m", 0.09)] """ from collections import Counter readers = self._readers if readers: log.error("collectDefiFormat: not supported in direct mode") return None counter = Counter() count = 0 for entry in self: if entry.isData(): continue entry.detectDefiFormat() counter[entry.defiFormat] += 1 count += 1 if count >= maxCount: break result = { defiFormat: itemCount / count for defiFormat, itemCount in counter.items() } for defiFormat in ("h", "m", "x"): if defiFormat not in result: result[defiFormat] = 0 # TODO # for reader in readers: # self._reopenReader(reader) # self._readers = readers self._updateIter() return result def __len__(self) -> int: return len(self._data) + sum( len(reader) for reader in self._readers ) @property def config(self): raise NotImplementedError @config.setter def config(self, c: "Dict[str, Any]"): if self._config: log.error(f"glos.config is set more than once") return self._config = c @property def alts(self) -> bool: return self._config.get("enable_alts", True) @property def filename(self): return self._filename def wordTitleStr( self, word: str, sample: str = "", _class: str = "", ) -> str: """ notes: - `word` needs to be escaped before passing - `word` can contain html code (multiple words, colors, etc) - if input format (reader) indicates that words are already included in definition (as title), this method will return empty string - depending on glossary's `sourceLang` or writing system of `word`, (or sample if given) either '' or '' will be used """ if self._defiHasWordTitle: return "" if not word: return "" if not sample: sample = word tag = self._getTitleTag(sample) if _class: return f'<{tag} class="{_class}">{word}
</{tag}>' return f'<{tag}>{word}</{tag}>
    ' def getConfig(self, name: str, default: "Optional[str]") -> "Optional[str]": return self._config.get(name, default) def addEntryObj(self, entry: Entry) -> None: self._data.append(entry) def newEntry( self, word: str, defi: str, defiFormat: str = "", byteProgress: "Optional[Tuple[int, int]]" = None, ) -> "Entry": """ create and return a new entry object defiFormat must be empty or one of these: "m": plain text "h": html "x": xdxf """ if not defiFormat: defiFormat = self._defaultDefiFormat return Entry( word, defi, defiFormat=defiFormat, byteProgress=byteProgress, ) def newDataEntry(self, fname: str, data: bytes) -> "DataEntry": import uuid tmpPath = None if not self._readers: if self.tmpDataDir: tmpPath = join(self.tmpDataDir, fname.replace("/", "_")) else: os.makedirs(join(cacheDir, "tmp"), mode=0o700, exist_ok=True) self._cleanupPathList.add(join(cacheDir, "tmp")) tmpPath = join(cacheDir, "tmp", uuid.uuid1().hex) return DataEntry(fname, data, tmpPath=tmpPath) # ________________________________________________________________________# # def _hasWriteAccessToDir(self, dirPath: str) -> None: # if isdir(dirPath): # return os.access(dirPath, os.W_OK) # return os.access(dirname(dirPath), os.W_OK) def _createReader(self, format: str, options: "Dict[str, Any]") -> "Any": reader = self.plugins[format].readerClass(self) for name, value in options.items(): setattr(reader, f"_{name}", value) return reader def _setTmpDataDir(self, filename): # good thing about cacheDir is that we don't have to clean it up after # conversion is finished. # specially since dataEntry.save(...) will move the file from cacheDir # to the new directory (associated with output glossary path) # And we don't have to check for write access to cacheDir because it's # inside user's home dir. But input glossary might be in a directory # that we don't have write access to. 
# still maybe add a config key to decide if we should always use cacheDir # if self._hasWriteAccessToDir(f"{filename}_res", os.W_OK): # self.tmpDataDir = f"{filename}_res" # else: self.tmpDataDir = join(cacheDir, basename(filename) + "_res") log.debug(f"tmpDataDir = {self.tmpDataDir}") os.makedirs(self.tmpDataDir, mode=0o700, exist_ok=True) self._cleanupPathList.add(self.tmpDataDir) def read( self, filename: str, format: str = "", direct: bool = False, **kwargs ) -> bool: """ filename (str): name/path of input file format (str): name of input format, or "" to detect from file extension direct (bool): enable direct mode progressbar (bool): enable progressbar read-options can be passed as additional keyword arguments """ if type(filename) is not str: raise TypeError("filename must be str") if format is not None and type(format) is not str: raise TypeError("format must be str") # don't allow direct=False when there are readers # (read is called before with direct=True) if self._readers and not direct: raise ValueError( f"there are already {len(self._readers)} readers" f", you can not read with direct=False mode" ) return self._read( filename=filename, format=format, direct=direct, **kwargs ) def _read( self, filename: str, format: str = "", direct: bool = False, progressbar: bool = True, **options ) -> bool: filename = abspath(filename) self._setTmpDataDir(filename) ### inputArgs = self.detectInputFormat(filename, format=format) if inputArgs is None: return False origFilename = filename filename, format, compression = inputArgs if compression: from pyglossary.compression import uncompress uncompress(origFilename, filename, compression) validOptionKeys = list(self.formatsReadOptions[format].keys()) for key in list(options.keys()): if key not in validOptionKeys: log.error( f"Invalid read option {key!r} " f"given for {format} format" ) del options[key] filenameNoExt, ext = splitext(filename) if not ext.lower() in self.plugins[format].extensions: filenameNoExt = filename self._filename = filenameNoExt if not self._info.get(c_name): self._info[c_name] = split(filename)[1] self._progressbar = progressbar self.updateEntryFilters() reader = self._createReader(format, options) try: reader.open(filename) except FileNotFoundError as e: log.critical(str(e)) return False except Exception: log.exception("") return False self._readersOpenArgs[reader] = (filename, options) self.prepareEntryFilters() hasTitleStr = self._info.get("definition_has_headwords", "") if hasTitleStr: if hasTitleStr.lower() == "true": self._defiHasWordTitle = True else: log.error(f"bad info value: definition_has_headwords={hasTitleStr!r}") self._readers.append(reader) if not direct: self._inactivateDirectMode() self._updateIter() self.detectLangsFromName() return True def loadReader(self, reader: "Any") -> None: """ iterates over `reader` object and loads the whole data into self._data must call `reader.open(filename)` before calling this function """ showMemoryUsage() self.progressInit("Reading") try: for entry in self._applyEntryFiltersGen(reader): self.addEntryObj(entry) finally: reader.close() self.progressEnd() log.trace(f"Loaded {len(self._data)} entries") showMemoryUsage() def _inactivateDirectMode(self) -> None: """ loads all of `self._readers` into `self._data` closes readers and sets self._readers to [] """ for reader in self._readers: self.loadReader(reader) self._readers = [] def _updateIter(self) -> None: """ updates self._iter depending on: 1- Wheather or not direct mode is On (self._readers not empty) or Off 
(self._readers empty) """ if not self._readers: # indirect mode self._iter = self._loadedEntryGen() return # direct mode self._iter = self._readersEntryGen() def updateIter(self): if self._readers: raise RuntimeError("can not call this while having a reader") self._updateIter() def sortWords( self, sortKeyName: "str" = "headword_lower", sortEncoding: "str" = "utf-8", writeOptions: "Optional[Dict[str, Any]]" = None, ) -> None: """ sortKeyName: see doc/sort-key.md """ if self._readers: raise NotImplementedError( "can not use sortWords in direct mode" ) if self._sqlite: raise NotImplementedError( "can not use sortWords in SQLite mode" ) namedSortKey = namedSortKeyByName.get(sortKeyName) if namedSortKey is None: log.critical(f"invalid sortKeyName = {sortKeyName!r}") return if not sortEncoding: sortEncoding = "utf-8" if writeOptions is None: writeOptions = {} t0 = now() self._data.setSortKey( namedSortKey=namedSortKey, sortEncoding=sortEncoding, writeOptions=writeOptions, ) self._data.sort() log.info(f"Sorting took {now() - t0:.1f} seconds") self._sort = True self._updateIter() def _createWriter( self, format: str, options: "Dict[str, Any]", ) -> "Any": validOptions = self.formatsWriteOptions.get(format) if validOptions is None: log.critical(f"No write support for {format!r} format") return validOptionKeys = list(validOptions.keys()) for key in list(options.keys()): if key not in validOptionKeys: log.error( f"Invalid write option {key!r}" f" given for {format} format" ) del options[key] writer = self.plugins[format].writerClass(self) for name, value in options.items(): setattr(writer, f"_{name}", value) return writer def write( self, filename: str, format: str, **kwargs ) -> "Optional[str]": """ filename (str): file name or path to write format (str): format name sort (bool): True (enable sorting), False (disable sorting), None (auto, get from UI) sortKeyName (str or None): key function name for sorting sortEncoding (str or None): encoding for sorting, default utf-8 You can pass write-options (of given format) as keyword arguments returns absolute path of output file, or None if failed """ if type(filename) is not str: raise TypeError("filename must be str") if format is not None and type(format) is not str: raise TypeError("format must be str") return self._write( filename=filename, format=format, **kwargs ) def _write( self, filename: str, format: str, sort: "Optional[bool]" = None, **options ) -> "Optional[str]": filename = abspath(filename) if format not in self.plugins or not self.plugins[format].canWrite: log.critical(f"No Writer class found for plugin {format}") return plugin = self.plugins[format] if self._readers and sort: log.warning( f"Full sort enabled, falling back to indirect mode" ) self._inactivateDirectMode() log.info(f"Writing to {format} file {filename!r}") writer = self._createWriter(format, options) self._sort = sort if sort: t0 = now() self._data.sort() log.info(f"Sorting took {now() - t0:.1f} seconds") self._updateIter() try: writer.open(filename) except FileNotFoundError as e: log.critical(str(e)) return False except Exception: log.exception("") return False showMemoryUsage() writerList = [writer] try: genList = [] gen = writer.write() if gen is None: log.error(f"{format} write function is not a generator") else: genList.append(gen) if self._config.get("save_info_json", False): infoWriter = self._createWriter("Info", {}) filenameNoExt, _, _, _ = splitFilenameExt(filename) infoWriter.open(f"{filenameNoExt}.info") genList.append(infoWriter.write()) 
writerList.append(infoWriter) for gen in genList: gen.send(None) for entry in self: for gen in genList: gen.send(entry) for gen in genList: try: gen.send(None) except StopIteration: pass except FileNotFoundError as e: log.critical(str(e)) return except Exception: log.exception("Exception while calling plugin\'s write function") return finally: showMemoryUsage() log.debug("Running writer.finish()") for writer in writerList: writer.finish() self.clear() showMemoryUsage() return filename def _compressOutput(self, filename: str, compression: str) -> str: from pyglossary.compression import compress return compress(self, filename, compression) def _switchToSQLite( self, inputFilename: str, outputFormat: str, ) -> bool: from pyglossary.sq_entry_list import SqEntryList sq_fpath = join(cacheDir, f"{basename(inputFilename)}.db") if isfile(sq_fpath): log.info(f"Removing and re-creating {sq_fpath!r}") os.remove(sq_fpath) self._data = SqEntryList( self, sq_fpath, create=True, persist=True, ) self._rawEntryCompress = False self._cleanupPathList.add(sq_fpath) if not self.alts: log.warning( f"SQLite mode only works with enable_alts=True" f", force-enabling it." ) self._config["enable_alts"] = True self._sqlite = True def _resolveConvertSortParams( self, sort: "Optional[bool]", sortKeyName: "Optional[str]", sortEncoding: "Optional[str]", direct: "Optional[bool]", sqlite: "Optional[bool]", inputFilename: str, outputFormat: str, writeOptions: "Dict[str, Any]", ) -> "Optional[Tuple[bool, bool]]": """ sortKeyName: see doc/sort-key.md returns (sort, direct) or None if fails """ plugin = self.plugins[outputFormat] sortOnWrite = plugin.sortOnWrite if sortOnWrite == ALWAYS: if sort is False: log.warning( f"Writing {outputFormat} requires sorting" f", ignoring user sort=False option" ) sort = True elif sortOnWrite == DEFAULT_YES: if sort is None: sort = True elif sortOnWrite == DEFAULT_NO: if sort is None: sort = False elif sortOnWrite == NEVER: if sort: log.warning( "Plugin prevents sorting before write" + ", ignoring user sort=True option" ) sort = False if direct and sqlite: raise ValueError(f"Conflictng arguments: direct={direct}, sqlite={sqlite}") if not sort: if direct is None: direct = True return direct, False direct = False # from this point, sort == True and direct == False writerSortKeyName = plugin.sortKeyName namedSortKey = None writerSortEncoding = getattr(plugin, "sortEncoding", None) if sqlite is None: sqlite = sort and self._config.get("auto_sqlite", True) if sqlite: log.info( "Automatically switching to SQLite mode" f" for writing {outputFormat}" ) if sortOnWrite == ALWAYS: if writerSortKeyName: if sortKeyName and sortKeyName != writerSortKeyName: log.warning( f"Ignoring user-defined sort order {sortKeyName!r}" f", and using sortKey function from {outputFormat} plugin" ) sortKeyName = writerSortKeyName else: log.critical(f"No sortKeyName was found in plugin") return None if writerSortEncoding: sortEncoding = writerSortEncoding elif not sortKeyName: if writerSortKeyName: sortKeyName = writerSortKeyName else: sortKeyName = defaultSortKeyName namedSortKey = namedSortKeyByName.get(sortKeyName) if namedSortKey is None: log.critical(f"invalid sortKeyName = {sortKeyName!r}") return None log.info(f"Using sortKeyName = {namedSortKey.name!r}") if sqlite: self._switchToSQLite( inputFilename=inputFilename, outputFormat=outputFormat, ) if not sortEncoding: sortEncoding = "utf-8" if writeOptions is None: writeOptions = {} self._data.setSortKey( namedSortKey=namedSortKey, sortEncoding=sortEncoding, 
writeOptions=writeOptions, ) return False, True def convert( self, inputFilename: str, inputFormat: str = "", direct: "Optional[bool]" = None, progressbar: bool = True, outputFilename: str = "", outputFormat: str = "", sort: "Optional[bool]" = None, sortKeyName: "Optional[str]" = None, sortEncoding: "Optional[str]" = None, readOptions: "Optional[Dict[str, Any]]" = None, writeOptions: "Optional[Dict[str, Any]]" = None, sqlite: "Optional[bool]" = None, infoOverride: "Optional[Dict[str, str]]" = None, ) -> "Optional[str]": """ returns absolute path of output file, or None if failed sortKeyName: name of sort key/algorithm defaults to `defaultSortKeyName` in glossary.py see doc/sort-key.md or sort_keys.py for other possible values sortEncoding: encoding/charset for sorting, default to utf-8 """ if type(inputFilename) is not str: raise TypeError("inputFilename must be str") if type(outputFilename) is not str: raise TypeError("outputFilename must be str") if inputFormat is not None and type(inputFormat) is not str: raise TypeError("inputFormat must be str") if outputFormat is not None and type(outputFormat) is not str: raise TypeError("outputFormat must be str") if not readOptions: readOptions = {} if not writeOptions: writeOptions = {} if outputFilename == inputFilename: log.critical(f"Input and output files are the same") return if readOptions: log.info(f"readOptions = {readOptions}") if writeOptions: log.info(f"writeOptions = {writeOptions}") outputArgs = self.detectOutputFormat( filename=outputFilename, format=outputFormat, inputFilename=inputFilename, ) if not outputArgs: log.critical(f"Writing file {outputFilename!r} failed.") return outputFilename, outputFormat, compression = outputArgs del outputArgs if isdir(outputFilename): log.critical(f"Directory already exists: {outputFilename}") return sortParams = self._resolveConvertSortParams( sort=sort, sortKeyName=sortKeyName, sortEncoding=sortEncoding, direct=direct, sqlite=sqlite, inputFilename=inputFilename, outputFormat=outputFormat, writeOptions=writeOptions, ) if sortParams is None: return direct, sort = sortParams del sqlite showMemoryUsage() tm0 = now() if not self._read( inputFilename, format=inputFormat, direct=direct, progressbar=progressbar, **readOptions ): log.critical(f"Reading file {inputFilename!r} failed.") self.cleanup() return del inputFilename, inputFormat, direct, readOptions log.info("") if infoOverride: for key, value in infoOverride.items(): self.setInfo(key, value) finalOutputFile = self._write( outputFilename, outputFormat, sort=sort, **writeOptions ) log.info("") if not finalOutputFile: log.critical(f"Writing file {outputFilename!r} failed.") self._closeReaders() self.cleanup() return if compression: finalOutputFile = self._compressOutput(finalOutputFile, compression) log.info(f"Writing file {finalOutputFile!r} done.") log.info(f"Running time of convert: {now()-tm0:.1f} seconds") showMemoryUsage() self.cleanup() return finalOutputFile # ________________________________________________________________________# def progressInit(self, *args) -> None: if self.ui and self._progressbar: self.ui.progressInit(*args) def progress(self, pos: int, total: int, unit: str = "entries") -> None: if total == 0: log.warning(f"pos={pos}, total={total}") return self.ui.progress( min(pos + 1, total) / total, f"{pos:,} / {total:,} {unit}", ) def progressEnd(self) -> None: if self.ui and self._progressbar: self.ui.progressEnd() # ________________________________________________________________________# @classmethod def init( cls, 
usePluginsJson: bool = True, skipDisabledPlugins: bool = True, ): """ Glossary.init() must be called only once, so make sure you put it in the right place. Probably in the top of your program's main function or module. """ cls.readFormats = [] cls.writeFormats = [] pluginsJsonPath = join(dataDir, "plugins-meta", "index.json") # even if usePluginsJson, we should still call loadPlugins to load # possible new plugins that are not in json file if usePluginsJson: cls.loadPluginsFromJson(pluginsJsonPath) cls.loadPlugins(pluginsDir, skipDisabled=skipDisabledPlugins) if isdir(userPluginsDir): cls.loadPlugins(userPluginsDir) os.makedirs(cacheDir, mode=0o700, exist_ok=True) pyglossary-4.5.0/pyglossary/glossary_info.py000066400000000000000000000130301417733132500214030ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . import logging from collections import OrderedDict as odict from .info import * from .text_utils import ( fixUtf8, ) from .langs import langDict, Lang log = logging.getLogger("pyglossary") class GlossaryInfo(object): def __init__(self): self._info = odict() def infoKeys(self) -> "List[str]": return list(self._info.keys()) # def formatInfoKeys(self, format: str):# FIXME def iterInfo(self) -> "Iterator[Tuple[str, str]]": return self._info.items() def getInfo(self, key: str) -> str: if not isinstance(key, str): raise TypeError(f"invalid key={key!r}, must be str") return self._info.get( infoKeysAliasDict.get(key.lower(), key), "", ) def setInfo(self, key: str, value: "Optional[str]") -> None: if value is None: try: del self._info[key] except KeyError: pass return if not isinstance(key, str): raise TypeError(f"invalid key={key!r}, must be str") key = fixUtf8(key) value = fixUtf8(str(value)) key = infoKeysAliasDict.get(key.lower(), key) self._info[key] = value def getExtraInfos(self, excludeKeys: "List[str]") -> "odict": """ excludeKeys: a list of (basic) info keys to be excluded returns an OrderedDict including the rest of info keys, with associated values """ excludeKeySet = set() for key in excludeKeys: excludeKeySet.add(key) key2 = infoKeysAliasDict.get(key.lower()) if key2: excludeKeySet.add(key2) extra = odict() for key, value in self._info.items(): if key in excludeKeySet: continue extra[key] = value return extra @property def author(self) -> str: for key in (c_author, c_publisher): value = self._info.get(key, "") if value: return value return "" def _getLangByStr(self, st) -> "Optional[Lang]": lang = langDict[st] if lang: return lang log.error(f"unknown language {st!r}") return def _getLangByInfoKey(self, key: str) -> "Optional[Lang]": st = self._info.get(key, "") if not st: return return self._getLangByStr(st) @property def sourceLang(self) -> "Optional[Lang]": return self._getLangByInfoKey(c_sourceLang) @property def 
targetLang(self) -> "Optional[Lang]": return self._getLangByInfoKey(c_targetLang) @sourceLang.setter def sourceLang(self, lang) -> None: if not isinstance(lang, Lang): raise TypeError(f"invalid lang={lang}, must be a Lang object") self._info[c_sourceLang] = lang.name @targetLang.setter def targetLang(self, lang) -> None: if not isinstance(lang, Lang): raise TypeError(f"invalid lang={lang}, must be a Lang object") self._info[c_targetLang] = lang.name @property def sourceLangName(self) -> str: lang = self.sourceLang if lang is None: return "" return lang.name @sourceLangName.setter def sourceLangName(self, langName: str) -> None: if not langName: self._info[c_sourceLang] = "" return lang = self._getLangByStr(langName) if lang is None: return self._info[c_sourceLang] = lang.name @property def targetLangName(self) -> str: lang = self.targetLang if lang is None: return "" return lang.name @targetLangName.setter def targetLangName(self, langName: str) -> None: if not langName: self._info[c_targetLang] = "" return lang = self._getLangByStr(langName) if lang is None: return self._info[c_targetLang] = lang.name def _getTitleTag(self, sample: str) -> str: from .langs.writing_system import getWritingSystemFromText ws = getWritingSystemFromText(sample) if ws and ws.name != "Latin": return ws.titleTag sourceLang = self.sourceLang if sourceLang: return sourceLang.titleTag return "b" def detectLangsFromName(self): """ extract sourceLang and targetLang from glossary name/title """ import re name = self._info.get(c_name) if not name: return if self._info.get(c_sourceLang): return langNames = [] def checkPart(part: str): for match in re.findall("\w\w\w*", part): # print(f"match = {match!r}") lang = langDict[match] if lang is None: continue langNames.append(lang.name) for part in re.split("-| to ", name): # print(f"part = {part!r}") checkPart(part) if len(langNames) >= 2: break if len(langNames) < 2: log.info( f"Failed to detect sourceLang and targetLang" f" from glossary name {name!r}" ) return if len(langNames) > 2: log.warning(f"detectLangsFromName: langNames = {langNames!r}") log.info( f"Detected sourceLang={langNames[0]!r}, " f"targetLang={langNames[1]!r} " f"from glossary name {name!r}" ) self.sourceLangName = langNames[0] self.targetLangName = langNames[1] def titleElement( self, hf: "lxml.etree.htmlfile", sample: str = "", ) -> "lxml.etree._FileWriterElement": return hf.element(self._getTitleTag(sample)) pyglossary-4.5.0/pyglossary/glossary_type.py000066400000000000000000000042741417733132500214430ustar00rootroot00000000000000# -*- coding: utf-8 -*- from .entry_base import BaseEntry from .entry import Entry, DataEntry from .langs import Lang class GlossaryType(object): """ an abstract type class for Glossary class in plugins. 
it only contains methods and properties that might be used in plugins """ def setDefaultDefiFormat(self, defiFormat: str) -> None: raise NotImplementedError def getDefaultDefiFormat(self) -> str: raise NotImplementedError def collectDefiFormat( self, maxCount: int, ) -> "Optional[Dict[str, float]]": raise NotImplementedError def iterInfo(self) -> "Iterator[Tuple[str, str]]": raise NotImplementedError def getInfo(self, key: str) -> str: raise NotImplementedError def setInfo(self, key: str, value: str) -> None: raise NotImplementedError def getExtraInfos(self, excludeKeys: "List[str]") -> "OrderedDict": raise NotImplementedError @property def author(self) -> str: raise NotImplementedError @property def alts(self) -> bool: raise NotImplementedError @property def filename(self): raise NotImplementedError @property def sourceLang(self) -> "Optional[Lang]": raise NotImplementedError @property def targetLang(self) -> "Optional[Lang]": raise NotImplementedError @property def sourceLangName(self) -> str: raise NotImplementedError @sourceLangName.setter def sourceLangName(self, langName: str) -> None: raise NotImplementedError @property def targetLangName(self) -> str: raise NotImplementedError @targetLangName.setter def targetLangName(self, langName: str) -> None: raise NotImplementedError def titleElement( self, hf: "lxml.etree.htmlfile", sample: str = "", ) -> "lxml.etree._FileWriterElement": raise NotImplementedError def wordTitleStr( self, word: str, sample: str = "", _class: str = "", ) -> str: raise NotImplementedError def getConfig(self, name: str, default: "Optional[str]") -> "Optional[str]": raise NotImplementedError def addEntryObj(self, entry: "Entry") -> None: raise NotImplementedError def newEntry(self, word: str, defi: str, defiFormat: str = "") -> "Entry": raise NotImplementedError def newDataEntry(self, fname: str, data: bytes) -> DataEntry: raise NotImplementedError pyglossary-4.5.0/pyglossary/glossary_utils.py000066400000000000000000000047751417733132500216300ustar00rootroot00000000000000# -*- coding: utf-8 -*- # glossary_utils.py # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
import os from os.path import ( split, splitext, ) import subprocess import logging from .compression import ( stdCompressions, ) from .entry import Entry log = logging.getLogger("pyglossary") class EntryList(object): def __init__(self, glos): self._l = [] self._glos = glos self._sortKey = None def append(self, entry): self._l.append(entry.getRaw(self._glos)) def insert(self, pos, entry): self._l.insert(pos, entry.getRaw(self._glos)) def clear(self): self._l.clear() def __len__(self): return len(self._l) def __iter__(self): glos = self._glos for rawEntry in self._l: yield Entry.fromRaw( glos, rawEntry, defaultDefiFormat=glos._defaultDefiFormat, ) def setSortKey( self, namedSortKey: "NamedSortKey", sortEncoding: "Optional[str]", writeOptions: "Dict[str, Any]", ): sortKey = namedSortKey.normal(sortEncoding, **writeOptions) self._sortKey = Entry.getRawEntrySortKey(self._glos, sortKey) def sort(self): if self._sortKey is None: raise ValueError("EntryList.sort: sortKey is not set") self._l.sort(key=self._sortKey) def close(self): pass def splitFilenameExt( filename: str = "", ) -> "Tuple[str, str, str]": """ returns (filenameNoExt, ext, compression) """ compression = "" filenameNoExt, ext = splitext(filename) ext = ext.lower() if not ext and len(filenameNoExt) < 5: filenameNoExt, ext = "", filenameNoExt if not ext: return filename, filename, "", "" if ext[1:] in stdCompressions + ("zip", "dz"): compression = ext[1:] filename = filenameNoExt filenameNoExt, ext = splitext(filename) ext = ext.lower() return filenameNoExt, filename, ext, compression pyglossary-4.5.0/pyglossary/gregorian.py000066400000000000000000000051711417733132500205110ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Copyright © 2008-2019 Saeed Rasooli # Copyright © 2007 Mehdi Bayazee # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. If not, see . 
# Also avalable in /usr/share/common-licenses/GPL on Debian systems # or /usr/share/licenses/common/GPL3/license.txt on ArchLinux # Gregorian calendar: # http://en.wikipedia.org/wiki/Gregorian_calendar from datetime import datetime name = "gregorian" desc = "Gregorian" monthName = ( "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", ) monthNameAb = ( "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", ) epoch = 1721426 options = () def save() -> None: pass def isLeap(y: int) -> bool: return y % 4 == 0 and not (y % 100 == 0 and y % 400 != 0) def to_jd(year: int, month: int, day: int) -> int: if 0 < year < 10000: # > 1.5x faster return datetime(year, month, day).toordinal() + 1721425 if month <= 2: tm = 0 elif isLeap(year): tm = -1 else: tm = -2 return ( epoch - 1 + 365 * (year - 1) + (year - 1) // 4 + -((year - 1) // 100) + (year - 1) // 400 + (367 * month - 362) // 12 + tm + day ) def jd_to(jd: int) -> "Tuple[int, int, int]": ordinal = int(jd) - 1721425 if 0 < ordinal < 3652060: # > 4x faster # datetime(9999, 12, 31).toordinal() == 3652059 dt = datetime.fromordinal(ordinal) return (dt.year, dt.month, dt.day) # wjd = floor(jd - 0.5) + 0.5 qc, dqc = divmod(jd - epoch, 146097) # qc ~~ quadricent cent, dcent = divmod(dqc, 36524) quad, dquad = divmod(dcent, 1461) yindex = dquad // 365 # divmod(dquad, 365)[0] year = ( qc * 400 + cent * 100 + quad * 4 + yindex + (cent != 4 and yindex != 4) ) yearday = jd - to_jd(year, 1, 1) if jd < to_jd(year, 3, 1): leapadj = 0 elif isLeap(year): leapadj = 1 else: leapadj = 2 month = ((yearday + leapadj) * 12 + 373) // 367 day = jd - to_jd(year, month, 1) + 1 return int(year), int(month), int(day) pyglossary-4.5.0/pyglossary/html_utils.py000066400000000000000000000206321417733132500207170ustar00rootroot00000000000000# -*- coding: utf-8 -*- import re import logging log = logging.getLogger("pyglossary") def toStr(s: "AnyStr") -> str: return str(s, "utf-8") if isinstance(s, bytes) else str(s) re_entity = re.compile( r"&#?\w+;", ) special_chars = { "<", ">", "&", '"', "'", "\xa0", # " " or " " } # these are not included in html.entities.name2codepoint name2codepoint_extra = { "itilde": 0x0129, # ĩ "utilde": 0x0169, # ũ "uring": 0x016f, # ů "ycirc": 0x0177, # ŷ "wring": 0x1e98, # ẘ "yring": 0x1e99, # ẙ "etilde": 0x1ebd, # ẽ "ygrave": 0x1ef3, # ỳ "ytilde": 0x1ef9, # ỹ "ldash": 0x2013, # – "frac13": 0x2153, # ⅓ "xfrac13": 0x2153, # ⅓ "frac23": 0x2154, # ⅔ } # Use build_name2codepoint_dict function to update this dictionary name2codepoint = { "Aacute": 0x00c1, # Á "aacute": 0x00e1, # á "Acirc": 0x00c2, #  "acirc": 0x00e2, # â "acute": 0x00b4, # ´ "AElig": 0x00c6, # Æ "aelig": 0x00e6, # æ "Agrave": 0x00c0, # À "agrave": 0x00e0, # à "alefsym": 0x2135, # ℵ "Alpha": 0x0391, # Α "alpha": 0x03b1, # α "amp": 0x0026, # & "and": 0x2227, # ∧ "ang": 0x2220, # ∠ "Aring": 0x00c5, # Å "aring": 0x00e5, # å "asymp": 0x2248, # ≈ "Atilde": 0x00c3, # à "atilde": 0x00e3, # ã "Auml": 0x00c4, # Ä "auml": 0x00e4, # ä "bdquo": 0x201e, # „ "Beta": 0x0392, # Β "beta": 0x03b2, # β "brvbar": 0x00a6, # ¦ "bull": 0x2022, # • "cap": 0x2229, # ∩ "Ccedil": 0x00c7, # Ç "ccedil": 0x00e7, # ç "cedil": 0x00b8, # ¸ "cent": 0x00a2, # ¢ "Chi": 0x03a7, # Χ "chi": 0x03c7, # χ "circ": 0x02c6, # ˆ "clubs": 0x2663, # ♣ "cong": 0x2245, # ≅ "copy": 0x00a9, # © "crarr": 0x21b5, # ↵ "cup": 0x222a, # ∪ "curren": 0x00a4, # ¤ "Dagger": 0x2021, # ‡ "dagger": 0x2020, # † "dArr": 0x21d3, # ⇓ "darr": 0x2193, # ↓ "deg": 
0x00b0, # ° "Delta": 0x0394, # Δ "delta": 0x03b4, # δ "diams": 0x2666, # ♦ "divide": 0x00f7, # ÷ "Eacute": 0x00c9, # É "eacute": 0x00e9, # é "Ecirc": 0x00ca, # Ê "ecirc": 0x00ea, # ê "Egrave": 0x00c8, # È "egrave": 0x00e8, # è "empty": 0x2205, # ∅ "emsp": 0x2003, #   "ensp": 0x2002, #   "Epsilon": 0x0395, # Ε "epsilon": 0x03b5, # ε "equiv": 0x2261, # ≡ "Eta": 0x0397, # Η "eta": 0x03b7, # η "ETH": 0x00d0, # Ð "eth": 0x00f0, # ð "etilde": 0x1ebd, # ẽ "Euml": 0x00cb, # Ë "euml": 0x00eb, # ë "euro": 0x20ac, # € "exist": 0x2203, # ∃ "fnof": 0x0192, # ƒ "forall": 0x2200, # ∀ "frac12": 0x00bd, # ½ "frac13": 0x2153, # ⅓ "frac14": 0x00bc, # ¼ "frac23": 0x2154, # ⅔ "frac34": 0x00be, # ¾ "frasl": 0x2044, # ⁄ "Gamma": 0x0393, # Γ "gamma": 0x03b3, # γ "ge": 0x2265, # ≥ "gt": 0x003e, # > "hArr": 0x21d4, # ⇔ "harr": 0x2194, # ↔ "hearts": 0x2665, # ♥ "hellip": 0x2026, # … "Iacute": 0x00cd, # Í "iacute": 0x00ed, # í "Icirc": 0x00ce, # Î "icirc": 0x00ee, # î "iexcl": 0x00a1, # ¡ "Igrave": 0x00cc, # Ì "igrave": 0x00ec, # ì "image": 0x2111, # ℑ "infin": 0x221e, # ∞ "int": 0x222b, # ∫ "Iota": 0x0399, # Ι "iota": 0x03b9, # ι "iquest": 0x00bf, # ¿ "isin": 0x2208, # ∈ "itilde": 0x0129, # ĩ "Iuml": 0x00cf, # Ï "iuml": 0x00ef, # ï "Kappa": 0x039a, # Κ "kappa": 0x03ba, # κ "Lambda": 0x039b, # Λ "lambda": 0x03bb, # λ "lang": 0x2329, # 〈 "laquo": 0x00ab, # « "lArr": 0x21d0, # ⇐ "larr": 0x2190, # ← "lceil": 0x2308, # ⌈ "ldash": 0x2013, # – "ldquo": 0x201c, # “ "le": 0x2264, # ≤ "lfloor": 0x230a, # ⌊ "lowast": 0x2217, # ∗ "loz": 0x25ca, # ◊ "lrm": 0x200e, # ‎ "lsaquo": 0x2039, # ‹ "lsquo": 0x2018, # ‘ "lt": 0x003c, # < "macr": 0x00af, # ¯ "mdash": 0x2014, # — "micro": 0x00b5, # µ "middot": 0x00b7, # · "minus": 0x2212, # − "Mu": 0x039c, # Μ "mu": 0x03bc, # μ "nabla": 0x2207, # ∇ "nbsp": 0x00a0, #   "ndash": 0x2013, # – "ne": 0x2260, # ≠ "ni": 0x220b, # ∋ "not": 0x00ac, # ¬ "notin": 0x2209, # ∉ "nsub": 0x2284, # ⊄ "Ntilde": 0x00d1, # Ñ "ntilde": 0x00f1, # ñ "Nu": 0x039d, # Ν "nu": 0x03bd, # ν "Oacute": 0x00d3, # Ó "oacute": 0x00f3, # ó "Ocirc": 0x00d4, # Ô "ocirc": 0x00f4, # ô "OElig": 0x0152, # Œ "oelig": 0x0153, # œ "Ograve": 0x00d2, # Ò "ograve": 0x00f2, # ò "oline": 0x203e, # ‾ "Omega": 0x03a9, # Ω "omega": 0x03c9, # ω "Omicron": 0x039f, # Ο "omicron": 0x03bf, # ο "oplus": 0x2295, # ⊕ "or": 0x2228, # ∨ "ordf": 0x00aa, # ª "ordm": 0x00ba, # º "Oslash": 0x00d8, # Ø "oslash": 0x00f8, # ø "Otilde": 0x00d5, # Õ "otilde": 0x00f5, # õ "otimes": 0x2297, # ⊗ "Ouml": 0x00d6, # Ö "ouml": 0x00f6, # ö "para": 0x00b6, # ¶ "part": 0x2202, # ∂ "permil": 0x2030, # ‰ "perp": 0x22a5, # ⊥ "Phi": 0x03a6, # Φ "phi": 0x03c6, # φ "Pi": 0x03a0, # Π "pi": 0x03c0, # π "piv": 0x03d6, # ϖ "plusmn": 0x00b1, # ± "pound": 0x00a3, # £ "Prime": 0x2033, # ″ "prime": 0x2032, # ′ "prod": 0x220f, # ∏ "prop": 0x221d, # ∝ "Psi": 0x03a8, # Ψ "psi": 0x03c8, # ψ "quot": 0x0022, # " "radic": 0x221a, # √ "rang": 0x232a, # 〉 "raquo": 0x00bb, # » "rArr": 0x21d2, # ⇒ "rarr": 0x2192, # → "rceil": 0x2309, # ⌉ "rdquo": 0x201d, # ” "real": 0x211c, # ℜ "reg": 0x00ae, # ® "rfloor": 0x230b, # ⌋ "Rho": 0x03a1, # Ρ "rho": 0x03c1, # ρ "rlm": 0x200f, # ‏ "rsaquo": 0x203a, # › "rsquo": 0x2019, # ’ "sbquo": 0x201a, # ‚ "Scaron": 0x0160, # Š "scaron": 0x0161, # š "sdot": 0x22c5, # ⋅ "sect": 0x00a7, # § "shy": 0x00ad, # ­ "Sigma": 0x03a3, # Σ "sigma": 0x03c3, # σ "sigmaf": 0x03c2, # ς "sim": 0x223c, # ∼ "spades": 0x2660, # ♠ "sub": 0x2282, # ⊂ "sube": 0x2286, # ⊆ "sum": 0x2211, # ∑ "sup": 0x2283, # ⊃ "sup1": 0x00b9, # ¹ "sup2": 0x00b2, # ² "sup3": 0x00b3, # ³ "supe": 0x2287, # ⊇ 
"szlig": 0x00df, # ß "Tau": 0x03a4, # Τ "tau": 0x03c4, # τ "there4": 0x2234, # ∴ "Theta": 0x0398, # Θ "theta": 0x03b8, # θ "thetasym": 0x03d1, # ϑ "thinsp": 0x2009, #   "THORN": 0x00de, # Þ "thorn": 0x00fe, # þ "tilde": 0x02dc, # ˜ "times": 0x00d7, # × "trade": 0x2122, # ™ "Uacute": 0x00da, # Ú "uacute": 0x00fa, # ú "uArr": 0x21d1, # ⇑ "uarr": 0x2191, # ↑ "Ucirc": 0x00db, # Û "ucirc": 0x00fb, # û "Ugrave": 0x00d9, # Ù "ugrave": 0x00f9, # ù "uml": 0x00a8, # ¨ "upsih": 0x03d2, # ϒ "Upsilon": 0x03a5, # Υ "upsilon": 0x03c5, # υ "uring": 0x016f, # ů "utilde": 0x0169, # ũ "Uuml": 0x00dc, # Ü "uuml": 0x00fc, # ü "weierp": 0x2118, # ℘ "wring": 0x1e98, # ẘ "xfrac13": 0x2153, # ⅓ "Xi": 0x039e, # Ξ "xi": 0x03be, # ξ "Yacute": 0x00dd, # Ý "yacute": 0x00fd, # ý "ycirc": 0x0177, # ŷ "yen": 0x00a5, # ¥ "ygrave": 0x1ef3, # ỳ "yring": 0x1e99, # ẙ "ytilde": 0x1ef9, # ỹ "Yuml": 0x0178, # Ÿ "yuml": 0x00ff, # ÿ "Zeta": 0x0396, # Ζ "zeta": 0x03b6, # ζ "zwj": 0x200d, # ‍ "zwnj": 0x200c, # ‌ } def build_name2codepoint_dict(): """ Builds name to codepoint dictionary copy and paste the output to the name2codepoint dictionary name2str - name to utf-8 string dictionary """ import html.entities name2str = {} for k, v in name2codepoint_extra.items(): name2str[k] = chr(v) for k, v in html.entities.name2codepoint.items(): name2str[k] = chr(v) for key in sorted(name2str.keys(), key=lambda s: (s.lower(), s)): value = name2str[key] if len(value) > 1: raise ValueError(f"value = {value!r}") print(f"\t\"{key}\": 0x{ord(value):0>4x}, # {value}") def _sub_unescape_unicode(m: "re.Match") -> str: text = m.group(0) if text[:2] == "&#": # character reference if text.startswith("&#x"): code = int(text[3:-1], 16) else: code = int(text[2:-1]) try: char = chr(code) except ValueError: return text if char not in special_chars: return char return text # named entity name = text[1:-1] if name in name2codepoint: char = chr(name2codepoint[name]) if char not in special_chars: return char return text def unescape_unicode(text): """ unscape unicode entities, but not "<", ">" and "&" leave these 3 special entities alone, since unescaping them creates invalid html we also ignore quotations: """ and "'" """ return re_entity.sub(_sub_unescape_unicode, text) if __name__ == "__main__": build_name2codepoint_dict() pyglossary-4.5.0/pyglossary/image_utils.py000066400000000000000000000022651417733132500210370ustar00rootroot00000000000000import re import base64 import logging from os.path import join from pyglossary.text_utils import crc32hex log = logging.getLogger("pyglossary") re_inline_image = re.compile('src="(data:image/[^<>"]*)"') def extractInlineHtmlImages( defi: str, outDir: str, fnamePrefix: str = "", ) -> "Tuple[str, List[Tuple[str, str]]]": imageDataDict = {} # type: Dict[str, bytes] def subFunc(m: "Match"): nonlocal images src = m.group(1)[len("data:image/"):] i = src.find(";") if i < 0: log.error(f"no semicolon, bad inline img src: {src[:60]}...") return imgFormat, src = src[:i], src[i + 1:] if not src.startswith("base64,"): log.error(f"no 'base64,', bad inline img src: {src[:60]}...") return imgDataB64 = src[len("base64,"):] imgData = base64.b64decode(imgDataB64) imgFname = f"{fnamePrefix}{crc32hex(imgData)}.{imgFormat}" imageDataDict[imgFname] = imgData return f'src="./{imgFname}"' defi = re_inline_image.sub(subFunc, defi) images = [] for imgFname, imgData in imageDataDict.items(): imgPath = join(outDir, imgFname) with open(imgPath, mode="wb") as _file: _file.write(imgData) del imgData images.append((imgFname, imgPath)) return defi, images 
pyglossary-4.5.0/pyglossary/info.py000066400000000000000000000010411417733132500174570ustar00rootroot00000000000000c_name = "name" c_sourceLang = "sourceLang" c_targetLang = "targetLang" c_copyright = "copyright" c_author = "author" c_publisher = "publisher" infoKeysAliasDict = { "title": c_name, "bookname": c_name, "dbname": c_name, ## "sourcelang": c_sourceLang, "inputlang": c_sourceLang, "origlang": c_sourceLang, ## "targetlang": c_targetLang, "outputlang": c_targetLang, "destlang": c_targetLang, ## "license": c_copyright, ## # do not map "publisher" to "author" ## # are there alternatives to "creationTime" # and "lastUpdated"? } pyglossary-4.5.0/pyglossary/iter_utils.py000066400000000000000000000027251417733132500207210ustar00rootroot00000000000000# Copyright (c) 2019 Saeed Rasooli # Copyright (c) 2012 Erik Rose # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # from https://github.com/erikrose/more-itertools def unique_everseen(iterable): from itertools import filterfalse "List unique elements, preserving order. Remember all elements ever seen." 
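	# note (added): `seen` remembers every element yielded so far; binding
	# `seen.add` to the local name `seen_add` below avoids a repeated
	# attribute lookup on each iteration, the usual micro-optimization
	# in this itertools recipe.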
# unique_everseen('AAAABBBCCDAABBB') --> A B C D seen = set() seen_add = seen.add for element in filterfalse(seen.__contains__, iterable): seen_add(element) yield element pyglossary-4.5.0/pyglossary/json_utils.py000066400000000000000000000011721417733132500207220ustar00rootroot00000000000000import sys try: import json except ImportError: import simplejson as json from collections import OrderedDict JsonEncodable = "Union[Dict, List]" # OrderedDict is also subclass of Dict, issubclass(OrderedDict, Dict) is True def dataToPrettyJson( data: "JsonEncodable", ensure_ascii: bool = False, sort_keys: bool = False, ): return json.dumps( data, sort_keys=sort_keys, indent="\t", ensure_ascii=ensure_ascii, ) def jsonToData(st: "AnyStr") -> "JsonEncodable": return json.loads(st) def jsonToOrderedData(text: str) -> "OrderedDict": return json.JSONDecoder( object_pairs_hook=OrderedDict, ).decode(text) pyglossary-4.5.0/pyglossary/langs/000077500000000000000000000000001417733132500172625ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/langs/__init__.py000066400000000000000000000034751417733132500214040ustar00rootroot00000000000000 from pyglossary.core import rootDir from os.path import join import json import logging log = logging.getLogger("pyglossary") class Lang(object): def __init__( self, codes: "List[str]", names: "List[str]", titleTag: str = "b", rtl: int = 0, ): self._codes = codes self._names = names self._titleTag = titleTag self._rtl = rtl def __repr__(self) -> str: return ( f'Lang(' f'codes={self._codes!r}, ' f'names={self._names!r}, ' f'titleTag={self._titleTag!r}' f')' ) def __str__(self) -> str: return f"Lang({self._codes + self._names})" @property def codes(self) -> "List[str]": return self._codes @property def names(self) -> "List[str]": return self._names @property def name(self) -> str: return self._names[0] @property def code(self) -> str: return self._codes[0] @property def titleTag(self) -> str: return self._titleTag @property def rtl(self) -> int: return self._rtl class LangDict(dict): def load(self): from time import time as now if len(self) > 0: return t0 = now() filename = join(rootDir, "pyglossary", "langs", "langs.json") with open(filename, "r", encoding="utf-8") as _file: data = json.load(_file) for row in data: lang = Lang( codes=row["codes"], names=[row["name"]] + row["alt_names"], titleTag=row["title_tag"], rtl=row.get("rtl", 0), ) for key in lang.codes: if key in self: log.error(f"duplicate language code: {key}") self[key] = lang for name in lang.names: if name in self: log.error(f"duplicate language name: {name}") self[name.lower()] = lang log.debug(f"LangDict: loaded, {len(self)} keys, took {(now() - t0)*1000:.1f} ms") def __getitem__(self, key: str) -> "Optional[Lang]": self.load() return self.get(key.lower(), None) langDict = LangDict() pyglossary-4.5.0/pyglossary/langs/langs.json000066400000000000000000001306011417733132500212620ustar00rootroot00000000000000[ { "codes": ["aa", "aar"], "name": "Afar", "alt_names": ["Qafaraf", "’Afar Af", "Afaraf", "Qafar af"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Afar_language", "title_tag": "b" }, { "codes": ["ab", "abk"], "name": "Abkhaz", "alt_names": ["Abkhazian", "Abxaz", "Аҧсуа", "Аҧсуа бызшәа"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Abkhaz_language", "title_tag": "b" }, { "codes": ["ae", "ave"], "name": "Avestan", "alt_names": ["Zend", "اوستایی"], "rtl": 1, "script": ["Avestan"], "wiki": "https://en.wikipedia.org/wiki/Avestan", "title_tag": "b" }, { "codes": ["af", "afr"], 
"name": "Afrikaans", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Afrikaans", "title_tag": "b" }, { "codes": ["ain"], "name": "Ainu", "alt_names": ["Ainuic", "Aynu", "itak", "アィヌ・イタㇰ"], "script": ["CJK", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Ainu_language", "title_tag": "big" }, { "codes": ["aib"], "name": "Äynu", "alt_names": ["Aynu", "Ainu", "Aini", "Eynu", "Abdal", "Äynú", "ئەينۇ‎", "ئابدال"], "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/%C3%84ynu_language", "title_tag": "big" }, { "codes": ["ak", "aka"], "name": "Akan", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Akan_language", "title_tag": "b" }, { "codes": ["alg"], "name": "Algonquian", "alt_names": ["Algonkian"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Algonquian_languages", "title_tag": "b" }, { "codes": ["am", "amh"], "name": "Amharic", "alt_names": ["አማርኛ", "Amarəñña"], "script": ["Ge'ez"], "wiki": "https://en.wikipedia.org/wiki/Amharic", "title_tag": "big" }, { "codes": ["an", "arg"], "name": "Aragonese", "alt_names": ["Aragonés"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Aragonese_language", "title_tag": "b" }, { "codes": ["ar", "ara"], "name": "Arabic", "alt_names": ["اَلْعَرَبِيَّةُ", "العربیه", "عَرَبِيّ", "عربی"], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Arabic", "title_tag": "b" }, { "codes": ["arc", "syc"], "name": "Aramaic", "alt_names": ["Classical Syriac", "ܐܪܡܝܐ"], "rtl": 1, "script": ["Syriac"], "wiki": "https://en.wikipedia.org/wiki/Aramaic", "title_tag": "b" }, { "codes": ["arn"], "name": "Mapuche", "alt_names": ["Mapudungun"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Mapuche_language", "title_tag": "b" }, { "codes": ["as", "asm"], "name": "Assamese", "alt_names": ["Asamiya", "অসমীয়া"], "script": ["Bengali-Assamese"], "wiki": "https://en.wikipedia.org/wiki/Assamese_language", "title_tag": "big" }, { "codes": ["av", "ava", "aya"], "name": "Avar", "alt_names": ["Avaric", "Авар", "Awar"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Avar_language", "title_tag": "b" }, { "codes": ["ay", "aym"], "name": "Aymara", "alt_names": ["Aymar aru", "Aymaran"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Aymara_language", "title_tag": "b" }, { "codes": ["az", "aze"], "name": "Azerbaijani", "alt_names": ["Azeri"], "script": ["Latin", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Azerbaijani_language", "title_tag": "b" }, { "codes": ["ba", "bak"], "name": "Bashkir", "alt_names": ["Башҡортса‎", "Башҡорт теле", "Башорца", "Башкирский"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Bashkir_language", "title_tag": "b" }, { "codes": ["be", "bel"], "name": "Belarusian", "alt_names": ["Беларуская", "Белорусский"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Belarusian_language", "title_tag": "b" }, { "codes": ["bg", "bul"], "name": "Bulgarian", "alt_names": ["български", "български език", "Bǎlgarski"], "script": ["Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Bulgarian_language", "title_tag": "b" }, { "codes": ["bh", "bih"], "name": "Bihari", "alt_names": ["Bihari languages", "बिहारी"], "script": ["Devanagari"], "wiki": "https://en.wikipedia.org/wiki/Bihari_languages", "title_tag": "big" }, { "codes": ["bi", "bis"], "name": "Bislama", "alt_names": ["Bichelamar"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Bislama", "title_tag": "b" }, { "codes": ["bm", 
"bam"], "name": "Bambara", "alt_names": ["Bamanankan", "ߓߡߊߣߊ߲", "ߓߡߊߣߊ߲ߞߊ߲"], "script": ["Latin", "N'Ko"], "wiki": "https://en.wikipedia.org/wiki/Bambara_language", "title_tag": "big" }, { "codes": ["bn", "ben"], "name": "Bengali", "alt_names": ["বাংলা"], "script": ["Bengali-Assamese"], "wiki": "https://en.wikipedia.org/wiki/Bengali_language", "title_tag": "big" }, { "codes": ["bnt"], "name": "Bantu", "alt_names": ["*bantʊ̀", "bantʊ̀"], "script": ["Latin", "Arabic", "Mandombe"], "wiki": "https://en.wikipedia.org/wiki/Bantu_languages", "title_tag": "b" }, { "codes": ["bo", "tib", "bod"], "name": "Tibetan", "alt_names": ["Standard Tibetan"], "script": ["Tibetan"], "wiki": "https://en.wikipedia.org/wiki/Standard_Tibetan", "title_tag": "b" }, { "codes": ["br", "bre"], "name": "Breton", "alt_names": ["Brezhoneg"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Breton_language", "title_tag": "b" }, { "codes": ["bs", "bos"], "name": "Bosnian", "alt_names": ["Bosanski", "Босански"], "script": ["Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Bosnian_language", "title_tag": "b" }, { "codes": ["ca", "cat"], "name": "Catalan", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Catalan_language", "title_tag": "b" }, { "codes": ["ce", "che"], "name": "Chechen", "alt_names": ["нохчийн", "нохчийн мотт"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Chechen_language", "title_tag": "b" }, { "codes": ["ch", "cha"], "name": "Chamorro", "alt_names": ["Chamoru", "CHamoru"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Chamorro_language", "title_tag": "b" }, { "codes": ["chn"], "name": "Chinook Jargon", "alt_names": ["Chinuk Wawa", "Chinook Wawa", "wawa", "chinook lelang", "lelang", "chinook"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Chinook_Jargon", "title_tag": "b" }, { "codes": ["co", "cos"], "name": "Corsican", "alt_names": ["Corsa"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Corsican_language", "title_tag": "b" }, { "codes": ["cr", "cre"], "name": "Cree", "alt_names": [], "script": ["Canadian syllabic"], "wiki": "https://en.wikipedia.org/wiki/Cree", "title_tag": "b" }, { "codes": ["cs", "cze", "ces"], "name": "Czech", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Czech_language", "title_tag": "b" }, { "codes": ["cu", "chu"], "name": "Church Slavonic", "alt_names": ["Church Slavic", "New Church Slavonic", "New Church Slavic", "црькъвьнословѣньскъ ѩзыкъ"], "script": ["Glagolitic", "Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Church_Slavonic", "title_tag": "b" }, { "codes": ["cv", "chv"], "name": "Chuvash", "alt_names": ["Căvašla", "Çovaşla", "Чӑвашла"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Chuvash_language", "title_tag": "b" }, { "codes": ["cy", "wel", "cym"], "name": "Welsh", "alt_names": ["Cymraeg", "y Gymraeg"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Welsh_language", "title_tag": "b" }, { "codes": ["da", "dan"], "name": "Danish", "alt_names": ["dansk"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Danish_language", "title_tag": "b" }, { "codes": ["de", "ger", "deu"], "name": "German", "alt_names": ["Deutsch"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/German_language", "title_tag": "b" }, { "codes": ["dv", "div"], "name": "Maldivian", "alt_names": ["Dhivehi", "Divehi", "ދިވެހި"], "rtl": 1, "script": ["Thaana"], "wiki": 
"https://en.wikipedia.org/wiki/Maldivian_language", "title_tag": "b" }, { "codes": ["dz", "dzo"], "name": "Dzongkha", "alt_names": ["རྫོང་ཁ་"], "script": ["Tibetan"], "wiki": "https://en.wikipedia.org/wiki/Dzongkha", "title_tag": "big" }, { "codes": ["ee", "ewe"], "name": "Ewe", "alt_names": ["Èʋe", "Èʋegbe"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Ewe_language", "title_tag": "b" }, { "codes": ["el", "gre", "ell", "gr"], "name": "Greek", "alt_names": ["ελληνικά", "Elliniká"], "script": ["Greek"], "wiki": "https://en.wikipedia.org/wiki/Greek_language", "title_tag": "b" }, { "codes": ["grc"], "name": "Ancient Greek", "alt_names": ["Ἑλληνική", "Hellēnikḗ"], "script": ["Greek"], "wiki": "https://en.wikipedia.org/wiki/Ancient_Greek", "title_tag": "b" }, { "codes": ["en", "eng"], "name": "English", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/English_language", "title_tag": "b" }, { "codes": ["eo", "epo"], "name": "Esperanto", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Esperanto", "title_tag": "b" }, { "codes": ["es", "spa"], "name": "Spanish", "alt_names": ["español", "española"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Spanish_language", "title_tag": "b" }, { "codes": ["et", "est"], "name": "Estonian", "alt_names": ["eesti keel", "eesti"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Estonian_language", "title_tag": "b" }, { "codes": ["eu", "baq", "eus"], "name": "Basque", "alt_names": ["Euskara"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Basque_language", "title_tag": "b" }, { "codes": ["fa", "per", "fas", "prp"], "name": "Persian", "alt_names": ["Farsi", "فارسی", "Parsi", "Fārsī", "форсӣ", "Forsī", "Porsī"], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Persian_language", "title_tag": "b" }, { "codes": ["ff", "ful", "fuc", "fuf"], "name": "Fula", "alt_names": ["Fulani", "Fulah", "Fulfulde", "𞤊𞤵𞤤𞤬𞤵𞤤𞤣𞤫", "Pulaar", "𞤆𞤵𞤤𞤢𞥄𞤪", "Pular", "𞤆𞤵𞤤𞤢𞤪", "Peul"], "script": ["Latin", "Arabic", "Adlam"], "wiki": "https://en.wikipedia.org/wiki/Fula_language", "title_tag": "big" }, { "codes": ["fi", "fin"], "name": "Finnish", "alt_names": ["suomi", "suomen kieli"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Finnish_language", "title_tag": "b" }, { "codes": ["fil"], "name": "Filipino", "alt_names": ["Pilipino", "Wikang Filipino"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Filipino_language", "title_tag": "b" }, { "codes": ["fj", "fij"], "name": "Fijian", "alt_names": ["Na Vosa Vakaviti", "Na vosa vaka-Viti"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Fijian_language", "title_tag": "b" }, { "codes": ["fo", "fao"], "name": "Faroese", "alt_names": ["føroyskt mál"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Faroese_language", "title_tag": "b" }, { "codes": ["fr", "fre", "fra"], "name": "French", "alt_names": ["français", "langue française"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/French_language", "title_tag": "b" }, { "codes": ["fy", "fry"], "name": "West Frisian", "alt_names": ["Western Frisian", "Frisian"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/West_Frisian_languages", "title_tag": "b" }, { "codes": ["ga", "gle"], "name": "Irish", "alt_names": ["Gaeilge"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Irish_language", "title_tag": "b" }, { "codes": ["gd", "gla"], "name": "Scottish Gaelic", "alt_names": ["Gaelic", 
"Gàidhlig", "Scots Gaelic"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Scottish_Gaelic", "title_tag": "b" }, { "codes": ["gju"], "name": "Gurjari", "alt_names": ["Gujri", "गुर्जरी", "گُوجَری"], "rtl": 1, "script": ["Takri", "Arabic", "Devanagari"], "wiki": "https://en.wikipedia.org/wiki/Gujari_language", "title_tag": "big" }, { "codes": ["gl", "glg"], "name": "Galician", "alt_names": ["galego"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Galician_language", "title_tag": "b" }, { "codes": ["gn", "grn"], "name": "Guarani", "alt_names": ["Paraguayan Guarani", "avañeʼẽ"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Guarani_language", "title_tag": "b" }, { "codes": ["gu", "guj"], "name": "Gujarati", "alt_names": ["ગુજરાતી", "Gujarātī"], "script": ["Gujarati"], "wiki": "https://en.wikipedia.org/wiki/Gujarati_language", "title_tag": "big" }, { "codes": ["gv", "glv"], "name": "Manx", "alt_names": ["Manx Gaelic", "Gaelg", "Gailck"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Manx_language", "title_tag": "b" }, { "codes": ["gwc"], "name": "Kalami", "alt_names": ["Gawri", "Garwi", "Bashkarik", "کالامي", "ګاوری"], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Kalami_language", "title_tag": "b" }, { "codes": ["ha", "hau"], "name": "Hausa", "alt_names": ["Harshen Hausa", "Halshen Hausa", "هَرْشَن هَوْسَ‎"], "rtl": 2, "script": ["Latin", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Hausa_language", "title_tag": "b" }, { "codes": [], "name": "Haitian French", "alt_names": ["français haïtien"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Haitian_French", "title_tag": "b" }, { "codes": ["haw"], "name": "Hawaiian", "alt_names": ["ʻŌlelo Hawaiʻi"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Hawaiian_language", "title_tag": "b" }, { "codes": ["he", "heb"], "name": "Hebrew", "alt_names": ["Ivrit", "עִבְרִית"], "rtl": 1, "script": ["Hebrew"], "wiki": "https://en.wikipedia.org/wiki/Hebrew_language", "title_tag": "b" }, { "codes": ["hi", "hin"], "name": "Hindi", "alt_names": ["Hindī", "हिंदी"], "script": ["Devanagari", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Hindi", "title_tag": "big" }, { "codes": [], "name": "Hindko", "alt_names": ["ہندکو"], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Hindko", "title_tag": "b" }, { "codes": ["hnd"], "name": "Southern Hindko", "alt_names": [], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Hindko", "title_tag": "b" }, { "codes": ["hno"], "name": "Northern Hindko", "alt_names": [], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Hindko", "title_tag": "b" }, { "codes": ["ho", "hmo"], "name": "Hiri Motu", "alt_names": ["Police Motu", "Pidgin Motu", "Hiri"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Hiri_Motu", "title_tag": "b" }, { "codes": ["hr", "hrv", "scr"], "name": "Croatian", "alt_names": ["hrvatski"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Croatian_language", "title_tag": "b" }, { "codes": ["ht", "hat"], "name": "Haitian Creole", "alt_names": ["Haitian", "kreyòl ayisyen", "kreyòl", "créole haïtien"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Haitian_Creole", "title_tag": "b" }, { "codes": ["hu", "hun"], "name": "Hungarian", "alt_names": ["magyar nyelv"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Hungarian_language", "title_tag": "b" }, { "codes": ["hy", "arm", "hye"], "name": "Armenian", 
"alt_names": ["հայերէն", "հայերեն", "hayeren"], "script": ["Armenian"], "wiki": "https://en.wikipedia.org/wiki/Armenian_language", "title_tag": "b" }, { "codes": ["hz", "her"], "name": "Herero", "alt_names": ["Otjiherero"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Herero_language", "title_tag": "b" }, { "codes": ["ia", "ina"], "name": "Interlingua", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Interlingua", "title_tag": "b" }, { "codes": ["id", "ind"], "name": "Indonesian", "alt_names": ["bahasa Indonesia"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Indonesian_language", "title_tag": "b" }, { "codes": ["ie", "ile"], "name": "Interlingue", "alt_names": ["Occidental", "Interlingue"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Interlingue", "title_tag": "b" }, { "codes": ["ig", "ibo"], "name": "Igbo", "alt_names": ["Ásụ̀sụ̀ Ìgbò"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Igbo_language", "title_tag": "b" }, { "codes": ["ii", "iii"], "name": "Nuosu", "alt_names": ["Nosu", "Northern Yi", "Liangshan Yi", "Sichuan Yi", "ꆈꌠꉙ", "Nuosuhxop", "彝語", "诺苏语"], "script": ["CJK"], "wiki": "https://en.wikipedia.org/wiki/Nuosu_language", "title_tag": "big" }, { "codes": ["ik", "ipk"], "name": "Inupiaq", "alt_names": ["Inupiat", "Inupiatun", "Alaskan Inuit", "Iñupiatun", "Inupiaqtun"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Inupiaq_language", "title_tag": "b" }, { "codes": ["io", "ido"], "name": "Ido", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Ido", "title_tag": "b" }, { "codes": ["is", "ice", "isl"], "name": "Icelandic", "alt_names": ["íslenska"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Icelandic_language", "title_tag": "b" }, { "codes": ["it", "ita"], "name": "Italian", "alt_names": ["italiano", "lingua italiana"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Italian_language", "title_tag": "b" }, { "codes": ["iu", "iku", "ike"], "name": "Inuktitut", "alt_names": ["Eastern Canadian Inuktitut", "ᐃᓄᒃᑎᑐᑦ"], "script": ["Canadian syllabic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Inuktitut", "title_tag": "b" }, { "codes": ["ja", "jpn"], "name": "Japanese", "alt_names": ["日本語", "にほんご", "Nihongo"], "script": ["CJK"], "wiki": "https://en.wikipedia.org/wiki/Japanese_language", "title_tag": "big" }, { "codes": ["jb", "jbo"], "name": "Lojban", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Lojban", "title_tag": "b" }, { "codes": ["jv", "jav"], "name": "Javanese", "alt_names": ["ꦧꦱꦗꦮ", "ꦕꦫꦗꦮ", "باسا جاوا", "Basa Jawa", "Båså Jåwå", "Cara Jawa"], "script": ["Latin", "Javanese", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Javanese_language", "title_tag": "big" }, { "codes": ["ka", "geo", "kat"], "name": "Georgian", "alt_names": ["Kartuli", "ქართული"], "script": ["Georgian"], "wiki": "https://en.wikipedia.org/wiki/Georgian_language", "title_tag": "big" }, { "codes": ["kea"], "name": "Cape Verdean Creole", "alt_names": ["Cape Verdean"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Cape_Verdean_Creole", "title_tag": "b" }, { "codes": ["kg", "kon"], "name": "Kongo", "alt_names": ["Kikongo"], "script": ["Latin", "Mandombe"], "wiki": "https://en.wikipedia.org/wiki/Kongo_language", "title_tag": "b" }, { "codes": ["kha"], "name": "Khasi", "alt_names": ["Ka Ktien Khasi", "ক ক্ত্যেন খসি"], "script": ["Latin", "Bengali-Assamese"], "wiki": 
"https://en.wikipedia.org/wiki/Khasi_language", "title_tag": "big" }, { "codes": ["ki", "kik"], "name": "Kikuyu", "alt_names": ["Gĩkũyũ"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Kikuyu_language", "title_tag": "b" }, { "codes": ["kj", "kua"], "name": "Kwanyama", "alt_names": ["Kuanyama", "Cuanhama"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Kwanyama_dialect", "title_tag": "b" }, { "codes": ["kk", "kaz"], "name": "Kazakh", "alt_names": ["qazaqşa", "qazaq tili", "қазақша", "қазақ тілі", "قازاقشا", "قازاق تىلى"], "script": ["Cyrillic", "Latin", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Kazakh_language", "title_tag": "b" }, { "codes": ["kl", "kal"], "name": "Greenlandic", "alt_names": ["kalaallisut", "grønlandsk"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Greenlandic_language", "title_tag": "b" }, { "codes": ["km", "khm"], "name": "Khmer", "alt_names": ["Cambodian", "ភាសាខ្មែរ", "phiăsaa khmae", "ខ្មែរ", "khmae"], "script": ["Khmer"], "wiki": "https://en.wikipedia.org/wiki/Khmer_language", "title_tag": "big" }, { "codes": ["kn", "kan"], "name": "Kannada", "alt_names": ["Kanarese", "ಕನ್ನಡ"], "script": ["Kannada"], "wiki": "https://en.wikipedia.org/wiki/Kannada", "title_tag": "b" }, { "codes": ["ko", "kor"], "name": "Korean", "alt_names": ["한국어", "韓國語", "조선말", "朝鮮말"], "script": ["CJK"], "wiki": "https://en.wikipedia.org/wiki/Korean_language", "title_tag": "big" }, { "codes": ["kr", "kau"], "name": "Kanuri", "alt_names": ["Kànùrí"], "script": ["Arabic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Kanuri_language", "title_tag": "b" }, { "codes": ["ks", "kas"], "name": "Kashmiri", "alt_names": ["Koshur", "कॉशुर", "كٲشُر"], "script": ["Arabic", "Devanagari"], "wiki": "https://en.wikipedia.org/wiki/Kashmiri_language", "title_tag": "big" }, { "codes": ["ku", "kur"], "name": "Kurdish", "alt_names": ["Kurdî", "کوردی"], "script": ["Arabic", "Latin", "Armenian"], "wiki": "https://en.wikipedia.org/wiki/Kurdish_languages", "title_tag": "b" }, { "codes": ["kv", "kom"], "name": "Komi", "alt_names": ["Komi-Zyryan", "Коми", "Коми кыв"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Komi-Zyryan_language", "title_tag": "b" }, { "codes": ["kw", "cor"], "name": "Cornish", "alt_names": ["Kernewek", "Kernowek"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Cornish_language", "title_tag": "b" }, { "codes": ["ky", "kir"], "name": "Kyrgyz", "alt_names": ["Kirghiz", "Kirgiz", "Qirghiz", "Кыргызча", "Qırğızça"], "script": ["Cyrillic", "Arabic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Kyrgyz_language", "title_tag": "b" }, { "codes": ["la", "lat"], "name": "Latin", "alt_names": ["latine", "latīne"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Latin", "title_tag": "b" }, { "codes": ["lb", "ltz"], "name": "Luxembourgish", "alt_names": ["Luxemburgish", "Letzeburgesch", "Lëtzebuergesch", "Luxembourgian"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Luxembourgish", "title_tag": "b" }, { "codes": ["lg", "lug"], "name": "Luganda", "alt_names": ["Ganda", "Oluganda"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Luganda", "title_tag": "b" }, { "codes": ["li", "lim"], "name": "Limburgish", "alt_names": ["Limburgan", "Limburgian", "Limburgic", "Lèmburgs", "Limburgs", "Limburgisch", "Limbourgeois"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Limburgish", "title_tag": "b" }, { "codes": ["ln", "lin"], "name": "Lingala", "alt_names": ["lingála"], "script": ["Latin", 
"Mandombe"], "wiki": "https://en.wikipedia.org/wiki/Lingala", "title_tag": "b" }, { "codes": ["lo", "lao"], "name": "Lao", "alt_names": ["Laotian", "ລາວ", "ພາສາລາວ"], "script": ["Lao", "Thai"], "wiki": "https://en.wikipedia.org/wiki/Lao_language", "title_tag": "big" }, { "codes": ["lt", "lit"], "name": "Lithuanian", "alt_names": ["lietuvių kalba"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Lithuanian_language", "title_tag": "b" }, { "codes": ["lu", "lub"], "name": "Luba-Katanga", "alt_names": ["Luba-Shaba", "Kiluba"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Luba-Katanga_language", "title_tag": "b" }, { "codes": ["lv", "lav"], "name": "Latvian", "alt_names": ["Lettish", "latviešu valoda"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Latvian_language", "title_tag": "b" }, { "codes": ["mas", "cma"], "name": "Maasai", "alt_names": ["Masai", "Maa", "ɔl"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Maasai_language", "title_tag": "b" }, { "codes": ["mg", "mlg"], "name": "Malagasy", "alt_names": [], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Malagasy_language", "title_tag": "b" }, { "codes": ["mh", "mah"], "name": "Marshallese", "alt_names": ["Ebon", "Kajin M̧ajeļ", "Kajin Majōl"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Marshallese_language", "title_tag": "b" }, { "codes": ["mi", "mao", "mri"], "name": "Maori", "alt_names": ["Māori", "Te reo Māori"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/M%C4%81ori_language", "title_tag": "b" }, { "codes": ["mk", "mac", "mkd"], "name": "Macedonian", "alt_names": ["македонски", "македонски јазик"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Macedonian_language", "title_tag": "b" }, { "codes": ["ml", "mal"], "name": "Malayalam", "alt_names": ["മലയാളം", "Malayāḷam"], "script": ["Malayalam"], "wiki": "https://en.wikipedia.org/wiki/Malayalam", "title_tag": "big" }, { "codes": ["mnc"], "name": "Manchu", "alt_names": ["manju gisun", "ᠮᠠᠨᠵᡠᡤᡳᠰᡠᠨ"], "script": ["Mongolian"], "wiki": "https://en.wikipedia.org/wiki/Manchu_language", "title_tag": "big" }, { "codes": ["mn", "mon"], "name": "Mongolian", "alt_names": ["монгол хэл", "ᠮᠣᠩᠭᠣᠯ ᠬᠡᠯᠡ"], "script": ["Mongolian", "Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Mongolian_language", "title_tag": "b" }, { "codes": ["mo", "mol"], "name": "Moldovan", "alt_names": ["Moldavian", "limba moldovenească", "лимба молдовеняскэ", "лимба Молдовенѣскъ"], "script": ["Latin", "Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Moldovan_language", "title_tag": "b" }, { "codes": ["mr", "mar"], "name": "Marathi", "alt_names": ["मराठी", "Marāṭhī"], "script": ["Devanagari"], "wiki": "https://en.wikipedia.org/wiki/Marathi_language", "title_tag": "b" }, { "codes": ["ms", "may"], "name": "Malay", "alt_names": ["bahasa Melayu", "بهاس ملايو", "ꤷꥁꤼ ꤸꥍꤾꤿꥈ"], "script": ["Latin", "Arabic", "Thai"], "wiki": "https://en.wikipedia.org/wiki/Malay_language", "title_tag": "b" }, { "codes": ["mt", "mlt"], "name": "Maltese", "alt_names": ["Malti"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Maltese_language", "title_tag": "b" }, { "codes": ["my", "bur", "mya"], "name": "Burmese", "alt_names": ["မြန်မာစာ", "မြန်မာစကား"], "script": ["Burmese"], "wiki": "https://en.wikipedia.org/wiki/Burmese_language", "title_tag": "big" }, { "codes": ["na", "nau"], "name": "Nauruan", "alt_names": ["Nauru", "dorerin Naoero", "Ekaiairũ Naoero"], "script": ["Latin"], "wiki": 
"https://en.wikipedia.org/wiki/Nauruan_language", "title_tag": "b" }, { "codes": ["nb", "nob"], "name": "Bokmal", "alt_names": ["Bokmål", "Norwegian Bokmal", "Norwegian Bokmål"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Bokm%C3%A5l", "title_tag": "b" }, { "codes": ["nd", "nde"], "name": "North Ndebele", "alt_names": ["Ndebele", "amaNdebele", "Zimbabwean Ndebele", "North Ndebele"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Northern_Ndebele_language", "title_tag": "b" }, { "codes": ["ne", "nep"], "name": "Nepali", "alt_names": ["Gorkhali", "Khaskura", "Nepalese", "Parbate", "नेपाली", "खस कुरा"], "script": ["Devanagari"], "wiki": "https://en.wikipedia.org/wiki/Nepali_language", "title_tag": "big" }, { "codes": ["ng", "ndo"], "name": "Ndonga", "alt_names": ["Oshindonga"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Ndonga_dialect", "title_tag": "b" }, { "codes": ["nl", "dut", "nld"], "name": "Dutch", "alt_names": ["Flemish", "Nederlands"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Dutch_language", "title_tag": "b" }, { "codes": ["nn", "nno"], "name": "Norwegian Nynorsk", "alt_names": ["nynorsk"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Nynorsk", "title_tag": "b" }, { "codes": ["no", "nor"], "name": "Norwegian", "alt_names": ["norsk"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Norwegian_language", "title_tag": "b" }, { "codes": ["nr", "nbl"], "name": "Southern Ndebele", "alt_names": ["South Ndebele", "Transvaal Ndebele", "isiNdebele seSewula"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Southern_Ndebele_language", "title_tag": "b" }, { "codes": ["nv", "nav"], "name": "Navajo", "alt_names": ["Navaho", "Diné bizaad", "Naabeehó bizaad"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Navajo_language", "title_tag": "b" }, { "codes": ["ny", "nya"], "name": "Chewa", "alt_names": ["Nyanja", "Chichewa", "Chinyanja"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Chewa_language", "title_tag": "b" }, { "codes": ["oc", "oci"], "name": "Occitan", "alt_names": ["lenga d'òc", "provençal"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Occitan_language", "title_tag": "b" }, { "codes": ["oj", "oji"], "name": "Ojibwe", "alt_names": ["Ojibwa", "Ojibway", "Otchipwe", "Anishinaabemowin"], "script": ["Latin", "Canadian syllabic"], "wiki": "https://en.wikipedia.org/wiki/Ojibwe_language", "title_tag": "b" }, { "codes": ["om", "orm"], "name": "Oromo", "alt_names": ["Afaan Oromoo"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Oromo_language", "title_tag": "b" }, { "codes": ["or", "ori", "ory"], "name": "Odia", "alt_names": ["Oriya", "ଓଡ଼ିଆ", "Oṛiā"], "script": ["Odia"], "wiki": "https://en.wikipedia.org/wiki/Odia_language", "title_tag": "big" }, { "codes": ["os", "oss"], "name": "Ossetian", "alt_names": ["Ossetic", "Ossete", "ирон ӕвзаг", "дигорон ӕвзаг"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Ossetian_language", "title_tag": "b" }, { "codes": ["pa", "pan"], "name": "Punjabi", "alt_names": ["Panjabi", "ਪੰਜਾਬੀ", "پن٘جابی"], "script": ["Gurmukhi", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Punjabi_language", "title_tag": "big" }, { "codes": ["pi", "pli"], "name": "Pali", "alt_names": ["Magadhan"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Pali", "title_tag": "b" }, { "codes": ["pl", "pol"], "name": "Polish", "alt_names": ["język polski", "polszczyzna", "polski"], "script": ["Latin"], "wiki": 
"https://en.wikipedia.org/wiki/Polish_language", "title_tag": "b" }, { "codes": ["ps", "pus"], "name": "Pashto", "alt_names": ["پښتو", "Pax̌tō"], "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Pashto", "title_tag": "b" }, { "codes": ["pt", "por"], "name": "Portuguese", "alt_names": ["português", "língua portuguesa"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Portuguese_language", "title_tag": "b" }, { "codes": ["qu", "que"], "name": "Quechua", "alt_names": ["Runasimi", "Kechua", "Runa Simi"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Quechuan_languages", "title_tag": "b" }, { "codes": ["rm", "roh"], "name": "Rhaeto-Romance", "alt_names": ["Rheto-Romance", "Rhaetian", "Raeto-Romance"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Rhaeto-Romance_languages", "title_tag": "b" }, { "codes": ["rn", "run"], "name": "Kirundi", "alt_names": ["Rundi", "Ikirundi"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Kirundi", "title_tag": "b" }, { "codes": ["ro", "rum", "ron"], "name": "Romanian", "alt_names": ["Rumanian", "Roumanian", "Daco-Romanian", "limba română"], "script": ["Latin", "Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Romanian_language", "title_tag": "b" }, { "codes": ["ru", "rus"], "name": "Russian", "alt_names": ["русский", "русский язык"], "script": ["Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Russian_language", "title_tag": "b" }, { "codes": ["rw", "kin"], "name": "Kinyarwanda", "alt_names": ["Ikinyarwanda"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Kinyarwanda", "title_tag": "b" }, { "codes": ["sa", "san"], "name": "Sanskrit", "alt_names": ["संस्कृतम्", "saṃskṛtam"], "script": ["Devanagari"], "wiki": "https://en.wikipedia.org/wiki/Sanskrit", "title_tag": "big" }, { "codes": ["sc", "srd"], "name": "Sardinian", "alt_names": ["Sard", "sardu", "sadru", "limba sarda", "lìngua sarda"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Sardinian_language", "title_tag": "b" }, { "codes": ["sd", "snd"], "name": "Sindhi", "alt_names": ["سنڌي", "सिंधी", "ਸਿੰਧੀ", "𑈩𑈭𑈴𑈝𑈮", "𑋝𑋡𑋟𑋐𑋢"], "script": ["Arabic", "Devanagari", "Gurmukhi", "Khojki", "Khudabadi"], "wiki": "https://en.wikipedia.org/wiki/Sindhi_language", "title_tag": "big" }, { "codes": ["se", "sme"], "name": "Northern Sami", "alt_names": ["North Sami", "Sami", "davvisámegiella"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Northern_Sami", "title_tag": "b" }, { "codes": ["sg", "sag"], "name": "Sango", "alt_names": ["Sangho", "yângâ tî sängö"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Sango_language", "title_tag": "b" }, { "codes": ["sh", "shr", "hbs"], "name": "Serbo-Croatian", "alt_names": ["Serbo-Croat", "Serbo-Croat-Bosnian", "Bosnian-Croatian-Serbian", "Bosnian-Croatian-Montenegrin-Serbian", "srpskohrvatski", "hrvatskosrpski", "српскохрватски", "хрватскосрпски", "naš jezik", "наш језик"], "script": ["Latin", "Cyrillic"], "wiki": "https://en.wikipedia.org/wiki/Serbo-Croatian", "title_tag": "b" }, { "codes": ["si", "sin"], "name": "Sinhala", "alt_names": ["Sinhalese", "සිංහල", "Siṁhala"], "script": ["Sinhala"], "wiki": "https://en.wikipedia.org/wiki/Sinhala_language", "title_tag": "big" }, { "codes": ["sk", "slo", "slk"], "name": "Slovak", "alt_names": ["slovenčina", "slovenský jazyk"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Slovak_language", "title_tag": "b" }, { "codes": ["sl", "slv"], "name": "Slovene", "alt_names": ["Slovenian", "slovenski jezik", "slovenščina"], 
"script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Slovene_language", "title_tag": "b" }, { "codes": ["sm", "smo"], "name": "Samoan", "alt_names": ["Gagana faʻa Sāmoa", "Gagana Sāmoa"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Samoan_language", "title_tag": "b" }, { "codes": ["sn", "sna"], "name": "Shona", "alt_names": ["chiShona"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Shona_language", "title_tag": "b" }, { "codes": ["so", "som"], "name": "Somali", "alt_names": ["Af Soomaali"], "script": ["Latin", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Somali_language", "title_tag": "b" }, { "codes": ["sq", "alb", "sqi"], "name": "Albanian", "alt_names": ["shqip", "gjuha shqipe"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Albanian_language", "title_tag": "b" }, { "codes": ["sr", "srp"], "name": "Serbian", "alt_names": ["српски", "srpski"], "script": ["Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Serbian_language", "title_tag": "b" }, { "codes": ["ss", "ssw"], "name": "Swazi", "alt_names": ["Swati", "siSwati"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Swazi_language", "title_tag": "b" }, { "codes": ["st", "sot"], "name": "Sotho", "alt_names": ["Sesotho", "Southern Sotho"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Sotho_language", "title_tag": "b" }, { "codes": ["su", "sun"], "name": "Sundanese", "alt_names": ["Basa Sunda", "ᮘᮞ ᮞᮥᮔ᮪ᮓ"], "script": ["Latin", "Sundanese", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Sundanese_language", "title_tag": "b" }, { "codes": ["sv", "swe"], "name": "Swedish", "alt_names": ["svenska"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Swedish_language", "title_tag": "b" }, { "codes": ["sw", "swa"], "name": "Swahili", "alt_names": ["Kiswahili"], "script": ["Latin", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Swahili_language", "title_tag": "b" }, { "codes": ["ta", "tam"], "name": "Tamil", "alt_names": ["தமிழ்", "Tamiḻ"], "script": ["Tamil", "Arabic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Tamil_language", "title_tag": "big" }, { "codes": ["te", "tel"], "name": "Telugu", "alt_names": ["తెలుగు"], "script": ["Telugu"], "wiki": "https://en.wikipedia.org/wiki/Telugu_language", "title_tag": "big" }, { "codes": ["tg", "tgk"], "name": "Tajik", "alt_names": ["Tajiki", "тоҷик", "Тоҷикӣ", "tojikī", "забо́ни тоҷикӣ́", "zaboni tojikī"], "script": ["Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Tajik_language", "title_tag": "b" }, { "codes": ["th", "tha"], "name": "Thai", "alt_names": ["Central Thai", "Siamese", "ภาษาไทย", "Phasa Thai"], "script": ["Thai"], "wiki": "https://en.wikipedia.org/wiki/Thai_language", "title_tag": "big" }, { "codes": ["ti", "tir"], "name": "Tigrinya", "alt_names": ["Tigrigna", "ትግርኛ", "tigriññā"], "script": ["Ge'ez"], "wiki": "https://en.wikipedia.org/wiki/Tigrinya_language", "title_tag": "big" }, { "codes": ["tk", "tuk"], "name": "Turkmen", "alt_names": ["Türkmençe", "Türkmen", "Türkmen dili", "Түркменче Түркмен дили", "تۆرکمن ديلی", "تۆرکمنچه‎", "تۆرکمن"], "script": ["Latin", "Cyrillic", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Turkmen_language", "title_tag": "b" }, { "codes": ["tl", "tgl"], "name": "Tagalog", "alt_names": ["Wikang Tagalog"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Tagalog_language", "title_tag": "b" }, { "codes": ["tn", "tsn"], "name": "Tswana", "alt_names": ["Setswana"], "script": ["Latin"], "wiki": 
"https://en.wikipedia.org/wiki/Tswana_language", "title_tag": "b" }, { "codes": ["to", "ton"], "name": "Tongan", "alt_names": ["Tonga", "lea fakatonga", "lea faka-Tonga"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Tongan_language", "title_tag": "b" }, { "codes": ["tr", "tur"], "name": "Turkish", "alt_names": ["Türkçe", "Türk dili", "Istanbul Turkish", "Turkey Turkish"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Turkish_language", "title_tag": "b" }, { "codes": ["ota"], "name": "Ottoman Turkish", "alt_names": ["لسان عثمانى‎", "lisân-ı Osmânî", "Osmanlı Türkçesi"], "rtl": 1, "script": ["Arabic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Ottoman_Turkish", "title_tag": "b" }, { "codes": ["trw"], "name": "Torwali", "alt_names": ["توروالی"], "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Torwali_language", "title_tag": "b" }, { "codes": ["ts", "tso"], "name": "Tsonga", "alt_names": ["Xitsonga", "Xitsonga"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Tsonga_language", "title_tag": "b" }, { "codes": ["tt", "tat"], "name": "Tatar", "alt_names": ["татар", "تاتار", "татар теле", "tatar tele", "تاتار تلی‎"], "script": ["Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Tatar_language", "title_tag": "b" }, { "codes": ["tw", "twi"], "name": "Twi", "alt_names": ["Akan Kasa"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Twi", "title_tag": "b" }, { "codes": ["ty", "tah"], "name": "Tahitian", "alt_names": ["Reo Tahiti", "Reo Māꞌohi"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Tahitian_language", "title_tag": "b" }, { "codes": ["ug", "uig"], "name": "Uyghur", "alt_names": ["Uighur", "Uyƣur", "Uyğur", "ئۇيغۇر", "Уйғур", "ئۇيغۇر تىلى", "Уйғур тили", "Uyghur tili", "Uyƣur tili", "Uyğur tili"], "script": ["Arabic", "Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Uyghur_language", "title_tag": "b" }, { "codes": ["uk", "ukr"], "name": "Ukrainian", "alt_names": ["українська", "українська мова", "ukrayins'ka mova"], "script": ["Cyrillic", "Latin"], "wiki": "https://en.wikipedia.org/wiki/Ukrainian_language", "title_tag": "b" }, { "codes": ["ur", "urd"], "name": "Urdu", "alt_names": ["Urdū", "اُردُو", "اردو", "Lashkari", "لشکری", "Laškarī", "Modern Standard Urdu"], "rtl": 1, "script": ["Arabic"], "wiki": "https://en.wikipedia.org/wiki/Urdu", "title_tag": "b" }, { "codes": ["uz", "uzb"], "name": "Uzbek", "alt_names": ["O‘zbekcha", "o‘zbek tili", "Ўзбекча", "ўзбек тили", "اۉزبېکچه", "اۉزبېک تیلی", "Özbekçä", "Özbek Tili"], "script": ["Latin", "Cyrillic", "Arabic"], "wiki": "https://en.wikipedia.org/wiki/Uzbek_language", "title_tag": "b" }, { "codes": ["ve", "ven"], "name": "Venda", "alt_names": ["Tshivenda", "Tshivenḓa", "Luvenḓa"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Venda_language", "title_tag": "b" }, { "codes": ["vi", "vie"], "name": "Vietnamese", "alt_names": ["Tiếng Việt", "Việt"], "script": ["Latin", "CJK"], "wiki": "https://en.wikipedia.org/wiki/Vietnamese_language", "title_tag": "b" }, { "codes": ["vo", "vol"], "name": "Volapuk", "alt_names": ["Volapük", "Volapük nulik"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Volap%C3%BCk", "title_tag": "b" }, { "codes": ["wa", "wln"], "name": "Walloon", "alt_names": ["walon"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Walloon_language", "title_tag": "b" }, { "codes": ["wo", "wol"], "name": "Wolof", "alt_names": [], "script": ["Latin", "Arabic"], "wiki": 
"https://en.wikipedia.org/wiki/Wolof_language", "title_tag": "b" }, { "codes": ["xh", "xho"], "name": "Xhosa", "alt_names": ["isiXhosa"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Xhosa_language", "title_tag": "b" }, { "codes": ["yi", "yid"], "name": "Yiddish", "alt_names": ["ייִדיש", "יידיש", "אידיש", "yidish", "idish"], "script": ["Hebrew"], "wiki": "https://en.wikipedia.org/wiki/Yiddish", "title_tag": "b" }, { "codes": ["yo", "yor"], "name": "Yoruba", "alt_names": ["Èdè Yorùbá"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Yoruba_language", "title_tag": "b" }, { "codes": ["za", "zha"], "name": "Zhuang", "alt_names": ["Vahcuengh", "話僮", "壮语", "壯語", "Zhuàngyǔ"], "script": ["CJK"], "wiki": "https://en.wikipedia.org/wiki/Zhuang_languages", "title_tag": "big" }, { "codes": ["zh", "chi", "zho", "cmn"], "name": "Chinese", "alt_names": ["汉语", "漢語", "Mandarin", "Standard Chinese", "Modern Standard Mandarin", "Standard Mandarin", "Mandarin Chinese", "普通话", "普通話", "国语", "國語", "华语", "華語"], "script": ["CJK"], "wiki": "https://en.wikipedia.org/wiki/Standard_Chinese", "title_tag": "big" }, { "codes": ["zu", "zul"], "name": "Zulu", "alt_names": ["isiZulu"], "script": ["Latin"], "wiki": "https://en.wikipedia.org/wiki/Zulu_language", "title_tag": "b" } ] pyglossary-4.5.0/pyglossary/langs/writing_system.py000066400000000000000000000171571417733132500227360ustar00rootroot00000000000000import unicodedata from collections import namedtuple import string WritingSystem = namedtuple( "WritingSystem", [ "name", "iso", "unicode", "titleTag", "direction", # ltr | rtl | ttb "comma", "pop", # population in millions ], defaults=( None, # name None, # iso [], # unicode "b", # titleTag "ltr", # direction ", ", # comma 0, # pop ), ) # digits and FULLWIDTH DIGITs are considered neutral/ignored, not Latin # scripts are separated into multiple groups based on their popularity # (usage in multiple live languages, and number of native speakers) writingSystemList = [ WritingSystem( name="Latin", iso=(215, "Latn"), unicode=[ "LATIN", ], titleTag="b", comma=", ", pop=4900, ), WritingSystem( name="Arabic", iso=(160, "Arab"), unicode=["ARABIC"], titleTag="b", direction="rtl", comma="، ", pop=670, ), WritingSystem( name="Cyrillic", # iso=(220, "Cyrl"), unicode=["CYRILLIC"], titleTag="b", comma=", ", pop=250, ), WritingSystem( name="CJK", # iso: 286=Hang, 500=Hani, 410=Hira, 412=Hrkt, 411=Kana, # 501=Hans, 502=Hant unicode=[ "CJK", "HIRAGANA", "KATAKANA", "IDEOGRAPHIC", # Ideographic Description Characters "DITTO", # Ditto mark "HANGUL", # Korean alphabet "HALFWIDTH KATAKANA", "HALFWIDTH HANGUL", "YI", # https://en.wikipedia.org/wiki/Yi_script "FULLWIDTH LATIN", ], titleTag="big", comma="、", pop=1540, # Chinese=1340, Kana=120, Hangul=78.7 ), WritingSystem( name="Devanagari", # iso=(315 , "Deva"), unicode=["DEVANAGARI"], titleTag="big", comma=", ", pop=610, ), # _____________________________________________________ WritingSystem( name="Armenian", iso=(230, "Armn"), unicode=["ARMENIAN"], titleTag="big", comma=", ", pop=12, ), WritingSystem( name="Bengali-Assamese", iso=(325, "Beng"), unicode=["BENGALI"], titleTag="big", comma=", ", pop=270, ), WritingSystem( name="Burmese", iso=(350, "Mymr"), unicode=["MYANMAR"], titleTag="big", comma=", ", # almost not used except in English phrases pop=39, ), WritingSystem( name="Ge'ez", iso=(430, "Ethi"), unicode=["ETHIOPIC"], titleTag="big", comma=", ", pop=21, ), WritingSystem( name="Greek", iso=(200, "Grek"), unicode=["GREEK"], titleTag="big", comma=", ", 
pop=11, ), WritingSystem( name="Gujarati", iso=(320, "Gujr"), unicode=["GUJARATI"], titleTag="big", comma=", ", pop=48, ), WritingSystem( name="Gurmukhi", iso=(310, "Guru"), unicode=["GURMUKHI"], titleTag="big", comma=", ", pop=22, ), WritingSystem( name="Hebrew", iso=(125, "Hebr"), unicode=["HEBREW"], titleTag="big", direction="rtl", comma=", ", pop=14, ), WritingSystem( name="Kannada", iso=(345, "Knda"), unicode=["KANNADA"], titleTag="big", comma=", ", pop=45, ), WritingSystem( name="Khmer", iso=(355, "Khmr"), unicode=["KHMER"], titleTag="big", comma=", ", pop=11.4, ), WritingSystem( name="Lao", iso=(356, "Laoo"), unicode=["LAO"], titleTag="big", comma=", ", pop=22, ), WritingSystem( name="Malayalam", iso=(347, "Mlym"), unicode=["MALAYALAM"], titleTag="big", comma=", ", pop=38, ), WritingSystem( name="Odia", iso=(327, "Orya"), unicode=["ORIYA"], titleTag="big", comma=", ", pop=21, ), WritingSystem( name="Sinhala", iso=(348, "Sinh"), unicode=["SINHALA"], titleTag="big", comma=", ", pop=14.4, ), WritingSystem( name="Sundanese", iso=(362, "Sund"), unicode=["SUNDANESE"], titleTag="big", comma=", ", pop=38, ), WritingSystem( name="Tamil", iso=(346, "Taml"), unicode=["TAMIL"], titleTag="big", # Parent scripts: Brahmi, Tamil-Brahmi, Pallava comma=", ", pop=70, ), WritingSystem( name="Telugu", iso=(340, "Telu"), unicode=["TELUGU"], titleTag="big", comma=", ", pop=74, ), WritingSystem( name="Thai", iso=(352, "Thai"), unicode=["THAI"], titleTag="big", comma=", ", pop=38, ), # _____________________________________________________ WritingSystem( name="Syriac", iso=(135, "Syrc"), unicode=["SYRIAC"], titleTag="b", direction="rtl", comma="، ", pop=8, # Syriac=0.4, Lontara=7.6 # Lontara is a separate script according to Wikipedia # but not according to Unicode ), WritingSystem( name="Tibetan", iso=(330, "Tibt"), unicode=["TIBETAN"], titleTag="big", comma=", ", # almost not used except in numbers! pop=5, ), WritingSystem( name="Georgian", iso=(240, "Geor"), unicode=["GEORGIAN"], titleTag="big", comma=", ", pop=4.5, ), WritingSystem( name="Mongolian", iso=(145, "Mong"), unicode=["MONGOLIAN"], titleTag="big", direction="ltr", # historically ttb? comma=", ", pop=2, ), WritingSystem( name="Thaana", iso=(170, "Thaa"), unicode=["THAANA"], titleTag="big", direction="rtl", comma="، ", pop=0.35, ), # _____________________________________________________ WritingSystem( name="Javanese", iso=(361, "Java"), unicode=["JAVANESE"], titleTag="big", # Since around 1945 Javanese script has largely been # supplanted by Latin script to write Javanese.
), WritingSystem( name="Canadian syllabic", iso=(440, "Cans"), unicode=["CANADIAN SYLLABICS"], titleTag="big", comma=", ", ), WritingSystem( name="Takri", iso=(321, "Takr"), unicode=["TAKRI"], titleTag="b", # comma="", FIXME ), # _____________________________________________________ WritingSystem( name="SignWriting", iso=(95, "Sgnw"), unicode=["SIGNWRITING"], titleTag="big", direction="ttb", comma="𝪇", ), # _____________________________________________________ WritingSystem( name="Adlam", iso=(166, "Adlm"), unicode=["ADLAM"], titleTag="big", direction="rtl", ), WritingSystem( name="Avestan", iso=(134, "Avst"), unicode=["AVESTAN"], titleTag="b", direction="rtl", ), WritingSystem( name="Glagolitic", iso=(225, "Glag"), unicode=["GLAGOLITIC"], titleTag="b", ), WritingSystem( name="Khojki", iso=(322, "Khoj"), unicode=["KHOJKI"], titleTag="big", ), WritingSystem( name="Khudabadi", # aka: "Khudawadi", "Sindhi" iso=(318, "Sind"), unicode=["KHUDAWADI"], titleTag="big", ), WritingSystem( name="N'Ko", iso=(165, "Nkoo"), unicode=["NKO"], titleTag="big", ), # _____________________________________________________ # WritingSystem( # name="Baybayin", # unicode=["TAGALOG"], # ), # WritingSystem( # name="Rejang", # unicode=["REJANG"], # ), # WritingSystem( # name="Mandombe", # unicode=[], # ), # WritingSystem( # name="Mwangwego", # unicode=[], # ), ] for ws in writingSystemList: if not ws.name: raise ValueError(f"empty name in {ws}") writingSystemByUnicode = { uni: ws for ws in writingSystemList for uni in ws.unicode } writingSystemByName = { ws.name: ws for ws in writingSystemList } unicodeNextWord = { "HALFWIDTH", "FULLWIDTH", "CANADIAN", } def _getWritingSystemFromText(st: str, start: int, end: int): for c in st[start:end]: try: unicodeWords = unicodedata.name(c).split(' ') except ValueError as e: # if c not in string.whitespace: # print(f"c={c!r}, {e}") continue alias = unicodeWords[0] ws = writingSystemByUnicode.get(alias) if ws: return ws if alias in unicodeNextWord: ws = writingSystemByUnicode.get(" ".join(unicodeWords[:2])) if ws: return ws def getWritingSystemFromText(st: str): st = st.strip() if not st: return None # some special first words in unicodedata.name(c): # "RIGHT", "ASTERISK", "MODIFIER" k = (len(st) + 1) // 2 - 1 ws = _getWritingSystemFromText(st, k, len(st)) if ws: return ws return _getWritingSystemFromText(st, 0, k) pyglossary-4.5.0/pyglossary/option.py000066400000000000000000000213171417733132500200440ustar00rootroot00000000000000# -*- coding: utf-8 -*- import re import logging log = logging.getLogger("pyglossary") def optionFromDict(data): className = data.pop("class") if className == "Option": data["typ"] = data.pop("type") optClass = Option else: data.pop("type") optClass = Option.classes[className] return optClass(**data) class Option(object): classes = {} @classmethod def register(cls, optClass): cls.classes[optClass.__name__] = optClass return optClass def __init__( self, typ: str, customValue: bool = False, values: "Optional[List[str]]" = None, allowNone: bool = False, comment: str = "", multiline: bool = False, disabled: bool = False, hasFlag: bool = False, customFlag: str = "", falseComment: str = "", ) -> None: if values is None: # otherwise there would not be any valid value customValue = True self.typ = typ self.values = values self.allowNone = allowNone self.customValue = customValue self.comment = comment self.multiline = multiline self.disabled = disabled self.hasFlag = hasFlag self.customFlag = customFlag self.falseComment = falseComment @property def typeDesc(self): 
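		# note (added): the base Option reports its raw type string here;
		# subclasses may override it, e.g. FileSizeOption below returns an
		# empty description to hide the internal "int" type.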
return self.typ @property def longComment(self): comment = self.typeDesc if self.comment: if comment: comment += ", " comment += self.comment return comment def toDict(self): data = { "class": self.__class__.__name__, "type": self.typ, "customValue": self.customValue, } if self.values: data["values"] = self.values if self.comment: data["comment"] = self.comment if self.disabled: data["disabled"] = True if self.hasFlag: data["hasFlag"] = True data["customFlag"] = self.customFlag if self.falseComment: data["falseComment"] = self.falseComment return data def evaluate(self, raw: str) -> "Tuple[Any, bool]": "returns (value, isValid)" if raw == "None": return None, True return raw, True def validate(self, value): if not self.customValue: if not self.values: log.error( f"invalid option: customValue={self.customValue!r}" f", values={self.values!r}" ) return False return value in self.values if value is None: return self.allowNone valueType = type(value).__name__ return self.typ == valueType def validateRaw(self, raw: str) -> bool: "returns isValid" value, isValid = self.evaluate(raw) if not isValid: return False if not self.validate(value): return False return True def groupValues(self) -> "Optional[Dict[str, Any]]": return None @Option.register class BoolOption(Option): def __init__(self, allowNone=False, **kwargs): values = [False, True] if allowNone: values.append(None) Option.__init__( self, typ="bool", customValue=False, values=values, allowNone=allowNone, **kwargs, ) def toDict(self): data = Option.toDict(self) del data["customValue"] del data["values"] return data def evaluate(self, raw: "Union[str, bool]") -> "Tuple[Optional[bool], bool]": if raw is None: return None, True if isinstance(raw, bool): # check bool before calling raw.lower() return raw, True if raw.lower() == "none": return None, True if raw.lower() in ("yes", "true", "1"): return True, True if raw.lower() in ("no", "false", "0"): return False, True return None, False # not valid @Option.register class StrOption(Option): def __init__(self, **kwargs): Option.__init__( self, typ="str", **kwargs ) def validate(self, value): if not self.customValue: if not self.values: log.error( f"invalid option: customValue={self.customValue!r}" f", values={self.values!r}" ) return False return value in self.values return type(value).__name__ == "str" def groupValues(self) -> "Optional[Dict[str, Any]]": return None @Option.register class IntOption(Option): def __init__(self, **kwargs): Option.__init__( self, typ="int", **kwargs ) def evaluate(self, raw: "Union[str, int]") -> "Tuple[Optional[int], bool]": "returns (value, isValid)" try: value = int(raw) except ValueError: return None, False return value, True @Option.register class FileSizeOption(IntOption): factors = { "KiB": 1024, "kib": 1024, "Ki": 1024, "ki": 1024, "MiB": 1048576, "mib": 1048576, "Mi": 1048576, "mi": 1048576, "GiB": 1073741824, "gib": 1073741824, "Gi": 1073741824, "gi": 1073741824, "kB": 1000, "kb": 1000, "KB": 1000, "k": 1000, "K": 1000, "MB": 1000000, "mb": 1000000, "mB": 1000000, "M": 1000000, "m": 1000000, "GB": 1000000000, "gb": 1000000000, "gB": 1000000000, "G": 1000000000, "g": 1000000000, } validPattern = "^([0-9.]+)([kKmMgG]i?[bB]?)$" @property def typeDesc(self): return "" def evaluate(self, raw: "Union[str, int]") -> "Tuple[Optional[int], bool]": if not raw: return 0, True factor = 1 m = re.match(self.validPattern, raw) if m is not None: raw, unit = m.groups() factor = self.factors.get(unit) if factor is None: return None, False try: value = float(raw) except ValueError: return None, False if value < 0: return None, False return
int(value * factor), True @Option.register class FloatOption(Option): def __init__(self, **kwargs): Option.__init__( self, typ="float", **kwargs ) def evaluate( self, raw: "Union[str, float, int]", ) -> "Tuple[Optional[float], bool]": "returns (value, isValid)" try: value = float(raw) except ValueError: return None, False else: return value, True @Option.register class DictOption(Option): def __init__(self, **kwargs): Option.__init__( self, typ="dict", customValue=True, allowNone=True, multiline=True, **kwargs, ) def toDict(self): data = Option.toDict(self) del data["customValue"] return data def evaluate(self, raw: "Union[str, dict]") -> "Tuple[Optional[Dict], bool]": import ast if isinstance(raw, dict): return raw, True if raw == "": return None, True # valid try: value = ast.literal_eval(raw) except SyntaxError: return None, False # not valid if type(value).__name__ != "dict": return None, False # not valid return value, True # valid @Option.register class ListOption(Option): def __init__(self, **kwargs): Option.__init__( self, typ="list", customValue=True, allowNone=True, multiline=True, **kwargs, ) def toDict(self): data = Option.toDict(self) del data["customValue"] return data def evaluate(self, raw: str) -> "Tuple[Optional[List], bool]": import ast if raw == "": return None, True # valid try: value = ast.literal_eval(raw) except SyntaxError: return None, False # not valid if type(value).__name__ != "list": return None, False # not valid return value, True # valid @Option.register class EncodingOption(Option): re_category = re.compile("^[a-z]+") def __init__( self, customValue=True, values=None, comment=None, **kwargs ): if values is None: values = [ "utf-8", "utf-16", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "mac_cyrillic", "mac_greek", "mac_iceland", "mac_latin2", "mac_roman", "mac_turkish", "cyrillic", "arabic", "greek", "hebrew", "latin2", "latin3", "latin4", "latin5", "latin6", ] if comment is None: comment = "Encoding/charset" Option.__init__( self, typ="str", customValue=customValue, values=values, comment=comment, **kwargs ) def toDict(self): data = Option.toDict(self) del data["values"] return data def groupValues(self) -> "Optional[Dict[str, Any]]": from collections import OrderedDict groups = OrderedDict() # type: Dict[str, List[str]] others = [] # type: List[str] for value in self.values: cats = self.re_category.findall(value) if not cats: others.append(value) continue cat = cats[0] if len(cat) == len(value): others.append(value) continue if cat not in groups: groups[cat] = [] groups[cat].append(value) if others: groups["other"] = others return groups @Option.register class NewlineOption(Option): def __init__( self, customValue=True, values=None, comment=None, **kwargs ): if values is None: values = [ "\r\n", "\n", "\r", ] if comment is None: comment = "Newline string" Option.__init__( self, typ="str", customValue=customValue, values=values, multiline=True, comment=comment, **kwargs ) @Option.register class HtmlColorOption(Option): def toDict(self): data = Option.toDict(self) del data["customValue"] return data def __init__(self, **kwargs): Option.__init__( self, typ="str", customValue=True, **kwargs ) # TODO: use a specific type? 
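# A minimal usage sketch of the classes above.  evaluate() parses a raw
# string and returns a (value, isValid) pair; validate() then checks the
# parsed value against `values`/`customValue`.  The sketch assumes
# Option.__init__ accepts a `customValue` keyword argument, as the
# subclass constructors above imply.

if __name__ == "__main__":
	opt = IntOption(customValue=True)
	assert opt.evaluate("42") == (42, True)  # raw string -> (value, isValid)
	assert opt.validate(42)  # parsed value matches typ="int"

	size = FileSizeOption(customValue=True)
	assert size.evaluate("1.5MiB") == (1572864, True)  # binary unit: 1.5 * 1024**2
	assert size.evaluate("1.5MB") == (1500000, True)  # decimal unit: 1.5 * 1000**2
	assert size.evaluate("10xB") == (None, False)  # unknown unit -> invalid

	assert BoolOption().evaluate("yes") == (True, True)
	assert BoolOption().evaluate("maybe") == (None, False)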
pyglossary-4.5.0/pyglossary/os_utils.py000066400000000000000000000047311417733132500203760ustar00rootroot00000000000000
import os
import shutil
import logging

from pyglossary import core

log = logging.getLogger("pyglossary")


class indir(object):
	"""
	mkdir + chdir shortcut to use with `with` statement.

		>>> print(os.getcwd())  # -> "~/projects"
		>>> with indir('my_directory', create=True):
		>>> 	print(os.getcwd())  # -> "~/projects/my_directory"
		>>> 	# do some work inside new 'my_directory'...
		>>> print(os.getcwd())  # -> "~/projects"
		>>> # automatically return to previous directory.
	"""

	def __init__(self, directory: str, create: bool = False, clear: bool = False):
		self.oldpwd = None
		self.dir = directory
		self.create = create
		self.clear = clear

	def __enter__(self):
		self.oldpwd = os.getcwd()
		if os.path.exists(self.dir):
			if self.clear:
				shutil.rmtree(self.dir)
				os.makedirs(self.dir)
		elif self.create:
			os.makedirs(self.dir)
		os.chdir(self.dir)

	def __exit__(self, exc_type, exc_val, exc_tb):
		os.chdir(self.oldpwd)
		self.oldpwd = None


def runDictzip(filename: str) -> bool:
	import subprocess
	dictzipCmd = shutil.which("dictzip")
	if not dictzipCmd:
		log.warning("dictzip command was not found. Make sure it's in your $PATH")
		return False
	(out, err) = subprocess.Popen(
		[dictzipCmd, filename],
		stdout=subprocess.PIPE,
		stderr=subprocess.PIPE,  # without this, `err` below is always None
	).communicate()
	log.debug(f"dictzip command: {dictzipCmd!r}")
	if err:
		# out/err are bytes; decode before str.replace
		err = err.decode("utf-8", "replace").replace('\n', ' ')
		log.error(f"dictzip error: {err}")
	if out:
		out = out.decode("utf-8", "replace").replace('\n', ' ')
		log.error(f"dictzip error: {out}")
	return True


def _rmtreeError(func, direc, exc_info):
	exc_type, exc_val, exc_tb = exc_info
	log.error(exc_val)


def rmtree(direc):
	import shutil
	from os.path import isdir
	try:
		for i in range(2):
			if isdir(direc):
				shutil.rmtree(
					direc,
					onerror=_rmtreeError,
				)
	except Exception:
		log.exception(f"error removing directory: {direc}")


def showMemoryUsage():
	if log.level > core.TRACE:
		return
	try:
		import psutil
	except ModuleNotFoundError:
		return
	usage = psutil.Process(os.getpid()).memory_info().rss // 1024
	log.trace(f"Memory Usage: {usage} kB")


def my_url_show(link: str) -> None:
	import subprocess
	for path in (
		'/usr/bin/gnome-www-browser',
		'/usr/bin/firefox',
		'/usr/bin/iceweasel',
		'/usr/bin/konqueror',
	):
		if os.path.isfile(path):
			subprocess.call([path, link])
			break


"""
try:
	from gnome import url_show
except:
	try:
		from gnomevfs import url_show
	except:
		url_show = my_url_show
"""


def click_website(widget: "Any", link: str) -> None:
	my_url_show(link)
pyglossary-4.5.0/pyglossary/persian_utils.py000066400000000000000000000003501417733132500214070ustar00rootroot00000000000000
# -*- coding: utf-8 -*-

from .text_utils import replacePostSpaceChar


def faEditStr(st: str) -> str:
	return replacePostSpaceChar(
		st.replace("ي", "ی").replace("ك", "ک").replace("ۂ", "هٔ").replace("ہ", "ه"),
		"،",
	)
pyglossary-4.5.0/pyglossary/plugin_lib/000077500000000000000000000000001417733132500203025ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugin_lib/__init__.py000066400000000000000000000000001417733132500224010ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugin_lib/dictdlib.py000066400000000000000000000236021417733132500224350ustar00rootroot00000000000000
# Dictionary creation library
# Copyright (C) 2002 John Goerzen
# Copyright (C) 2020 Saeed Rasooli
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import sys
import string
import gzip
import os

b64_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

url_headword = "00-database-url"
short_headword = "00-database-short"
info_headword = "00-database-info"


def b64_encode(val):
	"""Takes as input an integer val and returns a string of it
	encoded with the base64 algorithm used by dict indexes."""
	startfound = 0
	retval = ""
	for i in range(5, -1, -1):
		thispart = (val >> (6 * i)) & ((2 ** 6) - 1)
		if (not startfound) and (not thispart):
			# Both zero -- keep going.
			continue
		startfound = 1
		retval += b64_list[thispart]
	if len(retval):
		return retval
	else:
		return b64_list[0]


def b64_decode(str):
	"""Takes as input a string and returns an integer value of it decoded
	with the base64 algorithm used by dict indexes."""
	if not len(str):
		return 0
	retval = 0
	shiftval = 0
	for i in range(len(str) - 1, -1, -1):
		val = b64_list.index(str[i])
		retval = retval | (val << shiftval)
		shiftval += 6
	return retval


validdict = {}
for x in string.ascii_letters + string.digits + " \t":
	validdict[x] = 1


def sortnormalize(x):
	"""Returns a value such that x is mapped to a format that sorts
	properly with standard comparison."""
	x2 = ''
	for i in range(len(x)):
		if x[i] in validdict:
			x2 += x[i]
	return x2.upper() + "\0" + x.upper()


def sortKey(x):
	"""Emulate sort -df."""
	return x.split("\0")


class DictDB:
	def __init__(self, basename, mode='read', quiet=0):
		# , url = 'unknown', shortname = 'unknown',
		# longinfo = 'unknown', quiet = 0):
		"""Initialize a DictDB object.

		Mode must be one of:

		read -- read-only access

		write -- write-only access, truncates existing files, does not
		work with .dz.  dict created if nonexistent.

		update -- read/write access, dict created if nonexistent.
		Does not work with .dz.

		Read can read dict or dict.dz files.  Write and update will NOT
		work with dict.dz files.

		If quiet is nonzero, status messages will be suppressed."""
		self.mode = mode
		self.quiet = quiet
		self.indexentries = {}
		self.count = 0
		self.basename = basename
		self.indexfilename = self.basename + ".index"
		if mode == 'read' and os.path.isfile(self.basename + ".dict.dz"):
			self.usecompression = 1
		else:
			self.usecompression = 0
		if self.usecompression:
			self.dictfilename = self.basename + ".dict.dz"
		else:
			self.dictfilename = self.basename + ".dict"
		if mode == 'read':
			self.indexfile = open(self.indexfilename, "rt")
			if self.usecompression:
				self.dictfile = gzip.GzipFile(self.dictfilename, "rb")
			else:
				self.dictfile = open(self.dictfilename, "rb")
			self._initindex()
		elif mode == 'write':
			self.indexfile = open(self.indexfilename, "wt")
			if self.usecompression:
				raise ValueError("'write' mode incompatible with .dz files")
			else:
				self.dictfile = open(self.dictfilename, "wb")
		elif mode == 'update':
			try:
				self.indexfile = open(self.indexfilename, "r+b")
			except IOError:
				self.indexfile = open(self.indexfilename, "w+b")
			if self.usecompression:
				# Open it read-only since we don't support mods.
				self.dictfile = gzip.GzipFile(self.dictfilename, "rb")
			else:
				try:
					self.dictfile = open(self.dictfilename, "r+b")
				except IOError:
					self.dictfile = open(self.dictfilename, "w+b")
			self._initindex()
		else:
			raise ValueError("mode must be 'read', 'write', or 'update'")
		# self.writeentry(url_headword + "\n " + url, [url_headword])
		# self.writeentry(short_headword + "\n " + shortname,
		#                 [short_headword])
		# self.writeentry(info_headword + "\n" + longinfo, [info_headword])

	def _initindex(self):
		"""Load the entire index off disk into memory."""
		self.indexfile.seek(0)
		for line in self.indexfile:
			splits = line.rstrip().split("\t")
			if splits[0] not in self.indexentries:
				self.indexentries[splits[0]] = []
			self.indexentries[splits[0]].append([
				b64_decode(splits[1]),
				b64_decode(splits[2]),
			])

	def addindexentry(self, word, start, size):
		"""Adds an entry to the index.  word is the relevant word.
		start is the starting position in the dictionary and size is the
		size of the definition; both are integers."""
		if word not in self.indexentries:
			self.indexentries[word] = []
		self.indexentries[word].append([start, size])

	def delindexentry(self, word, start=None, size=None):
		"""Removes an entry from the index; word is the word to search for.

		start and size are optional.  If they are specified, only index
		entries matching the specified values will be removed.

		For instance, if word is "foo" and start and size are not specified,
		all index entries for the word foo will be removed.  If start and
		size are specified, only those entries matching all criteria will
		be removed.

		This function does not actually remove the data from the .dict file.
		Therefore, information removed by this function will still exist
		on-disk in the .dict file, but the dict server will just not "see"
		it -- there will be no way to get to it anymore.

		Returns a count of the deleted entries."""
		if word not in self.indexentries:
			return 0
		retval = 0
		entrylist = self.indexentries[word]
		for i in range(len(entrylist) - 1, -1, -1):
			# Go backwards so the del doesn't affect the index.
			if (start is None or start == entrylist[i][0]) and \
				(size is None or size == entrylist[i][1]):
				del(entrylist[i])
				retval += 1
		if len(entrylist) == 0:
			# If we emptied it, del it completely
			del(self.indexentries[word])
		return retval

	def update(self, string):
		"""Writes string out, if not quiet."""
		if not self.quiet:
			sys.stdout.write(string)
			sys.stdout.flush()

	def seturl(self, url):
		"""Sets the URL attribute of this database.  If there was
		already a URL specified, we will use delindexentry() on it
		first."""
		self.delindexentry(url_headword)
		self.addentry(url_headword + "\n " + url, [url_headword])

	def setshortname(self, shortname):
		"""Sets the shortname for this database.  If there was already
		a shortname specified, we will use delindexentry() on it first."""
		self.delindexentry(short_headword)
		self.addentry(
			short_headword + "\n " + shortname,
			[short_headword],
		)

	def setlonginfo(self, longinfo):
		"""Sets the extended information for this database.  If there was
		already long info specified, we will use delindexentry() on it
		first."""
		self.delindexentry(info_headword)
		self.addentry(info_headword + "\n" + longinfo, [info_headword])

	def addentry(self, defstr, headwords):
		"""Writes an entry.  defstr holds the content of the definition.
		headwords is a list specifying one or more words under which this
		definition should be indexed.
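		(Index note: when finish() later writes the .index file, each of
		these headwords becomes one line of the form
		word<TAB>b64(start)<TAB>b64(size), with the offsets encoded by
		the b64_encode() alphabet defined at the top of this module.)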
		This function always adds \\n to the end of defstr."""
		self.dictfile.seek(0, 2)  # Seek to end of file
		start = self.dictfile.tell()
		defstr += b"\n"
		self.dictfile.write(defstr)
		for word in headwords:
			self.addindexentry(word, start, len(defstr))
		self.count += 1
		if self.count % 1000 == 0:
			self.update("Processed %d records\r" % self.count)

	def finish(self, dosort=1):
		"""Called to finish the writing process.
		**REQUIRED IF OPENED WITH 'update' OR 'write' MODES**.

		This will write the index and close the files.

		dosort is optional and defaults to true.  If set to false,
		dictdlib will not sort the index file.  In this case, you MUST
		manually sort it through "sort -df" before it can be used."""
		self.update("Processed %d records.\n" % self.count)

		# Build the index lines unconditionally, so they also exist
		# when dosort is false (they are then written in insertion order).
		indexlist = []
		for word, defs in self.indexentries.items():
			for thisdef in defs:
				indexlist.append("%s\t%s\t%s" % (
					word,
					b64_encode(thisdef[0]),
					b64_encode(thisdef[1]),
				))

		if dosort:
			self.update("Sorting index: mapping")
			sortmap = {}
			for entry in indexlist:
				norm = sortnormalize(entry)
				if norm in sortmap:
					sortmap[norm].append(entry)
					sortmap[norm].sort(key=sortKey)
				else:
					sortmap[norm] = [entry]
			self.update(" listing")
			normalizedentries = list(sortmap.keys())
			self.update(" sorting")
			normalizedentries.sort()
			self.update(" re-mapping")
			indexlist = []
			for normentry in normalizedentries:
				for entry in sortmap[normentry]:
					indexlist.append(entry)
			self.update(", done.\n")

		self.update("Writing index...\n")
		self.indexfile.seek(0)
		for entry in indexlist:
			self.indexfile.write(entry + "\n")
		if self.mode == 'update':
			# In case things were deleted
			self.indexfile.truncate()
		self.indexfile.close()
		self.dictfile.close()
		self.update("Complete.\n")

	def getdeflist(self):
		"""Returns a list of strings naming all definitions contained
		in this dictionary."""
		return self.indexentries.keys()

	def hasdef(self, word):
		return word in self.indexentries

	def getdef(self, word):
		"""Given a definition name, returns a list of strings with all
		matching definitions.  This is an *exact* match, not a
		case-insensitive one.  Returns [] if word is not in the
		dictionary."""
		retval = []
		if not self.hasdef(word):
			return retval
		for start, length in self.indexentries[word]:
			self.dictfile.seek(start)
			retval.append(self.dictfile.read(length))
		return retval
pyglossary-4.5.0/pyglossary/plugin_lib/pureSalsa20.py000066400000000000000000000310761417733132500227640ustar00rootroot00000000000000
#!/usr/bin/env python
# coding: utf-8

"""
	Copyright by https://github.com/zhansliu/writemdict

	pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher,
	ported to Python 3

	v4.0: Added Python 3 support, dropped support for Python <= 2.5.

	// zhansliu

	Original comments below.

	====================================================================
	There are comments here by two authors about three pieces of software:
		comments by Larry Bugbee about Salsa20, the stream cipher by
			Daniel J. Bernstein (including comments about the speed of
			the C version) and pySalsa20, Bugbee's own Python wrapper
			for salsa20.c (including some references), and
		comments by Steve Witham about pureSalsa20, Witham's pure Python
			2.5 implementation of Salsa20, which follows pySalsa20's API,
			and is in this file.

	Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee)
	-----------------------------------------------------------

	Salsa20 is a fast stream cipher written by Daniel Bernstein that
	basically uses a hash function and XOR making for fast encryption.
	(Decryption uses the same function.)  Salsa20 is simple and quick.
Some Salsa20 parameter values... design strength 128 bits key length 128 or 256 bits, exactly IV, aka nonce 64 bits, always chunk size must be in multiples of 64 bytes Salsa20 has two reduced versions, 8 and 12 rounds each. One benchmark (10 MB): 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds (no I/O and before Python GC kicks in) Salsa20 is a Phase 3 finalist in the EU eSTREAM competition and appears to be one of the fastest ciphers. It is well documented so I will not attempt any injustice here. Please see "References" below. ...and Salsa20 is "free for any use". pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee) ------------------------------------------------------------------ pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is as it's name implies, 20 rounds, but there are two reduced versions, 8 and 12 rounds each. Because the APIs are identical, pySalsa20 is capable of wrapping all three versions (number of rounds hardcoded), including a special version that allows you to set the number of rounds with a set_rounds() function. Compile the version of your choice as a shared library (not as a Python extension), name and install it as libsalsa20.so. Sample usage: from pySalsa20 import Salsa20 s20 = Salsa20(key, IV) dataout = s20.encryptBytes(datain) # same for decrypt This is EXPERIMENTAL software and intended for educational purposes only. To make experimentation less cumbersome, pySalsa20 is also free for any use. THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF ANY KIND. USE AT YOUR OWN RISK. Enjoy, Larry Bugbee bugbee@seanet.com April 2007 References: ----------- http://en.wikipedia.org/wiki/Salsa20 http://en.wikipedia.org/wiki/Daniel_Bernstein http://cr.yp.to/djb.html http://www.ecrypt.eu.org/stream/salsa20p3.html http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip Prerequisites for pySalsa20: ---------------------------- - Python 2.5 (haven't tested in 2.4) pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham) ------------------------------------------------------------------ pureSalsa20 is the stand-alone Python code in this file. It implements the underlying Salsa20 core algorithm and emulates pySalsa20's Salsa20 class API (minus a bug(*)). pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20-- about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8, when encrypting 64k-byte blocks on my computer. pureSalsa20 is for cases where portability is much more important than speed. I wrote it for use in a "structured" random number generator. There are comments about the reasons for this slowness in http://www.tiac.net/~sw/2010/02/PureSalsa20 Sample usage: from pureSalsa20 import Salsa20 s20 = Salsa20(key, IV) dataout = s20.encryptBytes(datain) # same for decrypt I took the test code from pySalsa20, added a bunch of tests including rough speed tests, and moved them into the file testSalsa20.py. To test both pySalsa20 and pureSalsa20, type python testSalsa20.py (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the libsalsa20.so library and not switched when switching between instances of the Salsa20 class. s1 = Salsa20( key, IV, 20 ) s2 = Salsa20( key, IV, 8 ) In this example, with pySalsa20, both s1 and s2 will do 8 rounds of encryption. with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds. Perhaps giving each instance its own nRounds variable, which is passed to the salsa20wordtobyte() function, is insecure. 
I'm not a cryptographer. pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and intended for educational purposes only. To make experimentation less cumbersome, pureSalsa20.py and testSalsa20.py are free for any use. Revisions: ---------- p3.2 Fixed bug that initialized the output buffer with plaintext! Saner ramping of nreps in speed test. Minor changes and print statements. p3.1 Took timing variability out of add32() and rot32(). Made the internals more like pySalsa20/libsalsa . Put the semicolons back in the main loop! In encryptBytes(), modify a byte array instead of appending. Fixed speed calculation bug. Used subclasses instead of patches in testSalsa20.py . Added 64k-byte messages to speed test to be fair to pySalsa20. p3 First version, intended to parallel pySalsa20 version 3. More references: ---------------- http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20] http://cr.yp.to/snuffle.html [The original name of Salsa20] http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design] http://www.tiac.net/~sw/2010/02/PureSalsa20 THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF ANY KIND. USE AT YOUR OWN RISK. Cheers, Steve Witham sw at remove-this tiac dot net February, 2010 """ import sys assert(sys.version_info >= (2, 6)) if sys.version_info >= (3,): integer_types = (int,) python3 = True else: integer_types = (int, long) python3 = False from struct import Struct little_u64 = Struct( "= 2**64" ctx = self.ctx ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) ) def getCounter( self ): return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0] def setRounds(self, rounds, testing=False ): assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20' self.rounds = rounds def encryptBytes(self, data): assert type(data) == bytes, 'data must be byte string' assert self._lastChunk64, 'previous chunk not multiple of 64 bytes' lendata = len(data) munged = bytearray(lendata) for i in range( 0, lendata, 64 ): h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False ) self.setCounter( ( self.getCounter() + 1 ) % 2**64 ) # Stopping at 2^70 bytes per nonce is user's responsibility. for j in range( min( 64, lendata - i ) ): if python3: munged[ i+j ] = data[ i+j ] ^ h[j] else: munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j]) self._lastChunk64 = not lendata % 64 return bytes(munged) decryptBytes = encryptBytes # encrypt and decrypt use same function #-------------------------------------------------------------------------- def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ): """ Do nRounds Salsa20 rounds on a copy of input: list or tuple of 16 ints treated as little-endian unsigneds. Returns a 64-byte string. """ assert( type(input) in ( list, tuple ) and len(input) == 16 ) assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) ) x = list( input ) def XOR( a, b ): return a ^ b ROTATE = rot32 PLUS = add32 for i in range( nRounds // 2 ): # These ...XOR...ROTATE...PLUS... 
lines are from ecrypt-linux.c # unchanged except for indents and the blank line between rounds: x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7)); x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9)); x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13)); x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18)); x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7)); x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9)); x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13)); x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18)); x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7)); x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9)); x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13)); x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18)); x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7)); x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9)); x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13)); x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18)); x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7)); x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9)); x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13)); x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18)); x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7)); x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9)); x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13)); x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18)); x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7)); x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9)); x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13)); x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18)); x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7)); x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9)); x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13)); x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18)); for i in range( len( input ) ): x[i] = PLUS( x[i], input[i] ) return little16_i32.pack( *x ) #--------------------------- 32-bit ops ------------------------------- def trunc32( w ): """ Return the bottom 32 bits of w as a Python int. This creates longs temporarily, but returns an int. """ w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) ) assert type(w) == int return w def add32( a, b ): """ Add two 32-bit words discarding carry above 32nd bit, and without creating a Python long. Timing shouldn't vary. """ lo = ( a & 0xFFFF ) + ( b & 0xFFFF ) hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 ) return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF ) def rot32( w, nLeft ): """ Rotate 32-bit word left by nLeft or right by -nLeft without creating a Python long. Timing depends on nLeft but not on w. """ nLeft &= 31 # which makes nLeft >= 0 if nLeft == 0: return w # Note: now 1 <= nLeft <= 31. # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, # => sLLLLLLRRR and one s which becomes the sign bit. RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) ) sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w return RRR | ( sLLLLLL << nLeft ) # --------------------------------- end ----------------------------------- pyglossary-4.5.0/pyglossary/plugin_lib/py37/000077500000000000000000000000001417733132500211045ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugin_lib/py37/gzip.py.orig000066400000000000000000000475621417733132500234040ustar00rootroot00000000000000"""Functions that read and write gzipped files. 
The user of the file doesn't have to worry about the compression, but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module import struct, sys, time, os import zlib import builtins import io import _compression __all__ = ["GzipFile", "open", "compress", "decompress"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 READ, WRITE = 1, 2 def open(filename, mode="rb", compresslevel=9, encoding=None, errors=None, newline=None): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or an existing file object to read from or write to. The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is "rb", and the default compresslevel is 9. For binary mode, this function is equivalent to the GzipFile constructor: GzipFile(filename, mode, compresslevel). In this case, the encoding, errors and newline arguments must not be provided. For text mode, a GzipFile object is created, and wrapped in an io.TextIOWrapper instance with the specified encoding, error handling behavior, and line ending(s). """ if "t" in mode: if "b" in mode: raise ValueError("Invalid mode: %r" % (mode,)) else: if encoding is not None: raise ValueError("Argument 'encoding' not supported in binary mode") if errors is not None: raise ValueError("Argument 'errors' not supported in binary mode") if newline is not None: raise ValueError("Argument 'newline' not supported in binary mode") gz_mode = mode.replace("t", "") if isinstance(filename, (str, bytes, os.PathLike)): binary_file = GzipFile(filename, gz_mode, compresslevel) elif hasattr(filename, "read") or hasattr(filename, "write"): binary_file = GzipFile(None, gz_mode, compresslevel, filename) else: raise TypeError("filename must be a str or bytes object, or a file") if "t" in mode: return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file def write32u(output, value): # The L format writes the bit pattern correctly whether signed # or unsigned. output.write(struct.pack("' def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") self.size = 0 self.writebuf = [] self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. 
fname = os.path.basename(self.name) if not isinstance(fname, bytes): fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: fname = b'' flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) self.fileobj.write(b'\002') self.fileobj.write(b'\377') if fname: self.fileobj.write(fname + b'\000') def write(self,data): self._check_not_closed() if self.mode != WRITE: import errno raise OSError(errno.EBADF, "write() on read-only GzipFile object") if self.fileobj is None: raise ValueError("write() on closed GzipFile object") if isinstance(data, bytes): length = len(data) else: # accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes if length > 0: self.fileobj.write(self.compress.compress(data)) self.size += length self.crc = zlib.crc32(data, self.crc) self.offset += length return length def read(self, size=-1): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") return self._buffer.read(size) def read1(self, size=-1): """Implements BufferedIOBase.read1() Reads up to a buffer's worth of data is size is negative.""" self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") if size < 0: size = io.DEFAULT_BUFFER_SIZE return self._buffer.read1(size) def peek(self, n): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") return self._buffer.peek(n) @property def closed(self): return self.fileobj is None def close(self): fileobj = self.fileobj if fileobj is None: return self.fileobj = None try: if self.mode == WRITE: fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2 GiB, or even 4 GiB write32u(fileobj, self.size & 0xffffffff) elif self.mode == READ: self._buffer.close() finally: myfileobj = self.myfileobj if myfileobj: self.myfileobj = None myfileobj.close() def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed self.fileobj.write(self.compress.flush(zlib_mode)) self.fileobj.flush() def fileno(self): """Invoke the underlying file object's fileno() method. This will raise AttributeError if the underlying file object doesn't support fileno(). 
""" return self.fileobj.fileno() def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") self._buffer.seek(0) def readable(self): return self.mode == READ def writable(self): return self.mode == WRITE def seekable(self): return True def seek(self, offset, whence=io.SEEK_SET): if self.mode == WRITE: if whence != io.SEEK_SET: if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 for i in range(count // 1024): self.write(chunk) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) return self.offset def readline(self, size=-1): self._check_not_closed() return self._buffer.readline(size) class _GzipReader(_compression.DecompressReader): def __init__(self, fp): super().__init__(_PaddedFile(fp), zlib.decompressobj, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True self._last_mtime = None def _init_read(self): self._crc = zlib.crc32(b"") self._stream_size = 0 # Decompressed size of unconcatenated stream def _read_exact(self, n): '''Read exactly *n* bytes from `self._fp` This method is required because self._fp may be unbuffered, i.e. return short reads. ''' data = self._fp.read(n) while len(data) < n: b = self._fp.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self._fp.read(2) if magic == b'': return False if magic != b'\037\213': raise OSError('Not a gzipped file (%r)' % magic) (method, flag, self._last_mtime) = struct.unpack(" import logging > log = logging.getLogger('root') > 498c501 < raise OSError("CRC check failed %s != %s" % (hex(crc32), --- > log.warning("CRC check failed %s != %s" % (hex(crc32), pyglossary-4.5.0/pyglossary/plugin_lib/py37/gzip_no_crc.py000066400000000000000000000476401417733132500237650ustar00rootroot00000000000000"""Functions that read and write gzipped files. The user of the file doesn't have to worry about the compression, but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module import logging log = logging.getLogger('root') import struct, sys, time, os import zlib import builtins import io import _compression __all__ = ["GzipFile", "open", "compress", "decompress"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 READ, WRITE = 1, 2 def open(filename, mode="rb", compresslevel=9, encoding=None, errors=None, newline=None): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or an existing file object to read from or write to. The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is "rb", and the default compresslevel is 9. For binary mode, this function is equivalent to the GzipFile constructor: GzipFile(filename, mode, compresslevel). In this case, the encoding, errors and newline arguments must not be provided. For text mode, a GzipFile object is created, and wrapped in an io.TextIOWrapper instance with the specified encoding, error handling behavior, and line ending(s). 
""" if "t" in mode: if "b" in mode: raise ValueError("Invalid mode: %r" % (mode,)) else: if encoding is not None: raise ValueError("Argument 'encoding' not supported in binary mode") if errors is not None: raise ValueError("Argument 'errors' not supported in binary mode") if newline is not None: raise ValueError("Argument 'newline' not supported in binary mode") gz_mode = mode.replace("t", "") if isinstance(filename, (str, bytes, os.PathLike)): binary_file = GzipFile(filename, gz_mode, compresslevel) elif hasattr(filename, "read") or hasattr(filename, "write"): binary_file = GzipFile(None, gz_mode, compresslevel, filename) else: raise TypeError("filename must be a str or bytes object, or a file") if "t" in mode: return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file def write32u(output, value): # The L format writes the bit pattern correctly whether signed # or unsigned. output.write(struct.pack("' def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") self.size = 0 self.writebuf = [] self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. fname = os.path.basename(self.name) if not isinstance(fname, bytes): fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: fname = b'' flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) self.fileobj.write(b'\002') self.fileobj.write(b'\377') if fname: self.fileobj.write(fname + b'\000') def write(self,data): self._check_not_closed() if self.mode != WRITE: import errno raise OSError(errno.EBADF, "write() on read-only GzipFile object") if self.fileobj is None: raise ValueError("write() on closed GzipFile object") if isinstance(data, bytes): length = len(data) else: # accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes if length > 0: self.fileobj.write(self.compress.compress(data)) self.size += length self.crc = zlib.crc32(data, self.crc) self.offset += length return length def read(self, size=-1): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") return self._buffer.read(size) def read1(self, size=-1): """Implements BufferedIOBase.read1() Reads up to a buffer's worth of data is size is negative.""" self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") if size < 0: size = io.DEFAULT_BUFFER_SIZE return self._buffer.read1(size) def peek(self, n): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") return self._buffer.peek(n) @property def closed(self): return self.fileobj is None def close(self): fileobj = self.fileobj if fileobj is None: return self.fileobj = None try: if self.mode == WRITE: fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2 GiB, or even 4 GiB write32u(fileobj, self.size & 0xffffffff) elif self.mode == READ: self._buffer.close() finally: myfileobj = self.myfileobj if myfileobj: self.myfileobj = None myfileobj.close() def 
flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed self.fileobj.write(self.compress.flush(zlib_mode)) self.fileobj.flush() def fileno(self): """Invoke the underlying file object's fileno() method. This will raise AttributeError if the underlying file object doesn't support fileno(). """ return self.fileobj.fileno() def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") self._buffer.seek(0) def readable(self): return self.mode == READ def writable(self): return self.mode == WRITE def seekable(self): return True def seek(self, offset, whence=io.SEEK_SET): if self.mode == WRITE: if whence != io.SEEK_SET: if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 for i in range(count // 1024): self.write(chunk) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) return self.offset def readline(self, size=-1): self._check_not_closed() return self._buffer.readline(size) class _GzipReader(_compression.DecompressReader): def __init__(self, fp): super().__init__(_PaddedFile(fp), zlib.decompressobj, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True self._last_mtime = None def _init_read(self): self._crc = zlib.crc32(b"") self._stream_size = 0 # Decompressed size of unconcatenated stream def _read_exact(self, n): '''Read exactly *n* bytes from `self._fp` This method is required because self._fp may be unbuffered, i.e. return short reads. ''' data = self._fp.read(n) while len(data) < n: b = self._fp.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self._fp.read(2) if magic == b'': return False if magic != b'\037\213': raise OSError('Not a gzipped file (%r)' % magic) (method, flag, self._last_mtime) = struct.unpack("' def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") self.size = 0 self.writebuf = [] self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. 
fname = os.path.basename(self.name) if not isinstance(fname, bytes): fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: fname = b'' flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) self.fileobj.write(b'\002') self.fileobj.write(b'\377') if fname: self.fileobj.write(fname + b'\000') def write(self,data): self._check_not_closed() if self.mode != WRITE: import errno raise OSError(errno.EBADF, "write() on read-only GzipFile object") if self.fileobj is None: raise ValueError("write() on closed GzipFile object") if isinstance(data, bytes): length = len(data) else: # accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes if length > 0: self.fileobj.write(self.compress.compress(data)) self.size += length self.crc = zlib.crc32(data, self.crc) self.offset += length return length def read(self, size=-1): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") return self._buffer.read(size) def read1(self, size=-1): """Implements BufferedIOBase.read1() Reads up to a buffer's worth of data is size is negative.""" self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") if size < 0: size = io.DEFAULT_BUFFER_SIZE return self._buffer.read1(size) def peek(self, n): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") return self._buffer.peek(n) @property def closed(self): return self.fileobj is None def close(self): fileobj = self.fileobj if fileobj is None: return self.fileobj = None try: if self.mode == WRITE: fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2GB, or even 4GB write32u(fileobj, self.size & 0xffffffff) elif self.mode == READ: self._buffer.close() finally: myfileobj = self.myfileobj if myfileobj: self.myfileobj = None myfileobj.close() def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed self.fileobj.write(self.compress.flush(zlib_mode)) self.fileobj.flush() def fileno(self): """Invoke the underlying file object's fileno() method. This will raise AttributeError if the underlying file object doesn't support fileno(). 
""" return self.fileobj.fileno() def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") self._buffer.seek(0) def readable(self): return self.mode == READ def writable(self): return self.mode == WRITE def seekable(self): return True def seek(self, offset, whence=io.SEEK_SET): if self.mode == WRITE: if whence != io.SEEK_SET: if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 for i in range(count // 1024): self.write(chunk) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) return self.offset def readline(self, size=-1): self._check_not_closed() return self._buffer.readline(size) class _GzipReader(_compression.DecompressReader): def __init__(self, fp): super().__init__(_PaddedFile(fp), zlib.decompressobj, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True self._last_mtime = None def _init_read(self): self._crc = zlib.crc32(b"") self._stream_size = 0 # Decompressed size of unconcatenated stream def _read_exact(self, n): '''Read exactly *n* bytes from `self._fp` This method is required because self._fp may be unbuffered, i.e. return short reads. ''' data = self._fp.read(n) while len(data) < n: b = self._fp.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self._fp.read(2) if magic == b'': return False if magic != b'\037\213': raise OSError('Not a gzipped file (%r)' % magic) (method, flag, self._last_mtime) = struct.unpack(" import logging > log = logging.getLogger('root') > 498c501 < raise OSError("CRC check failed %s != %s" % (hex(crc32), --- > log.warning("CRC check failed %s != %s" % (hex(crc32), pyglossary-4.5.0/pyglossary/plugin_lib/py38/gzip_no_crc.py000066400000000000000000000476341417733132500237710ustar00rootroot00000000000000"""Functions that read and write gzipped files. The user of the file doesn't have to worry about the compression, but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module import logging log = logging.getLogger('root') import struct, sys, time, os import zlib import builtins import io import _compression __all__ = ["GzipFile", "open", "compress", "decompress"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 READ, WRITE = 1, 2 def open(filename, mode="rb", compresslevel=9, encoding=None, errors=None, newline=None): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or an existing file object to read from or write to. The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is "rb", and the default compresslevel is 9. For binary mode, this function is equivalent to the GzipFile constructor: GzipFile(filename, mode, compresslevel). In this case, the encoding, errors and newline arguments must not be provided. For text mode, a GzipFile object is created, and wrapped in an io.TextIOWrapper instance with the specified encoding, error handling behavior, and line ending(s). 
""" if "t" in mode: if "b" in mode: raise ValueError("Invalid mode: %r" % (mode,)) else: if encoding is not None: raise ValueError("Argument 'encoding' not supported in binary mode") if errors is not None: raise ValueError("Argument 'errors' not supported in binary mode") if newline is not None: raise ValueError("Argument 'newline' not supported in binary mode") gz_mode = mode.replace("t", "") if isinstance(filename, (str, bytes, os.PathLike)): binary_file = GzipFile(filename, gz_mode, compresslevel) elif hasattr(filename, "read") or hasattr(filename, "write"): binary_file = GzipFile(None, gz_mode, compresslevel, filename) else: raise TypeError("filename must be a str or bytes object, or a file") if "t" in mode: return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file def write32u(output, value): # The L format writes the bit pattern correctly whether signed # or unsigned. output.write(struct.pack("' def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") self.size = 0 self.writebuf = [] self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. fname = os.path.basename(self.name) if not isinstance(fname, bytes): fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: fname = b'' flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) self.fileobj.write(b'\002') self.fileobj.write(b'\377') if fname: self.fileobj.write(fname + b'\000') def write(self,data): self._check_not_closed() if self.mode != WRITE: import errno raise OSError(errno.EBADF, "write() on read-only GzipFile object") if self.fileobj is None: raise ValueError("write() on closed GzipFile object") if isinstance(data, bytes): length = len(data) else: # accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes if length > 0: self.fileobj.write(self.compress.compress(data)) self.size += length self.crc = zlib.crc32(data, self.crc) self.offset += length return length def read(self, size=-1): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") return self._buffer.read(size) def read1(self, size=-1): """Implements BufferedIOBase.read1() Reads up to a buffer's worth of data is size is negative.""" self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") if size < 0: size = io.DEFAULT_BUFFER_SIZE return self._buffer.read1(size) def peek(self, n): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") return self._buffer.peek(n) @property def closed(self): return self.fileobj is None def close(self): fileobj = self.fileobj if fileobj is None: return self.fileobj = None try: if self.mode == WRITE: fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2GB, or even 4GB write32u(fileobj, self.size & 0xffffffff) elif self.mode == READ: self._buffer.close() finally: myfileobj = self.myfileobj if myfileobj: self.myfileobj = None myfileobj.close() def 
flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed self.fileobj.write(self.compress.flush(zlib_mode)) self.fileobj.flush() def fileno(self): """Invoke the underlying file object's fileno() method. This will raise AttributeError if the underlying file object doesn't support fileno(). """ return self.fileobj.fileno() def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") self._buffer.seek(0) def readable(self): return self.mode == READ def writable(self): return self.mode == WRITE def seekable(self): return True def seek(self, offset, whence=io.SEEK_SET): if self.mode == WRITE: if whence != io.SEEK_SET: if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 for i in range(count // 1024): self.write(chunk) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) return self.offset def readline(self, size=-1): self._check_not_closed() return self._buffer.readline(size) class _GzipReader(_compression.DecompressReader): def __init__(self, fp): super().__init__(_PaddedFile(fp), zlib.decompressobj, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True self._last_mtime = None def _init_read(self): self._crc = zlib.crc32(b"") self._stream_size = 0 # Decompressed size of unconcatenated stream def _read_exact(self, n): '''Read exactly *n* bytes from `self._fp` This method is required because self._fp may be unbuffered, i.e. return short reads. ''' data = self._fp.read(n) while len(data) < n: b = self._fp.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self._fp.read(2) if magic == b'': return False if magic != b'\037\213': raise OSError('Not a gzipped file (%r)' % magic) (method, flag, self._last_mtime) = struct.unpack("' def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") self.size = 0 self.writebuf = [] self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self, compresslevel): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. 
fname = os.path.basename(self.name) if not isinstance(fname, bytes): fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: fname = b'' flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) if compresslevel == _COMPRESS_LEVEL_BEST: xfl = b'\002' elif compresslevel == _COMPRESS_LEVEL_FAST: xfl = b'\004' else: xfl = b'\000' self.fileobj.write(xfl) self.fileobj.write(b'\377') if fname: self.fileobj.write(fname + b'\000') def write(self,data): self._check_not_closed() if self.mode != WRITE: import errno raise OSError(errno.EBADF, "write() on read-only GzipFile object") if self.fileobj is None: raise ValueError("write() on closed GzipFile object") if isinstance(data, bytes): length = len(data) else: # accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes if length > 0: self.fileobj.write(self.compress.compress(data)) self.size += length self.crc = zlib.crc32(data, self.crc) self.offset += length return length def read(self, size=-1): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") return self._buffer.read(size) def read1(self, size=-1): """Implements BufferedIOBase.read1() Reads up to a buffer's worth of data if size is negative.""" self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") if size < 0: size = io.DEFAULT_BUFFER_SIZE return self._buffer.read1(size) def peek(self, n): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") return self._buffer.peek(n) @property def closed(self): return self.fileobj is None def close(self): fileobj = self.fileobj if fileobj is None: return self.fileobj = None try: if self.mode == WRITE: fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2 GiB, or even 4 GiB write32u(fileobj, self.size & 0xffffffff) elif self.mode == READ: self._buffer.close() finally: myfileobj = self.myfileobj if myfileobj: self.myfileobj = None myfileobj.close() def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed self.fileobj.write(self.compress.flush(zlib_mode)) self.fileobj.flush() def fileno(self): """Invoke the underlying file object's fileno() method. This will raise AttributeError if the underlying file object doesn't support fileno(). 
""" return self.fileobj.fileno() def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") self._buffer.seek(0) def readable(self): return self.mode == READ def writable(self): return self.mode == WRITE def seekable(self): return True def seek(self, offset, whence=io.SEEK_SET): if self.mode == WRITE: if whence != io.SEEK_SET: if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 for i in range(count // 1024): self.write(chunk) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) return self.offset def readline(self, size=-1): self._check_not_closed() return self._buffer.readline(size) class _GzipReader(_compression.DecompressReader): def __init__(self, fp): super().__init__(_PaddedFile(fp), zlib.decompressobj, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True self._last_mtime = None def _init_read(self): self._crc = zlib.crc32(b"") self._stream_size = 0 # Decompressed size of unconcatenated stream def _read_exact(self, n): '''Read exactly *n* bytes from `self._fp` This method is required because self._fp may be unbuffered, i.e. return short reads. ''' data = self._fp.read(n) while len(data) < n: b = self._fp.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self._fp.read(2) if magic == b'': return False if magic != b'\037\213': raise BadGzipFile('Not a gzipped file (%r)' % magic) (method, flag, self._last_mtime) = struct.unpack(" import logging > log = logging.getLogger('root') > 524c527 < raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32), --- > log.warning("CRC check failed %s != %s" % (hex(crc32), pyglossary-4.5.0/pyglossary/plugin_lib/py39/gzip_no_crc.py000066400000000000000000000525261417733132500237660ustar00rootroot00000000000000"""Functions that read and write gzipped files. The user of the file doesn't have to worry about the compression, but random access is not allowed.""" # based on Andrew Kuchling's minigzip.py distributed with the zlib module import logging log = logging.getLogger('root') import struct, sys, time, os import zlib import builtins import io import _compression __all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"] FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 READ, WRITE = 1, 2 _COMPRESS_LEVEL_FAST = 1 _COMPRESS_LEVEL_TRADEOFF = 6 _COMPRESS_LEVEL_BEST = 9 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, encoding=None, errors=None, newline=None): """Open a gzip-compressed file in binary or text mode. The filename argument can be an actual filename (a str or bytes object), or an existing file object to read from or write to. The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is "rb", and the default compresslevel is 9. For binary mode, this function is equivalent to the GzipFile constructor: GzipFile(filename, mode, compresslevel). In this case, the encoding, errors and newline arguments must not be provided. 
For text mode, a GzipFile object is created, and wrapped in an io.TextIOWrapper instance with the specified encoding, error handling behavior, and line ending(s). """ if "t" in mode: if "b" in mode: raise ValueError("Invalid mode: %r" % (mode,)) else: if encoding is not None: raise ValueError("Argument 'encoding' not supported in binary mode") if errors is not None: raise ValueError("Argument 'errors' not supported in binary mode") if newline is not None: raise ValueError("Argument 'newline' not supported in binary mode") gz_mode = mode.replace("t", "") if isinstance(filename, (str, bytes, os.PathLike)): binary_file = GzipFile(filename, gz_mode, compresslevel) elif hasattr(filename, "read") or hasattr(filename, "write"): binary_file = GzipFile(None, gz_mode, compresslevel, filename) else: raise TypeError("filename must be a str or bytes object, or a file") if "t" in mode: return io.TextIOWrapper(binary_file, encoding, errors, newline) else: return binary_file def write32u(output, value): # The L format writes the bit pattern correctly whether signed # or unsigned. output.write(struct.pack("' def _init_write(self, filename): self.name = filename self.crc = zlib.crc32(b"") self.size = 0 self.writebuf = [] self.bufsize = 0 self.offset = 0 # Current file offset for seek(), tell(), etc def _write_gzip_header(self, compresslevel): self.fileobj.write(b'\037\213') # magic header self.fileobj.write(b'\010') # compression method try: # RFC 1952 requires the FNAME field to be Latin-1. Do not # include filenames that cannot be represented that way. fname = os.path.basename(self.name) if not isinstance(fname, bytes): fname = fname.encode('latin-1') if fname.endswith(b'.gz'): fname = fname[:-3] except UnicodeEncodeError: fname = b'' flags = 0 if fname: flags = FNAME self.fileobj.write(chr(flags).encode('latin-1')) mtime = self._write_mtime if mtime is None: mtime = time.time() write32u(self.fileobj, int(mtime)) if compresslevel == _COMPRESS_LEVEL_BEST: xfl = b'\002' elif compresslevel == _COMPRESS_LEVEL_FAST: xfl = b'\004' else: xfl = b'\000' self.fileobj.write(xfl) self.fileobj.write(b'\377') if fname: self.fileobj.write(fname + b'\000') def write(self,data): self._check_not_closed() if self.mode != WRITE: import errno raise OSError(errno.EBADF, "write() on read-only GzipFile object") if self.fileobj is None: raise ValueError("write() on closed GzipFile object") if isinstance(data, bytes): length = len(data) else: # accept any data that supports the buffer protocol data = memoryview(data) length = data.nbytes if length > 0: self.fileobj.write(self.compress.compress(data)) self.size += length self.crc = zlib.crc32(data, self.crc) self.offset += length return length def read(self, size=-1): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read() on write-only GzipFile object") return self._buffer.read(size) def read1(self, size=-1): """Implements BufferedIOBase.read1() Reads up to a buffer's worth of data if size is negative.""" self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "read1() on write-only GzipFile object") if size < 0: size = io.DEFAULT_BUFFER_SIZE return self._buffer.read1(size) def peek(self, n): self._check_not_closed() if self.mode != READ: import errno raise OSError(errno.EBADF, "peek() on write-only GzipFile object") return self._buffer.peek(n) @property def closed(self): return self.fileobj is None def close(self): fileobj = self.fileobj if fileobj is None: return self.fileobj = None try: if self.mode == WRITE: 
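# write-mode close: flush the remaining compressed data, then append the
# two-word gzip trailer -- the CRC32 of the uncompressed data followed by
# its length modulo 2**32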
fileobj.write(self.compress.flush()) write32u(fileobj, self.crc) # self.size may exceed 2 GiB, or even 4 GiB write32u(fileobj, self.size & 0xffffffff) elif self.mode == READ: self._buffer.close() finally: myfileobj = self.myfileobj if myfileobj: self.myfileobj = None myfileobj.close() def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): self._check_not_closed() if self.mode == WRITE: # Ensure the compressor's buffer is flushed self.fileobj.write(self.compress.flush(zlib_mode)) self.fileobj.flush() def fileno(self): """Invoke the underlying file object's fileno() method. This will raise AttributeError if the underlying file object doesn't support fileno(). """ return self.fileobj.fileno() def rewind(self): '''Return the uncompressed stream file position indicator to the beginning of the file''' if self.mode != READ: raise OSError("Can't rewind in write mode") self._buffer.seek(0) def readable(self): return self.mode == READ def writable(self): return self.mode == WRITE def seekable(self): return True def seek(self, offset, whence=io.SEEK_SET): if self.mode == WRITE: if whence != io.SEEK_SET: if whence == io.SEEK_CUR: offset = self.offset + offset else: raise ValueError('Seek from end not supported') if offset < self.offset: raise OSError('Negative seek in write mode') count = offset - self.offset chunk = b'\0' * 1024 for i in range(count // 1024): self.write(chunk) self.write(b'\0' * (count % 1024)) elif self.mode == READ: self._check_not_closed() return self._buffer.seek(offset, whence) return self.offset def readline(self, size=-1): self._check_not_closed() return self._buffer.readline(size) class _GzipReader(_compression.DecompressReader): def __init__(self, fp): super().__init__(_PaddedFile(fp), zlib.decompressobj, wbits=-zlib.MAX_WBITS) # Set flag indicating start of a new member self._new_member = True self._last_mtime = None def _init_read(self): self._crc = zlib.crc32(b"") self._stream_size = 0 # Decompressed size of unconcatenated stream def _read_exact(self, n): '''Read exactly *n* bytes from `self._fp` This method is required because self._fp may be unbuffered, i.e. return short reads. ''' data = self._fp.read(n) while len(data) < n: b = self._fp.read(n - len(data)) if not b: raise EOFError("Compressed file ended before the " "end-of-stream marker was reached") data += b return data def _read_gzip_header(self): magic = self._fp.read(2) if magic == b'': return False if magic != b'\037\213': raise BadGzipFile('Not a gzipped file (%r)' % magic) (method, flag, self._last_mtime) = struct.unpack(" # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
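# A minimal usage sketch for the readers defined below ("example.mdx" is
# the hypothetical file name used in the MDX docstring; keys and values
# are yielded as byte strings):
#
#     from readmdict import MDX
#     mdx = MDX('example.mdx')
#     for key, value in mdx.items():
#         print(key.decode('utf-8'), value[:10])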
import logging log = logging.getLogger('root') from struct import pack, unpack from io import BytesIO import re import sys from .ripemd128 import ripemd128 from .pureSalsa20 import Salsa20 # zlib compression is used for engine version >=2.0 import zlib # LZO compression is used for engine version < 2.0 try: import lzo except ImportError: lzo = None print("LZO compression support is not available") # 2x3 compatible if sys.hexversion >= 0x03000000: str, unicode = bytes, str def _unescape_entities(text): """ unescape offending tags < > " & """ text = text.replace(b'<', b'<') text = text.replace(b'>', b'>') text = text.replace(b'"', b'"') text = text.replace(b'&', b'&') return text def _fast_decrypt(data, key): b = bytearray(data) key = bytearray(key) previous = 0x36 for i in range(len(b)): t = (b[i] >> 4 | b[i] << 4) & 0xff t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)] previous = b[i] b[i] = t return bytes(b) def _mdx_decrypt(comp_block): key = ripemd128(comp_block[4:8] + pack(b' """ taglist = re.findall(rb'(\w+)="(.*?)"', header, re.DOTALL) tagdict = {} for key, value in taglist: tagdict[key] = _unescape_entities(value) return tagdict def _decode_key_block_info(self, key_block_info_compressed): if self._version >= 2: # zlib compression assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00') # decrypt if needed if self._encrypt & 0x02: key_block_info_compressed = _mdx_decrypt(key_block_info_compressed) # decompress key_block_info = zlib.decompress(key_block_info_compressed[8:]) # adler checksum adler32 = unpack('>I', key_block_info_compressed[4:8])[0] assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff) else: # no compression key_block_info = key_block_info_compressed # decode key_block_info_list = [] num_entries = 0 i = 0 if self._version >= 2: byte_format = '>H' byte_width = 2 text_term = 1 else: byte_format = '>B' byte_width = 1 text_term = 0 while i < len(key_block_info): # number of entries in current key block num_entries += unpack(self._number_format, key_block_info[i:i+self._number_width])[0] i += self._number_width # text head size text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0] i += byte_width # text head if self._encoding != 'UTF-16': i += text_head_size + text_term else: i += (text_head_size + text_term) * 2 # text tail size text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0] i += byte_width # text tail if self._encoding != 'UTF-16': i += text_tail_size + text_term else: i += (text_tail_size + text_term) * 2 # key block compressed size key_block_compressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0] i += self._number_width # key block decompressed size key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0] i += self._number_width key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)] #assert(num_entries == self._num_entries) return key_block_info_list def _decode_key_block(self, key_block_compressed, key_block_info_list): key_list = [] i = 0 for compressed_size, decompressed_size in key_block_info_list: start = i end = i + compressed_size # 4 bytes : compression type key_block_type = key_block_compressed[start:start+4] # 4 bytes : adler checksum of decompressed key block adler32 = unpack('>I', key_block_compressed[start+4:start+8])[0] if key_block_type == b'\x00\x00\x00\x00': key_block = key_block_compressed[start+8:end] elif key_block_type == b'\x01\x00\x00\x00': if lzo is None: print("LZO compression is not 
supported") break # decompress key block header = b'\xf0' + pack('>I', decompressed_size) key_block = lzo.decompress(header + key_block_compressed[start+8:end]) elif key_block_type == b'\x02\x00\x00\x00': # decompress key block key_block = zlib.decompress(key_block_compressed[start+8:end]) # extract one single key block into a key list key_list += self._split_key_block(key_block) # notice that adler32 returns signed value assert(adler32 == zlib.adler32(key_block) & 0xffffffff) i += compressed_size return key_list def _split_key_block(self, key_block): key_list = [] key_start_index = 0 while key_start_index < len(key_block): # the corresponding record's offset in record block key_id = unpack(self._number_format, key_block[key_start_index:key_start_index+self._number_width])[0] # key text ends with '\x00' if self._encoding == 'UTF-16': delimiter = b'\x00\x00' width = 2 else: delimiter = b'\x00' width = 1 i = key_start_index + self._number_width while i < len(key_block): if key_block[i:i+width] == delimiter: key_end_index = i break i += width key_text = key_block[key_start_index+self._number_width:key_end_index]\ .decode(self._encoding, errors='ignore').encode('utf-8').strip() key_start_index = key_end_index + width key_list += [(key_id, key_text)] return key_list def _read_header(self): f = open(self._fname, 'rb') # number of bytes of header text header_bytes_size = unpack('>I', f.read(4))[0] header_bytes = f.read(header_bytes_size) # 4 bytes: adler32 checksum of header, in little endian adler32 = unpack('= 0x03000000: encoding = encoding.decode('utf-8') # GB18030 > GBK > GB2312 if encoding in ['GBK', 'GB2312']: encoding = 'GB18030' self._encoding = encoding # encryption flag # 0x00 - no encryption # 0x01 - encrypt record block # 0x02 - encrypt key info block if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No': self._encrypt = 0 elif header_tag[b'Encrypted'] == b'Yes': self._encrypt = 1 else: self._encrypt = int(header_tag[b'Encrypted']) # stylesheet attribute if present takes form of: # style_number # 1-255 # style_begin # or '' # style_end # or '' # store stylesheet in dict in the form of # {'number' : ('style_begin', 'style_end')} self._stylesheet = {} if header_tag.get('StyleSheet'): lines = header_tag['StyleSheet'].splitlines() for i in range(0, len(lines), 3): self._stylesheet[lines[i]] = (lines[i+1], lines[i+2]) # before version 2.0, number is 4 bytes integer # version 2.0 and above uses 8 bytes self._version = float(header_tag[b'GeneratedByEngineVersion']) if self._version < 2.0: self._number_width = 4 self._number_format = '>I' else: self._number_width = 8 self._number_format = '>Q' return header_tag def _read_keys(self): f = open(self._fname, 'rb') f.seek(self._key_block_offset) # the following numbers could be encrypted if self._version >= 2.0: num_bytes = 8 * 5 else: num_bytes = 4 * 4 block = f.read(num_bytes) if self._encrypt & 1: if self._passcode is None: raise RuntimeError('user identification is needed to read encrypted file') regcode, userid = self._passcode if isinstance(userid, unicode): userid = userid.encode('utf8') if self.header[b'RegisterBy'] == b'EMail': encrypted_key = _decrypt_regcode_by_email(regcode, userid) else: encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid) block = _salsa_decrypt(block, encrypted_key) # decode this block sf = BytesIO(block) # number of key blocks num_key_blocks = self._read_number(sf) # number of entries self._num_entries = self._read_number(sf) # number of bytes of key block info after decompression if 
self._version >= 2.0: key_block_info_decomp_size = self._read_number(sf) # number of bytes of key block info key_block_info_size = self._read_number(sf) # number of bytes of key block key_block_size = self._read_number(sf) # 4 bytes: adler checksum of previous 5 numbers if self._version >= 2.0: adler32 = unpack('>I', f.read(4))[0] assert adler32 == (zlib.adler32(block) & 0xffffffff) # read key block info, which indicates key block's compressed and decompressed size key_block_info = f.read(key_block_info_size) key_block_info_list = self._decode_key_block_info(key_block_info) assert(num_key_blocks == len(key_block_info_list)) # read key block key_block_compressed = f.read(key_block_size) # extract key block key_list = self._decode_key_block(key_block_compressed, key_block_info_list) self._record_block_offset = f.tell() f.close() return key_list def _read_keys_brutal(self): f = open(self._fname, 'rb') f.seek(self._key_block_offset) # the following numbers could be encrypted, disregard them! if self._version >= 2.0: num_bytes = 8 * 5 + 4 key_block_type = b'\x02\x00\x00\x00' else: num_bytes = 4 * 4 key_block_type = b'\x01\x00\x00\x00' block = f.read(num_bytes) # key block info # 4 bytes '\x02\x00\x00\x00' # 4 bytes adler32 checksum # unknown number of bytes follows until '\x02\x00\x00\x00' which marks the beginning of key block key_block_info = f.read(8) if self._version >= 2.0: assert key_block_info[:4] == b'\x02\x00\x00\x00' while True: fpos = f.tell() t = f.read(1024) index = t.find(key_block_type) if index != -1: key_block_info += t[:index] f.seek(fpos + index) break else: key_block_info += t key_block_info_list = self._decode_key_block_info(key_block_info) key_block_size = sum(list(zip(*key_block_info_list))[0]) # read key block key_block_compressed = f.read(key_block_size) # extract key block key_list = self._decode_key_block(key_block_compressed, key_block_info_list) self._record_block_offset = f.tell() f.close() self._num_entries = len(key_list) return key_list class MDD(MDict): """ MDict resource file format (*.MDD) reader. >>> mdd = MDD('example.mdd') >>> len(mdd) 208 >>> for filename,content in mdd.items(): ... 
print filename, content[:10] """ def __init__(self, fname, passcode=None): MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode) def items(self): """Return a generator which in turn produce tuples in the form of (filename, content) """ return self._decode_record_block() def _decode_record_block(self): f = open(self._fname, 'rb') f.seek(self._record_block_offset) num_record_blocks = self._read_number(f) num_entries = self._read_number(f) assert(num_entries == self._num_entries) record_block_info_size = self._read_number(f) record_block_size = self._read_number(f) # record block info section record_block_info_list = [] size_counter = 0 for i in range(num_record_blocks): compressed_size = self._read_number(f) decompressed_size = self._read_number(f) record_block_info_list += [(compressed_size, decompressed_size)] size_counter += self._number_width * 2 assert(size_counter == record_block_info_size) # actual record block offset = 0 i = 0 size_counter = 0 for compressed_size, decompressed_size in record_block_info_list: record_block_compressed = f.read(compressed_size) # 4 bytes: compression type record_block_type = record_block_compressed[:4] # 4 bytes: adler32 checksum of decompressed record block adler32 = unpack('>I', record_block_compressed[4:8])[0] if record_block_type == b'\x00\x00\x00\x00': record_block = record_block_compressed[8:] elif record_block_type == b'\x01\x00\x00\x00': if lzo is None: print("LZO compression is not supported") break # decompress header = b'\xf0' + pack('>I', decompressed_size) record_block = lzo.decompress(header + record_block_compressed[8:]) elif record_block_type == b'\x02\x00\x00\x00': # decompress record_block = zlib.decompress(record_block_compressed[8:]) # notice that adler32 return signed value assert(adler32 == zlib.adler32(record_block) & 0xffffffff) assert(len(record_block) == decompressed_size) # split record block according to the offset info from key block while i < len(self._key_list): record_start, key_text = self._key_list[i] # reach the end of current record block if record_start - offset >= len(record_block): break # record end index if i < len(self._key_list)-1: record_end = self._key_list[i+1][0] else: record_end = len(record_block) + offset i += 1 data = record_block[record_start-offset:record_end-offset] yield key_text, data offset += len(record_block) size_counter += compressed_size assert(size_counter == record_block_size) f.close() class MDX(MDict): """ MDict dictionary file format (*.MDD) reader. >>> mdx = MDX('example.mdx') >>> len(mdx) 42481 >>> for key,value in mdx.items(): ... 
print key, value[:10] """ def __init__(self, fname, encoding='', substyle=False, passcode=None): MDict.__init__(self, fname, encoding, passcode) self._substyle = substyle def items(self): """Return a generator which in turn produce tuples in the form of (key, value) """ return self._decode_record_block() def _substitute_stylesheet(self, txt): # substitute stylesheet definition txt_list = re.split(r'`\d+`', txt) txt_tag = re.findall(r'`\d+`', txt) txt_styled = txt_list[0] for j, p in enumerate(txt_list[1:]): key = txt_tag[j][1:-1] try: style = self._stylesheet[key] except KeyError: log.error('invalid stylesheet key "%s"'%key) continue if p and p[-1] == '\n': txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n' else: txt_styled = txt_styled + style[0] + p + style[1] return txt_styled def _decode_record_block(self): f = open(self._fname, 'rb') f.seek(self._record_block_offset) num_record_blocks = self._read_number(f) num_entries = self._read_number(f) assert(num_entries == self._num_entries) record_block_info_size = self._read_number(f) record_block_size = self._read_number(f) # record block info section record_block_info_list = [] size_counter = 0 for i in range(num_record_blocks): compressed_size = self._read_number(f) decompressed_size = self._read_number(f) record_block_info_list += [(compressed_size, decompressed_size)] size_counter += self._number_width * 2 assert(size_counter == record_block_info_size) # actual record block data offset = 0 i = 0 size_counter = 0 for compressed_size, decompressed_size in record_block_info_list: record_block_compressed = f.read(compressed_size) # 4 bytes indicates block compression type record_block_type = record_block_compressed[:4] # 4 bytes adler checksum of uncompressed content adler32 = unpack('>I', record_block_compressed[4:8])[0] # no compression if record_block_type == b'\x00\x00\x00\x00': record_block = record_block_compressed[8:] # lzo compression elif record_block_type == b'\x01\x00\x00\x00': if lzo is None: print("LZO compression is not supported") break # decompress header = b'\xf0' + pack('>I', decompressed_size) record_block = lzo.decompress(header + record_block_compressed[8:]) # zlib compression elif record_block_type == b'\x02\x00\x00\x00': # decompress record_block = zlib.decompress(record_block_compressed[8:]) # notice that adler32 return signed value assert(adler32 == zlib.adler32(record_block) & 0xffffffff) assert(len(record_block) == decompressed_size) # split record block according to the offset info from key block while i < len(self._key_list): record_start, key_text = self._key_list[i] # reach the end of current record block if record_start - offset >= len(record_block): break # record end index if i < len(self._key_list)-1: record_end = self._key_list[i+1][0] else: record_end = len(record_block) + offset i += 1 record = record_block[record_start-offset:record_end-offset] # convert to utf-8 record = record.decode(self._encoding, errors='ignore').strip(unicode('\x00')).encode('utf-8') # substitute styles if self._substyle and self._stylesheet: record = self._substitute_stylesheet(record) yield key_text, record offset += len(record_block) size_counter += compressed_size assert(size_counter == record_block_size) f.close() if __name__ == '__main__': import sys import os import os.path import argparse import codecs def passcode(s): try: regcode, userid = s.split(',') except: raise argparse.ArgumentTypeError("Passcode must be regcode,userid") try: regcode = codecs.decode(regcode, 'hex') except: raise 
argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string") return regcode, userid parser = argparse.ArgumentParser() parser.add_argument('-x', '--extract', action="store_true", help='extract mdx to source format and extract files from mdd') parser.add_argument('-s', '--substyle', action="store_true", help='substitute style definition if present') parser.add_argument('-d', '--datafolder', default="data", help='folder to extract data files from mdd') parser.add_argument('-e', '--encoding', default="", help='folder to extract data files from mdd') parser.add_argument('-p', '--passcode', default=None, type=passcode, help='register_code,email_or_deviceid') parser.add_argument("filename", nargs='?', help="mdx file name") args = parser.parse_args() # use GUI to select file, default to extract if not args.filename: if sys.hexversion >= 0x03000000: import tkinter as tk import tkinter.filedialog as filedialog else: import Tkinter as tk import tkFileDialog as filedialog root = tk.Tk() root.withdraw() args.filename = filedialog.askopenfilename(parent=root) args.extract = True if not os.path.exists(args.filename): print("Please specify a valid MDX/MDD file") base, ext = os.path.splitext(args.filename) # read mdx file if ext.lower() == os.path.extsep + 'mdx': mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode) if type(args.filename) is unicode: bfname = args.filename.encode('utf-8') else: bfname = args.filename print('======== %s ========' % bfname) print(' Number of Entries : %d' % len(mdx)) for key, value in mdx.header.items(): print(' %s : %s' % (key, value)) else: mdx = None # find companion mdd file mdd_filename = ''.join([base, os.path.extsep, 'mdd']) if os.path.exists(mdd_filename): mdd = MDD(mdd_filename, args.passcode) if type(mdd_filename) is unicode: bfname = mdd_filename.encode('utf-8') else: bfname = mdd_filename print('======== %s ========' % bfname) print(' Number of Entries : %d' % len(mdd)) for key, value in mdd.header.items(): print(' %s : %s' % (key, value)) else: mdd = None if args.extract: # write out glos if mdx: output_fname = ''.join([base, os.path.extsep, 'txt']) tf = open(output_fname, 'wb') for key, value in mdx.items(): tf.write(key) tf.write(b'\r\n') tf.write(value) if not value.endswith(b'\n'): tf.write(b'\r\n') tf.write(b'\r\n') tf.close() # write out style if mdx.header.get('StyleSheet'): style_fname = ''.join([base, '_style', os.path.extsep, 'txt']) sf = open(style_fname, 'wb') sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines())) sf.close() # write out optional data files if mdd: datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder) if not os.path.exists(datafolder): os.makedirs(datafolder) for key, value in mdd.items(): fname = key.decode('utf-8').replace('\\', os.path.sep) dfname = datafolder + fname if not os.path.exists(os.path.dirname(dfname)): os.makedirs(os.path.dirname(dfname)) df = open(dfname, 'wb') df.write(value) df.close() pyglossary-4.5.0/pyglossary/plugin_lib/ripemd128.py000066400000000000000000000067761417733132500224070ustar00rootroot00000000000000""" Copyright by https://github.com/zhansliu/writemdict ripemd128.py - A simple ripemd128 library in pure Python. Supports both Python 2 (versions >= 2.6) and Python 3. 
Usage: from ripemd128 import ripemd128 digest = ripemd128(b"The quick brown fox jumps over the lazy dog") assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96") """ import struct # follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt def f(j, x, y, z): assert(0 <= j and j < 64) if j < 16: return x ^ y ^ z elif j < 32: return (x & y) | (z & ~x) elif j < 48: return (x | (0xffffffff & ~y)) ^ z else: return (x & z) | (y & ~z) def K(j): assert(0 <= j and j < 64) if j < 16: return 0x00000000 elif j < 32: return 0x5a827999 elif j < 48: return 0x6ed9eba1 else: return 0x8f1bbcdc def Kp(j): assert(0 <= j and j < 64) if j < 16: return 0x50a28be6 elif j < 32: return 0x5c4dd124 elif j < 48: return 0x6d703ef3 else: return 0x00000000 def padandsplit(message): """ returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges from 0 to 16. First pads the message to length in bytes is congruent to 56 (mod 64), by first adding a byte 0x80, and then padding with 0x00 bytes until the message length is congruent to 56 (mod 64). Then adds the little-endian 64-bit representation of the original length. Finally, splits the result up into 64-byte blocks, which are further parsed as 32-bit integers. """ origlen = len(message) padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1! message += b"\x80" message += b"\x00" * (padlength - 1) message += struct.pack("> (32-s)) & 0xffffffff r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8, 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12, 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2] rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12, 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2, 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13, 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14] s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8, 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12, 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5, 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12] sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6, 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11, 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5, 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8] def ripemd128(message): h0 = 0x67452301 h1 = 0xefcdab89 h2 = 0x98badcfe h3 = 0x10325476 X = padandsplit(message) for i in range(len(X)): (A,B,C,D) = (h0,h1,h2,h3) (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3) for j in range(64): T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j))) (A,D,C,B) = (D,C,B,T) T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j))) (Ap,Dp,Cp,Bp)=(Dp,Cp,Bp,T) T = add(h1,C,Dp) h1 = add(h2,D,Ap) h2 = add(h3,A,Bp) h3 = add(h0,B,Cp) h0 = T return struct.pack(" pyglossary-4.5.0/pyglossary/plugin_lib/slob.py000066400000000000000000000712431417733132500216220ustar00rootroot00000000000000# pylint: disable=C0111,C0103,C0302,R0903,R0904,R0914,R0201 import encodings import functools import io import os import pickle import sys import tempfile import warnings from abc import abstractmethod from bisect import bisect_left from builtins import open as fopen from collections import namedtuple from collections.abc import Sequence from datetime import datetime, timezone from functools import lru_cache from struct import pack, unpack, calcsize from threading import RLock from types import MappingProxyType from uuid import uuid4, UUID import icu from icu import Locale, Collator, UCollAttribute, UCollAttributeValue DEFAULT_COMPRESSION = 'lzma2' UTF8 = 'utf-8' MAGIC = b'!-1SLOB\x1F' 
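# Every slob file starts with these eight magic bytes; read_header() below
# raises UnknownFileFormat when they are missing.  A minimal read-side
# sketch (the path "test.slob" is hypothetical):
#
#     with open('test.slob') as r:  # `open` as defined in this module
#         for blob in r:
#             print(blob.key, blob.content_type)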
Compression = namedtuple('Compression', 'compress decompress') Ref = namedtuple('Ref', 'key bin_index item_index fragment') Header = namedtuple( 'Header', 'magic uuid encoding ' 'compression tags content_types ' 'blob_count ' 'store_offset ' 'refs_offset ' 'size', ) U_CHAR = '>B' U_CHAR_SIZE = calcsize(U_CHAR) U_SHORT = '>H' U_SHORT_SIZE = calcsize(U_SHORT) U_INT = '>I' U_INT_SIZE = calcsize(U_INT) U_LONG_LONG = '>Q' U_LONG_LONG_SIZE = calcsize(U_LONG_LONG) def calcmax(len_size_spec): return 2 ** (calcsize(len_size_spec) * 8) - 1 MAX_TEXT_LEN = calcmax(U_SHORT) MAX_TINY_TEXT_LEN = calcmax(U_CHAR) MAX_LARGE_BYTE_STRING_LEN = calcmax(U_INT) MAX_BIN_ITEM_COUNT = calcmax(U_SHORT) PRIMARY = Collator.PRIMARY SECONDARY = Collator.SECONDARY TERTIARY = Collator.TERTIARY QUATERNARY = Collator.QUATERNARY IDENTICAL = Collator.IDENTICAL def init_compressions(): def ident(x): return x compressions = {'': Compression(ident, ident)} for name in ('bz2', 'zlib'): try: m = __import__(name) except ImportError: warnings.warn('%s is not available' % name) else: compressions[name] = Compression( lambda x: m.compress(x, 9), m.decompress) try: import lzma except ImportError: warnings.warn('lzma is not available') else: filters = [{'id': lzma.FILTER_LZMA2}] compressions['lzma2'] = Compression( lambda s: lzma.compress( s, format=lzma.FORMAT_RAW, filters=filters, ), lambda s: lzma.decompress( s, format=lzma.FORMAT_RAW, filters=filters, ), ) return compressions COMPRESSIONS = init_compressions() del init_compressions MIME_TEXT = 'text/plain' MIME_HTML = 'text/html' MIME_CSS = 'text/css' MIME_JS = 'application/javascript' MIME_TYPES = { "html": MIME_HTML, "txt": MIME_TEXT, "js": MIME_JS, "css": MIME_CSS, "json": "application/json", "woff": "application/font-woff", "svg": "image/svg+xml", "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "gif": "image/gif", "ttf": "application/x-font-ttf", "otf": "application/x-font-opentype" } class FileFormatException(Exception): pass class UnknownFileFormat(FileFormatException): pass class UnknownCompression(FileFormatException): pass class UnknownEncoding(FileFormatException): pass class IncorrectFileSize(FileFormatException): pass class TagNotFound(Exception): pass @lru_cache(maxsize=None) def sortkey(strength, maxlength=None): c = Collator.createInstance(Locale('')) c.setStrength(strength) c.setAttribute( UCollAttribute.ALTERNATE_HANDLING, UCollAttributeValue.SHIFTED, ) if maxlength is None: return c.getSortKey else: return lambda x: c.getSortKey(x)[:maxlength] class MultiFileReader(io.BufferedIOBase): def __init__(self, *args): filenames = [] for arg in args: if isinstance(arg, str): filenames.append(arg) else: for name in arg: filenames.append(name) files = [] ranges = [] offset = 0 for name in filenames: size = os.stat(name).st_size ranges.append(range(offset, offset + size)) files.append(fopen(name, 'rb')) offset += size self.size = offset self._ranges = ranges self._files = files self._fcount = len(self._files) self._offset = -1 self.seek(0) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() return False def close(self): for f in self._files: f.close() self._files.clear() self._ranges.clear() def closed(self): return len(self._ranges) == 0 def isatty(self): return False def readable(self): return True def seek(self, offset, whence=io.SEEK_SET): if whence == io.SEEK_SET: self._offset = offset elif whence == io.SEEK_CUR: self._offset = self._offset + offset elif whence == io.SEEK_END: self._offset = self.size + offset else: 
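# any whence other than SEEK_SET, SEEK_CUR or SEEK_END is rejected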
raise ValueError('Invalid value for parameter whence: %r' % whence) return self._offset def seekable(self): return True def tell(self): return self._offset def writable(self): return False def read(self, n=-1): file_index = -1 actual_offset = 0 for i, r in enumerate(self._ranges): if self._offset in r: file_index = i actual_offset = self._offset - r.start break result = b'' if (n == -1 or n is None): to_read = self.size else: to_read = n while -1 < file_index < self._fcount: f = self._files[file_index] f.seek(actual_offset) read = f.read(to_read) read_count = len(read) self._offset += read_count result += read to_read -= read_count if to_read > 0: file_index += 1 actual_offset = 0 else: break return result class CollationKeyList(object): def __init__(self, lst, sortkey_): self.lst = lst self.sortkey = sortkey_ def __len__(self): return len(self.lst) def __getitem__(self, i): return self.sortkey(self.lst[i].key) class KeydItemDict(object): def __init__(self, lst, strength, maxlength=None): self.lst = lst self.sortkey = sortkey(strength, maxlength=maxlength) self.sortkeylist = CollationKeyList(lst, self.sortkey) def __len__(self): return len(self.lst) def __getitem__(self, key): key_as_sk = self.sortkey(key) i = bisect_left(self.sortkeylist, key_as_sk) if i != len(self.lst): while i < len(self.lst): if (self.sortkey(self.lst[i].key) == key_as_sk): yield self.lst[i] else: break i += 1 def __contains__(self, key): try: next(self[key]) except StopIteration: return False else: return True class Blob(object): def __init__( self, content_id, key, fragment, read_content_type_func, read_func, ): self._content_id = content_id self._key = key self._fragment = fragment self._read_content_type = read_content_type_func self._read = read_func @property def id(self): return self._content_id @property def key(self): return self._key @property def fragment(self): return self._fragment @property def content_type(self): return self._read_content_type() @property def content(self): return self._read() def __str__(self): return self.key def __repr__(self): return f'<{self.__class__.__module__}.{self.__class__.__name__} {self.key}>' def read_byte_string(f, len_spec): length = unpack(len_spec, f.read(calcsize(len_spec)))[0] return f.read(length) class StructReader: def __init__(self, file_, encoding=None): self._file = file_ self.encoding = encoding def read_int(self): s = self.read(U_INT_SIZE) return unpack(U_INT, s)[0] def read_long(self): b = self.read(U_LONG_LONG_SIZE) return unpack(U_LONG_LONG, b)[0] def read_byte(self): s = self.read(U_CHAR_SIZE) return unpack(U_CHAR, s)[0] def read_short(self): return unpack(U_SHORT, self._file.read(U_SHORT_SIZE))[0] def _read_text(self, len_spec): max_len = 2 ** (8 * calcsize(len_spec)) - 1 byte_string = read_byte_string(self._file, len_spec) if len(byte_string) == max_len: terminator = byte_string.find(0) if terminator > -1: byte_string = byte_string[:terminator] return byte_string.decode(self.encoding) def read_tiny_text(self): return self._read_text(U_CHAR) def read_text(self): return self._read_text(U_SHORT) def __getattr__(self, name): return getattr(self._file, name) class StructWriter: def __init__(self, file_, encoding=None): self._file = file_ self.encoding = encoding def write_int(self, value): self._file.write(pack(U_INT, value)) def write_long(self, value): self._file.write(pack(U_LONG_LONG, value)) def write_byte(self, value): self._file.write(pack(U_CHAR, value)) def write_short(self, value): self._file.write(pack(U_SHORT, value)) def _write_text(self, text, 
len_size_spec, encoding=None, pad_to_length=None): if encoding is None: encoding = self.encoding text_bytes = text.encode(encoding) length = len(text_bytes) max_length = calcmax(len_size_spec) if length > max_length: raise ValueError("Text is too long for size spec %s" % len_size_spec) self._file.write(pack( len_size_spec, pad_to_length if pad_to_length else length )) self._file.write(text_bytes) if pad_to_length: for _ in range(pad_to_length - length): self._file.write(pack(U_CHAR, 0)) def write_tiny_text(self, text, encoding=None, editable=False): pad_to_length = 255 if editable else None self._write_text( text, U_CHAR, encoding=encoding, pad_to_length=pad_to_length, ) def write_text(self, text, encoding=None): self._write_text(text, U_SHORT, encoding=encoding) def __getattr__(self, name): return getattr(self._file, name) def set_tag_value(filename, name, value): with fopen(filename, 'rb+') as f: f.seek(len(MAGIC) + 16) encoding = read_byte_string(f, U_CHAR).decode(UTF8) if encodings.search_function(encoding) is None: raise UnknownEncoding(encoding) f = StructWriter( StructReader(f, encoding=encoding), encoding=encoding) f.read_tiny_text() tag_count = f.read_byte() for _ in range(tag_count): key = f.read_tiny_text() if key == name: f.write_tiny_text(value, editable=True) return f.read_tiny_text() raise TagNotFound(name) def read_header(f): f.seek(0) magic = f.read(len(MAGIC)) if (magic != MAGIC): raise UnknownFileFormat() uuid = UUID(bytes=f.read(16)) encoding = read_byte_string(f, U_CHAR).decode(UTF8) if encodings.search_function(encoding) is None: raise UnknownEncoding(encoding) f = StructReader(f, encoding) compression = f.read_tiny_text() if compression not in COMPRESSIONS: raise UnknownCompression(compression) def read_tags(): tags = {} count = f.read_byte() for _ in range(count): key = f.read_tiny_text() value = f.read_tiny_text() tags[key] = value return tags tags = read_tags() def read_content_types(): content_types = [] count = f.read_byte() for _ in range(count): content_type = f.read_text() content_types.append(content_type) return tuple(content_types) content_types = read_content_types() blob_count = f.read_int() store_offset = f.read_long() size = f.read_long() refs_offset = f.tell() return Header( magic=magic, uuid=uuid, encoding=encoding, compression=compression, tags=MappingProxyType(tags), content_types=content_types, blob_count=blob_count, store_offset=store_offset, refs_offset=refs_offset, size=size, ) def meld_ints(a, b): return (a << 16) | b def unmeld_ints(c): bstr = bin(c).lstrip("0b").zfill(48) a, b = bstr[-48:-16], bstr[-16:] return int(a, 2), int(b, 2) class Slob(Sequence): def __init__(self, file_or_filenames): self._f = MultiFileReader(file_or_filenames) try: self._header = read_header(self._f) if (self._f.size != self._header.size): raise IncorrectFileSize( 'File size should be {0}, {1} bytes found' .format(self._header.size, self._f.size)) except FileFormatException: self._f.close() raise self._refs = RefList( self._f, self._header.encoding, offset=self._header.refs_offset, ) self._g = MultiFileReader(file_or_filenames) self._store = Store( self._g, self._header.store_offset, COMPRESSIONS[self._header.compression].decompress, self._header.content_types, ) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() return False @property def id(self): return self._header.uuid.hex @property def content_types(self): return self._header.content_types @property def tags(self): return self._header.tags @property def 
blob_count(self): return self._header.blob_count @property def compression(self): return self._header.compression @property def encoding(self): return self._header.encoding def __len__(self): return len(self._refs) def __getitem__(self, i): ref = self._refs[i] def read_func(): return self._store.get(ref.bin_index, ref.item_index)[1] read_func = lru_cache(maxsize=None)(read_func) def read_content_type_func(): return self._store.content_type(ref.bin_index, ref.item_index) content_id = meld_ints(ref.bin_index, ref.item_index) return Blob( content_id, ref.key, ref.fragment, read_content_type_func, read_func, ) def get(self, blob_id): bin_index, bin_item_index = unmeld_ints(blob_id) return self._store.get(bin_index, bin_item_index) @lru_cache(maxsize=None) def as_dict( self, strength=TERTIARY, maxlength=None, ): return KeydItemDict(self, strength, maxlength=maxlength) def close(self): self._f.close() self._g.close() def find_parts(fname): fname = os.path.expanduser(fname) dirname = os.path.dirname(fname) or os.getcwd() basename = os.path.basename(fname) candidates = [] for name in os.listdir(dirname): if name.startswith(basename): candidates.append(os.path.join(dirname, name)) return sorted(candidates) def open(file_or_filenames): if isinstance(file_or_filenames, str): if not os.path.exists(file_or_filenames): file_or_filenames = find_parts(file_or_filenames) return Slob(file_or_filenames) class BinMemWriter: def __init__(self): self.content_type_ids = [] self.item_dir = [] self.items = [] self.current_offset = 0 def add(self, content_type_id, blob): self.content_type_ids.append(content_type_id) self.item_dir.append(pack(U_INT, self.current_offset)) length_and_bytes = pack(U_INT, len(blob)) + blob self.items.append(length_and_bytes) self.current_offset += len(length_and_bytes) def __len__(self): return len(self.item_dir) def finalize(self, fout: 'output file', compress: 'function'): count = len(self) fout.write(pack(U_INT, count)) for content_type_id in self.content_type_ids: fout.write(pack(U_CHAR, content_type_id)) content = b''.join(self.item_dir + self.items) compressed = compress(content) fout.write(pack(U_INT, len(compressed))) fout.write(compressed) self.content_type_ids.clear() self.item_dir.clear() self.items.clear() class ItemList(Sequence): def __init__( self, file_, offset, count_or_spec, pos_spec, cache_size=None, ): self.lock = RLock() self._file = file_ file_.seek(offset) if isinstance(count_or_spec, str): count_spec = count_or_spec self.count = unpack(count_spec, file_.read(calcsize(count_spec)))[0] else: self.count = count_or_spec self.pos_offset = file_.tell() self.pos_spec = pos_spec self.pos_size = calcsize(pos_spec) self.data_offset = self.pos_offset + self.pos_size * self.count if cache_size: self.__getitem__ = lru_cache(maxsize=cache_size)(self.__getitem__) def __len__(self): return self.count def pos(self, i): with self.lock: self._file.seek(self.pos_offset + self.pos_size * i) return unpack(self.pos_spec, self._file.read(self.pos_size))[0] def read(self, pos): with self.lock: self._file.seek(self.data_offset + pos) return self._read_item() @abstractmethod def _read_item(self): pass def __getitem__(self, i): if i >= len(self) or i < 0: raise IndexError('index out of range') return self.read(self.pos(i)) class RefList(ItemList): def __init__(self, f, encoding, offset=0, count=None): super().__init__( StructReader(f, encoding), offset, U_INT if count is None else count, U_LONG_LONG, cache_size=512, ) def _read_item(self): key = self._file.read_text() bin_index = 
self._file.read_int() item_index = self._file.read_short() fragment = self._file.read_tiny_text() return Ref( key=key, bin_index=bin_index, item_index=item_index, fragment=fragment, ) @lru_cache(maxsize=None) def as_dict( self, strength=TERTIARY, maxlength=None, ): return KeydItemDict(self, strength, maxlength=maxlength) class Bin(ItemList): def __init__(self, count, bin_bytes): super().__init__( StructReader(io.BytesIO(bin_bytes)), 0, count, U_INT, ) def _read_item(self): content_len = self._file.read_int() content = self._file.read(content_len) return content StoreItem = namedtuple('StoreItem', 'content_type_ids compressed_content') class Store(ItemList): def __init__(self, file_, offset, decompress, content_types): super().__init__( StructReader(file_), offset, U_INT, U_LONG_LONG, cache_size=32, ) self.decompress = decompress self.content_types = content_types def _read_item(self): bin_item_count = self._file.read_int() packed_content_type_ids = self._file.read(bin_item_count * U_CHAR_SIZE) content_type_ids = [] for i in range(bin_item_count): content_type_id = unpack(U_CHAR, packed_content_type_ids[i:i + 1])[0] content_type_ids.append(content_type_id) content_length = self._file.read_int() content = self._file.read(content_length) return StoreItem( content_type_ids=content_type_ids, compressed_content=content, ) def _content_type(self, bin_index, item_index): store_item = self[bin_index] content_type_id = store_item.content_type_ids[item_index] content_type = self.content_types[content_type_id] return content_type, store_item def content_type(self, bin_index, item_index): return self._content_type(bin_index, item_index)[0] @lru_cache(maxsize=16) def _decompress(self, bin_index): store_item = self[bin_index] return self.decompress(store_item.compressed_content) def get(self, bin_index, item_index): content_type, store_item = self._content_type(bin_index, item_index) content = self._decompress(bin_index) count = len(store_item.content_type_ids) store_bin = Bin(count, content) content = store_bin[item_index] return (content_type, content) WriterEvent = namedtuple('WriterEvent', 'name data') class KeyTooLongException(Exception): @property def key(self): return self.args[0] class Writer(object): def __init__( self, filename, workdir=None, encoding=UTF8, compression=DEFAULT_COMPRESSION, min_bin_size=512 * 1024, max_redirects=5, observer=None, ): self.filename = filename self.observer = observer if os.path.exists(self.filename): raise SystemExit('File %r already exists' % self.filename) # make sure we can write with fopen(self.filename, 'wb'): pass self.encoding = encoding if encodings.search_function(self.encoding) is None: raise UnknownEncoding(self.encoding) self.workdir = workdir self.tmpdir = tmpdir = tempfile.TemporaryDirectory( prefix='{0}-'.format(os.path.basename(filename)), dir=workdir) self.f_ref_positions = self._wbfopen('ref-positions') self.f_store_positions = self._wbfopen('store-positions') self.f_refs = self._wbfopen('refs') self.f_store = self._wbfopen('store') self.max_redirects = max_redirects if max_redirects: self.aliases_path = os.path.join(tmpdir.name, 'aliases') self.f_aliases = Writer( self.aliases_path, workdir=tmpdir.name, max_redirects=0, compression=None, ) if compression is None: compression = '' if compression not in COMPRESSIONS: raise UnknownCompression(compression) else: self.compress = COMPRESSIONS[compression].compress self.compression = compression self.content_types = {} self.min_bin_size = min_bin_size self.current_bin = None self.blob_count = 0 
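# running totals: blob_count grows in add(), ref_count in _write_ref()
# and bin_count whenever a new bin is started; finalize() writes all
# three into the output file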
self.ref_count = 0 self.bin_count = 0 self._tags = { 'version.python': sys.version.replace('\n', ' '), 'version.pyicu': icu.VERSION, 'version.icu': icu.ICU_VERSION, 'created.at': datetime.now(timezone.utc).isoformat() } self.tags = MappingProxyType(self._tags) def _wbfopen(self, name): return StructWriter( fopen(os.path.join(self.tmpdir.name, name), 'wb'), encoding=self.encoding) def tag(self, name, value=''): if len(name.encode(self.encoding)) > MAX_TINY_TEXT_LEN: self._fire_event('tag_name_too_long', (name, value)) return if len(value.encode(self.encoding)) > MAX_TINY_TEXT_LEN: self._fire_event('tag_value_too_long', (name, value)) value = '' self._tags[name] = value def _split_key(self, key): if isinstance(key, str): actual_key = key fragment = '' else: actual_key, fragment = key if len(actual_key) > MAX_TEXT_LEN or len(fragment) > MAX_TINY_TEXT_LEN: raise KeyTooLongException(key) return actual_key, fragment def add(self, blob, *keys, content_type=''): if len(blob) > MAX_LARGE_BYTE_STRING_LEN: self._fire_event('content_too_long', blob) return if len(content_type) > MAX_TEXT_LEN: self._fire_event('content_type_too_long', content_type) return actual_keys = [] for key in keys: try: actual_key, fragment = self._split_key(key) except KeyTooLongException as e: self._fire_event('key_too_long', e.key) else: actual_keys.append((actual_key, fragment)) if len(actual_keys) == 0: return if self.current_bin is None: self.current_bin = BinMemWriter() self.bin_count += 1 if content_type not in self.content_types: self.content_types[content_type] = len(self.content_types) self.current_bin.add(self.content_types[content_type], blob) self.blob_count += 1 bin_item_index = len(self.current_bin) - 1 bin_index = self.bin_count - 1 for actual_key, fragment in actual_keys: self._write_ref(actual_key, bin_index, bin_item_index, fragment) if ( self.current_bin.current_offset > self.min_bin_size or len(self.current_bin) == MAX_BIN_ITEM_COUNT ): self._write_current_bin() def add_alias(self, key, target_key): if self.max_redirects: try: self._split_key(key) except KeyTooLongException as e: self._fire_event('alias_too_long', e.key) return try: self._split_key(target_key) except KeyTooLongException as e: self._fire_event('alias_target_too_long', e.key) return self.f_aliases.add(pickle.dumps(target_key), key) else: raise NotImplementedError() def _fire_event(self, name, data=None): if self.observer: self.observer(WriterEvent(name, data)) def _write_current_bin(self): self.f_store_positions.write_long(self.f_store.tell()) self.current_bin.finalize(self.f_store, self.compress) self.current_bin = None def _write_ref(self, key, bin_index, item_index, fragment=''): self.f_ref_positions.write_long(self.f_refs.tell()) self.f_refs.write_text(key) self.f_refs.write_int(bin_index) self.f_refs.write_short(item_index) self.f_refs.write_tiny_text(fragment) self.ref_count += 1 def _sort(self): self._fire_event('begin_sort') f_ref_positions_sorted = self._wbfopen('ref-positions-sorted') self.f_refs.flush() self.f_ref_positions.close() with MultiFileReader(self.f_ref_positions.name, self.f_refs.name) as f: ref_list = RefList(f, self.encoding, count=self.ref_count) sortkey_func = sortkey(IDENTICAL) for i in sorted( range(len(ref_list)), key=lambda j: sortkey_func(ref_list[j].key) ): ref_pos = ref_list.pos(i) f_ref_positions_sorted.write_long(ref_pos) f_ref_positions_sorted.close() os.remove(self.f_ref_positions.name) os.rename(f_ref_positions_sorted.name, self.f_ref_positions.name) self.f_ref_positions = StructWriter( 
fopen(self.f_ref_positions.name, 'ab'), encoding=self.encoding) self._fire_event('end_sort') def _resolve_aliases(self): self._fire_event('begin_resolve_aliases') self.f_aliases.finalize() with MultiFileReader(self.f_ref_positions.name, self.f_refs.name) as f_ref_list: ref_list = RefList(f_ref_list, self.encoding, count=self.ref_count) ref_dict = ref_list.as_dict() with Slob(self.aliases_path) as r: aliases = r.as_dict() path = os.path.join(self.tmpdir.name, 'resolved-aliases') with Writer( path, workdir=self.tmpdir.name, max_redirects=0, compression=None, ) as alias_writer: def read_key_frag(item, default_fragment): key_frag = pickle.loads(item.content) if isinstance(key_frag, str): return key_frag, default_fragment else: return key_frag for item in r: from_key = item.key keys = set() keys.add(from_key) to_key, fragment = read_key_frag(item, item.fragment) count = 0 while count <= self.max_redirects: # is target key itself a redirect? try: orig_to_key = to_key to_key, fragment = read_key_frag( next(aliases[to_key]), fragment) count += 1 keys.add(orig_to_key) except StopIteration: break if count > self.max_redirects: self._fire_event('too_many_redirects', from_key) try: target_ref = next(ref_dict[to_key]) except StopIteration: self._fire_event('alias_target_not_found', to_key) else: for key in keys: ref = Ref( key=key, bin_index=target_ref.bin_index, item_index=target_ref.item_index, # last fragment in the chain wins fragment=target_ref.fragment or fragment, ) alias_writer.add(pickle.dumps(ref), key) with Slob(path) as resolved_aliases_reader: previous_key = None for item in resolved_aliases_reader: ref = pickle.loads(item.content) if ref.key == previous_key: continue self._write_ref( ref.key, ref.bin_index, ref.item_index, ref.fragment, ) previous_key = ref.key self._sort() self._fire_event('end_resolve_aliases') def finalize(self): self._fire_event('begin_finalize') if self.current_bin is not None: self._write_current_bin() self._sort() if self.max_redirects: self._resolve_aliases() files = ( self.f_ref_positions, self.f_refs, self.f_store_positions, self.f_store, ) for f in files: f.close() buf_size = 10 * 1024 * 1024 with fopen(self.filename, mode='wb') as output_file: out = StructWriter(output_file, self.encoding) out.write(MAGIC) out.write(uuid4().bytes) out.write_tiny_text(self.encoding, encoding=UTF8) out.write_tiny_text(self.compression) def write_tags(tags, f): f.write(pack(U_CHAR, len(tags))) for key, value in tags.items(): f.write_tiny_text(key) f.write_tiny_text(value, editable=True) write_tags(self.tags, out) def write_content_types(content_types, f): count = len(content_types) f.write(pack(U_CHAR, count)) types = sorted(content_types.items(), key=lambda x: x[1]) for content_type, _ in types: f.write_text(content_type) write_content_types(self.content_types, out) out.write_int(self.blob_count) store_offset = ( out.tell() + U_LONG_LONG_SIZE + # this value U_LONG_LONG_SIZE + # file size value U_INT_SIZE + # ref count value os.stat(self.f_ref_positions.name).st_size + os.stat(self.f_refs.name).st_size ) out.write_long(store_offset) out.flush() file_size = ( out.tell() + # bytes written so far U_LONG_LONG_SIZE + # file size value 2 * U_INT_SIZE # ref count and bin count ) file_size += sum((os.stat(f.name).st_size for f in files)) out.write_long(file_size) def mv(src, out): fname = src.name self._fire_event('begin_move', fname) with fopen(fname, mode='rb') as f: while True: data = f.read(buf_size) if len(data) == 0: break out.write(data) out.flush() os.remove(fname) 
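# each temporary part file is deleted as soon as its bytes have been
# appended to the final output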
self._fire_event('end_move', fname) out.write_int(self.ref_count) mv(self.f_ref_positions, out) mv(self.f_refs, out) out.write_int(self.bin_count) mv(self.f_store_positions, out) mv(self.f_store, out) self.tmpdir.cleanup() self._fire_event('end_finalize') def size_header(self): size = 0 size += len(MAGIC) size += 16 # uuid bytes size += U_CHAR_SIZE + len(self.encoding.encode(UTF8)) size += U_CHAR_SIZE + len(self.compression.encode(self.encoding)) size += U_CHAR_SIZE # tag length size += U_CHAR_SIZE # content types count # tags and content types themselves counted elsewhere size += U_INT_SIZE # blob count size += U_LONG_LONG_SIZE # store offset size += U_LONG_LONG_SIZE # file size size += U_INT_SIZE # ref count size += U_INT_SIZE # bin count return size def size_tags(self): size = 0 for key, _ in self.tags.items(): size += U_CHAR_SIZE + len(key.encode(self.encoding)) size += 255 return size def size_content_types(self): size = 0 for content_type in self.content_types: size += U_CHAR_SIZE + len(content_type.encode(self.encoding)) return size def size_data(self): files = ( self.f_ref_positions, self.f_refs, self.f_store_positions, self.f_store, ) return sum((os.stat(f.name).st_size for f in files)) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.finalize() return False pyglossary-4.5.0/pyglossary/plugin_lib/slob_extra.py000066400000000000000000000017241417733132500230220ustar00rootroot00000000000000 from pyglossary.plugin_lib.slob import * def sortkey_length(strength, word): c = Collator.createInstance(Locale('')) c.setStrength(strength) c.setAttribute( UCollAttribute.ALTERNATE_HANDLING, UCollAttributeValue.SHIFTED, ) coll_key = c.getSortKey(word) return len(coll_key) - 1 # subtract 1 for ending \x00 byte def find(word, slobs, match_prefix=True): seen = set() if isinstance(slobs, Slob): slobs = [slobs] variants = [] for strength in (QUATERNARY, TERTIARY, SECONDARY, PRIMARY): variants.append((strength, None)) if match_prefix: for strength in (QUATERNARY, TERTIARY, SECONDARY, PRIMARY): variants.append((strength, sortkey_length(strength, word))) for strength, maxlength in variants: for slob in slobs: d = slob.as_dict(strength=strength, maxlength=maxlength) for item in d[word]: dedup_key = (slob.id, item.id, item.fragment) if dedup_key in seen: continue else: seen.add(dedup_key) yield slob, item pyglossary-4.5.0/pyglossary/plugin_lib/slob_extra_test.py000066400000000000000000000026561417733132500240660ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import os from os.path import dirname, abspath import unittest import logging import tempfile rootDir = dirname(dirname(dirname(abspath(__file__)))) sys.path.insert(0, rootDir) from pyglossary.plugin_lib.slob_extra import * from pyglossary.plugin_lib.slob_test import BaseTest class TestBestMatch(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path1 = os.path.join(self.tmpdir.name, 'test1.slob') self.path2 = os.path.join(self.tmpdir.name, 'test2.slob') data1 = ['aa', 'Aa', 'a-a', 'aabc', 'Äā', 'bb', 'aa'] data2 = ['aa', 'aA', 'āā', 'a,a', 'a-a', 'aade', 'Äā', 'cc'] with self.create(self.path1) as w: for key in data1: w.add(b'', key) with self.create(self.path2) as w: for key in data2: w.add(b'', key) def test_best_match(self): self.maxDiff = None with open(self.path1) as s1, open(self.path2) as s2: result = find('aa', [s1, s2], match_prefix=True) actual = list((s.id, item.key) for s, item in result) expected = [ (s1.id, 'aa'), (s1.id, 'aa'), (s2.id, 'aa'), 
(s1.id, 'a-a'), (s2.id, 'a-a'), (s2.id, 'a,a'), (s1.id, 'Aa'), (s2.id, 'aA'), (s1.id, 'Äā'), (s2.id, 'Äā'), (s2.id, 'āā'), (s1.id, 'aabc'), (s2.id, 'aade'), ] self.assertEqual(expected, actual) def tearDown(self): self.tmpdir.cleanup() if __name__ == '__main__': unittest.main() pyglossary-4.5.0/pyglossary/plugin_lib/slob_test.py000066400000000000000000000414721417733132500226620ustar00rootroot00000000000000#!/usr/bin/env python3 import sys import os from os.path import dirname, abspath import unittest import random import unicodedata import logging import tempfile rootDir = dirname(dirname(dirname(abspath(__file__)))) sys.path.insert(0, rootDir) from pyglossary.plugin_lib.slob import * from pyglossary.core_test import MockLogHandler mockLog = MockLogHandler() log = logging.getLogger("pyglossary") log.addHandler(mockLog) class BaseTest(unittest.TestCase): def _observer(self, event: "slob.WriterEvent"): log.info(f"slob: {event.name}{': ' + event.data if event.data else ''}") # self._writers = [] def create(self, *args, observer=None, **kwargs): if observer is None: observer = self._observer w = Writer(*args, observer=observer, **kwargs) # self._writers.append(w) return w class TestReadWrite(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') with self.create(self.path) as w: self.tags = { 'a': 'abc', 'bb': 'xyz123', 'ccc': 'lkjlk', } for name, value in self.tags.items(): w.tag(name, value) self.tag2 = 'bb', 'xyz123' self.blob_encoding = 'ascii' self.data = [ (('c', 'cc', 'ccc'), MIME_TEXT, 'Hello C 1'), ('a', MIME_TEXT, 'Hello A 12'), ('z', MIME_TEXT, 'Hello Z 123'), ('b', MIME_TEXT, 'Hello B 1234'), ('d', MIME_TEXT, 'Hello D 12345'), ('uuu', MIME_HTML, 'Hello U!'), ((('yy', 'frag1'),), MIME_HTML, '

    Section 1

    '), ] self.all_keys = [] self.data_as_dict = {} for k, t, v in self.data: if isinstance(k, str): k = (k,) for key in k: if isinstance(key, tuple): key, fragment = key else: fragment = '' self.all_keys.append(key) self.data_as_dict[key] = (t, v, fragment) w.add(v.encode(self.blob_encoding), *k, content_type=t) self.all_keys.sort() self.w = w def test_header(self): with MultiFileReader(self.path) as f: header = read_header(f) for key, value in self.tags.items(): self.assertEqual(header.tags[key], value) self.assertEqual(self.w.encoding, UTF8) self.assertEqual(header.encoding, self.w.encoding) self.assertEqual(header.compression, self.w.compression) for i, content_type in enumerate(header.content_types): self.assertEqual(self.w.content_types[content_type], i) self.assertEqual(header.blob_count, len(self.data)) def test_content(self): with open(self.path) as r: self.assertEqual(len(r), len(self.all_keys)) self.assertRaises(IndexError, r.__getitem__, len(self.all_keys)) for i, item in enumerate(r): self.assertEqual(item.key, self.all_keys[i]) content_type, value, fragment = self.data_as_dict[item.key] self.assertEqual( item.content_type, content_type) self.assertEqual( item.content.decode(self.blob_encoding), value) self.assertEqual( item.fragment, fragment) def tearDown(self): self.tmpdir.cleanup() class TestSort(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') with self.create(self.path) as w: data = [ 'Ф, ф', 'Ф ф', 'Ф', 'Э', 'Е е', 'г', 'н', 'ф', 'а', 'Ф, Ф', 'е', 'Е', 'Ее', 'ё', 'Ё', 'Её', 'Е ё', 'А', 'э', 'ы' ] self.data_sorted = sorted(data, key=sortkey(IDENTICAL)) for k in data: v = ';'.join(unicodedata.name(c) for c in k) w.add(v.encode('ascii'), k) self.r = open(self.path) def test_sort_order(self): for i in range(len(self.r)): self.assertEqual(self.r[i].key, self.data_sorted[i]) def tearDown(self): self.r.close() self.tmpdir.cleanup() class TestFind(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') with self.create(self.path) as w: data = [ 'Cc', 'aA', 'aa', 'Aa', 'Bb', 'cc', 'Äā', 'ăÀ', 'a\u00A0a', 'a-a', 'a\u2019a', 'a\u2032a', 'a,a', 'a a', ] for k in data: v = ';'.join(unicodedata.name(c) for c in k) w.add(v.encode('ascii'), k) self.r = open(self.path) def get(self, d, key): return list(item.content.decode('ascii') for item in d[key]) def test_find_identical(self): d = self.r.as_dict(IDENTICAL) self.assertEqual( self.get(d, 'aa'), ['LATIN SMALL LETTER A;LATIN SMALL LETTER A']) self.assertEqual( self.get(d, 'a-a'), ['LATIN SMALL LETTER A;HYPHEN-MINUS;LATIN SMALL LETTER A']) self.assertEqual( self.get(d, 'aA'), ['LATIN SMALL LETTER A;LATIN CAPITAL LETTER A']) self.assertEqual( self.get(d, 'Äā'), [ 'LATIN CAPITAL LETTER A WITH DIAERESIS;' 'LATIN SMALL LETTER A WITH MACRON', ], ) self.assertEqual( self.get(d, 'a a'), ['LATIN SMALL LETTER A;SPACE;LATIN SMALL LETTER A']) def test_find_quaternary(self): d = self.r.as_dict(QUATERNARY) self.assertEqual( self.get(d, 'a\u2032a'), ['LATIN SMALL LETTER A;PRIME;LATIN SMALL LETTER A']) self.assertEqual( self.get(d, 'a a'), [ 'LATIN SMALL LETTER A;SPACE;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;NO-BREAK SPACE;LATIN SMALL LETTER A', ], ) def test_find_tertiary(self): d = self.r.as_dict(TERTIARY) self.assertEqual( self.get(d, 'aa'), [ 'LATIN SMALL LETTER A;SPACE;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;NO-BREAK SPACE;LATIN SMALL LETTER A', 'LATIN SMALL 
LETTER A;HYPHEN-MINUS;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;COMMA;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;RIGHT SINGLE QUOTATION MARK;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;PRIME;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;LATIN SMALL LETTER A', ], ) def test_find_secondary(self): d = self.r.as_dict(SECONDARY) self.assertEqual( self.get(d, 'aa'), [ 'LATIN SMALL LETTER A;SPACE;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;NO-BREAK SPACE;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;HYPHEN-MINUS;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;COMMA;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;RIGHT SINGLE QUOTATION MARK;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;PRIME;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;LATIN CAPITAL LETTER A', 'LATIN CAPITAL LETTER A;LATIN SMALL LETTER A', ], ) def test_find_primary(self): d = self.r.as_dict(PRIMARY) self.assertEqual( self.get(d, 'aa'), [ 'LATIN SMALL LETTER A;SPACE;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;NO-BREAK SPACE;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;HYPHEN-MINUS;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;COMMA;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;RIGHT SINGLE QUOTATION MARK;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;PRIME;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A;LATIN CAPITAL LETTER A', 'LATIN CAPITAL LETTER A;LATIN SMALL LETTER A', 'LATIN SMALL LETTER A WITH BREVE;LATIN CAPITAL LETTER A WITH GRAVE', 'LATIN CAPITAL LETTER A WITH DIAERESIS;LATIN SMALL LETTER A WITH MACRON', ], ) def tearDown(self): self.r.close() self.tmpdir.cleanup() class TestPrefixFind(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') self.data = ['a', 'ab', 'abc', 'abcd', 'abcde'] with self.create(self.path) as w: for k in self.data: w.add(k.encode('ascii'), k) def tearDown(self): self.tmpdir.cleanup() def test(self): with open(self.path) as r: for i, k in enumerate(self.data): d = r.as_dict(IDENTICAL, len(k)) self.assertEqual( [v.content.decode('ascii') for v in d[k]], self.data[i:], ) class TestAlias(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') def tearDown(self): self.tmpdir.cleanup() def test_alias(self): too_many_redirects = [] target_not_found = [] def observer(event): if event.name == 'too_many_redirects': too_many_redirects.append(event.data) elif event.name == 'alias_target_not_found': target_not_found.append(event.data) with self.create(self.path, observer=observer) as w: data = ['z', 'b', 'q', 'a', 'u', 'g', 'p', 'n'] for k in data: v = ';'.join(unicodedata.name(c) for c in k) w.add(v.encode('ascii'), k) w.add_alias('w', 'u') w.add_alias('y1', 'y2') w.add_alias('y2', 'y3') w.add_alias('y3', 'z') w.add_alias('ZZZ', 'YYY') w.add_alias('l3', 'l1') w.add_alias('l1', 'l2') w.add_alias('l2', 'l3') w.add_alias('a1', ('a', 'a-frag1')) w.add_alias('a2', 'a1') w.add_alias('a3', ('a2', 'a-frag2')) w.add_alias('g1', 'g') w.add_alias('g2', ('g1', 'g-frag1')) self.assertEqual(too_many_redirects, ['l1', 'l2', 'l3']) self.assertEqual(target_not_found, ['l2', 'l3', 'l1', 'YYY']) with open(self.path) as r: d = r.as_dict() def get(key): return [ item.content.decode('ascii') for item in d[key] ] self.assertEqual(get('w'), ['LATIN SMALL LETTER U']) self.assertEqual(get('y1'), ['LATIN SMALL LETTER Z']) self.assertEqual(get('y2'), ['LATIN SMALL LETTER Z']) 
self.assertEqual(get('y3'), ['LATIN SMALL LETTER Z']) self.assertEqual(get('ZZZ'), []) self.assertEqual(get('l1'), []) self.assertEqual(get('l2'), []) self.assertEqual(get('l3'), []) item_a1 = next(d['a1']) self.assertEqual(item_a1.content, b'LATIN SMALL LETTER A') self.assertEqual(item_a1.fragment, 'a-frag1') item_a2 = next(d['a2']) self.assertEqual(item_a2.content, b'LATIN SMALL LETTER A') self.assertEqual(item_a2.fragment, 'a-frag1') item_a3 = next(d['a3']) self.assertEqual(item_a3.content, b'LATIN SMALL LETTER A') self.assertEqual(item_a3.fragment, 'a-frag1') item_g1 = next(d['g1']) self.assertEqual(item_g1.content, b'LATIN SMALL LETTER G') self.assertEqual(item_g1.fragment, '') item_g2 = next(d['g2']) self.assertEqual(item_g2.content, b'LATIN SMALL LETTER G') self.assertEqual(item_g2.fragment, 'g-frag1') class TestBlobId(BaseTest): def test(self): max_i = 2**32 - 1 max_j = 2**16 - 1 i_values = [0, max_i] + [ random.randint(1, max_i - 1) for _ in range(100) ] j_values = [0, max_j] + [ random.randint(1, max_j - 1) for _ in range(100) ] for i in i_values: for j in j_values: self.assertEqual(unmeld_ints(meld_ints(i, j)), (i, j)) class TestMultiFileReader(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') def tearDown(self): self.tmpdir.cleanup() def test_read_all(self): fnames = [] for name in 'abcdef': path = os.path.join(self.tmpdir.name, name) fnames.append(path) with fopen(path, 'wb') as f: f.write(name.encode(UTF8)) with MultiFileReader(fnames) as m: self.assertEqual(m.read().decode(UTF8), 'abcdef') def test_seek_and_read(self): def mkfile(basename, content): part = os.path.join(self.tmpdir.name, basename) with fopen(part, 'wb') as f: f.write(content) return part content = b'abc\nd\nefgh\nij' part1 = mkfile('1', content[:4]) part2 = mkfile('2', content[4:5]) part3 = mkfile('3', content[5:]) with MultiFileReader(part1, part2, part3) as m: self.assertEqual(m.size, len(content)) m.seek(2) self.assertEqual(m.read(2), content[2:4]) m.seek(1) self.assertEqual(m.read(len(content) - 2), content[1:-1]) m.seek(-1, whence=io.SEEK_END) self.assertEqual(m.read(10), content[-1:]) m.seek(4) m.seek(-2, whence=io.SEEK_CUR) self.assertEqual(m.read(3), content[2:5]) class TestFormatErrors(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') def tearDown(self): self.tmpdir.cleanup() def test_wrong_file_type(self): name = os.path.join(self.tmpdir.name, '1') with fopen(name, 'wb') as f: f.write(b'123') self.assertRaises(UnknownFileFormat, open, name) def test_truncated_file(self): name = os.path.join(self.tmpdir.name, '1') with self.create(name) as f: f.add(b'123', 'a') f.add(b'234', 'b',) with fopen(name, 'rb') as f: all_bytes = f.read() with fopen(name, 'wb') as f: f.write(all_bytes[:-1]) self.assertRaises(IncorrectFileSize, open, name) with fopen(name, 'wb') as f: f.write(all_bytes) f.write(b'\n') self.assertRaises(IncorrectFileSize, open, name) class TestFindParts(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') def tearDown(self): self.tmpdir.cleanup() def test_find_parts(self): names = [ os.path.join(self.tmpdir.name, name) for name in ('abc-1', 'abc-2', 'abc-3') ] for name in names: with fopen(name, 'wb'): pass parts = find_parts(os.path.join(self.tmpdir.name, 'abc')) self.assertEqual(names, parts) class TestTooLongText(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') def tearDown(self): 
self.tmpdir.cleanup() def test_too_long(self): rejected_keys = [] rejected_aliases = [] rejected_alias_targets = [] rejected_tags = [] rejected_content_types = [] def observer(event): if event.name == 'key_too_long': rejected_keys.append(event.data) elif event.name == 'alias_too_long': rejected_aliases.append(event.data) elif event.name == 'alias_target_too_long': rejected_alias_targets.append(event.data) elif event.name == 'tag_name_too_long': rejected_tags.append(event.data) elif event.name == 'content_type_too_long': rejected_content_types.append(event.data) long_tag_name = 't' * (MAX_TINY_TEXT_LEN + 1) long_tag_value = 'v' * (MAX_TINY_TEXT_LEN + 1) long_content_type = 'T' * (MAX_TEXT_LEN + 1) long_key = 'c' * (MAX_TEXT_LEN + 1) long_frag = 'd' * (MAX_TINY_TEXT_LEN + 1) key_with_long_frag = ('d', long_frag) tag_with_long_name = (long_tag_name, 't3 value') tag_with_long_value = ('t1', long_tag_value) long_alias = 'f' * (MAX_TEXT_LEN + 1) alias_with_long_frag = ('i', long_frag) long_alias_target = long_key long_alias_target_frag = key_with_long_frag with self.create(self.path, observer=observer) as w: w.tag(*tag_with_long_value) w.tag('t2', 't2 value') w.tag(*tag_with_long_name) data = ['a', 'b', long_key, key_with_long_frag] for k in data: if isinstance(k, str): v = k.encode('ascii') else: v = '#'.join(k).encode('ascii') w.add(v, k) w.add_alias('e', 'a') w.add_alias(long_alias, 'a') w.add_alias(alias_with_long_frag, 'a') w.add_alias('g', long_alias_target) w.add_alias('h', long_alias_target_frag) w.add(b'Hello', 'hello', content_type=long_content_type) self.assertEqual( rejected_keys, [long_key, key_with_long_frag], ) self.assertEqual( rejected_aliases, [long_alias, alias_with_long_frag], ) self.assertEqual( rejected_alias_targets, [long_alias_target, long_alias_target_frag], ) self.assertEqual( rejected_tags, [tag_with_long_name], ) self.assertEqual( rejected_content_types, [long_content_type], ) with open(self.path) as r: self.assertEqual(r.tags['t2'], 't2 value') self.assertFalse(tag_with_long_name[0] in r.tags) self.assertTrue(tag_with_long_value[0] in r.tags) self.assertEqual(r.tags[tag_with_long_value[0]], '') d = r.as_dict() self.assertTrue('a' in d) self.assertTrue('b' in d) self.assertFalse(long_key in d) self.assertFalse(key_with_long_frag[0] in d) self.assertTrue('e' in d) self.assertFalse(long_alias in d) self.assertFalse('g' in d) self.assertRaises( ValueError, set_tag_value, self.path, 't1', 'ы' * 128, ) class TestEditTag(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') with self.create(self.path) as w: w.tag('a', '123456') w.tag('b', '654321') def tearDown(self): self.tmpdir.cleanup() def test_edit_existing_tag(self): with open(self.path) as f: self.assertEqual(f.tags['a'], '123456') self.assertEqual(f.tags['b'], '654321') set_tag_value(self.path, 'b', 'efg') set_tag_value(self.path, 'a', 'xyz') with open(self.path) as f: self.assertEqual(f.tags['a'], 'xyz') self.assertEqual(f.tags['b'], 'efg') def test_edit_nonexisting_tag(self): self.assertRaises(TagNotFound, set_tag_value, self.path, 'z', 'abc') class TestBinItemNumberLimit(BaseTest): def setUp(self): self.tmpdir = tempfile.TemporaryDirectory(prefix='test') self.path = os.path.join(self.tmpdir.name, 'test.slob') def tearDown(self): self.tmpdir.cleanup() def test_writing_more_then_max_number_of_bin_items(self): with self.create(self.path) as w: for _ in range(MAX_BIN_ITEM_COUNT + 2): w.add(b'a', 'a') self.assertEqual(w.bin_count, 
2) if __name__ == '__main__': unittest.main() pyglossary-4.5.0/pyglossary/plugin_manager.py000066400000000000000000000167121417733132500215270ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . import logging import sys import os from .plugin_prop import PluginProp from .glossary_utils import ( splitFilenameExt, ) from . import core from .core import ( pluginsDir, ) log = logging.getLogger("pyglossary") class PluginManager(object): plugins = {} # type: Dict[str, PluginProp] pluginByExt = {} # type: Dict[str, PluginProp] loadedModules = set() formatsReadOptions = {} # type: Dict[str, OrderedDict[str, Any]] formatsWriteOptions = {} # type: Dict[str, OrderedDict[str, Any]] # for example formatsReadOptions[format][optName] gives you the default value readFormats = [] # type: List[str] writeFormats = [] # type: List[str] @classmethod def loadPluginsFromJson(cls: "ClassVar", jsonPath: str) -> None: import json from os.path import dirname, join with open(jsonPath) as _file: data = json.load(_file) for attrs in data: moduleName = attrs["module"] cls.loadPluginByDict( attrs=attrs, modulePath=join(pluginsDir, moduleName), ) cls.loadedModules.add(moduleName) @classmethod def loadPlugins( cls: "ClassVar", directory: str, skipDisabled: bool = True, ) -> None: """ executed on startup. 
as name implies, loads plugins from directory it skips importing plugin modules that are already loaded """ import pkgutil from os.path import isdir # log.debug(f"Loading plugins from directory: {directory!r}") if not isdir(directory): log.critical(f"Invalid plugin directory: {directory!r}") return moduleNames = [ moduleName for _, moduleName, _ in pkgutil.iter_modules([directory]) if moduleName not in cls.loadedModules and moduleName not in ("paths", "formats_common") ] moduleNames.sort() sys.path.append(directory) for moduleName in moduleNames: cls.loadPlugin(moduleName, skipDisabled=skipDisabled) sys.path.pop() @classmethod def loadPluginByDict( cls: "ClassVar", attrs: "Dict[str, Any]", modulePath: str, ) -> None: format = attrs["name"] extensions = attrs["extensions"] prop = PluginProp.fromDict( attrs=attrs, modulePath=modulePath, ) cls.plugins[format] = prop cls.loadedModules.add(attrs["module"]) if not prop.enable: return for ext in extensions: if ext.lower() != ext: log.error(f"non-lowercase extension={ext!r} in {moduleName} plugin") cls.pluginByExt[ext.lstrip(".")] = prop cls.pluginByExt[ext] = prop if attrs["canRead"]: cls.formatsReadOptions[format] = attrs["readOptions"] cls.readFormats.append(format) if attrs["canWrite"]: cls.formatsWriteOptions[format] = attrs["writeOptions"] cls.writeFormats.append(format) if log.level <= core.TRACE: prop.module # to make sure imporing works @classmethod def loadPlugin( cls: "ClassVar", moduleName: str, skipDisabled: bool = True, ) -> None: log.debug(f"importing {moduleName} in loadPlugin") try: module = __import__(moduleName) except ModuleNotFoundError as e: log.warning(f"Module {e.name!r} not found, skipping plugin {moduleName!r}") return except Exception as e: log.exception(f"Error while importing plugin {moduleName}") return enable = getattr(module, "enable", False) if skipDisabled and not enable: # log.debug(f"Plugin disabled or not a module: {moduleName}") return name = module.format prop = PluginProp.fromModule(module) cls.plugins[name] = prop cls.loadedModules.add(moduleName) if not enable: return for ext in prop.extensions: if ext.lower() != ext: log.error(f"non-lowercase extension={ext!r} in {moduleName} plugin") cls.pluginByExt[ext.lstrip(".")] = prop cls.pluginByExt[ext] = prop if prop.canRead: options = prop.getReadOptions() cls.formatsReadOptions[name] = options cls.readFormats.append(name) if prop.canWrite: options = prop.getWriteOptions() cls.formatsWriteOptions[name] = options cls.writeFormats.append(name) @classmethod def findPlugin(cls, query: str) -> "Optional[PluginProp]": """ find plugin by name or extention """ plugin = cls.plugins.get(query) if plugin: return plugin plugin = cls.pluginByExt.get(query) if plugin: return plugin return None @classmethod def detectInputFormat( cls, filename: str, format: str = "", quiet: bool = False, ) -> "Optional[Tuple[str, str, str]]": """ returns (filename, format, compression) or None """ def error(msg: str) -> None: if not quiet: log.critical(msg) return None filenameOrig = filename filenameNoExt, filename, ext, compression = splitFilenameExt(filename) plugin = None if format: plugin = cls.plugins.get(format) if plugin is None: return error(f"Invalid format {format!r}") else: plugin = cls.pluginByExt.get(ext) if not plugin: plugin = cls.findPlugin(filename) if not plugin: return error("Unable to detect input format!") if not plugin.canRead: return error(f"plugin {plugin.name} does not support reading") if compression in plugin.readCompressions: compression = "" filename = 
filenameOrig return filename, plugin.name, compression @classmethod def detectOutputFormat( cls, filename: str = "", format: str = "", inputFilename: str = "", quiet: bool = False, addExt: bool = False, ) -> "Optional[Tuple[str, str, str]]": """ returns (filename, format, compression) or None """ from os.path import splitext def error(msg: str) -> None: if not quiet: log.critical(msg) return None plugin = None if format: plugin = cls.plugins.get(format) if not plugin: return error(f"Invalid format {format}") if not plugin.canWrite: return error(f"plugin {plugin.name} does not support writing") if not filename: if not inputFilename: return error(f"Invalid filename {filename!r}") if not plugin: return error("No filename nor format is given for output file") filename = splitext(inputFilename)[0] + plugin.ext return filename, plugin.name, "" filenameOrig = filename filenameNoExt, filename, ext, compression = splitFilenameExt(filename) if not plugin: plugin = cls.pluginByExt.get(ext) if not plugin: plugin = cls.findPlugin(filename) if not plugin: return error("Unable to detect output format!") if not plugin.canWrite: return error(f"plugin {plugin.name} does not support writing") if compression in getattr(plugin.writerClass, "compressions", []): compression = "" filename = filenameOrig if addExt: if not filenameNoExt: if inputFilename: ext = plugin.ext filename = splitext(inputFilename)[0] + ext else: log.error("inputFilename is empty") if not ext and plugin.ext: filename += plugin.ext return filename, plugin.name, compression pyglossary-4.5.0/pyglossary/plugin_prop.py000066400000000000000000000253751417733132500211020ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
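# --- editor's sketch, not part of the original source ---
# Hypothetical usage of the detection API defined in plugin_manager.py
# above; it assumes plugins are already loaded (e.g. via loadPlugins or
# loadPluginsFromJson), and the file names are made up for illustration.
def _example_detect_formats():
	from pyglossary.plugin_manager import PluginManager
	res = PluginManager.detectInputFormat("test.slob")
	if res is not None:
		filename, formatName, compression = res
		print(f"read as {formatName}, compression={compression!r}")
	res = PluginManager.detectOutputFormat(
		filename="test.txt",
		inputFilename="test.slob",
	)
	if res is not None:
		filename, formatName, _ = res
		print(f"write as {formatName} to {filename!r}")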
from .option import Option, optionFromDict from .flags import ( YesNoAlwaysNever, DEFAULT_NO, flagsByName, ) import logging from collections import OrderedDict as odict from os.path import dirname log = logging.getLogger("pyglossary") def optionsPropFromDict(optionsPropDict): props = {} for name, propDict in optionsPropDict.items(): try: prop = optionFromDict(propDict) except Exception: log.exception(f"name={name!r}, propDict={propDict}\n") continue props[name] = prop return props def sortOnWriteFromStr(sortOnWriteStr): if sortOnWriteStr is None: return DEFAULT_NO return flagsByName[sortOnWriteStr] class PluginProp(object): __slots__ = [ "_mod", "_Reader", "_ReaderLoaded", "_Writer", "_WriterLoaded", "_moduleName", "_modulePath", "_enable", "_lname", "_name", "_description", "_extensions", "_extensionCreate", "_singleFile", "_optionsProp", "_sortOnWrite", "_sortKeyName", "_canRead", "_canWrite", "_readOptions", "_writeOptions", "_readCompressions", "_readDepends", "_writeDepends", ] @classmethod def fromDict( cls, attrs, modulePath, ) -> None: self = cls() self._mod = None self._Reader = None self._ReaderLoaded = False self._Writer = None self._WriterLoaded = False self._moduleName = attrs["module"] self._modulePath = modulePath self._enable = attrs.get("enable", True) self._lname = attrs["lname"] self._name = attrs["name"] self._description = attrs["description"] self._extensions = attrs["extensions"] self._extensionCreate = attrs.get("extensionCreate", "") self._singleFile = attrs["singleFile"] self._optionsProp = optionsPropFromDict(attrs["optionsProp"]) self._sortOnWrite = sortOnWriteFromStr(attrs.get("sortOnWrite")) self._sortKeyName = attrs.get("sortKeyName") self._canRead = attrs["canRead"] self._canWrite = attrs["canWrite"] self._readOptions = attrs.get("readOptions", []) self._writeOptions = attrs.get("writeOptions", []) self._readCompressions = attrs.get("readCompressions", []) self._readDepends = attrs.get("readDepends", {}) self._writeDepends = attrs.get("writeDepends", {}) return self @classmethod def fromModule(cls, mod): self = cls() self._mod = mod self._Reader = None self._ReaderLoaded = False self._Writer = None self._WriterLoaded = False self._moduleName = mod.__name__ self._modulePath = mod.__file__ if self._modulePath.endswith("__init__.py"): self._modulePath = self._modulePath[:-len("/__init__.py")] elif self._modulePath.endswith(".py"): self._modulePath = self._modulePath[:-3] self._lname = mod.lname self._name = mod.format self._description = mod.description self._extensions = mod.extensions self._extensionCreate = getattr(mod, "extensionCreate", "") self._singleFile = getattr(mod, "singleFile", False) self._optionsProp = getattr(mod, "optionsProp", {}) self._sortOnWrite = getattr(mod, "sortOnWrite", DEFAULT_NO) self._sortKeyName = getattr(mod, "sortKeyName", None) self._canRead = hasattr(mod, "Reader") self._canWrite = hasattr(mod, "Writer") self._readOptions = None self._writeOptions = None self._readCompressions = None self._readDepends = None self._writeDepends = None if log.isDebug(): self.checkModule() return self @property def enable(self): return self._enable @property def module(self): if self._mod is not None: return self._mod moduleName = self._moduleName log.debug(f"importing {moduleName} in DictPluginProp") try: _mod = __import__( f"pyglossary.plugins.{moduleName}", fromlist=moduleName, ) except ModuleNotFoundError as e: log.warning( f"Module {e.name!r} not found in {self._modulePath}" f", skipping plugin {moduleName!r}" ) return except Exception as e: 
log.exception(f"Error while importing plugin {moduleName}") return else: return _mod @property def lname(self) -> str: return self._lname @property def name(self) -> str: return self._name @property def description(self) -> str: return self._description @property def extensions(self) -> "Tuple[str, ...]": return self._extensions @property def ext(self) -> str: extensions = self.extensions if extensions: return extensions[0] return "" @property def extensionCreate(self) -> str: return self._extensionCreate @property def singleFile(self) -> bool: return self._singleFile @property def optionsProp(self) -> "Dict[str, Option]": return self._optionsProp @property def sortOnWrite(self) -> YesNoAlwaysNever: return self._sortOnWrite @property def sortKeyName(self) -> "Optional[Callable]": return self._sortKeyName @property def path(self) -> "pathlib.Path": from pathlib import Path return Path(self._modulePath) @property def readerClass(self) -> "Optional[Any]": if self._ReaderLoaded: return self._Reader cls = getattr(self.module, "Reader", None) self._Reader = cls self._ReaderLoaded = True if log.isDebug(): self.checkReaderClass() return cls @property def writerClass(self) -> "Optional[Any]": if self._WriterLoaded: return self._Writer cls = getattr(self.module, "Writer", None) self._Writer = cls self._WriterLoaded = True if log.isDebug(): self.checkWriterClass() return cls @property def canRead(self) -> bool: return self._canRead @property def canWrite(self) -> bool: return self._canWrite def getOptionAttrNamesFromClass(self, rwclass): nameList = [] for cls in rwclass.__bases__ + (rwclass,): for _name in cls.__dict__: if not _name.startswith("_") or _name.startswith("__"): # and _name not in ("_open",) continue nameList.append(_name) # rwclass.__dict__ does not include attributes of parent/base class # and dir(rwclass) is sorted by attribute name alphabetically # using rwclass.__bases__ solves the problem return nameList def getOptionsFromClass(self, rwclass): optionsProp = self.optionsProp options = odict() if rwclass is None: return options for attrName in self.getOptionAttrNamesFromClass(rwclass): name = attrName[1:] default = getattr(rwclass, attrName) if name not in optionsProp: if not callable(default): log.warning( f"format={self.name}, attrName={attrName}, type={type(default)}" ) continue prop = optionsProp[name] if prop.disabled: log.trace(f"skipping disabled option {name} in {self.name} plugin") continue if not prop.validate(default): log.warning( "invalid default value for option: " f"{name} = {default!r} in plugin {self.name}" ) options[name] = default return options def getReadOptions(self): if self._readOptions is None: self._readOptions = self.getOptionsFromClass(self.readerClass) return self._readOptions def getWriteOptions(self): if self._writeOptions is None: self._writeOptions = self.getOptionsFromClass(self.writerClass) return self._writeOptions def getReadExtraOptions(self): return [] def getWriteExtraOptions(self): return [] @property def readCompressions(self) -> "List[str]": if self._readCompressions is None: self._readCompressions = getattr(self.readerClass, "compressions", []) return self._readCompressions @property def readDepends(self) -> "Dict[str, str]": if self._readDepends is None: self._readDepends = getattr(self.readerClass, "depends", {}) return self._readDepends @property def writeDepends(self) -> "Dict[str, str]": if self._writeDepends is None: self._writeDepends = getattr(self.writerClass, "depends", {}) return self._writeDepends def checkModule(self): module = 
self.module if hasattr(module, "write"): log.error( f"plugin {format} has write function, " f"must migrate to Writer class" ) extensions = module.extensions if not isinstance(extensions, tuple): msg = f"{format} plugin: extensions must be tuple" if isinstance(extensions, list): extensions = tuple(extensions) log.error(msg) else: raise ValueError(msg) if not isinstance(self.readDepends, dict): log.error( f"invalid depends={self.readDepends}" f" in {self.name!r}.Reader class" ) if not isinstance(self.writeDepends, dict): log.error( f"invalid depends={self.writeDepends}" f" in {self.name!r}.Reader class" ) for name, opt in self.optionsProp.items(): if name.lower() != name: suggestName = "".join([ "_" + x.lower() if x.isupper() else x for x in name ]) log.debug( f"{self.name}: please rename option " f"{name} to {suggestName}" ) if not opt.comment: log.debug( f"{self.name}: please add comment for option {name}" ) def checkReaderClass(self) -> bool: cls = self._Reader for attr in ( "__init__", "open", "close", "__len__", "__iter__", ): if not hasattr(cls, attr): log.error( f"Invalid Reader class in {self.name!r} plugin" f", no {attr!r} method" ) self._Reader = None return False return True def checkWriterClass(self) -> bool: cls = self._Writer for attr in ( "__init__", "open", "write", "finish", ): if not hasattr(cls, attr): log.error( f"Invalid Writer class in {self.name!r} plugin" f", no {attr!r} method" ) self._Writer = None return False return True def getReadExtraOptions(self): return self.__class__.getExtraOptionsFromFunc( self.readerClass.open, self.name, ) def getWriteExtraOptions(self): return self.__class__.getExtraOptionsFromFunc( self.writerClass.write, self.name, ) @classmethod def getExtraOptionsFromFunc(cls, func, format): import inspect extraOptNames = [] for name, param in inspect.signature(func).parameters.items(): if name == "self": continue if str(param.default) != "": extraOptNames.append(name) continue if name not in ("filename", "dirname"): extraOptNames.append(name) if extraOptNames: log.warning(f"{format}: extraOptNames = {extraOptNames}") return extraOptNames pyglossary-4.5.0/pyglossary/plugins/000077500000000000000000000000001417733132500176375ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/aard2_slob.py000066400000000000000000000210311417733132500222160ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * import shutil enable = True lname = "aard2_slob" format = 'Aard2Slob' description = 'Aard 2 (.slob)' extensions = ('.slob',) extensionCreate = ".slob" singleFile = True kind = "binary" wiki = "https://github.com/itkach/slob/wiki" website = ( "http://aarddict.org/", "aarddict.org", ) optionsProp = { "compression": StrOption( values=["", "bz2", "zlib", "lzma2"], comment="Compression Algorithm", ), "content_type": StrOption( customValue=True, values=[ "text/plain; charset=utf-8", "text/html; charset=utf-8", ], comment="Content Type", ), # "encoding": EncodingOption(), "file_size_approx": FileSizeOption( comment="split up by given approximate file size\nexamples: 100m, 1g", ), "separate_alternates": BoolOption( comment="add alternate headwords as separate entries to slob", ), "word_title": BoolOption( comment="add headwords title to begining of definition", ), } extraDocs = [ ( "PyICU", "See [doc/pyicu.md](./doc/pyicu.md) file for more detailed" " instructions on how to install PyICU.", ), ] file_size_check_every = 100 class Reader(object): depends = { "icu": "PyICU", # >=1.5 } def __init__(self, glos): 
self._glos = glos self._clear() self._re_bword = re.compile( '(<a href=[^<>
    ]+?>)', re.I, ) try: import icu except ModuleNotFoundError as e: e.msg += f", run `{pip} install PyICU` to install" raise e def close(self): if self._slobObj is not None: self._slobObj.close() self._clear() def _clear(self): self._filename = "" self._slobObj = None # slobObj is instance of slob.Slob class def open(self, filename): from pyglossary.plugin_lib import slob self._filename = filename self._slobObj = slob.open(filename) tags = dict(self._slobObj.tags.items()) try: name = tags.pop("label") except KeyError: pass else: self._glos.setInfo("name", name) try: creationTime = tags.pop("created.at") except KeyError: pass else: self._glos.setInfo("creationTime", creationTime) try: createdBy = tags.pop("created.by") except KeyError: pass else: self._glos.setInfo("author", createdBy) copyrightLines = [] for key in ("copyright", "license.name", "license.url"): try: value = tags.pop(key) except KeyError: continue copyrightLines.append(value) if copyrightLines: self._glos.setInfo("copyright", "\n".join(copyrightLines)) try: uri = tags.pop("uri") except KeyError: pass else: self._glos.setInfo("website", uri) try: edition = tags.pop("edition") except KeyError: pass else: self._glos.setInfo("edition", edition) for key, value in tags.items(): self._glos.setInfo(f"slob.{key}", value) def __len__(self): if self._slobObj is None: log.error("called len() on a reader which is not open") return 0 return len(self._slobObj) def _href_sub(self, m: "re.Match") -> str: st = m.group(0) if "//" in st: return st st = st.replace('href="', 'href="bword://') st = st.replace("href='", "href='bword://") return st def __iter__(self): from pyglossary.plugin_lib.slob import MIME_HTML, MIME_TEXT if self._slobObj is None: raise RuntimeError("iterating over a reader while it's not open") slobObj = self._slobObj blobSet = set() # slob library gives duplicate blobs when iterating over slobObj # even keeping the last id is not enough, since duplicate blobs # are not all consecutive. 
so we have to keep a set of blob IDs for blob in slobObj: _id = blob.id if _id in blobSet: yield None # update progressbar continue blobSet.add(_id) # blob.key is str, blob.content is bytes word = blob.key ctype = blob.content_type.split(";")[0] if ctype not in (MIME_HTML, MIME_TEXT): log.debug(f"{word!r}: content_type={blob.content_type}") if word.startswith("~/"): word = word[2:] yield self._glos.newDataEntry(word, blob.content) continue defiFormat = "" if ctype == MIME_HTML: defiFormat = "h" elif ctype == MIME_TEXT: defiFormat = "m" defi = blob.content.decode("utf-8") defi = self._re_bword.sub(self._href_sub, defi) yield self._glos.newEntry(word, defi, defiFormat=defiFormat) class Writer(object): depends = { "icu": "PyICU", } _compression: str = "zlib" _content_type: str = "" _file_size_approx: int = 0 _separate_alternates: bool = False _word_title: bool = False resourceMimeTypes = { "png": "image/png", "jpeg": "image/jpeg", "jpg": "image/jpeg", "gif": "image/gif", "svg": "image/svg+xml", "webp": "image/webp", "tiff": "image/tiff", "tif": "image/tiff", "css": "text/css", "js": "application/javascript", "json": "application/json", "woff": "application/font-woff", "ttf": "application/x-font-ttf", "otf": "application/x-font-opentype", "mp3": "audio/mpeg", "ini": "text/plain", # "application/octet-stream+xapian", "eot": "application/vnd.ms-fontobject", "pdf": "application/pdf", } def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None self._resPrefix = "" self._slobWriter = None def _slobObserver(self, event: "slob.WriterEvent"): log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}") def _open(self, filename: str, namePostfix: str) -> None: import icu from pyglossary.plugin_lib import slob if isfile(filename): shutil.move(filename, f"{filename}.bak") log.warning(f"renamed existing {filename!r} to {filename+'.bak'!r}") kwargs = {} kwargs["compression"] = self._compression self._slobWriter = slobWriter = slob.Writer( filename, observer=self._slobObserver, workdir=cacheDir, **kwargs ) slobWriter.tag("label", self._glos.getInfo("name") + namePostfix) def open(self, filename: str) -> None: try: import icu except ModuleNotFoundError as e: e.msg += f", run `{pip} install PyICU` to install" raise e if isfile(filename): raise IOError(f"File '{filename}' already exists") namePostfix = "" if self._file_size_approx > 0: namePostfix = " (part 1)" self._open(filename, namePostfix) self._filename = filename def finish(self): self._filename = None if self._slobWriter is not None: self._slobWriter.finalize() self._slobWriter = None def addDataEntry(self, entry: "DataEntry") -> None: slobWriter = self._slobWriter rel_path = entry.s_word _, ext = splitext(rel_path) ext = ext.lstrip(os.path.extsep).lower() content_type = self.resourceMimeTypes.get(ext) if not content_type: log.error(f'unknown content type for {rel_path!r}') return content = entry.data key = self._resPrefix + rel_path try: key.encode(slobWriter.encoding) except UnicodeEncodeError: log.error('Failed to add, broken unicode in key: {!a}'.format(key)) return slobWriter.add(content, key, content_type=content_type) def addEntry(self, entry: "Entry") -> None: words = entry.l_word b_defi = entry.defi.encode("utf-8") _ctype = self._content_type writer = self._slobWriter entry.detectDefiFormat() defiFormat = entry.defiFormat if self._word_title and defiFormat in ("h", "m"): if defiFormat == "m": defiFormat = "h" title = self._glos.wordTitleStr( words[0], ) b_defi = title.encode("utf-8") + b_defi if 
defiFormat == "h": b_defi = b_defi.replace(b'"bword://', b'"') b_defi = b_defi.replace(b"'bword://", b"'") if not _ctype: if defiFormat == "h": _ctype = "text/html; charset=utf-8" elif defiFormat == "m": _ctype = "text/plain; charset=utf-8" else: _ctype = "text/plain; charset=utf-8" if not self._separate_alternates: writer.add( b_defi, *tuple(words), content_type=_ctype, ) return headword, *alts = words writer.add( b_defi, headword, content_type=_ctype, ) for alt in alts: writer.add( b_defi, f"{alt}, {headword}", content_type=_ctype, ) def write(self) -> "Generator[None, BaseEntry, None]": file_size_approx = int(self._file_size_approx * 0.95) entryCount = 0 sumBlobSize = 0 fileIndex = 0 filenameNoExt, _ = splitext(self._filename) while True: entry = yield if entry is None: break if entry.isData(): self.addDataEntry(entry) else: self.addEntry(entry) if file_size_approx > 0: entryCount += 1 if entryCount % file_size_check_every == 0: sumBlobSize = self._slobWriter.size_data() if sumBlobSize >= file_size_approx: self._slobWriter.finalize() fileIndex += 1 self._open(f"{filenameNoExt}.{fileIndex}.slob", f" (part {fileIndex+1})") sumBlobSize = 0 entryCount = 0 pyglossary-4.5.0/pyglossary/plugins/abc_medical_notes.py000066400000000000000000000026451417733132500236330ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * import html enable = True lname = "abc_medical_notes" format = 'ABCMedicalNotes' description = 'ABC Medical Notes (SQLite3)' extensions = () extensionCreate = ".db" kind = "binary" wiki = "" website = ( "https://play.google.com/store/apps/details?id=" + "com.pocketmednotes2014.secondapp", "ABC Medical Notes 2021 - Google Play", ) class Reader(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() self._glos.setDefaultDefiFormat("h") def __len__(self): self._cur.execute("select count(*) from NEW_TABLE") return self._cur.fetchone()[0] def __iter__(self): self._cur.execute( "select _id, contents from NEW_TABLE where _id is not null" ) # FIXME: iteration over self._cur stops after one entry # and self._cur.fetchone() returns None # for row in self._cur: for row in self._cur.fetchall(): word = html.unescape(row[0]) definition = row[1].decode("utf-8", errors="ignore") # print(f"{word!r}, {definition!r}") yield self._glos.newEntry(word, definition, defiFormat="h") def close(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() pyglossary-4.5.0/pyglossary/plugins/almaany.py000066400000000000000000000040521417733132500216340ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * import html enable = True lname = "almaany" format = 'Almaany' description = 'Almaany.com (SQLite3)' extensions = () extensionCreate = ".db" kind = "binary" wiki = "" website = ( "https://play.google.com/store/apps/details?id=com.almaany.arar", "Almaany.com Arabic Dictionary - Google Play", ) class Reader(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() self._glos.setDefaultDefiFormat("h") def __len__(self): self._cur.execute("select count(*) from 
WordsTable") return self._cur.fetchone()[0] def __iter__(self): from pyglossary.langs.writing_system import getWritingSystemFromText alternateDict = {} self._cur.execute("select wordkey, searchwordkey from Keys") for row in self._cur.fetchall(): if row[0] in alternateDict: alternateDict[row[0]].append(row[1]) else: alternateDict[row[0]] = [row[1]] self._cur.execute( "select word, searchword, root, meaning from WordsTable" " order by id" ) # FIXME: iteration over self._cur stops after one entry # and self._cur.fetchone() returns None # for row in self._cur: for row in self._cur.fetchall(): word = row[0] searchword = row[1] root = row[2] meaning = row[3] definition = meaning definition = definition.replace("|", "
    ") if root: definition += f'
    Root:
    {root}' ws = getWritingSystemFromText(meaning) if ws and ws.direction == "rtl": definition = f'
    {definition}
    ' words = [word, searchword] if word in alternateDict: words += alternateDict[word] yield self._glos.newEntry( words, definition, defiFormat="h", ) def close(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() pyglossary-4.5.0/pyglossary/plugins/appledict/000077500000000000000000000000001417733132500216045ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/__init__.py000066400000000000000000000234031417733132500237170ustar00rootroot00000000000000# -*- coding: utf-8 -*- # appledict/__init__.py # Output to Apple Dictionary xml sources for Dictionary Development Kit. # # Copyright © 2016-2021 Saeed Rasooli (ilius) # Copyright © 2016 Ratijas # Copyright © 2012-2015 Xiaoqiang Wang # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. import sys import os from os.path import abspath, basename import re import pkgutil import shutil from pyglossary.plugins.formats_common import * from ._dict import * from ._content import prepare_content sys.setrecursionlimit(10000) enable = True lname = "appledict" format = "AppleDict" description = "AppleDict Source" extensions = (".apple",) extensionCreate = ".apple/" kind = "directory" wiki = "" website = ( "https://support.apple.com/en-gu/guide/dictionary/welcome/mac", "Dictionary User Guide for Mac", ) # FIXME: rename indexes arg/option to indexes_lang? 
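# --- editor's sketch, not part of the original source ---
# A hypothetical illustration of how the module-level metadata above is
# consumed: PluginProp.fromModule (plugin_prop.py) reads lname/format/
# description/extensions and detects the Reader/Writer classes.
def _example_plugin_prop():
	import pyglossary.plugins.appledict as mod
	from pyglossary.plugin_prop import PluginProp
	prop = PluginProp.fromModule(mod)
	assert prop.name == "AppleDict"
	assert prop.ext == ".apple"
	assert prop.canWrite and not prop.canRead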
optionsProp = { "clean_html": BoolOption(comment="use BeautifulSoup parser"), "css": StrOption( comment="custom .css file path", ), "xsl": StrOption( comment="custom XSL transformations file path", ), "default_prefs": DictOption( comment="default prefs in python dict format", # example: {"key": "value", "version": "1"} ), "prefs_html": StrOption( comment="preferences XHTML file path", ), "front_back_matter": StrOption( comment="XML file path with top-level tag", ), "jing": BoolOption(comment="run Jing check on generated XML"), "indexes": StrOption( customValue=False, values=["", "ru", "zh"], comment="Additional indexes to dictionary entries", ), } extraDocs = [ ( "Also see:", "See [doc/apple.md](./doc/apple.md) for additional AppleDict instructions.", ), ] BeautifulSoup = None def loadBeautifulSoup(): global BeautifulSoup try: import bs4 as BeautifulSoup except ImportError: try: import BeautifulSoup except ImportError: return if int(BeautifulSoup.__version__.split(".")[0]) < 4: raise ImportError( f"BeautifulSoup is too old, required at least version 4, " f"{BeautifulSoup.__version__!r} found.\n" f"Please run `{pip} install lxml beautifulsoup4 html5lib`" ) def abspath_or_None(path): return os.path.abspath(os.path.expanduser(path)) if path else None def write_header( glos: "GlossaryType", toFile: "TextIO", front_back_matter: "Optional[str]", ) -> None: # write header toFile.write( '\n' '\n' ) if front_back_matter: with open( front_back_matter, mode="r", encoding="utf-8", ) as front_back_matter: toFile.write(front_back_matter.read()) def format_default_prefs(default_prefs): """ :type default_prefs: dict or None as by 14th of Jan 2016, it is highly recommended that prefs should contain {"version": "1"}, otherwise Dictionary.app does not keep user changes between restarts. """ if not default_prefs: return "" if not isinstance(default_prefs, dict): raise TypeError(f"default_prefs not a dictionary: {default_prefs!r}") if str(default_prefs.get("version", None)) != "1": log.error( "default prefs does not contain {'version': '1'}. prefs " "will not be persistent between Dictionary.app restarts." ) return "\n".join( f"\t\t{key}\n\t\t{value}" for key, value in sorted(default_prefs.items()) ).strip() def write_css(fname, css_file): with open(fname, mode="wb") as toFile: if css_file: with open(css_file, mode="rb") as fromFile: toFile.write(fromFile.read()) else: toFile.write(pkgutil.get_data( __name__, "templates/Dictionary.css", )) """ write glossary to Apple dictionary .xml and supporting files. :param dirname: directory path, must not have extension :param clean_html: pass True to use BeautifulSoup parser. :param css: path to custom .css file :param xsl: path to custom XSL transformations file. :param default_prefs: Default prefs in python dictionary literal format, i.e. {"key1": "value1", "key2": "value2", ...}. All keys and values must be quoted strings; not allowed characters (e.g. single/double quotes,equal sign "=", semicolon) must be escaped as hex code according to python string literal rules. :param prefs_html: path to XHTML file with user interface for dictionary's preferences. refer to Apple's documentation for details. :param front_back_matter: path to XML file with top-level tag your front/back matter entry content :param jing: pass True to run Jing check on generated XML. # FIXME: rename to indexes_lang? :param indexes: Dictionary.app is dummy and by default it don't know how to perform flexible search. we can help it by manually providing additional indexes to dictionary entries. 
""" class Writer(object): depends = { "lxml": "lxml", "bs4": "beautifulsoup4", "html5lib": "html5lib", } _clean_html: bool = True _css: str = "" _xsl: str = "" _default_prefs: "Optional[Dict]" = None _prefs_html: str = "" _front_back_matter: str = "" _jing: bool = False _indexes: str = "" # FIXME: rename to indexes_lang? def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._dirname = None def finish(self): self._dirname = None def open(self, dirname: str) -> None: self._dirname = dirname if not isdir(dirname): os.mkdir(dirname) def write(self) -> "Generator[None, BaseEntry, None]": global BeautifulSoup from pyglossary.xdxf_transform import XdxfTransformer glos = self._glos clean_html = self._clean_html css = self._css xsl = self._xsl default_prefs = self._default_prefs prefs_html = self._prefs_html front_back_matter = self._front_back_matter jing = self._jing indexes = self._indexes xdxf_to_html = XdxfTransformer(encoding="utf-8") if clean_html: if BeautifulSoup is None: loadBeautifulSoup() if BeautifulSoup is None: log.warning( "clean_html option passed but BeautifulSoup not found. " f"to fix this run " f"`{pip} install lxml beautifulsoup4 html5lib`" ) else: BeautifulSoup = None dirname = self._dirname fileNameBase = basename(dirname).replace(".", "_") filePathBase = join(dirname, fileNameBase) # before chdir (outside indir block) css = abspath_or_None(css) xsl = abspath_or_None(xsl) prefs_html = abspath_or_None(prefs_html) front_back_matter = abspath_or_None(front_back_matter) generate_id = id_generator() generate_indexes = indexes_generator(indexes) myResDir = join(dirname, "OtherResources") if not isdir(myResDir): os.mkdir(myResDir) with open(filePathBase + ".xml", mode="w", encoding="utf-8") as toFile: write_header(glos, toFile, front_back_matter) while True: entry = yield if entry is None: break if entry.isData(): entry.save(myResDir) continue words = entry.l_word word, alts = words[0], words[1:] defi = entry.defi long_title = _normalize.title_long( _normalize.title(word, BeautifulSoup) ) if not long_title: continue _id = next(generate_id) if BeautifulSoup: title_attr = BeautifulSoup.dammit.EntitySubstitution\ .substitute_xml(long_title, True) else: title_attr = str(long_title) content_title = long_title if entry.defiFormat == "x": defi = xdxf_to_html.transformByInnerString(defi) content_title = None content = prepare_content(content_title, defi, BeautifulSoup) toFile.write( f'\n' + generate_indexes(long_title, alts, content, BeautifulSoup) + content + "\n\n" ) toFile.write("\n") if xsl: shutil.copy(xsl, myResDir) if prefs_html: shutil.copy(prefs_html, myResDir) write_css(filePathBase + ".css", css) with open(join(dirname, "Makefile"), mode="w", encoding="utf-8") as toFile: toFile.write( toStr(pkgutil.get_data( __name__, "templates/Makefile", )).format(dict_name=fileNameBase) ) copyright = glos.getInfo("copyright") if BeautifulSoup: # strip html tags copyright = str(BeautifulSoup.BeautifulSoup( copyright, features="lxml" ).text) # if DCSDictionaryXSL provided but DCSDictionaryDefaultPrefs not # present in Info.plist, Dictionary.app will crash. 
with open(filePathBase + ".plist", mode="w", encoding="utf-8") as toFile: frontMatterReferenceID = ( "DCSDictionaryFrontMatterReferenceID\n" "\tfront_back_matter" if front_back_matter else "" ) toFile.write( toStr(pkgutil.get_data( __name__, "templates/Info.plist", )).format( # identifier must be unique CFBundleIdentifier=fileNameBase.replace(" ", ""), CFBundleDisplayName=glos.getInfo("name"), CFBundleName=fileNameBase, DCSDictionaryCopyright=copyright, DCSDictionaryManufacturerName=glos.author, DCSDictionaryXSL=basename(xsl) if xsl else "", DCSDictionaryDefaultPrefs=format_default_prefs(default_prefs), DCSDictionaryPrefsHTML=basename(prefs_html) if prefs_html else "", DCSDictionaryFrontMatterReferenceID=frontMatterReferenceID, ) ) if jing: from .jing import run as jing_run jing_run(filePathBase + ".xml") pyglossary-4.5.0/pyglossary/plugins/appledict/_content.py000066400000000000000000000165731417733132500240030ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016-2019 Saeed Rasooli (ilius) # Copyright © 2016 Ratijas # Copyright © 2012-2015 Xiaoqiang Wang # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # FIXME: # MDX-specific parts should be isolated and moved to MDX Reader # and parts that are specific to one glossary # (like Oxford_Advanced_English-Chinese_Dictionary_9th_Edition.mdx) # should be moved to separate modules (like content processors) and enabled # per-glossary (by title or something else) import re import logging from xml.sax.saxutils import unescape, quoteattr from pyglossary.text_utils import toStr log = logging.getLogger("pyglossary") re_brhr = re.compile("<(BR|HR)>", re.IGNORECASE) re_nonprintable = re.compile("[\x00-\x07\x0e-\x1f]") re_img = re.compile("", re.IGNORECASE) re_div_margin_em = re.compile(r'
<div style="margin-left:(\d)em">') sub_div_margin_em = r'<div class="m\1">
' re_div_margin_em_ex = re.compile( r'<div class="ex" style="margin-left:(\d)em;color:steelblue">
', ) sub_div_margin_em_ex = r'<div class="m\1 ex">
    ' re_href = re.compile(r"""href=(["'])(.*?)\1""") re_margin = re.compile(r"margin-left:(\d)em") def prepare_content( title: "Optional[str]", body: str, BeautifulSoup: "Any", ) -> str: # heavily integrated with output of dsl reader plugin! # and with xdxf also. """ :param title: str | None """ # class="sec" => d:priority="2" # style="color:steelblue" => class="ex" # class="p" style="color:green" => class="p" # style="color:green" => class="c" # style="margin-left:{}em" => class="m{}" # => # xhtml is strict if BeautifulSoup: content = prepare_content_with_soup(title, body, BeautifulSoup) else: content = prepare_content_without_soup(title, body) content = content.replace(" ", " ") content = re_nonprintable.sub("", content) return content def prepare_content_without_soup( title: "Optional[str]", body: str, ) -> str: # somewhat analogue to what BeautifulSoup suppose to do body = re_div_margin_em.sub(sub_div_margin_em, body) body = re_div_margin_em_ex.sub(sub_div_margin_em_ex, body) body = re_href.sub(href_sub, body) body = body \ .replace( '', '', ) \ .replace( '', '', ) \ .replace( '', '', ) \ .replace( '', '', ) \ .replace("", '').replace("", "") \ .replace("", "").replace("", "") # nice header to display content = f"

    {title}

    {body}" if title else body content = re_brhr.sub(r"<\g<1> />", content) content = re_img.sub(r"/>", content) return content def prepare_content_with_soup( title: "Optional[str]", body: str, BeautifulSoup: "Any", ) -> str: soup = BeautifulSoup.BeautifulSoup(body, features="lxml") # difference between "lxml" and "html.parser" if soup.body: soup = soup.body for tag in soup(class_="sec"): tag["class"].remove("sec") if not tag["class"]: del tag["class"] tag["d:priority"] = "2" for tag in soup(lambda x: "color:steelblue" in x.get("style", "")): remove_style(tag, "color:steelblue") if "ex" not in tag.get("class", []): tag["class"] = tag.get("class", []) + ["ex"] for tag in soup(is_green): remove_style(tag, "color:green") if "p" not in tag.get("class", ""): tag["class"] = tag.get("class", []) + ["c"] for tag in soup(True): if "style" in tag.attrs: m = re_margin.search(tag["style"]) if m: remove_style(tag, m.group(0)) tag["class"] = tag.get("class", []) + ["m" + m.group(1)] for tag in soup(lambda x: "xhtml:" in x.name): old_tag_name = tag.name tag.name = old_tag_name[len("xhtml:"):] if tag.string: tag.string = f"{tag.string} " for tag in soup.select("[href]"): href = tag["href"] href = cleanup_link_target(href) if href.startswith("sound:"): fix_sound_link(href, tag) elif href.startswith("phonetics") or href.startswith("help:phonetics"): # for oxford9 log.debug(f"phonetics: tag={tag}") if tag.audio and "name" in tag.audio.attrs: tag["onmousedown"] = f"this.lastChild.play(); return false;" src_name = tag.audio["name"].replace("#", "_") tag.audio["src"] = f"{src_name}.mp3" elif not link_is_url(href): tag["href"] = f"x-dictionary:d:{href}" for thumb in soup.find_all("div", "pic_thumb"): thumb["onclick"] = 'this.setAttribute("style", "display:none"); ' \ 'this.nextElementSibling.setAttribute("style", "display:block")' for pic in soup.find_all("div", "big_pic"): pic["onclick"] = 'this.setAttribute("style", "display:none"), ' \ 'this.previousElementSibling.setAttribute("style", "display:block")' # to unfold(expand) and fold(collapse) blocks for pos in soup.find_all("pos", onclick="toggle_infl(this)"): # TODO: simplify this! pos["onclick"] = ( r'var e = this.parentElement.parentElement.parentElement' r'.querySelector("res-g vp-gs"); style = window.' r'getComputedStyle(e), display = style.getPropertyValue' r'("display"), "none" === e.style.display || "none" === display' r' ? e.style.display = "block" : e.style.display = "none", ' r'this.className.match(/(?:^|\s)Clicked(?!\S)/) ? this.' 
def prepare_content_with_soup(
	title: "Optional[str]",
	body: str,
	BeautifulSoup: "Any",
) -> str:
	soup = BeautifulSoup.BeautifulSoup(body, features="lxml")
	# difference between "lxml" and "html.parser"
	if soup.body:
		soup = soup.body

	for tag in soup(class_="sec"):
		tag["class"].remove("sec")
		if not tag["class"]:
			del tag["class"]
		tag["d:priority"] = "2"
	for tag in soup(lambda x: "color:steelblue" in x.get("style", "")):
		remove_style(tag, "color:steelblue")
		if "ex" not in tag.get("class", []):
			tag["class"] = tag.get("class", []) + ["ex"]
	for tag in soup(is_green):
		remove_style(tag, "color:green")
		if "p" not in tag.get("class", ""):
			tag["class"] = tag.get("class", []) + ["c"]
	for tag in soup(True):
		if "style" in tag.attrs:
			m = re_margin.search(tag["style"])
			if m:
				remove_style(tag, m.group(0))
				tag["class"] = tag.get("class", []) + ["m" + m.group(1)]
	for tag in soup(lambda x: "xhtml:" in x.name):
		old_tag_name = tag.name
		tag.name = old_tag_name[len("xhtml:"):]
		if tag.string:
			tag.string = f"{tag.string} "
	for tag in soup.select("[href]"):
		href = tag["href"]
		href = cleanup_link_target(href)
		if href.startswith("sound:"):
			fix_sound_link(href, tag)
		elif href.startswith("phonetics") or href.startswith("help:phonetics"):
			# for oxford9
			log.debug(f"phonetics: tag={tag}")
			if tag.audio and "name" in tag.audio.attrs:
				tag["onmousedown"] = "this.lastChild.play(); return false;"
				src_name = tag.audio["name"].replace("#", "_")
				tag.audio["src"] = f"{src_name}.mp3"
		elif not link_is_url(href):
			tag["href"] = f"x-dictionary:d:{href}"

	for thumb in soup.find_all("div", "pic_thumb"):
		thumb["onclick"] = 'this.setAttribute("style", "display:none"); ' \
			'this.nextElementSibling.setAttribute("style", "display:block")'

	for pic in soup.find_all("div", "big_pic"):
		pic["onclick"] = 'this.setAttribute("style", "display:none"), ' \
			'this.previousElementSibling.setAttribute("style", "display:block")'

	# to unfold(expand) and fold(collapse) blocks
	for pos in soup.find_all("pos", onclick="toggle_infl(this)"):
		# TODO: simplify this!
		pos["onclick"] = (
			r'var e = this.parentElement.parentElement.parentElement'
			r'.querySelector("res-g vp-gs"); style = window.'
			r'getComputedStyle(e), display = style.getPropertyValue'
			r'("display"), "none" === e.style.display || "none" === display'
			r' ? e.style.display = "block" : e.style.display = "none", '
			r'this.className.match(/(?:^|\s)Clicked(?!\S)/) ? this.'
			r'className = this.className.replace('
			r'/(?:^|\s)Clicked(?!\S)/g, "") : this.setAttribute('
			r'"class", "Clicked")'
		)

	for tag in soup.select("[src]"):
		src = tag["src"]
		if src.startswith("/"):
			tag["src"] = src[1:]
	for tag in soup("u"):
		tag.name = "span"
		tag["class"] = tag.get("class", []) + ["u"]
	for tag in soup("s"):
		tag.name = "del"

	if title and "<h" not in body:
		h1 = BeautifulSoup.Tag(name="h1")
		h1.string = title
		soup.insert(0, h1)

	content = toStr(soup.encode_contents())
	return content


def cleanup_link_target(href: str) -> str:
	# strip the "bword://" prefix used for in-glossary links
	if href.startswith("bword://"):
		href = href[len("bword://"):]
	return href


def href_sub(x: "re.Match") -> str:
	href = x.groups()[1]
	if href.startswith("http"):
		return x.group()
	href = cleanup_link_target(href)
	return "href=" + quoteattr(
		"x-dictionary:d:" + unescape(
			href,
			{"&quot;": '"'},
		)
	)


def is_green(x: dict) -> bool:
	return "color:green" in x.get("style", "")


def remove_style(tag: dict, line: str) -> None:
	s = "".join(tag["style"].replace(line, "").split(";"))
	if s:
		tag["style"] = s
	else:
		del tag["style"]


def fix_sound_link(href: str, tag: dict):
	tag["href"] = f'javascript:new Audio("{href[len("sound://"):]}").play();'


def link_is_url(href: str) -> bool:
	for prefix in (
		"http:",
		"https:",
		"addexample:",
		"addid:",
		"addpv:",
		"help:",
		"helpg:",
		"helpp:",
		"helpr:",
		"helpxr:",
		"xi:",
		"xid:",
		"xp:",
		"sd:",
		"#",
	):
		if href.startswith(prefix):
			return True
	return False
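# --- Usage note (added; illustrative) ---
# href targets that are neither URLs nor one of the known service prefixes
# are rewritten to Apple's dictionary-lookup scheme:
#   link_is_url("http://example.com")  # -> True  (left untouched)
#   link_is_url("apple")               # -> False (becomes "x-dictionary:d:apple")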
pyglossary-4.5.0/pyglossary/plugins/appledict/_dict.py000066400000000000000000000063021417733132500232410ustar00rootroot00000000000000# -*- coding: utf-8 -*-
# appledict/_dict.py
# Output to Apple Dictionary xml sources for Dictionary Development Kit.
#
# Copyright © 2016-2019 Saeed Rasooli (ilius)
# Copyright © 2016 Ratijas
# Copyright © 2012-2015 Xiaoqiang Wang
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

import logging
import re
import string

from . import _normalize

log = logging.getLogger("pyglossary")

digs = string.digits + string.ascii_letters


def base36(x: int) -> str:
	"""
	simplified version of int2base
	http://stackoverflow.com/questions/2267362/convert-integer-to-a-string-in-a-given-numeric-base-in-python#2267446
	"""
	digits = []
	while x:
		digits.append(digs[x % 36])
		x //= 36
	digits.reverse()
	return "".join(digits)


def id_generator() -> "Iterator[str]":
	cnt = 1
	while True:
		yield "_" + str(base36(cnt))
		cnt += 1


def indexes_generator(indexes_lang: str) -> """Callable[
	[str, List[str], str, Any],
	str,
]""":
	"""
	factory that acts according to glossary language
	"""
	indexer = None
	"""Callable[[Sequence[str], str], Sequence[str]]"""
	if indexes_lang:
		from . import indexes as idxs
		indexer = idxs.languages.get(indexes_lang, None)
		if not indexer:
			keys_str = ", ".join(list(idxs.languages.keys()))
			msg = "extended indexes not supported for the" \
				f" specified language: {indexes_lang}.\n" \
				f"following languages available: {keys_str}."
			log.error(msg)
			raise ValueError(msg)

	def generate_indexes(title, alts, content, BeautifulSoup):
		indexes = [title]
		indexes.extend(alts)

		if BeautifulSoup:
			quoted_title = BeautifulSoup.dammit.EntitySubstitution\
				.substitute_xml(title, True)
		else:
			quoted_title = '"' + \
				title.replace(">", "&gt;").replace('"', "&quot;") + \
				'"'

		if indexer:
			indexes = set(indexer(indexes, content))

		normal_indexes = set()
		for idx in indexes:
			normal = _normalize.title(idx, BeautifulSoup)
			normal_indexes.add(_normalize.title_long(normal))
			normal_indexes.add(_normalize.title_short(normal))
		normal_indexes.discard(title)

		normal_indexes = [s for s in normal_indexes if s.strip()]
		# skip empty titles. everything could happen.

		s = f"<d:index d:value={quoted_title} d:title={quoted_title}/>"
		if BeautifulSoup:
			for idx in normal_indexes:
				quoted_idx = BeautifulSoup.dammit.\
					EntitySubstitution.substitute_xml(idx, True)
				s += f"<d:index d:value={quoted_idx} d:title={quoted_title}/>"
		else:
			for idx in normal_indexes:
				quoted_idx = '"' + \
					idx.replace(">", "&gt;").replace('"', "&quot;") + \
					'"'
				s += f"<d:index d:value={quoted_idx} d:title={quoted_title}/>"
		return s

	return generate_indexes
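# --- Sanity examples (added; not in the original file) ---
#   base36(1)  -> "1"
#   base36(35) -> "z"
#   base36(36) -> "10"
# hence id_generator() yields "_1", "_2", ..., "_z", "_10", ...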
""" if BeautifulSoup: title = title.replace("\xef\xbb\xbf", "") if len(title) > 1: # BeautifulSoup has a bug when markup <= 1 char length title = BeautifulSoup.BeautifulSoup( title, features="lxml", # FIXME: html or lxml? gives warning unless it's lxml ).get_text(strip=True) else: title = re_title.sub("", title) title = title.replace("&", "&") title = brackets(title) title = truncate(title, 1126) return title def title_long(s): """ title_long("str[ing]") -> "string" """ return s.replace("[", "").replace("]", "") def title_short(s): """ title_short("str[ing]") -> "str" """ return spaces(re_title_short.sub("", s)) pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/000077500000000000000000000000001417733132500232435ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/__init__.py000066400000000000000000000026511417733132500253600ustar00rootroot00000000000000# -*- coding: utf-8 -*- # appledict/indexes/__init__.py # # Copyright © 2016 Ratijas # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. """ extended indexes generation with respect to source language. """ import os import pkgutil from pyglossary.plugins.formats_common import log __all__ = ["languages", "log"] languages = {} """ Dict[str, Callable[[Sequence[str], str], Sequence[str]]] submodules must register languages by adding (language name -> function) pairs to the mapping. function must follow signature bellow: :param titles: flat iterable of title and altenrative titles :param content: cleaned entry content :return: iterable of indexes (str). use ``` from . import languages # or from appledict.indexes import languages ``` """ here = os.path.dirname(os.path.abspath(__file__)) for _, module, _ in pkgutil.iter_modules([here]): __import__(f"{__name__}.{module}") pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/ru.py000066400000000000000000000045621417733132500242520ustar00rootroot00000000000000# -*- coding: utf-8 -*- # appledict/indexes/ru.py # # Copyright © 2016 Ratijas # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. """ Russian indexes based on pymorphy. """ from . import languages from pyglossary.plugins.formats_common import log, pip try: import pymorphy2 except ImportError: log.error(f"""module pymorphy2 is required to build extended Russian indexes. You can download it here: http://pymorphy2.readthedocs.org/en/latest/. 
pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/000077500000000000000000000000001417733132500232435ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/__init__.py000066400000000000000000000026511417733132500253600ustar00rootroot00000000000000# -*- coding: utf-8 -*-
# appledict/indexes/__init__.py
#
# Copyright © 2016 Ratijas
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
"""
extended indexes generation with respect to source language.
"""

import os
import pkgutil

from pyglossary.plugins.formats_common import log

__all__ = ["languages", "log"]

languages = {}
"""
Dict[str, Callable[[Sequence[str], str], Sequence[str]]]

submodules must register languages by adding (language name -> function)
pairs to the mapping.

function must follow the signature below:
:param titles: flat iterable of title and alternative titles
:param content: cleaned entry content
:return: iterable of indexes (str).

use
```
from . import languages
# or
from appledict.indexes import languages
```
"""

here = os.path.dirname(os.path.abspath(__file__))

for _, module, _ in pkgutil.iter_modules([here]):
	__import__(f"{__name__}.{module}")
pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/ru.py000066400000000000000000000045621417733132500242500ustar00rootroot00000000000000# -*- coding: utf-8 -*-
# appledict/indexes/ru.py
#
# Copyright © 2016 Ratijas
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
"""
Russian indexes based on pymorphy2.
"""

from . import languages
from pyglossary.plugins.formats_common import log, pip

try:
	import pymorphy2
except ImportError:
	log.error(f"""module pymorphy2 is required to build extended Russian indexes.
You can download it here: http://pymorphy2.readthedocs.org/en/latest/.
Or by running: {pip} install pymorphy2""")
	raise

morphy = pymorphy2.MorphAnalyzer()


def ru(titles, _):
	"""
	gives a set of all declensions, cases and other forms of the word `title`.
	note that it works only if title is one word.

	:type titles: Sequence[str]
	:rtype: List[str]
	"""
	indexes = set()
	indexes_norm = set()
	for title in titles:
		# in-place modification
		_ru(title, indexes, indexes_norm)
	return list(sorted(indexes))


def _ru(title, a, a_norm):
	# uppercase abbreviation
	if title.isupper():
		return title
	title_norm = normalize(title)

	# feature: put dot at the end to match only this word
	a.add(title)
	a.add(title + ".")
	a_norm.add(title_norm)

	# decline only one-word titles
	if len(title.split()) == 1:
		normal_forms = morphy.parse(title)
		if len(normal_forms) > 0:
			# forms of most probable match
			normal_form = normal_forms[0]
			for x in normal_form.lexeme:
				word = x.word
				# Apple Dictionary Services see no difference between
				# "й" and "и", "ё" and "е", so we're trying to avoid
				# "* Duplicate index. Skipped..." warning.
				# new: return indexes with original letters but check for
				# occurrence against "normal forms".
				word_norm = normalize(word)
				if word_norm not in a_norm:
					a.add(word)
					a_norm.add(word_norm)


def normalize(word):
	return word.lower().replace("й", "и").replace("ё", "е").replace("-", " ")

languages["ru"] = ru
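# --- Example (added; illustrative) ---
# normalize() folds letters that Apple Dictionary Services treat as equal:
#   normalize("Ёлка")        -> "елка"
#   normalize("тёмно-синий") -> "темно синии"  (ё -> е, й -> и, "-" -> " ")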
pyglossary-4.5.0/pyglossary/plugins/appledict/indexes/zh.py000066400000000000000000000055761417733132500242510ustar00rootroot00000000000000# -*- coding: utf-8 -*-
# appledict/indexes/zh.py
#
# Copyright © 2016 Ratijas
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
"""
Chinese wildcard and pinyin indexes.
"""

from pyglossary.plugins.formats_common import log, pip

import re

import bs4

try:
	import colorize_pinyin as color
except ImportError:
	log.error(f"""module colorize_pinyin is required to build extended Chinese
indexes. You can install it by running: {pip} install colorize-pinyin""")
	raise

from . import languages, log

pinyinPattern = re.compile(r",|;")
nonHieroglyphPattern = re.compile(r"[^\u4e00-\u9fff]")


def zh(titles, content):
	"""
	Chinese indexes.

	assuming that content is HTML and pinyin is inside the second tag
	(the first is the hieroglyph headword), we can try to parse pinyin and
	generate indexes with pinyin subwords separated by whitespaces
	- pinyin itself
	- pinyin with diacritics replaced by tone numbers

	multiple pronunciations separated by comma or semicolon are supported.
	"""
	indexes = set()

	for title in titles:
		# feature: put dot at the end to match only this word
		indexes.update({title, title + "。"})

		# remove all non hieroglyph
		title = nonHieroglyphPattern.sub("", title)
		indexes.add(title)

	indexes.update(pinyin_indexes(content))

	return indexes


def pinyin_indexes(content):
	pinyin = find_pinyin(content)
	if not pinyin or pinyin == "_":
		return ()

	indexes = set()

	# multiple pronunciations
	for pinyin in pinyinPattern.split(pinyin):
		# find all pinyin ranges, use them to rip pinyin out
		py = [
			r._slice(pinyin)
			for r in color.ranges_of_pinyin_in_string(pinyin)
		]

		# maybe no pinyin here
		if not py:
			return ()

		# just pinyin, with diacritics, separated by whitespace
		indexes.add(color.utf(" ".join(py)) + ".")

		# pinyin with diacritics replaced by tone numbers
		indexes.add(
			color.utf(" ".join([
				color.lowercase_string_by_removing_pinyin_tones(p) +
				str(color.determine_tone(p))
				for p in py
			])) + "."
		)
	return indexes


def find_pinyin(content):
	# assume that content is HTML and pinyin is inside the second tag
	# (the first is the hieroglyph headword)
	soup = bs4.BeautifulSoup(content.splitlines()[0], features="lxml")
	if soup.body:
		soup = soup.body
	children = soup.children
	try:
		next(children)
		pinyin = next(children)
	except StopIteration:
		return None
	return pinyin.text

languages["zh"] = zh
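# --- Example (added; illustrative) ---
# for the title "你好" the wildcard part yields {"你好", "你好。"}; on top of
# that, pinyin_indexes() derives e.g. "nǐ hǎo." (with diacritics) and
# "ni3 hao3." (tones as numbers) from the pinyin found in the content.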
pyglossary-4.5.0/pyglossary/plugins/appledict/jing/000077500000000000000000000000001417733132500225335ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/jing/DictionarySchema/000077500000000000000000000000001417733132500257615ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/jing/DictionarySchema/AppleDictionarySchema.rng000066400000000000000000000042141417733132500327020ustar00rootroot00000000000000 pyglossary-4.5.0/pyglossary/plugins/appledict/jing/DictionarySchema/modules/000077500000000000000000000000001417733132500274315ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/jing/DictionarySchema/modules/dict-struct.rng000066400000000000000000000060311417733132500324060ustar00rootroot00000000000000 1 pyglossary-4.5.0/pyglossary/plugins/appledict/jing/__init__.py000066400000000000000000000005011417733132500246400ustar00rootroot00000000000000"""checking XML files with Apple Dictionary Schema.

this module can be run from the command line with a single argument -- the
file to be checked. otherwise, you need to import this module and call the
`run` function with the filename as its only argument.
"""

__all__ = ["run", "JingTestError"]

from .main import run, JingTestError
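# --- Usage sketch (added; not part of the original package) ---
#   from pyglossary.plugins.appledict.jing import run, JingTestError
#   try:
#       run("MyDict.xml")
#   except JingTestError as e:
#       print(e)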
pyglossary-4.5.0/pyglossary/plugins/appledict/jing/__main__.py000066400000000000000000000006171417733132500246310ustar00rootroot00000000000000"""main entry point"""

import logging
import os
import sys

sys.path.append(os.path.abspath(os.path.dirname(__file__)))

from . import main

log = logging.getLogger('root')
console_output_handler = logging.StreamHandler(sys.stderr)
console_output_handler.setFormatter(logging.Formatter(
	'%(asctime)s: %(message)s'
))
log.addHandler(console_output_handler)
log.setLevel(logging.INFO)

main.main()
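# --- Invocation sketch (added) ---
# the package can also be run directly, e.g.:
#   python3 -m pyglossary.plugins.appledict.jing MyDict.xml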
pyglossary-4.5.0/pyglossary/plugins/appledict/jing/jing/000077500000000000000000000000001417733132500234625ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/jing/jing/readme.html000066400000000000000000000052601417733132500256100ustar00rootroot00000000000000 Jing version 20091111

    Jing version 20091111

    Copyright © 2001, 2002, 2003, 2008 Thai Open Source Software Center Ltd. Jing can be freely copied subject to these conditions.

    This directory contains version 20091111 of Jing, a validator for RELAX NG and other schema languages.

    The directory bin contains jing.jar, which contains the code for Jing, ready to use with a Java runtime. For more information on how to use Jing, see this document.

    Apart from jing.jar, the bin directory contains some third-party jar files, which are used for XML parsing (under a pre-1.4 JRE that does not provide the Java XML parsing extension) and for validating with schema languages other than RELAX NG:

    saxon.jar
    Comes from the Saxon 6.5.5 distribution. Used for Schematron 1.5 validation.
    xercesImpl.jar
    xml-apis.jar
    Come from the Xerces2 Java 2.9.1 distribution. Used for W3C XML Schema validation and for XML parsing. Xerces2 Java is under the Apache License Version 2.0, which requires the following notice:
       Apache Xerces Java
       Copyright 1999-2007 The Apache Software Foundation
    
       This product includes software developed at
       The Apache Software Foundation (http://www.apache.org/).
    
       Portions of this software were originally based on the following:
         - software copyright (c) 1999, IBM Corporation., http://www.ibm.com.
         - software copyright (c) 1999, Sun Microsystems., http://www.sun.com.
         - voluntary contributions made by Paul Eng on behalf of the 
           Apache Software Foundation that were originally developed at iClick, Inc.,
           software copyright (c) 1999.
    isorelax.jar
    Comes from ISO RELAX 2004/11/11 distribution. Provides a bridge to validators that use the JARV interface.

    The file src.zip contains the Java source code. This is for reference purposes, and doesn't contain the supporting files, such as build scripts and test cases, that are needed for working conveniently with the source code. If you want to make changes to Jing, you should check out the source code and supporting files from the project's Subversion repository.

    pyglossary-4.5.0/pyglossary/plugins/appledict/jing/main.py000066400000000000000000000041551417733132500240360ustar00rootroot00000000000000"""Jing, a validator for RELAX NG and other schema languages.""" import logging from os import path import subprocess import sys __all__ = ["JingTestError", "run", "main"] log = logging.getLogger("pyglossary") log.setLevel(logging.DEBUG) class JingTestError(subprocess.CalledProcessError): """this exception raised when jing test failed, e.g. returned non-zero. the exit status will be stored in the `returncode` attribute. the `output` attribute also will store the output. """ def __init__(self, returncode, cmd, output): super(JingTestError, self).__init__(returncode, cmd, output) def __str__(self): return "\n".join([ f"Jing check failed with exit code {self.returncode}:", "-" * 80, self.output, ]) def run(filename): """run(filename) check whether the file named `filename` conforms to `AppleDictionarySchema.rng`. :returns: None :raises: JingTestError """ here = path.abspath(path.dirname(__file__)) filename = path.abspath(filename) jing_jar_path = path.join(here, "jing", "bin", "jing.jar") rng_path = path.join(here, "DictionarySchema", "AppleDictionarySchema.rng") # -Xmxn Specifies the maximum size, in bytes, of the memory allocation # pool. # -- from `man 1 java` args = ["java", "-Xmx2G", "-jar", jing_jar_path, rng_path, filename] cmd = " ".join(args) log.info("running Jing check:") log.info(cmd) log.info("...") pipe = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) returncode = pipe.wait() output = pipe.communicate()[0] if returncode != 0: if returncode < 0: log.error(f"Jing was terminated by signal {-returncode}") elif returncode > 0: log.error(f"Jing returned {returncode}") raise JingTestError(returncode, cmd, output) else: log.info("Jing check successfully passed!") def main(): """a command-line utility, runs Jing test on given dictionary XML file with Apple Dictionary Schema. 
""" if len(sys.argv) < 2: prog_name = path.basename(sys.argv[0]) log.info(f"usage:\n {prog_name} filename") exit(1) try: run(sys.argv[1]) except JingTestError as e: log.fatal(str(e)) exit(e.returncode) pyglossary-4.5.0/pyglossary/plugins/appledict/templates/000077500000000000000000000000001417733132500236025ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/appledict/templates/Dictionary.css000066400000000000000000000023101417733132500264150ustar00rootroot00000000000000@charset "UTF-8"; @namespace d url(http://www.apple.com/DTDs/DictionaryService-1.0.rng); @media (prefers-color-scheme: dark) { html { -apple-color-filter: apple-invert-lightness(); } img { filter: invert(0%); } } d|entry { } h1 { font-size: 150%; } h3 { font-size: 100%; } .ex, .m, .m0, .m1, .m2, .m3, .m4, .m5, .m6, .m7, .m8, .m9 { display: block; } .m { margin-left: 0em; } .m0 { margin-left: 0em; } .m1 { margin-left: 1em; } .m2 { margin-left: 2em; } .m3 { margin-left: 3em; } .m4 { margin-left: 4em; } .m5 { margin-left: 5em; } .m6 { margin-left: 6em; } .m7 { margin-left: 7em; } .m8 { margin-left: 8em; } .m9 { margin-left: 9em; } .ex + br, .k + br { display: none; } .c { color: green; } .p { font-style: italic; color: green; } .ex { color: #666; } .u { text-decoration: underline; } /* xdxf support */ .k { color: black; font-weight: bold; display: block; } .tr { color: black; } .abr { color: #008000; font-style: italic; } .hideextra .extra { display: none; } .stress { color: #FF0000; } .kref { color: #000080; text-decoration: none; } .pr { color: #000080; white-space: nowrap; text-decoration: none; overflow: hidden; text-overflow: ellipsis; padding-right: 1ex; } pyglossary-4.5.0/pyglossary/plugins/appledict/templates/Info.plist000066400000000000000000000016711417733132500255570ustar00rootroot00000000000000 CFBundleDevelopmentRegion English CFBundleIdentifier {CFBundleIdentifier} CFBundleDisplayName {CFBundleDisplayName} CFBundleName {CFBundleName} CFBundleShortVersionString 1.0 DCSDictionaryCopyright {DCSDictionaryCopyright}. DCSDictionaryManufacturerName {DCSDictionaryManufacturerName}. DCSDictionaryXSL {DCSDictionaryXSL} DCSDictionaryDefaultPrefs {DCSDictionaryDefaultPrefs} DCSDictionaryPrefsHTML {DCSDictionaryPrefsHTML} {DCSDictionaryFrontMatterReferenceID} pyglossary-4.5.0/pyglossary/plugins/appledict/templates/Makefile000066400000000000000000000024121417733132500252410ustar00rootroot00000000000000# # Makefile # # # ########################### # You need to edit these values. DICT_NAME = "{dict_name}" DICT_SRC_PATH = "{dict_name}.xml" CSS_PATH = "{dict_name}.css" PLIST_PATH = "{dict_name}.plist" DICT_BUILD_OPTS = # Suppress adding supplementary key. # DICT_BUILD_OPTS = -s 0 # Suppress adding supplementary key. ########################### # The DICT_BUILD_TOOL_DIR value is used also in "build_dict.sh" script. # You need to set it when you invoke the script directly. DICT_BUILD_TOOL_DIR = "/Applications/Utilities/Dictionary Development Kit" DICT_BUILD_TOOL_BIN = "$(DICT_BUILD_TOOL_DIR)/bin" ########################### DICT_DEV_KIT_OBJ_DIR = ./objects export DICT_DEV_KIT_OBJ_DIR DESTINATION_FOLDER = ~/Library/Dictionaries RM = /bin/rm ########################### all: "$(DICT_BUILD_TOOL_BIN)/build_dict.sh" $(DICT_BUILD_OPTS) $(DICT_NAME) $(DICT_SRC_PATH) $(CSS_PATH) $(PLIST_PATH) @echo "Done." install: @echo "Installing into $(DESTINATION_FOLDER)". 
	mkdir -p $(DESTINATION_FOLDER)
	ditto --noextattr --norsrc $(DICT_DEV_KIT_OBJ_DIR)/$(DICT_NAME).dictionary $(DESTINATION_FOLDER)/$(DICT_NAME).dictionary
	touch $(DESTINATION_FOLDER)
	@echo "Done."
	@echo "To test the new dictionary, try Dictionary.app."

clean:
	$(RM) -rf $(DICT_DEV_KIT_OBJ_DIR)
pyglossary-4.5.0/pyglossary/plugins/appledict_bin.py000066400000000000000000000144621417733132500230150ustar00rootroot00000000000000# -*- coding: utf-8 -*-
# Copyright © 2019 Saeed Rasooli (ilius)
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

from pyglossary.plugins.formats_common import *
from struct import unpack
from zlib import decompress

enable = True
lname = "appledict_bin"
format = "AppleDictBin"
description = "AppleDict Binary"
extensions = (".dictionary", ".data",)
extensionCreate = ""
singleFile = True
kind = "binary"
wiki = ""
website = (
	"https://support.apple.com/en-gu/guide/dictionary/welcome/mac",
	"Dictionary User Guide for Mac",
)
optionsProp = {
	"html": BoolOption(comment="Entries are HTML"),
	"html_full": BoolOption(
		comment="Turn every entry's definition into an HTML document",
	),
}


class Reader(object):
	depends = {
		"lxml": "lxml",
	}

	_html: bool = True
	_html_full: bool = False

	def __init__(self, glos):
		self._glos = glos
		self._filename = ""
		self._file = None
		self._encoding = "utf-8"
		self._buf = ""
		self._defiFormat = "m"
		try:
			from lxml import etree
		except ModuleNotFoundError as e:
			e.msg += f", run `{pip} install lxml` to install"
			raise e

	def open(self, filename):
		self._defiFormat = "h" if self._html else "m"
		parts = filename.split(os.sep)
		dbname = parts[-1]
		if isdir(filename):
			if parts[-1] == "Contents":
				filename = join(filename, "Body.data")
				if len(parts) > 2:
					dbname = parts[-2]
			elif isfile(join(filename, "Contents/Body.data")):
				filename = join(filename, "Contents/Body.data")
			elif isfile(join(filename, "Contents/Resources/Body.data")):
				filename = join(filename, "Contents/Resources/Body.data")
			else:
				raise IOError(
					"could not find Body.data file, "
					"please select Body.data file instead of directory"
				)
		elif dbname == "Body.data" and len(parts) > 1:
			dbname = parts[-2]
			if len(parts) > 2:
				if dbname == "Contents":
					dbname = parts[-3]
				elif dbname == "Resources" and len(parts) > 3:
					dbname = parts[-4]
		if not isfile(filename):
			raise IOError(f"no such file: {filename}")
		if dbname.endswith(".dictionary"):
			dbname = dbname[:-len(".dictionary")]
		self._glos.setInfo("name", dbname)

		self._filename = filename
		self._file = open(filename, "rb")
		self._file.seek(0x40)
		self._limit = 0x40 + unpack("i", self._file.read(4))[0]
		self._file.seek(0x60)

	def __len__(self):
		# FIXME: returning zero will disable the progress bar
		return 0

	def close(self):
		if self._file is not None:
			self._file.close()
			self._file = None

	def decode(self, st: bytes) -> str:
		return st.decode(self._encoding, errors="replace")

	def getChunkSize(self, pos):
		plus = self._buf[pos:pos + 12].find(b"<d:entry")
		if plus < 1:
			return 0, 0
		bs = self._buf[pos:pos + plus]
		if plus < 4:
			bs = b"\x00" * (4 - plus) + bs
		try:
			chunkSize, = unpack(">i", bs)
		except Exception as e:
			log.error(f"{self._buf[pos:pos+100]}")
			raise e
		return chunkSize, plus
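	# --- Layout sketch (added; describes the reconstruction above, not an
	# official spec): each <d:entry ...> chunk in the decompressed buffer is
	# preceded by a big-endian size field of up to 4 bytes, e.g.
	#   b"\x00\x00\x12\x34<d:entry ...>...</d:entry>"
	# getChunkSize() locates b"<d:entry" within the next 12 bytes and unpacks
	# the bytes before it (left-padded to 4 bytes) as the chunk size.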
"".join([ self.decode(etree.tostring( child, encoding="utf-8", )) for child in entryElem.iterdescendants() ]) defi = self.decode(etree.tostring( entryElem, encoding="utf-8", )) if self._html_full: defi = ( f'' f'' f'{defi}' ) return defi def _readEntry(self, pos: int) -> "Tuple[BaseEntry, int]": """ returns (entry, pos) """ from lxml import etree chunkSize, plus = self.getChunkSize(pos) pos += plus if chunkSize == 0: endI = self._buf[pos:].find(b"") if endI == -1: chunkSize = len(self._buf) - pos else: chunkSize = endI + 10 entryFull = self.decode(self._buf[pos:pos + chunkSize]) entryFull = entryFull.strip() if not entryFull: pos += chunkSize return None, pos try: entryRoot = etree.fromstring(entryFull) except etree.XMLSyntaxError as e: log.error(f"{self._buf[pos-plus:pos+100]}") log.error( f"chunkSize={chunkSize}, plus={plus}, pos={pos}, len(buf)={len(self._buf)}" ) log.error(f"entryFull={entryFull!r}") raise e entryElems = entryRoot.xpath("/d:entry", namespaces=entryRoot.nsmap) if not entryElems: return None, pos word = entryElems[0].xpath("./@d:title", namespaces=entryRoot.nsmap)[0] defi = self._getDefi(entryElems[0]) pos += chunkSize if self._limit <= 0: raise ValueError(f"self._limit = {self._limit}") return self._glos.newEntry( word, defi, defiFormat=self._defiFormat, byteProgress=(self._absPos, self._limit), ), pos def __iter__(self): from os.path import dirname if self._file is None: raise RuntimeError("iterating over a reader while it's not open") glos = self._glos cssFilename = join(dirname(self._filename), "DefaultStyle.css") if isfile(cssFilename): with open(cssFilename, mode="rb") as cssFile: cssBytes = cssFile.read() yield glos.newDataEntry("style.css", cssBytes) _file = self._file limit = self._limit while True: self._absPos = _file.tell() if self._absPos >= limit: break bufSizeB = _file.read(4) # type: bytes # alternative for buf, bufSize is calculated # ~ flag = f.tell() # ~ bufSize = 0 # ~ while True: # ~ zipp = f.read(bufSize) # ~ try: # ~ # print(zipp) # ~ input(zipp.decode(self._encoding)) # ~ buf = decompress(zipp[8:]) # ~ # print(buf) # ~ break # ~ except: # ~ print(bufSize) # ~ f.seek(flag) # ~ bufSize = bufSize+1 bufSize, = unpack("i", bufSizeB) # type: int self._buf = decompress(_file.read(bufSize)[8:]) pos = 0 while pos < len(self._buf): entry, pos = self._readEntry(pos) if entry is not None: yield entry pyglossary-4.5.0/pyglossary/plugins/babylon_bdc.py000066400000000000000000000004321417733132500224460ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = False lname = "babylon_bdc" format = "BabylonBdc" description = "Babylon (bdc)" extensions = (".bdc",) extensionCreate = "" singleFile = True kind = "binary" wiki = "" website = None optionsProp = {} pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/000077500000000000000000000000001417733132500221115ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/__init__.py000066400000000000000000000023071417733132500242240ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2021 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . from pyglossary.plugins.formats_common import * from .bgl_reader import BglReader as Reader from .bgl_reader import optionsProp enable = True lname = "babylon_bgl" format = "BabylonBgl" description = "Babylon (.BGL)" extensions = (".bgl",) extensionCreate = "" singleFile = True kind = "binary" wiki = "" website = None # progressbar = DEFAULT_YES # FIXME: document type of read/write options # (that would be specified in command line) pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_charset.py000066400000000000000000000024231417733132500247410ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2020 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . charsetByCode = { 0x41: "cp1252", # Default, 0x41 0x42: "cp1252", # Latin, 0x42 0x43: "cp1250", # Eastern European, 0x43 0x44: "cp1251", # Cyrillic, 0x44 0x45: "cp932", # Japanese, 0x45 0x46: "cp950", # Traditional Chinese, 0x46 0x47: "cp936", # Simplified Chinese, 0x47 0x48: "cp1257", # Baltic, 0x48 0x49: "cp1253", # Greek, 0x49 0x4A: "cp949", # Korean, 0x4A 0x4B: "cp1254", # Turkish, 0x4B 0x4C: "cp1255", # Hebrew, 0x4C 0x4D: "cp1256", # Arabic, 0x4D 0x4E: "cp874", # Thai, 0x4E } pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_info.md000066400000000000000000000061161417733132500242160ustar00rootroot00000000000000## bgl_numEntries (0x0c) ``` bgl_numEntries does not always matches the number of entries in the dictionary, but it's close to it. the difference is usually +- 1 or 2, in rare cases may be 9, 29 and more ``` ## bgl_length (0x43) ``` The length of the substring match in a term. For example, if your glossary contains the term "Dog" and the substring length is 2, search of the substrings "Do" or "og" will retrieve the term dog. Use substring length 0 for exact match. ``` ## bgl_contractions (0x3b) ``` contains a value like this: V-0#Verb|V-0.0#|V-0.1#Infinitive|V-0.1.1#|V-1.0#|V-1.1#|V-1.1.1#Present Simple|V-1.1.2#Present Simple (3rd pers. sing.)|V-2.0#|V-2.1#|V-2.1.1#Past Simple|V-3.0#|V-3.1#|V-3.1.1#Present Participle|V-4.0#|V-4.1#|V-4.1.1#Past Participle|V-5.0#|V-5.1#|V-5.1.1#Future|V2-0#|V2-0.0#|V2-0.1#Infinitive|V2-0.1.1#|V2-1.0#|V2-1.1#|V2-1.1.1#Present Simple (1st pers. sing.)|V2-1.1.2#Present Simple (2nd pers. sing. & plural forms)|V2-1.1.3#Present Simple (3rd pers. sing.)|V2-2.0#|V2-2.1#|V2-2.1.1#Past Simple (1st & 3rd pers. sing.)|V2-2.1.2#Past Simple (2nd pers. sing. 
& plural forms)|V2-3.0#|V2-3.1#|V2-3.1.1#Present Participle|V2-4.0#|V2-4.1#|V2-4.1.1#Past Participle|V2-5.0#|V2-5.1#|V2-5.1.1#Future||N-0#Noun|N-1.0#|N-1.1#|N-1.1.1#Singular|N-2.0#|N-2.1#|N-2.1.1#Plural|N4-1.0#|N4-1.1#|N4-1.1.1#Singular Masc.|N4-1.1.2#Singular Fem.|N4-2.0#|N4-2.1#|N4-2.1.1#Plural Masc.|N4-2.1.2#Plural Fem.||ADJ-0#Adjective|ADJ-1.0#|ADJ-1.1#|ADJ-1.1.1#Adjective|ADJ-1.1.2#Comparative|ADJ-1.1.3#Superlative||

value format: (<code> "#" [<value>] "|")+

The value is in the second language, that is for Babylon Russian-English.BGL
the value is in russian, for Babylon English-Spanish.BGL the value is
spanish (I guess), etc.
```

## bgl_about: Glossary manual file (0x41)

```
additional information about the dictionary
in .txt format this may be short info like this:

Biology Glossary
Author name: Hafez Divandari
Author email: hafezdivandari@gmail.com
-------------------------------------------
A functional glossary for translating
English biological articles to fluent Farsi
-------------------------------------------
Copyright (c) 2009 All rights reserved.

in .pdf format this may be a quite large document (about 30 pages),
an introduction into the dictionary. It describes the structure of an
article, the editors, and how to use the dictionary.

format:
<file extension> "\x00" <file contents>

file extension may be: ".txt", ".pdf"
```

## bgl_purchaseLicenseMsg (0x2c)

```
contains a value like this:
In order to view this glossary, you must purchase a license.
    Click here to purchase. ``` ## bgl_licenseExpiredMsg (0x2d) ``` contains a value like this: Your license for this glossary has expired. In order to view this glossary, you must have a valid license.
    Renew your license today. ``` ## bgl_purchaseAddress (0x2e) ``` contains a value like this: http://www.babylon.com/redirects/purchase.cgi?type=169&trid=BPCOT or mailto:larousse@babylon.com ``` pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_info.py000066400000000000000000000147421417733132500242520ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2021 Saeed Rasooli (ilius) # Copyright © 2011-2012 kubtek # This file is part of PyGlossary project, http://github.com/ilius/pyglossary # Thanks to Raul Fernandes and Karl Grill for reverse # engineering as part of https://sourceforge.net/projects/ktranslator/ # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . from .bgl_language import languageByCode from .bgl_charset import charsetByCode from pyglossary.plugins.formats_common import log import pyglossary.gregorian as gregorian from pyglossary.text_utils import ( uintFromBytes, ) class InfoItem(object): __slots__ = ( "name", "decode", "attr", ) def __init__( self, name: str, decode: "Optional[Callable[[bytes], Any]]" = None, attr: bool = False, ): self.name = name self.decode = decode self.attr = attr def decodeBglBinTime(b_value): jd1970 = gregorian.to_jd(1970, 1, 1) djd, hm = divmod(uintFromBytes(b_value), 24 * 60) year, month, day = gregorian.jd_to(djd + jd1970) hour, minute = divmod(hm, 60) return f"{year:04d}/{month:02d}/{day:02d}, {hour:02d}:{minute:02d}" def languageInfoDecode(b_value): """ returns BabylonLanguage instance """ intValue = uintFromBytes(b_value) try: return languageByCode[intValue] except IndexError: log.warning(f"read_type_3: unknown language code = {intValue}") return def charsetInfoDecode(b_value): value = b_value[0] try: return charsetByCode[value] except KeyError: log.warning(f"read_type_3: unknown charset {value!r}") def aboutInfoDecode(b_value): if not b_value: return aboutExt, _, aboutContents = b_value.partition(b"\x00") if not aboutExt: log.warning("read_type_3: about: no file extension") return return { "about_extension": aboutExt, "about": aboutContents, } def utf16InfoDecode(b_value): """ b_value is byte array returns str, or None (on errors) block type = 3 block format: <2 byte code1><2 byte code2> if code2 == 0: then the block ends if code2 == 1: then the block continues as follows: <4 byte len1> \x00 \x00 len1 - length of message in 2-byte chars """ if b_value[0] != 0: log.warning( f"utf16InfoDecode: b_value={b_value}, null expected at 0", ) return if b_value[1] == 0: if len(b_value) > 2: log.warning( f"utf16InfoDecode: unexpected b_value size: {len(b_value)}", ) return elif b_value[1] > 1: log.warning( f"utf16InfoDecode: b_value={b_value!r}, unexpected byte at 1", ) return # now b_value[1] == 1 size = 2 * uintFromBytes(b_value[2:6]) if tuple(b_value[6:8]) != (0, 0): log.warning( f"utf16InfoDecode: b_value={b_value!r}, null expected at 6:8", ) if size != len(b_value) - 8: log.warning( f"utf16InfoDecode: b_value={b_value!r}, size does not 
match", ) return b_value[8:].decode("utf16") # str def flagsInfoDecode(b_value): """ returns a dict with these keys: utf8Encoding when this flag is set utf8 encoding is used for all articles when false, the encoding is set according to the source and target alphabet bgl_spellingAlternatives determines whether the glossary offers spelling alternatives for searched terms bgl_caseSensitive defines if the search for terms in this glossary is case sensitive see code 0x20 as well """ flags = uintFromBytes(b_value) return { "utf8Encoding": (flags & 0x8000 != 0), "bgl_spellingAlternatives": (flags & 0x10000 == 0), "bgl_caseSensitive": (flags & 0x1000 != 0), } infoType3ByCode = { # glossary name 0x01: InfoItem("title"), # glossary author name, a list of "|"-separated values 0x02: InfoItem("author"), # glossary author e-mail 0x03: InfoItem("email"), 0x04: InfoItem("copyright"), 0x07: InfoItem( "sourceLang", decode=languageInfoDecode, attr=True, ), 0x08: InfoItem( "targetLang", decode=languageInfoDecode, attr=True, ), 0x09: InfoItem("description"), # 0: browsing disabled, 1: browsing enabled 0x0a: InfoItem( "bgl_browsingEnabled", decode=lambda b_value: (b_value[0] != 0), ), 0x0b: InfoItem("icon1.ico"), 0x0c: InfoItem( "bgl_numEntries", decode=uintFromBytes, attr=True, ), # the value is a dict 0x11: InfoItem("flags", decode=flagsInfoDecode), 0x14: InfoItem("creationTime", decode=decodeBglBinTime), 0x1a: InfoItem( "sourceCharset", decode=charsetInfoDecode, attr=True, ), 0x1b: InfoItem( "targetCharset", decode=charsetInfoDecode, attr=True, ), 0x1c: InfoItem( "bgl_firstUpdated", decode=decodeBglBinTime, ), # bgl_firstUpdated was prevously called middleUpdated # in rare cases, bgl_firstUpdated is before creationTime # but usually it looks like to be the first update (after creation) # in some cases, it's the same as lastUpdated # in some cases, it's minutes after creationTime # bgl_firstUpdated exists in more glossaries than lastUpdated # so if lastUpdated is not there, we use bgl_firstUpdated as lastUpdated 0x20: InfoItem( "bgl_caseSensitive2", decode=lambda b_value: (b_value[0] == 0x31), # 0x30 - case sensitive search is disabled # 0x31 - case sensitive search is enabled ), 0x24: InfoItem("icon2.ico"), 0x2c: InfoItem( "bgl_purchaseLicenseMsg", decode=utf16InfoDecode, ), 0x2d: InfoItem( "bgl_licenseExpiredMsg", decode=utf16InfoDecode, ), 0x2e: InfoItem("bgl_purchaseAddress"), 0x30: InfoItem( "bgl_titleWide", decode=utf16InfoDecode, ), # a list of "|"-separated values 0x31: InfoItem( "bgl_authorWide", decode=utf16InfoDecode, ), 0x33: InfoItem( "lastUpdated", decode=decodeBglBinTime, ), 0x3b: InfoItem("bgl_contractions"), # contains a value like "Arial Unicode MS" or "Tahoma" 0x3d: InfoItem("bgl_fontName"), # value would be dict 0x41: InfoItem( "bgl_about", decode=aboutInfoDecode, ), # the length of the substring match in a term 0x43: InfoItem( "bgl_length", decode=uintFromBytes, ), } pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_language.py000066400000000000000000000334621417733132500251020ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2020 Saeed Rasooli (ilius) # Copyright © 2011-2012 kubtek # This file is part of PyGlossary project, http://github.com/ilius/pyglossary # Thanks to Raul Fernandes and Karl Grill for reverse # engineering as part of https://sourceforge.net/projects/ktranslator/ # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either 
version 3, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL
# If not, see <http://www.gnu.org/licenses/>.

"""
language properties

In this short note we describe how Babylon selects the encoding for key
words, alternates and definitions. There are source and target encodings.
The source encoding is used to encode keys and alternates, the target
encoding is used to encode definitions. The source encoding is selected
based on the source language of the dictionary, the target encoding is
tied to the target language.

Babylon Glossary Builder allows you to specify source and target languages.
If you open a Builder project (a file with .gpr extension) in a text
editor, you should find the following elements:
<bab:SourceCharset>Latin</bab:SourceCharset>
<bab:SourceLanguage>English</bab:SourceLanguage>
<bab:TargetCharset>Latin</bab:TargetCharset>
<bab:TargetLanguage>English</bab:TargetLanguage>
Here bab:SourceLanguage is the source language that you select in the
builder wizard, bab:SourceCharset - is the corresponding charset.
bab:TargetLanguage - target language, bab:TargetCharset - corresponding
charset.
Unfortunately, builder does not tell us what encoding corresponds to
charset, but we can detect it.

A few words about how definitions are encoded. If all chars of the
definition fall into the target charset, Babylon uses that charset to
encode the definition. If at least one char does not fall into the target
charset, Babylon uses utf-8 encoding, wrapping the definition into
<charset ...> and </charset> tags.
You can make Babylon use utf-8 encoding for the whole dictionary, in that
case all definitions, keys and alternates are encoded with utf-8. See
Babylon Glossary Builder wizard, Glossary Properties tab, Advanced button,
Use UTF-8 encoding check box. Definitions are not augmented with extra
markup in this case, that is you'll not find charset tags in definitions.

How can you tell what encoding was used for a particular definition in a
.bgl file? You need to check the following conditions.

Block type 3, code 0x11. If the 0x8000 bit is set, the whole dictionary
uses utf-8 encoding.

If the definition starts with <charset ...>, that definition uses utf-8
encoding.

Otherwise you need to consult the target encoding.

Block type 3, code 0x1b. That field normally contains a 1-byte code of the
target encoding. Codes fill the range of 0x41 to 0x4e. Babylon Builder
generates codes 0x42 - 0x4e. How to generate code 0x41? Occasionally you
may encounter a field value of four zero bytes. In this case, I guess,
the default encoding for the target language is used.

Block type 3, code 0x08. That field contains the 4-byte code of the target
language. The first three bytes are always zero, the last byte is the code.
Playing with Babylon Glossary builder we can find language codes
corresponding to the target language. The language codes fill the range of
0 to 0x3d.

How to detect the target encoding? Here is the technique I've used.
- Create a babylon glossary source file (a file with .gls extension) with
the following contents. Start the file with utf-8 BOM for the builder to
recognize the utf-8 encoding. Use the unicode code point code as the key,
and a single unicode char encoded in utf-8 as the definition. Create keys
for all code points in the range 32 - 0x10000, or you may use a wider
range. We do not use code points in the range 0-31, since they are control
chars. You should skip the following three chars: & < >. Since the
definition is supposed to contain html, these chars are replaced by
&amp; &lt; &gt; respectively. You should skip the char $ as well, it has
special meaning in definitions (?). Skip all code points that cannot be
encoded in utf-8 (not all code points in the range 32-0x10000 represent
valid chars).
You should skip the following three chars: & < >. Since the definition is supposed to contain html, these chars are be replaced by & < > respectively. You should skip the char $ as well, it has special meaning in definitions (?). Skip all code point that cannot encoded in utf-8 (not all code points in the range 32-0x10000 represent valid chars). - Now that you have a glossary source file, process it with builder selecting the desired target language. Make sure the "use utf-8" option is no set. You'll get a .bgl file. - Process the generated .bgl file with pyglossary. Skip all definitions that start with tag. Try to decode definitions using different encodings and match the result with the real value (key - code point char code). Thus you'll find the encoding having the best match. For example, you may do the following. Loop over all available encodings, loop over all definitions in the dictionary. Count the number of definitions that does not start with charset tag - total. Among them count the number of definitions that were correctly decoded - success. The encoding where total == success, is the target encoding. There are a few problems I encountered. It looks like python does not correctly implement cp932 and cp950 encodings. For Japanese charset I got 99.12% match, and for Traditional Chinese charset I got even less - 66.97%. To conform my guess that Japanese is cp932 and Traditional Chinese is cp950 I built a C++ utility that worked on the data extracted from .bgl dictionary. I used WideCharToMultiByte function for conversion. The C++ utility confirmed the cp932 and cp950 encodings, I got 100% match. """ class BabylonLanguage(object): """ Babylon language properties. name - bab:SourceLanguage, bab:TargetLanguage .gpr tags (English, French, Japanese) charset - bab:SourceCharset, bab:TargetCharset .gpr tags (Latin, Arabic, Cyrillic) encoding - Windows code page (cp1250, cp1251, cp1252) code - value of the type 3, code in .bgl file """ def __init__(self, name, charset, encoding, code, code2="", name2=""): self.name = name self.name2 = name2 self.charset = charset self.encoding = encoding self.code = code languages = ( BabylonLanguage( name="English", charset="Latin", encoding="cp1252", code=0x00, code2="en", ), BabylonLanguage( name="French", charset="Latin", encoding="cp1252", code=0x01, code2="fr", ), BabylonLanguage( name="Italian", charset="Latin", encoding="cp1252", code=0x02, code2="it", ), BabylonLanguage( name="Spanish", charset="Latin", encoding="cp1252", code=0x03, code2="es", ), BabylonLanguage( name="Dutch", charset="Latin", encoding="cp1252", code=0x04, code2="nl", ), BabylonLanguage( name="Portuguese", charset="Latin", encoding="cp1252", code=0x05, code2="pt", ), BabylonLanguage( name="German", charset="Latin", encoding="cp1252", code=0x06, code2="de", ), BabylonLanguage( name="Russian", charset="Cyrillic", encoding="cp1251", code=0x07, code2="ru", ), BabylonLanguage( name="Japanese", charset="Japanese", encoding="cp932", code=0x08, code2="ja", ), BabylonLanguage( name="Chinese", name2="Traditional Chinese", charset="Traditional Chinese", encoding="cp950", code=0x09, code2="zh", ), BabylonLanguage( name="Chinese", name2="Simplified Chinese", charset="Simplified Chinese", encoding="cp936", code=0x0a, code2="zh", ), BabylonLanguage( name="Greek", charset="Greek", encoding="cp1253", code=0x0b, code2="el", ), BabylonLanguage( name="Korean", charset="Korean", encoding="cp949", code=0x0c, code2="ko", ), BabylonLanguage( name="Turkish", charset="Turkish", encoding="cp1254", 
code=0x0d, code2="tr", ), BabylonLanguage( name="Hebrew", charset="Hebrew", encoding="cp1255", code=0x0e, code2="he", ), BabylonLanguage( name="Arabic", charset="Arabic", encoding="cp1256", code=0x0f, code2="ar", ), BabylonLanguage( name="Thai", charset="Thai", encoding="cp874", code=0x10, code2="th", ), BabylonLanguage( name="Other", charset="Latin", encoding="cp1252", code=0x11, code2="", # none ), BabylonLanguage( name="Chinese", name2="Other Simplified Chinese dialects", charset="Simplified Chinese", encoding="cp936", code=0x12, code2="zh", # duplicate ), BabylonLanguage( name="Chinese", name2="Other Traditional Chinese dialects", charset="Traditional Chinese", encoding="cp950", code=0x13, code2="zh", # duplicate ), BabylonLanguage( name="Other Eastern-European languages", charset="Eastern European", encoding="cp1250", code=0x14, code2="", # none ), BabylonLanguage( name="Other Western-European languages", charset="Latin", encoding="cp1252", code=0x15, code2="", # none ), BabylonLanguage( name="Other Russian languages", charset="Cyrillic", encoding="cp1251", code=0x16, code2="", # none ), BabylonLanguage( name="Other Japanese languages", charset="Japanese", encoding="cp932", code=0x17, code2="", # none ), BabylonLanguage( name="Other Baltic languages", charset="Baltic", encoding="cp1257", code=0x18, code2="bat", # no 2-letter code ), BabylonLanguage( name="Other Greek languages", charset="Greek", encoding="cp1253", code=0x19, code2="", # none ), BabylonLanguage( name="Other Korean dialects", charset="Korean", encoding="cp949", code=0x1a, code2="", # none ), BabylonLanguage( name="Other Turkish dialects", charset="Turkish", encoding="cp1254", code=0x1b, code2="", # none ), BabylonLanguage( name="Other Thai dialects", charset="Thai", encoding="cp874", code=0x1c, code2="tai", # no 2-letter code, and "tha" / "th" is for "Thai" ), BabylonLanguage( name="Polish", charset="Eastern European", encoding="cp1250", code=0x1d, code2="pl", ), BabylonLanguage( name="Hungarian", charset="Eastern European", encoding="cp1250", code=0x1e, code2="hu", ), BabylonLanguage( name="Czech", charset="Eastern European", encoding="cp1250", code=0x1f, code2="cs", ), BabylonLanguage( name="Lithuanian", charset="Baltic", encoding="cp1257", code=0x20, code2="lt", ), BabylonLanguage( name="Latvian", charset="Baltic", encoding="cp1257", code=0x21, code2="lv", ), BabylonLanguage( name="Catalan", charset="Latin", encoding="cp1252", code=0x22, code2="ca", ), BabylonLanguage( name="Croatian", charset="Eastern European", encoding="cp1250", code=0x23, code2="hr", ), BabylonLanguage( name="Serbian", charset="Eastern European", encoding="cp1250", code=0x24, code2="sr", ), BabylonLanguage( name="Slovak", charset="Eastern European", encoding="cp1250", code=0x25, code2="sk", ), BabylonLanguage( name="Albanian", charset="Latin", encoding="cp1252", code=0x26, code2="sq", ), BabylonLanguage( name="Urdu", charset="Arabic", encoding="cp1256", code=0x27, code2="ur", ), BabylonLanguage( name="Slovenian", charset="Eastern European", encoding="cp1250", code=0x28, code2="sl", ), BabylonLanguage( name="Estonian", charset="Latin", encoding="cp1252", code=0x29, code2="et", ), BabylonLanguage( name="Bulgarian", charset="Eastern European", encoding="cp1250", code=0x2a, code2="bg", ), BabylonLanguage( name="Danish", charset="Latin", encoding="cp1252", code=0x2b, code2="da", ), BabylonLanguage( name="Finnish", charset="Latin", encoding="cp1252", code=0x2c, code2="fi", ), BabylonLanguage( name="Icelandic", charset="Latin", encoding="cp1252", 
code=0x2d, code2="is", ), BabylonLanguage( name="Norwegian", charset="Latin", encoding="cp1252", code=0x2e, code2="no", ), BabylonLanguage( name="Romanian", charset="Latin", encoding="cp1252", code=0x2f, code2="ro", ), BabylonLanguage( name="Swedish", charset="Latin", encoding="cp1252", code=0x30, code2="sv", ), BabylonLanguage( name="Ukrainian", charset="Cyrillic", encoding="cp1251", code=0x31, code2="uk", ), BabylonLanguage( name="Belarusian", charset="Cyrillic", encoding="cp1251", code=0x32, code2="be", ), BabylonLanguage( name="Persian", # aka "Farsi" charset="Arabic", encoding="cp1256", code=0x33, code2="fa", ), BabylonLanguage( name="Basque", charset="Latin", encoding="cp1252", code=0x34, code2="eu", ), BabylonLanguage( name="Macedonian", charset="Eastern European", encoding="cp1250", code=0x35, code2="mk", ), BabylonLanguage( name="Afrikaans", charset="Latin", encoding="cp1252", code=0x36, code2="af", ), BabylonLanguage( # Babylon Glossary Builder spells this language "Faeroese" name="Faroese", charset="Latin", encoding="cp1252", code=0x37, code2="fo", ), BabylonLanguage( name="Latin", charset="Latin", encoding="cp1252", code=0x38, code2="la", ), BabylonLanguage( name="Esperanto", charset="Turkish", encoding="cp1254", code=0x39, code2="eo", ), BabylonLanguage( name="Tamazight", # aka "Standard Moroccan Tamazight", "Standard Moroccan Berber" # or "Standard Moroccan Amazigh" charset="Latin", encoding="cp1252", code=0x3a, code2="zgh", # no 2-letter code (ISO 639-1) ), BabylonLanguage( name="Armenian", charset="Latin", encoding="cp1252", code=0x3b, code2="hy", ), BabylonLanguage( name="Hindi", charset="Latin", encoding="cp1252", code=0x3c, code2="hi", ), BabylonLanguage( name="Somali", charset="Latin", encoding="cp1252", code=0x3d, code2="so", ), ) languageByCode = {lang.code: lang for lang in languages} pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_pos.py000066400000000000000000000043501417733132500241120ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2020 Saeed Rasooli (ilius) # Copyright © 2011-2012 kubtek # This file is part of PyGlossary project, http://github.com/ilius/pyglossary # Thanks to Raul Fernandes and Karl Grill for reverse # engineering as part of https://sourceforge.net/projects/ktranslator/ # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
partOfSpeechByCode = { # Use None for codes we have not seen yet # Use "" for codes we've seen but part of speech is unknown 0x30: "noun", 0x31: "adjective", 0x32: "verb", 0x33: "adverb", 0x34: "interjection", 0x35: "pronoun", 0x36: "preposition", 0x37: "conjunction", 0x38: "suffix", 0x39: "prefix", 0x3A: "article", 0x3B: "", # in Babylon Italian-English.BGL, # Babylon Spanish-English.BGL, # Babylon_Chinese_S_English.BGL # no indication of the part of speech 0x3C: "abbreviation", # (short form: 'ר"ת') # (full form: "ר"ת: ראשי תיבות") # "ת'" # adjective # (full form: "ת': תואר") # "ש"ע" # noun # (full form: "ש"ע: שם עצם") 0x3D: "masculine noun and adjective", 0x3E: "feminine noun and adjective", 0x3F: "masculine and feminine noun and adjective", 0x40: "feminine noun", # (short form: "נ\'") # (full form: "נ': נקבה") 0x41: "masculine and feminine noun", # 0x41: noun that may be used as masculine and feminine # (short form: "זו"נ") # (full form: "זו"נ: זכר ונקבה") 0x42: "masculine noun", # (short form: 'ז\'') # (full form: "ז': זכר") 0x43: "numeral", 0x44: "participle", 0x45: None, 0x46: None, 0x47: None, } pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_reader.py000066400000000000000000001312701417733132500245550ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2021 Saeed Rasooli (ilius) # Copyright © 2011-2012 kubtek # This file is part of PyGlossary project, http://github.com/ilius/pyglossary # Thanks to Raul Fernandes and Karl Grill for reverse # engineering as part of https://sourceforge.net/projects/ktranslator/ # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
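# --- Editor's note: hedged usage sketch, not part of the original file ---
# The partOfSpeechByCode table above is consumed when a definition's
# trailing fields carry the marker byte 0x02 followed by a one-byte code
# (see collectDefiFields in bgl_reader.py below). demoDecodePartOfSpeech is
# a hypothetical helper added only for illustration.
def demoDecodePartOfSpeech(b_field):
	from .bgl_pos import partOfSpeechByCode
	if len(b_field) < 2 or b_field[0] != 0x02:
		return ""
	# table values: None = code never seen; "" = seen, but meaning unknown
	return partOfSpeechByCode.get(b_field[1]) or ""
# demoDecodePartOfSpeech(b"\x02\x30") == "noun"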
import io import gzip import re from collections import OrderedDict as odict from pyglossary.plugins.formats_common import * # FIXME try: vmajor, vminor = sys.version_info[:2] GzipFile = __import__( f"pyglossary.plugin_lib.py{vmajor}{vminor}.gzip_no_crc", fromlist="GzipFile", ).GzipFile except ImportError as e: from gzip import GzipFile log.warning(str(e)) from pyglossary.text_utils import ( uintFromBytes, excMessage, ) from pyglossary.xml_utils import xml_escape from .bgl_info import ( infoType3ByCode, charsetInfoDecode, ) from .bgl_pos import partOfSpeechByCode from .bgl_text import ( replaceHtmlEntries, replaceHtmlEntriesInKeys, stripHtmlTags, removeControlChars, removeNewlines, normalizeNewlines, replaceAsciiCharRefs, fixImgLinks, stripDollarIndexes, unknownHtmlEntries, ) file = io.BufferedReader debugReadOptions = { "search_char_samples", # bool "collect_metadata2", # bool "write_gz", # bool "char_samples_path", # str, file path "msg_log_path", # str, file path "raw_dump_path", # str, file path "unpacked_gzip_path", # str, file path } optionsProp = { "default_encoding_overwrite": EncodingOption( comment="Default encoding (overwrite)", ), "source_encoding_overwrite": EncodingOption( comment="Source encoding (overwrite)", ), "target_encoding_overwrite": EncodingOption( comment="Target encoding (overwrite)", ), "part_of_speech_color": HtmlColorOption( comment="Color for Part of Speech", ), "no_control_sequence_in_defi": BoolOption( comment="No control sequence in definitions", ), "strict_string_convertion": BoolOption( comment="Strict string conversion", ), "process_html_in_key": BoolOption( comment="Process HTML in (entry or info) key", ), "key_rstrip_chars": StrOption( multiline=True, comment="Characters to strip from right-side of keys", ), # debug read options: "search_char_samples": BoolOption( comment="(debug) Search character samples", ), "collect_metadata2": BoolOption( comment="(debug) Collect second pass metadata from definitions", ), "write_gz": BoolOption( comment="(debug) Create a file named *-data.gz", ), "char_samples_path": StrOption( # file path comment="(debug) File path for character samples", ), "msg_log_path": StrOption( # file path comment="(debug) File path for message log", ), "raw_dump_path": StrOption( # file path comment="(debug) File path for writing raw blocks", ), "unpacked_gzip_path": StrOption( # file path comment="(debug) Path to create unzipped file", ), } if os.sep == "/": # Operating system is Unix-like tmpDir = "/tmp" elif os.sep == "\\": # Operating system is ms-windows tmpDir = os.getenv("TEMP") else: raise RuntimeError( f"Unknown path separator (os.sep=={os.sep!r}). " f"What is your operating system?" ) re_charset_decode = re.compile( b"(<charset\\s+c\\=['\"]?(\\w)[\"']?>|</charset>)", re.I, ) re_b_reference = re.compile(b"^[0-9a-fA-F]{4}$") class BGLGzipFile(GzipFile): """ gzip_no_crc.py contains GzipFile class without CRC check. It prints a warning when CRC code does not match. The original method raises an exception in this case. Some dictionaries do not use CRC code, it is set to 0. """ def __init__( self, fileobj=None, closeFileobj=False, **kwargs ): GzipFile.__init__(self, fileobj=fileobj, **kwargs) self.closeFileobj = closeFileobj def close(self): if self.closeFileobj: self.fileobj.close() class Block(object): def __init__(self): self.data = b"" self.type = "" # block offset in the gzip stream, for debugging self.offset = -1 def __str__(self): return f"Block type={self.type}, " \ f"len(data)={len(self.data)}" class FileOffS(file): """ A file class with an offset.
This class provides an interface to a part of a file starting at specified offset and ending at the end of the file, making it appear an independent file. offset parameter of the constructor specifies the offset of the first byte of the modeled file. """ def __init__(self, filename, offset=0): fileObj = open(filename, "rb") file.__init__(self, fileObj) self._fileObj = fileObj self.offset = offset file.seek(self, offset) # OR self.seek(0) def close(self): self._fileObj.close() def seek(self, pos, whence=0): # position, whence if whence == 0: # relative to start of file file.seek( self, max(0, pos) + self.offset, 0, ) elif whence == 1: # relative to current position file.seek( self, max( self.offset, self.tell() + pos, ), 0 ) elif whence == 2: # relative to end of file file.seek(self, pos, 2) else: raise ValueError(f"FileOffS.seek: bad whence={whence}") def tell(self): return file.tell(self) - self.offset class DefinitionFields(object): """ Fields of entry definition Entry definition consists of a number of fields. The most important of them are: defi - the main definition, mandatory, comes first. part of speech title """ # nameByCode = { # } def __init__(self): # self.bytesByCode = {} # self.strByCode = {} self.encoding = None # encoding of the definition self.singleEncoding = True # singleEncoding=True if the definition was encoded with # a single encoding self.b_defi = None # bytes, main definition part of defi self.u_defi = None # str, main part of definition self.partOfSpeech = None # string representation of the part of speech, utf-8 self.b_title = None # bytes self.u_title = None # str self.b_title_trans = None # bytes self.u_title_trans = None # str self.b_transcription_50 = None # bytes self.u_transcription_50 = None # str self.code_transcription_50 = None self.b_transcription_60 = None # bytes self.u_transcription_60 = None # str self.code_transcription_60 = None self.b_field_1a = None # bytes self.u_field_1a = None # str self.b_field_07 = None # bytes self.b_field_06 = None # bytes self.b_field_13 = None # bytes class BglReader(object): _default_encoding_overwrite = "" _source_encoding_overwrite = "" _target_encoding_overwrite = "" _part_of_speech_color = "007000" _no_control_sequence_in_defi = False _strict_string_convertion = False # process keys and alternates as HTML # Babylon does not interpret keys and alternates as HTML text, # however you may encounter many keys containing character references # and html tags. That is clearly a bug of the dictionary. # We must be very careful processing HTML tags in keys, not damage # normal keys. This option should be disabled by default, enabled # explicitly by user. Namely this option does the following: # - resolve character references # - strip HTML tags _process_html_in_key = False # a string of characters that will be stripped from the end of the # key (and alternate), see str.rstrip function _key_rstrip_chars = "" ########################################################################## """ Dictionary properties --------------------- Dictionary (or glossary) properties are textual data like glossary name, glossary author name, glossary author e-mail, copyright message and glossary description. Most of the dictionaries have these properties set. Since they contain textual data we need to know the encoding. There may be other properties not listed here. I've enumerated only those that are available in Babylon Glossary builder. Playing with Babylon builder allows us detect how encoding is selected. 
If global utf-8 flag is set, utf-8 encoding is used for all properties. Otherwise the target encoding is used, that is the encoding corresponding to the target language. The chars that cannot be represented in the target encoding are replaced with question marks. Using this algorithm to decode dictionary properties you may encounter that some of them are decoded incorrectly. For example, it is clear that the property is in cp1251 encoding while the algorithm says we must use cp1252, and we get garbage after decoding. That is OK, the algorithm is correct. You may install that dictionary in Babylon and check dictionary properties. It shows the same garbage. Unfortunately, we cannot detect correct encoding in this case automatically. We may add a parameter the will overwrite the selected encoding, so the user may fix the encoding if needed. """ def __init__(self, glos): # no more arguments self._glos = glos self._filename = "" self.info = odict() self.numEntries = None #### self.sourceLang = "" self.targetLang = "" ## self.defaultCharset = "" self.sourceCharset = "" self.targetCharset = "" ## self.sourceEncoding = None self.targetEncoding = None #### self.bgl_numEntries = None self.wordLenMax = 0 self.defiMaxBytes = 0 ## self.metadata2 = None self.rawDumpFile = None self.msgLogFile = None self.samplesDumpFile = None ## self.stripSlashAltKeyPattern = re.compile(r"(^|\s)/(\w)", re.U) self.specialCharPattern = re.compile(r"[^\s\w.]", re.U) ### self.file = None # offset of gzip header, set in self.open() self.gzipOffset = None # must be a in RRGGBB format self.iconDataList = [] def __len__(self): if self.numEntries is None: log.warning("len(reader) called while numEntries=None") return 0 return self.numEntries + self.numResources # open .bgl file, read signature, find and open gzipped content # self.file - ungzipped content def open( self, filename, ): self._filename = filename if not self.openGzip(): return False self.readInfo() self.setGlossaryInfo() return True def openGzip(self): with open(self._filename, "rb") as bglFile: if not bglFile: log.error(f"file pointer empty: {bglFile}") return False b_head = bglFile.read(6) if len(b_head) < 6 or not b_head[:4] in ( b"\x12\x34\x00\x01", b"\x12\x34\x00\x02", ): log.error(f"invalid header: {b_head[:6]!r}") return False self.gzipOffset = gzipOffset = uintFromBytes(b_head[4:6]) log.debug(f"Position of gz header: {gzipOffset}") if gzipOffset < 6: log.error(f"invalid gzip header position: {gzipOffset}") return False self.file = BGLGzipFile( fileobj=FileOffS(self._filename, gzipOffset), closeFileobj=True, ) return True def readInfo(self): """ read meta information about the dictionary: author, description, source and target languages, etc (articles are not read) """ self.numEntries = 0 self.numBlocks = 0 self.numResources = 0 block = Block() while not self.isEndOfDictData(): if not self.readBlock(block): break self.numBlocks += 1 if not block.data: continue if block.type == 0: self.readType0(block) elif block.type in (1, 7, 10, 11, 13): self.numEntries += 1 elif block.type == 2: self.numResources += 1 elif block.type == 3: self.readType3(block) else: # Unknown block.type log.debug( f"Unknown Block type {block.type!r}" f", data_length = {len(block.data)}" f", number = {self.numBlocks}" ) self.file.seek(0) self.detectEncoding() log.debug(f"numEntries = {self.numEntries}") if self.bgl_numEntries and self.bgl_numEntries != self.numEntries: # There are a number of cases when these numbers do not match. 
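# --- Editor's aside (hedged sketch, not part of the original source) ---
# The .bgl container layout that openGzip() above relies on: the file
# starts with magic bytes 12 34 00 01 (or 12 34 00 02), and bytes 4..6
# hold the big-endian offset of the embedded gzip stream (uintFromBytes
# reads big-endian). peekGzipOffset is a hypothetical helper name.
#
#	def peekGzipOffset(path):
#		with open(path, "rb") as f:
#			head = f.read(6)
#		if len(head) < 6 or head[:4] not in (b"\x12\x34\x00\x01", b"\x12\x34\x00\x02"):
#			raise ValueError(f"not a BGL file: {head!r}")
#		offset = int.from_bytes(head[4:6], "big")
#		if offset < 6:
#			raise ValueError(f"invalid gzip offset: {offset}")
#		return offset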
# The dictionary is OK, and these is no doubt that we might missed # an entry. # self.bgl_numEntries may be less than the number of entries # we've read. log.warning( f"bgl_numEntries={self.bgl_numEntries}" f", numEntries={self.numEntries}" ) self.numBlocks = 0 encoding = self.targetEncoding # FIXME: confirm this is correct for key, value in self.info.items(): if isinstance(value, bytes): try: value = value.decode(encoding) except Exception: log.warning(f"failed to decode info value: {key} = {value}") else: self.info[key] = value def setGlossaryInfo(self): glos = self._glos ### if self.sourceLang: glos.sourceLangName = self.sourceLang.name if self.sourceLang.name2: glos.setInfo("sourceLang2", self.sourceLang.name2) if self.targetLang: glos.targetLangName = self.targetLang.name if self.targetLang.name2: glos.setInfo("targetLang2", self.targetLang.name2) ### for attr in ( "defaultCharset", "sourceCharset", "targetCharset", "defaultEncoding", "sourceEncoding", "targetEncoding", ): value = getattr(self, attr, None) if value: glos.setInfo("bgl_" + attr, value) ### glos.setInfo("sourceCharset", "UTF-8") glos.setInfo("targetCharset", "UTF-8") ### if "lastUpdated" not in self.info: if "bgl_firstUpdated" in self.info: log.debug(f"replacing bgl_firstUpdated with lastUpdated") self.info["lastUpdated"] = self.info.pop("bgl_firstUpdated") ### for key, value in self.info.items(): if value == "": continue # TODO: a bool flag to add empty value infos? # leave "creationTime" and "lastUpdated" as is if key in { "utf8Encoding", }: key = "bgl_" + key try: glos.setInfo(key, str(value)) except Exception: log.exception(f"key = {key}") def isEndOfDictData(self): """ Test for end of dictionary data. A bgl file stores dictionary data as a gzip compressed block. In other words, a bgl file stores a gzip data file inside. A gzip file consists of a series of "members". gzip data block in bgl consists of one member (I guess). Testing for block type returned by self.readBlock is not a reliable way to detect the end of gzip member. For example, consider "Airport Code Dictionary.BGL" dictionary. To reliably test for end of gzip member block we must use a number of undocumented variables of gzip.GzipFile class. 
self.file._new_member - true if the current member has been completely read from the input file self.file.extrasize - size of buffered data self.file.offset - offset in the input file after reading one gzip member current position in the input file is set to the first byte after gzip data We may get this offset: self.file_bgl.tell() The last 4 bytes of gzip block contains the size of the original (uncompressed) input data modulo 2^32 """ return False def close(self): if self.file: self.file.close() self.file = None def __del__(self): self.close() while unknownHtmlEntries: entity = unknownHtmlEntries.pop() log.debug(f"BGL: unknown html entity: {entity}") # returns False if error def readBlock(self, block): block.offset = self.file.tell() length = self.readBytes(1) if length == -1: log.debug("readBlock: length = -1") return False block.type = length & 0xf length >>= 4 if length < 4: length = self.readBytes(length + 1) if length == -1: log.error("readBlock: length = -1") return False else: length -= 4 self.file.flush() if length > 0: try: block.data = self.file.read(length) except Exception: # struct.error: unpack requires a string argument of length 4 # FIXME log.exception( f"failed to read block data" f": numBlocks={self.numBlocks}" f", length={length}" f", filePos={self.file.tell()}" ) block.data = b"" return False else: block.data = b"" return True def readBytes(self, num): """ return -1 if error """ if num < 1 or num > 4: log.error(f"invalid argument num={num}") return -1 self.file.flush() buf = self.file.read(num) if len(buf) == 0: log.debug("readBytes: end of file: len(buf)==0") return -1 if len(buf) != num: log.error( f"readBytes: expected to read {num} bytes" f", but found {len(buf)} bytes" ) return -1 return uintFromBytes(buf) def readType0(self, block): code = block.data[0] if code == 2: # this number is vary close to self.bgl_numEntries, # but does not always equal to the number of entries # see self.readType3, code == 12 as well num = uintFromBytes(block.data[1:]) elif code == 8: self.defaultCharset = charsetInfoDecode(block.data[1:]) if not self.defaultCharset: log.warning("defaultCharset is not valid") else: self.logUnknownBlock(block) return False return True def readType2(self, block): """ Process type 2 block Type 2 block is an embedded file (mostly Image or HTML). pass_num - pass number, may be 1 or 2 On the first pass self.sourceEncoding is not defined and we cannot decode file names. That is why the second pass is needed. The second pass is costly, it apparently increases total processing time. We should avoid the second pass if possible. Most of the dictionaries do not have valuable resources, and those that do, use file names consisting only of ASCII characters. We may process these resources on the second pass. If all files have been processed on the first pass, the second pass is not needed. All dictionaries I've processed so far use only ASCII chars in file names. Babylon glossary builder replaces names of files, like links to images, with what looks like a hash code of the file name, for example "8FFC5C68.png". 
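	Editor's aside (hedged illustration, not part of the original
	docstring): the low-level block framing implemented by readBlock()
	above can be summarized as a stand-alone snippet; parseBlockHead is
	a hypothetical name, operating on an in-memory buffer instead of
	the file object.

		def parseBlockHead(buf):
			first = buf[0]
			blockType = first & 0x0F  # low nibble: block type
			high = first >> 4
			if high < 4:
				# small high nibble: it counts the bytes (minus one)
				# of a following big-endian length field
				n = high + 1
				return blockType, int.from_bytes(buf[1:1 + n], "big"), 1 + n
			# otherwise the high nibble encodes the data length plus 4
			return blockType, high - 4, 1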
returns: DataEntry instance if the resource was successfully processed and None if failed """ # Embedded File (mostly Image or HTML) name = "" # Embedded file name pos = 0 # name: Len = block.data[pos] pos += 1 if pos + Len > len(block.data): log.warning("reading block type 2: name too long") return b_name = block.data[pos:pos + Len] pos += Len b_data = block.data[pos:] # if b_name in (b"C2EEF3F6.html", b"8EAF66FD.bmp"): # log.debug(f"Skipping non-useful file {b_name!r}") # return u_name = b_name.decode(self.sourceEncoding) return self._glos.newDataEntry( u_name, b_data, ) def readType3(self, block): """ reads block with type 3, and updates self.info returns None """ code, b_value = uintFromBytes(block.data[:2]), block.data[2:] if not b_value: return # if not b_value.strip(b"\x00"): return # FIXME try: item = infoType3ByCode[code] except KeyError: if b_value.strip(b"\x00"): log.debug( f"Unknown info type code={code:#02x}, b_value={b_value!r}", ) return key = item.name decode = item.decode if key.endswith(".ico"): self.iconDataList.append((key, b_value)) return value = None if decode is None: value = b_value else: value = decode(b_value) # `value` can be None, str, bytes or dict if not value: return if isinstance(value, dict): self.info.update(value) return if item.attr: setattr(self, key, value) return self.info[key] = value def detectEncoding(self): """ assign self.sourceEncoding and self.targetEncoding """ utf8Encoding = self.info.get("utf8Encoding", False) if self._default_encoding_overwrite: self.defaultEncoding = self._default_encoding_overwrite elif self.defaultCharset: self.defaultEncoding = self.defaultCharset else: self.defaultEncoding = "cp1252" if self._source_encoding_overwrite: self.sourceEncoding = self._source_encoding_overwrite elif utf8Encoding: self.sourceEncoding = "utf-8" elif self.sourceCharset: self.sourceEncoding = self.sourceCharset elif self.sourceLang: self.sourceEncoding = self.sourceLang.encoding else: self.sourceEncoding = self.defaultEncoding if self._target_encoding_overwrite: self.targetEncoding = self._target_encoding_overwrite elif utf8Encoding: self.targetEncoding = "utf-8" elif self.targetCharset: self.targetEncoding = self.targetCharset elif self.targetLang: self.targetEncoding = self.targetLang.encoding else: self.targetEncoding = self.defaultEncoding def logUnknownBlock(self, block): log.debug( f"Unknown block: type={block.type}" f", number={self.numBlocks}" f", data={block.data!r}" ) def __iter__(self): if not self.file: raise RuntimeError("iterating over a reader while it's not open") for fname, iconData in self.iconDataList: yield self._glos.newDataEntry(fname, iconData) block = Block() while not self.isEndOfDictData(): if not self.readBlock(block): break if not block.data: continue if block.type == 2: yield self.readType2(block) elif block.type == 11: succeed, u_word, u_alts, u_defi = self.readEntry_Type11(block) if not succeed: continue yield self._glos.newEntry( [u_word] + u_alts, u_defi, ) elif block.type in (1, 7, 10, 11, 13): pos = 0 # word: succeed, pos, u_word, b_word = self.readEntryWord(block, pos) if not succeed: continue # defi: succeed, pos, u_defi, b_defi = self.readEntryDefi( block, pos, b_word, ) if not succeed: continue # now pos points to the first char after definition succeed, pos, u_alts = self.readEntryAlts( block, pos, b_word, u_word, ) if not succeed: continue yield self._glos.newEntry( [u_word] + u_alts, u_defi, ) def readEntryWord(self, block, pos): """ Read word part of entry. Return value is a list. 
(False, None, None, None) if error (True, pos, u_word, b_word) if OK u_word is a str instance (utf-8) b_word is a bytes instance """ Err = (False, None, None, None) if pos + 1 > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", reading word size: pos + 1 > len(block.data)" ) return Err Len = block.data[pos] pos += 1 if pos + Len > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", block.type={block.type}" f", reading word: pos + Len > len(block.data)" ) return Err b_word = block.data[pos:pos + Len] u_word = self.processKey(b_word) """ Entry keys may contain html text, for example: ante< meridiem arm und reich c=t>2003; und etc. Babylon does not process keys as html, it display them as is. Html in keys is the problem of that particular dictionary. We should not process keys as html, since Babylon do not process them as such. """ pos += Len self.wordLenMax = max(self.wordLenMax, len(u_word)) return True, pos, u_word, b_word def readEntryDefi(self, block, pos, b_word): """ Read defi part of entry. Return value is a list. (False, None, None, None) if error (True, pos, u_defi, b_defi) if OK u_defi is a str instance (utf-8) b_defi is a bytes instance """ Err = (False, None, None, None) if pos + 2 > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", reading defi size: pos + 2 > len(block.data)" ) return Err Len = uintFromBytes(block.data[pos:pos + 2]) pos += 2 if pos + Len > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", block.type={block.type}" f", reading defi: pos + Len > len(block.data)" ) return Err b_defi = block.data[pos:pos + Len] u_defi = self.processDefi(b_defi, b_word) self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi)) pos += Len return True, pos, u_defi, b_defi def readEntryAlts(self, block, pos, b_word, u_word): """ returns: (False, None, None) if error (True, pos, u_alts) if succeed u_alts is a sorted list, items are str (utf-8) """ Err = (False, None, None) # use set instead of list to prevent duplicates u_alts = set() while pos < len(block.data): if pos + 1 > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", reading alt size: pos + 1 > len(block.data)" ) return Err Len = block.data[pos] pos += 1 if pos + Len > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", block.type={block.type}" f", reading alt: pos + Len > len(block.data)" ) return Err b_alt = block.data[pos:pos + Len] u_alt = self.processAlternativeKey(b_alt, b_word) # Like entry key, alt is not processed as html by babylon, # so do we. 
u_alts.add(u_alt) pos += Len if u_word in u_alts: u_alts.remove(u_word) return True, pos, list(sorted(u_alts)) def readEntry_Type11(self, block): """return (succeed, u_word, u_alts, u_defi)""" Err = (False, None, None, None) pos = 0 # reading headword if pos + 5 > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", reading word size: pos + 5 > len(block.data)" ) return Err wordLen = uintFromBytes(block.data[pos:pos + 5]) pos += 5 if pos + wordLen > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", block.type={block.type}" f", reading word: pos + wordLen > len(block.data)" ) return Err b_word = block.data[pos:pos + wordLen] u_word = self.processKey(b_word) pos += wordLen self.wordLenMax = max(self.wordLenMax, len(u_word)) # reading alts and defi if pos + 4 > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", reading defi size: pos + 4 > len(block.data)" ) return Err altsCount = uintFromBytes(block.data[pos:pos + 4]) pos += 4 # reading alts # use set instead of list to prevent duplicates u_alts = set() for altIndex in range(altsCount): if pos + 4 > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", reading alt size: pos + 4 > len(block.data)" ) return Err altLen = uintFromBytes(block.data[pos:pos + 4]) pos += 4 if altLen == 0: if pos + altLen != len(block.data): # no evidence log.warning( f"reading block offset={block.offset:#02x}" f", reading alt size: pos + altLen != len(block.data)" ) break if pos + altLen > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", block.type={block.type}" f", reading alt: pos + altLen > len(block.data)" ) return Err b_alt = block.data[pos:pos + altLen] u_alt = self.processAlternativeKey(b_alt, b_word) # Like entry key, alt is not processed as html by babylon, # so do we. u_alts.add(u_alt) pos += altLen if u_word in u_alts: u_alts.remove(u_word) u_alts = list(sorted(u_alts)) # reading defi defiLen = uintFromBytes(block.data[pos:pos + 4]) pos += 4 if pos + defiLen > len(block.data): log.error( f"reading block offset={block.offset:#02x}" f", block.type={block.type}" f", reading defi: pos + defiLen > len(block.data)" ) return Err b_defi = block.data[pos:pos + defiLen] u_defi = self.processDefi(b_defi, b_word) self.defiMaxBytes = max(self.defiMaxBytes, len(b_defi)) pos += defiLen return True, u_word, u_alts, u_defi def charReferencesStat(self, b_text, encoding): pass def decodeCharsetTags(self, b_text, defaultEncoding): """ b_text is a bytes Decode html text taking into account charset tags and default encoding Return value: (u_text, defaultEncodingOnly) u_text is str defaultEncodingOnly parameter is false if the text contains parts encoded with non-default encoding (babylon character references '00E6;' do not count). 
""" b_parts = re_charset_decode.split(b_text) u_text = "" encodings = [] # stack of encodings defaultEncodingOnly = True for i, b_part in enumerate(b_parts): if i % 3 == 0: # text block encoding = encodings[-1] if encodings else defaultEncoding b_text2 = b_part if encoding == "babylon-reference": b_refs = b_text2.split(b";") for i_ref, b_ref in enumerate(b_refs): if not b_ref: if i_ref != len(b_refs) - 1: log.debug( f"decoding charset tags, b_text={b_text!r}" f"\nblank character" f" reference ({b_text2!r})\n" ) continue if not re_b_reference.match(b_ref): log.debug( f"decoding charset tags, b_text={b_text!r}" f"\ninvalid character" f" reference ({b_text2!r})\n" ) continue u_text += chr(int(b_ref, 16)) else: self.charReferencesStat(b_text2, encoding) if encoding == "cp1252": b_text2 = replaceAsciiCharRefs(b_text2, encoding) if self._strict_string_convertion: try: u_text2 = b_text2.decode(encoding) except UnicodeError: log.debug( f"decoding charset tags, b_text={b_text!r}" f"\nfragment: {b_text2!r}" f"\nconversion error:\n" + excMessage() ) u_text2 = text2.decode(encoding, "replace") else: u_text2 = b_text2.decode(encoding, "replace") u_text += u_text2 if encoding != defaultEncoding: defaultEncodingOnly = False elif i % 3 == 1: # or if b_part.startswith(b" if encodings: encodings.pop() else: log.debug( f"decoding charset tags, b_text={b_text!r}" f"\nunbalanced tag\n" ) else: # b_type = b_parts[i + 1].lower() # b_type is a bytes instance, with length 1 if b_type == b"t": encodings.append("babylon-reference") elif b_type == b"u": encodings.append("utf-8") elif b_type == b"k": encodings.append(self.sourceEncoding) elif b_type == b"e": encodings.append(self.sourceEncoding) elif b_type == b"g": # gbk or gb18030 encoding # (not enough data to make distinction) encodings.append("gbk") else: log.debug( f"decoding charset tags, text = {b_text!r}" f"\nunknown charset code = {ord(b_type):#02x}\n" ) # add any encoding to prevent # "unbalanced tag" error encodings.append(defaultEncoding) else: # c attribute of charset tag if the previous tag was charset pass if encodings: log.debug( f"decoding charset tags, text={b_text}" f"\nunclosed tag\n" ) return u_text, defaultEncodingOnly def processKey(self, b_word): """ b_word is a bytes instance returns u_word_main, as str instance (utf-8 encoding) """ b_word_main, strip_count = stripDollarIndexes(b_word) if strip_count > 1: log.debug( f"processKey({b_word}):\n" f"number of dollar indexes = {strip_count}", ) # convert to unicode if self._strict_string_convertion: try: u_word_main = b_word_main.decode(self.sourceEncoding) except UnicodeError: log.debug( f"processKey({b_word}):\nconversion error:\n" + excMessage() ) u_word_main = b_word_main.decode( self.sourceEncoding, "ignore", ) else: u_word_main = b_word_main.decode(self.sourceEncoding, "ignore") if self._process_html_in_key: # u_word_main_orig = u_word_main u_word_main = stripHtmlTags(u_word_main) u_word_main = replaceHtmlEntriesInKeys(u_word_main) # if(re.match(".*[&<>].*", u_word_main_orig)): # log.debug("original text: " + u_word_main_orig + "\n" \ # + "new text: " + u_word_main + "\n") u_word_main = removeControlChars(u_word_main) u_word_main = removeNewlines(u_word_main) u_word_main = u_word_main.lstrip() if self._key_rstrip_chars: u_word_main = u_word_main.rstrip(self._key_rstrip_chars) return u_word_main def processAlternativeKey(self, b_word, b_key): """ b_word is a bytes instance returns u_word_main, as str instance (utf-8 encoding) """ b_word_main, strip_count = stripDollarIndexes(b_word) # convert 
to unicode if self._strict_string_convertion: try: u_word_main = b_word_main.decode(self.sourceEncoding) except UnicodeError: log.debug( f"processAlternativeKey({b_word})\nkey = {b_key}" f":\nconversion error:\n" + excMessage() ) u_word_main = b_word_main.decode(self.sourceEncoding, "ignore") else: u_word_main = b_word_main.decode(self.sourceEncoding, "ignore") # strip "/" before words u_word_main = self.stripSlashAltKeyPattern.sub( r"\1\2", u_word_main, ) if self._process_html_in_key: # u_word_main_orig = u_word_main u_word_main = stripHtmlTags(u_word_main) u_word_main = replaceHtmlEntriesInKeys(u_word_main) # if(re.match(".*[&<>].*", u_word_main_orig)): # log.debug("original text: " + u_word_main_orig + "\n" \ # + "new text: " + u_word_main + "\n") u_word_main = removeControlChars(u_word_main) u_word_main = removeNewlines(u_word_main) u_word_main = u_word_main.lstrip() u_word_main = u_word_main.rstrip(self._key_rstrip_chars) return u_word_main def processDefi(self, b_defi, b_key): """ b_defi: bytes b_key: bytes return: u_defi_format """ fields = DefinitionFields() self.collectDefiFields(b_defi, b_key, fields) fields.u_defi, fields.singleEncoding = self.decodeCharsetTags( fields.b_defi, self.targetEncoding, ) if fields.singleEncoding: fields.encoding = self.targetEncoding fields.u_defi = fixImgLinks(fields.u_defi) fields.u_defi = replaceHtmlEntries(fields.u_defi) fields.u_defi = removeControlChars(fields.u_defi) fields.u_defi = normalizeNewlines(fields.u_defi) fields.u_defi = fields.u_defi.strip() if fields.b_title: fields.u_title, singleEncoding = self.decodeCharsetTags( fields.b_title, self.sourceEncoding, ) fields.u_title = replaceHtmlEntries(fields.u_title) fields.u_title = removeControlChars(fields.u_title) if fields.b_title_trans: # sourceEncoding or targetEncoding ? fields.u_title_trans, singleEncoding = self.decodeCharsetTags( fields.b_title_trans, self.sourceEncoding, ) fields.u_title_trans = replaceHtmlEntries(fields.u_title_trans) fields.u_title_trans = removeControlChars(fields.u_title_trans) if fields.b_transcription_50: if fields.code_transcription_50 == 0x10: # contains values like this (char codes): # 00 18 00 19 00 1A 00 1B 00 1C 00 1D 00 1E 00 40 00 07 # this is not utf-16 # what is this? pass elif fields.code_transcription_50 == 0x1b: fields.u_transcription_50, singleEncoding = \ self.decodeCharsetTags( fields.b_transcription_50, self.sourceEncoding, ) fields.u_transcription_50 = \ replaceHtmlEntries(fields.u_transcription_50) fields.u_transcription_50 = \ removeControlChars(fields.u_transcription_50) elif fields.code_transcription_50 == 0x18: # incomplete text like: # t c=T>02D0;g0259;- # This defi normally contains fields.b_transcription_60 # in this case. 
pass else: log.debug( f"processDefi({b_defi})\nb_key = {b_key}" f":\ndefi field 50" f", unknown code: {fields.code_transcription_50:#02x}" ) if fields.b_transcription_60: if fields.code_transcription_60 == 0x1b: fields.u_transcription_60, singleEncoding = \ self.decodeCharsetTags( fields.b_transcription_60, self.sourceEncoding, ) fields.u_transcription_60 = \ replaceHtmlEntries(fields.u_transcription_60) fields.u_transcription_60 = \ removeControlChars(fields.u_transcription_60) else: log.debug( f"processDefi({b_defi})\nb_key = {b_key}" f":\ndefi field 60" f", unknown code: {fields.code_transcription_60:#02x}" ) if fields.b_field_1a: fields.u_field_1a, singleEncoding = self.decodeCharsetTags( fields.b_field_1a, self.sourceEncoding, ) log.info(f"------- u_field_1a = {fields.u_field_1a}") self.processDefiStat(fields, b_defi, b_key) u_defi_format = "" if fields.partOfSpeech or fields.u_title: if fields.partOfSpeech: pos = xml_escape(fields.partOfSpeech) posColor = self._part_of_speech_color u_defi_format += f'<font color="#{posColor}">{pos}</font>' if fields.u_title: if u_defi_format: u_defi_format += " " u_defi_format += fields.u_title u_defi_format += "<br>
\n" if fields.u_title_trans: u_defi_format += fields.u_title_trans + "<br>
\n" if fields.u_transcription_50: u_defi_format += f"[{fields.u_transcription_50}]<br>
\n" if fields.u_transcription_60: u_defi_format += f"[{fields.u_transcription_60}]<br>
    \n" if fields.u_defi: u_defi_format += fields.u_defi return u_defi_format def processDefiStat(self, fields, b_defi, b_key): pass def findDefiFieldsStart(self, b_defi): """ b_defi is a bytes instance Finds the beginning of the definition trailing fields. Return value is the index of the first chars of the field set, or -1 if the field set is not found. Normally "\x14" should signal the beginning of the definition fields, but some articles may contain this characters inside, so we get false match. As a workaround we may check the following chars. If "\x14" is followed by space, we assume this is part of the article and continue search. Unfortunately this does no help in many cases... """ if self._no_control_sequence_in_defi: return -1 index = -1 while True: index = b_defi.find( 0x14, index + 1, # starting from next character -1, # not the last character ) if index == -1: break if b_defi[index + 1] != 0x20: # b" "[0] == 0x20 break return index def collectDefiFields(self, b_defi, b_key, fields): """ entry definition structure:
    ['\x14'[{field_code}{field_data}]*] {field_code} is one character {field_data} has arbitrary length """ # d0 is index of the '\x14 char in b_defi # d0 may be the last char of the string d0 = self.findDefiFieldsStart(b_defi) if d0 == -1: fields.b_defi = b_defi return fields.b_defi = b_defi[:d0] i = d0 + 1 while i < len(b_defi): if self.metadata2: self.metadata2.defiTrailingFields[b_defi[i]] += 1 if b_defi[i] == 0x02: # part of speech # "\x02" if fields.partOfSpeech: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}" f":\nduplicate part of speech item", ) if i + 1 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nb_defi ends after \\x02" ) return posCode = b_defi[i + 1] try: fields.partOfSpeech = partOfSpeechByCode[posCode] except KeyError: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}" f":\nunknown part of speech code = {posCode:#02x}" ) return i += 2 elif b_defi[i] == 0x06: # \x06 if fields.b_field_06: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nduplicate type 6" ) if i + 1 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nb_defi ends after \\x06" ) return fields.b_field_06 = b_defi[i + 1] i += 2 elif b_defi[i] == 0x07: # \x07 # Found in 4 Hebrew dictionaries. I do not understand. if i + 3 > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x07" ) return fields.b_field_07 = b_defi[i + 1:i + 3] i += 3 elif b_defi[i] == 0x13: # "\x13" # known values: # 03 06 0D C7 # 04 00 00 00 44 # ... # 04 00 00 00 5F if i + 1 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x13" ) return Len = b_defi[i + 1] i += 2 if Len == 0: log.debug( f"collecting definition fields, b_defi = {b_defi!r}\n" f"b_key = {b_key!r}:\nblank data after \\x13" ) continue if i + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}\n" + f"b_key = {b_key!r}:\ntoo few data after \\x13" ) return fields.b_field_13 = b_defi[i:i + Len] i += Len elif b_defi[i] == 0x18: # \x18 if fields.b_title: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"b_key = {b_key!r}:\nduplicate entry title item" ) if i + 1 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}\n" f"b_key = {b_key!r}:\nb_defi ends after \\x18" ) return i += 1 Len = b_defi[i] i += 1 if Len == 0: # log.debug( # f"collecting definition fields, b_defi = {b_defi!r}\n" # f"b_key = {b_key!r}:\nblank entry title" # ) continue if i + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}\n" f"b_key = {b_key!r}:\ntitle is too long" ) return fields.b_title = b_defi[i:i + Len] i += Len elif b_defi[i] == 0x1a: # "\x1a" # found only in Hebrew dictionaries, I do not understand. 
if i + 1 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key}:\ntoo few data after \\x1a" ) return Len = b_defi[i + 1] i += 2 if Len == 0: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nblank data after \\x1a" ) continue if i + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x1a" ) return fields.b_field_1a = b_defi[i:i + Len] i += Len elif b_defi[i] == 0x28: # "\x28" # title with transcription? if i + 2 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x28" ) return i += 1 Len = uintFromBytes(b_defi[i:i + 2]) i += 2 if Len == 0: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nblank data after \\x28" ) continue if i + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x28" ) return fields.b_title_trans = b_defi[i:i + Len] i += Len elif 0x40 <= b_defi[i] <= 0x4f: # [\x41-\x4f] # often contains digits as text: # 56 # ælps - key Alps # 48@i # has no apparent influence on the article code = b_defi[i] Len = b_defi[i] - 0x3f if i + 2 + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x40+" ) return i += 2 b_text = b_defi[i:i + Len] i += Len log.debug( f"unknown definition field {code:#02x}, b_text={b_text!r}" ) elif b_defi[i] == 0x50: # \x50 if i + 2 >= len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x50" ) return fields.code_transcription_50 = b_defi[i + 1] Len = b_defi[i + 2] i += 3 if Len == 0: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nblank data after \\x50" ) continue if i + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x50" ) return fields.b_transcription_50 = b_defi[i:i + Len] i += Len elif b_defi[i] == 0x60: # "\x60" if i + 4 > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\ntoo few data after \\x60" ) return fields.code_transcription_60 = b_defi[i + 1] i += 2 Len = uintFromBytes(b_defi[i:i + 2]) i += 2 if Len == 0: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}:\nblank data after \\x60" ) continue if i + Len > len(b_defi): log.debug( f"collecting definition fields, b_defi = {b_defi!r}" + f"\nb_key = {b_key!r}:\ntoo few data after \\x60" ) return fields.b_transcription_60 = b_defi[i:i + Len] i += Len else: log.debug( f"collecting definition fields, b_defi = {b_defi!r}" f"\nb_key = {b_key!r}" f":\nunknown control char. Char code = {b_defi[i]:#02x}" ) return pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_reader_debug.py000066400000000000000000000335121417733132500257230ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2021 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. 
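# --- Editor's aside (hedged sketch, not part of the original sources) ---
# The trailing-field layout parsed by collectDefiFields in bgl_reader.py
# above: <main definition> ['\x14' [{field_code}{field_data}]*].
# A minimal first step, splitting the main definition from the control
# fields; splitMainDefi is a hypothetical helper name, and it ignores the
# findDefiFieldsStart workaround for a literal 0x14 followed by a space.
#
#	def splitMainDefi(b_defi):
#		sep = b_defi.find(b"\x14")
#		if sep == -1:
#			return b_defi, b""
#		# b_defi[sep + 1:] holds one-byte field codes (0x02 part of
#		# speech, 0x18 title, 0x50/0x60 transcriptions, ...), each
#		# followed by its data
#		return b_defi[:sep], b_defi[sep + 1:]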
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see <http://www.gnu.org/licenses/gpl.txt>. import os import re import gzip from os.path import join from pyglossary.plugins.formats_common import log from pyglossary.text_utils import ( toStr, uintFromBytes, isASCII, ) from .bgl_reader import ( BglReader, Block, BGLGzipFile, FileOffS, tmpDir, ) class MetaData(object): def __init__(self): self.blocks = [] self.numEntries = None self.numBlocks = None self.numFiles = None self.gzipStartOffset = None self.gzipEndOffset = None self.fileSize = None self.bglHeader = None # data before gzip header class MetaDataBlock(object): def __init__(self, data, _type): self.data = data self.type = _type class MetaDataRange(object): def __init__(self, _type, count): self.type = _type self.count = count class MetaData2(object): """ Second pass metadata. We need to scan all definitions in order to collect these statistical data. """ def __init__(self): # defiTrailingFields[i] - number of fields with code i found self.defiTrailingFields = [0] * 256 self.isDefiASCII = True # isDefiASCII = true if all definitions contain only ASCII chars """ We apply a number of tests to each definition, excluding those with overwritten encoding (they start with a <charset> tag). defiProcessedCount - total number of definitions processed defiUtf8Count - number of definitions in utf8 encoding defiAsciiCount - number of definitions containing only ASCII chars """ self.defiProcessedCount = 0 self.defiUtf8Count = 0 self.defiAsciiCount = 0 self.charRefs = dict() # encoding -> [ 0 ] * 257 class GzipWithCheck(object): """ gzip.GzipFile with check. It checks that unpacked data match what was packed. """ def __init__(self, fileobj, unpackedPath, reader, closeFileobj=False): """ constructor fileobj - gzip file - archive unpackedPath - path of a file containing original data, for testing. reader - reference to BglReader class instance, used for logging.
""" self.file = BGLGzipFile( fileobj=fileobj, closeFileobj=closeFileobj, ) self.unpackedFile = open(unpackedPath, "rb") self.reader = reader def __del__(self): self.close() def close(self): if self.file: self.file.close() self.file = None if self.unpackedFile: self.unpackedFile.close() self.unpackedFile = None def read(self, size=-1): buf1 = self.file.read(size) buf2 = self.unpackedFile.read(size) if buf1 != buf2: self.reader.msgLogFileWrite( f"GzipWithCheck.read: !=: size = {buf1}, ({buf2}) ({size})", ) # else: # self.reader.msgLogFileWrite( # f"GzipWithCheck.read: ==: size = {buf1}, ({buf2}) ({size})", # ) return buf1 def seek(self, offset, whence=os.SEEK_SET): self.file.seek(offset, whence) self.unpackedFile.seek(offset, whence) # self.reader.msgLogFileWrite( # f"GzipWithCheck.seek: offset = {offset}, whence = {whence}", # ) def tell(self): pos1 = self.file.tell() pos2 = self.unpackedFile.tell() if pos1 != pos2: self.reader.msgLogFileWrite( f"GzipWithCheck.tell: !=: {pos1} {pos2}", ) # else: # self.reader.msgLogFileWrite( # f"GzipWithCheck.tell: ==: {pos1} {pos2}", # ) return pos1 def flush(self): if os.sep == "\\": pass # a bug in Windows # after file.flush, file.read returns garbage else: self.file.flush() self.unpackedFile.flush() class DebugBglReader(BglReader): _collect_metadata2 = False _search_char_samples = False _write_gz = False _raw_dump_path = None _unpacked_gzip_path = None _char_samples_path = None _msg_log_path = None def open( self, filename, ): if not BglReader.open(self, filename): return self.metadata2 = MetaData2() if self._collect_metadata2 else None if self._search_char_samples: self.targetCharsArray = ([False] * 256) else: self.targetCharsArray = None if self._raw_dump_path: self.rawDumpFile = open(self._raw_dump_path, "w") if self._char_samples_path: self.samplesDumpFile = open(self._char_samples_path, "w") if self._msg_log_path: self.msgLogFile = open(self._msg_log_path, "w") self.charRefStatPattern = re.compile(b"(&#\\w+;)", re.I) def openGzip(self): with open(self._filename, "rb") as bglFile: if not bglFile: log.error(f"file pointer empty: {bglFile}") return False buf = bglFile.read(6) if len(buf) < 6 or not buf[:4] in ( b"\x12\x34\x00\x01", b"\x12\x34\x00\x02", ): log.error(f"invalid header: {buf[:6]!r}") return False self.gzipOffset = gzipOffset = uintFromBytes(buf[4:6]) log.debug(f"Position of gz header: {gzipOffset}") if gzipOffset < 6: log.error(f"invalid gzip header position: {gzipOffset}") return False if self._write_gz: self.dataFile = self._filename + "-data.gz" try: f2 = open(self.dataFile, "wb") except IOError: log.exception("error while opening gzip data file") self.dataFile = join( tmpDir, os.path.split(self.m_filename)[-1] + "-data.gz" ) f2 = open(self.dataFile, "wb") bglFile.seek(i) f2.write(bglFile.read()) f2.close() self.file = gzip.open(self.dataFile, "rb") else: f2 = FileOffS(self._filename, gzipOffset) if self._unpacked_gzip_path: self.file = GzipWithCheck( f2, self._unpacked_gzip_path, self, closeFileobj=True, ) else: self.file = BGLGzipFile( fileobj=f2, closeFileobj=True, ) def close(self): BglReader.close(self) if self.rawDumpFile: self.rawDumpFile.close() self.rawDumpFile = None if self.msgLogFile: self.msgLogFile.close() self.msgLogFile = None if self.samplesDumpFile: self.samplesDumpFile.close() self.samplesDumpFile = None def __del__(self): BglReader.__del__(self) def readEntryWord(self, block, pos): succeed, pos, u_word, b_word = \ BglReader.readEntryWord(self, block, pos) if not succeed: return self.rawDumpFileWriteText(f"\n\nblock 
type = {block.type}\nkey = ") self.rawDumpFileWriteData(b_word) def readEntryDefi(self, block, pos, b_key): succeed, pos, u_defi, b_defi = \ BglReader.readEntryDefi(self, block, pos, b_key) if not succeed: return self.rawDumpFileWriteText("\ndefi = ") self.rawDumpFileWriteData(b_defi) """ def readEntryAlts(self, block, pos, b_key, key): succeed, pos, alts, b_alts = \ BglReader.readEntryAlts(self, block, pos, b_key, key) if not succeed: return for b_alt in b_alts: self.rawDumpFileWriteText("\nalt = ") self.rawDumpFileWriteData(b_alt) """ def charReferencesStat(self, b_text, encoding): """ b_text is bytes instance """ # “ # ċ if not self.metadata2: return if encoding not in self.metadata2.charRefs: self.metadata2.charRefs[encoding] = [0] * 257 charRefs = self.metadata2.charRefs[encoding] for index, b_part in enumerate(self.charRefStatPattern.split(b_text)): if index % 2 != 1: continue try: if b_part[:3].lower() == "&#x": code = int(b_part[3:-1], 16) else: code = int(b_part[2:-1]) except (ValueError, OverflowError): continue if code <= 0: continue code = min(code, 256) charRefs[code] += 1 def processDefiStat(self, fields, b_defi, b_key): if fields.singleEncoding: self.findAndPrintCharSamples( fields.b_defi, f"defi, key = {b_key}", fields.encoding, ) if self.metadata2: self.metadata2.defiProcessedCount += 1 if isASCII(toStr(fields.b_defi)): self.metadata2.defiAsciiCount += 1 try: fields.b_defi.decode("utf-8") except UnicodeError: pass else: self.metadata2.defiUtf8Count += 1 if self.metadata2 and self.metadata2.isDefiASCII: if not isASCII(fields.u_defi): self.metadata2.isDefiASCII = False # write text to dump file as is def rawDumpFileWriteText(self, text): # FIXME text = toStr(text) if self.rawDumpFile: self.rawDumpFile.write(text) # write data to dump file unambiguously representing control chars # escape "\" with "\\" # print control chars as "\xhh" def rawDumpFileWriteData(self, text): text = toStr(text) # the next function escapes too many chars, for example, it escapes äöü # self.rawDumpFile.write(text.encode("unicode_escape")) if self.rawDumpFile: self.rawDumpFile.write(text) def msgLogFileWrite(self, text): text = toStr(text) if self.msgLogFile: offset = self.msgLogFile.tell() # print offset in the log file to facilitate navigating this # log in hex editor # intended usage: # the log file is opened in a text editor and hex editor # use text editor to read error messages, use hex editor to # inspect char codes offsets allows to quickly jump to the right # place of the file hex editor self.msgLogFile.write(f"\noffset = {offset:#02x}\n") self.msgLogFile.write(text + "\n") else: log.debug(text) def samplesDumpFileWrite(self, text): text = toStr(text) if self.samplesDumpFile: offset = self.samplesDumpFile.tell() self.samplesDumpFile.write(f"\noffset = {offset:#02x}\n") self.samplesDumpFile.write(text + "\n") else: log.debug(text) def dumpBlocks(self, dumpPath): import pickle self.file.seek(0) metaData = MetaData() metaData.numFiles = 0 metaData.gzipStartOffset = self.gzipOffset self.numEntries = 0 self.numBlocks = 0 range_type = None range_count = 0 block = Block() while not self.isEndOfDictData(): log.debug( f"readBlock: offset {self.file.tell():#02x}, " f"unpacked offset {self.file.unpackedFile.tell():#02x}" ) if not self.readBlock(block): break self.numBlocks += 1 if block.type in (1, 7, 10, 11, 13): self.numEntries += 1 elif block.type == 2: # Embedded File (mostly Image or HTML) metaData.numFiles += 1 if block.type in (1, 2, 7, 10, 11, 13): if range_type == block.type: range_count += 1 
else: if range_count > 0: mblock = MetaDataRange(range_type, range_count) metaData.blocks.append(mblock) range_count = 0 range_type = block.type range_count = 1 else: if range_count > 0: mblock = MetaDataRange(range_type, range_count) metaData.blocks.append(mblock) range_count = 0 mblock = MetaDataBlock(block.data, block.type) metaData.blocks.append(mblock) if range_count > 0: mblock = MetaDataRange(range_type, range_count) metaData.blocks.append(mblock) range_count = 0 metaData.numEntries = self.numEntries metaData.numBlocks = self.numBlocks metaData.gzipEndOffset = self.file_bgl.tell() metaData.fileSize = os.path.getsize(self._filename) with open(self._filename, "rb") as f: metaData.bglHeader = f.read(self.gzipOffset) with open(dumpPath, "wb") as f: pickle.dump(metaData, f) self.file.seek(0) def dumpMetadata2(self, dumpPath): import pickle if not self.metadata2: return with open(dumpPath, "wb") as f: pickle.dump(self.metadata2, f) def processDefiStat(self, fields, defi, b_key): BglReader.processDefiStat(self, fields, defi, b_key) if fields.b_title: self.rawDumpFileWriteText("\ndefi title: ") self.rawDumpFileWriteData(fields.b_title) if fields.b_title_trans: self.rawDumpFileWriteText("\ndefi title trans: ") self.rawDumpFileWriteData(fields.b_title_trans) if fields.b_transcription_50: self.rawDumpFileWriteText( f"\ndefi transcription_50 ({fields.code_transcription_50:#x}): ", ) self.rawDumpFileWriteData(fields.b_transcription_50) if fields.b_transcription_60: self.rawDumpFileWriteText( f"\ndefi transcription_60 ({fields.code_transcription_60:#x}): ", ) self.rawDumpFileWriteData(fields.b_transcription_60) if fields.b_field_1a: self.rawDumpFileWriteText("\ndefi field_1a: ") self.rawDumpFileWriteData(fields.b_field_1a) if fields.b_field_13: self.rawDumpFileWriteText( f"\ndefi field_13 bytes: {fields.b_field_13!r}", ) if fields.b_field_07: self.rawDumpFileWriteText("\ndefi field_07: ") self.rawDumpFileWriteData(fields.b_field_07) if fields.b_field_06: self.rawDumpFileWriteText( f"\ndefi field_06: {fields.b_field_06}", ) # search for new chars in data # if new chars are found, mark them with a special sequence in the text # and print result into msg log def findAndPrintCharSamples(self, b_data, hint, encoding): assert isinstance(b_data, bytes) if not self.targetCharsArray: return offsets = self.findCharSamples(b_data) if len(offsets) == 0: return res = "" utf8 = (encoding.lower() == "utf-8") i = 0 for o in offsets: j = o if utf8: while b_data[j] & 0xc0 == 0x80: j -= 1 res += b_data[i:j] res += "!!!--+!!!" i = j res += b_data[j:] offsets_str = " ".join([str(el) for el in offsets]) self.samplesDumpFileWrite( f"charSample({hint})\noffsets = {offsets_str}" f"\nmarked = {res}\norig = {b_data}\n", ) def findCharSamples(self, b_data): """ Find samples of chars in b_data. Search for chars in data that have not been marked so far in the targetCharsArray array, mark new chars. Returns a list of offsets in b_data May return an empty list. 
res = [] if not isinstance(b_data, bytes): log.error("findCharSamples: b_data is not a bytes instance") return res if not self.targetCharsArray: log.error( f"findCharSamples: self.targetCharsArray={self.targetCharsArray}" ) return res for i, char in enumerate(b_data): if char < 128: continue if not self.targetCharsArray[char]: self.targetCharsArray[char] = True res.append(i) return res pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/bgl_text.py000066400000000000000000000200071417733132500242720ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2021 Saeed Rasooli (ilius) # Copyright © 2011-2012 kubtek # This file is part of PyGlossary project, http://github.com/ilius/pyglossary # Thanks to Raul Fernandes and Karl Grill for reverse # engineering as part of https://sourceforge.net/projects/ktranslator/ # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see <http://www.gnu.org/licenses/gpl.txt>. import re from pyglossary.plugins.formats_common import log from pyglossary.xml_utils import xml_escape u_pat_html_entry = re.compile("(?:&#x|&#|&)(\\w+);?", re.I) u_pat_html_entry_key = re.compile("(?:&#x|&#|&)(\\w+);", re.I) b_pat_ascii_char_ref = re.compile(b"(&#\\w+;)", re.I) u_pat_newline_escape = re.compile("[\\r\\n\\\\]") u_pat_strip_tags = re.compile("(?:<[/a-zA-Z].*?(?:>|$))+") u_pat_control_chars = re.compile("[\x00-\x08\x0c\x0e-\x1f]") u_pat_newline = re.compile("[\r\n]+") unknownHtmlEntries = set() def replaceHtmlEntryNoEscapeCB(u_match): """ u_match: instance of _sre.SRE_Match Replace character entity with the corresponding character Return the original string if conversion fails. Use this as a replace function of re.sub. """ from pyglossary.html_utils import name2codepoint u_text = u_match.group(0) u_name = u_match.group(1) if log.isDebug(): assert isinstance(u_text, str) and isinstance(u_name, str) u_res = None if u_text[:2] == "&#": # character reference try: if u_text[:3].lower() == "&#x": code = int(u_name, 16) else: code = int(u_name) if code <= 0: raise ValueError() u_res = chr(code) except (ValueError, OverflowError): u_res = chr(0xFFFD) # replacement character elif u_text[0] == "&": """ Babylon dictionaries contain a lot of non-standard entity references, for example: csdot, fllig, nsm, cancer, thlig, tsdot, upslur... This is not just a typo. These entries repeat over and over again. Perhaps they had meaning in the source dictionary that was converted to Babylon, but now the meaning is lost. Babylon does render them as is, that is, for example, &csdot; despite other references like & are replaced with corresponding characters. """ # named entity try: u_res = chr(name2codepoint[u_name.lower()]) except KeyError: unknownHtmlEntries.add(u_text) u_res = u_text else: raise ValueError(f"invalid html entity match: {u_text!r}") return u_res def replaceHtmlEntryCB(u_match): """ u_match: instance of _sre.SRE_Match Same as replaceHtmlEntryNoEscapeCB, but escapes result string Only <, >, & characters are escaped.
""" u_res = replaceHtmlEntryNoEscapeCB(u_match) if u_match.group(0) == u_res: # conversion failed return u_res else: # FIXME: should " and ' be escaped? return xml_escape(u_res, quotation=False) def replaceDingbat(u_match): """ u_match: instance of _sre.SRE_Match replace chars \\u008c-\\u0095 with \\u2776-\\u277f """ ch = u_match.group(0) code = ch + 0x2776 - 0x8c return chr(code) def escapeNewlinesCallback(u_match): """ u_match: instance of _sre.SRE_Match """ ch = u_match.group(0) if ch == "\n": return "\\n" if ch == "\r": return "\\r" if ch == "\\": return "\\\\" return ch def replaceHtmlEntries(u_text): # &ldash; # “ # ċ if log.isDebug(): assert isinstance(u_text, str) return u_pat_html_entry.sub( replaceHtmlEntryCB, u_text, ) def replaceHtmlEntriesInKeys(u_text): # &ldash; # “ # ċ if log.isDebug(): assert isinstance(u_text, str) return u_pat_html_entry_key.sub( replaceHtmlEntryNoEscapeCB, u_text, ) def escapeNewlines(u_text): r""" convert text to c-escaped string: \ -> \\ new line -> \n or \r """ if log.isDebug(): assert isinstance(u_text, str) return u_pat_newline_escape.sub( escapeNewlinesCallback, u_text, ) def stripHtmlTags(u_text): if log.isDebug(): assert isinstance(u_text, str) return u_pat_strip_tags.sub( " ", u_text, ) def removeControlChars(u_text): # \x09 - tab # \x0a - line feed # \x0b - vertical tab # \x0d - carriage return if log.isDebug(): assert isinstance(u_text, str) return u_pat_control_chars.sub( "", u_text, ) def removeNewlines(u_text): if log.isDebug(): assert isinstance(u_text, str) return u_pat_newline.sub( " ", u_text, ) def normalizeNewlines(u_text): """ convert new lines to unix style and remove consecutive new lines """ if log.isDebug(): assert isinstance(u_text, str) return u_pat_newline.sub( "\n", u_text, ) def replaceAsciiCharRefs(b_text, encoding): # “ # ċ if log.isDebug(): assert isinstance(b_text, bytes) b_parts = b_pat_ascii_char_ref.split(b_text) for i_part, b_part in enumerate(b_parts): if i_part % 2 != 1: continue # reference try: if b_part[:3].lower() == "&#x": code = int(b_part[3:-1], 16) else: code = int(b_part[2:-1]) if code <= 0: raise ValueError() except (ValueError, OverflowError): code = -1 if code < 128 or code > 255: continue # no need to escape "<", ">", "&" b_parts[i_part] = bytes([code]) return b"".join(b_parts) def fixImgLinks(u_text): """ Fix img tag links src attribute value of image tag is often enclosed in \x1e - \x1f characters. For example: . Naturally the control characters are not part of the image source name. They may be used to quickly find all names of resources. This function strips all such characters. Control characters \x1e and \x1f are useless in html text, so we may safely remove all of them, irrespective of context. """ if log.isDebug(): assert isinstance(u_text, str) return u_text.replace("\x1e", "").replace("\x1f", "") def stripDollarIndexes(b_word): if log.isDebug(): assert isinstance(b_word, bytes) i = 0 b_word_main = b"" strip_count = 0 # number of sequences found # strip $$ sequences while True: d0 = b_word.find(b"$", i) if d0 == -1: b_word_main += b_word[i:] break d1 = b_word.find(b"$", d0 + 1) if d1 == -1: # log.debug( # f"stripDollarIndexes({b_word}):\npaired $ is not found", # ) b_word_main += b_word[i:] break if d1 == d0 + 1: """ You may find keys (or alternative keys) like these: sur l'arbre$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ obscurantiste$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ They all end on a sequence of b'$', key length including dollars is always 60 chars. 
You may find keys like these: extremidade-$$$-$$$-linha .FIRM$$$$$$$$$$$$$ etc summary: we must remove any sequence of dollar signs longer than one character """ # log.debug(f"stripDollarIndexes({b_word}):\nfound $$") b_word_main += b_word[i:d0] i = d1 + 1 while i < len(b_word) and b_word[i] == ord(b"$"): i += 1 if i >= len(b_word): break continue if b_word[d0 + 1:d1].strip(b"0123456789"): # if it has at least one non-digit char # log.debug(f"stripDollarIndexes({b_word}):\nnon-digit between $$") b_word_main += b_word[i:d1] i = d1 continue if d1 + 1 < len(b_word) and b_word[d1 + 1] != 0x20: """ Examples: make do$4$/make /do potere$1$

    See also notes... volere$1$

    See also notes... Ihre$1$Ihres """ log.debug( f"stripDollarIndexes({b_word}):\n" f"second $ is followed by non-space" ) pass b_word_main += b_word[i:d0] i = d1 + 1 strip_count += 1 return b_word_main, strip_count pyglossary-4.5.0/pyglossary/plugins/babylon_bgl/gzip_no_crc.patch000066400000000000000000000003251417733132500254260ustar00rootroot000000000000007a8,10 > import logging > log = logging.getLogger('root') > 498c501 < raise OSError("CRC check failed %s != %s" % (hex(crc32), --- > log.warning("CRC check failed %s != %s" % (hex(crc32), pyglossary-4.5.0/pyglossary/plugins/cc_cedict/000077500000000000000000000000001417733132500215375ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/cc_cedict/.gitignore000066400000000000000000000000371417733132500235270ustar00rootroot00000000000000.*.swp __pycache__ *.pyc venv pyglossary-4.5.0/pyglossary/plugins/cc_cedict/__init__.py000066400000000000000000000042261417733132500236540ustar00rootroot00000000000000import re from pyglossary.plugins.formats_common import * from . import conv enable = True lname = "cc_cedict" format = "CC-CEDICT" description = "CC-CEDICT" extensions = (".u8",) extensionCreate = "" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/CEDICT" website = ( "https://cc-cedict.org/editor/editor.php", "CC-CEDICT Editor", ) optionsProp = { "encoding": EncodingOption(), "traditional_title": BoolOption( comment="Use traditional Chinese for entry titles/keys", ), } entry_count_reg = re.compile(r"#! entries=(\d+)") class Reader: depends = { "lxml": "lxml", } _encoding: str = "utf-8" _traditional_title: bool = False def __init__(self, glos): self._glos = glos self.file = None self.total_entries = self.entries_left = None def open(self, filename): if self.file is not None: self.file.close() self._glos.sourceLangName = "Chinese" self._glos.targetLangName = "English" self.file = open(filename, "r", encoding=self._encoding) for line in self.file: match = entry_count_reg.match(line) if match is not None: count = match.groups()[0] self.total_entries = self.entries_left = int(count) break else: self.close() raise RuntimeError("CC-CEDICT: could not find entry count") def close(self): if self.file is not None: self.file.close() self.file = None self.total_entries = self.entries_left = None def __len__(self): if self.total_entries is None: raise RuntimeError( "CC-CEDICT: len(reader) called while reader is not open", ) return self.total_entries def __iter__(self): if self.file is None: raise RuntimeError( "CC-CEDICT: tried to iterate over entries " + "while reader is not open" ) for line in self.file: if line.startswith("#"): continue if self.entries_left == 0: log.warning("more entries than the header claimed?!") self.entries_left -= 1 parts = conv.parse_line(line) if parts is None: log.warning("bad line: %s", line) continue names, article = conv.make_entry( *parts, traditional_title=self._traditional_title, ) entry = self._glos.newEntry(names, article, defiFormat="h") yield entry pyglossary-4.5.0/pyglossary/plugins/cc_cedict/conv.py000066400000000000000000000045201417733132500230570ustar00rootroot00000000000000import re import os from .pinyin import convert from .summarize import summarize from pyglossary.plugins.formats_common import pip, log line_reg = re.compile(r"^([^ ]+) ([^ ]+) \[([^\]]+)\] /(.+)/$") script_dir = os.path.dirname(__file__) COLORS = { "": "black", "1": "red", "2": "orange", "3": "green", "4": "blue", "5": "black", } def parse_line(line): line = line.strip() match = line_reg.match(line) if 
match is None: return None trad, simp, pinyin, eng = match.groups() pinyin = pinyin.replace("u:", "v") eng = eng.split("/") return trad, simp, pinyin, eng def make_entry(trad, simp, pinyin, eng, traditional_title): eng_names = list(map(summarize, eng)) names = [ trad if traditional_title else simp, simp if traditional_title else trad, pinyin ] + eng_names article = render_article(trad, simp, pinyin, eng, traditional_title) return names, article def colorize(hf, syllables, tones): if len(syllables) != len(tones): log.warning(f"unmatched tones: syllables={syllables!r}, tones={tones}") with hf.element("div", style="display: inline-block"): for syllable in syllables: with hf.element("font", color=""): hf.write(syllable) return with hf.element("div", style="display: inline-block"): for syllable, tone in zip(syllables, tones): with hf.element("font", color=COLORS[tone]): hf.write(syllable) def render_article(trad, simp, pinyin, eng, traditional_title): from lxml import etree as ET from io import BytesIO # pinyin_tones = [convert(syl) for syl in pinyin.split()] pinyin_list = [] tones = [] for syllable in pinyin.split(): nice_syllable, tone = convert(syllable) pinyin_list.append(nice_syllable) tones.append(tone) f = BytesIO() with ET.htmlfile(f, encoding="utf-8") as hf: with hf.element("div", style="border: 1px solid; padding: 5px"): with hf.element("div"): with hf.element("big"): colorize(hf, trad if traditional_title else simp, tones) if trad != simp: hf.write("\xa0/\xa0") # "\xa0" --> " " == " " colorize(hf, simp if traditional_title else trad, tones) hf.write(ET.Element("br")) with hf.element("big"): colorize(hf, pinyin_list, tones) with hf.element("div"): with hf.element("ul"): for defn in eng: with hf.element("li"): hf.write(defn) article = f.getvalue().decode("utf-8") return article pyglossary-4.5.0/pyglossary/plugins/cc_cedict/pinyin.py000066400000000000000000000014271417733132500234230ustar00rootroot00000000000000# coding=utf-8 # based on https://github.com/zkoch/CEDICT_Parser TONES = { "a1": "ā", "a2": "á", "a3": "ǎ", "a4": "à", "e1": "ē", "e2": "é", "e3": "ě", "e4": "è", "i1": "ī", "i2": "í", "i3": "ǐ", "i4": "ì", "o1": "ō", "o2": "ó", "o3": "ǒ", "o4": "ò", "u1": "ū", "u2": "ú", "u3": "ǔ", "u4": "ù", "v1": "ǖ", "v2": "ǘ", "v3": "ǚ", "v4": "ǜ", } # using v for the umlauted u VOWELS = ("a", "e", "o", "iu", "ui", "i", "u", "v") def convert(word): tone = word[-1] pinyin = word[0:-1].lower() result = pinyin if tone == "5": return pinyin, tone elif tone not in ("1", "2", "3", "4"): return word, "" for vowel in VOWELS: if vowel in pinyin: vowel1 = vowel[-1] result = pinyin.replace(vowel1, TONES[vowel1 + tone]) break return result, tone pyglossary-4.5.0/pyglossary/plugins/cc_cedict/summarize.py000066400000000000000000000032141417733132500241250ustar00rootroot00000000000000import re import string parenthetical = re.compile(r"\([^)]+?\)") punct_table = {ord(p): " " for p in string.punctuation if p not in "-'"} stops = { "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", 
"through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn" } def summarize(phrase): phrase = parenthetical.sub("", phrase) phrase = phrase.translate(punct_table) words = phrase.split() relevant_words = [word for word in words if word not in stops] if not relevant_words: relevant_words = words summary = " ".join(relevant_words[:10]) return summary pyglossary-4.5.0/pyglossary/plugins/cc_kedict.py000066400000000000000000000142621417733132500221260ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from pyglossary.text_reader import TextGlossaryReader from io import BytesIO from os.path import dirname enable = True lname = "cc_kedict" format = "cc-kedict" description = "cc-kedict" extensions = () extensionCreate = "" singleFile = True kind = "text" wiki = "" website = ( "https://github.com/mhagiwara/cc-kedict", "@mhagiwara/cc-kedict", ) optionsProp = { } class YamlReader(TextGlossaryReader): tagStyle = ( "color:white;" "background:green;" "padding-left:3px;" "padding-right:3px;" "border-radius:0.5ex;" # 0.5ex ~= 0.3em, but "ex" is recommended ) def __init__( self, glos: GlossaryType, spellKey: str = "", posKey: str = "", synsKey: str = "", tagsKey: str = "", ): TextGlossaryReader.__init__(self, glos) self._spellKey = spellKey self._posKey = posKey self._synsKey = synsKey self._tagsKey = tagsKey self._posMapping = { "n": "noun", "v": "verb", "a": "adjective", "pron": "pronoun", "propn": "proper noun", "intj": "interjection", "det": "determiner", "part": "particle", "adv": "adverb", "num": "number", "abbrev": "abbreviation", "suf": "suffix", "pref": "prefix", } def isInfoWord(self, word): return False def fixInfoWord(self, word): return "" def _makeList( self, hf: "lxml.etree.htmlfile", input_objects: "List[Any]", processor: "Callable", single_prefix=None, skip_single=True ): """ Wrap elements into
      if more than one element """ from lxml import etree as ET if not input_objects: return if skip_single and len(input_objects) == 1: # if single_prefix is None: # single_prefix = ET.Element("br") hf.write(single_prefix) processor(hf, input_objects[0], 1) return with hf.element("ol"): for el in input_objects: with hf.element("li"): processor(hf, el, len(input_objects)) def _processExample( self, hf: "lxml.etree.htmlfile", exampleDict: "Dict", count: int, ): from lxml import etree as ET if not exampleDict.get("example"): log.error(f"invalid example: {exampleDict}") return hf.write(exampleDict["example"]) transliteration = exampleDict.get("transliteration") if transliteration: hf.write(ET.Element("br")) with hf.element("font", color="green"): hf.write(f"{transliteration}") translation = exampleDict.get("translation") if translation: hf.write(ET.Element("br")) with hf.element("i"): hf.write(f"{translation}") def _processDef( self, hf: "lxml.etree.htmlfile", defDict: "Dict", count: int, ): from lxml import etree as ET text = defDict.get("def", "") if text: hf.write(text) examples = defDict.get("examples") if examples: if text: if count == 1: hf.write(ET.Element("br")) hf.write(ET.Element("br")) with hf.element("i"): hf.write("Examples:") self._makeList( hf, examples, self._processExample, skip_single=False, ) def _processNote( self, hf: "lxml.etree.htmlfile", note: str, count: int, ): hf.write(note) def _processEntry( self, hf: "lxml.etree.htmlfile", edict: "Dict", ): from lxml import etree as ET if self._spellKey and self._spellKey in edict: spelling = edict[self._spellKey] if not isinstance(spelling, str): log.error(f"spelling = {spelling} type {type(spelling)}, edict={edict}") if spelling is True: # https://github.com/mhagiwara/cc-kedict/pull/1 spelling = "on" else: spelling = "" if spelling: with hf.element("font", color="green"): hf.write(spelling) hf.write(ET.Element("br")) if self._posKey and self._posKey in edict: pos = edict[self._posKey] pos = self._posMapping.get(pos, pos) with hf.element("i"): hf.write(pos.capitalize()) hf.write(ET.Element("br")) if self._tagsKey and self._tagsKey in edict: tags = edict[self._tagsKey] for i, tag in enumerate(tags): if i > 0: hf.write(" ") with hf.element("span", style=self.tagStyle): hf.write(tag) hf.write(ET.Element("br")) defs = edict.get("defs") if defs: self._makeList( hf, defs, self._processDef, ) if self._synsKey and self._synsKey in edict: hf.write("Synonyms: ") for i, word in enumerate(edict[self._synsKey]): if i > 0: with hf.element("big"): hf.write(" | ") with hf.element("a", href=f"bword://{word}"): hf.write(word) hf.write(ET.Element("br")) notes = edict.get("notes") if notes: hf.write(ET.Element("br")) hf.write("Notes:") self._makeList( hf, notes, self._processNote, skip_single=False, ) def _createEntry(self, yamlBlock: str): from lxml import etree as ET from yaml import load try: from yaml import CLoader as Loader except ImportError: from yaml import Loader edict = load(yamlBlock, Loader=Loader) word = edict.get("word") if not word: log.error(f"no word in {edict}") return f = BytesIO() with ET.htmlfile(f, encoding="utf-8") as hf: with hf.element("div"): self._processEntry(hf, edict) defi = f.getvalue().decode("utf-8") return word, defi def nextPair(self): if not self._file: raise StopIteration lines = [] while True: line = self.readline() if not line: break line = line.rstrip("\n\r") if not line: continue if line.startswith("- "): line = " " + line[1:] if lines: self._bufferLine = line return self._createEntry("\n".join(lines)) 
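# not the start of a new "- " list item: keep buffering lines of the current YAML block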
lines.append(line) if lines: return self._createEntry("\n".join(lines)) raise StopIteration class Reader(object): depends = { "yaml": "PyYAML", "lxml": "lxml", } def __init__(self, glos: GlossaryType): self._glos = glos self._yaml = YamlReader( glos, spellKey="romaja", posKey="pos", synsKey="syns", tagsKey="tags", ) def __len__(self): return 0 def open(self, filename: str) -> None: if isdir(filename): filename = join(filename, "kedict.yml") self._filename = filename self._glos.sourceLangName = "Korean" self._glos.targetLangName = "English" self._glos.setDefaultDefiFormat("h") self._yaml.open(filename) def close(self): self._yaml.close() def __iter__(self): for entry in self._yaml: yield entry pyglossary-4.5.0/pyglossary/plugins/check-style000077700000000000000000000000001417733132500245542../../check-styleustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/crawler_dir.py000066400000000000000000000077121417733132500225140ustar00rootroot00000000000000from pyglossary.plugins.formats_common import * from hashlib import sha1 from os.path import dirname from os import makedirs, listdir from pyglossary.text_utils import ( escapeNTB, splitByBarUnescapeNTB, ) from pyglossary.compression import ( compressionOpenFunc, ) enable = True lname = "crawler_dir" format = "CrawlerDir" description = "Crawler Directory" extensions = (".crawler",) extensionCreate = ".crawler/" singleFile = True kind = "directory" wiki = "" website = None optionsProp = { "compression": StrOption( values=["", "gz", "bz2", "lzma"], comment="Compression Algorithm", ), } class Writer(object): _compression: str = "" def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None def finish(self): pass def open(self, filename: str): self._filename = filename if not isdir(filename): makedirs(filename) def filePathFromWord(self, b_word: bytes) -> str: bw = b_word.lower() if len(bw) <= 2: return bw.hex() if len(bw) <= 4: return join( bw[:2].hex() + ".d", bw[2:].hex(), ) return join( bw[:2].hex() + ".d", bw[2:4].hex() + ".d", bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], ) def write(self): from collections import OrderedDict as odict from pyglossary.json_utils import dataToPrettyJson filename = self._filename wordCount = 0 compression = self._compression c_open = compressionOpenFunc(compression) if not c_open: raise ValueError(f"invalid compression {compression!r}") while True: entry = yield if entry is None: break if entry.isData(): continue fpath = join(filename, self.filePathFromWord(entry.b_word)) if compression: fpath = f"{fpath}.{compression}" parentDir = dirname(fpath) if not isdir(parentDir): makedirs(parentDir) if isfile(fpath): log.warning(f"file exists: {fpath}") fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" with c_open(fpath, "wt", encoding="utf-8") as _file: _file.write( f"{escapeNTB(entry.s_word)}\n{entry.defi}" ) wordCount += 1 with open( join(filename, "info.json"), mode="w", encoding="utf-8", ) as infoFile: info = odict() info["name"] = self._glos.getInfo("name") info["wordCount"] = wordCount for key, value in self._glos.getExtraInfos(( "name", "wordCount", )).items(): info[key] = value infoFile.write(dataToPrettyJson(info)) class Reader(object): def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None self._wordCount = 0 def open(self, filename: str): from pyglossary.json_utils import jsonToOrderedData self._filename = filename with open(join(filename, "info.json"), "r", encoding="utf-8") as infoFp: info = jsonToOrderedData(infoFp.read())
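# "wordCount" is popped here instead of being copied into the glossary info; this reader tracks it separately to serve __len__()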
self._wordCount = info.pop("wordCount") for key, value in info.items(): self._glos.setInfo(key, value) def close(self): pass def __len__(self): return self._wordCount def _fromFile(self, fpath): _, ext = splitext(fpath) c_open = compressionOpenFunc(ext.lstrip(".")) if not c_open: log.error(f"invalid extension {ext}") c_open = open with c_open(fpath, "rt", encoding="utf-8") as _file: words = splitByBarUnescapeNTB(_file.readline().rstrip("\n")) defi = _file.read() return self._glos.newEntry(words, defi) def _listdirSortKey(self, name): name_nox, ext = splitext(name) if ext == ".d": return name return name_nox def _readDir( self, dpath: str, exclude: "Optional[Set[str]]", ): children = listdir(dpath) if exclude: children = [ name for name in children if name not in exclude ] children.sort(key=self._listdirSortKey) for name in children: cpath = join(dpath, name) if isfile(cpath): yield self._fromFile(cpath) continue if isdir(cpath): yield from self._readDir(cpath, None) continue log.error(f"Not a file nor a directory: {cpath}") def __iter__(self): yield from self._readDir( self._filename, { "info.json", }, ) pyglossary-4.5.0/pyglossary/plugins/csv_plugin.py000066400000000000000000000136541417733132500223710ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2013-2019 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see <http://www.gnu.org/licenses/gpl.txt>.
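# A sketch of the row layout handled by this plugin, inferred from the
# Reader/Writer below (for illustration only): each data row is
#     word, definition[, defiFormat][, comma-separated alternate words]
# and rows whose first cell starts with "#" carry glossary info, e.g.:
#     "#name","My Glossary"
#     "hello","a greeting","hi,hey"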
from pyglossary.plugins.formats_common import * import csv enable = True lname = "csv" format = "Csv" description = "CSV (.csv)" extensions = (".csv",) extensionCreate = ".csv" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/Comma-separated_values" website = None optionsProp = { "encoding": EncodingOption(), "resources": BoolOption( comment="Enable resources / data files", ), "delimiter": Option( typ="str", customValue=True, values=[",", ";", "@"], comment="Column delimiter", ), "add_defi_format": BoolOption( comment="Enable adding defiFormat (m/h/x)", ), "enable_info": BoolOption( comment="Enable glossary info / metadata", ), "word_title": BoolOption( comment="add headwords title to beginning of definition", ), } class Reader(object): compressions = stdCompressions _encoding: str = "utf-8" _delimiter: str = "," def __init__(self, glos: GlossaryType): self._glos = glos self.clear() def clear(self) -> None: self._filename = "" self._file = None self._leadingLinesCount = 0 self._wordCount = None self._pos = -1 self._csvReader = None self._resDir = "" self._resFileNames = [] self._bufferRow = None def open( self, filename: str, ) -> None: from pyglossary.text_reader import TextFilePosWrapper self._filename = filename cfile = compressionOpen(filename, mode="rt", encoding=self._encoding) cfile.seek(0, 2) self._fileSize = cfile.tell() cfile.seek(0) self._file = TextFilePosWrapper(cfile, self._encoding) self._csvReader = csv.reader( self._file, dialect="excel", delimiter=self._delimiter, ) self._resDir = filename + "_res" if isdir(self._resDir): self._resFileNames = os.listdir(self._resDir) else: self._resDir = "" self._resFileNames = [] for row in self._csvReader: if not row: continue if not row[0].startswith("#"): self._bufferRow = row break if len(row) < 2: log.error(f"invalid row: {row}") continue self._glos.setInfo(row[0].lstrip("#"), row[1]) def close(self) -> None: if self._file: try: self._file.close() except Exception: log.exception("error while closing csv file") self.clear() def __len__(self) -> int: from pyglossary.file_utils import fileCountLines if self._wordCount is None: if hasattr(self._file, "compression"): return 0 log.debug("Try not to use len(reader) as it takes extra time") self._wordCount = fileCountLines(self._filename) - \ self._leadingLinesCount return self._wordCount + len(self._resFileNames) def _iterRows(self): if self._bufferRow: yield self._bufferRow for row in self._csvReader: yield row def _processRow(self, row): if not row: return None try: word = row[0] defi = row[1] except IndexError: log.error(f"invalid row: {row!r}") return None try: alts = row[2].split(",") except IndexError: pass else: word = [word] + alts return self._glos.newEntry( word, defi, byteProgress=(self._file.tell(), self._fileSize), ) def __iter__(self) -> "Iterator[BaseEntry]": if not self._csvReader: raise RuntimeError("iterating over a reader while it's not open") wordCount = 0 for row in self._iterRows(): wordCount += 1 yield self._processRow(row) self._wordCount = wordCount resDir = self._resDir for fname in self._resFileNames: with open(join(resDir, fname), "rb") as _file: yield self._glos.newDataEntry( fname, _file.read(), ) class Writer(object): compressions = stdCompressions _encoding: str = "utf-8" _resources: bool = True _delimiter: str = "," _add_defi_format: bool = False _enable_info: bool = True _word_title: bool = False def __init__(self, glos: GlossaryType): self._glos = glos def open(self, filename: str): self._filename = filename self._file = compressionOpen(filename, mode="wt", encoding=self._encoding)
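# resources (data entries) go into a sibling "<filename>_res" directory; it is created here and removed again in finish() if it ends up empty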
self._resDir = resDir = filename + "_res" self._csvWriter = csv.writer( self._file, dialect="excel", quoting=csv.QUOTE_ALL, # FIXME delimiter=self._delimiter, ) if not isdir(resDir): os.mkdir(resDir) if self._enable_info: for key, value in self._glos.iterInfo(): self._csvWriter.writerow([f"#{key}", value]) def finish(self): self._filename = None if self._file: self._file.close() self._file = None if not os.listdir(self._resDir): os.rmdir(self._resDir) def write(self) -> "Generator[None, BaseEntry, None]": encoding = self._encoding resources = self._resources add_defi_format = self._add_defi_format glos = self._glos resDir = self._resDir writer = self._csvWriter word_title = self._word_title while True: entry = yield if entry is None: break if entry.isData(): if resources: entry.save(resDir) continue words = entry.l_word if not words: continue word, alts = words[0], words[1:] defi = entry.defi if word_title: defi = glos.wordTitleStr(words[0]) + defi row = [ word, defi, ] if add_defi_format: entry.detectDefiFormat() row.append(entry.defiFormat) if alts: row.append(",".join(alts)) writer.writerow(row) pyglossary-4.5.0/pyglossary/plugins/dicformids.py000066400000000000000000000146671417733132500223500ustar00rootroot00000000000000# -*- coding: utf-8 -*- import re from pyglossary.plugins.tabfile import Reader as TabfileReader from pyglossary.plugins.formats_common import * lname = "dicformids" enable = True format = "Dicformids" description = "DictionaryForMIDs" extensions = (".mids",) extensionCreate = ".mids/" sortOnWrite = ALWAYS sortKeyName = "dicformids" sortEncoding = "utf-8" kind = "directory" wiki = "" website = ( "http://dictionarymid.sourceforge.net/", "DictionaryForMIDs - SourceForge", ) optionsProp = {} PROP_TEMPLATE = """#DictionaryForMIDs property file infoText={name}, author: {author} indexFileMaxSize={indexFileMaxSize} language1IndexNumberOfSourceEntries={wordCount} language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate indexCharEncoding=ISO-8859-1 dictionaryFileSeparationCharacter='\\t' language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate logLevel=0 language1FilePostfix={directoryPostfix} dictionaryCharEncoding=UTF-8 numberOfAvailableLanguages=2 language1IsSearchable=true language2GenerateIndex=false dictionaryFileMaxSize={dicMaxSize} language2FilePostfix={language2FilePostfix} searchListFileMaxSize=20000 language2IsSearchable=false fileEncodingFormat=plain_format1 language1HasSeparateDictionaryFile=true searchListCharEncoding=ISO-8859-1 searchListFileSeparationCharacter='\t' indexFileSeparationCharacter='\t' language1DisplayText={sourceLang} language2HasSeparateDictionaryFile=false dictionaryGenerationInputCharEncoding=UTF-8 language1GenerateIndex=true language2DisplayText={targetLang} language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng """ class Reader(object): re_number = re.compile(r"\d+") def __init__(self, glos): self._glos = glos self._tabFileNames = [] self._tabFileReader = None def open(self, dirname): self._dirname = dirname orderFileNames = [] for fname in os.listdir(dirname): if not fname.startswith("directory"): continue try: num = self.re_number.findall(fname)[-1] except IndexError: pass else: orderFileNames.append((num, fname)) orderFileNames.sort( key=lambda x: x[0], reverse=True, ) self._tabFileNames = [x[1] for x in 
orderFileNames] self.nextTabFile() def __len__(self): # FIXME raise NotImplementedError def __iter__(self): return self def __next__(self): for _ in range(10): try: return next(self._tabFileReader) except StopIteration: self._tabFileReader.close() self.nextTabFile() def nextTabFile(self): try: tabFileName = self._tabFileNames.pop() except IndexError: raise StopIteration self._tabFileReader = TabfileReader(self._glos, hasInfo=False) self._tabFileReader.open(join(self._dirname, tabFileName)) def close(self): if self._tabFileReader: try: self._tabFileReader.close() except Exception: pass self._tabFileReader = None self._tabFileNames = [] class Writer(object): def __init__(self, glos): self._glos = glos self.linesPerDirectoryFile = 500 # 200 self.indexFileMaxSize = 32722 # 30000 self.directoryPostfix = "" self.indexPostfix = "" self._dirname = "" self.re_punc = re.compile( r"[!\"$§$%&/()=?´`\\{}\[\]^°+*~#'-_.:,;<>@]*", # FIXME: | ) self.re_spaces = re.compile(" +") self.re_tabs = re.compile("\t+") def normateWord(self, word: str) -> str: word = word.strip() # looks like we need to remove tabs, because app gives error # but based on the java code, all punctuations should be removed # as well, including '|' which is used to separate alternate words # FIXME # word = word.replace("|", " ") word = self.re_punc.sub("", word) word = self.re_spaces.sub(" ", word) word = self.re_tabs.sub(" ", word) word = word.lower() return word def writeProbs(self): glos = self._glos with open(join( self._dirname, "DictionaryForMIDs.properties", ), "w") as fileObj: fileObj.write(PROP_TEMPLATE.format( name=glos.getInfo("name"), author=glos.author, indexFileMaxSize=self.indexFileMaxSize, wordCount=self.wordCount, directoryPostfix=self.directoryPostfix, dicMaxSize=self.dicMaxSize + 1, language2FilePostfix="fa", # FIXME sourceLang=glos.sourceLangName, targetLang=glos.targetLangName, )) def nextIndex(self): try: self.indexFp.close() except AttributeError: self.indexIndex = 0 self.indexIndex += 1 fname = f"index{self.indexPostfix}{self.indexIndex}.csv" fpath = join(self._dirname, fname) self.indexFp = open(fpath, mode="w", encoding="utf-8") def finish(self): pass def open(self, dirname: str): self._dirname = dirname if not os.path.isdir(dirname): os.mkdir(dirname) def write(self): self.nextIndex() dicMaxSize = 0 indexData = [] def writeBucket(dicIndex: int, entryList: "List[BaseEntry]"): nonlocal dicMaxSize log.debug( f"dicIndex={dicIndex}, len(entryList)={len(entryList)}" f", dicMaxSize={dicMaxSize}" ) dicFp = open(join( self._dirname, f"directory{self.directoryPostfix}{dicIndex+1}.csv", ), mode="w", encoding="utf-8") for entry in entryList: word = entry.s_word n_word = self.normateWord(word) defi = entry.defi dicLine = word + "\t" + defi + "\n" dicPos = dicFp.tell() dicFp.write(dicLine) indexData.append((n_word, dicIndex + 1, dicPos)) dicMaxSize = max(dicMaxSize, dicFp.tell()) dicFp.close() bucketSize = self.linesPerDirectoryFile wordCount = 0 dicIndex = 0 entryList = [] # aka bucket while True: entry = yield if entry is None: break if entry.isData(): # FIXME continue wordCount += 1 entryList.append(entry) if len(entryList) >= bucketSize: writeBucket(dicIndex, entryList) dicIndex += 1 entryList = [] if entryList: writeBucket(dicIndex, entryList) entryList = None self.dicMaxSize = dicMaxSize self.wordCount = wordCount langSearchListFp = open(join( self._dirname, f"searchlist{self.directoryPostfix}.csv" ), mode="w", encoding="utf-8") langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n") for word, dicIndex, 
dicPos in indexData: indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n" if ( self.indexFp.tell() + len(indexLine) ) > self.indexFileMaxSize - 10: self.nextIndex() langSearchListFp.write(f"{word}\t{self.indexIndex}\n") self.indexFp.write(indexLine) self.indexFp.close() langSearchListFp.close() self.writeProbs() pyglossary-4.5.0/pyglossary/plugins/dict_cc.py000066400000000000000000000077331417733132500216130ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * import html from operator import itemgetter enable = True lname = "dict_cc" format = 'Dictcc' description = 'Dict.cc (SQLite3)' extensions = () extensionCreate = ".db" kind = "binary" wiki = "https://en.wikipedia.org/wiki/Dict.cc" website = ( "https://play.google.com/store/apps/details?id=cc.dict.dictcc", "dict.cc dictionary - Google Play", ) class Reader(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() self._glos.setDefaultDefiFormat("h") def __len__(self): self._cur.execute( "select count(distinct term1)+count(distinct term2) from main_ft" ) return self._cur.fetchone()[0] def makeList( self, hf: "lxml.etree.htmlfile", input_elements: "List[lxml.etree.Element]", processor: "Callable", single_prefix=None, skip_single=True ): """ Wrap elements into
        if more than one element """ if len(input_elements) == 0: return if len(input_elements) == 1: hf.write(single_prefix) processor(hf, input_elements[0]) return with hf.element("ol"): for el in input_elements: with hf.element("li"): processor(hf, el) def writeSense( self, hf: "lxml.etree.htmlfile", row: "Tuple[str, str, str]", ): from lxml import etree as ET trans, entry_type = row if entry_type: with hf.element("i"): hf.write(f"{entry_type}") hf.write(ET.Element("br")) try: hf.write(trans + " ") except Exception as e: log.error(f"error in writing {trans!r}, {e}") hf.write(repr(trans) + " ") else: with hf.element("big"): with hf.element("a", href=f'bword://{trans}'): hf.write(f"⏎") def iterRows(self, column1, column2): self._cur.execute( f"select {column1}, {column2}, entry_type from main_ft" f" order by {column1}" ) for row in self._cur.fetchall(): term1 = row[0] term2 = row[1] try: term1 = html.unescape(term1) except Exception as e: log.error(f"html.unescape({term1!r}) -> {e}") try: term2 = html.unescape(term2) except Exception as e: log.error(f"html.unescape({term2!r}) -> {e}") yield term1, term2, row[2] def parseGender(self, headword): # {m} masc masculine German: maskulin # {f} fem feminine German: feminin # {n} neut neutral German: neutral # { } ???? i = headword.find(" {") if i <= 0: return None, headword if len(headword) < i + 4: return None, headword if headword[i + 3] != "}": return None, headword g = headword[i + 2] gender = None if g == "m": gender = "masculine" elif g == "f": gender = "feminine" elif g == "n": gender = "neutral" else: log.warning(f"invalid gender {g!r}") return None, headword headword = headword[:i] + headword[i + 4:] return gender, headword def _iterOneDirection(self, column1, column2): from itertools import groupby from lxml import etree as ET from io import BytesIO glos = self._glos for headword, groupsOrig in groupby( self.iterRows(column1, column2), key=itemgetter(0), ): headword = html.unescape(headword) groups = [ (term2, entry_type) for _, term2, entry_type in groupsOrig ] f = BytesIO() gender, headword = self.parseGender(headword) with ET.htmlfile(f, encoding="utf-8") as hf: with hf.element("div"): if gender: with hf.element("i"): hf.write(gender) hf.write(ET.Element("br")) self.makeList( hf, groups, self.writeSense, ) defi = f.getvalue().decode("utf-8") yield self._glos.newEntry(headword, defi, defiFormat="h") def __iter__(self): yield from self._iterOneDirection("term1", "term2") yield from self._iterOneDirection("term2", "term1") def close(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() pyglossary-4.5.0/pyglossary/plugins/dict_cc_split.py000066400000000000000000000034431417733132500230200ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * import html enable = True lname = "dict_cc_split" format = 'Dictcc_split' description = 'Dict.cc (SQLite3) - Split' extensions = () extensionCreate = ".db" kind = "binary" wiki = "https://en.wikipedia.org/wiki/Dict.cc" website = ( "https://play.google.com/store/apps/details?id=cc.dict.dictcc", "dict.cc dictionary - Google Play", ) class Reader(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() self._glos.setDefaultDefiFormat("m") def __len__(self): self._cur.execute("select count(*) * 2 from 
main_ft") return self._cur.fetchone()[0] def iterRows(self, column1, column2): self._cur.execute( f"select {column1}, {column2}, entry_type from main_ft" f" order by {column1}" ) for row in self._cur.fetchall(): term1 = row[0] term2 = row[1] try: term1 = html.unescape(term1) except Exception as e: log.error(f"html.unescape({term1!r}) -> {e}") try: term2 = html.unescape(term2) except Exception as e: log.error(f"html.unescape({term2!r}) -> {e}") yield term1, term2, row[2] def _iterOneDirection(self, column1, column2): for word, defi, entry_type in self.iterRows(column1, column2): if entry_type: word = f"{word} {{{entry_type}}}" yield self._glos.newEntry(word, defi, defiFormat="m") def __iter__(self): yield from self._iterOneDirection("term1", "term2") yield from self._iterOneDirection("term2", "term1") def close(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() pyglossary-4.5.0/pyglossary/plugins/dict_org.py000066400000000000000000000107031417733132500220040ustar00rootroot00000000000000# -*- coding: utf-8 -*- import re from pyglossary.plugins.formats_common import * from pyglossary.file_utils import fileCountLines from pyglossary.plugin_lib.dictdlib import DictDB enable = True lname = "dict_org" format = "DictOrg" description = "DICT.org file format (.index)" extensions = (".index",) optionsProp = { "dictzip": BoolOption(comment="Compress .dict file to .dict.dz"), "install": BoolOption(comment="Install dictionary to /usr/share/dictd/"), } sortOnWrite = DEFAULT_NO kind = "directory" wiki = "https://en.wikipedia.org/wiki/DICT#DICT_file_format" website = ( "http://dict.org/bin/Dict", "The DICT Development Group", ) def installToDictd(filename: str, dictzip: bool, title: str = "") -> None: """ filename is without extension (neither .index or .dict or .dict.dz) """ import shutil import subprocess targetDir = "/usr/share/dictd/" if filename.startswith(targetDir): return if not isdir(targetDir): log.warning(f"Directory {targetDir!r} does not exist, skipping install") return log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}") if dictzip and os.path.isfile(filename + ".dict.dz"): dictExt = ".dict.dz" elif os.path.isfile(filename + ".dict"): dictExt = ".dict" else: log.error(f"No .dict file, could not install dictd file {filename!r}") return if not filename.startswith(targetDir): shutil.copy(filename + ".index", targetDir) shutil.copy(filename + dictExt, targetDir) # update /var/lib/dictd/db.list if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0: log.error( "failed to update /var/lib/dictd/db.list file" ", try manually runing: sudo /usr/sbin/dictdconfig -w" ) log.info("don't forget to restart dictd server") class Reader(object): def __init__(self, glos: GlossaryType): self._glos = glos self._filename = "" self._dictdb = None # type: Optional[DictDB] # regular expression patterns used to prettify definition text self._re_newline_in_braces = re.compile( r'\{(?P.*?)\n(?P.*?)?\}', ) self._re_words_in_braces = re.compile( r'\{(?P.+?)\}', ) def open(self, filename: str) -> None: import gzip if filename.endswith(".index"): filename = filename[:-6] self._filename = filename self._dictdb = DictDB(filename, "read", 1) def close(self) -> None: if self._dictdb is not None: self._dictdb.indexfile.close() self._dictdb.dictfile.close() # self._dictdb.finish() self._dictdb = None def prettifyDefinitionText(self, defi: str) -> str: # Handle words in {} # First, we remove any \n in {} pairs defi = self._re_newline_in_braces.sub(r'{\g\g}', defi) # Then, 
replace any {words} into words, # so it can be rendered as link correctly defi = self._re_words_in_braces.sub( r'\g', defi, ) # Use
        so it can be rendered as newline correctly defi = defi.replace("\n", "
        ") return defi def __len__(self) -> int: if self._dictdb is None: return 0 return len(self._dictdb.indexentries) def __iter__(self) -> "Iterator[BaseEntry]": if self._dictdb is None: raise RuntimeError("iterating over a reader while it's not open") dictdb = self._dictdb for word in dictdb.getdeflist(): b_defi = b"\n\n
        \n\n".join(dictdb.getdef(word)) try: defi = b_defi.decode("utf_8", 'ignore') defi = self.prettifyDefinitionText(defi) except Exception as e: log.error(f"b_defi = {b_defi}") raise e yield self._glos.newEntry(word, defi) class Writer(object): _dictzip: bool = False _install: bool = True def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None self._dictdb = None def finish(self): from pyglossary.os_utils import runDictzip self._dictdb.finish(dosort=1) if self._dictzip: runDictzip(f"{self._filename}.dict") if self._install: installToDictd( self._filename, self._dictzip, self._glos.getInfo("name").replace(" ", "_"), ) self._filename = None def open(self, filename: str): filename_nox, ext = splitext(filename) if ext.lower() == ".index": filename = filename_nox self._dictdb = DictDB(filename, "write", 1) self._filename = filename def write(self) -> "Generator[None, BaseEntry, None]": dictdb = self._dictdb while True: entry = yield if entry is None: break if entry.isData(): # does dictd support resources? and how? FIXME continue dictdb.addentry(entry.b_defi, entry.l_word) pyglossary-4.5.0/pyglossary/plugins/dict_org_source.py000066400000000000000000000022101417733132500233560ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = True lname = "dict_org_source" format = "DictOrgSource" description = "DICT.org dictfmt source file" extensions = (".dtxt",) extensionCreate = ".dtxt" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/DICT" website = ( "https://github.com/cheusov/dictd", "@cheusov/dictd", ) optionsProp = { "remove_html_all": BoolOption(comment="Remove all HTML tags"), } class Writer(object): _remove_html_all: bool = True def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None def finish(self): self._filename = None def open(self, filename: str): self._filename = filename if self._remove_html_all: self._glos.removeHtmlTagsAll() # TODO: add another bool flag to only remove html tags that are not # supported by GtkTextView def write(self) -> "Generator[None, BaseEntry, None]": from pyglossary.text_writer import writeTxt yield from writeTxt( self._glos, entryFmt=":{word}:{defi}\n", filename=self._filename, defiEscapeFunc=replaceStringTable([ ("\r", ""), ]), ext=".dtxt", ) pyglossary-4.5.0/pyglossary/plugins/dictunformat.py000066400000000000000000000040371417733132500227140ustar00rootroot00000000000000from pyglossary.plugins.formats_common import * from pyglossary.text_reader import TextGlossaryReader enable = True lname = "dictunformat" format = "Dictunformat" description = "dictunformat output file" extensions = (".dictunformat",) extensionCreate = ".dictunformat" singleFile = True kind = "text" wiki = "https://directory.fsf.org/wiki/Dictd" website = ( "https://github.com/cheusov/dictd/blob/master/dictunformat.1.in", "dictd/dictunformat.1.in - @cheusov/dictd", ) optionsProp = { "encoding": EncodingOption(), } def unescapeDefi(defi: str) -> str: return defi class Reader(TextGlossaryReader): def isInfoWord(self, word): return word.startswith("00-database-") def fixInfoWord(self, word): return word def setInfo(self, word: str, defi: str) -> None: if word == "00-database-short": self._glos.setInfo("name", defi) return if word != "00-database-info": return glos = self._glos lastKey = "" for line in defi.split("\n"): if not line.startswith("##:"): if lastKey: glos.setInfo(key, f"{glos.getInfo(lastKey)}\n{line}") continue parts = line[3:].split(":") if 
if len(parts) < 2: log.error(f"unexpected line: {line}") continue key = lastKey = parts[0] value = ":".join(parts[1:]) glos.setInfo(key, value) def nextPair(self): if not self._file: raise StopIteration word = "" defiLines = [] while True: line = self.readline() if not line: break line = line.rstrip("\n\r") if not line: continue if not line.strip("_"): if not word: continue if not defiLines: log.warning(f"no definition/value for {word!r}") defi = unescapeDefi("\n".join(defiLines)) return word, defi if not word: word = line continue if line == word: continue if line.lower() == word: word = line continue defiLines.append(line) if word: defi = unescapeDefi("\n".join(defiLines)) if word.startswith("00-database-") and defi == "unknown": log.info(f"ignoring {word} -> {defi}") return return word, defi raise StopIteration pyglossary-4.5.0/pyglossary/plugins/digitalnk.py000066400000000000000000000025131417733132500221600ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * import html enable = True lname = "digitalnk" format = 'DigitalNK' description = 'DigitalNK (SQLite3, N-Korean)' extensions = () extensionCreate = ".db" kind = "binary" wiki = "" website = ( "https://github.com/digitalprk/dicrs", "@digitalprk/dicrs", ) class Reader(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() self._glos.setDefaultDefiFormat("m") def __len__(self): self._cur.execute("select count(*) from dictionary") return self._cur.fetchone()[0] def __iter__(self): self._cur.execute( "select word, definition from dictionary" " order by word" ) # iteration over self._cur stops after one entry # and self._cur.fetchone() returns None # no idea why! # https://github.com/ilius/pyglossary/issues/282 # for row in self._cur: for row in self._cur.fetchall(): word = html.unescape(row[0]) definition = row[1] yield self._glos.newEntry(word, definition, defiFormat="m") def close(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() pyglossary-4.5.0/pyglossary/plugins/dsl/000077500000000000000000000000001417733132500204215ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/dsl/__init__.py000066400000000000000000000271011417733132500225330ustar00rootroot00000000000000# -*- coding: utf-8 -*- # dsl/__init__.py # Read ABBYY Lingvo DSL dictionary format # # Copyright © 2013-2020 Saeed Rasooli # Copyright © 2016 Ratijas # Copyright © 2013 Xiaoqiang Wang # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. import re import html import html.entities from xml.sax.saxutils import escape, quoteattr from pyglossary.plugins.formats_common import * from pyglossary.text_reader import TextFilePosWrapper from . import layer from . import tag
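# DSLParser (imported next from .main) builds on the layer/tag helpers above to balance and re-order DSL markup tags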
from .main import ( DSLParser, ) enable = True lname = "dsl" format = "ABBYYLingvoDSL" description = "ABBYY Lingvo DSL (.dsl)" extensions = (".dsl",) extensionCreate = ".dsl" singleFile = True kind = "text" wiki = "https://ru.wikipedia.org/wiki/ABBYY_Lingvo" website = ( "https://www.lingvo.ru/", "www.lingvo.ru", ) optionsProp = { "encoding": EncodingOption(), "audio": BoolOption(comment="Enable audio objects"), "only_fix_markup": BoolOption(comment="Only fix markup, without tag conversion"), } # ABBYY is a Russian company # https://ru.wikipedia.org/wiki/ABBYY_Lingvo # http://lingvo.helpmax.net/en/troubleshooting/dsl-compiler/compiling-a-dictionary/ # https://www.abbyy.com/news/abbyy-lingvo-80-dictionaries-to-suit-every-taste/ __all__ = ["read"] # {{{ # modified to work around codepoints that are not supported by `unichr`. # http://effbot.org/zone/re-sub.htm#unescape-html # January 15, 2003 | Fredrik Lundh # Removes HTML or XML character references and entities from a text string. # # @param text The HTML (or XML) source text. # @return The plain text, as a Unicode string, if necessary. htmlEntityPattern = re.compile(r"&#?\w+;") def unescape(text): def fixup(m): text = m.group(0) if text[:2] == "&#": # character reference try: if text[:3] == "&#x": i = int(text[3:-1], 16) else: i = int(text[2:-1]) except ValueError: pass else: try: return chr(i) except ValueError: # f"\\U{i:08x}", but no fb"..." return (b"\\U%08x" % i).decode("unicode-escape") else: # named entity try: text = chr(html.entities.name2codepoint[text[1:-1]]) except KeyError: pass return text # leave as is return htmlEntityPattern.sub(fixup, text) # }}} def make_a_href(s): return f"<a href={quoteattr(s)}>{escape(s)}</a>" def ref_sub(x): return make_a_href(unescape(x.groups()[0])) # order matters, a lot. shortcuts = [ # canonical: m > * > ex > i > c ( "[m1](?:-{2,})[/m]", "<hr/>" ), ( "[m(\\d)](?:-{2,})[/m]", "<hr style=\"margin-left:\\g<1>em\"/>" ), ] shortcuts = [ ( re.compile(repl.replace("[", "\\[").replace("*]", "\\*]")), sub ) for (repl, sub) in shortcuts ] # precompiled regexs re_brackets_blocks = re.compile(r"\{\{[^}]*\}\}") re_lang_open = re.compile(r"(?<!\\)\[lang[^\]]*\]") re_m_open = re.compile(r"(?<!\\)\[m\d\]") re_c_open_color = re.compile(r"\[c (\w+)\]") re_m = re.compile(r"\[m(\d)\](.*?)\[/m\]") re_end = re.compile(r"\\$") re_sound = re.compile(r"\[s\]([^\[]*?)(wav|mp3)\s*\[/s\]") re_img = re.compile(r"\[s\]([^\[]*?)(jpg|jpeg|gif|tif|tiff|png|bmp)\s*\[/s\]") re_wrapped_in_quotes = re.compile("^(\\'|\")(.*)(\\1)$") re_ref = re.compile("<<(.*?)>>") # single instance of parser # it is safe as long as this script is not going multithread. _parse = DSLParser().parse def apply_shortcuts(line): for pattern, sub in shortcuts: line = pattern.sub(sub, line) return line def _clean_tags(line, audio): r""" [m{}] => <p style="padding-left:{}em;margin:0"> [*] => <span class="sec"> [ex] => <span class="ex"><font color="#009000"> [i] => <i> [c] => <font color="green"> [p] => <i class="p"><font color="green"> ['] => <u> [b] => <b> [i] => <i> [u] => <u> [sup] => <sup> [sub] => <sub> [ref] \ [url] } => <a href={}>{}</a> <<...>> / [s] => <object data={}> (audio) [s] => <img src={}/> (image) [t] => <font face="Helvetica" class="dsl_t"> {{...}} \ [trn] | [!trn] | [trs] } => remove [!trs] | [lang ...] | [com] / """ # remove {{...}} blocks line = re_brackets_blocks.sub("", line) # remove trn tags # re_trn = re.compile("\[\/?!?tr[ns]\]") line = line \ .replace("[trn]", "") \ .replace("[/trn]", "") \ .replace("[trs]", "") \ .replace("[/trs]", "") \ .replace("[!trn]", "") \ .replace("[/!trn]", "") \ .replace("[!trs]", "") \ .replace("[/!trs]", "") # remove lang tags line = re_lang_open.sub("", line).replace("[/lang]", "") # remove com tags line = line.replace("[com]", "").replace("[/com]", "") # escape html special characters like '<' and '>' line = html.escape(html.unescape(line)) # remove t tags line = line.replace( "[t]", "<font face=\"Helvetica\" class=\"dsl_t\">", ) line = line.replace("[/t]", "</font>") line = _parse(line) line = re_end.sub("<br/>", line) # paragraph, part one: before shortcuts. line = line.replace("[m]", "[m1]") # if line somewhere contains "[m_]" tag like # "[b]I[/b][m1] [c][i]conj.[/i][/c][/m][m1]1) ...[/m]" # then leave it alone. only wrap in "[m1]" when no "m" tag found at all. if not re_m_open.search(line): line = f"[m1]{line}[/m]" line = apply_shortcuts(line) # paragraph, part two: if any non-shortcutted [m] is left? line = re_m.sub(r'<p style="padding-left:\g<1>em;margin:0">\g<2></p>', line) # text formats line = line.replace("[']", "<u>").replace("[/']", "</u>") line = line.replace("[b]", "<b>").replace("[/b]", "</b>") line = line.replace("[i]", "<i>").replace("[/i]", "</i>") line = line.replace("[u]", "<u>").replace("[/u]", "</u>") line = line.replace("[sup]", "<sup>").replace("[/sup]", "</sup>") line = line.replace("[sub]", "<sub>").replace("[/sub]", "</sub>") # color line = line.replace("[c]", "<font color=\"green\">") line = re_c_open_color.sub("<font color=\"\\g<1>\">", line) line = line.replace("[/c]", "</font>") # example zone line = line.replace("[ex]", "<span class=\"ex\"><font color=\"#009000\">") line = line.replace("[/ex]", "</font></span>") # secondary zone line = line.replace("[*]", "<span class=\"sec\">")\ .replace("[/*]", "</span>") # abbrev. label line = line.replace("[p]", "<i class=\"p\"><font color=\"green\">") line = line.replace("[/p]", "</font></i>") # cross reference line = line.replace("[ref]", "<<").replace("[/ref]", ">>") line = line.replace("[url]", "<<").replace("[/url]", ">>") line = re_ref.sub(ref_sub, line) # sound file if audio: sound_tag = r'<object type="audio/x-wav" data="\g<1>\g<2>" width="40" height="40">' \ "<param name=\"autoplay\" value=\"false\" />" \ "</object>" else: sound_tag = "" line = re_sound.sub(sound_tag, line) # image file line = re_img.sub( r'<img align="top" src="\g<1>\g<2>" alt="\g<1>\g<2>" />', line, ) # \[...\] line = line.replace("\\[", "[").replace("\\]", "]") return line def unwrap_quotes(s): return re_wrapped_in_quotes.sub("\\2", s) class Reader(object): compressions = stdCompressions + ("dz",) _encoding: str = "" _audio: bool = False _only_fix_markup: bool = False re_tags_open = re.compile(r"(?<!\\)\[(c|p|i|b|u|sup|sub|ex|t|m\d|\*)") re_tags_close = re.compile(r"\[/(c|p|i|b|u|sup|sub|ex|t|m\d|\*)\]") def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = "" self._file = None self._fileSize = 0 self._bufferLine = "" def __len__(self) -> int: # FIXME return 0 def _clean_tags_only_markup(self, line, audio): return _parse(line) def open( self, filename: str, ) -> None: self._filename = filename if self._only_fix_markup: self.clean_tags = self._clean_tags_only_markup else: self.clean_tags = _clean_tags encoding = self._encoding if not encoding: encoding = self.detectEncoding() cfile = compressionOpen( filename, dz=True, mode="rt", encoding=encoding, ) cfile.seek(0, 2) self._fileSize = cfile.tell() cfile.seek(0) self._file = TextFilePosWrapper(cfile, encoding) # read header for line in self._file: line = line.rstrip().lstrip('\ufeff') # \ufeff -> https://github.com/ilius/pyglossary/issues/306 if not line: continue if not line.startswith("#"): self._bufferLine = line break self.processHeaderLine(line) def detectEncoding(self): for testEncoding in ("utf-8", "utf-16"): with compressionOpen( self._filename, dz=True, mode="rt", encoding=testEncoding, ) as fileObj: try: for i in range(10): fileObj.readline() except UnicodeDecodeError: log.info(f"Encoding of DSL file is not {testEncoding}") continue else: log.info(f"Encoding of DSL file detected: {testEncoding}") return testEncoding raise ValueError( "Could not detect encoding of DSL file" ", specify it by: --read-options encoding=ENCODING" ) def setInfo(self, key, value): self._glos.setInfo(key, unwrap_quotes(value)) def processHeaderLine(self, line): if line.startswith("#NAME"): self.setInfo("name", unwrap_quotes(line[6:].strip())) elif line.startswith("#INDEX_LANGUAGE"): self._glos.sourceLangName = unwrap_quotes(line[16:].strip()) elif line.startswith("#CONTENTS_LANGUAGE"): self._glos.targetLangName = unwrap_quotes(line[19:].strip()) def _iterLines(self) -> "Iterator[str]": if self._bufferLine: line = self._bufferLine self._bufferLine = "" yield line for line in self._file: yield line def __iter__(self) -> "Iterator[BaseEntry]": current_key = "" current_key_alters = [] current_text = [] line_type = "header" unfinished_line = "" re_tags_open = self.re_tags_open re_tags_close = self.re_tags_close for line in self._iterLines(): line = line.rstrip() if not line: continue # texts if line.startswith(" ") or line.startswith("\t"): line_type = "text" line = unfinished_line + line.lstrip() #
some ill formatted source may have tags spanned into # multiple lines # try to match opening and closing tags tags_open = re_tags_open.findall(line) tags_close = re_tags_close.findall(line) if len(tags_open) != len(tags_close): unfinished_line = line continue unfinished_line = "" # convert DSL tags to HTML tags line = self.clean_tags(line, self._audio) current_text.append(line) continue # title word(s) # alternative titles if line_type == "title": current_key_alters.append(line) continue # previous line type is text -> start new title # append previous entry if line_type == "text": if unfinished_line: # line may be skipped if ill formatted current_text.append(self.clean_tags(unfinished_line, self._audio)) yield self._glos.newEntry( [current_key] + current_key_alters, "\n".join(current_text), byteProgress=(self._file.tell(), self._fileSize), ) # start new entry current_key = line current_key_alters = [] current_text = [] unfinished_line = "" line_type = "title" # last entry if line_type == "text": yield self._glos.newEntry( [current_key] + current_key_alters, "\n".join(current_text), ) pyglossary-4.5.0/pyglossary/plugins/dsl/layer.py000066400000000000000000000045371417733132500221200ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016 Ratijas # Copyright © 2016-2017 Saeed Rasooli # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. """ internal stuff. Layer class """ from . import tag class Layer(object): __slots__ = ["tags", "text"] def __init__(self, stack): stack.append(self) self.tags = set() self.text = "" def __contains__(self, tag): """ :param tag: tag.Tag :return: bool """ return tag in self.tags def __repr__(self): tags = "{" + ", ".join(map(str, self.tags)) + "}" return f"Layer({tags}, {self.text!r})" def __eq__(self, other): """ mostly for unittest. """ return self.text == other.text and self.tags == other.tags i_and_c = {tag.Tag("i", "i"), tag.Tag("c", "c")} p_tag = tag.Tag("p", "p") def close_tags(stack, tags, layer_index=-1): """ close given tags on layer with index `layer_index`. :param stack: Iterable[Layer] :param layer_index: int :param tags: Iterable[tag.Tag] :return: None """ if layer_index == -1: layer_index = len(stack) - 1 layer = stack[layer_index] if layer.text: tags = set.intersection(layer.tags, tags) if not tags: return # shortcut: [i][c] equivalent to [p] if tags.issuperset(i_and_c): tags -= i_and_c tags.add(p_tag) layer.tags -= i_and_c # no need to layer.tags.add() ordered_tags = tag.canonical_order(tags) layer.text = "".join( [f"[{x.opening}]" for x in ordered_tags] + [layer.text] + [f"[/{x.closing}]" for x in reversed(ordered_tags)] ) # remove tags from layer layer.tags -= tags if layer.tags or layer_index == 0: return superlayer = stack[layer_index - 1] superlayer.text += layer.text del stack[layer_index] def close_layer(stack): """ close top layer on stack. 
""" if not stack: return tags = stack[-1].tags close_tags(stack, tags) pyglossary-4.5.0/pyglossary/plugins/dsl/main.py000066400000000000000000000153251417733132500217250ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016 Ratijas # Copyright © 2016-2018 Saeed Rasooli # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. """ exposed API lives here. """ import copy import re from . import tag as _tag from . import layer as _layer def process_closing_tags(stack, tags): """ close `tags`, closing some inner layers if necessary. :param stack: Iterable[layer.Layer] :param tags: Iterable[str] """ index = len(stack) - 1 for tag in copy.copy(tags): index_for_tag = _tag.index_of_layer_containing_tag(stack, tag) if index_for_tag is not None: index = min(index, index_for_tag) else: tags.remove(tag) if not tags: return to_open = set() for layer in stack[:index:-1]: for lt in layer.tags: if lt.closing not in tags: to_open.add(lt) _layer.close_layer(stack) to_close = set() layer = stack[index] for lt in layer.tags: if lt.closing in tags: to_close.add(lt) _layer.close_tags(stack, to_close, index) if to_open: _layer.Layer(stack) stack[-1].tags = to_open OPEN = 1 CLOSE = 2 TEXT = 3 BRACKET_L = "\0\1" BRACKET_R = "\0\2" # precompiled regexs # re_m_tag_with_content = re.compile(r"(\[m\d\])(.*?)(\[/m\])") re_non_escaped_bracket = re.compile(r"(?= 1: # close all layers. [m*] tags can only appear # at top layer. # note: do not reopen tags that were marked as # closed already. to_open = set.union(*( {t for t in layer.tags if t.closing not in closings} for layer in stack )) for i in range(len(stack)): _layer.close_layer(stack) # assert len(stack) == 1 # assert not stack[0].tags _layer.Layer(stack) stack[-1].tags = to_open elif state is CLOSE: process_closing_tags(stack, closings) if not stack or stack[-1].text: _layer.Layer(stack) stack[-1].tags.add(item) state = OPEN continue elif item_t is CLOSE: if state in (OPEN, TEXT): closings.clear() closings.add(item) state = CLOSE continue elif item_t is TEXT: if state is CLOSE: process_closing_tags(stack, closings) if not stack: _layer.Layer(stack) stack[-1].text += item state = TEXT continue if state is CLOSE and closings: process_closing_tags(stack, closings) # shutdown unclosed tags return "".join([layer.text for layer in stack]) def put_brackets_away(self, line): r"""put away \[, \] and brackets that does not belong to any of given tags. 
:rtype: str """ clean_line = "" startswith_tag = _startswith_tag_cache.get(self.tags, None) if startswith_tag is None: openings = "|".join(f"{_[1]}{_[2]}" for _ in self.tags) closings = "|".join(_[1] for _ in self.tags) startswith_tag = re.compile( fr"(?:(?:{openings})|/(?:{closings}))\]" ) _startswith_tag_cache[self.tags] = startswith_tag for i, chunk in enumerate(re_non_escaped_bracket.split(line)): if i != 0: m = startswith_tag.match(chunk) if m: clean_line += "[" + \ m.group() + \ chunk[m.end():].replace("[", BRACKET_L)\ .replace("]", BRACKET_R) else: clean_line += BRACKET_L + chunk.replace("[", BRACKET_L)\ .replace("]", BRACKET_R) else: # first chunk clean_line += chunk.replace("[", BRACKET_L)\ .replace("]", BRACKET_R) return clean_line @staticmethod def bring_brackets_back(line): return line.replace(BRACKET_L, "[").replace(BRACKET_R, "]") pyglossary-4.5.0/pyglossary/plugins/dsl/tag.py000066400000000000000000000040601417733132500215460ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016 Ratijas # Copyright © 2016-2017 Saeed Rasooli # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. """ internal stuff. Tag class """ from collections import namedtuple Tag = namedtuple("Tag", ["opening", "closing"]) Tag.__repr__ = lambda tag: \ f"Tag({tag.opening!r})" if tag.opening == tag.closing \ else f"Tag({tag.opening!r}, {tag.closing!r})" predefined = [ "m", "*", "ex", "i", "c", ] def was_opened(stack, tag): """ check if tag was opened at some layer before. :param stack: Iterable[layer.Layer] :param tag: tag.Tag :return: bool """ if not len(stack): return False layer = stack[-1] if tag in layer: return True return was_opened(stack[:-1], tag) def canonical_order(tags): """ arrange tags in canonical way, where (outermost to innermost): m > * > ex > i > c with all other tags follow them in alphabetical order. 
:param tags: Iterable[Tag] :return: List """ result = [] tags = list(tags) for predef in predefined: t = next((t for t in tags if t.closing == predef), None) if t: result.append(t) tags.remove(t) result.extend(sorted(tags, key=lambda x: x.opening)) return result def index_of_layer_containing_tag(stack, tag): """ return zero based index of layer with `tag` or None :param stack: Iterable[layer.Layer] :param tag: str :return: int | None """ for i, layer in enumerate(reversed(stack)): for t in layer.tags: if t.closing == tag: return len(stack) - i - 1 return None pyglossary-4.5.0/pyglossary/plugins/ebook_epub2.py000066400000000000000000000154731417733132500224170ustar00rootroot00000000000000# -*- coding: utf-8 -*- # The MIT License (MIT) # Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) # Copyright © 2016-2019 Saeed Rasooli # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
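# ---------------------------------------------------------------------------
# Editor's sketch (hypothetical, not part of the original tree): a minimal
# usage example for canonical_order() from pyglossary/plugins/dsl/tag.py
# above. The Tag namedtuple is the one defined in that module; the sample
# tags are invented for illustration only.
#
#     from collections import namedtuple
#     Tag = namedtuple("Tag", ["opening", "closing"])
#
#     tags = {Tag("c blue", "c"), Tag("i", "i"), Tag("m1", "m")}
#     print([t.closing for t in canonical_order(tags)])
#     # -> ['m', 'i', 'c']: "m" is outermost, then "i", then "c";
#     # any tag not in the predefined list would follow alphabetically.
# ---------------------------------------------------------------------------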
from pyglossary.plugins.formats_common import * from pyglossary.ebook_base import * enable = True lname = "epub2" format = "Epub2" description = "EPUB-2 E-Book" extensions = (".epub",) extensionCreate = ".epub" sortOnWrite = ALWAYS sortKeyName = "ebook" kind = "package" wiki = "https://en.wikipedia.org/wiki/EPUB" website = None # EPUB-3: https://www.w3.org/community/epub3/ optionsProp = { "group_by_prefix_length": IntOption( comment="Prefix length for grouping", ), # "group_by_prefix_merge_min_size": IntOption(), # "group_by_prefix_merge_across_first": BoolOption(), "compress": BoolOption( comment="Enable compression", ), "keep": BoolOption( comment="Keep temp files", ), "include_index_page": BoolOption( comment="Include index page", ), "css": StrOption( comment="Path to css file", ), "cover_path": StrOption( comment="Path to cover file", ), } class Writer(EbookWriter): # these class attrs are only in Epub # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE MIMETYPE_CONTENTS = "application/epub+zip" CONTAINER_XML_CONTENTS = """ """ NCX_TEMPLATE = """ {title} {ncx_items} """ NCX_NAVPOINT_TEMPLATE = \ """\t {text} """ CSS_CONTENTS = """@charset "UTF-8"; body { margin: 10px 25px 10px 25px; } h1 { font-size: 200%; } h2 { font-size: 150%; } p { margin-left: 0em; margin-right: 0em; margin-top: 0em; margin-bottom: 0em; line-height: 2em; text-align: justify; } a, a:focus, a:active, a:visited { color: black; text-decoration: none; } body.indexPage {} h1.indexTitle {} p.indexGroups { font-size: 150%; } span.indexGroup {} body.groupPage {} h1.groupTitle {} div.groupNavigation {} span.groupHeadword {} div.groupEntry { margin-top: 0; margin-bottom: 1em; } h2.groupHeadword { margin-left: 5%; } p.groupDefinition { margin-left: 10%; margin-right: 10%; } """ GROUP_XHTML_TEMPLATE = \ """ {title}

	</head>
	<body class="groupPage">
		<h1 class="groupTitle">{group_title}</h1>
		<div class="groupNavigation">
			<a href="{previous_link}">[ Previous ]</a>
{index_link}
			<a href="{next_link}">[ Next ]</a>
		</div>
{group_contents}
	</body>
</html>"""

	GROUP_XHTML_INDEX_LINK = "\t\t<a href=\"index.xhtml\">[ Index ]</a>"

	GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t<div class="groupEntry">
		<h2 class="groupHeadword">{headword}</h2>
		<p class="groupDefinition">{definition}</p>
	</div>
        """ OPF_TEMPLATE = """ {identifier} {sourceLang} {title} {creator} {copyright} {creationDate} {cover} {manifest} {spine} """ COVER_TEMPLATE = "" def __init__(self, glos): import uuid EbookWriter.__init__( self, glos, ) glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) @classmethod def cls_get_prefix(cls, options: "Dict[str, Any]", word: str) -> str: if not word: return None length = options.get("group_by_prefix_length", cls._group_by_prefix_length) prefix = word[:length].lower() if prefix[0] < "a": return "SPECIAL" return prefix def get_prefix(self, word: str) -> str: if not word: return None length = self._group_by_prefix_length prefix = word[:length].lower() if prefix[0] < "a": return "SPECIAL" return prefix def write_ncx(self, group_labels): """ write_ncx only for epub """ ncx_items = [] index = 1 if self._include_index_page: ncx_items.append(self.NCX_NAVPOINT_TEMPLATE.format( index=index, text="Index", src="index.xhtml", )) index += 1 for group_label in group_labels: ncx_items.append(self.NCX_NAVPOINT_TEMPLATE.format( index=index, text=group_label, src=self.get_group_xhtml_file_name_from_index(index), )) index += 1 ncx_items_unicode = "\n".join(ncx_items) ncx_contents = self.NCX_TEMPLATE.format( identifier=self._glos.getInfo("uuid"), title=self._glos.getInfo("name"), ncx_items=ncx_items_unicode, ) self.add_file_manifest( "OEBPS/toc.ncx", "toc.ncx", ncx_contents, "application/x-dtbncx+xml", ) # inherts write from EbookWriter pyglossary-4.5.0/pyglossary/plugins/ebook_kobo.py000066400000000000000000000150101417733132500223170ustar00rootroot00000000000000# -*- coding: utf-8 -*- # The MIT License (MIT) # Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) # Copyright © 2022 Saeed Rasooli # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
from pyglossary.plugins.formats_common import * from itertools import groupby from pathlib import Path import unicodedata import re from pickle import dumps, loads from gzip import compress, decompress from operator import itemgetter enable = True lname = "kobo" format = "Kobo" description = "Kobo E-Reader Dictionary" extensions = (".kobo",) extensionCreate = ".kobo.zip" kind = "package" sortOnWrite = NEVER wiki = "https://en.wikipedia.org/wiki/Kobo_eReader" website = ( "https://www.kobo.com", "www.kobo.com", ) # https://help.kobo.com/hc/en-us/articles/360017640093-Add-new-dictionaries-to-your-Kobo-eReader optionsProp = { } # Penelope option: marisa_index_size=1000000 def is_cyrillic_char(c: str) -> bool: # U+0400 – U+04FF: Cyrillic # U+0500 – U+052F: Cyrillic Supplement if "\u0400" <= c <= "\u052F": return True # U+2DE0 – U+2DFF: Cyrillic Extended-A if "\u2DE0" <= c <= "\u2DFF": return True # U+A640 – U+A69F: Cyrillic Extended-B if "\uA640" <= c <= "\uA69F": return True # U+1C80 – U+1C8F: Cyrillic Extended-C if "\u1C80" <= c <= "\u1C8F": return True # U+FE2E, U+FE2F: Combining Half Marks # U+1D2B, U+1D78: Phonetic Extensions return c in ("\uFE2E", "\uFE2F", "\u1D2B", "\u1D78") def fixFilename(fname: str) -> str: return Path(fname.replace("/", "2F").replace("\\", "5C")).name class Writer: WORDS_FILE_NAME = "words" depends = { "marisa_trie": "marisa-trie", } def __init__(self, glos, **kwargs): self._glos = glos self._filename = None self._words = [] self._img_pattern = re.compile( ']*?)?>', re.DOTALL, ) # img tag has no closing try: import marisa_trie except ModuleNotFoundError as e: e.msg += f", run `{pip} install marisa-trie` to install" raise e def get_prefix(self, word: str) -> str: if not word: return "11" wo = word[:2].strip().lower() if not wo: return "11" if wo[0] == "\x00": return "11" if len(wo) > 1 and wo[1] == "\x00": wo = wo[:1] if is_cyrillic_char(wo[0]): return wo # if either of the first 2 chars are not unicode letters, return "11" for c in wo: if not unicodedata.category(c).startswith("L"): return "11" wo = wo.ljust(2, "a") return wo def fix_defi(self, defi: str) -> str: # @pgaskin on #219: Kobo supports images in dictionaries, # but these have a lot of gotchas # (see https://pgaskin.net/dictutil/dicthtml/format.html). 
# Basically, The best way to do it is to encode the images as a # base64 data URL after shrinking it and making it grayscale # (if it's JPG, this is as simple as only keeping the Y channel) # for now we just skip data entries and remove '\n" groupCounter = 0 htmlContents = htmlHeader def writeGroup(lastPrefix): nonlocal htmlContents group_fname = fixFilename(lastPrefix) htmlContents += "" log.trace( f"writeGroup: {lastPrefix!r}, " f"{group_fname!r}, count={groupCounter}" ) with gzip.open(group_fname + ".html", mode="wb") as gzipFile: gzipFile.write(htmlContents.encode("utf-8")) htmlContents = htmlHeader allWords = [] data = [] while True: entry = yield if entry is None: break if entry.isData(): dataEntryCount += 1 continue l_word = entry.l_word allWords += l_word wordsByPrefix = OrderedDict() for word in l_word: prefix = self.get_prefix(word) if prefix in wordsByPrefix: wordsByPrefix[prefix].append(word) else: wordsByPrefix[prefix] = [word] entry.stripFullHtml() defi = self.fix_defi(entry.defi) mainHeadword = l_word[0] for prefix, p_words in wordsByPrefix.items(): headword, *variants = p_words if headword != mainHeadword: headword = f"{mainHeadword}, {headword}" data.append(( prefix, compress(dumps(( headword, variants, defi, ))) )) del entry log.info(f"Kobo: sorting entries...") data.sort(key=itemgetter(0)) log.info(f"Kobo: writing entries...") lastPrefix = "" for prefix, row in data: headword, variants, defi = loads(decompress(row)) if lastPrefix and prefix != lastPrefix: writeGroup(lastPrefix) groupCounter = 0 lastPrefix = prefix htmlVariants = "".join( f'' for v in variants ) body = f"
<w><a name=\"{headword}\" /><div><b>{headword}</b><var>{htmlVariants}</var>
<br/>{defi}</div></w>
        " htmlContents += f"{body}\n" groupCounter += 1 del data if groupCounter > 0: writeGroup(lastPrefix) if dataEntryCount > 0: log.warning( f"ignored {dataEntryCount} files (data entries)" " and replaced ' None: self._filename = filename def write(self) -> "Generator[None, BaseEntry, None]": with indir(self._filename, create=True): yield from self.write_groups() def finish(self) -> None: import marisa_trie with indir(self._filename, create=False): trie = marisa_trie.Trie(self._words) trie.save(self.WORDS_FILE_NAME) self._filename = None pyglossary-4.5.0/pyglossary/plugins/ebook_kobo_dictfile.py000066400000000000000000000115071417733132500241710ustar00rootroot00000000000000# -*- coding: utf-8 -*- # The MIT License (MIT) # Copyright © 2020-2021 Saeed Rasooli # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. from pyglossary.plugins.formats_common import * from pyglossary.text_reader import TextGlossaryReader from pyglossary.image_utils import extractInlineHtmlImages enable = True lname = "kobo_dictfile" format = "Dictfile" description = "Kobo E-Reader Dictfile (.df)" extensions = (".df",) extensionCreate = ".df" kind = "text" wiki = "" website = ( "https://pgaskin.net/dictutil/dictgen/#dictfile-format", "dictgen - dictutil", ) # https://github.com/pgaskin/dictutil optionsProp = { "encoding": EncodingOption(), "extract_inline_images": BoolOption(comment="Extract inline images"), } def fixWord(word: str) -> str: return word.replace("\n", " ") def escapeDefi(defi: str) -> str: return defi.replace("\n@", "\n @")\ .replace("\n:", "\n :")\ .replace("\n&", "\n &") class Reader(TextGlossaryReader): depends = { "mistune": "mistune==2.0.0a5", } _extract_inline_images = True def __init__(self, glos: "GlossaryType"): TextGlossaryReader.__init__(self, glos, hasInfo=False) def open(self, filename: str) -> None: try: import mistune except ModuleNotFoundError as e: e.msg += f", run `{pip} install mistune` to install" raise e TextGlossaryReader.open(self, filename) self._glos.setDefaultDefiFormat("h") def isInfoWord(self, word): return False def fixInfoWord(self, word): raise NotImplementedError def fixDefi(self, defi: str, html: bool) -> str: import mistune defi = defi.replace("\n @", "\n@")\ .replace("\n :", "\n:")\ .replace("\n &", "\n&")\ .replace("
\n\n\n", "\n\n")\
			.replace("\n\n\n", "\n\n")\
			.replace("\n\n\n", "\n\n
        ") defi = defi.lstrip() if html: pass else: defi = mistune.html(defi) if self._extract_inline_images: defi, images = extractInlineHtmlImages( defi, self._glos.tmpDataDir, fnamePrefix="", # maybe f"{self._pos:06d}-" ) if images: defi = (defi, images) return defi def nextPair(self): if not self._file: raise StopIteration words = [] defiLines = [] html = False while True: line = self.readline() if not line: break line = line.rstrip("\n\r") if line.startswith("@"): if words: self._bufferLine = line return words, self.fixDefi("\n".join(defiLines), html=html) words = [line[1:]] continue if line.startswith(": "): defiLines.append(line[2:]) continue if line.startswith("::"): continue if line.startswith("&"): words.append(line[1:]) continue if line.startswith(""): line = line[6:] html = True defiLines.append(line) if words: return words, self.fixDefi("\n".join(defiLines), html=html) raise StopIteration class Writer(object): _encoding: str = "utf-8" def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._file = None def finish(self): if self._file is None: return self._file.close() if not os.listdir(self._resDir): os.rmdir(self._resDir) def open(self, filename: str) -> None: self._file = open(filename, "w", encoding=self._encoding) # dictgen's ParseDictFile does not seem to support glossary info / metedata self._resDir = filename + "_res" if not isdir(self._resDir): os.mkdir(self._resDir) def write( self, ) -> "Generator[None, BaseEntry, None]": fileObj = self._file resDir = self._resDir while True: entry = yield if entry is None: break if entry.isData(): entry.save(resDir) continue words = entry.l_word defi = entry.defi entry.detectDefiFormat() if entry.defiFormat == "h": entry.stripFullHtml() defi = f"{entry.defi}" fileObj.write(f"@ {fixWord(words[0])}\n") for alt in words[1:]: fileObj.write(f"& {fixWord(alt)}\n") fileObj.write(f"{escapeDefi(defi)}\n\n") pyglossary-4.5.0/pyglossary/plugins/ebook_mobi.py000066400000000000000000000233551417733132500223260ustar00rootroot00000000000000# -*- coding: utf-8 -*- # The MIT License (MIT) # Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) # Copyright © 2016-2022 Saeed Rasooli # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
from pyglossary.plugins.formats_common import * from pyglossary.ebook_base import * from pyglossary.langs import Lang enable = True lname = "mobi" format = "Mobi" description = "Mobipocket (.mobi) E-Book" extensions = (".mobi",) extensionCreate = ".mobi" sortOnWrite = DEFAULT_YES sortKeyName = "ebook" kind = "package" wiki = "https://en.wikipedia.org/wiki/Mobipocket" website = None optionsProp = { "group_by_prefix_length": IntOption( comment="Prefix length for grouping", ), # "group_by_prefix_merge_min_size": IntOption(), # "group_by_prefix_merge_across_first": BoolOption(), # specific to mobi "kindlegen_path": StrOption( comment="Path to kindlegen executable", ), "compress": BoolOption( disabled=True, comment="Enable compression", ), "keep": BoolOption( comment="Keep temp files", ), "include_index_page": BoolOption( disabled=True, comment="Include index page", ), "css": StrOption( # disabled=True, comment="Path to css file", ), "cover_path": StrOption( # disabled=True, comment="Path to cover file", ), "file_size_approx": FileSizeOption( comment="Approximate size of each xhtml file (example: 200kb)", ), "hide_word_index": BoolOption( comment="Hide headword in tap-to-check interface", ), "spellcheck": BoolOption( comment="Enable wildcard search and spell correction during word lookup", # "Maybe it just enables the kindlegen's spellcheck." ), "exact": BoolOption( comment="Exact-match Parameter", # "I guess it only works for inflections" ), } extraDocs = [ ( "Other Requirements", "Install [KindleGen](https://wiki.mobileread.com/wiki/KindleGen)" " for creating Mobipocket e-books." ), ] class GroupStateBySize(object): def __init__(self, writer) -> None: self.writer = writer self.group_index = -1 self.reset() def reset(self) -> None: self.group_contents = [] self.group_size = 0 def add(self, entry: "BaseEntry") -> None: word = entry.l_word defi = entry.defi content = self.writer.format_group_content(word, defi) self.group_contents.append(content) self.group_size += len(content.encode("utf-8")) class Writer(EbookWriter): _compress: bool = False _keep: bool = False _kindlegen_path: str = "" _file_size_approx: int = 271360 _hide_word_index: bool = False _spellcheck: bool = True _exact: bool = False CSS_CONTENTS = """"@charset "UTF-8";""" GROUP_XHTML_TEMPLATE = """ {group_contents} """ GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """ {headword_visible}{infl}
</idx:orth>
<p>{definition}</p>
</idx:entry>
<hr/>
        """ GROUP_XHTML_WORD_INFL_TEMPLATE = """ {iforms_str} """ GROUP_XHTML_WORD_IFORM_TEMPLATE = """""" OPF_TEMPLATE = """ {title} {sourceLang} {identifier} {creator} {copyright} {description} Dictionaries {sourceLang} {targetLang} {cover} {manifest} {spine} """ def __init__(self, glos, **kwargs): import uuid EbookWriter.__init__( self, glos, ) glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) def get_prefix(self, word: str) -> str: if not word: return None length = self._group_by_prefix_length prefix = word[:length].lower() if prefix[0] < "a": return "SPECIAL" return prefix def format_group_content(self, word: "List[str]", defi: str) -> str: hide_word_index = self._hide_word_index if len(word) == 1: infl = '' mainword = word[0] else: mainword, *variants = word iforms_list = [] for variant in variants: iforms_list.append(self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format( inflword=variant, exact_str=' exact="yes"' if self._exact else '', )) infl = '\n' + \ self.GROUP_XHTML_WORD_INFL_TEMPLATE.format( iforms_str="\n".join(iforms_list)) headword = self.escape_if_needed(mainword) defi = self.escape_if_needed(defi) if hide_word_index: headword_visible = "" value_headword = f' value="{headword}"' else: headword_visible = "\n" + self._glos.wordTitleStr(headword) value_headword = "" group_content = self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format( spellcheck_str=' spell="yes"' if self._spellcheck else "", headword_visible=headword_visible, value_headword=value_headword, definition=defi, infl=infl, ) return group_content def getLangCode(self, lang) -> str: return lang.code if isinstance(lang, Lang) else "" def get_opf_contents(self, manifest_contents, spine_contents): cover = "" if self.cover: cover = self.COVER_TEMPLATE.format(cover=self.cover) creationDate = datetime.now().strftime("%Y-%m-%d") return self.OPF_TEMPLATE.format( identifier=self._glos.getInfo("uuid"), # use Language code instead name for kindlegen sourceLang=self.getLangCode(self._glos.sourceLang), targetLang=self.getLangCode(self._glos.targetLang), title=self._glos.getInfo("name"), creator=self._glos.author, copyright=self._glos.getInfo("copyright"), description=self._glos.getInfo("description"), creationDate=creationDate, cover=cover, manifest=manifest_contents, spine=spine_contents, ) def write_groups(self): def add_group(state): if state.group_size <= 0: return state.group_index += 1 index = state.group_index + self.GROUP_START_INDEX group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) self.add_file_manifest( "OEBPS/" + group_xhtml_path, group_xhtml_path, self.GROUP_XHTML_TEMPLATE.format( group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join( state.group_contents, ), ), "application/xhtml+xml", ) state = GroupStateBySize(self) while True: entry = yield if entry is None: break if entry.isData(): continue if state.group_size >= self._file_size_approx: add_group(state) state.reset() state.add(entry) add_group(state) def write(self): import subprocess filename = self._filename kindlegen_path = self._kindlegen_path yield from EbookWriter.write(self) # download kindlegen from this page: # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211 # run kindlegen if not kindlegen_path: log.warning(f"Not running kindlegen, the raw files are located in {filename}") log.warning( "Provide KindleGen path with: " "--write-options 'kindlegen_path=...'" ) return name = self._glos.getInfo("name") log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}") opf_path_abs = join(filename, "OEBPS", 
"content.opf") proc = subprocess.Popen( [kindlegen_path, opf_path_abs, "-o", "content.mobi"], stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE ) output = proc.communicate() log.info(output[0].decode("utf-8")) mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi") log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}") pyglossary-4.5.0/pyglossary/plugins/edlin.py000066400000000000000000000157601417733132500213150ustar00rootroot00000000000000# -*- coding: utf-8 -*- # edlin.py # # Copyright © 2016-2019 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . from pyglossary.plugins.formats_common import * from pyglossary.text_utils import ( escapeNTB, unescapeNTB, splitByBarUnescapeNTB, ) enable = True lname = "edlin" format = "Edlin" # Editable Linked List of Entries description = "EDLIN" extensions = (".edlin",) extensionCreate = ".edlin/" kind = "directory" wiki = "" website = None optionsProp = { "encoding": EncodingOption(), "prev_link": BoolOption(comment="Enable link to previous entry"), } def makeDir(direc: str) -> None: if not isdir(direc): os.makedirs(direc) class Reader(object): _encoding: str = "utf-8" def __init__(self, glos: GlossaryType): self._glos = glos self._clear() def close(self) -> None: self._clear() def _clear(self) -> None: self._filename = "" self._prev_link = True self._wordCount = None self._rootPath = None self._resDir = "" self._resFileNames = [] def open(self, filename: str) -> None: from pyglossary.json_utils import jsonToOrderedData if isdir(filename): infoFname = join(filename, "info.json") elif isfile(filename): infoFname = filename filename = dirname(filename) else: raise ValueError( f"error while opening {filename!r}: no such file or directory" ) self._filename = filename with open(infoFname, "r", encoding=self._encoding) as infoFp: info = jsonToOrderedData(infoFp.read()) self._wordCount = info.pop("wordCount") self._prev_link = info.pop("prev_link") self._rootPath = info.pop("root") for key, value in info.items(): self._glos.setInfo(key, value) self._resDir = join(filename, "res") if isdir(self._resDir): self._resFileNames = os.listdir(self._resDir) else: self._resDir = "" self._resFileNames = [] def __len__(self) -> int: if self._wordCount is None: log.error("called len() on a reader which is not open") return 0 return self._wordCount + len(self._resFileNames) def __iter__(self) -> "Iterator[BaseEntry]": if not self._rootPath: raise RuntimeError("iterating over a reader while it's not open") wordCount = 0 nextPath = self._rootPath while nextPath != "END": wordCount += 1 # before or after reading word and defi # (and skipping empty entry)? 
FIXME with open( join(self._filename, nextPath), "r", encoding=self._encoding, ) as _file: header = _file.readline().rstrip() if self._prev_link: _prevPath, nextPath = header.split(" ") else: nextPath = header word = _file.readline() if not word: yield None # update progressbar continue defi = _file.read() if not defi: log.warning( f"Edlin Reader: no definition for word {word!r}" f", skipping" ) yield None # update progressbar continue word = word.rstrip() defi = defi.rstrip() if self._glos.alts: word = splitByBarUnescapeNTB(word) if len(word) == 1: word = word[0] else: word = unescapeNTB(word, bar=False) # defi = unescapeNTB(defi) yield self._glos.newEntry(word, defi) if wordCount != self._wordCount: log.warning( f"{wordCount} words found, " f"wordCount in info.json was {self._wordCount}" ) self._wordCount = wordCount resDir = self._resDir for fname in self._resFileNames: with open(join(resDir, fname), "rb") as _file: yield self._glos.newDataEntry( fname, _file.read(), ) class Writer(object): _encoding: str = "utf-8" _prev_link: bool = True def __init__(self, glos: GlossaryType): self._glos = glos self._clear() def finish(self) -> None: self._clear() def open(self, filename: str): self._filename = filename self._resDir = join(filename, "res") os.makedirs(filename) os.mkdir(self._resDir) def _clear(self) -> None: self._filename = None self._resDir = None self._encoding = "utf-8" self._hashSet = set() # self._wordCount = None def hashToPath(self, h: str) -> str: return h[:2] + "/" + h[2:] def getEntryHash(self, entry: BaseEntry) -> str: """ return hash string for given entry don't call it twice for one entry, if you do you will get a different hash string """ from hashlib import sha1 _hash = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] if _hash not in self._hashSet: self._hashSet.add(_hash) return _hash index = 0 while True: tmp_hash = _hash + hex(index)[2:] if tmp_hash not in self._hashSet: self._hashSet.add(tmp_hash) return tmp_hash index += 1 def saveEntry( self, thisEntry: BaseEntry, thisHash: str, prevHash: str, nextHash: str, ) -> None: dpath = join(self._filename, thisHash[:2]) makeDir(dpath) with open( join(dpath, thisHash[2:]), "w", encoding=self._encoding, ) as toFile: nextPath = self.hashToPath(nextHash) if nextHash else "END" if self._prev_link: prevPath = self.hashToPath(prevHash) if prevHash else "START" header = prevPath + " " + nextPath else: header = nextPath toFile.write("\n".join([ header, escapeNTB(thisEntry.s_word, bar=False), thisEntry.defi, ])) def write(self) -> "Generator[None, BaseEntry, None]": from collections import OrderedDict as odict from pyglossary.json_utils import dataToPrettyJson filename = self._filename thisEntry = yield if thisEntry is None: raise ValueError("glossary is empty") count = 1 rootHash = thisHash = self.getEntryHash(thisEntry) prevHash = None while True: nextEntry = yield if nextEntry is None: break if nextEntry.isData(): nextEntry.save(self._resDir) continue nextHash = self.getEntryHash(nextEntry) self.saveEntry(thisEntry, thisHash, prevHash, nextHash) thisEntry = nextEntry prevHash, thisHash = thisHash, nextHash count += 1 self.saveEntry(thisEntry, thisHash, prevHash, None) with open( join(self._filename, "info.json"), "w", encoding=self._encoding, ) as toFile: info = odict() info["name"] = self._glos.getInfo("name") info["root"] = self.hashToPath(rootHash) info["prev_link"] = self._prev_link info["wordCount"] = count # info["modified"] = for key, value in self._glos.getExtraInfos(( "name", "root", "prev_link", "wordCount", 
)).items(): info[key] = value toFile.write(dataToPrettyJson(info)) pyglossary-4.5.0/pyglossary/plugins/formats_common.py000066400000000000000000000020031417733132500232270ustar00rootroot00000000000000import sys import os from os.path import ( join, split, splitext, isfile, isdir, exists, ) import logging log = logging.getLogger("pyglossary") from pprint import pformat from pyglossary.core import rootDir sys.path.insert(0, rootDir) from pyglossary.flags import * from pyglossary import core from pyglossary.core import ( pip, cacheDir, ) from pyglossary.option import * from pyglossary.text_utils import ( toStr, toBytes, replaceStringTable, ) from pyglossary.compression import ( compressionOpen, stdCompressions, ) from pyglossary.os_utils import indir from pyglossary.entry_base import BaseEntry from pyglossary.glossary_type import GlossaryType enable = False lname = "" format = "Unknown" description = "Unknown" extensions = () # type: Tuple[str, ...] extensionCreate = "" # type: str singleFile = False kind = "" wiki = "" website = None # key is option/argument name, value is instance of Option optionsProp = {} # type: Dict[str, Option] sortOnWrite = DEFAULT_NO # type: YesNoAlwaysNever pyglossary-4.5.0/pyglossary/plugins/freedict.py000066400000000000000000000473671417733132500220170ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from pyglossary.html_utils import unescape_unicode from pyglossary.langs import langDict from pyglossary.langs.writing_system import getWritingSystemFromText from io import BytesIO import re import html enable = True lname = "freedict" format = "FreeDict" description = "FreeDict (.tei)" extensions = (".tei",) extensionCreate = ".tei" singleFile = True kind = "text" wiki = "https://github.com/freedict/fd-dictionaries/wiki" website = ( "https://freedict.org/", "FreeDict.org", ) optionsProp = { "resources": BoolOption( comment="Enable resources / data files", ), "discover": BoolOption( comment="Find and show unsupported tags", ), "auto_rtl": BoolOption( allowNone=True, comment="Auto-detect and mark Right-to-Left text", ), "word_title": BoolOption( comment="Add headwords title to begining of definition", ), "pron_color": StrOption( comment="Pronunciation color", ), "gram_color": StrOption( comment="Grammar color", ), "example_padding": IntOption( comment="Padding for examples (in px)", ), } tei = "{http://www.tei-c.org/ns/1.0}" class Reader(object): compressions = stdCompressions depends = { "lxml": "lxml", } _discover: bool = False _auto_rtl: "Optional[bool]" = None _word_title: bool = False _pron_color: str = "gray" _gram_color: str = "green" _example_padding: int = 10 ns = { None: "http://www.tei-c.org/ns/1.0", } xmlLang = "{http://www.w3.org/XML/1998/namespace}lang" supportedTags = { f"{tei}{tag}" for tag in ( "entry", "form", # entry.form "orth", # entry.form.orth "pron", # entry.form.pron "sense", # entry.sense "cit", # entry.sense.cit "quote", # entry.sense.cit.quote "gramGrp", # entry.sense.cit.gramGrp "pos", # entry.sense.cit.gramGrp.pos "gen", # entry.sense.cit.gramGrp.gen "number", # entry.sense.cit.gramGrp.number "num", # entry.sense.cit.gramGrp.num ) } posMapping = { "n": "noun", "v": "verb", "pn": "pronoun", "pron": "pronoun", "prep": "preposition", "conj": "conjuction", "adj": "adjective", "adv": "adverb", # "numeral", "interjection", "suffix", "particle" # "indefinitePronoun" } genderMapping = { "m": "male", "masc": "male", "f": "female", "fem": "female", "n": "neutral", "neut": "neutral", # "m;f" "adj": 
"adjective", } numberMapping = { "pl": "plural", "sing": "singular", } subcMapping = { "t": "transitive", "i": "intransitive", } def makeList( self, hf: "lxml.etree.htmlfile", input_objects: "List[Any]", processor: "Callable", single_prefix="", skip_single=True, ordered=True, list_type="", ): """ Wrap elements into
          if more than one element """ if not input_objects: return if skip_single and len(input_objects) == 1: if single_prefix: hf.write(single_prefix) processor(hf, input_objects[0]) return kw = {} if list_type: kw["type"] = list_type with hf.element("ol" if ordered else "ul", **kw): for el in input_objects: with hf.element("li"): processor(hf, el) def getTitleTag(self, sample: str) -> str: ws = getWritingSystemFromText(sample) if ws: return ws.titleTag return "b" def writeRef( self, hf: "lxml.etree.htmlfile", ref: "lxml.etree.Element", ): target = ref.get("target") attrib = {} if target: if "://" in target: attrib["class"] = "external" else: target = f"bword://{ref.text}" with hf.element("a", href=target, **attrib): hf.write(ref.text) def writeQuote( self, hf: "lxml.etree.htmlfile", elem: "lxml.etree.Element", ): self.writeWithDirection(hf, elem, "div") def writeTransCit( self, hf: "lxml.etree.htmlfile", elem: "lxml.etree.Element", ): from lxml import etree as ET quotes = [] for child in elem.xpath("child::node()"): if isinstance(child, str): child = child.strip() if child: hf.write(child) log.warning(f"text directly inside ") continue if child.__class__.__name__ == "_Comment": continue if child.tag != f"{tei}quote": log.warning( f"unknown tag {child.tag!r} inside translation " f": {self.tostring(child)}" ) continue quotes.append(child) self.makeList( hf, quotes, self.writeQuote, single_prefix="", ) def writeDef( self, hf: "lxml.etree.htmlfile", elem: "lxml.etree.Element", ): from lxml import etree as ET sep = ", " # TODO: self.getCommaSep(sample) # if self._cif_newline: # sep = ET.Element("br") count = 0 def writeChild(item, depth): nonlocal count if isinstance(item, str): item = item.strip() if not item: return if count > 0: hf.write(sep) # with hf.element(self.getTitleTag(item)): hf.write(item) return if item.tag == f"{tei}ref": if count > 0: hf.write(sep) self.writeRef(hf, item) return for child in item.xpath("child::node()"): writeChild(child, depth + 1) if depth < 1: count += 1 for child in elem.xpath("child::node()"): writeChild(child, 0) def writeWithDirection(self, hf, child, tag): attrib = child.attrib try: lang = attrib.pop(self.xmlLang) except KeyError: pass else: attrib["lang"] = lang if self._auto_rtl: langObj = langDict[lang] if langObj: if langObj.rtl: attrib["dir"] = "rtl" else: attrib["dir"] = "ltr" try: _type = attrib.pop("type") except KeyError: pass else: if _type not in ("trans",): attrib["class"] = _type with hf.element(tag, **attrib): self.writeRichText(hf, child) def writeRichText( self, hf: "lxml.etree.htmlfile", el: "lxml.etree.Element", ): from lxml import etree as ET for child in el.xpath("child::node()"): if isinstance(child, str): hf.write(child) continue if child.tag == f"{tei}ref": self.writeRef(hf, child) continue if child.tag == f"{tei}br": hf.write(ET.Element("br")) continue if child.tag == f"{tei}p": with hf.element("p", **child.attrib): self.writeRichText(hf, child) continue if child.tag == f"{tei}div": self.writeWithDirection(hf, child, "div") continue if child.tag == f"{tei}span": self.writeWithDirection(hf, child, "span") continue self.writeRichText(hf, child) def getLangDesc(self, elem): lang = elem.attrib.get(self.xmlLang) if lang: langObj = langDict[lang] if not langObj: log.warning(f"unknown lang {lang!r} in {self.tostring(elem)}") return return langObj.name orig = elem.attrib.get("orig") if orig: return orig log.warning(f"unknown lang name in {self.tostring(elem)}") def writeLangTag(self, hf, elem): langDesc = self.getLangDesc(elem) if not 
langDesc: return # TODO: make it Italic or change font color? if elem.text: hf.write(f"{langDesc}: {elem.text}") else: hf.write(f"{langDesc}") def writeNote(self, hf, note): self.writeRichText(hf, note) def writeSenseSense( self, hf: "lxml.etree.htmlfile", sense: "lxml.etree.Element", ): from lxml import etree as ET # this element can be 1st-level (directly under ) # or 2nd-level transCits = [] defList = [] gramList = [] noteList = [] refList = [] usgList = [] xrList = [] exampleCits = [] for child in sense.iterchildren(): if child.tag == f"{tei}cit": if child.attrib.get("type", "trans") == "trans": transCits.append(child) elif child.attrib.get("type") == "example": exampleCits.append(child) else: log.warning(f"unknown cit type: {self.tostring(child)}") continue if child.tag == f"{tei}def": defList.append(child) continue if child.tag == f"{tei}note": _type = child.attrib.get("type") if not _type: noteList.append(child) elif _type in ("pos", "gram"): gramList.append(child) elif _type in ( "sense", "stagr", "stagk", "def", "usage", "hint", "status", "editor", "dom", "infl", "obj", "lbl", ): noteList.append(child) else: log.warning(f"unknown note type {_type}") noteList.append(child) continue if child.tag == f"{tei}ref": refList.append(child) continue if child.tag == f"{tei}usg": if not child.text: log.warning(f"empty usg: {self.tostring(child)}") continue usgList.append(child) continue if child.tag == f"{tei}lang": self.writeLangTag(hf, child) continue if child.tag in (f"{tei}sense", f"{tei}gramGrp"): continue if child.tag == f"{tei}xr": xrList.append(child) continue log.warning(f"unknown tag {child.tag} in ") self.makeList( hf, defList, self.writeDef, single_prefix="", ) if gramList: with hf.element("div"): for i, gram in enumerate(gramList): if i > 0: hf.write(self.getCommaSep(gram.text)) with hf.element("font", color=self._gram_color): hf.write(gram.text) self.makeList( hf, noteList, self.writeNote, single_prefix="", ) self.makeList( hf, transCits, self.writeTransCit, single_prefix="", ) if refList: with hf.element("div"): hf.write("Related: ") for i, ref in enumerate(refList): if i > 0: hf.write(" | ") self.writeRef(hf, ref) if xrList: for xr in xrList: with hf.element("div"): self.writeRichText(hf, xr) if usgList: with hf.element("div"): hf.write("Usage: ") for i, usg in enumerate(usgList): if i > 0: hf.write(self.getCommaSep(usg.text)) hf.write(usg.text) if exampleCits: for cit in exampleCits: with hf.element("div", **{ "class": "example", "style": f"padding: {self._example_padding}px 0px;", }): for quote in cit.findall("quote", self.ns): self.writeWithDirection(hf, quote, "div") for cit2 in cit.findall("cit", self.ns): for quote in cit2.findall("quote", self.ns): quote.attrib.update(cit2.attrib) self.writeWithDirection(hf, quote, "div") return len(transCits) + len(exampleCits) def getCommaSep(self, sample: str): if self._auto_rtl: ws = getWritingSystemFromText(sample) if ws: return ws.comma + " " return ", " def writeGramGroups( self, hf: "lxml.etree.htmlfile", gramGrpList: "List[lxml.etree.htmlfile]", ): from lxml import etree as ET color = self._gram_color for gramGrp in gramGrpList: parts = [] for child in gramGrp.iterchildren(): part = self.normalizeGramGrpChild(child) if part: parts.append(part) if not parts: continue sep = self.getCommaSep(parts[0]) text = sep.join(parts) with hf.element("font", color=color): hf.write(text) hf.write(ET.Element("br")) def writeSenseGrams( self, hf: "lxml.etree.htmlfile", sense: "lxml.etree.Element", ): self.writeGramGroups(hf, 
sense.findall("gramGrp", self.ns)) def writeSense( self, hf: "lxml.etree.htmlfile", sense: "lxml.etree.Element", ): # this element is 1st-level (directly under ) self.writeSenseGrams(hf, sense) self.makeList( hf, sense.findall("sense", self.ns), self.writeSenseSense, single_prefix="", ) self.writeSenseSense(hf, sense) def getDirection(self, elem: "lxml.etree.Element"): lang = elem.get(self.xmlLang) if lang is None: return "" langObj = langDict[lang] if langObj is None: log.warning(f"unknown language {lang}") return "" if langObj.rtl: return "rtl" return "" def writeSenseList( self, hf: "lxml.etree.htmlfile", senseList: "List[lxml.etree.Element]", ): # these elements are 1st-level (directly under ) if not senseList: return if self._auto_rtl and self.getDirection(senseList[0]) == "rtl": with hf.element("div", dir="rtl"): self.makeList( hf, senseList, self.writeSense, ordered=(len(senseList) > 3), ) return self.makeList( hf, senseList, self.writeSense, # list_type="A", ) def normalizeGramGrpChild(self, elem) -> str: # child can be "pos" or "gen" tag = elem.tag text = elem.text.strip() if tag == f"{tei}pos": return self.posMapping.get(text.lower(), text) if tag == f"{tei}gen": return self.genderMapping.get(text.lower(), text) if tag in (f"{tei}num", f"{tei}number"): return self.numberMapping.get(text.lower(), text) if tag == f"{tei}subc": return self.subcMapping.get(text.lower(), text) if tag == f"{tei}gram": _type = elem.get("type") if _type: if _type == "pos": return self.posMapping.get(text.lower(), text) if _type == "gen": return self.genderMapping.get(text.lower(), text) if _type in ("num", "number"): return self.numberMapping.get(text.lower(), text) if _type == "subc": return self.subcMapping.get(text.lower(), text) log.warning(f"unrecognize type={_type!r}: {self.tostring(elem)}") return text else: log.warning(f" with no type: {self.tostring(elem)}") return text if tag == f"{tei}note": return text log.warning(f"unrecognize GramGrp child tag: {self.tostring(elem)}") return "" def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry": from lxml import etree as ET glos = self._glos keywords = [] f = BytesIO() pron_color = self._pron_color if self._discover: for elem in entry.iter(): if elem.tag not in self.supportedTags: self._discoveredTags[elem.tag] = elem def br(): return ET.Element("br") inflectedKeywords = [] for form in entry.findall("form", self.ns): inflected = form.get("type") == "infl" for orth in form.findall("orth", self.ns): if not orth.text: continue if inflected: inflectedKeywords.append(orth.text) else: keywords.append(orth.text) keywords += inflectedKeywords pronList = [ pron.text.strip('/') for pron in entry.findall("form/pron", self.ns) if pron.text ] senseList = entry.findall("sense", self.ns) with ET.htmlfile(f, encoding="utf-8") as hf: with hf.element("div"): if self._word_title: for keyword in keywords: with glos.titleElement(hf, keyword): hf.write(keyword) hf.write(br()) # TODO: "form/usg" # Brit # US # ... if pronList: for i, pron in enumerate(pronList): if i > 0: hf.write(self.getCommaSep(pron)) hf.write("/") with hf.element("font", color=pron_color): hf.write(f"{pron}") hf.write("/") hf.write(br()) hf.write("\n") self.writeGramGroups(hf, entry.findall("gramGrp", self.ns)) self.writeSenseList(hf, senseList) defi = f.getvalue().decode("utf-8") # defi = defi.replace("\xa0", " ") # do we need to do this? 
return self._glos.newEntry( keywords, defi, defiFormat="h", byteProgress=(self._file.tell(), self._fileSize), ) def setWordCount(self, header): extent_elem = header.find(".//extent", self.ns) if extent_elem is None: log.warning( "did not find 'extent' tag in metedata" ", progress bar will not word" ) return extent = extent_elem.text if not extent.endswith(" headwords"): log.warning(f"unexpected extent={extent}") return try: self._wordCount = int(extent.split(" ")[0].replace(",", "")) except Exception: log.exception(f"unexpected extent={extent}") def tostring(self, elem: "lxml.etree.Element") -> str: from lxml import etree as ET return ET.tostring( elem, method="html", pretty_print=True, ).decode("utf-8").strip() def stripParag(self, elem: "lxml.etree.Element") -> str: text = self.tostring(elem) text = self._p_pattern.sub("\\2", text) return text def stripParagList(self, elems: "List[lxml.etree.Element]") -> str: lines = [] for elem in elems: for line in self.stripParag(elem).split("\n"): line = line.strip() if not line: continue lines.append(line) return "\n".join(lines) def setGlosInfo(self, key: str, value: str) -> None: self._glos.setInfo(key, unescape_unicode(value)) def setCopyright(self, header): elems = header.findall(".//availability//p", self.ns) if not elems: log.warning("did not find copyright") return copyright = self.stripParagList(elems) copyright = self.replaceRefLink(copyright) self.setGlosInfo("copyright", copyright) log.debug(f"Copyright: {copyright!r}") def setPublisher(self, header): elem = header.find(".//publisher", self.ns) if elem is None or not elem.text: log.warning("did not find publisher") return self.setGlosInfo("publisher", elem.text) def setCreationTime(self, header): elem = header.find(".//publicationStmt/date", self.ns) if elem is None or not elem.text: return self.setGlosInfo("creationTime", elem.text) def replaceRefLink(self, text: str) -> str: text = self._ref_pattern.sub('\\2', text) return text def setDescription(self, header): elems = [] for tag in ("sourceDesc", "projectDesc"): elems += header.findall(f".//{tag}//p", self.ns) desc = self.stripParagList(elems) if not desc: return website_list = [] for match in self._website_pattern.findall(desc): if not match[1]: continue website_list.append(match[1]) if website_list: website = " | ".join(website_list) self.setGlosInfo("website", website) desc = self._website_pattern.sub("", desc).strip() log.debug(f"Website: {website}") desc = self.replaceRefLink(desc) self.setGlosInfo("description", desc) log.debug( "------------ Description: ------------\n" f"{desc}\n" "--------------------------------------" ) def setMetadata(self, header): self.setWordCount(header) self.setGlosInfo("name", header.find(".//title", self.ns).text) edition = header.find(".//edition", self.ns) if edition is not None and edition.text: self.setGlosInfo("edition", edition.text) self.setCopyright(header) self.setPublisher(header) self.setCreationTime(header) self.setDescription(header) def __init__(self, glos: GlossaryType): self._glos = glos self._filename = "" self._file = None self._fileSize = 0 self._wordCount = 0 self._discoveredTags = dict() self._p_pattern = re.compile( ']*?)?>(.*?)

          ', re.DOTALL, ) self._ref_pattern = re.compile( '(.*?)', ) self._website_pattern = re.compile( 'Home: <(ref|ptr) target="(.*)">(.*)', ) def __len__(self) -> int: return self._wordCount def close(self) -> None: if self._file: self._file.close() self._file = None self._filename = "" self._fileSize = 0 def open( self, filename: str, ): try: from lxml import etree as ET except ModuleNotFoundError as e: e.msg += f", run `{pip} install lxml` to install" raise e self._filename = filename _file = compressionOpen(filename, mode="rb") _file.seek(0, 2) self._fileSize = _file.tell() _file.seek(0) self._glos.setDefaultDefiFormat("h") if self._word_title: self._glos.setInfo("definition_has_headwords", "True") self._glos.setInfo("input_file_size", f"{self._fileSize}") context = ET.iterparse( _file, events=("end",), tag=f"{tei}teiHeader", ) for action, elem in context: self.setMetadata(elem) break _file.close() def __iter__(self) -> "Iterator[BaseEntry]": from lxml import etree as ET if self._auto_rtl is None: glos = self._glos if ( glos.sourceLang and glos.sourceLang.rtl or glos.targetLang and glos.targetLang.rtl ): log.info("setting auto_rtl=True") self._auto_rtl = True self._file = compressionOpen(self._filename, mode="rb") context = ET.iterparse( self._file, events=("end",), tag=f"{tei}entry", ) for action, elem in context: yield self.getEntryByElem(elem) # clean up preceding siblings to save memory # this reduces memory usage from ~64 MB to ~30 MB while elem.getprevious() is not None: del elem.getparent()[0] if self._discoveredTags: log.info("Found unsupported tags") for tag, elem in self._discoveredTags.items(): log.info(f"{self.tostring(elem)}\n") pyglossary-4.5.0/pyglossary/plugins/gettext_mo.py000066400000000000000000000006101417733132500223650ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = False lname = "gettext_mo" format = "GettextMo" description = "Gettext Binary (mo)" extensions = (".mo",) extensionCreate = ".mo" singleFile = True kind = "binary" wiki = "https://en.wikipedia.org/wiki/Gettext" website = ( "https://www.gnu.org/software/gettext", "gettext - GNU Project", ) optionsProp = {} pyglossary-4.5.0/pyglossary/plugins/gettext_po.py000066400000000000000000000064511417733132500224010ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = True lname = "gettext_po" format = "GettextPo" description = "Gettext Source (.po)" extensions = (".po",) extensionCreate = ".po" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/Gettext" website = ( "https://www.gnu.org/software/gettext", "gettext - GNU Project", ) optionsProp = { "resources": BoolOption(comment="Enable resources / data files"), } class Reader(object): depends = { "polib": "polib", } def __init__(self, glos: GlossaryType): self._glos = glos self.clear() def clear(self): self._filename = "" self._file = None self._wordCount = None self._resDir = "" self._resFileNames = [] def open(self, filename): self._filename = filename self._file = open(filename) self._resDir = filename + "_res" if isdir(self._resDir): self._resFileNames = os.listdir(self._resDir) else: self._resDir = "" self._resFileNames = [] def close(self): if self._file: self._file.close() self.clear() def __len__(self): from pyglossary.file_utils import fileCountLines if self._wordCount is None: log.debug("Try not to use len(reader) as it takes extra time") self._wordCount = fileCountLines( self._filename, newline="\nmsgid", ) return 
self._wordCount def __iter__(self): try: from polib import unescape as po_unescape except ModuleNotFoundError as e: e.msg += f", run `{pip} install polib` to install" raise e word = "" defi = "" msgstr = False wordCount = 0 for line in self._file: line = line.strip() if not line: continue if line.startswith("#"): continue if line.startswith("msgid "): if word: yield self._glos.newEntry(word, defi) wordCount += 1 word = "" defi = "" else: pass # TODO: parse defi and set glos info? # but this should be done in self.open word = po_unescape(line[6:]) msgstr = False elif line.startswith("msgstr "): if msgstr: log.error("msgid omitted!") defi = po_unescape(line[7:]) msgstr = True else: if msgstr: defi += po_unescape(line) else: word += po_unescape(line) if word: yield self._glos.newEntry(word, defi) wordCount += 1 self._wordCount = wordCount class Writer(object): depends = { "polib": "polib", } _resources: bool = True def __init__(self, glos: GlossaryType): self._glos = glos self._filename = None self._file = None def open(self, filename: str): self._filename = filename self._file = _file = open(filename, mode="wt", encoding="utf-8") _file.write('#\nmsgid ""\nmsgstr ""\n') for key, value in self._glos.iterInfo(): _file.write(f'"{key}: {value}\\n"\n') def finish(self): self._filename = None if self._file: self._file.close() self._file = None def write(self) -> "Generator[None, BaseEntry, None]": try: from polib import escape as po_escape except ModuleNotFoundError as e: e.msg += f", run `{pip} install polib` to install" raise e resources = self._resources _file = self._file while True: entry = yield if entry is None: break if entry.isData(): if resources: entry.save(filename + "_res") continue _file.write( f"msgid {po_escape(entry.s_word)}\n" f"msgstr {po_escape(entry.defi)}\n\n" ) pyglossary-4.5.0/pyglossary/plugins/html_dir.py000066400000000000000000000273301417733132500220200ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from pyglossary.text_utils import ( escapeNTB, unescapeNTB, ) import html import os import json enable = True lname = "html_dir" format = "HtmlDir" description = "HTML Directory" extensions = (".hdir",) extensionCreate = ".hdir/" singleFile = False kind = "directory" wiki = "" website = None optionsProp = { "encoding": EncodingOption(), "resources": BoolOption( comment="Enable resources / data files", ), "max_file_size": IntOption( comment="Maximum file size in bytes", ), "filename_format": StrOption( comment="Filename format, default: {n:05d}.html", ), "escape_defi": BoolOption( comment="Escape definitions", ), "dark": BoolOption( comment="Use dark style", ), "css": StrOption( comment="Path to css file", ), "word_title": BoolOption( comment="Add headwords title to begining of definition", ), } nbsp = "\xa0" # nbsp = " " darkStyle = """ body {{ background-color: #373737; color: #eee; }} a {{ color: #aaaaff; }} a.broken {{ color: #e0c0c0; }} a.no_ul {{ text-decoration: none; }} b.headword {{ font-size: 1.5em; color: #c7ffb9; }} h1 {{ font-size: 1.5em; color: #c7ffb9;}} h2 {{ font-size: 1.3em;}} h3 {{ font-size: 1.0em;}} h4 {{ font-size: 1.0em;}} h5 {{ font-size: 1.0em;}} h6 {{ font-size: 1.0em;}} """ class Writer(object): depends = { "cachetools": "cachetools", } _encoding: str = "utf-8" _resources: bool = True _max_file_size: int = 102400 _filename_format: str = "{n:05d}.html" _escape_defi: bool = False _dark: bool = True _css: str = "" _word_title: bool = True def __init__(self, glos: GlossaryType): self._glos = glos 
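# A quick illustration of the filename_format option declared above
# (a minimal sketch; "00003.html" is just the hypothetical result):
#   "{n:05d}.html".format(n=3)  # -> "00003.html"
# Pages are numbered sequentially as the glossary is split across
# files no larger than max_file_size.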
self._filename = None self._fileObj = None self._encoding = "utf-8" self._filename_format = "{n:05d}.html" self._tail = "" self._filenameList = [] def open(self, filename: str): from cachetools import LRUCache self._filename = filename self._resDir = resDir = join(filename, "res") if not isdir(filename): os.mkdir(filename) if not isdir(resDir): os.mkdir(resDir) if self._css: self.copyCSS(self._css) def copyCSS(self, cssPath): import shutil shutil.copy(self._css, join(self._filename, "style.css")) def finish(self): pass def getNextFilename(self): return self._filename_format.format( n=len(self._filenameList) ) def nextFile(self): if self._fileObj: self._fileObj.write(self._tail) self._fileObj.close() filename = self.getNextFilename() self._filenameList.append(filename) self._fileObj = open( join( self._filename, filename, ), mode="w", encoding=self._encoding, ) return self._fileObj def fixLinks(self, linkTargetSet): import gc from cachetools import LRUCache gc.collect() dirn = self._filename filenameList = self._filenameList fileByWord = {} for line in open(join(dirn, "index.txt"), encoding="utf-8"): line = line.rstrip("\n") if not line: continue entryIndex, wordEsc, filename, _ = line.split("\t") entryIndex = int(entryIndex) # entryId = f"entry{entryIndex}" word = unescapeNTB(wordEsc) if word not in linkTargetSet: continue if word in fileByWord: # log.info(f'fileByWord[{word}]={fileByWord[word]}, filename={filename}') fileByWord[word].append((filename, entryIndex)) else: fileByWord[word] = [(filename, entryIndex)] linksByFile = LRUCache(maxsize=100) # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile: # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t") def getLinksByFile(fileIndex): _file = linksByFile.get(fileIndex) if _file is not None: return _file _file = open( join(dirn, f"links{fileIndex}"), mode="a", encoding="utf-8", ) linksByFile[fileIndex] = _file return _file log.info("") for line in open(join(dirn, "links.txt"), encoding="utf-8"): line = line.rstrip("\n") if not line: continue target, fileIndex, x_start, x_size = line.split("\t") target = unescapeNTB(target) if target not in fileByWord: targetNew = "" else: targetFilename, targetEntryIndex = fileByWord[target][0] if targetFilename == filename: continue targetNew = f"{targetFilename}#entry{targetEntryIndex}" _file = getLinksByFile(int(fileIndex)) _file.write( f"{x_start}\t{x_size}\t{targetNew}\n" ) _file.flush() for _, _file in linksByFile.items(): _file.close() del linksByFile linkTargetSet.clear() del fileByWord, linkTargetSet gc.collect() entry_url_fmt = self._glos.getInfo("entry_url") re_href = re.compile( b' href="[^<>"]*?"', re.I, ) for fileIndex, filename in enumerate(filenameList): if not isfile(join(dirn, f"links{fileIndex}")): continue with open(join(dirn, filename), mode="rb") as inFile: with open(join(dirn, f"{filename}.new"), mode="wb") as outFile: for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"): outFile.flush() linkLine = linkLine.rstrip(b"\n") x_start, x_size, target = linkLine.split(b"\t") outFile.write(inFile.read( int(x_start, 16) - inFile.tell() )) curLink = inFile.read(int(x_size, 16)) if target: outFile.write(re_href.sub( b' href="./' + target + b'"', curLink, )) continue if not entry_url_fmt: outFile.write(curLink.replace( b' href="#', b' class="broken" href="#', )) continue _st = curLink.decode("utf-8") i = _st.find('href="#') j = _st.find('"', i + 7) word = _st[i + 7:j] url = entry_url_fmt.format(word=word) outFile.write(( _st[:i] + f'class="broken" 
href="{url}"' + _st[j + 1:] ).encode("utf-8")) outFile.write(inFile.read()) os.rename(join(dirn, f"{filename}.new"), join(dirn, filename)) os.remove(join(dirn, f"links{fileIndex}")) def writeInfo(self, filename, header): glos = self._glos title = glos.getInfo("name") encoding = self._encoding customStyle = ( 'table, th, td {border: 1px solid black; ' 'border-collapse: collapse; padding: 5px;}' ) infoHeader = header.format( pageTitle=f"Info: {title}", customStyle=customStyle, ) with open( join(filename, "info.html"), mode="w", encoding="utf-8", ) as _file: _file.write( infoHeader + '' '' '' '' '\n' ) for key, value in glos.iterInfo(): _file.write( f'\n' ) _file.write("
<tr><td>{key}</td><td>{value}</td></tr>\n' ) _file.write("</table></body></html>
          ") def write(self) -> "Generator[None, BaseEntry, None]": encoding = self._encoding resources = self._resources max_file_size = self._max_file_size filename_format = self._filename_format escape_defi = self._escape_defi wordSep = ' | ' initFileSizeMax = 100 glos = self._glos filename = self._filename self._encoding = encoding self._filename_format = filename_format entry_url_fmt = glos.getInfo("entry_url") def getEntryWebLink(entry) -> str: if not entry_url_fmt: return "" url = entry_url_fmt.format(word=html.escape(entry.l_word[0])) return f'{nbsp}🌏' # from math import log2, ceil # maxPosHexLen = int(ceil(log2(max_file_size) / 4)) indexTxtFileObj = open( join(filename, "index.txt"), mode="w", encoding="utf-8", ) linksTxtFileObj = open( join(filename, "links.txt"), mode="w", encoding="utf-8", ) title = glos.getInfo("name") style = "" if self._dark: style = darkStyle if self._css: cssLink = '' else: cssLink = "" header = ( '\n' '' f'{{pageTitle}}' f'' f'{cssLink}' '\n' ) def pageHeader(n: int): return header.format( pageTitle=f"Page {n} of {title}", customStyle="", ) def navBar() -> str: links = [] if len(self._filenameList) > 1: links.append(f'') links.append(f'') links.append(f'ℹ️
        ') return ( '' ) tailSize = len(self._tail.encode(encoding)) if max_file_size < len(header) + tailSize: raise ValueError(f"max_file_size={max_file_size} is too small") max_file_size -= tailSize if not isdir(self._filename): os.mkdir(self._filename) fileObj = self.nextFile() fileObj.write(pageHeader(0)) fileObj.write(navBar()) re_fixed_link = re.compile( r']*? )?href="#([^<>"]+?)">[^<>]+?', re.I, ) linkTargetSet = set() def replaceBword(text) -> str: return text.replace( ' href="bword://', ' href="#', ) def addLinks(text: str, pos: int) -> str: for m in re_fixed_link.finditer(text): if ' class="entry_link"' in m.group(0): continue if m.group(0).count("href=") != 1: log.error(f"unexpected match: {m.group(0)}") target = html.unescape(m.group(1)) linkTargetSet.add(target) start = m.start() b_start = len(text[:start].encode(encoding)) b_size = len(text[start:m.end()].encode(encoding)) linksTxtFileObj.write( f"{escapeNTB(target)}\t" f"{len(self._filenameList)-1}\t" f"{hex(pos+b_start)[2:]}\t" f"{hex(b_size)[2:]}\n" ) linksTxtFileObj.flush() self.writeInfo(filename, header) _word_title = self._word_title resDir = self._resDir entryIndex = -1 while True: entryIndex += 1 entry = yield if entry is None: break if entry.isData(): if resources: entry.save(resDir) continue if entry.defi.startswith('') and defiFormat != "h": log.error(f"bad defiFormat={defiFormat}") defiFormat = "h" entry.detectDefiFormat() entry.stripFullHtml() defi = entry.defi defiFormat = entry.defiFormat if defiFormat == "m": defi = html.escape(defi) if "\n" in defi: # could be markdown or unformatted plaintext # FIXME: this changes the font to a monospace defi = f'
<pre>{defi}</pre>
        ' elif defiFormat == "h": if escape_defi: defi = html.escape(defi) defi = defi.replace(' src="./', ' src="./res/') entryId = f"entry{entryIndex}" if _word_title: words = [ html.escape(word) for word in entry.l_word ] title = glos.wordTitleStr( wordSep.join(words), sample=entry.l_word[0], _class="headword", ) if not title: title = f'Entry {entryIndex}' # entry_link_sym = "¶" entry_link_sym = "🔗" text = ( f'
<div id="{entryId}">{title}{nbsp}{nbsp}' f'<a class="no_ul" class="entry_link" href="#{entryId}">' f'{entry_link_sym}</a>' f'{getEntryWebLink(entry)}' f"<br>\n{defi}" '</div>\n' '<hr>
        \n' ) pos = fileObj.tell() if pos > initFileSizeMax: if pos > max_file_size - len(text.encode(encoding)): fileObj = self.nextFile() fileObj.write(pageHeader( len(self._filenameList) - 1 )) fileObj.write(navBar()) pos = fileObj.tell() tmpFilename = escapeNTB(self._filenameList[-1]) for word in entry.l_word: indexTxtFileObj.write( f"{entryIndex}\t" f"{escapeNTB(word)}\t" f"{tmpFilename}\t" f"{pos}\n" ) del tmpFilename text = replaceBword(text) addLinks(text, pos) fileObj.write(text) fileObj.close() self._fileObj = None indexTxtFileObj.close() if linkTargetSet: log.info(f"{len(linkTargetSet)} link targets found") log.info("Fixing links, please wait...") self.fixLinks(linkTargetSet) os.remove(join(filename, "links.txt")) pyglossary-4.5.0/pyglossary/plugins/info_plugin.py000066400000000000000000000076201417733132500225270ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = True lname = "info" format = "Info" description = "Glossary Info (.info)" extensions = (".info",) extensionCreate = ".info" singleFile = True kind = "text" wiki = "" website = None # key is option/argument name, value is instance of Option optionsProp = {} class Writer(object): def __init__(self, glos: GlossaryType): self._glos = glos self._filename = None self._file = None def open(self, filename: str): self._filename = filename self._file = open(filename, mode="wt", encoding="utf-8") def finish(self): self._filename = None if self._file: self._file.close() self._file = None def write(self) -> "Generator[None, BaseEntry, None]": import re from collections import Counter, OrderedDict from pyglossary.json_utils import dataToPrettyJson from pyglossary.langs.writing_system import getWritingSystemFromText glos = self._glos re_possible_html = re.compile( r"<[a-z1-6]+[ />]", re.I, ) re_style = re.compile( r"<([a-z1-6]+)[^<>]* style=", re.I | re.DOTALL, ) wordCount = 0 bwordCount = 0 styleByTagCounter = Counter() defiFormatCounter = Counter() firstTagCounter = Counter() allTagsCounter = Counter() sourceScriptCounter = Counter() dataEntryExtCounter = Counter() while True: entry = yield if entry is None: break defi = entry.defi wordCount += 1 bwordCount += defi.count("bword://") for m in re_style.finditer(defi): tag = m.group(1) styleByTagCounter[tag] += 1 entry.detectDefiFormat() defiFormat = entry.defiFormat defiFormatCounter[defiFormat] += 1 if defiFormat == "m": if re_possible_html.match(defi): log.warning(f"undetected html defi: {defi}") elif defiFormat == "h": match = re_possible_html.search(defi) if match is not None: tag = match.group().strip("< />").lower() firstTagCounter[tag] += 1 for tag in re_possible_html.findall(defi): tag = tag.strip("< />").lower() allTagsCounter[tag] += 1 elif defiFormat == "b": filenameNoExt, ext = splitext(entry.s_word) ext = ext.lstrip(".") dataEntryExtCounter[ext] += 1 ws = getWritingSystemFromText(entry.s_word) if ws: wsName = ws.name else: log.debug(f"No script detected for word: {entry.s_word}") wsName = "None" sourceScriptCounter[wsName] += 1 data_entry_count = defiFormatCounter["b"] del defiFormatCounter["b"] info = OrderedDict() for key, value in glos.iterInfo(): info[key] = value info["word_count"] = wordCount info["bword_count"] = bwordCount info["data_entry_count"] = data_entry_count info["data_entry_extension_count"] = ", ".join( f"{ext}={count}" for ext, count in dataEntryExtCounter.most_common() ) info["defi_format"] = ", ".join( f"{defiFormat}={count}" for defiFormat, count in sorted(defiFormatCounter.items()) ) 
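# For example, a glossary with 90 HTML and 10 plaintext definitions
# would yield (hypothetical counts): info["defi_format"] == "h=90, m=10"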
info["defi_tag"] = ", ".join( f"{defiFormat}={count}" for defiFormat, count in allTagsCounter.most_common() ) info["defi_first_tag"] = ", ".join( f"{defiFormat}={count}" for defiFormat, count in firstTagCounter.most_common() ) info["style"] = ", ".join( f"{defiFormat}={count}" for defiFormat, count in styleByTagCounter.most_common() ) info["source_script"] = ", ".join( f"{defiFormat}={count}" for defiFormat, count in sourceScriptCounter.most_common() ) self._file.write(dataToPrettyJson(info) + "\n") class Reader(object): def __init__(self, glos: GlossaryType): self._glos = glos def close(self) -> None: pass def open(self, filename: str) -> None: from pyglossary.json_utils import jsonToOrderedData with open(filename, "r", encoding="utf-8") as infoFp: info = jsonToOrderedData(infoFp.read()) for key, value in info.items(): self._glos.setInfo(key, value) def __len__(self) -> int: return 0 def __iter__(self) -> "Iterator[BaseEntry]": yield None pyglossary-4.5.0/pyglossary/plugins/jmdict.py000066400000000000000000000167051417733132500214740ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from io import BytesIO import re import html enable = True lname = "jmdict" format = "JMDict" description = "JMDict" extensions = () extensionCreate = "" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/JMdict" website = ( "https://www.edrdg.org/jmdict/j_jmdict.html", "The JMDict Project", ) optionsProp = { } class Reader(object): compressions = stdCompressions depends = { "lxml": "lxml", } tagStyle = ( "color:white;" "background:green;" "padding-left:3px;" "padding-right:3px;" "border-radius:0.5ex;" # 0.5ex ~= 0.3em, but "ex" is recommended ) re_inf_mapping = { "gikun (meaning as reading) or jukujikun (special kanji reading)": "gikun/jukujikun", "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete "word containing irregular kana usage": "irregular", } def makeList( self, hf: "lxml.etree.htmlfile", input_objects: "List[lxml.etree.Element]", processor: "Callable", single_prefix=None, skip_single=True ): """ Wrap elements into
          if more than one element """ if len(input_objects) == 0: return if len(input_objects) == 1: hf.write(single_prefix) processor(hf, input_objects[0]) return with hf.element("ol"): for el in input_objects: with hf.element("li"): processor(hf, el) def writeSense( self, hf: "lxml.etree.htmlfile", sense: "lxml.etree.Element", ): from lxml import etree as ET def br(): return ET.Element("br") for elem in sense.findall("pos"): if not elem.text: continue desc = elem.text if desc == "unclassified": continue with hf.element("i"): hf.write(f"{desc.capitalize()}") hf.write(br()) glossList = [ elem.text.strip() for elem in sense.findall("gloss") if elem.text ] if glossList: for i, gloss in enumerate(glossList): if i > 0: hf.write(", ") hf.write(gloss) hf.write(br()) relatedWords = [] for elem in sense.findall("xref"): if not elem.text: continue word = elem.text.strip() word = self._link_number_postfix.sub("", word) relatedWords.append(word) if relatedWords: hf.write("Related: ") for i, word in enumerate(relatedWords): if i > 0: with hf.element("big"): hf.write(" | ") with hf.element("a", href=f"bword://{word}"): hf.write(word) hf.write(br()) antonymWords = [] for elem in sense.findall("ant"): if not elem.text: continue word = elem.text.strip() word = self._link_number_postfix.sub("", word) antonymWords.append(word) if antonymWords: hf.write("Antonym: ") for i, word in enumerate(antonymWords): if i > 0: with hf.element("big"): hf.write(" | ") with hf.element("a", href=f"bword://{word}"): hf.write(word) hf.write(br()) for i, elem in enumerate(sense.findall("field")): if not elem.text: continue if i > 0: hf.write(" ") desc = elem.text with hf.element("span", style=self.tagStyle): hf.write(desc) hf.write(br()) for i, elem in enumerate(sense.findall("misc")): if not elem.text: continue if i > 0: hf.write(" ") desc = elem.text with hf.element("small"): with hf.element("span", style=self.tagStyle): hf.write(desc) hf.write(br()) def getEntryByElem(self, entry: "lxml.etree.Element") -> "BaseEntry": from lxml import etree as ET glos = self._glos keywords = [] f = BytesIO() def br(): return ET.Element("br") with ET.htmlfile(f, encoding="utf-8") as hf: kebList = [] # type: List[str] rebList = [] # type: List[str] with hf.element("div"): for k_ele in entry.findall("k_ele"): keb = k_ele.find("keb") if keb is None: continue kebList.append(keb.text) keywords.append(keb.text) # for elem in k_ele.findall("ke_pri"): # log.info(elem.text) for r_ele in entry.findall("r_ele"): reb = r_ele.find("reb") if reb is None: continue props = [] if r_ele.find("re_nokanji") is not None: props.append("no kanji") inf = r_ele.find("re_inf") if inf is not None: props.append( self.re_inf_mapping.get(inf.text, inf.text) ) rebList.append((reb.text, props)) keywords.append(reb.text) # for elem in r_ele.findall("re_pri"): # log.info(elem.text) # this is for making internal links valid # this makes too many alternates! 
# but we don't seem to have a choice # except for scanning and indexing all words once # and then starting over and fixing/optimizing links for keb in kebList: for reb, _ in rebList: keywords.append(f"{keb}・{reb}") if kebList: with glos.titleElement(hf, kebList[0]): for i, keb in enumerate(kebList): if i > 0: with hf.element("font", color="red"): hf.write(" | ") hf.write(keb) hf.write(br()) if rebList: for i, (reb, props) in enumerate(rebList): if i > 0: with hf.element("font", color="red"): hf.write(" | ") with hf.element("font", color="green"): hf.write(reb) for prop in props: hf.write(" ") with hf.element("small"): with hf.element("span", style=self.tagStyle): hf.write(prop) hf.write(br()) self.makeList( hf, entry.findall("sense"), self.writeSense, ) defi = f.getvalue().decode("utf-8") byteProgress = (self._file.tell(), self._fileSize) return self._glos.newEntry( keywords, defi, defiFormat="h", byteProgress=byteProgress, ) def tostring(self, elem: "lxml.etree.Element") -> str: from lxml import etree as ET return ET.tostring( elem, method="html", pretty_print=True, ).decode("utf-8").strip() def setCreationTime(self, header): m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) if m is None: return self._glos.setInfo("creationTime", m.group(1)) def setMetadata(self, header: str): # TODO: self.set_info("edition", ...) self.setCreationTime(header) def __init__(self, glos: GlossaryType): self._glos = glos self._wordCount = 0 self._filename = "" self._file = None self._fileSize = 0 self._link_number_postfix = re.compile("・[0-9]+$") def __len__(self) -> int: return self._wordCount def close(self) -> None: if self._file: self._file.close() self._file = None def open( self, filename: str, ): try: from lxml import etree as ET except ModuleNotFoundError as e: e.msg += f", run `{pip} install lxml` to install" raise e self._filename = filename self._fileSize = os.path.getsize(filename) self._glos.sourceLangName = "Japanese" self._glos.setDefaultDefiFormat("h") self._glos.setInfo("definition_has_headwords", "True") self._glos.setInfo("entry_url", f"https://jisho.org/search/{{word}}") # also good: f"https://sakuradict.com/search?q={{word}}" header = "" with compressionOpen(filename, mode="rt", encoding="utf-8") as _file: for line in _file: if "" in line: break header += line self.setMetadata(header) self._file = compressionOpen(filename, mode="rb") def __iter__(self) -> "Iterator[BaseEntry]": from lxml import etree as ET context = ET.iterparse( self._file, events=("end",), tag=f"entry", ) for action, elem in context: yield self.getEntryByElem(elem) # clean up preceding siblings to save memory # this reduces memory usage from ~64 MB to ~30 MB while elem.getprevious() is not None: del elem.getparent()[0] pyglossary-4.5.0/pyglossary/plugins/json_plugin.py000066400000000000000000000031651417733132500225450ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = True lname = "json" format = "Json" description = "JSON (.json)" extensions = (".json",) extensionCreate = ".json" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/JSON" website = ( "https://www.json.org/json-en.html", "www.json.org", ) optionsProp = { "encoding": EncodingOption(), "enable_info": BoolOption(comment="Enable glossary info / metedata"), "resources": BoolOption(comment="Enable resources / data files"), "word_title": BoolOption( comment="add headwords title to begining of definition", ), } class Writer(object): _encoding: str = "utf-8" 
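# A sketch of the JSON emitted by write() below for a single entry
# ("hello" -> "greeting") with default options; the exact info-key
# prefix comes from the shared writeTxt helper, so treat this sample
# as hypothetical:
# {
# 	"##name": "My Glossary",
# 	"hello": "greeting",
# 	"": ""
# }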
_enable_info: bool = True _resources: bool = True _word_title: bool = False compressions = stdCompressions def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None glos.preventDuplicateWords() def open(self, filename: str): self._filename = filename def finish(self): self._filename = None def write(self) -> "Generator[None, BaseEntry, None]": from json import dumps from pyglossary.text_writer import writeTxt glos = self._glos encoding = self._encoding enable_info = self._enable_info resources = self._resources ascii = encoding == "ascii" def escape(st): return dumps(st, ensure_ascii=ascii) yield from writeTxt( glos, entryFmt="\t{word}: {defi},\n", filename=self._filename, encoding=encoding, writeInfo=enable_info, wordEscapeFunc=escape, defiEscapeFunc=escape, ext=".json", head="{\n", tail='\t"": ""\n}', resources=resources, word_title=self._word_title, ) pyglossary-4.5.0/pyglossary/plugins/lingoes_ldf.py000066400000000000000000000061351417733132500225030ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from pyglossary.text_reader import TextGlossaryReader from pyglossary.text_utils import splitByBar from pyglossary.file_utils import fileCountLines enable = True lname = "lingoes_ldf" format = "LingoesLDF" description = "Lingoes Source (.ldf)" extensions = (".ldf",) extensionCreate = ".ldf" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/Lingoes" website = ( "http://www.lingoes.net/en/dictionary/dict_format.php", "Lingoes.net", ) optionsProp = { "newline": NewlineOption(), "resources": BoolOption(comment="Enable resources / data files"), "encoding": EncodingOption(), } class Reader(TextGlossaryReader): compressions = stdCompressions def __len__(self): if self._wordCount is None: log.debug("Try not to use len(reader) as it takes extra time") self._wordCount = fileCountLines( self._filename, newline="\n\n", ) - self._leadingLinesCount return self._wordCount def isInfoWord(self, word): if isinstance(word, str): return word.startswith("#") else: return False def fixInfoWord(self, word): if isinstance(word, str): return word.lstrip("#").lower() else: return word def nextPair(self): if not self._file: raise StopIteration entryLines = [] while True: line = self.readline() if not line: raise StopIteration line = line.rstrip("\n\r") # FIXME if line.startswith("###"): parts = line.split(":") key = parts[0].strip() value = ":".join(parts[1:]).strip() return key, value if line: entryLines.append(line) continue # now `line` is empty, process `entryLines` if not entryLines: return if len(entryLines) < 2: log.error( f"invalid block near line {fileObj.line}" f" in file {filename}" ) return word = entryLines[0] defi = "\n".join(entryLines[1:]) defi = defi.replace("
          ", "\n") # FIXME word = splitByBar(word) return word, defi class Writer(object): compressions = stdCompressions _newline: str = "\n" _resources: str = True def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None def getInfo(self, key): return self._glos.getInfo(key).replace("\n", "
          ") def getAuthor(self): return self._glos.author.replace("\n", "
          ") def finish(self): self._filename = None def open(self, filename: str): self._filename = filename def write(self) -> "Generator[None, BaseEntry, None]": from pyglossary.text_writer import writeTxt newline = self._newline resources = self._resources head = ( f"###Title: {self.getInfo('title')}\n" f"###Description: {self.getInfo('description')}\n" f"###Author: {self.getAuthor()}\n" f"###Email: {self.getInfo('email')}\n" f"###Website: {self.getInfo('website')}\n" f"###Copyright: {self.getInfo('copyright')}\n" ) yield from writeTxt( self._glos, entryFmt="{word}\n{defi}\n\n", filename=self._filename, writeInfo=False, defiEscapeFunc=replaceStringTable([ ("\n", "
          "), ]), ext=".ldf", head=head, newline=newline, resources=resources, ) pyglossary-4.5.0/pyglossary/plugins/octopus_mdict_new/000077500000000000000000000000001417733132500233645ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/octopus_mdict_new/__init__.py000066400000000000000000000143501417733132500255000ustar00rootroot00000000000000# -*- coding: utf-8 -*- # Read Octopus MDict dictionary format, mdx(dictionary)/mdd(data) # # Copyright © 2013 Xiaoqiang Wang # Copyright © 2013-2021 Saeed Rasooli # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from pyglossary.plugins.formats_common import * import os import sys import gc from os.path import splitext, isfile, isdir, extsep, basename, dirname enable = True lname = "octopus_mdict" format = "OctopusMdict" description = "Octopus MDict (.mdx)" extensions = (".mdx",) extensionCreate = "" singleFile = False kind = "binary" wiki = "" website = ( "https://www.mdict.cn/wp/?page_id=5325&lang=en", "Download | MDict.cn", ) optionsProp = { "encoding": EncodingOption(), "substyle": BoolOption( comment="Enable substyle", ), "same_dir_data_files": BoolOption( comment="Read data files from same directory", ), "audio": BoolOption( comment="Enable audio objects", ), } extraDocs = [ ( "`python-lzo` is required for **some** MDX glossaries.", """First try converting your MDX file, if failed (`AssertionError` probably), then try to install [LZO library and Python binding](./doc/lzo.md).""" ), ] class Reader(object): _encoding: str = "" _substyle: bool = True _same_dir_data_files: bool = False _audio: bool = False def __init__(self, glos): self._glos = glos self.clear() self._re_internal_link = re.compile('href=(["\'])(entry://|[dx]:)') self._re_audio_link = re.compile( ']*? 
)?href="sound://([^<>"]+)"( .*?)?>(.*?)' ) def clear(self): self._filename = "" self._mdx = None self._mdd = [] self._wordCount = 0 self._dataEntryCount = 0 # dict of mainWord -> newline-separated alternatives self._linksDict = {} # type: Dict[str, str] def open(self, filename): from pyglossary.plugin_lib.readmdict import MDX, MDD self._filename = filename self._mdx = MDX(filename, self._encoding, self._substyle) """ multiple MDD files are supported with this naming schema: FILE.mdx FILE.mdd FILE.1.mdd FILE.2.mdd FILE.3.mdd """ filenameNoExt, ext = splitext(self._filename) mddBase = "".join([filenameNoExt, extsep]) for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"): if isfile(fname): self._mdd.append(MDD(fname)) mddN = 2 while isfile(f"{mddBase}{mddN}.mdd"): self._mdd.append(MDD(f"{mddBase}{mddN}.mdd")) mddN += 1 dataEntryCount = 0 for mdd in self._mdd: dataEntryCount += len(mdd) self._dataEntryCount = dataEntryCount log.info(f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries") log.debug("mdx.header = " + pformat(self._mdx.header)) # for key, value in self._mdx.header.items(): # key = key.lower() # self._glos.setInfo(key, value) try: title = self._mdx.header[b"Title"] except KeyError: pass else: title = title.strip() if title: self._glos.setInfo("name", title) desc = self._mdx.header.get(b"Description", "") if desc: self._glos.setInfo("description", desc) self.loadLinks() def loadLinks(self): from pyglossary.plugin_lib.readmdict import MDX log.info("extracting links...") linksDict = {} word = "" wordCount = 0 for b_word, b_defi in self._mdx.items(): word = b_word.decode("utf-8") defi = b_defi.decode("utf-8").strip() if defi.startswith("@@@LINK="): if not word: log.warning(f"unexpected defi: {defi}") continue mainWord = defi[8:] if mainWord in linksDict: linksDict[mainWord] += "\n" + word else: linksDict[mainWord] = word continue wordCount += 1 log.info( "extracting links done, " f"sizeof(linksDict)={sys.getsizeof(linksDict)}" ) log.info(f"wordCount = {wordCount}") self._linksDict = linksDict self._wordCount = wordCount self._mdx = MDX(self._filename, self._encoding, self._substyle) def fixDefi(self, defi: str) -> str: defi = self._re_internal_link.sub(r'href=\1bword://', defi) defi = defi.replace(' src="file://', ' src=".') if self._audio: # \5 is the possible elements between and # but anything between and is completely # ignored by Aaard2 Web and browser # and there is no point adding it after # which makes it shown after audio controls # GoldenDict acts completely different, so must use # audio_goldendict=True option in StarDict writer instead. 
defi = self._re_audio_link.sub( r'', defi, ) return defi def __iter__(self): if self._mdx is None: log.error("trying to iterate on a closed MDX file") return glos = self._glos linksDict = self._linksDict for b_word, b_defi in self._mdx.items(): word = b_word.decode("utf-8") defi = b_defi.decode("utf-8").strip() if defi.startswith("@@@LINK="): continue defi = self.fixDefi(defi) words = word altsStr = linksDict.get(word, "") if altsStr: words = [word] + altsStr.split("\n") yield glos.newEntry(words, defi) self._mdx = None del linksDict self._linksDict = {} gc.collect() if self._same_dir_data_files: dirPath = dirname(self._filename) for fname in os.listdir(dirPath): ext = splitext(fname)[1].lower() if ext in (".mdx", ".mdd"): continue fpath = join(dirPath, fname) with open(fpath, mode="rb") as _file: b_data = _file.read() yield glos.newDataEntry(fname, b_data) for mdd in self._mdd: try: for b_fname, b_data in mdd.items(): fname = toStr(b_fname) fname = fname.replace("\\", os.sep).lstrip(os.sep) yield glos.newDataEntry(fname, b_data) except Exception as e: log.exception(f"Error reading {mdd.filename}") self._mdd = [] def __len__(self): return self._wordCount + self._dataEntryCount def close(self): self.clear() pyglossary-4.5.0/pyglossary/plugins/paths.py000066400000000000000000000002631417733132500213310ustar00rootroot00000000000000from os.path import realpath, dirname, join, isdir import sys if hasattr(sys, 'frozen'): rootDir = dirname(sys.executable) else: rootDir = dirname(dirname(realpath(__file__))) pyglossary-4.5.0/pyglossary/plugins/sdict.py000066400000000000000000000156611417733132500213300ustar00rootroot00000000000000# -*- coding: utf-8 -*- # sdict.py # Loader engine for AXMASoft's open dictionary format # # Copyright © 2010-2019 Saeed Rasooli (ilius) # Copyright © 2006-2008 Igor Tkach, as part of SDict Viewer: # http://sdictviewer.sf.net # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from struct import unpack from pyglossary.plugins.formats_common import * enable = True lname = "sdict" format = "Sdict" description = "Sdictionary Binary(dct)" extensions = (".dct",) extensionCreate = ".dct" singleFile = True kind = "binary" wiki = "" website = ( "http://swaj.net/sdict/", "Sdictionary Project", ) optionsProp = { # "encoding": EncodingOption(), # TODO: needed? 
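# (Sdict defines no write-time options; at read time the codec is picked
#  as compressions[header.compressionType] in Reader.open below:
#  index 0 = none, 1 = gzip/zlib, 2 = bzip2.)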
} class GzipCompression(object): def __str__(self): return "gzip" def decompress(self, string): import zlib return zlib.decompress(string) class Bzip2Compression(object): def __str__(self): return "bzip2" def decompress(self, string): import bz2 return bz2.decompress(string) class NoCompression(object): def __str__(self): return "no compression" def decompress(self, string): return string compressions = [ NoCompression(), GzipCompression(), Bzip2Compression(), ] def read_raw(s, fe): return s[fe.offset:fe.offset + fe.length] def read_str(s, fe): return read_raw(s, fe).replace(b"\x00", b"") def read_int(s, fe=None): return unpack("> 4 self.num_of_words = read_int(st, self.f_num_of_words) self.title_offset = read_int(st, self.f_title) self.copyright_offset = read_int(st, self.f_copyright) self.version_offset = read_int(st, self.f_version) self.articles_offset = read_int(st, self.f_articles) self.short_index_offset = read_int(st, self.f_short_index) self.full_index_offset = read_int(st, self.f_full_index) class Reader(object): # _encoding: str = "utf-8" def __init__(self, glos): self._glos = glos self.clear() def clear(self): self._file = None self._filename = "" self._header = Header() def open(self, filename): self._file = open(filename, "rb") h = self._header h.parse(self._file.read(43)) self._compression = compressions[h.compressionType] self.short_index = self.readShortIndex() self._glos.setInfo("name", self.readUnit(h.title_offset)) self._glos.setInfo("version", self.readUnit(h.version_offset)) self._glos.setInfo("copyright", self.readUnit(h.copyright_offset)) log.debug(f"SDict word count: {len(self)}") # correct? FIXME def close(self): if self._file: self._file.close() self.clear() def __len__(self): return self._header.num_of_words def readUnit(self, pos): f = self._file f.seek(pos) record_length = read_int(f.read(4)) return self._compression.decompress(f.read(record_length)) def readShortIndex(self): self._file.seek(self._header.short_index_offset) s_index_depth = self._header.short_index_depth index_entry_len = (s_index_depth + 1) * 4 short_index_str = self._file.read( index_entry_len * self._header.short_index_length ) short_index_str = self._compression.decompress(short_index_str) index_length = self._header.short_index_length short_index = [{} for i in range(s_index_depth + 2)] depth_range = range(s_index_depth) for i in range(index_length): entry_start = start_index = i * index_entry_len short_word = "" try: for j in depth_range: # inlined unpack yields ~20% performance gain # compared to calling read_int() uchar_code = unpack( "", "\n").replace("
          ", "\n") yield self._glos.newEntry(word, defi) def readFullIndexItem(self, pointer): try: f = self._file f.seek(pointer) s = f.read(8) next_word = unpack("= self._header.articles_offset: log.error( "Warning: attempt to read word from " "illegal position in dict file" ) return None log.exception("") def readArticle(self, pointer): return self.readUnit(self._header.articles_offset + pointer) pyglossary-4.5.0/pyglossary/plugins/sdict_source.py000066400000000000000000000032351417733132500227020ustar00rootroot00000000000000# -*- coding: utf-8 -*- # Source Glossary for "Sdictionary" (http://sdict.org) # It has extension ".sdct" from pyglossary.plugins.formats_common import * enable = True lname = "sdict_source" format = "SdictSource" description = "Sdictionary Source (.sdct)" extensions = (".sdct",) extensionCreate = ".sdct" singleFile = True kind = "text" wiki = "" website = ( "http://swaj.net/sdict/", "Sdictionary Project", ) optionsProp = { "enable_info": BoolOption(comment="Enable glossary info / metedata"), "newline": NewlineOption(), "resources": BoolOption(comment="Enable resources / data files"), } class Writer(object): _enable_info: bool = True _newline: bool = "\n" _resources: bool = True def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None def getInfo(self, key): return self._glos.getInfo(key).replace("\n", "
          ") def finish(self): self._filename = None def open(self, filename: str) -> None: self._filename = filename def write(self) -> "Generator[None, BaseEntry, None]": from pyglossary.text_writer import writeTxt glos = self._glos head = "" if self._enable_info: head = ( "
          \n" f"title = {self.getInfo('name')}\n" f"author = {self.getInfo('author')}\n" f"description = {self.getInfo('description')}\n" f"w_lang = {glos.sourceLangName}\n" f"a_lang = {glos.targetLangName}\n" "
          \n#\n#\n#\n" ) yield from writeTxt( glos, entryFmt="{word}___{defi}\n", filename=self._filename, writeInfo=False, defiEscapeFunc=replaceStringTable([ ("\n", "
          "), ]), ext=".sdct", head=head, newline=self._newline, resources=self._resources, ) pyglossary-4.5.0/pyglossary/plugins/sql.py000066400000000000000000000063311417733132500210130ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = True lname = "sql" format = "Sql" description = "SQL (.sql)" extensions = (".sql",) extensionCreate = ".sql" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/SQL" website = None optionsProp = { "encoding": EncodingOption(), "info_keys": ListOption(comment="List of dbinfo table columns"), "add_extra_info": BoolOption(comment="Create dbinfo_extra table"), "newline": NewlineOption(), "transaction": BoolOption(comment="Use TRANSACTION"), } class Writer(object): _encoding: str = "utf-8" _info_keys: "Optional[List]" = None _add_extra_info: bool = True _newline: str = "
          " _transaction: bool = False def __init__(self, glos: "GlossaryType") -> None: self._glos = glos self._filename = None self._file = None def finish(self): self._filename = None if self._file: self._file.close() self._file = None def open(self, filename: str): self._filename = filename self._file = open(filename, "wt", encoding=self._encoding) self._writeInfo() def _writeInfo(self): fileObj = self._file newline = self._newline info_keys = self._getInfoKeys() infoDefLine = "CREATE TABLE dbinfo (" infoValues = [] glos = self._glos for key in info_keys: value = glos.getInfo(key) value = value\ .replace("\'", "\'\'")\ .replace("\x00", "")\ .replace("\r", "")\ .replace("\n", newline) infoValues.append(f"\'{value}\'") infoDefLine += f"{key} char({len(value)}), " infoDefLine = infoDefLine[:-2] + ");" fileObj.write(infoDefLine + "\n") if self._add_extra_info: fileObj.write( "CREATE TABLE dbinfo_extra (" "\'id\' INTEGER PRIMARY KEY NOT NULL, " "\'name\' TEXT UNIQUE, \'value\' TEXT);\n" ) fileObj.write( "CREATE TABLE word (\'id\' INTEGER PRIMARY KEY NOT NULL, " + "\'w\' TEXT, \'m\' TEXT);\n" ) if self._transaction: fileObj.write("BEGIN TRANSACTION;\n") fileObj.write(f"INSERT INTO dbinfo VALUES({','.join(infoValues)});\n") if self._add_extra_info: extraInfo = glos.getExtraInfos(info_keys) for index, (key, value) in enumerate(extraInfo.items()): key = key.replace("\'", "\'\'") value = value.replace("\'", "\'\'") fileObj.write( f"INSERT INTO dbinfo_extra VALUES({index+1}, " f"\'{key}\', \'{value}\');\n" ) def _getInfoKeys(self): info_keys = self._info_keys if info_keys: return info_keys return [ "dbname", "author", "version", "direction", "origLang", "destLang", "license", "category", "description", ] def write(self) -> "Generator[None, BaseEntry, None]": glos = self._glos newline = self._newline fileObj = self._file i = 0 while True: entry = yield if entry is None: break if entry.isData(): # FIXME continue word = entry.s_word defi = entry.defi word = word.replace("\'", "\'\'")\ .replace("\r", "").replace("\n", newline) defi = defi.replace("\'", "\'\'")\ .replace("\r", "").replace("\n", newline) fileObj.write( f"INSERT INTO word VALUES({i+1}, \'{word}\', \'{defi}\');\n" ) i += 1 if self._transaction: fileObj.write("END TRANSACTION;\n") fileObj.write("CREATE INDEX ix_word_w ON word(w COLLATE NOCASE);\n") pyglossary-4.5.0/pyglossary/plugins/stardict.py000066400000000000000000000577671417733132500220540ustar00rootroot00000000000000# -*- coding: utf-8 -*- import sys import os from os.path import ( dirname, getsize, realpath, ) import re import gzip from time import time as now from collections import Counter from operator import itemgetter from pyglossary.text_utils import ( uint32ToBytes, uint32FromBytes, ) from pyglossary.plugins.formats_common import * enable = True lname = "stardict" format = "Stardict" description = "StarDict (.ifo)" extensions = (".ifo",) extensionCreate = "-stardict/" sortOnWrite = ALWAYS sortKeyName = "stardict" sortEncoding = "utf-8" kind = "directory" wiki = "https://en.wikipedia.org/wiki/StarDict" website = ( "http://huzheng.org/stardict/", "huzheng.org/stardict", ) optionsProp = { "stardict_client": BoolOption( comment="Modify html entries for StarDict 3.0", ), "dictzip": BoolOption( comment="Compress .dict file to .dict.dz", ), "sametypesequence": StrOption( values=["", "h", "m", "x", None], comment="Definition format: h=html, m=plaintext, x=xdxf", ), "merge_syns": BoolOption( comment="Write alternates to .idx instead of .syn", ), "xdxf_to_html": BoolOption( 
comment="Convert XDXF entries to HTML", ), "unicode_errors": StrOption( values=[ "strict", # raise a UnicodeDecodeError exception "ignore", # just leave the character out "replace", # use U+FFFD, REPLACEMENT CHARACTER "backslashreplace", # insert a \xNN escape sequence ], comment="What to do with Unicode decoding errors", ), "audio_goldendict": BoolOption( comment="Convert audio links for GoldenDict (desktop)" ), "audio_icon": BoolOption( comment="Add glossary's audio icon", ), } if os.getenv("PYGLOSSARY_STARDICT_NO_FORCE_SORT") == "1": sortOnWrite = DEFAULT_YES infoKeys = ( "bookname", "author", "email", "website", "description", "date", ) # re_newline = re.compile("[\n\r]+") re_newline = re.compile("\n\r?|\r\n?") def newlinesToSpace(text: str) -> str: return re_newline.sub(" ", text) def newlinesToBr(text: str) -> str: return re_newline.sub("
          ", text) def verifySameTypeSequence(s: str) -> bool: if not s: return True if not s.isalpha(): log.error("Invalid sametypesequence option") return False return True class Reader(object): _xdxf_to_html = True _unicode_errors = "strict" def __init__(self, glos: GlossaryType): self._glos = glos self.clear() self._xdxfTr = None """ indexData format indexData[i] - i-th record in index file, a tuple (previously a list) of length 3 indexData[i][0] - b_word (bytes) indexData[i][1] - definition block offset in dict file (int) indexData[i][2] - definition block size in dict file (int) REMOVED: indexData[i][3] - list of definitions indexData[i][3][j][0] - definition data indexData[i][3][j][1] - definition type - "h", "m" or "x" indexData[i][4] - list of synonyms (strings) synDict: a dict { entryIndex -> altList } """ def xdxf_setup(self): from pyglossary.xdxf_transform import XdxfTransformer self._xdxfTr = XdxfTransformer(encoding="utf-8") def xdxf_transform(self, text: str): if self._xdxfTr is None: self.xdxf_setup() return self._xdxfTr.transformByInnerString(text) def close(self) -> None: if self._dictFile: self._dictFile.close() self.clear() def clear(self) -> None: self._dictFile = None self._filename = "" # base file path, no extension self._indexData = [] self._synDict = {} self._sametypesequence = "" self._resDir = "" self._resFileNames = [] self._wordCount = None def open(self, filename: str) -> None: if splitext(filename)[1].lower() == ".ifo": self._filename = splitext(filename)[0] else: self._filename = filename self._filename = realpath(self._filename) self.readIfoFile() sametypesequence = self._glos.getInfo("sametypesequence") if not verifySameTypeSequence(sametypesequence): return False self._indexData = self.readIdxFile() self._wordCount = len(self._indexData) self._synDict = self.readSynFile() self._sametypesequence = sametypesequence if isfile(self._filename + ".dict.dz"): self._dictFile = gzip.open(self._filename + ".dict.dz", mode="rb") else: self._dictFile = open(self._filename + ".dict", mode="rb") self._resDir = join(dirname(self._filename), "res") if isdir(self._resDir): self._resFileNames = os.listdir(self._resDir) else: self._resDir = "" self._resFileNames = [] # self.readResources() def __len__(self) -> int: if self._wordCount is None: raise RuntimeError( "StarDict: len(reader) called while reader is not open" ) return self._wordCount + len(self._resFileNames) def readIfoFile(self) -> None: """ .ifo file is a text file in utf-8 encoding """ with open(self._filename + ".ifo", "r", encoding="utf-8") as ifoFile: for line in ifoFile: line = line.strip() if not line: continue if line == "StarDict's dict ifo file": continue key, _, value = line.partition("=") if not (key and value): log.warning(f"Invalid ifo file line: {line}") continue self._glos.setInfo(key, value) def readIdxFile(self) -> "List[Tuple[bytes, int, int]]": if isfile(self._filename + ".idx.gz"): with gzip.open(self._filename + ".idx.gz") as idxFile: idxBytes = idxFile.read() else: with open(self._filename + ".idx", "rb") as idxFile: idxBytes = idxFile.read() indexData = [] pos = 0 while pos < len(idxBytes): beg = pos pos = idxBytes.find(b"\x00", beg) if pos < 0: log.error("Index file is corrupted") break b_word = idxBytes[beg:pos] pos += 1 if pos + 8 > len(idxBytes): log.error("Index file is corrupted") break offset = uint32FromBytes(idxBytes[pos:pos + 4]) pos += 4 size = uint32FromBytes(idxBytes[pos:pos + 4]) pos += 4 indexData.append((b_word, offset, size)) return indexData def __iter__(self) -> 
"Iterator[BaseEntry]": indexData = self._indexData synDict = self._synDict sametypesequence = self._sametypesequence dictFile = self._dictFile unicode_errors = self._unicode_errors if not dictFile: raise RuntimeError("iterating over a reader while it's not open") if not indexData: log.warning("indexData is empty") return for entryIndex, (b_word, defiOffset, defiSize) in enumerate(indexData): if not b_word: continue dictFile.seek(defiOffset) if dictFile.tell() != defiOffset: log.error(f"Unable to read definition for word {b_word}") continue b_defiBlock = dictFile.read(defiSize) if len(b_defiBlock) != defiSize: log.error(f"Unable to read definition for word {b_word}") continue if sametypesequence: defisData = self.parseDefiBlockCompact( b_defiBlock, sametypesequence, ) else: defisData = self.parseDefiBlockGeneral(b_defiBlock) if defisData is None: log.error(f"Data file is corrupted. Word {b_word}") continue # defisData is a list of (b_defi, defiFormatCode) tuples defis = [] defiFormats = [] for b_partDefi, defiFormatCode in defisData: partDefi = b_partDefi.decode("utf-8", errors=unicode_errors) partDefiFormat = { "m": "m", "t": "m", "y": "m", "g": "h", "h": "h", "x": "x", }.get(chr(defiFormatCode), "") if partDefiFormat == "x" and self._xdxf_to_html: partDefi = self.xdxf_transform(partDefi) partDefiFormat = "h" defis.append(partDefi) defiFormats.append(partDefiFormat) # FIXME defiFormat = defiFormats[0] # defiFormat = Counter(defiFormats).most_common(1)[0][0] if not defiFormat: log.warning(f"Definition format {defiFormat!r} is not supported") word = b_word.decode("utf-8", errors=unicode_errors) try: alts = synDict[entryIndex] except KeyError: # synDict is dict pass else: word = [word] + alts defiSep = "\n
          \n" # if defiFormat == "x" # defiSep = FIXME defi = defiSep.join(defis) # FIXME: # defi = defi.replace(' src="./res/', ' src="./') yield self._glos.newEntry(word, defi, defiFormat=defiFormat) if isdir(self._resDir): for fname in os.listdir(self._resDir): fpath = join(self._resDir, fname) with open(fpath, "rb") as _file: yield self._glos.newDataEntry( fname, _file.read(), ) def readSynFile(self) -> "Dict[int, List[str]]": """ return synDict, a dict { entryIndex -> altList } """ if not isfile(self._filename + ".syn"): return {} unicode_errors = self._unicode_errors with open(self._filename + ".syn", "rb") as synFile: synBytes = synFile.read() synBytesLen = len(synBytes) synDict = {} pos = 0 while pos < synBytesLen: beg = pos pos = synBytes.find(b"\x00", beg) if pos < 0: log.error("Synonym file is corrupted") break b_alt = synBytes[beg:pos] # b_alt is bytes pos += 1 if pos + 4 > len(synBytes): log.error("Synonym file is corrupted") break entryIndex = uint32FromBytes(synBytes[pos:pos + 4]) pos += 4 if entryIndex >= self._wordCount: log.error( f"Corrupted synonym file. " + f"Word {b_alt} references invalid item" ) continue s_alt = b_alt.decode("utf-8", errors=unicode_errors) # s_alt is str try: synDict[entryIndex].append(s_alt) except KeyError: synDict[entryIndex] = [s_alt] return synDict def parseDefiBlockCompact( self, b_block: bytes, sametypesequence: str, ) -> "List[Tuple[bytes, int]]": """ Parse definition block when sametypesequence option is specified. Return a list of (b_defi, defiFormatCode) tuples where b_defi is a bytes instance and defiFormatCode is int, so: defiFormat = chr(defiFormatCode) """ b_sametypesequence = sametypesequence.encode("utf-8") assert len(b_sametypesequence) > 0 res = [] i = 0 for t in b_sametypesequence[:-1]: if i >= len(b_block): return None if bytes([t]).islower(): beg = i i = b_block.find(b"\x00", beg) if i < 0: return None res.append((b_block[beg:i], t)) i += 1 else: assert bytes([t]).isupper() if i + 4 > len(b_block): return None size = uint32FromBytes(b_block[i:i + 4]) i += 4 if i + size > len(b_block): return None res.append((b_block[i:i + size], t)) i += size if i >= len(b_block): return None t = b_sametypesequence[-1] if bytes([t]).islower(): if 0 in b_block[i:]: return None res.append((b_block[i:], t)) else: assert bytes([t]).isupper() res.append((b_block[i:], t)) return res def parseDefiBlockGeneral(self, b_block: bytes) -> "List[Tuple[bytes, int]]": """ Parse definition block when sametypesequence option is not specified. Return a list of (b_defi, defiFormatCode) tuples where b_defi is a bytes instance and defiFormatCode is int, so: defiFormat = chr(defiFormatCode) """ res = [] i = 0 while i < len(b_block): t = b_block[i] if not bytes([t]).isalpha(): return None i += 1 if bytes([t]).islower(): beg = i i = b_block.find(b"\x00", beg) if i < 0: return None res.append((b_block[beg:i], t)) i += 1 else: assert bytes([t]).isupper() if i + 4 > len(b_block): return None size = uint32FromBytes(b_block[i:i + 4]) i += 4 if i + size > len(b_block): return None res.append((b_block[i:i + size], t)) i += size return res # def readResources(self): # if not isdir(self._resDir): # resInfoPath = join(baseDirPath, "res.rifo") # if isfile(resInfoPath): # log.warning( # "StarDict resource database is not supported. 
Skipping" # ) class Writer(object): _dictzip: bool = True _sametypesequence: str = "" # type: Literal["", "h", "m", "x", None] _stardict_client: bool = False _merge_syns: bool = False _audio_goldendict: bool = False _audio_icon: bool = True def __init__(self, glos: GlossaryType): self._glos = glos self._filename = None self._resDir = None self._sourceLang = None self._targetLang = None self._p_pattern = re.compile( ']*?)?>(.*?)
</p>
          ', re.DOTALL, ) self._br_pattern = re.compile( "", re.IGNORECASE, ) self._re_audio_link = re.compile( ']*? )?href="sound://([^<>"]+)"( .*?)?>(.*?)' ) def byteSortKey(self, b_word: bytes) -> "Tuple[bytes, bytes]": return ( b_word.lower(), b_word, ) def finish(self) -> None: self._filename = None self._resDir = None self._sourceLang = None self._targetLang = None def open(self, filename: str) -> None: log.debug(f"open: filename = {filename}") fileBasePath = filename ## if splitext(filename)[1].lower() == ".ifo": fileBasePath = splitext(filename)[0] elif filename.endswith(os.sep): if not isdir(filename): os.makedirs(filename) fileBasePath = join(filename, split(filename[:-1])[-1]) elif isdir(filename): fileBasePath = join(filename, split(filename)[-1]) parentDir = split(fileBasePath)[0] if not isdir(parentDir): log.info(f"Creating directory {parentDir}") os.mkdir(parentDir) ## if fileBasePath: fileBasePath = realpath(fileBasePath) self._filename = fileBasePath self._resDir = join(dirname(fileBasePath), "res") self._sourceLang = self._glos.sourceLang self._targetLang = self._glos.targetLang if self._sametypesequence: log.debug(f"Using write option sametypesequence={self._sametypesequence}") elif self._sametypesequence is not None: stat = self._glos.collectDefiFormat(100) log.debug(f"defiFormat stat: {stat}") if stat: if stat["m"] > 0.97: log.info(f"Auto-selecting sametypesequence=m") self._sametypesequence = "m" elif stat["h"] > 0.5: log.info(f"Auto-selecting sametypesequence=h") self._sametypesequence = "h" def write(self) -> "Generator[None, BaseEntry, None]": from pyglossary.os_utils import runDictzip if self._sametypesequence: if self._merge_syns: yield from self.writeCompactMergeSyns(self._sametypesequence) else: yield from self.writeCompact(self._sametypesequence) else: if self._merge_syns: yield from self.writeGeneralMergeSyns() else: yield from self.writeGeneral() if self._dictzip: runDictzip(f"{self._filename}.dict") def fixDefi(self, defi: str, defiFormat: str) -> str: # for StarDict 3.0: if self._stardict_client and defiFormat == "h": defi = self._p_pattern.sub("\\2
          ", defi) # if there is

          left without opening, replace with
          defi = defi.replace("

          ", "
          ") defi = self._br_pattern.sub("
          ", defi) if self._audio_goldendict: if self._audio_icon: defi = self._re_audio_link.sub( r'', defi, ) else: defi = self._re_audio_link.sub( r'', defi, ) # FIXME: # defi = defi.replace(' src="./', ' src="./res/') return defi def writeCompact(self, defiFormat): """ Build StarDict dictionary with sametypesequence option specified. Every item definition consists of a single article. All articles have the same format, specified in defiFormat parameter. Parameters: defiFormat - format of article definition: h - html, m - plain text """ log.debug(f"writeCompact: defiFormat={defiFormat}") dictMark = 0 altIndexList = [] # list of tuples (b"alternate", entryIndex) dictFile = open(self._filename + ".dict", "wb") idxFile = open(self._filename + ".idx", "wb") t0 = now() wordCount = 0 if not isdir(self._resDir): os.mkdir(self._resDir) entryIndex = -1 while True: entry = yield if entry is None: break if entry.isData(): entry.save(self._resDir) continue entryIndex += 1 words = entry.l_word # list of strs word = words[0] # str defi = self.fixDefi(entry.defi, defiFormat) # defi is str for alt in words[1:]: altIndexList.append((alt.encode("utf-8"), entryIndex)) b_dictBlock = defi.encode("utf-8") dictFile.write(b_dictBlock) blockLen = len(b_dictBlock) b_idxBlock = word.encode("utf-8") + b"\x00" + \ uint32ToBytes(dictMark) + \ uint32ToBytes(blockLen) idxFile.write(b_idxBlock) dictMark += blockLen wordCount += 1 dictFile.close() idxFile.close() if not os.listdir(self._resDir): os.rmdir(self._resDir) log.info(f"Writing dict file took {now()-t0:.2f} seconds") self.writeSynFile(altIndexList) self.writeIfoFile( wordCount, len(altIndexList), defiFormat=defiFormat, ) def writeGeneral(self) -> None: """ Build StarDict dictionary in general case. Every item definition may consist of an arbitrary number of articles. sametypesequence option is not used. 
""" log.debug(f"writeGeneral") dictMark = 0 altIndexList = [] # list of tuples (b"alternate", entryIndex) dictFile = open(self._filename + ".dict", "wb") idxFile = open(self._filename + ".idx", "wb") t0 = now() wordCount = 0 defiFormatCounter = Counter() if not isdir(self._resDir): os.mkdir(self._resDir) entryIndex = -1 while True: entry = yield if entry is None: break if entry.isData(): entry.save(self._resDir) continue entryIndex += 1 entry.detectDefiFormat() # call no more than once defiFormat = entry.defiFormat defiFormatCounter[defiFormat] += 1 if defiFormat not in ("h", "m", "x"): log.error(f"invalid defiFormat={defiFormat}, using 'm'") defiFormat = "m" words = entry.l_word # list of strs word = words[0] # str defi = self.fixDefi(entry.defi, defiFormat) # defi is str for alt in words[1:]: altIndexList.append((alt.encode("utf-8"), entryIndex)) b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00" dictFile.write(b_dictBlock) blockLen = len(b_dictBlock) b_idxBlock = word.encode("utf-8") + b"\x00" + \ uint32ToBytes(dictMark) + \ uint32ToBytes(blockLen) idxFile.write(b_idxBlock) dictMark += blockLen wordCount += 1 dictFile.close() idxFile.close() if not os.listdir(self._resDir): os.rmdir(self._resDir) log.info(f"Writing dict file took {now()-t0:.2f} seconds") log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common())) self.writeSynFile(altIndexList) self.writeIfoFile( wordCount, len(altIndexList), defiFormat="", ) def writeSynFile(self, altIndexList: "List[Tuple[bytes, int]]") -> None: """ Build .syn file """ if not altIndexList: return log.info(f"Sorting {len(altIndexList)} synonyms...") t0 = now() altIndexList.sort( key=lambda x: self.byteSortKey(x[0]) ) # 28 seconds with old sort key (converted from custom cmp) # 0.63 seconds with my new sort key # 0.20 seconds without key function (default sort) log.info( f"Sorting {len(altIndexList)} synonyms took {now()-t0:.2f} seconds", ) log.info(f"Writing {len(altIndexList)} synonyms...") t0 = now() with open(self._filename + ".syn", "wb") as synFile: synFile.write(b"".join([ b_alt + b"\x00" + uint32ToBytes(entryIndex) for b_alt, entryIndex in altIndexList ])) log.info( f"Writing {len(altIndexList)} synonyms took {now()-t0:.2f} seconds", ) def writeCompactMergeSyns(self, defiFormat): """ Build StarDict dictionary with sametypesequence option specified. Every item definition consists of a single article. All articles have the same format, specified in defiFormat parameter. 
Parameters: defiFormat - format of article definition: h - html, m - plain text """ log.debug(f"writeCompactMergeSyns: defiFormat={defiFormat}") dictMark = 0 idxBlockList = [] # list of tuples (b"word", startAndLength) altIndexList = [] # list of tuples (b"alternate", entryIndex) dictFile = open(self._filename + ".dict", "wb") t0 = now() if not isdir(self._resDir): os.mkdir(self._resDir) entryIndex = -1 while True: entry = yield if entry is None: break if entry.isData(): entry.save(self._resDir) continue entryIndex += 1 words = entry.l_word # list of strs word = words[0] # str defi = self.fixDefi(entry.defi, defiFormat) # defi is str b_dictBlock = defi.encode("utf-8") dictFile.write(b_dictBlock) blockLen = len(b_dictBlock) blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen) for word in words: idxBlockList.append((word.encode("utf-8"), blockData)) dictMark += blockLen wordCount = self.writeIdxFile(idxBlockList) dictFile.close() if not os.listdir(self._resDir): os.rmdir(self._resDir) log.info(f"Writing dict file took {now()-t0:.2f} seconds") self.writeIfoFile( wordCount, len(altIndexList), defiFormat=defiFormat, ) def writeGeneralMergeSyns(self) -> None: """ Build StarDict dictionary in general case. Every item definition may consist of an arbitrary number of articles. sametypesequence option is not used. """ log.debug(f"writeGeneralMergeSyns") dictMark = 0 idxBlockList = [] # list of tuples (b"word", startAndLength) altIndexList = [] # list of tuples (b"alternate", entryIndex) dictFile = open(self._filename + ".dict", "wb") t0 = now() wordCount = 0 defiFormatCounter = Counter() if not isdir(self._resDir): os.mkdir(self._resDir) entryIndex = -1 while True: entry = yield if entry is None: break if entry.isData(): entry.save(self._resDir) continue entryIndex += 1 entry.detectDefiFormat() # call no more than once defiFormat = entry.defiFormat defiFormatCounter[defiFormat] += 1 if defiFormat not in ("h", "m", "x"): log.error(f"invalid defiFormat={defiFormat}, using 'm'") defiFormat = "m" words = entry.l_word # list of strs word = words[0] # str defi = self.fixDefi(entry.defi, defiFormat) # defi is str b_dictBlock = (defiFormat + defi).encode("utf-8") + b"\x00" dictFile.write(b_dictBlock) blockLen = len(b_dictBlock) blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen) for word in words: idxBlockList.append((word.encode("utf-8"), blockData)) dictMark += blockLen wordCount = self.writeIdxFile(idxBlockList) dictFile.close() if not os.listdir(self._resDir): os.rmdir(self._resDir) log.info(f"Writing dict file took {now()-t0:.2f} seconds") log.debug("defiFormatsCount = " + pformat(defiFormatCounter.most_common())) self.writeIfoFile( wordCount, len(altIndexList), defiFormat="", ) def writeIdxFile(self, indexList: "List[Tuple[bytes, bytes]]") -> int: filename = self._filename + ".idx" if not indexList: return 0 log.info(f"Sorting {len(indexList)} items...") t0 = now() indexList.sort(key=lambda x: self.byteSortKey(x[0])) log.info( f"Sorting {len(indexList)} {filename} took {now()-t0:.2f} seconds", ) log.info(f"Writing {len(indexList)} index entries...") t0 = now() with open(filename, "wb") as indexFile: indexFile.write(b"".join([ key + b"\x00" + value for key, value in indexList ])) log.info( f"Writing {len(indexList)} {filename} took {now()-t0:.2f} seconds", ) return len(indexList) def writeIfoFile( self, wordCount: int, synWordCount: int, defiFormat: str = "", # type: Literal["", "h", "m", "x"] ) -> None: """ Build .ifo file """ glos = self._glos bookname = 
newlinesToSpace(glos.getInfo("name")) indexFileSize = getsize(self._filename + ".idx") sourceLang = self._sourceLang targetLang = self._targetLang if sourceLang and targetLang: langs = f"{sourceLang.code}-{targetLang.code}" if langs not in bookname.lower(): bookname = f"{bookname} ({langs})" log.info(f"bookname: {bookname}") ifo = [ ("version", "3.0.0"), ("bookname", bookname), ("wordcount", wordCount), ("idxfilesize", indexFileSize), ] if defiFormat: ifo.append(("sametypesequence", defiFormat)) if synWordCount > 0: ifo.append(("synwordcount", synWordCount)) desc = glos.getInfo("description") copyright = glos.getInfo("copyright") if copyright: desc = f"{copyright}\n{desc}" publisher = glos.getInfo("publisher") if publisher: desc = f"Publisher: {publisher}\n{desc}" for key in infoKeys: if key in ( "bookname", "description", ): continue value = glos.getInfo(key) if value == "": continue value = newlinesToSpace(value) ifo.append((key, value)) ifo.append(("description", newlinesToBr(desc))) ifoStr = "StarDict's dict ifo file\n" for key, value in ifo: ifoStr += f"{key}={value}\n" with open(self._filename + ".ifo", "w", encoding="utf-8") as ifoFile: ifoFile.write(ifoStr) pyglossary-4.5.0/pyglossary/plugins/tabfile.py000066400000000000000000000063321417733132500216230ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from pyglossary.text_reader import TextGlossaryReader from pyglossary.text_utils import ( unescapeNTB, splitByBarUnescapeNTB, ) enable = True lname = "tabfile" format = "Tabfile" description = "Tabfile (.txt, .dic)" extensions = (".txt", ".tab", ".tsv") extensionCreate = ".txt" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/Tab-separated_values" website = None optionsProp = { "encoding": EncodingOption(), "enable_info": BoolOption( comment="Enable glossary info / metadata", ), "resources": BoolOption( comment="Enable resources / data files", ), "file_size_approx": FileSizeOption( comment="Split up by given approximate file size\nexamples: 100m, 1g", ), "word_title": BoolOption( comment="Add headwords title to beginning of definition", ), } class Reader(TextGlossaryReader): def __init__(self, glos: GlossaryType, hasInfo: bool = True): TextGlossaryReader.__init__(self, glos, hasInfo=hasInfo) self._resDir = "" self._resFileNames = [] def open(self, filename: str) -> None: TextGlossaryReader.open(self, filename) resDir = f"{filename}_res" if isdir(resDir): self._resDir = resDir self._resFileNames = os.listdir(self._resDir) def __iter__(self) -> "Iterator[BaseEntry]": yield from TextGlossaryReader.__iter__(self) resDir = self._resDir for fname in self._resFileNames: with open(join(resDir, fname), "rb") as _file: yield self._glos.newDataEntry( fname, _file.read(), ) def isInfoWord(self, word: str) -> bool: return word.startswith("#") def fixInfoWord(self, word: str) -> str: return word.lstrip("#") def nextPair(self) -> "Tuple[str, str]": if not self._file: raise StopIteration line = self.readline() if not line: raise StopIteration line = line.rstrip("\n") if not line: return ### word, tab, defi = line.partition("\t") if not tab: log.error( f"Warning: line starting with {line[:10]!r} has no tab!" 
) return ### if self._glos.alts: word = splitByBarUnescapeNTB(word) if len(word) == 1: word = word[0] else: word = unescapeNTB(word, bar=False) ### defi = unescapeNTB(defi) ### return word, defi class Writer(object): _encoding: str = "utf-8" _enable_info: bool = True _resources: bool = True _file_size_approx: int = 0 _word_title: bool = False compressions = stdCompressions def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None def open( self, filename: str, ): self._filename = filename def finish(self): pass def write(self) -> "Generator[None, BaseEntry, None]": from pyglossary.text_writer import TextGlossaryWriter from pyglossary.text_utils import escapeNTB, joinByBar writer = TextGlossaryWriter( self._glos, entryFmt="{word}\t{defi}\n", writeInfo=self._enable_info, outInfoKeysAliasDict=None, ) writer.setAttrs( encoding=self._encoding, wordListEncodeFunc=joinByBar, wordEscapeFunc=escapeNTB, defiEscapeFunc=escapeNTB, ext=".txt", resources=self._resources, word_title=self._word_title, file_size_approx=self._file_size_approx ) writer.open(self._filename) yield from writer.write() writer.finish() pyglossary-4.5.0/pyglossary/plugins/testformat.py000066400000000000000000000057721417733132500224130ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = False lname = "testformat" format = "Test" description = "Test Format File (.test)" extensions = (".test", ".tst") extensionCreate = ".test" kind = "text" wiki = "" website = None # key is option/argument name, value is instance of Option optionsProp = {} class Reader(object): def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = "" self._wordCount = 0 def __len__(self) -> int: # return the number of entries if you have it # if you don't, return 0 and progressbar will be disabled # self._wordCount can be set in self.open function # but if you want to set it, you should set it before # iteration begins and __iter__ method is called return self._wordCount def open(self, filename) -> None: # open the file, read headers / info and set info to self._glos # and set self._wordCount if you can # read-options should be keyword arguments in this method self._wordCount = 100 # log.info(f"some useful message") # here read info from file and set to Glossary object self._glos.setInfo("name", "Test") desc = "Test glossary created by a PyGlossary plugin" self._glos.setInfo("description", desc) self._glos.setInfo("author", "Me") self._glos.setInfo("copyright", "GPL") def close(self): # this is called after reading/conversion is finished # if you have an open file object, close it here # if you need to clean up temp files, do it here pass def __iter__(self) -> "Iterator[BaseEntry]": # the easiest and simplest way to implement an Iterator is # by writing a generator, by calling: yield glos.newEntry(word, defi) # inside a loop (typically iterating over a file object for text file) # another way (which is harder) is by implementing __next__ method # and returning self in __iter__ # that forces you to keep the state manually because __next__ is called # repeatedly, but __iter__ is only called once glos = self._glos for i in range(self._wordCount): # here get word and definition from file (depending on your format) word = f"word_{i}" defi = f"definition {i}" yield glos.newEntry(word, defi) class Writer(object): def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None def open(self, filename: str) -> None: self._filename = 
filename def write(self) -> "Generator[None, BaseEntry, None]": glos = self._glos filename = self._filename # log.info(f"some useful message") while True: entry = yield if entry is None: break word = entry.s_word defi = entry.defi # here write word and defi to the output file (depending on # your format) # here read info from Glossary object name = glos.getInfo("name") desc = glos.getInfo("description") author = glos.author copyright = glos.getInfo("copyright") # if an info key doesn't exist, getInfo returns empty string # now write info to the output file (depending on your output format) def finish(self): self._filename = None pyglossary-4.5.0/pyglossary/plugins/wiktionary_dump.py000066400000000000000000000132011417733132500234330ustar00rootroot00000000000000# -*- coding: utf-8 -*- from time import time as now import re import html from pyglossary.plugins.formats_common import * enable = True lname = "wiktionary_dump" format = "WiktionaryDump" description = "Wiktionary Dump (.xml)" extensions = () extensionCreate = "" kind = "text" wiki = "https://en.wiktionary.org/wiki/Wiktionary:Main_Page" website = ( "https://dumps.wikimedia.org/mirrors.html", "dumps.wikimedia.org", ) optionsProp = { "encoding": EncodingOption(), } class Reader(object): def __init__(self, glos): self._glos = glos self._buff = b"" self._filename = "" self._file = None self._fileSize = 0 # self._alts = {} # { word => alts } # where alts is str (one word), or list of strs # we can't recognize alternates unless we keep all data in memory # or scan the whole file and read all entries twice self.compilePatterns() def _readUntil(self, sub: bytes) -> bytes: for line in self._file: if sub in line: return line self._buff += line def _readSiteInfo(self) -> bytes: self._buff = self._readUntil(b"<siteinfo>") self._readUntil(b"</siteinfo>") siteinfoBytes = self._buff + b"</siteinfo>" self._buff = b"" return siteinfoBytes def open(self, filename): try: from lxml import etree as ET except ModuleNotFoundError as e: e.msg += f", run `{pip} install lxml` to install" raise e self._filename = filename self._file = open(filename, mode="rb") self._fileSize = os.path.getsize(filename) log.info(f"fileSize = {self._fileSize}") siteinfoBytes = self._readSiteInfo() siteinfoStr = siteinfoBytes.decode("utf-8") siteinfo = ET.fromstring(siteinfoStr) sitename = ", ".join(siteinfo.xpath("sitename/text()")) dbname = ", ".join(siteinfo.xpath("dbname/text()")) generator = ", ".join(siteinfo.xpath("generator/text()")) self._glos.setInfo("title", f"{dbname} ({sitename})") self._glos.setInfo("input_file_size", f"{self._fileSize}") base = siteinfo.xpath("base/text()") if base: wiki_url = "/".join(base[0].rstrip("/").split("/")[:-1]) self._glos.setInfo("website", wiki_url) self._glos.setInfo("entry_url", f"{wiki_url}/{{word}}") self._glos.setInfo("generator", generator) namespaces = siteinfo.find("namespaces") if namespaces is not None: self._glos.setInfo("namespaces", ET.tostring(namespaces)) def close(self): self._filename = "" self._file.close() # self._alts = {} def __len__(self): return 0 def _readPage(self) -> "lxml.etree.Element": from lxml import etree as ET pageEnd = self._readUntil(b"</page>") if pageEnd is None: return page = ET.fromstring(self._buff + pageEnd) self._buff = b"" return page def __iter__(self) -> "Iterator[BaseEntry]": from lxml import etree as ET if not self._filename: raise RuntimeError("iterating over a reader while it's not open") while True: page = self._readPage() if page is None: break yield self._getEntryFromPage(page) def _sub_internal_link(self, m: "re.Match") 
-> str: ref = m.group(1) return f'<a href="bword://{html.escape(ref)}">{ref}</a>' def compilePatterns(self): self._re_comment = re.compile( "<!--.*?-->", re.MULTILINE | re.DOTALL, ) self._re_internal_link = re.compile( r"\[\[(.+?)\]\]", re.MULTILINE, ) self._re_translationHeader = re.compile( r"^[;*]?\s?{{(.+?)}}:\s*(.+)$", re.MULTILINE, ) self._re_listItemEmpty = re.compile( r"^[#*]\s*$", re.MULTILINE, ) # ideally '# ...' should become
<ol><li>...</li></ol>, and '* ...' become <ul><li>...</li></ul>
# but that's hard, so we just replace both with '⚫︎ ...' self._re_listItem = re.compile( r"^[#*] ?(.*)", re.MULTILINE, ) self._re_h2 = re.compile( r"^==(\{\{\{\d+\|)?([^={}]+?)(\}\}\})?==$", re.MULTILINE, ) self._re_h3 = re.compile( r"^===(\{\{\{\d+\|)?([^={}]+?)(\}\}\})?===$", re.MULTILINE, ) self._re_h4 = re.compile( r"^={4,5}([^=]+?)={4,5}$", re.MULTILINE, ) self._re_template = re.compile( r"^\{\{(...+?\|...+?)\}\}$", re.MULTILINE, ) self._re_qualifier = re.compile( r"\{\{qualifier\|(.+?)\}\}", ) self._re_lastLineLink = re.compile( "\\n(<a [^<>]*>.*)\\s*$", ) self._re_remainDoubleCurlyBraces = re.compile( r"\{\{([^{}]+?)\}\}", re.MULTILINE, ) self._re_nonTaggedLine = re.compile( r"^([^<\s].+?[^>\s])$", re.MULTILINE, ) # self._re_emptyCircledLines = re.compile( # r"^\s*⚫︎\s*$", # re.MULTILINE | re.UNICODE, # ) def fixText(self, text: str) -> str: text = self._re_comment.sub("", text) text = self._re_listItemEmpty.sub("", text) text = self._re_internal_link.sub(self._sub_internal_link, text) text = self._re_translationHeader.sub( r"<b>\1</b>\n⚫︎ \2<br/>", text, ) text = self._re_listItem.sub(r"⚫︎ \1<br/>", text) text = self._re_h2.sub(r"<h2>\2</h2>", text) text = self._re_h3.sub(r"<h3>\2</h3>", text) text = self._re_h4.sub(r"<h4>\1</h4>", text) text = self._re_template.sub(r"Template: \1<br/>", text) text = self._re_qualifier.sub(r"(\1)", text) text = self._re_lastLineLink.sub("\n<hr/>\\1", text) text = self._re_remainDoubleCurlyBraces.sub(r"\1<br/>", text) text = self._re_nonTaggedLine.sub(r"\1<br/>
              ", text) # text = self._re_emptyCircledLines.sub("", text) return text def _getEntryFromPage(self, page: "lxml.etree.Element") -> "BaseEntry": titleElem = page.find(".//title") if titleElem is None: return title = titleElem.text if not title: return textElem = page.find(".//text") if textElem is None: return text = textElem.text if not text: return text = self.fixText(text) byteProgress = (self._file.tell(), self._fileSize) return self._glos.newEntry(title, text, byteProgress=byteProgress) pyglossary-4.5.0/pyglossary/plugins/wordset.py000066400000000000000000000046701417733132500217070ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * from pyglossary.entry import Entry from pyglossary.sort_keys import namedSortKeyByName from os import listdir from os.path import isfile from json import load enable = True lname = "wordset" format = "Wordset" description = "Wordset.org JSON directory" extensions = () extensionCreate = "-wordset/" singleFile = False kind = "directory" wiki = "" website = ( "https://github.com/wordset/wordset-dictionary", "@wordset/wordset-dictionary", ) optionsProp = { "encoding": EncodingOption(), } class Reader(object): _encoding: str = "utf-8" def __init__(self, glos: GlossaryType): self._glos = glos self._clear() self.defiTemplate = ( "

              " '{speech_part}' "
              " "{def}" "
              " "{example}" "

              " ) """ { "id": "492099d426", "def": "without musical accompaniment", "example": "they performed a cappella", "speech_part": "adverb" }, """ def close(self) -> None: self._clear() def _clear(self) -> None: self._filename = "" def open(self, filename: str) -> None: self._filename = filename name = self._glos.getInfo("name") if not name or name == "data": self._glos.setInfo("name", "Wordset.org") self._glos.setDefaultDefiFormat("h") def __len__(self) -> int: return 0 def fileNameSortKey(self, fname: str) -> str: fname = splitext(fname)[0] if fname == "misc": return "\x80" return fname def __iter__(self) -> "Iterator[BaseEntry]": if not self._filename: raise RuntimeError("iterating over a reader while it's not open") direc = self._filename encoding = self._encoding glos = self._glos for fname in sorted(listdir(direc), key=self.fileNameSortKey): fpath = join(direc, fname) if not (fname.endswith(".json") and isfile(fpath)): continue with open(fpath, encoding=encoding) as fileObj: data = load(fileObj) words = list(data.keys()) sortKey = namedSortKeyByName["headword_lower"].normal("utf-8") words.sort(key=sortKey) for word in words: entryDict = data[word] defi = "".join( self.defiTemplate.format(**{ "word": word, "def": meaning.get("def", ""), "example": meaning.get("example", ""), "speech_part": meaning.get("speech_part", ""), }) for meaning in entryDict.get("meanings", []) ) yield glos.newEntry(word, defi, defiFormat="h") log.info(f"finished reading {fname}") pyglossary-4.5.0/pyglossary/plugins/xdxf/000077500000000000000000000000001417733132500206105ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/plugins/xdxf/__init__.py000066400000000000000000000143011417733132500227200ustar00rootroot00000000000000# -*- coding: utf-8 -*- # xdxf/__init__.py """xdxf file format reader and utils to convert xdxf to html.""" # # Copyright © 2016 Ratijas # # some parts of this file include code from: # Aard Dictionary Tools . # Copyright © 2008-2009 Igor Tkach # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from os import path from pyglossary.plugins.formats_common import * from pyglossary.xdxf_transform import * enable = True lname = "xdxf" format = "Xdxf" description = "XDXF (.xdxf)" extensions = (".xdxf",) extensionCreate = ".xdxf" singleFile = True kind = "text" wiki = "https://en.wikipedia.org/wiki/XDXF" website = ( "https://github.com/soshial/xdxf_makedict/blob/master/format_standard/xdxf_description.md", "xdxf_description.md - @soshial/xdxf_makedict", ) optionsProp = { "html": BoolOption(comment="Entries are HTML"), } """ new format ... ... ... article 1 article 2 article 3 article 4 ... old format ... ... article 1 article 2 article 3 article 4 ... """ class Reader(object): depends = { "lxml": "lxml", } _html: bool = True infoKeyMap = { "full_name": "name", "full_title": "name", } def __init__(self, glos: GlossaryType): self._glos = glos self._filename = "" self._file = None self._encoding = "utf-8" self._htmlTr = None self._re_span_k = re.compile( '[^<>]*(
              )?', ) def open(self, filename: str): # from lxml import etree as ET self._filename = filename if self._html: self._htmlTr = XdxfTransformer(encoding=self._encoding) self._glos.setDefaultDefiFormat("h") else: self._glos.setDefaultDefiFormat("x") _file = open(self._filename, mode="rb") context = ET.iterparse( _file, events=("end",), ) for action, elem in context: if elem.tag in ("meta_info", "ar", "k", "abr", "dtrn"): break # every other tag before or is considered info if elem.tag in ("abbr_def",): continue if not elem.text: log.warning(f"empty tag <{elem.tag}>") continue key = self.infoKeyMap.get(elem.tag, elem.tag) self._glos.setInfo(key, elem.text) _file.close() del context self._fileSize = os.path.getsize(filename) self._file = open(self._filename, mode="rb") self._glos.setInfo("input_file_size", f"{self._fileSize}") def __len__(self): return 0 def __iter__(self): from lxml.etree import tostring from lxml import etree as ET context = ET.iterparse( self._file, events=("end",), tag="ar", ) for action, article in context: article.tail = None words = [toStr(w) for w in self.titles(article)] if self._htmlTr: defi = self._htmlTr.transform(article) defiFormat = "h" if len(words) == 1: defi = self._re_span_k.sub("", defi) else: defi = tostring(article, encoding=self._encoding) defi = defi[4:-5].decode(self._encoding).strip() defiFormat = "x" # log.info(f"defi={defi}, words={words}") yield self._glos.newEntry( words, defi, defiFormat=defiFormat, byteProgress=(self._file.tell(), self._fileSize), ) # clean up preceding siblings to save memory # this reduces memory usage from ~64 MB to ~30 MB while article.getprevious() is not None: del article.getparent()[0] def close(self) -> None: if self._file: self._file.close() self._file = None def read_metadata_old(self): full_name = self._xdxf.find("full_name").text desc = self._xdxf.find("description").text if full_name: self._glos.setInfo("name", full_name) if desc: self._glos.setInfo("description", desc) def read_metadata_new(self): meta_info = self._xdxf.find("meta_info") if meta_info is None: raise ValueError("meta_info not found") title = meta_info.find("full_title").text if not title: title = meta_info.find("title").text desc = meta_info.find("description").text if title: self._glos.setInfo("name", title) if desc: self._glos.setInfo("description", desc) def tostring(self, elem: "lxml.etree.Element") -> str: from lxml import etree as ET return ET.tostring( elem, method="html", pretty_print=True, ).decode("utf-8").strip() def titles(self, article): """ :param article: tag :return: (title (str) | None, alternative titles (set)) """ from itertools import combinations titles = [] for title_element in article.findall("k"): if title_element.text is None: # TODO: look for tag? 
log.warning(f"empty title element: {self.tostring(title_element)}") continue n_opts = len([c for c in title_element if c.tag == "opt"]) if n_opts: for j in range(n_opts + 1): for comb in combinations(list(range(n_opts)), j): titles.append(self._mktitle(title_element, comb)) else: titles.append(self._mktitle(title_element)) return titles def _mktitle(self, title_element, include_opts=None): if include_opts is None: include_opts = () title = title_element.text opt_i = -1 for c in title_element: if c.tag == "nu" and c.tail: if title: title += c.tail else: title = c.tail if c.tag == "opt": opt_i += 1 if opt_i in include_opts: if title: title += c.text else: title = c.text if c.tail: if title: title += c.tail else: title = c.tail return title.strip() pyglossary-4.5.0/pyglossary/plugins/zimfile.py000066400000000000000000000101021417733132500216420ustar00rootroot00000000000000# -*- coding: utf-8 -*- from pyglossary.plugins.formats_common import * enable = True lname = "zim" format = "Zim" description = "Zim (.zim, for Kiwix)" extensions = (".zim",) extensionCreate = ".zim" singleFile = True kind = "binary" wiki = "https://en.wikipedia.org/wiki/ZIM_(file_format)" website = ( "https://wiki.openzim.org/wiki/OpenZIM", "OpenZIM", ) optionsProp = { "skip_duplicate_words": BoolOption( comment="Detect and skip duplicate words", ), } # https://wiki.kiwix.org/wiki/Software class Reader(object): depends = { "libzim": "libzim==1.0", } _skip_duplicate_words = False resourceMimeTypes = { "image/png", "image/jpeg", "image/gif", "image/svg+xml", "image/webp", "image/x-icon", "text/css", "text/javascript", "application/javascript", "application/json", "application/octet-stream", "application/octet-stream+xapian", "application/x-chrome-extension", "application/warc-headers", "application/font-woff", } def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = None self._zimfile = None def open(self, filename: str) -> None: try: from libzim.reader import Archive except ModuleNotFoundError as e: e.msg += f", run `{pip} install libzim` to install" raise e self._filename = filename self._zimfile = Archive(filename) def close(self) -> None: self._filename = None self._zimfile = None def __len__(self) -> int: if self._zimfile is None: log.error(f"len(reader) called before reader.open()") return 0 return self._zimfile.entry_count def __iter__(self): glos = self._glos zimfile = self._zimfile emptyContentCount = 0 invalidMimeTypeCount = 0 entryCount = zimfile.entry_count duplicateEntryCount = 0 redirectCount = 0 skip_dup = self._skip_duplicate_words hashSet = set() f_namemax = os.statvfs(cacheDir).f_namemax fileNameTooLong = [] for entryIndex in range(entryCount): zEntry = zimfile._get_entry_by_id(entryIndex) word = zEntry.title if zEntry.is_redirect: redirectCount += 1 targetWord = zEntry.get_redirect_entry().title yield glos.newEntry( word, f'Redirect: {targetWord}', defiFormat="h", ) continue zItem = zEntry.get_item() b_content = zItem.content.tobytes() if skip_dup: if word in hashSet: duplicateEntryCount += 1 yield None continue hashSet.add(word) if not b_content: emptyContentCount += 1 yield None # TODO: test with more zim files # Looks like: zItem.path == zEntry.path == "-" + word # print(f"b_content empty, word={word!r}, zEntry.path={zEntry.path!r}, zItem.path={zItem.path}") # if zEntry.path == "-" + word: # yield None # else: # defi = f"Path: {zEntry.path}" # yield glos.newEntry(word, defi, defiFormat="m") continue try: mimetype = zItem.mimetype except RuntimeError: invalidMimeTypeCount += 1 
yield glos.newDataEntry(word, b_content) if mimetype.startswith("text/html"): # can be "text/html;raw=true" defi = b_content.decode("utf-8") defi = defi.replace(' src="../I/', ' src="./') yield glos.newEntry(word, defi, defiFormat="h") continue if mimetype == "text/plain": yield glos.newEntry( word, b_content.decode("utf-8"), defiFormat="m", ) continue if mimetype not in self.resourceMimeTypes: log.warning(f"Unrecognized mimetype={mimetype!r}") if len(word) > f_namemax: fileNameTooLong.append(word) continue if "|" in word: log.error(f"resource title: {word}") yield glos.newDataEntry(word, b_content) log.info(f"ZIM Entry Count: {entryCount}") if len(fileNameTooLong) > 0: log.error(f"Files with name too long: {len(fileNameTooLong)}") if duplicateEntryCount > 0: log.info(f"Duplicate Title Count: {duplicateEntryCount}") if emptyContentCount > 0: log.info(f"Empty Content Count: {emptyContentCount}") if invalidMimeTypeCount > 0: log.info(f"Invalid MIME-Type Count: {invalidMimeTypeCount}") if redirectCount > 0: log.info(f"Redirect Count: {redirectCount}") pyglossary-4.5.0/pyglossary/reverse.py000066400000000000000000000107661417733132500202150ustar00rootroot00000000000000from .glossary_type import GlossaryType from .entry import Entry, BaseEntry import re from operator import itemgetter import logging log = logging.getLogger("pyglossary") def reverseGlossary( glos: GlossaryType, savePath: str = "", words: "Optional[List[str]]" = None, includeDefs: bool = False, reportStep: int = 300, saveStep: int = 1000, # set this to zero to disable auto saving **kwargs ) -> "Iterator[int]": """ This is a generator Usage: for wordIndex in glos.reverse(...): pass Inside the `for` loop, you can pause by waiting (for input or a flag) or stop by breaking Potential keyword arguments: words = None ## None, or list reportStep = 300 saveStep = 1000 savePath = "" matchWord = True sepChars = ".,،" maxNum = 100 minRel = 0.0 minWordLen = 3 includeDefs = False showRel = "None" allowed values: "None", "Percent", "Percent At First" """ if not savePath: savePath = glos.getInfo("name") + ".txt" if saveStep < 2: raise ValueError("saveStep must be more than 1") ui = glos.ui entries = [] for entry in glos: entries.append(entry) log.info(f"loaded {len(entries)} entries into memory") if words: words = list(words) else: words = takeOutputWords(glos, entries) wordCount = len(words) log.info( f"Reversing to file {savePath!r}" f", number of words: {wordCount}" ) glos.progressInit("Reversing") wcThreshold = wordCount // 200 + 1 with open(savePath, "w") as saveFile: for wordI in range(wordCount): word = words[wordI] if wordI % wcThreshold == 0: glos.progress(wordI, wordCount) if wordI % saveStep == 0 and wordI > 0: saveFile.flush() result = searchWordInDef( entries, word, includeDefs=includeDefs, **kwargs ) if result: try: if includeDefs: defi = "\\n\\n".join(result) else: defi = ", ".join(result) + "." 
except Exception: log.exception("") log.debug(f"result = {result}") return saveFile.write(f"{word}\t{defi}\n") yield wordI glos.progressEnd() yield wordCount def takeOutputWords( glos: GlossaryType, entryIter: "Iterator[BaseEntry]", minWordLen: int = 3, ) -> "List[str]": # fr"[\w]{{{minWordLen},}}" wordPattern = re.compile(r"[\w]{%d,}" % minWordLen, re.U) words = set() progressbar, glos._progressbar = glos._progressbar, False for entry in entryIter: words.update(wordPattern.findall( entry.defi, )) glos._progressbar = progressbar return sorted(words) def searchWordInDef( entryIter: "Iterator[BaseEntry]", st: str, matchWord: bool = True, sepChars: str = ".,،", maxNum: int = 100, minRel: float = 0.0, minWordLen: int = 3, includeDefs: bool = False, showRel: str = "Percent", # "Percent" | "Percent At First" | "" ) -> "List[str]": # searches word "st" in definitions of the glossary splitPattern = re.compile( "|".join([re.escape(x) for x in sepChars]), re.U, ) wordPattern = re.compile(r"[\w]{%d,}" % minWordLen, re.U) outRel = [] for entry in entryIter: words = entry.l_word defi = entry.defi if st not in defi: continue for word in words: rel = 0 # relation value of word (0 <= rel <= 1) for part in splitPattern.split(defi): if not part: continue if matchWord: partWords = wordPattern.findall( part, ) if not partWords: continue rel = max( rel, partWords.count(st) / len(partWords) ) else: rel = max( rel, part.count(st) * len(st) / len(part) ) if rel <= minRel: continue if includeDefs: outRel.append((word, rel, defi)) else: outRel.append((word, rel)) outRel.sort( key=itemgetter(1), reverse=True, ) n = len(outRel) if n > maxNum > 0: outRel = outRel[:maxNum] n = maxNum num = 0 out = [] if includeDefs: for j in range(n): numP = num w, num, m = outRel[j] m = m.replace("\n", "\\n").replace("\t", "\\t") onePer = int(1.0 / num) if onePer == 1.0: out.append(f"{w}\\n{m}") elif showRel == "Percent": out.append(f"{w}(%{100*num})\\n{m}") elif showRel == "Percent At First": if num == numP: out.append(f"{w}\\n{m}") else: out.append(f"{w}(%{100*num})\\n{m}") else: out.append(f"{w}\\n{m}") return out for j in range(n): numP = num w, num = outRel[j] onePer = int(1.0 / num) if onePer == 1.0: out.append(w) elif showRel == "Percent": out.append(f"{w}(%{100*num})") elif showRel == "Percent At First": if num == numP: out.append(w) else: out.append(f"{w}(%{100*num})") else: out.append(w) return out pyglossary-4.5.0/pyglossary/sdsqlite.py000066400000000000000000000044171417733132500203660ustar00rootroot00000000000000# -*- coding: utf-8 -*- from os.path import isfile from pyglossary.text_utils import ( splitByBar, joinByBar, ) class Writer(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect if isfile(filename): raise IOError(f"file {filename!r} already exists") self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() self._con.execute( "CREATE TABLE dict (" "word TEXT," "wordlower TEXT," "alts TEXT," "defi TEXT," "defiFormat CHAR(1)," "bindata BLOB)" ) self._con.execute( "CREATE INDEX dict_sortkey ON dict(wordlower, word);" ) def write(self): count = 0 while True: entry = yield if entry is None: break word = entry.l_word[0] alts = joinByBar(entry.l_word[1:]) defi = entry.defi defiFormat = entry.defiFormat bindata = None if entry.isData(): bindata = entry.data self._cur.execute( "insert into dict(" "word, wordlower, alts, " "defi, defiFormat, bindata)" " 
values (?, ?, ?, ?, ?, ?)", ( word, word.lower(), alts, defi, defiFormat, bindata, ), ) count += 1 if count % 1000 == 0: self._con.commit() self._con.commit() def finish(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() class Reader(object): def __init__(self, glos): self._glos = glos self._clear() def _clear(self): self._filename = '' self._con = None self._cur = None def open(self, filename): from sqlite3 import connect self._filename = filename self._con = connect(filename) self._cur = self._con.cursor() # self._glos.setDefaultDefiFormat("m") def __len__(self): self._cur.execute("select count(*) from dict") return self._cur.fetchone()[0] def __iter__(self): self._cur.execute( "select word, alts, defi, defiFormat from dict" " order by wordlower, word" ) for row in self._cur: words = [row[0]] + splitByBar(row[1]) defi = row[2] defiFormat = row[3] yield self._glos.newEntry(words, defi, defiFormat=defiFormat) def close(self): if self._cur: self._cur.close() if self._con: self._con.close() self._clear() pyglossary-4.5.0/pyglossary/sort_keys.py000066400000000000000000000153011417733132500205520ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
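# A minimal usage sketch (not part of the original module): how a named
# sort key from the module below is meant to be used. `namedSortKeyByName`
# is defined at the bottom of this file; the word lists here are made-up
# examples, and each entry's sort key is its list of headwords.
def _demoNamedSortKey():
	sortKey = namedSortKeyByName["headword_lower"].normal("utf-8")
	wordLists = [["Zebra"], ["apple", "Apfel"], ["Mango"]]
	wordLists.sort(key=sortKey)
	# sorted case-insensitively by the first headword, as UTF-8 bytes:
	# [["apple", "Apfel"], ["Mango"], ["Zebra"]]
	return wordLists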
from collections import namedtuple from operator import itemgetter import re NamedSortKey = namedtuple("NamedSortKey", [ "name", "normal", "sqlite", "desc", ]) """ sortKeyType = Callable[ [[List[str]], Any, ] sqliteSortKeyType = List[Tuple[str, str, sortKeyType]] """ def _headword_normal(encoding: str, **options) -> "sortKeyType": def sortKey(words: "List[str]"): return words[0].encode(encoding, errors="replace") return sortKey def _headword_sqlite(encoding: str, **options) -> "sqliteSortKeyType": def sortKey(words: "List[str]"): return words[0].encode(encoding, errors="replace") return [ ( "headword", "TEXT" if encoding == "utf-8" else "BLOB", sortKey, ), ] def _headword_lower_normal(encoding: str, **options) -> "sortKeyType": def sortKey(words: "List[str]"): return words[0].lower().encode(encoding, errors="replace") return sortKey def _headword_lower_sqlite(encoding: str, **options) -> "sqliteSortKeyType": def sortKey(words: "List[str]"): return words[0].lower().encode(encoding, errors="replace") return [ ( "headword_lower", "TEXT" if encoding == "utf-8" else "BLOB", sortKey, ), ] def _headword_bytes_lower_normal(encoding: str, **options) -> "sortKeyType": def sortKey(words: "List[str]"): return words[0].encode(encoding, errors="replace").lower() return sortKey def _headword_bytes_lower_sqlite(encoding: str, **options) \ -> "sqliteSortKeyType": def sortKey(words: "List[str]"): return words[0].encode(encoding, errors="replace").lower() return [ ( "headword_blower", "TEXT" if encoding == "utf-8" else "BLOB", sortKey, ), ] def _stardict_normal(encoding: str, **options) -> "sortKeyType": def sortKey(words: "List[str]"): b_word = words[0].encode(encoding, errors="replace") return (b_word.lower(), b_word) return sortKey def _stardict_sqlite(encoding: str, **options) -> "sqliteSortKeyType": def headword_lower(words: "List[str]"): return words[0].encode(encoding, errors="replace").lower() def headword(words: "List[str]"): return words[0].encode(encoding, errors="replace") _type = "TEXT" if encoding == "utf-8" else "BLOB" return [ ( "headword_lower", _type, headword_lower, ), ( "headword", _type, headword, ), ] def _ebook_normal(encoding: str, **options) -> "sortKeyType": length = options.get("group_by_prefix_length", 2) def sortKey(words: "List[str]"): word = words[0] if not word: return "", "" prefix = word[:length].lower() if prefix[0] < "a": return "SPECIAL", word return prefix, word return sortKey def _ebook_sqlite(encoding: str, **options) -> "sqliteSortKeyType": length = options.get("group_by_prefix_length", 2) def getPrefix(words: "List[str]"): word = words[0] if not word: return "" prefix = word[:length].lower() if prefix[0] < "a": return "SPECIAL" return prefix def headword(words: "List[str]"): return words[0].encode(encoding, errors="replace") _type = "TEXT" if encoding == "utf-8" else "BLOB" return [ ( "prefix", _type, getPrefix, ), ( "headword", _type, headword, ), ] def _ebook_length3_normal(encoding: str, **options) -> "sortKeyType": return _ebook_normal( encoding, group_by_prefix_length=3, ) def _ebook_length3_sqlite(encoding: str, **options) -> "sqliteSortKeyType": return _ebook_sqlite( encoding, group_by_prefix_length=3, ) _dicformids_re_punc = re.compile( r"[!\"$§$%&/()=?´`\\{}\[\]^°+*~#'-_.:,;<>@]*", # FIXME: | ) def _dicformids_normal(encoding: str, **options) -> "sortKeyType": re_punc = _dicformids_re_punc re_spaces = re.compile(" +") re_tabs = re.compile("\t+") def sortKey(words: "List[str]") -> "Any": word = words[0] word = word.strip() # looks like we need to remove tabs, 
because app gives error # but based on the java code, all punctuations should be removed # as well, including '|' which is used to separate alternate words # FIXME # word = word.replace("|", " ") word = re_punc.sub("", word) word = re_spaces.sub(" ", word) word = re_tabs.sub(" ", word) word = word.lower() return word return sortKey def _dicformids_sqlite(encoding: str, **options) -> "sqliteSortKeyType": return [ ( "headword_norm", "TEXT", _dicformids_normal(encoding, **options), ), ] def _random_normal(encoding: str, **options) -> "sortKeyType": from random import random return lambda words: random() def _random_sqlite(encoding: str, **options) -> "sqliteSortKeyType": from random import random return [ ( "random", "REAL", lambda words: random(), ), ] namedSortKeyList = [ NamedSortKey( name="headword", normal=_headword_normal, sqlite=_headword_sqlite, desc="Headword", ), NamedSortKey( name="headword_lower", normal=_headword_lower_normal, sqlite=_headword_lower_sqlite, desc="Lowercase Headword", ), NamedSortKey( name="headword_bytes_lower", normal=_headword_bytes_lower_normal, sqlite=_headword_bytes_lower_sqlite, desc="ASCII-Lowercase Headword", ), NamedSortKey( name="stardict", normal=_stardict_normal, sqlite=_stardict_sqlite, desc="StarDict", ), NamedSortKey( name="ebook", normal=_ebook_normal, sqlite=_ebook_sqlite, desc="E-Book (prefix length: 2)", ), NamedSortKey( name="ebook_length3", normal=_ebook_length3_normal, sqlite=_ebook_length3_sqlite, desc="E-Book (prefix length: 3)", ), NamedSortKey( name="dicformids", normal=_dicformids_normal, sqlite=_dicformids_sqlite, desc="DictionaryForMIDs", ), NamedSortKey( name="random", normal=_random_normal, sqlite=_random_sqlite, desc="Random", ), ] namedSortKeyByName = { item.name: item for item in namedSortKeyList } """ https://en.wikipedia.org/wiki/UTF-8#Comparison_with_other_encodings Sorting order: The chosen values of the leading bytes means that a list of UTF-8 strings can be sorted in code point order by sorting the corresponding byte sequences. """ pyglossary-4.5.0/pyglossary/sq_entry_list.py000066400000000000000000000122131417733132500214260ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
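# A self-contained sketch (not part of the original module) of the technique
# SqEntryList implements below: entries are stored as pickled blobs next to
# their sort columns in SQLite, an index is created once before iteration,
# and rows stream back in sorted order instead of being sorted in memory.
# All names and sample data here are illustrative.
def _demoSqliteBackedSort():
	import sqlite3
	from pickle import dumps as _dumps, loads as _loads
	con = sqlite3.connect(":memory:")
	con.execute("CREATE TABLE data (headword TEXT, pickle BLOB)")
	for word, defi in [("zebra", "stripes"), ("apple", "fruit")]:
		con.execute(
			"INSERT INTO data(headword, pickle) VALUES (?, ?)",
			(word, _dumps((word, defi), protocol=4)),
		)
	con.commit()
	con.execute("CREATE INDEX sortkey ON data(headword)")
	rows = [
		_loads(blob)
		for (blob,) in con.execute("SELECT pickle FROM data ORDER BY headword")
	]
	con.close()
	return rows  # [("apple", "fruit"), ("zebra", "stripes")]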
from pickle import dumps, loads import os from os.path import isfile from .entry import Entry import logging log = logging.getLogger("pyglossary") PICKLE_PROTOCOL = 4 # Pickle protocol 4 performed better than protocol 5 on Python 3.9.2 # Slightly lower running time, lower memory usage, and same .db file size # Pickle protocol 5 added in Python 3.8 PEP 574 # Pickle protocol 4 added in Python 3.4 PEP 3154 # Pickle Protocol 3 added in Python 3.0 # https://docs.python.org/3/library/pickle.html class SqEntryList(list): def __init__( self, glos, filename: str, create: bool = True, persist: bool = False, ): """ sqliteSortKey[i] == (name, type, valueFunc) persist: do not delete the file when variable is deleted """ from sqlite3 import connect self._glos = glos self._filename = filename self._persist = persist self._con = connect(filename) self._cur = self._con.cursor() if not filename: raise ValueError(f"invalid filename={filename!r}") self._orderBy = "rowid" self._sorted = False self._reverse = False self._len = 0 self._create = create self._sqliteSortKey = None self._columnNames = "" def setSortKey( self, namedSortKey: "NamedSortKey", sortEncoding: "Optional[str]", writeOptions: "Dict[str, Any]", ): """ sqliteSortKey[i] == (name, type, valueFunc) """ if self._sqliteSortKey is not None: raise RuntimeError("Called setSortKey twice") sqliteSortKey = namedSortKey.sqlite(sortEncoding, **writeOptions) self._sqliteSortKey = sqliteSortKey self._columnNames = ",".join([ col[0] for col in sqliteSortKey ]) if self._create: colDefs = ",".join([ f"{col[0]} {col[1]}" for col in sqliteSortKey ] + ["pickle BLOB"]) self._con.execute( f"CREATE TABLE data ({colDefs})" ) else: self._parseExistingIndex() def __len__(self): return self._len def append(self, entry): rawEntry = entry.getRaw(self._glos) self._len += 1 colCount = len(self._sqliteSortKey) try: values = [ col[2](entry.l_word) for col in self._sqliteSortKey ] except Exception: log.critical(f"error in _sqliteSortKey funcs for rawEntry = {rawEntry!r}") raise try: pickleEntry = dumps(rawEntry, protocol=PICKLE_PROTOCOL) except Exception: log.critical(f"error in pickle.dumps for rawEntry = {rawEntry!r}") raise self._cur.execute( f"insert into data({self._columnNames}, pickle)" f" values (?{', ?' 
* colCount})", values + [pickleEntry], ) if self._len % 1000 == 0: self._con.commit() def __iadd__(self, other): for item in other: self.append(item) return self def sort(self, reverse=False): if self._sorted: raise NotImplementedError("can not sort more than once") self._reverse = reverse self._sorted = True sortColumnNames = self._columnNames self._orderBy = sortColumnNames if reverse: self._orderBy = ",".join([ f"{col[0]} DESC" for col in self._sqliteSortKey ]) self._con.commit() self._con.execute( f"CREATE INDEX sortkey ON data({sortColumnNames});" ) self._con.commit() def _parseExistingIndex(self) -> bool: self._cur.execute("select sql FROM sqlite_master WHERE name='sortkey'") row = self._cur.fetchone() if row is None: return False sql = row[0] # sql == "CREATE INDEX sortkey ON data(wordlower,word)" i = sql.find("(") if i < 0: log.error(f"error parsing index sql={sql!r}") return False j = sql.find(")", i) if j < 0: log.error(f"error parsing index sql={sql!r}") return False columnNames = sql[i + 1:j] self._sorted = True self._orderBy = columnNames return True def deleteAll(self): if self._con is None: return self._con.execute( f"DELETE FROM data;" ) self._con.commit() self._len = 0 def clear(self): self.close() def close(self): if self._con is None: return self._con.commit() self._cur.close() self._con.close() self._con = None self._cur = None def __del__(self): try: self.close() if not self._persist and isfile(self._filename): os.remove(self._filename) except AttributeError as e: log.error(str(e)) def __iter__(self): glos = self._glos query = f"SELECT pickle FROM data ORDER BY {self._orderBy}" self._cur.execute(query) for row in self._cur: yield Entry.fromRaw( glos, loads(row[0]), defaultDefiFormat=glos._defaultDefiFormat, ) pyglossary-4.5.0/pyglossary/text_reader.py000066400000000000000000000112321417733132500210350ustar00rootroot00000000000000from pyglossary.file_utils import fileCountLines from pyglossary.entry_base import BaseEntry from pyglossary.entry import Entry, DataEntry from pyglossary.compression import ( compressionOpen, stdCompressions, ) from pyglossary.glossary_type import GlossaryType import os from os.path import isfile import logging log = logging.getLogger("pyglossary") class TextFilePosWrapper(object): def __init__(self, fileobj, encoding): self.fileobj = fileobj self._encoding = encoding self.pos = 0 def __iter__(self): return self def close(self): self.fileobj.close() def __next__(self): line = self.fileobj.__next__() self.pos += len(line.encode(self._encoding)) return line def tell(self): return self.pos class TextGlossaryReader(object): _encoding = "utf-8" compressions = stdCompressions def __init__(self, glos: GlossaryType, hasInfo: bool = True): self._glos = glos self._filename = "" self._file = None self._hasInfo = hasInfo self._pendingEntries = [] self._wordCount = 0 self._fileSize = 0 self._pos = -1 self._fileCount = 1 self._fileIndex = -1 self._bufferLine = "" def readline(self): if self._bufferLine: line = self._bufferLine self._bufferLine = "" return line try: return next(self._file) except StopIteration: return "" def _open(self, filename: str) -> None: self._fileIndex += 1 log.info(f"Reading file: {filename}") cfile = compressionOpen(filename, mode="rt", encoding=self._encoding) if not self._wordCount: cfile.seek(0, 2) self._fileSize = cfile.tell() cfile.seek(0) log.debug(f"File size of {filename}: {self._fileSize}") self._glos.setInfo("input_file_size", f"{self._fileSize}") self._file = TextFilePosWrapper(cfile, self._encoding) if self._hasInfo: 
self.loadInfo() def open(self, filename: str) -> None: self._filename = filename self._open(filename) def openNextFile(self) -> bool: self.close() nextFilename = f"{self._filename}.{self._fileIndex + 1}" if isfile(nextFilename): self._open(nextFilename) return True for ext in self.compressions: if isfile(f"{nextFilename}.{ext}"): self._open(f"{nextFilename}.{ext}") return True if self._fileCount != -1: log.warning(f"next file not found: {nextFilename}") return False def close(self) -> None: if not self._file: return try: self._file.close() except Exception: log.exception(f"error while closing file {self._filename!r}") self._file = None def newEntry(self, word, defi) -> "BaseEntry": byteProgress = None if self._fileSize: byteProgress = (self._file.tell(), self._fileSize) return self._glos.newEntry( word, defi, byteProgress=byteProgress, ) def setInfo(self, word: str, defi: str) -> None: self._glos.setInfo(word, defi) def loadInfo(self) -> None: self._pendingEntries = [] try: while True: wordDefi = self.nextPair() if not wordDefi: continue word, defi = wordDefi if not self.isInfoWords(word): self._pendingEntries.append(self.newEntry(word, defi)) break if isinstance(word, list): word = [self.fixInfoWord(w) for w in word] else: word = self.fixInfoWord(word) if not word: continue if not defi: continue self.setInfo(word, defi) except StopIteration: pass if self._fileIndex == 0: fileCountStr = self._glos.getInfo("file_count") if fileCountStr: self._fileCount = int(fileCountStr) self._glos.setInfo("file_count", "") def __iter__(self) -> "Iterator[BaseEntry]": resPathSet = set() while True: self._pos += 1 if self._pendingEntries: yield self._pendingEntries.pop(0) continue ### try: wordDefi = self.nextPair() except StopIteration as e: if self._fileCount == -1 or self._fileIndex < self._fileCount - 1: if self.openNextFile(): continue self._wordCount = self._pos break if not wordDefi: yield None continue word, defi = wordDefi if isinstance(defi, tuple): defi, resList = defi for relPath, fullPath in resList: if relPath in resPathSet: continue resPathSet.add(relPath) yield DataEntry( fname=relPath, tmpPath=fullPath, ) yield self.newEntry(word, defi) def __len__(self) -> int: return self._wordCount def isInfoWord(self, word: str) -> bool: raise NotImplementedError def isInfoWords(self, arg: "Union[str, List[str]]") -> bool: if isinstance(arg, str): return self.isInfoWord(arg) if isinstance(arg, list): return self.isInfoWord(arg[0]) raise TypeError(f"bad argument {arg}") def fixInfoWord(self, word: str) -> bool: raise NotImplementedError def nextPair(self) -> "Tuple[str, str]": raise NotImplementedError pyglossary-4.5.0/pyglossary/text_utils.py000066400000000000000000000112771417733132500207440ustar00rootroot00000000000000# -*- coding: utf-8 -*- # text_utils.py # # Copyright © 2008-2022 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
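# A round-trip sketch (not part of the original module) showing why the
# escape helpers below exist: escapeNTB/unescapeNTB protect newline, tab and
# backslash so a whole entry fits on one tab-separated line (see the Tabfile
# plugin), and bar=True additionally protects "|", which joinByBar/splitByBar
# use to separate alternate headwords. The sample strings are made up.
def _demoEscapeRoundTrip():
	line = escapeNTB("head|alt", bar=True) + "\t" + escapeNTB("line1\nline2")
	word, _, defi = line.partition("\t")
	assert unescapeNTB(word, bar=True) == "head|alt"
	assert unescapeNTB(defi) == "line1\nline2"
	assert splitByBar(joinByBar(["head", "al|t"])) == ["head", "al|t"]
	return line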
import string import sys import os import re import struct import binascii import logging from . import core log = logging.getLogger("pyglossary") endFormat = "\x1b[0;0;0m" # len=8 def toBytes(s: "AnyStr") -> bytes: return bytes(s, "utf-8") if isinstance(s, str) else bytes(s) def toStr(s: "AnyStr") -> str: return str(s, "utf-8") if isinstance(s, bytes) else str(s) def fixUtf8(st: "AnyStr") -> str: return toBytes(st).replace(b"\x00", b"").decode("utf-8", "replace") pattern_n_us = re.compile(r"((?<!\\)(?:\\\\)*)\\n") pattern_t_us = re.compile(r"((?<!\\)(?:\\\\)*)\\t") pattern_bar_us = re.compile(r"((?<!\\)(?:\\\\)*)\\\|") pattern_bar_sp = re.compile(r"(?:(?<!\\)(?:\\\\)*)\|") b_pattern_bar_us = re.compile(rb"((?<!\\)(?:\\\\)*)\\\|") def replaceStringTable( rplList: "List[Tuple[str, str]]", ) -> "Callable[[str], str]": def replace(st: str) -> str: for rpl in rplList: st = st.replace(rpl[0], rpl[1]) return st return replace def escapeNTB(st: str, bar: bool = False) -> str: """ escapes Newline, Tab, Backslash, and vertical Bar (if bar=True) """ st = st.replace("\\", "\\\\") st = st.replace("\t", r"\t") st = st.replace("\r", "") st = st.replace("\n", r"\n") if bar: st = st.replace("|", r"\|") return st def unescapeNTB(st: str, bar: bool = False) -> str: """ unescapes Newline, Tab, Backslash, and vertical Bar (if bar=True) """ st = pattern_n_us.sub("\\1\n", st) st = pattern_t_us.sub("\\1\t", st) if bar: st = pattern_bar_us.sub(r"\1|", st) st = st.replace("\\\\", "\\") # probably faster than re.sub return st def splitByBarUnescapeNTB(st: str) -> "List[str]": """ splits by "|" (and not "\\|") then unescapes Newline (\\n), Tab (\\t), Backslash (\\) and Bar (\\|) in each part returns a list """ return [ unescapeNTB(part, bar=True) for part in pattern_bar_sp.split(st) ] def escapeBar(st: str) -> str: r""" escapes vertical bar (\|) """ st = st.replace("\\", "\\\\") st = st.replace("|", r"\|") return st def unescapeBar(st: str) -> str: r""" unescapes vertical bar (\|) """ st = pattern_bar_us.sub(r"\1|", st) st = st.replace("\\\\", "\\") # probably faster than re.sub return st def splitByBar(st: str) -> "List[str]": """ splits by "|" (and not "\\|") then unescapes Backslash (\\) and Bar (\\|) in each part """ return [ unescapeBar(part) for part in pattern_bar_sp.split(st) ] def joinByBar(parts: "List[str]") -> "str": return "|".join([ escapeBar(part) for part in parts ]) def unescapeBarBytes(st: bytes) -> bytes: r""" unescapes vertical bar (\|) """ st = b_pattern_bar_us.sub(b"\\1|", st) st = st.replace(b"\\\\", b"\\") # probably faster than re.sub return st # return a message string describing the current exception def excMessage() -> str: i = sys.exc_info() return f"{i[0].__name__}: {i[1]}" def formatHMS(h: int, m: int, s: int) -> str: if h == 0: if m == 0: return f"{s:02d}" else: return f"{m:02d}:{s:02d}" else: return f"{h:02d}:{m:02d}:{s:02d}" # ___________________________________________ # def uint32ToBytes(n: int) -> bytes: return struct.pack('>I', n) def uint32FromBytes(bs: bytes) -> int: return struct.unpack('>I', bs)[0] def uintFromBytes(bs: bytes) -> int: n = 0 for c in bs: n = (n << 8) + c return n def crc32hex(bs: bytes) -> str: return struct.pack('>I', binascii.crc32(bs) & 0xffffffff).hex() # ___________________________________________ # def urlToPath(url: str) -> str: from urllib.parse import unquote if not url.startswith("file://"): return unquote(url) path = url[7:] if path[-2:] == "\r\n": path = path[:-2] elif path[-1] == "\r": path = path[:-1] # here convert html unicode symbols to utf-8 string: return unquote(path) def replacePostSpaceChar(st: str, ch: str) -> str: st = ( st.replace(f" {ch}", ch) .replace(ch, f"{ch} ") .replace(f"{ch}  ", f"{ch} ") ) if st.endswith(" "): st = st[:-1] return st def isASCII(data: str) -> bool: for c in data: if ord(c) >= 128: return False return True 
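# A made-up example (not from the original source) of what uint32ToBytes and
# uint32FromBytes above are used for: a StarDict .idx record, as written by
# the StarDict plugin earlier in this tree, is headword + NUL + big-endian
# offset + big-endian size. The numbers below are arbitrary.
def _demoIdxRecord():
	import struct
	record = b"apple\x00" + struct.pack(">I", 1024) + struct.pack(">I", 210)
	word, _, tail = record.partition(b"\x00")
	offset = struct.unpack(">I", tail[:4])[0]  # 1024: where the defi starts in .dict
	size = struct.unpack(">I", tail[4:8])[0]  # 210: length of the defi block
	return word.decode("utf-8"), offset, size  # ("apple", 1024, 210)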
pyglossary-4.5.0/pyglossary/text_utils_extra.py000066600000000000000000000006451417733132500221440ustar00rootroot00000000000000import string def chBaseIntToStr(number, base): """ reverse function of int(str, base) and long(str, base) """ if not 2 <= base <= 36: raise ValueError('base must be in 2..36') abc = string.digits + string.ascii_letters result = '' if number < 0: number = -number sign = '-' else: sign = '' while True: number, rdigit = divmod(number, base) result = abc[rdigit] + result if number == 0: return sign + result pyglossary-4.5.0/pyglossary/text_writer.py000066400000000000000000000136531417733132500211160ustar00rootroot00000000000000import os import logging from os.path import ( isdir, splitext, ) from pyglossary.compression import compressionOpen as c_open log = logging.getLogger("pyglossary") file_size_check_every = 100 class TextGlossaryWriter(object): _encoding = "utf-8" _newline = "\n" _wordListEncodeFunc: "Optional[Callable[[List[str]], str]]" = None _wordEscapeFunc: "Optional[Callable[[str], str]]" = None _defiEscapeFunc: "Optional[Callable[[str], str]]" = None _ext: str = ".txt" _head: str = "" _tail: str = "" _resources: bool = True _file_size_approx: int = 0 _word_title: bool = False def __init__( self, glos: "GlossaryType", entryFmt: str = "", # contain {word} and {defi} writeInfo: bool = True, outInfoKeysAliasDict: "Optional[Dict[str, str]]" = None, ) -> None: self._glos = glos self._filename = "" self._file = None self._resDir = "" if not entryFmt: raise ValueError("entryFmt argument is missing") self._entryFmt = entryFmt self._writeInfo = writeInfo if not outInfoKeysAliasDict: outInfoKeysAliasDict = {} self._outInfoKeysAliasDict = outInfoKeysAliasDict # TODO: replace outInfoKeysAliasDict arg with a func? def setAttrs( self, encoding=None, newline=None, wordListEncodeFunc=None, wordEscapeFunc=None, defiEscapeFunc=None, ext=None, head=None, tail=None, resources=None, word_title=None, file_size_approx=None, ): if encoding is not None: self._encoding = encoding if newline is not None: self._newline = newline if wordListEncodeFunc is not None: self._wordListEncodeFunc = wordListEncodeFunc if wordEscapeFunc is not None: self._wordEscapeFunc = wordEscapeFunc if defiEscapeFunc is not None: self._defiEscapeFunc = defiEscapeFunc if ext is not None: self._ext = ext if head is not None: self._head = head if tail is not None: self._tail = tail if resources is not None: self._resources = resources if word_title is not None: self._word_title = word_title if file_size_approx is not None: self._file_size_approx = file_size_approx def open(self, filename: str) -> None: if self._file_size_approx > 0: self._glos.setInfo("file_count", "-1") self._open(filename) self._filename = filename self._resDir = f"{filename}_res" if not isdir(self._resDir): os.mkdir(self._resDir) def _open(self, filename: str): if not filename: filename = self._glos.filename + self._ext _file = self._file = c_open( filename, mode="wt", encoding=self._encoding, newline=self._newline, ) _file.write(self._head) if self._writeInfo: entryFmt = self._entryFmt outInfoKeysAliasDict = self._outInfoKeysAliasDict wordEscapeFunc = self._wordEscapeFunc defiEscapeFunc = self._defiEscapeFunc for key, value in self._glos.iterInfo(): # both key and value are supposed to be non-empty string if not (key and value): log.warning(f"skipping info key={key!r}, value={value!r}") continue key = outInfoKeysAliasDict.get(key, key) if not key: continue word = f"##{key}" if wordEscapeFunc is not None: word = wordEscapeFunc(word) if not word: continue 
if defiEscapeFunc is not None: value = defiEscapeFunc(value) if not value: continue _file.write(entryFmt.format( word=word, defi=value, )) _file.flush() return _file def write(self): glos = self._glos _file = self._file entryFmt = self._entryFmt wordListEncodeFunc = self._wordListEncodeFunc wordEscapeFunc = self._wordEscapeFunc defiEscapeFunc = self._defiEscapeFunc resources = self._resources word_title = self._word_title file_size_approx = self._file_size_approx entryCount = 0 fileIndex = 0 while True: entry = yield if entry is None: break if entry.isData(): if resources: entry.save(self._resDir) continue word = entry.s_word defi = entry.defi # if glos.alts: # FIXME if word_title: defi = glos.wordTitleStr(entry.l_word[0]) + defi if wordListEncodeFunc is not None: word = wordListEncodeFunc(entry.l_word) elif wordEscapeFunc is not None: word = wordEscapeFunc(word) if defiEscapeFunc is not None: defi = defiEscapeFunc(defi) _file.write(entryFmt.format(word=word, defi=defi)) if file_size_approx > 0: entryCount += 1 if entryCount % file_size_check_every == 0: if _file.tell() >= file_size_approx: fileIndex += 1 _file = self._open(f"{self._filename}.{fileIndex}") def finish(self): if self._tail: self._file.write(self._tail) self._file.close() if not os.listdir(self._resDir): os.rmdir(self._resDir) def writeTxt( glos: "GlossaryType", entryFmt: str = "", # contain {word} and {defi} filename: str = "", writeInfo: bool = True, wordEscapeFunc: "Optional[Callable]" = None, defiEscapeFunc: "Optional[Callable]" = None, ext: str = ".txt", head: str = "", tail: str = "", outInfoKeysAliasDict: "Optional[Dict[str, str]]" = None, encoding: str = "utf-8", newline: str = "\n", resources: bool = True, word_title: bool = False, ) -> "Generator[None, BaseEntry, None]": writer = TextGlossaryWriter( glos, entryFmt=entryFmt, writeInfo=writeInfo, outInfoKeysAliasDict=outInfoKeysAliasDict, ) writer.setAttrs( encoding=encoding, newline=newline, wordEscapeFunc=wordEscapeFunc, defiEscapeFunc=defiEscapeFunc, ext=ext, head=head, tail=tail, resources=resources, word_title=word_title, ) writer.open(filename) yield from writer.write() writer.finish() def writeTabfile( glos: "GlossaryType", filename: str = "", encoding: str = "utf-8", resources: bool = True, ) -> "Generator[None, BaseEntry, None]": from pyglossary.text_utils import escapeNTB writer = TextGlossaryWriter( glos, entryFmt="{word}\t{defi}\n", outInfoKeysAliasDict=None, ) writer.setAttrs( encoding=encoding, wordEscapeFunc=escapeNTB, defiEscapeFunc=escapeNTB, ext=".txt", resources=resources, ) writer.open(filename) yield from writer.write() writer.finish() pyglossary-4.5.0/pyglossary/ui/000077500000000000000000000000001417733132500165735ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/ui/__init__.py000066400000000000000000000000001417733132500206720ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/ui/base.py000066400000000000000000000142571417733132500200700ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2012-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . from os.path import join, isfile import logging from collections import OrderedDict from pyglossary.core import ( rootConfJsonFile, confJsonFile, rootDir, dataDir, appResDir, ) from pyglossary.option import ( BoolOption, StrOption, IntOption, FloatOption, ) def fread(path): with open(path, encoding="utf-8") as fp: return fp.read() log = logging.getLogger("pyglossary") logo = join(appResDir, "pyglossary.png") aboutText = fread(join(dataDir, "about")) licenseText = fread(join(dataDir, "license-dialog")) authors = fread(join(dataDir, "AUTHORS")).split("\n") summary = "A tool for converting dictionary files aka glossaries with" \ " various formats for different dictionary applications" class UIBase(object): configDefDict = OrderedDict([ ("log_time", BoolOption( hasFlag=True, comment="Show date and time in logs", falseComment="Do not show date and time in logs", )), ("cleanup", BoolOption( hasFlag=True, comment="Cleanup cache or temporary files after conversion", falseComment="Do not cleanup cache or temporary files after conversion", )), ("auto_sqlite", BoolOption( hasFlag=False, comment=( "Auto-enable --sqlite to limit RAM usage when direct\n" "mode is not possible. Can override with --no-sqlite" ), )), ("lower", BoolOption( hasFlag=True, comment="Lowercase words before writing", falseComment="Do not lowercase words before writing", )), ("utf8_check", BoolOption( hasFlag=True, comment="Ensure entries contain valid UTF-8 strings", falseComment="Do not ensure entries contain valid UTF-8 strings", )), ("enable_alts", BoolOption( hasFlag=True, customFlag="alts", comment="Enable alternates", falseComment="Disable alternates", )), # FIXME: replace with "resources" # comment="Use resources (images, audio, etc)" ("skip_resources", BoolOption( hasFlag=True, comment="Skip resources (images, audio, css, etc)", )), ("rtl", BoolOption( hasFlag=True, comment=( "Right-To-Left all (HTML) definitions" ), )), ("remove_html", StrOption( hasFlag=True, comment=( "Remove given HTML tags (comma-separated)\n" "from definitions" ), )), ("remove_html_all", BoolOption( hasFlag=True, comment="Remove all HTML tags from definitions", )), ("normalize_html", BoolOption( hasFlag=True, comment="Lowercase and normalize HTML tags in definitions", )), ("save_info_json", BoolOption( hasFlag=True, customFlag="info", comment="Save .info file alongside output file(s)", )), ("color.enable.cmd.unix", BoolOption( hasFlag=False, comment="Enable colors in Linux/Unix command line" )), ("color.enable.cmd.windows", BoolOption( hasFlag=False, comment="Enable colors in Windows command line" )), ("color.cmd.critical", IntOption( hasFlag=False, comment="Color code for critical errors in command line", )), ("color.cmd.error", IntOption( hasFlag=False, comment="Color code for errors in command line", )), ("color.cmd.warning", IntOption( hasFlag=False, comment="Color code for warnings in command line", )), # interactive command line interface ("cmdi.prompt.indent.str", StrOption(hasFlag=False)), ("cmdi.prompt.indent.color", IntOption(hasFlag=False)), ("cmdi.prompt.msg.color", IntOption(hasFlag=False)), ("cmdi.msg.color", IntOption(hasFlag=False)), ("ui_autoSetFormat", BoolOption(hasFlag=False)), ("reverse_matchWord", BoolOption(hasFlag=False)), ("reverse_showRel", StrOption(hasFlag=False)), ("reverse_saveStep", 
IntOption(hasFlag=False)), ("reverse_minRel", FloatOption(hasFlag=False)), ("reverse_maxNum", IntOption(hasFlag=False)), ("reverse_includeDefs", BoolOption(hasFlag=False)), ]) conflictingParams = [ ("sqlite", "direct"), ("remove_html", "remove_html_all"), ] def __init__(self, **kwargs): self.config = {} def progressInit(self, title): pass def progress(self, rat, text=""): pass def progressEnd(self): self.progress(1.0) def loadConfig( self, user: bool = True, **options ): from pyglossary.json_utils import jsonToData data = jsonToData(fread(rootConfJsonFile)) if user and isfile(confJsonFile): try: userData = jsonToData(fread(confJsonFile)) except Exception: log.exception( f"error while loading user config file {confJsonFile!r}" ) else: data.update(userData) for key in self.configDefDict: try: self.config[key] = data.pop(key) except KeyError: pass for key, value in data.items(): log.warning( f"unknown config key {key!r}, you may edit {confJsonFile}" " file and remove this key" ) for key, value in options.items(): if key in self.configDefDict: self.config[key] = value log.setTimeEnable(self.config["log_time"]) log.debug(f"loaded config: {self.config}") return True def saveConfig(self): from pyglossary.json_utils import dataToPrettyJson config = OrderedDict() for key, option in self.configDefDict.items(): if key not in self.config: log.warning(f"saveConfig: missing key {key!r}") continue value = self.config[key] if not option.validate(value): log.error(f"saveConfig: invalid {key}={value!r}") continue config[key] = value jsonStr = dataToPrettyJson(config) with open(confJsonFile, mode="wt", encoding="utf-8") as _file: _file.write(jsonStr) log.info(f"saved {confJsonFile!r}") pyglossary-4.5.0/pyglossary/ui/dependency.py000066400000000000000000000026121417733132500212640ustar00rootroot00000000000000# -*- coding: utf-8 -*- # dependency.py # # Copyright © 2019-2019 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
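# Illustrative sketch of the contract of checkDepends() below (added for clarity; the module/package names here are hypothetical examples, not real plugin requirements): # # >>> checkDepends({"lxml": "lxml", "bs4": "beautifulsoup4"}) # ['beautifulsoup4'] # if lxml imports fine but bs4 does not # # keys are importable module names, values are the pip package names reported back for every module that fails to import 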
from pyglossary.glossary import Glossary # reqs = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']) # ^ this takes about 3 seconds # installed_packages = set(r.decode().split('==')[0] for r in reqs.split()) def checkDepends(depends: "Dict[str, str]") -> "List[str]": "returns the list of uninstalled dependencies" if not depends: return [] uninstalled = [] for moduleName, pkgName in depends.items(): try: __import__(moduleName) except ModuleNotFoundError: uninstalled.append(pkgName) return uninstalled pyglossary-4.5.0/pyglossary/ui/gtk3_utils/000077500000000000000000000000001417733132500206635ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/ui/gtk3_utils/__init__.py000066400000000000000000000001121417733132500227660ustar00rootroot00000000000000from gi.repository import Gtk as gtk from gi.repository import Gdk as gdk pyglossary-4.5.0/pyglossary/ui/gtk3_utils/about.py000066400000000000000000000070421417733132500223520ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2020 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from . import * from .utils import ( imageFromFile, VBox, pack, ) class AboutWidget(gtk.Box): def __init__( self, logo: str = "", header: str = "", about: str = "", authors: str = "", license: str = "", **kwargs, ): gtk.Box.__init__(self, orientation=gtk.Orientation.VERTICAL) ## headerBox = gtk.Box(orientation=gtk.Orientation.HORIZONTAL) if logo: headerBox.pack_start(imageFromFile(logo), False, False, 0) headerLabel = gtk.Label(label=header) headerLabel.set_selectable(True) headerBox.pack_start(headerLabel, False, False, 15) headerBox.show_all() self.pack_start(headerBox, False, False, 0) ## notebook = gtk.Notebook() self.notebook = notebook self.pack_start(notebook, True, True, 5) notebook.set_tab_pos(gtk.PositionType.LEFT) ## tab1_about = self.newTabLabelWidget(about) tab2_authors = self.newTabWidgetTextView(authors) tab3_license = self.newTabWidgetTextView(license) ## tabs = [ (tab1_about, self.newTabTitle("About", "dialog-information-22.png")), (tab2_authors, self.newTabTitle("Authors", "author-22.png")), (tab3_license, self.newTabTitle("License", "license-22.png")), ] ## for widget, titleW in tabs: notebook.append_page(widget, titleW) ## self.show_all() # Somethig does not work with TextView def newTabWidgetTextView( self, text: str, wrap: bool = False, justification: "Optional[gtk.Justification]" = None, ): tv = gtk.TextView() if wrap: tv.set_wrap_mode(gtk.WrapMode.WORD) if justification is not None: tv.set_justification(justification) tv.set_cursor_visible(False) tv.set_border_width(10) buf = tv.get_buffer() # buf.insert_markup(buf.get_end_iter(), markup=text, len=len(text.encode("utf-8"))) buf.set_text(text) tv.show_all() swin = gtk.ScrolledWindow() swin.set_policy(gtk.PolicyType.AUTOMATIC, gtk.PolicyType.AUTOMATIC) swin.set_border_width(0) swin.add(tv) return swin def newTabLabelWidget( self, text: str, wrap: bool = False, justification: 
"Optional[gtk.Justification]" = None, ): box = VBox() box.set_border_width(10) label = gtk.Label() label.set_selectable(True) label.set_xalign(0) label.set_yalign(0) pack(box, label, 0, 0) #if wrap: # tv.set_wrap_mode(gtk.WrapMode.WORD) #if justification is not None: # tv.set_justification(justification) # label.set_cursor_visible(False) # label.set_border_width(10) label.set_markup(text) label.show_all() swin = gtk.ScrolledWindow() swin.set_policy(gtk.PolicyType.AUTOMATIC, gtk.PolicyType.AUTOMATIC) swin.set_border_width(0) swin.add(box) return swin def newTabTitle(self, title: str, icon: str): box = gtk.Box(orientation=gtk.Orientation.VERTICAL) if icon: box.pack_start(imageFromFile(icon), False, False, 5) if title: box.pack_start(gtk.Label(label=title), False, False, 5) box.show_all() return box pyglossary-4.5.0/pyglossary/ui/gtk3_utils/dialog.py000066400000000000000000000024521417733132500224770ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016-2017 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from gi.repository import Gtk as gtk from gi.repository import Gdk as gdk class MyDialog(object): def startWaiting(self): self.queue_draw() self.vbox.set_sensitive(False) self.get_window().set_cursor(gdk.Cursor.new(gdk.CursorType.WATCH)) while gtk.events_pending(): gtk.main_iteration_do(False) def endWaiting(self): self.get_window().set_cursor(gdk.Cursor.new(gdk.CursorType.LEFT_PTR)) self.vbox.set_sensitive(True) def waitingDo(self, func, *args, **kwargs): self.startWaiting() try: func(*args, **kwargs) except Exception as e: raise e finally: self.endWaiting() pyglossary-4.5.0/pyglossary/ui/gtk3_utils/resize_button.py000066400000000000000000000022671417733132500241400ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016-2017 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from . 
import * from .utils import * class ResizeButton(gtk.EventBox): def __init__(self, win, edge=gdk.WindowEdge.SOUTH_EAST): gtk.EventBox.__init__(self) self.win = win self.edge = edge ### self.image = imageFromFile('resize.png') self.add(self.image) self.connect('button-press-event', self.buttonPress) def buttonPress(self, obj, gevent): self.win.begin_resize_drag( self.edge, gevent.button, int(gevent.x_root), int(gevent.y_root), gevent.time, ) pyglossary-4.5.0/pyglossary/ui/gtk3_utils/utils.py000066400000000000000000000105211417733132500223740ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # Copyright © 2016-2019 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. from gi.repository import Pango as pango from . import * import logging from os.path import isabs, join from pyglossary.core import appResDir log = logging.getLogger("pyglossary") def VBox(**kwargs): return gtk.Box(orientation=gtk.Orientation.VERTICAL, **kwargs) def HBox(**kwargs): return gtk.Box(orientation=gtk.Orientation.HORIZONTAL, **kwargs) def set_tooltip(widget, text): try: widget.set_tooltip_text(text) # PyGTK 2.12 or above except AttributeError: try: widget.set_tooltip(gtk.Tooltips(), text) except Exception: log.exception("") def imageFromFile(path): # the file must exist if not isabs(path): path = join(appResDir, path) im = gtk.Image() try: im.set_from_file(path) except Exception: log.exception("") return im def imageFromIconName(iconName: str, size: int, nonStock=False) -> gtk.Image: # So gtk.Image.new_from_stock is deprecated # And the doc says we should use gtk.Image.new_from_icon_name # which does NOT have the same functionality! # because not all stock items are existing in all themes (even popular themes) # and new_from_icon_name does not seem to look in other (non-default) themes! 
# So for now we use new_from_stock, unless it's not a stock item # But we do not use either of these two outside this function # So that it's easy to switch if nonStock: return gtk.Image.new_from_icon_name(iconName, size) try: return gtk.Image.new_from_stock(iconName, size) except Exception: return gtk.Image.new_from_icon_name(iconName, size) def rgba_parse(colorStr): rgba = gdk.RGBA() if not rgba.parse(colorStr): raise ValueError(f"bad color string {colorStr!r}") return rgba def color_parse(colorStr): return rgba_parse(colorStr).to_color() def pack(box, child, expand=False, fill=False, padding=0): if isinstance(box, gtk.Box): box.pack_start(child, expand, fill, padding) elif isinstance(box, gtk.CellLayout): box.pack_start(child, expand) else: raise TypeError(f"pack: unknown type {type(box)}") def dialog_add_button( dialog, iconName, label, resId, onClicked=None, tooltip="", ): b = dialog.add_button(label, resId) if onClicked: b.connect("clicked", onClicked) if tooltip: set_tooltip(b, tooltip) return b def showMsg( msg, iconName="", parent=None, transient_for=None, title="", borderWidth=10, iconSize=gtk.IconSize.DIALOG, selectable=False, ): win = gtk.Dialog( parent=parent, transient_for=transient_for, ) # flags=0 makes it skip task bar if title: win.set_title(title) hbox = HBox(spacing=10) hbox.set_border_width(borderWidth) if iconName: # win.set_icon(...) pack(hbox, imageFromIconName(iconName, iconSize)) label = gtk.Label(label=msg) # set_line_wrap(True) makes the window go crazy tall (taller than screen) # and that's the reason for label.set_size_request and win.resize label.set_line_wrap(True) label.set_line_wrap_mode(pango.WrapMode.WORD) label.set_size_request(500, 1) if selectable: label.set_selectable(True) pack(hbox, label) hbox.show_all() pack(win.vbox, hbox) dialog_add_button( win, "gtk-close", "_Close", gtk.ResponseType.OK, ) win.resize(600, 1) win.run() win.destroy() def showError(msg, **kwargs): # gtk-dialog-error is deprecated since version 3.10: # Use named icon “dialog-error”. showMsg(msg, iconName="gtk-dialog-error", **kwargs) def showWarning(msg, **kwargs): # gtk-dialog-warning is deprecated since version 3.10: # Use named icon “dialog-warning”. showMsg(msg, iconName="gtk-dialog-warning", **kwargs) def showInfo(msg, **kwargs): # gtk-dialog-info is deprecated since version 3.10: # Use named icon “dialog-information”. showMsg(msg, iconName="gtk-dialog-info", **kwargs) pyglossary-4.5.0/pyglossary/ui/main.py000066400000000000000000000425221417733132500200760ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # main.py # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . 
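# Note added for illustration (not in the original file): registerConfigOption() below turns a config key into CLI flags by replacing "_" with "-". For a BoolOption that defines both `comment` and `falseComment` -- e.g. the real key "utf8_check" from UIBase -- it registers a paired --utf8-check / --no-utf8-check through StoreConstAction, and repeating a flag, or passing both flags of the pair, is reported as a parser error (see StoreConstAction.__call__). 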
import os import sys import argparse import json import logging from pyglossary import core # essential from pyglossary.entry import Entry from pyglossary.ui.base import UIBase from pyglossary.langs import langDict from pyglossary.sort_keys import namedSortKeyList, namedSortKeyByName # the first thing to do is to set up logger. # other modules also using logger "root", so it is essential to set it up prior # to importing anything else; with exception to pyglossary.core which sets up # logger class, and so should be done before actually initializing logger. # verbosity level may be given on command line, so we have to parse arguments # before setting up logger. # once more: # - import system modules like os, sys, argparse etc and pyglossary.core # - parse args # - set up logger # - import submodules # - other code # no-progress-bar only for command line UI # TODO: load ui-dependent available options from ui modules # (for example ui_cmd.available_options) # the only problem is that it has to "import gtk" before it get the # "ui_gtk.available_options" # TODO # -v (verbose or version?) # -r (reverse or read-options) log = None def canRunGUI(): if core.sysName == "linux": return bool(os.getenv("DISPLAY")) if core.sysName == "darwin": try: import tkinter except ModuleNotFoundError: return False return True class StoreConstAction(argparse.Action): def __init__( self, option_strings, same_dest="", const_value=None, nargs=0, **kwargs ): if isinstance(option_strings, str): option_strings = [option_strings] argparse.Action.__init__( self, option_strings=option_strings, nargs=nargs, **kwargs ) self.same_dest = same_dest self.const_value = const_value def __call__( self, parser=None, namespace=None, values=None, option_strings=None, required=False, dest=None, ): if not parser: return self dest = self.dest if getattr(namespace, dest) is not None: flag = self.option_strings[0] if getattr(namespace, dest) == self.const_value: parser.error(f"multiple {flag} options") else: parser.error(f"conflicting options: {self.same_dest} and {flag}") setattr(namespace, dest, self.const_value) return self def registerConfigOption(parser, key: str, option: "Option"): if not option.hasFlag: return flag = option.customFlag if not flag: flag = key.replace('_', '-') if option.typ != "bool": parser.add_argument( f"--{flag}", dest=key, default=None, help=option.comment, ) return if not option.comment: print(f"registerConfigOption: option has no comment: {option}") return if not option.falseComment: parser.add_argument( f"--{flag}", dest=key, action="store_true", default=None, help=option.comment, ) return parser.add_argument( dest=key, action=StoreConstAction( f"--{flag}", same_dest=f"--no-{flag}", const_value=True, dest=key, default=None, help=option.comment, ), ) parser.add_argument( dest=key, action=StoreConstAction( f"--no-{flag}", same_dest=f"--{flag}", const_value=False, dest=key, default=None, help=option.falseComment, ), ) def base_ui_run( inputFilename: str = "", outputFilename: str = "", inputFormat: str = "", outputFormat: str = "", reverse: bool = False, config: "Optional[Dict]" = None, readOptions: "Optional[Dict]" = None, writeOptions: "Optional[Dict]" = None, convertOptions: "Optional[Dict]" = None, glossarySetAttrs: "Optional[Dict]" = None, ): from pyglossary.glossary import Glossary if reverse: log.error("--reverse does not work with --ui=none") return False ui = UIBase() ui.loadConfig(**config) glos = Glossary(ui=ui) glos.config = ui.config if glossarySetAttrs: for attr, value in glossarySetAttrs.items(): 
setattr(glos, attr, value) glos.convert( inputFilename=inputFilename, outputFilename=outputFilename, inputFormat=inputFormat, outputFormat=outputFormat, readOptions=readOptions, writeOptions=writeOptions, **convertOptions ) return True def getGitVersion(gitDir): import subprocess try: outputB, error = subprocess.Popen( [ "git", "--git-dir", gitDir, "describe", "--always", ], stdout=subprocess.PIPE, ).communicate() except Exception as e: sys.stderr.write(str(e) + "\n") return "" # if error is None: return outputB.decode("utf-8").strip() def getVersion(): from pyglossary.core import rootDir gitDir = os.path.join(rootDir, ".git") if os.path.isdir(gitDir): version = getGitVersion(gitDir) if version: return version return core.VERSION def validateLangStr(st) -> "Optional[str]": lang = langDict[st] if lang: return lang.name lang = langDict[st.lower()] if lang: return lang.name log.error(f"unknown language {st!r}") return def main(): global log uiBase = UIBase() uiBase.loadConfig() config = uiBase.config defaultHasColor = config.get( "color.enable.cmd.windows" if os.sep == "\\" else "color.enable.cmd.unix", True, ) parser = argparse.ArgumentParser( prog=sys.argv[0], add_help=False, # allow_abbrev=False, ) parser.add_argument( "-v", "--verbosity", action="store", dest="verbosity", type=int, choices=(0, 1, 2, 3, 4, 5), required=False, default=3, ) parser.add_argument( "--version", action="store_true", ) parser.add_argument( "-h", "--help", dest="help", action="store_true", ) parser.add_argument( "-u", "--ui", dest="ui_type", default="auto", choices=( "cmd", "gtk", "tk", # "qt", "auto", "none", ), ) parser.add_argument( "--cmd", dest="ui_type", action="store_const", const="cmd", default=None, help="use command-line user interface", ) parser.add_argument( "--gtk", dest="ui_type", action="store_const", const="gtk", default=None, help="use Gtk-based user interface", ) parser.add_argument( "--tk", dest="ui_type", action="store_const", const="tk", default=None, help="use Tkinter-based user interface", ) parser.add_argument( "--interactive", "--inter", dest="interactive", action="store_true", default=None, help="switch to interactive command line interface", ) parser.add_argument( "--no-interactive", "--no-inter", dest="no_interactive", action="store_true", default=None, help=( "do not automatically switch to interactive command line" " interface, for scripts" ), ) parser.add_argument( "-r", "--read-options", dest="readOptions", default="", ) parser.add_argument( "-w", "--write-options", dest="writeOptions", default="", ) parser.add_argument( "--json-read-options", dest="jsonReadOptions", default=None, ) parser.add_argument( "--json-write-options", dest="jsonWriteOptions", default=None, ) parser.add_argument( "--read-format", dest="inputFormat", ) parser.add_argument( "--write-format", dest="outputFormat", action="store", ) parser.add_argument( "--direct", dest="direct", action="store_true", default=None, help="if possible, convert directly without loading into memory", ) parser.add_argument( "--indirect", dest="direct", action="store_false", default=None, help=( "disable `direct` mode, load full data into memory before writing" ", this is default" ), ) parser.add_argument( "--sqlite", dest="sqlite", action="store_true", default=None, help=( "use SQLite as middle storage instead of RAM in direct mode," "for very large glossaries" ), ) parser.add_argument( "--no-sqlite", dest="sqlite", action="store_false", default=None, help="do not use SQLite mode", ) parser.add_argument( "--no-progress-bar", 
dest="progressbar", action="store_false", default=None, ) parser.add_argument( "--no-color", dest="noColor", action="store_true", default=not defaultHasColor, ) parser.add_argument( "--sort", dest="sort", action="store_true", default=None, ) parser.add_argument( "--no-sort", dest="sort", action="store_false", default=None, ) parser.add_argument( "--sort-key", action="store", dest="sortKeyName", default=None, help="name of sort key", ) parser.add_argument( "--sort-encoding", action="store", dest="sortEncoding", default=None, help="encoding of sort (default utf-8)", ) # _______________________________ parser.add_argument( "--source-lang", action="store", dest="sourceLang", default=None, help="source/query language", ) parser.add_argument( "--target-lang", action="store", dest="targetLang", default=None, help="target/definition language", ) parser.add_argument( "--name", action="store", dest="name", default=None, help="glossary name/title", ) # _______________________________ parser.add_argument( "--reverse", dest="reverse", action="store_true", ) parser.add_argument( "inputFilename", action="store", default="", nargs="?", ) parser.add_argument( "outputFilename", action="store", default="", nargs="?", ) def shouldUseCMD(args): if not canRunGUI(): return True if args.interactive: return True if args.inputFilename and args.outputFilename: return True return False # _______________________________ for key, option in UIBase.configDefDict.items(): registerConfigOption(parser, key, option) # _______________________________ args = parser.parse_args() # parser.conflict_handler == "error" if args.version: print(f"PyGlossary {getVersion()}") sys.exit(0) log = logging.getLogger("pyglossary") ui_type = args.ui_type if ui_type == "none": args.noColor = True core.noColor = args.noColor logHanlder = core.StdLogHandler( noColor=args.noColor ) log.setVerbosity(args.verbosity) log.addHandler(logHanlder) # with the logger setted up, we can import other pyglossary modules, so they # can do some logging in right way. for param1, param2 in UIBase.conflictingParams: if getattr(args, param1) and getattr(args, param2): log.critical( "Conflicting flags: " f"--{param1.replace('_', '-')} and " f"--{param2.replace('_', '-')}" ) sys.exit(1) if args.sqlite: # args.direct is None by default which means automatic args.direct = False if not args.sort: if args.sortKeyName: log.critical("Passed --sort-key without --sort") sys.exit(1) if args.sortEncoding: log.critical("Passed --sort-encoding without --sort") sys.exit(1) if args.sortKeyName: if args.sortKeyName not in namedSortKeyByName: _valuesStr = ", ".join([_sk.name for _sk in namedSortKeyList]) log.critical( f"Invalid sortKeyName={args.sortKeyName!r}" f". 
Supported values:\n{_valuesStr}" ) sys.exit(1) core.checkCreateConfDir() if sys.getdefaultencoding() != "utf-8": log.warning(f"System encoding is not utf-8, it's {sys.getdefaultencoding()!r}") ############################## from pyglossary.glossary import Glossary from pyglossary.langs import langDict from pyglossary.ui.ui_cmd import help, parseFormatOptionsStr Glossary.init() if log.isDebug(): log.debug(f"en -> {langDict['en']!r}") ############################## ui_list = [ "gtk", "tk", ] # log.info(f"PyGlossary {core.VERSION}") if args.help: help() sys.exit(0) # only used in ui_cmd for now readOptions = parseFormatOptionsStr(args.readOptions) if readOptions is None: return if args.jsonReadOptions: newReadOptions = json.loads(args.jsonReadOptions) if isinstance(newReadOptions, dict): readOptions.update(newReadOptions) else: log.error( f"invalid value for --json-read-options, " f"must be an object/dict, not {type(newReadOptions)}" ) writeOptions = parseFormatOptionsStr(args.writeOptions) if writeOptions is None: return if args.jsonWriteOptions: newWriteOptions = json.loads(args.jsonWriteOptions) if isinstance(newWriteOptions, dict): writeOptions.update(newWriteOptions) else: log.error( f"invalid value for --json-write-options, " f"must be an object/dict, not {type(newWriteOptions)}" ) """ examples for read and write options: --read-options testOption=stringValue --read-options enableFoo=True --read-options fooList=[1,2,3] --read-options 'fooList=[1, 2, 3]' --read-options 'testOption=stringValue; enableFoo=True; fooList=[1, 2, 3]' --read-options 'testOption=stringValue;enableFoo=True;fooList=[1,2,3]' if a desired value contains ";", you can use --json-read-options or --json-write-options flags instead, with json object as value, quoted for command line. 
for example: '--json-write-options={"delimiter": ";"}' """ convertOptionsKeys = ( "direct", "progressbar", "sort", "sortKeyName", "sortEncoding", "sqlite", ) infoOverrideSpec = ( ("sourceLang", validateLangStr), ("targetLang", validateLangStr), ("name", str), ) for key, option in uiBase.configDefDict.items(): if not option.hasFlag: continue value = getattr(args, key, None) if value is None: continue log.debug(f"config: {key} = {value}") if not option.validate(value): log.error(f"invalid config value: {key} = {value!r}") continue config[key] = value logHanlder.config = config convertOptions = {} for key in convertOptionsKeys: value = getattr(args, key, None) if value is not None: convertOptions[key] = value infoOverride = {} for key, validate in infoOverrideSpec: value = getattr(args, key, None) if value is None: continue value = validate(value) if value is None: continue infoOverride[key] = value if infoOverride: convertOptions["infoOverride"] = infoOverride if args.inputFilename and readOptions: inputArgs = Glossary.detectInputFormat( args.inputFilename, format=args.inputFormat, ) if not inputArgs: log.error( f"Could not detect format for input file {args.inputFilename}" ) sys.exit(1) inputFormat = inputArgs[1] readOptionsProp = Glossary.plugins[inputFormat].optionsProp for optName, optValue in readOptions.items(): if optName not in Glossary.formatsReadOptions[inputFormat]: log.error(f"Invalid option name {optName} for format {inputFormat}") sys.exit(1) prop = readOptionsProp[optName] optValueNew, ok = prop.evaluate(optValue) if not ok or not prop.validate(optValueNew): log.error( f"Invalid option value {optName}={optValue!r}" f" for format {inputFormat}" ) sys.exit(1) readOptions[optName] = optValueNew if args.outputFilename and writeOptions: outputArgs = Glossary.detectOutputFormat( filename=args.outputFilename, format=args.outputFormat, inputFilename=args.inputFilename, ) if outputArgs is None: sys.exit(1) _, outputFormat, _ = outputArgs writeOptionsProp = Glossary.plugins[outputFormat].optionsProp for optName, optValue in writeOptions.items(): if optName not in Glossary.formatsWriteOptions[outputFormat]: log.error(f"Invalid option name {optName} for format {outputFormat}") sys.exit(1) prop = writeOptionsProp[optName] optValueNew, ok = prop.evaluate(optValue) if not ok or not prop.validate(optValueNew): log.error( f"Invalid option value {optName}={optValue!r}" f" for format {outputFormat}" ) sys.exit(1) writeOptions[optName] = optValueNew if convertOptions: log.debug(f"convertOptions = {convertOptions}") runKeywordArgs = dict( inputFilename=args.inputFilename, outputFilename=args.outputFilename, inputFormat=args.inputFormat, outputFormat=args.outputFormat, reverse=args.reverse, config=config, readOptions=readOptions, writeOptions=writeOptions, convertOptions=convertOptions, glossarySetAttrs=None, ) if ui_type == "none": sys.exit(0 if base_ui_run(**runKeywordArgs) else 1) if ui_type == "auto" and shouldUseCMD(args): ui_type = "cmd" if ui_type == "cmd": if args.interactive: from pyglossary.ui.ui_cmd_interactive import UI elif args.inputFilename and args.outputFilename: from pyglossary.ui.ui_cmd import UI elif not args.no_interactive: from pyglossary.ui.ui_cmd_interactive import UI else: log.error("no input file given, try --help") sys.exit(1) sys.exit(0 if UI().run(**runKeywordArgs) else 1) if ui_type == "auto": ui_module = None for ui_type2 in ui_list: try: ui_module = __import__( f"pyglossary.ui.ui_{ui_type2}", fromlist=f"ui_{ui_type2}", ) except ImportError: log.exception("error while 
importing UI module:") else: break if ui_module is None: log.error( "no user interface module found! " f"try \"{sys.argv[0]} -h\" to see command line usage" ) sys.exit(1) else: ui_module = __import__( f"pyglossary.ui.ui_{ui_type}", fromlist=f"ui_{ui_type}", ) sys.exit(0 if ui_module.UI().run(**runKeywordArgs) else 1) pyglossary-4.5.0/pyglossary/ui/progressbar/000077500000000000000000000000001417733132500211245ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/ui/progressbar/__init__.py000066400000000000000000000035641417733132500232450ustar00rootroot00000000000000#!/usr/bin/python # -*- coding: utf-8 -*- # # progressbar - Text progress bar library for Python. # Copyright (c) 2005 Nilton Volpato # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Text progress bar library for Python. A text progress bar is typically used to display the progress of a long running operation, providing a visual cue that processing is underway. The ProgressBar class manages the current progress, and the format of the line is given by a number of widgets. A widget is an object that may display differently depending on the state of the progress bar. There are three types of widgets: - a string, which always shows itself - a ProgressBarWidget, which may return a different value every time its update method is called - a ProgressBarWidgetHFill, which is like ProgressBarWidget, except it expands to fill the remaining width of the line. The progressbar module is very easy to use, yet very powerful. It will also automatically enable features like auto-resizing when the system supports it. """ __author__ = 'Nilton Volpato' __author_email__ = 'nilton.volpato@gmail.com' __date__ = '2011-05-14' __version__ = '2.5' from .compat import * from .widgets import * from .progressbar import * pyglossary-4.5.0/pyglossary/ui/progressbar/compat.py000066400000000000000000000026421417733132500227650ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # progressbar - Text progress bar library for Python. # Copyright (c) 2005 Nilton Volpato # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Compatibility methods and classes for the progressbar module.""" # Python 3.x (and backports) use a modified iterator syntax # This will allow 2.x to behave with 3.x iterators try: next except NameError: def next(iter): try: # Try new style iterators return iter.__next__() except AttributeError: # Fallback in case of a "native" iterator return iter.next() # Python < 2.5 does not have "any" try: any except NameError: def any(iterator): for item in iterator: if item: return True return False pyglossary-4.5.0/pyglossary/ui/progressbar/progressbar.py000066400000000000000000000227501417733132500240350ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # progressbar - Text progress bar library for Python. # Copyright (c) 2005 Nilton Volpato # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Main ProgressBar class.""" from __future__ import division import math import os import signal import sys import time try: from fcntl import ioctl from array import array import termios except ImportError: pass from .compat import * # for: any, next from . import widgets class ProgressBar(object): """The ProgressBar class which updates and prints the bar. A common way of using it is like: >>> pbar = ProgressBar().start() >>> for i in range(100): ... # do something ... pbar.update(i+1) ... >>> pbar.finish() You can also use a ProgressBar as an iterator: >>> progress = ProgressBar() >>> for i in progress(some_iterable): ... # do something ... Since the progress bar is incredibly customizable you can specify different widgets of any type in any order. You can even write your own widgets! However, since there are already a good number of widgets you should probably play around with them before moving on to create your own widgets. The term_width parameter represents the current terminal width. If the parameter is set to an integer then the progress bar will use that, otherwise it will attempt to determine the terminal width falling back to 80 columns if the width cannot be determined. When implementing a widget's update method you are passed a reference to the current progress bar. As a result, you have access to the ProgressBar's methods and attributes. Although there is nothing preventing you from changing the ProgressBar you should treat it as read only. 
Useful methods and attributes include (Public API): - currval: current progress (0 <= currval <= maxval) - maxval: maximum (and final) value - finished: True if the bar has finished (reached 100%) - start_time: the time when start() method of ProgressBar was called - seconds_elapsed: seconds elapsed since start_time and last call to update - percentage(): progress in percent [0..100] """ __slots__ = ('currval', 'fd', 'finished', 'last_update_time', 'left_justify', 'maxval', 'next_update', 'num_intervals', 'poll', 'seconds_elapsed', 'signal_set', 'start_time', 'term_width', 'update_interval', 'widgets', '_time_sensitive', '__iterable') _DEFAULT_MAXVAL = 100 _DEFAULT_TERMSIZE = 80 _DEFAULT_WIDGETS = [widgets.Percentage(), ' ', widgets.Bar()] def __init__(self, maxval=None, widgets=None, term_width=None, poll=1, left_justify=True, fd=None): """Initializes a progress bar with sane defaults.""" # Don't share a reference with any other progress bars if widgets is None: widgets = list(self._DEFAULT_WIDGETS) self.maxval = maxval self.widgets = widgets self.fd = fd if fd is not None else sys.stderr self.left_justify = left_justify self.signal_set = False if term_width is not None: self.term_width = term_width else: try: self._handle_resize() signal.signal(signal.SIGWINCH, self._handle_resize) self.signal_set = True except (SystemExit, KeyboardInterrupt): raise except: self.term_width = self._env_size() self.__iterable = None self._update_widgets() self.currval = 0 self.finished = False self.last_update_time = None self.poll = poll self.seconds_elapsed = 0 self.start_time = None self.update_interval = 1 self.next_update = 0 def __call__(self, iterable): """Use a ProgressBar to iterate through an iterable.""" try: self.maxval = len(iterable) except: if self.maxval is None: self.maxval = widgets.UnknownLength self.__iterable = iter(iterable) return self def __iter__(self): return self def __next__(self): try: value = next(self.__iterable) if self.start_time is None: self.start() else: self.update(self.currval + 1) return value except StopIteration: if self.start_time is None: self.start() self.finish() raise # Create an alias so that Python 2.x won't complain about not being # an iterator. next = __next__ def _env_size(self): """Tries to find the term_width from the environment.""" return int(os.environ.get('COLUMNS', self._DEFAULT_TERMSIZE)) - 1 def _handle_resize(self, signum=None, frame=None): """Tries to catch resize signals sent from the terminal.""" h, w = array('h', ioctl(self.fd, termios.TIOCGWINSZ, '\0' * 8))[:2] self.term_width = w def percentage(self): """Returns the progress as a percentage.""" if self.maxval is widgets.UnknownLength: return float("NaN") if self.currval >= self.maxval: return 100.0 return (self.currval * 100.0 / self.maxval) if self.maxval else 100.00 percent = property(percentage) def _format_widgets(self): result = [] expanding = [] width = self.term_width for index, widget in enumerate(self.widgets): if isinstance(widget, widgets.WidgetHFill): result.append(widget) expanding.insert(0, index) else: widget = widgets.format_updatable(widget, self) result.append(widget) width -= len(widget) count = len(expanding) while count: portion = max(int(math.ceil(width * 1. 
/ count)), 0) index = expanding.pop() count -= 1 widget = result[index].update(self, portion) width -= len(widget) result[index] = widget return result def _format_line(self): """Joins the widgets and justifies the line.""" widgets = ''.join(self._format_widgets()) if self.left_justify: return widgets.ljust(self.term_width) else: return widgets.rjust(self.term_width) def _need_update(self): """Returns whether the ProgressBar should redraw the line.""" if self.currval >= self.next_update or self.finished: return True delta = time.time() - self.last_update_time return self._time_sensitive and delta > self.poll def _update_widgets(self): """Checks all widgets for the time sensitive bit.""" self._time_sensitive = any(getattr(w, 'TIME_SENSITIVE', False) for w in self.widgets) def update(self, value=None): """Updates the ProgressBar to a new value.""" if value is not None and value is not widgets.UnknownLength: if (self.maxval is not widgets.UnknownLength and not 0 <= value <= self.maxval): raise ValueError('Value out of range') self.currval = value if not self._need_update(): return if self.start_time is None: raise RuntimeError('You must call "start" before calling "update"') now = time.time() self.seconds_elapsed = now - self.start_time self.next_update = self.currval + self.update_interval self.fd.write(self._format_line() + '\r') self.fd.flush() self.last_update_time = now def start(self, num_intervals=0): """Starts measuring time, and prints the bar at 0%. It returns self so you can use it like this: >>> pbar = ProgressBar().start() >>> for i in range(100): ... # do something ... pbar.update(i+1) ... >>> pbar.finish() """ if self.maxval is None: self.maxval = self._DEFAULT_MAXVAL if num_intervals > 0: self.num_intervals = num_intervals else: self.num_intervals = max(100, self.term_width) self.next_update = 0 if self.maxval is not widgets.UnknownLength: if self.maxval < 0: raise ValueError('Value out of range') self.update_interval = self.maxval / self.num_intervals self.start_time = self.last_update_time = time.time() self.update(0) return self def finish(self): """Puts the ProgressBar bar in the finished state.""" if self.finished: return self.finished = True self.update(self.maxval) self.fd.write('\n') if self.signal_set: signal.signal(signal.SIGWINCH, signal.SIG_DFL) pyglossary-4.5.0/pyglossary/ui/progressbar/widgets.py000066400000000000000000000264001417733132500231460ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # progressbar - Text progress bar library for Python. # Copyright (c) 2005 Nilton Volpato # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Default ProgressBar widgets.""" from __future__ import division import datetime import math try: from abc import ABCMeta, abstractmethod except ImportError: AbstractWidget = object abstractmethod = lambda fn: fn else: AbstractWidget = ABCMeta('AbstractWidget', (object,), {}) class UnknownLength: pass def format_updatable(updatable, pbar): if hasattr(updatable, 'update'): return updatable.update(pbar) else: return updatable class Widget(AbstractWidget): """The base class for all widgets. The ProgressBar will call the widget's update value when the widget should be updated. The widget's size may change between calls, but the widget may display incorrectly if the size changes drastically and repeatedly. The boolean TIME_SENSITIVE informs the ProgressBar that it should be updated more often because it is time sensitive. """ TIME_SENSITIVE = False __slots__ = () @abstractmethod def update(self, pbar): """Updates the widget. pbar - a reference to the calling ProgressBar """ class WidgetHFill(Widget): """The base class for all variable width widgets. This widget is much like the \\hfill command in TeX, it will expand to fill the line. You can use more than one in the same line, and they will all have the same width, and together will fill the line. """ @abstractmethod def update(self, pbar, width): """Updates the widget providing the total width the widget must fill. pbar - a reference to the calling ProgressBar width - The total width the widget must fill """ class Timer(Widget): """Widget which displays the elapsed seconds.""" __slots__ = ('format_string',) TIME_SENSITIVE = True def __init__(self, format='Elapsed Time: %s'): self.format_string = format @staticmethod def format_time(seconds): """Formats time as the string "HH:MM:SS".""" return str(datetime.timedelta(seconds=int(seconds))) def update(self, pbar): """Updates the widget to show the elapsed time.""" return self.format_string % self.format_time(pbar.seconds_elapsed) class ETA(Timer): """Widget which attempts to estimate the time of arrival.""" TIME_SENSITIVE = True def update(self, pbar): """Updates the widget to show the ETA or total time when finished.""" if pbar.maxval is UnknownLength or pbar.currval == 0: return 'ETA: --:--:--' elif pbar.finished: return 'Time: %s' % self.format_time(pbar.seconds_elapsed) else: elapsed = pbar.seconds_elapsed eta = elapsed * pbar.maxval / pbar.currval - elapsed return 'ETA: %s' % self.format_time(eta) class AdaptiveETA(Timer): """Widget which attempts to estimate the time of arrival. Uses a weighted average of two estimates: 1) ETA based on the total progress and time elapsed so far 2) ETA based on the progress as per the last 10 update reports The weight depends on the current progress so that to begin with the total progress is used and at the end only the most recent progress is used. 
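Concretely, matching the update() code below: with w = (currval / maxval) ** 0.5, the reported value is (1 - w) * eta_total + w * eta_recent, where eta_recent is derived from the oldest of the last NUM_SAMPLES (currval, elapsed) samples. 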
""" TIME_SENSITIVE = True NUM_SAMPLES = 10 def _update_samples(self, currval, elapsed): sample = (currval, elapsed) if not hasattr(self, 'samples'): self.samples = [sample] * (self.NUM_SAMPLES + 1) else: self.samples.append(sample) return self.samples.pop(0) def _eta(self, maxval, currval, elapsed): return elapsed * maxval / float(currval) - elapsed def update(self, pbar): """Updates the widget to show the ETA or total time when finished.""" if pbar.maxval is UnknownLength or pbar.currval == 0: return 'ETA: --:--:--' elif pbar.finished: return 'Time: %s' % self.format_time(pbar.seconds_elapsed) else: elapsed = pbar.seconds_elapsed currval1, elapsed1 = self._update_samples(pbar.currval, elapsed) eta = self._eta(pbar.maxval, pbar.currval, elapsed) if pbar.currval > currval1: etasamp = self._eta(pbar.maxval - currval1, pbar.currval - currval1, elapsed - elapsed1) weight = (pbar.currval / float(pbar.maxval)) ** 0.5 eta = (1 - weight) * eta + weight * etasamp return 'ETA: %s' % self.format_time(eta) class FileTransferSpeed(Widget): """Widget for showing the transfer speed (useful for file transfers).""" FMT = '%6.2f %s%s/s' PREFIXES = ' kMGTPEZY' __slots__ = ('unit',) def __init__(self, unit='B'): self.unit = unit def update(self, pbar): """Updates the widget with the current SI prefixed speed.""" if pbar.seconds_elapsed < 2e-6 or pbar.currval < 2e-6: # =~ 0 scaled = power = 0 else: speed = pbar.currval / pbar.seconds_elapsed power = int(math.log(speed, 1000)) scaled = speed / 1000.**power return self.FMT % (scaled, self.PREFIXES[power], self.unit) class AnimatedMarker(Widget): """An animated marker for the progress bar which defaults to appear as if it were rotating. """ __slots__ = ('markers', 'curmark') def __init__(self, markers='|/-\\'): self.markers = markers self.curmark = -1 def update(self, pbar): """Updates the widget to show the next marker or the first marker when finished""" if pbar.finished: return self.markers[0] self.curmark = (self.curmark + 1) % len(self.markers) return self.markers[self.curmark] # Alias for backwards compatibility RotatingMarker = AnimatedMarker class Counter(Widget): """Displays the current count.""" __slots__ = ('format_string',) def __init__(self, format='%d'): self.format_string = format def update(self, pbar): return self.format_string % pbar.currval class Percentage(Widget): """Displays the current percentage as a number with a percent sign.""" def __init__(self, prefix="%"): Widget.__init__(self) self.prefix = prefix def update(self, pbar): return f"{self.prefix}{pbar.percentage():.1f}"\ .rjust(5 + len(self.prefix)) class FormatLabel(Timer): """Displays a formatted label.""" mapping = { 'elapsed': ('seconds_elapsed', Timer.format_time), 'finished': ('finished', None), 'last_update': ('last_update_time', None), 'max': ('maxval', None), 'seconds': ('seconds_elapsed', None), 'start': ('start_time', None), 'value': ('currval', None) } __slots__ = ('format_string',) def __init__(self, format): self.format_string = format def update(self, pbar): context = {} for name, (key, transform) in self.mapping.items(): try: value = getattr(pbar, key) if transform is None: context[name] = value else: context[name] = transform(value) except: pass return self.format_string % context class SimpleProgress(Widget): """Returns progress as a count of the total (e.g.: "5 of 47").""" __slots__ = ('sep',) def __init__(self, sep=' of '): self.sep = sep def update(self, pbar): if pbar.maxval is UnknownLength: return '%d%s?' 
% (pbar.currval, self.sep) return '%d%s%s' % (pbar.currval, self.sep, pbar.maxval) class Bar(WidgetHFill): """A progress bar which stretches to fill the line.""" __slots__ = ('marker', 'left', 'right', 'fill', 'fill_left') def __init__(self, marker='#', left='|', right='|', fill=' ', fill_left=True): """Creates a customizable progress bar. marker - string or updatable object to use as a marker left - string or updatable object to use as a left border right - string or updatable object to use as a right border fill - character to use for the empty part of the progress bar fill_left - whether to fill from the left or the right """ self.marker = marker self.left = left self.right = right self.fill = fill self.fill_left = fill_left def update(self, pbar, width): """Updates the progress bar and its subcomponents.""" left, marked, right = (format_updatable(i, pbar) for i in (self.left, self.marker, self.right)) width -= len(left) + len(right) # Marked must *always* have length of 1 if pbar.maxval is not UnknownLength and pbar.maxval: marked *= int(pbar.currval / pbar.maxval * width) else: marked = '' if self.fill_left: return '%s%s%s' % (left, marked.ljust(width, self.fill), right) else: return '%s%s%s' % (left, marked.rjust(width, self.fill), right) class ReverseBar(Bar): """A bar which has a marker which bounces from side to side.""" def __init__(self, marker='#', left='|', right='|', fill=' ', fill_left=False): """Creates a customizable progress bar. marker - string or updatable object to use as a marker left - string or updatable object to use as a left border right - string or updatable object to use as a right border fill - character to use for the empty part of the progress bar fill_left - whether to fill from the left or the right """ self.marker = marker self.left = left self.right = right self.fill = fill self.fill_left = fill_left class BouncingBar(Bar): def update(self, pbar, width): """Updates the progress bar and its subcomponents.""" left, marker, right = (format_updatable(i, pbar) for i in (self.left, self.marker, self.right)) width -= len(left) + len(right) if pbar.finished: return '%s%s%s' % (left, width * marker, right) position = int(pbar.currval % (width * 2 - 1)) if position > width: position = width * 2 - position lpad = self.fill * (position - 1) rpad = self.fill * (width - len(marker) - len(lpad)) # Swap if we want to bounce the other way if not self.fill_left: rpad, lpad = lpad, rpad return '%s%s%s%s%s' % (left, lpad, marker, rpad, right) pyglossary-4.5.0/pyglossary/ui/ui_cmd.py000066400000000000000000000175531417733132500204200ustar00rootroot00000000000000# -*- coding: utf-8 -*- # ui_cmd.py # # Copyright © 2008-2021 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . from os.path import join import time from pyglossary.glossary import * from .base import * from . 
import progressbar as pb from .wcwidth import wcswidth def wc_ljust(text, length, padding=' '): return text + padding * max(0, (length - wcswidth(text))) if os.sep == "\\": # Operating system is Windows startBold = "" startUnderline = "" endFormat = "" else: startBold = "\x1b[1m" # Start Bold # len=4 startUnderline = "\x1b[4m" # Start Underline # len=4 endFormat = "\x1b[0;0;0m" # End Format # len=8 # redOnGray = "\x1b[0;1;31;47m" COMMAND = "pyglossary" def getColWidth(subject, strings): return max( len(x) for x in [subject] + strings ) def getFormatsTable(names, header): descriptions = [ Glossary.plugins[name].description for name in names ] extensions = [ " ".join(Glossary.plugins[name].extensions) for name in names ] nameWidth = getColWidth("Name", names) descriptionWidth = getColWidth("Description", descriptions) extensionsWidth = getColWidth("Extensions", extensions) lines = ["\n"] lines.append(startBold + header + endFormat) lines.append( " | ".join([ "Name".center(nameWidth), "Description".center(descriptionWidth), "Extensions".center(extensionsWidth) ]) ) lines.append( "-+-".join([ "-" * nameWidth, "-" * descriptionWidth, "-" * extensionsWidth, ]) ) for index, name in enumerate(names): lines.append( " | ".join([ name.ljust(nameWidth), descriptions[index].ljust(descriptionWidth), extensions[index].ljust(extensionsWidth) ]) ) return "\n".join(lines) def help(): import string text = fread(join(dataDir, "help")) text = text.replace("", startBold)\ .replace("", startUnderline)\ .replace("", endFormat)\ .replace("", endFormat) text = string.Template(text).substitute( CMD=COMMAND, ) text += getFormatsTable(Glossary.readFormats, "Supported input formats:") text += getFormatsTable(Glossary.writeFormats, "Supported output formats:") print(text) def parseFormatOptionsStr(st) -> "Optional[Dict]": """ prints error and returns None if failed to parse one option """ st = st.strip() if not st: return {} opt = {} parts = st.split(";") for part in parts: if not part: continue eq = part.find("=") if eq < 1: log.critical(f"bad option syntax: {part!r}") return None key = part[:eq].strip() if not key: log.critical(f"bad option syntax: {part!r}") return None value = part[eq + 1:].strip() opt[key] = value return opt def encodeFormatOptions(opt: "Dict") -> str: if not opt: return "" parts = [] for key, value in opt.items(): parts.append(f"{key}={value}") return ";".join(parts) class NullObj(object): def __getattr__(self, attr): return self def __setattr__(self, attr, value): pass def __setitem__(self, key, value): pass def __call__(self, *args, **kwargs): pass class UI(UIBase): def __init__(self): UIBase.__init__(self) # log.debug(self.config) self.pbar = NullObj() self._toPause = False self._resetLogFormatter = None def onSigInt(self, *args): log.info("") if self._toPause: log.info("Operation Canceled") sys.exit(0) else: self._toPause = True log.info("Please wait...") def setText(self, text): self.pbar.widgets[0] = text def fixLogger(self): for h in log.handlers: if h.name == "std": self.fixLogHandler(h) return def fillMessage(self, msg): return wc_ljust(msg, self.pbar.term_width) def fixLogHandler(self, h): def reset(): h.formatter.fill = None self._resetLogFormatter = reset h.formatter.fill = self.fillMessage def progressInit(self, title): rot = pb.RotatingMarker() self.pbar = pb.ProgressBar( maxval=1.0, # update_step=0.5, removed ) self.pbar.widgets = [ title + " ", pb.AnimatedMarker(), " ", pb.Bar(marker="█"), pb.Percentage(), " ", pb.ETA(), ] self.pbar.start(num_intervals=1000) rot.pbar = self.pbar 
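		# A minimal sketch of how this progress API is driven by a caller
		# (progressInit/progress/progressEnd are the real methods defined
		# here; the loop itself is illustrative):
		#
		#     ui.progressInit("Converting")
		#     for index in range(total):
		#         ...
		#         ui.progress((index + 1) / total)  # a ratio in [0, 1]
		#     ui.progressEnd()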
self.fixLogger() def progress(self, rat, text=""): self.pbar.update(rat) def progressEnd(self): self.pbar.finish() if self._resetLogFormatter: self._resetLogFormatter() def reverseLoop(self, *args, **kwargs): from pyglossary.reverse import reverseGlossary reverseKwArgs = {} for key in ( "words", "matchWord", "showRel", "includeDefs", "reportStep", "saveStep", "maxNum", "minRel", "minWordLen" ): try: reverseKwArgs[key] = self.config["reverse_" + key] except KeyError: pass reverseKwArgs.update(kwargs) if not self._toPause: log.info("Reversing glossary... (Press Ctrl+C to pause/stop)") for wordI in reverseGlossary(self.glos, **reverseKwArgs): if self._toPause: log.info( "Reverse is paused." " Press Enter to continue, and Ctrl+C to exit" ) input() self._toPause = False def run( self, inputFilename: str = "", outputFilename: str = "", inputFormat: str = "", outputFormat: str = "", reverse: bool = False, config: "Optional[Dict]" = None, readOptions: "Optional[Dict]" = None, writeOptions: "Optional[Dict]" = None, convertOptions: "Optional[Dict]" = None, glossarySetAttrs: "Optional[Dict]" = None, ): if config is None: config = {} if readOptions is None: readOptions = {} if writeOptions is None: writeOptions = {} if convertOptions is None: convertOptions = {} if glossarySetAttrs is None: glossarySetAttrs = {} self.config = config if inputFormat: # inputFormat = inputFormat.capitalize() if inputFormat not in Glossary.readFormats: log.error(f"invalid read format {inputFormat}") if outputFormat: # outputFormat = outputFormat.capitalize() if outputFormat not in Glossary.writeFormats: log.error(f"invalid write format {outputFormat}") log.error(f"try: {COMMAND} --help") return 1 if not outputFilename: if reverse: pass elif outputFormat: try: ext = Glossary.plugins[outputFormat].extensions[0] except (KeyError, IndexError): log.error(f"invalid write format {outputFormat}") log.error(f"try: {COMMAND} --help") return 1 else: outputFilename = os.path.splitext(inputFilename)[0] + ext else: log.error("neither output file nor output format is given") log.error(f"try: {COMMAND} --help") return 1 glos = self.glos = Glossary(ui=self) self.glos.config = self.config for attr, value in glossarySetAttrs.items(): setattr(glos, attr, value) if reverse: import signal signal.signal(signal.SIGINT, self.onSigInt) # good place? FIXME readOptions["direct"] = True if not glos.read( inputFilename, format=inputFormat, **readOptions ): log.error("reading input file was failed!") return False self.setText("Reversing: ") self.pbar.update_step = 0.1 self.reverseLoop(savePath=outputFilename) else: finalOutputFile = self.glos.convert( inputFilename, inputFormat=inputFormat, outputFilename=outputFilename, outputFormat=outputFormat, readOptions=readOptions, writeOptions=writeOptions, **convertOptions ) return bool(finalOutputFile) return True pyglossary-4.5.0/pyglossary/ui/ui_cmd_interactive.py000066400000000000000000000704111417733132500230050ustar00rootroot00000000000000#!/usr/bin/python3 # -*- coding: utf-8 -*- # ui_cmd_interactive.py # # Copyright © 2008-2022 Saeed Rasooli (ilius) # This file is part of PyGlossary project, https://github.com/ilius/pyglossary # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program. Or on Debian systems, from /usr/share/common-licenses/GPL # If not, see . """ To use this user interface: sudo pip3 install prompt_toolkit """ # GitHub repo for prompt_toolkit # https://github.com/prompt-toolkit/python-prompt-toolkit # The code for Python's cmd.Cmd was very ugly and hard to understand last I # cheched. But we don't use cmd module here, and nor does prompt_toolkit. # Completion func for Python's readline, silently (and stupidly) hides any # exception, and only shows the print if it's in the first line of function. # very awkward! # We also don't use readline module, and nor does prompt_toolkit. # Looks like prompt_toolkit works directly with sys.stdin, sys.stdout # and sys.stderr. # prompt_toolkit also supports ncurses-like dialogs with buttons and widgets, # but I prefer this kind of UI with auto-completion and history import sys import os from os.path import ( dirname, join, abspath, relpath, isdir, isabs, ) import logging from collections import OrderedDict import argparse import shlex import json from pyglossary import core from pyglossary.core import confDir from pyglossary.glossary import Glossary from pyglossary.ui import ui_cmd from pyglossary.sort_keys import namedSortKeyList, namedSortKeyByName from prompt_toolkit import prompt as promptLow from prompt_toolkit.history import FileHistory from prompt_toolkit.auto_suggest import AutoSuggestFromHistory from prompt_toolkit.completion import ( WordCompleter, PathCompleter, Completion, ) from prompt_toolkit import ANSI from prompt_toolkit.shortcuts import confirm, PromptSession from prompt_toolkit.key_binding import KeyBindings from prompt_toolkit.keys import Keys endFormat = "\x1b[0;0;0m" class MiniCheckBoxPrompt(object): def __init__( self, message: str = "", fmt: str = "{message}: {check}", value: bool = False, ): self.message = message self.fmt = fmt self.value = value def formatMessage(self): msg = self.fmt.format( check="[x]" if self.value else "[ ]", message=self.message, ) # msg = ANSI(msg) # NOT SUPPORTED return msg def __pt_formatted_text__(self): return [("", self.formatMessage())] def checkbox_prompt( message: str, default: bool, ) -> PromptSession[bool]: """ Create a `PromptSession` object for the 'confirm' function. """ bindings = KeyBindings() check = MiniCheckBoxPrompt(message=message, value=default) @bindings.add(" ") def space(event: "E") -> None: check.value = not check.value # cursor_pos = check.formatMessage().find("[") + 1 # cur_cursor_pos = session.default_buffer.cursor_position # print(f"cur_cursor_pos={cur_cursor_pos}, cursor_pos={cursor_pos}") # session.default_buffer.cursor_position = cursor_pos @bindings.add(Keys.Any) def _(event: "E") -> None: " Disallow inserting other text. 
" pass complete_message = check session: PromptSession[bool] = PromptSession( complete_message, key_bindings=bindings ) session.prompt() return check.value log = logging.getLogger("pyglossary") indent = "\t" cmdiConfDir = join(confDir, "cmdi") histDir = join(cmdiConfDir, "history") for direc in (cmdiConfDir, histDir): os.makedirs(direc, mode=0o700, exist_ok=True) if __name__ == "__main__": Glossary.init() pluginByDesc = { plugin.description: plugin for plugin in Glossary.plugins.values() } readFormatDescList = [ Glossary.plugins[_format].description for _format in Glossary.readFormats ] writeFormatDescList = [ Glossary.plugins[_format].description for _format in Glossary.writeFormats ] convertOptionsFlags = { "direct": ("indirect", "direct"), "sqlite": ("", "sqlite"), "progressbar": ("no-progress-bar", ""), "sort": ("no-sort", "sort"), } infoOverrideFlags = { "sourceLang": "source-lang", "targetLang": "target-lang", "name": "name", } def dataToPrettyJson(data, ensure_ascii=False, sort_keys=False): return json.dumps( data, sort_keys=sort_keys, indent=2, ensure_ascii=ensure_ascii, ) def prompt( message: str, multiline: bool = False, **kwargs, ): if kwargs.get("default", "") is None: kwargs["default"] = "" text = promptLow(message=message, **kwargs) if multiline and text == "!m": print("Entering Multi-line mode, press Alt+ENTER to end") text = promptLow( message="", multiline=True, **kwargs ) return text back = "back" class MyPathCompleter(PathCompleter): def __init__( self, reading: bool, fs_action_names=None, **kwargs ): PathCompleter.__init__( self, file_filter=self.file_filter, **kwargs ) if fs_action_names is None: fs_action_names = [] self.fs_action_names = fs_action_names def file_filter(self, filename: str) -> bool: # filename is full/absoule file path return True # def get_completions_exception(document, complete_event, e): # log.error(f"Execption in get_completions: {e}") def get_completions( self, document: "Document", complete_event: "CompleteEvent", ) -> "Iterable[Completion]": text = document.text_before_cursor for action in self.fs_action_names: if action.startswith(text): yield Completion( text=action, start_position=-len(text), display=action, ) yield from PathCompleter.get_completions( self, document=document, complete_event=complete_event, ) class AbsolutePathHistory(FileHistory): def load_history_strings(self) -> "Iterable[str]": # pwd = os.getcwd() pathList = FileHistory.load_history_strings(self) return [ relpath(p) for p in pathList ] def store_string(self, string: str) -> None: FileHistory.store_string(self, abspath(string)) class UI(ui_cmd.UI): def __init__(self): self._inputFilename = "" self._outputFilename = "" self._inputFormat = "" self._outputFormat = "" self.config = None self._readOptions = None self._writeOptions = None self._convertOptions = None ui_cmd.UI.__init__(self) self.ls_parser = argparse.ArgumentParser(add_help=False) self.ls_parser.add_argument( "-l", "--long", action="store_true", dest="long", help="use a long listing format", ) self.ls_parser.add_argument( "--help", action="store_true", dest="help", help="display help", ) self.ls_usage = ( 'Usage: !ls [--help] [-l] [FILE/DIRECTORY]...\n\n' 'optional arguments:\n' ' --help show this help message and exit\n' ' -l, --long use a long listing format\n' ) self._fsActions = OrderedDict([ ("!pwd", (self.fs_pwd, "")), ("!ls", (self.fs_ls, self.ls_usage)), ("!..", (self.fs_cd_parent, "")), ("!cd", (self.fs_cd, "")), ]) self._finalActions = OrderedDict([ ("formats", self.askFormats), ("read-options", 
self.askReadOptions), ("write-options", self.askWriteOptions), ("reset-read-options", self.resetReadOptions), ("reset-write-options", self.resetWriteOptions), ("config", self.askConfig), ("indirect", self.setIndirect), ("sqlite", self.setSQLite), ("no-progressbar", self.setNoProgressbar), ("sort", self.setSort), ("sort-key", self.setSortKey), ("show-options", self.showOptions), ("back", None), ]) def fs_pwd(self, args: "List[str]"): print(os.getcwd()) def get_ls_l( self, arg: str, st: "Optional[os.stat_result]" = None, parentDir: str = "", sizeWidth: int = 0, ) -> str: import stat import pwd import grp import time argPath = arg if parentDir: argPath = join(parentDir, arg) if st is None: st = os.lstat(argPath) # os.lstat does not follow sym links, like "ls" command details = [ stat.filemode(st.st_mode), pwd.getpwuid(st.st_uid).pw_name, grp.getgrgid(st.st_gid).gr_name, str(st.st_size).rjust(sizeWidth), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime)), arg, ] if stat.S_ISLNK(st.st_mode): details.append(f"-> {os.readlink(argPath)}") return " ".join(details) def fs_ls(self, args: "List[str]"): opts, args = self.ls_parser.parse_known_args(args=args) if opts.help: print(self.ls_usage) return if not args: args = [os.getcwd()] showTitle = len(args) > 1 # Note: isdir and isfile funcs follow sym links, so no worry about links for i, arg in enumerate(args): if i > 0: print() if not isdir(arg): print(self.get_ls_l(arg)) continue if showTitle: print(f"> List of directory {arg!r}:") if not opts.long: for _path in os.listdir(arg): if isdir(_path): _path += "/" print(f"{_path}") continue contents = os.listdir(arg) statList = [ os.lstat(join(arg, _path)) for _path in contents ] maxFileSize = max([ st.st_size for st in statList ]) sizeWidth = len(str(maxFileSize)) for i, _path in enumerate(contents): print(self.get_ls_l( _path, parentDir=arg, st=statList[i], sizeWidth=sizeWidth, )) def fs_cd_parent(self, args: "List[str]"): if args: log.error("This command does not take arguments") return newDir = dirname(os.getcwd()) os.chdir(newDir) print(f"Changed current directory to: {newDir}") def fs_cd(self, args: "List[str]"): if len(args) != 1: log.error("This command takes exactly one argument") return newDir = args[0] if not isabs(newDir): newDir = abspath(newDir) os.chdir(newDir) print(f"Changed current directory to: {newDir}") def formatPromptMsg(self, level, msg, colon=":"): indent = self.promptIndentStr * level if core.noColor: return f"{indent} {msg}{colon} ", False if self.promptIndentColor >= 0: indent = f"\x1b[38;5;{self.promptIndentColor}m{indent}{endFormat}" if self.promptMsgColor >= 0: msg = f"\x1b[38;5;{self.promptMsgColor}m{msg}{endFormat}" return f"{indent} {msg}{colon} ", True def prompt(self, level, msg, colon=":", **kwargs): msg, colored = self.formatPromptMsg(level, msg, colon) if colored: msg = ANSI(msg) return prompt(msg, **kwargs) def checkbox_prompt(self, level, msg, colon=":", **kwargs): # FIXME: colors are not working, they are being escaped msg = f"{self.promptIndentStr * level} {msg}{colon} " # msg, colored = self.formatPromptMsg(level, msg, colon) return checkbox_prompt(msg, **kwargs) def askFile(self, kind: str, histName: str, varName: str, reading: bool): from shlex import split as shlex_split history = AbsolutePathHistory(join(histDir, histName)) auto_suggest = AutoSuggestFromHistory() # Note: isdir and isfile funcs follow sym links, so no worry about links completer = MyPathCompleter( reading=reading, fs_action_names=list(self._fsActions.keys()), ) default = 
getattr(self, varName) while True: filename = self.prompt( 1, kind, history=history, auto_suggest=auto_suggest, completer=completer, default=default, ) if not filename: continue parts = shlex_split(filename) if parts[0] in self._fsActions: actionFunc, usage = self._fsActions[parts[0]] try: actionFunc(parts[1:]) except Exception as e: log.exception("") if usage: print("\n" + usage) continue setattr(self, varName, filename) return filename raise ValueError(f"{kind} is not given") def askInputFile(self): return self.askFile( "Input file", "filename-input", "_inputFilename", True, ) def askOutputFile(self): return self.askFile( "Output file", "filename-output", "_outputFilename", False, ) def pluginByNameOrDesc(self, value: str) -> "Optional[PluginProp]": plugin = pluginByDesc.get(value) if plugin: return plugin plugin = Glossary.plugins.get(value) if plugin: return plugin log.error(f"internal error: invalid format name/desc {value!r}") return None def askInputFormat(self) -> str: history = FileHistory(join(histDir, "format-input")) auto_suggest = AutoSuggestFromHistory() completer = WordCompleter( readFormatDescList + Glossary.readFormats, ignore_case=True, match_middle=True, sentence=True, ) while True: value = self.prompt( 1, "Input format", history=history, auto_suggest=auto_suggest, completer=completer, default=self._inputFormat, ) if not value: continue plugin = self.pluginByNameOrDesc(value) if plugin: return plugin.name raise ValueError("input format is not given") def askOutputFormat(self) -> str: history = FileHistory(join(histDir, "format-output")) auto_suggest = AutoSuggestFromHistory() completer = WordCompleter( writeFormatDescList + Glossary.writeFormats, ignore_case=True, match_middle=True, sentence=True, ) while True: value = self.prompt( 1, "Output format", history=history, auto_suggest=auto_suggest, completer=completer, default=self._outputFormat, ) if not value: continue plugin = self.pluginByNameOrDesc(value) if plugin: return plugin.name raise ValueError("output format is not given") def finish(self): pass # TODO: how to handle \r and \n in NewlineOption.values? 
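	# A hedged example of what the suggestion logic below produces
	# (the Option instance here is hypothetical, not a real pyglossary
	# object):
	#
	#     opt = Option(typ="bool")             # hypothetical
	#     ui.getOptionValueSuggestValues(opt)  # -> ["True", "False"]
	#
	# An option with a fixed `values` list is suggested as that list,
	# stringified; anything else gets no completer.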
def getOptionValueSuggestValues(self, option: "option.Option"): if option.values: return [str(x) for x in option.values] if option.typ == "bool": return ["True", "False"] return None def getOptionValueCompleter(self, option: "option.Option"): values = self.getOptionValueSuggestValues(option) if values: return WordCompleter( values, ignore_case=True, match_middle=True, sentence=True, ) return None def askReadOptions(self): plugin = Glossary.plugins[self._inputFormat] options = Glossary.formatsReadOptions.get(self._inputFormat) if options is None: log.error(f"internal error: invalid format {self._inputFormat!r}") return optionsProp = Glossary.plugins[self._inputFormat].optionsProp history = FileHistory(join(histDir, f"read-options-{self._inputFormat}")) auto_suggest = AutoSuggestFromHistory() completer = WordCompleter( options.keys(), ignore_case=True, match_middle=True, sentence=True, ) while True: try: optName = self.prompt( 2, "ReadOption: Name (ENTER if done)", history=history, auto_suggest=auto_suggest, completer=completer, ) except (KeyboardInterrupt, EOFError): return if not optName: return option = optionsProp[optName] valueCompleter = self.getOptionValueCompleter(option) default = self._readOptions.get(optName) if default is None: default = options[optName] print(f"Comment: {option.longComment}") while True: if option.typ == "bool": try: valueNew = self.checkbox_prompt( 3, f"ReadOption: {optName}", default=default, ) except (KeyboardInterrupt, EOFError): break print(f"Set read-option: {optName} = {valueNew!r}") self._readOptions[optName] = valueNew break try: value = self.prompt( 3, f"ReadOption: {optName}", colon=" =", history=FileHistory(join(histDir, f"option-value-{optName}")), auto_suggest=AutoSuggestFromHistory(), default=str(default), completer=valueCompleter, ) except (KeyboardInterrupt, EOFError): break if value == "": if optName in self._readOptions: print(f"Unset read-option {optName!r}") del self._readOptions[optName] # FIXME: set empty value? 
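					# Note: submitting an empty value here unsets a
					# previously set read option rather than storing an
					# empty string (see the FIXME above).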
break valueNew, ok = option.evaluate(value) if not ok or not option.validate(valueNew): log.error( f"Invalid read option value {optName}={value!r}" f" for format {self._inputFormat}" ) continue print(f"Set read-option: {optName} = {valueNew!r}") self._readOptions[optName] = valueNew break def askWriteOptions(self): plugin = Glossary.plugins[self._inputFormat] options = Glossary.formatsWriteOptions.get(self._outputFormat) if options is None: log.error(f"internal error: invalid format {self._outputFormat!r}") return optionsProp = Glossary.plugins[self._outputFormat].optionsProp history = FileHistory(join(histDir, f"write-options-{self._outputFormat}")) auto_suggest = AutoSuggestFromHistory() completer = WordCompleter( options.keys(), ignore_case=True, match_middle=True, sentence=True, ) while True: try: optName = self.prompt( 2, "WriteOption: Name (ENTER if done)", history=history, auto_suggest=auto_suggest, completer=completer, ) except (KeyboardInterrupt, EOFError): return if not optName: return option = optionsProp[optName] print(f"Comment: {option.longComment}") valueCompleter = self.getOptionValueCompleter(option) default = self._writeOptions.get(optName) if default is None: default = options[optName] while True: if option.typ == "bool": try: valueNew = self.checkbox_prompt( 3, f"WriteOption: {optName}", default=default, ) except (KeyboardInterrupt, EOFError): break print(f"Set write-option: {optName} = {valueNew!r}") self._writeOptions[optName] = valueNew break try: value = self.prompt( 3, f"WriteOption: {optName}", colon=" =", history=FileHistory(join(histDir, f"option-value-{optName}")), auto_suggest=AutoSuggestFromHistory(), default=str(default), completer=valueCompleter, ) except (KeyboardInterrupt, EOFError): break if value == "": if optName in self._writeOptions: print(f"Unset write-option {optName!r}") del self._writeOptions[optName] # FIXME: set empty value? break valueNew, ok = option.evaluate(value) if not ok or not option.validate(valueNew): log.error( f"Invalid write option value {optName}={value!r}" f" for format {self._outputFormat}" ) continue print(f"Set write-option: {optName} = {valueNew!r}") self._writeOptions[optName] = valueNew break def resetReadOptions(self): self._readOptions = {} def resetWriteOptions(self): self._writeOptions = {} def askConfigValue(self, configKey, option): default = self.config.get(configKey, "") if option.typ == "bool": return str(self.checkbox_prompt( 3, f"Config: {configKey}", default=bool(default), )) return self.prompt( 3, f"Config: {configKey}", colon=" =", history=FileHistory(join(histDir, f"config-value-{configKey}")), auto_suggest=AutoSuggestFromHistory(), default=str(default), completer=self.getOptionValueCompleter(option), ) def askConfig(self): configKeys = list(sorted(self.configDefDict.keys())) history = FileHistory(join(histDir, f"config-key")) auto_suggest = AutoSuggestFromHistory() completer = WordCompleter( configKeys, ignore_case=True, match_middle=True, sentence=True, ) while True: try: configKey = self.prompt( 2, "Config: Key (ENTER if done)", history=history, auto_suggest=auto_suggest, completer=completer, ) except (KeyboardInterrupt, EOFError): return if not configKey: return option = self.configDefDict[configKey] while True: try: value = self.askConfigValue(configKey, option) except (KeyboardInterrupt, EOFError): break if value == "": if configKey in self.config: print(f"Unset config {configKey!r}") del self.config[configKey] # FIXME: set empty value? 
break valueNew, ok = option.evaluate(value) if not ok or not option.validate(valueNew): log.error( f"Invalid config value {configKey}={value!r}" ) continue print(f"Set config: {configKey} = {valueNew!r}") self.config[configKey] = valueNew self.config[configKey] = valueNew break def showOptions(self): print(f"readOptions = {self._readOptions}") print(f"writeOptions = {self._writeOptions}") print(f"convertOptions = {self._convertOptions}") print(f"config = {self.config}") print() def setIndirect(self): self._convertOptions["direct"] = False self._convertOptions["sqlite"] = None print("Switched to indirect mode") def setSQLite(self): self._convertOptions["direct"] = None self._convertOptions["sqlite"] = True print("Switched to SQLite mode") def setNoProgressbar(self): self._convertOptions["progressbar"] = False print("Disabled progress bar") def setSort(self): from pyglossary.entry import Entry try: value = self.checkbox_prompt( 2, f"Enable Sort", default=self._convertOptions.get("sort", False), ) except (KeyboardInterrupt, EOFError): return self._convertOptions["sort"] = value def setSortKey(self): completer = WordCompleter( [_sk.name for _sk in namedSortKeyList], ignore_case=False, match_middle=True, sentence=True, ) default = self._convertOptions.get("sortKeyName", "") sortKeyName = self.prompt( 2, "SortKey", history=FileHistory(join(histDir, f"sort-key")), auto_suggest=AutoSuggestFromHistory(), default=default, completer=completer, ) if not sortKeyName: if "sortKeyName" in self._convertOptions: del self._convertOptions["sortKeyName"] return if sortKeyName not in namedSortKeyByName: log.error(f"invalid sortKeyName = {sortKeyName!r}") return self._convertOptions["sortKeyName"] = sortKeyName if not self._convertOptions.get("sort"): self.setSort() def askFinalAction(self) -> "Optional[str]": history = FileHistory(join(histDir, "action")) auto_suggest = AutoSuggestFromHistory() completer = WordCompleter( list(self._finalActions.keys()), ignore_case=False, match_middle=True, sentence=True, ) while True: action = self.prompt( 1, "Select action (ENTER to convert)", history=history, auto_suggest=auto_suggest, completer=completer, ) if not action: return None if action not in self._finalActions: log.error(f"invalid action: {action}") continue return action def askFinalOptions(self) -> "Union[bool, Literal[back]]": while True: try: action = self.askFinalAction() except (KeyboardInterrupt, EOFError): return False except Exception as e: log.exception("") return False if action == back: return back if action is None: return True # convert actionFunc = self._finalActions[action] if actionFunc is None: return True # convert actionFunc() return True # convert def getRunKeywordArgs(self) -> "Dict": return dict( inputFilename=self._inputFilename, outputFilename=self._outputFilename, inputFormat=self._inputFormat, outputFormat=self._outputFormat, config=self.config, readOptions=self._readOptions, writeOptions=self._writeOptions, convertOptions=self._convertOptions, glossarySetAttrs=self._glossarySetAttrs, ) def checkInputFormat(self, forceAsk: bool = False): if not forceAsk: inputArgs = Glossary.detectInputFormat(self._inputFilename, quiet=True) if inputArgs: inputFormat = inputArgs[1] self._inputFormat = inputFormat return self._inputFormat = self.askInputFormat() def checkOutputFormat(self, forceAsk: bool = False): if not forceAsk: outputArgs = Glossary.detectOutputFormat( filename=self._outputFilename, inputFilename=self._inputFilename, quiet=True, ) if outputArgs: self._outputFormat = outputArgs[1] 
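			# e.g. an output path whose extension maps to a single plugin
			# is resolved here without prompting the user; the actual
			# resolution rules live in Glossary.detectOutputFormat
			# (illustrative note).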
return self._outputFormat = self.askOutputFormat() def askFormats(self): self.checkInputFormat(forceAsk=True) self.checkOutputFormat(forceAsk=True) def askInputOutputAgain(self): self.askInputFile() self.checkInputFormat(forceAsk=True) self.askOutputFile() self.checkOutputFormat(forceAsk=True) def printNonInteractiveCommand(self): from shlex import quote cmd = [ ui_cmd.COMMAND, quote(self._inputFilename), quote(self._outputFilename), quote(f"--read-format={self._inputFormat}"), quote(f"--write-format={self._outputFormat}"), ] if self._readOptions: optionsJson = json.dumps(self._readOptions, ensure_ascii=True) cmd.append(quote(f"--json-read-options={optionsJson}")) if self._writeOptions: optionsJson = json.dumps(self._writeOptions, ensure_ascii=True) cmd.append(quote(f"--json-write-options={optionsJson}")) if self.config: for key, value in self.config.items(): if value is None: continue if value == self.savedConfig.get(key): continue option = self.configDefDict.get(key) if option is None: log.error(f"config key {key} was not found") if not option.hasFlag: log.error(f"config key {key} has no command line flag") flag = option.customFlag if not flag: flag = key.replace('_', '-') if option.typ == "bool": if not value: flag = f"no-{flag}" cmd.append(f"--{flag}") else: cmd.append(quote(f"--{flag}={value}")) if self._convertOptions: if "infoOverride" in self._convertOptions: infoOverride = self._convertOptions.pop("infoOverride") for key, value in infoOverride.items(): flag = infoOverrideFlags.get(key) if not flag: log.error(f"unknow key {key} in infoOverride") continue cmd.append(f"--{flag}={value}") if "sortKeyName" in self._convertOptions: value = self._convertOptions.pop("sortKeyName") cmd.append(f"--sort-key={value}") for key, value in self._convertOptions.items(): if value is None: continue if key not in convertOptionsFlags: log.error(f"unknow key {key} in convertOptions") continue ftup = convertOptionsFlags[key] if ftup is None: continue if isinstance(value, bool): flag = ftup[int(value)] if flag: cmd.append(f"--{flag}") else: flag = ftup[0] cmd.append(f"--{flag}={value}") print() print( "If you want to repeat this conversion later, " "you can use this command:" ) print(shlex.join(cmd)) def setConfigAttrs(self): config = self.config self.promptIndentStr = config.get("cmdi.prompt.indent.str", ">") self.promptIndentColor = config.get("cmdi.prompt.indent.color", 2) self.promptMsgColor = config.get("cmdi.prompt.msg.color", -1) self.msgColor = config.get("cmdi.msg.color", -1) def run( self, inputFilename: str = "", outputFilename: str = "", inputFormat: str = "", outputFormat: str = "", reverse: bool = False, config: "Optional[Dict]" = None, readOptions: "Optional[Dict]" = None, writeOptions: "Optional[Dict]" = None, convertOptions: "Optional[Dict]" = None, glossarySetAttrs: "Optional[Dict]" = None, ): if config is None: config = {} if readOptions is None: readOptions = {} if writeOptions is None: writeOptions = {} if convertOptions is None: convertOptions = {} if glossarySetAttrs is None: glossarySetAttrs = {} self._inputFilename = inputFilename self._outputFilename = outputFilename self._inputFormat = inputFormat self._outputFormat = outputFormat self._readOptions = readOptions self._writeOptions = writeOptions self._convertOptions = convertOptions self._glossarySetAttrs = glossarySetAttrs self.loadConfig() self.savedConfig = dict(self.config) self.config = config del inputFilename, outputFilename, inputFormat, outputFormat del config, readOptions, writeOptions, convertOptions 
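		# After a successful conversion, printNonInteractiveCommand()
		# above prints a repeatable one-liner roughly like this
		# (file names and options are illustrative):
		#
		#     pyglossary in.ifo out.txt --read-format=Stardict \
		#         --write-format=Tabfile --sort-key=headword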
self.setConfigAttrs() if not self._inputFilename: try: self.askInputFile() except (KeyboardInterrupt, EOFError): return if not self._inputFormat: try: self.checkInputFormat() except (KeyboardInterrupt, EOFError): return if not self._outputFilename: try: self.askOutputFile() except (KeyboardInterrupt, EOFError): return if not self._outputFormat: try: self.checkOutputFormat() except (KeyboardInterrupt, EOFError): return while True: status = self.askFinalOptions() if status == back: self.askInputOutputAgain() continue if not status: return try: succeed = ui_cmd.UI.run(self, **self.getRunKeywordArgs()) except Exception as e: log.exception("") else: self.printNonInteractiveCommand() if succeed: if self.config != self.savedConfig and confirm("Save Config?"): self.saveConfig() return succeed print("Press Control + C to exit") pyglossary-4.5.0/pyglossary/ui/ui_gtk.py000066400000000000000000001231321417733132500204310ustar00rootroot00000000000000# -*- coding: utf-8 -*- # ui_gtk.py # # Copyright © 2008-2022 Saeed Rasooli (ilius) # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. import shutil import sys import os from os.path import join, isfile, isabs, splitext, abspath import logging import traceback from collections import OrderedDict from pyglossary.text_utils import urlToPath from pyglossary.os_utils import click_website from pyglossary.glossary import ( Glossary, defaultSortKeyName, ) from pyglossary.sort_keys import namedSortKeyList, namedSortKeyByName from .base import ( UIBase, logo, aboutText, authors, licenseText, ) from pyglossary import core from .dependency import checkDepends import gi gi.require_version("Gtk", "3.0") from .gtk3_utils import * from .gtk3_utils.utils import * from .gtk3_utils.dialog import MyDialog from .gtk3_utils.resize_button import ResizeButton from .gtk3_utils.about import AboutWidget # from gi.repository import GdkPixbuf log = logging.getLogger("pyglossary") gtk.Window.set_default_icon_from_file(logo) _ = str # later replace with translator function pluginByDesc = { plugin.description: plugin for plugin in Glossary.plugins.values() } readDesc = [ plugin.description for plugin in Glossary.plugins.values() if plugin.canRead ] writeDesc = [ plugin.description for plugin in Glossary.plugins.values() if plugin.canWrite ] def getScreenSize(): rootWindow = gdk.get_default_root_window() return rootWindow.get_width(), rootWindow.get_height() def getWorkAreaSize(): display = gdk.Display.get_default() monitor = display.get_primary_monitor() rect = monitor.get_workarea() return rect.width, rect.height def buffer_get_text(b): return b.get_text( b.get_start_iter(), b.get_end_iter(), True, ) class FormatDialog(gtk.Dialog): def __init__(self, descList: "List[str]", parent=None, **kwargs): gtk.Dialog.__init__(self, parent=parent, **kwargs) self.descList = descList self.items = descList self.activeDesc = "" ## self.connect("response", lambda w, e: self.hide()) dialog_add_button( self, "gtk-cancel", "_Cancel", 
			gtk.ResponseType.CANCEL,
		)
		dialog_add_button(
			self,
			"gtk-ok",
			"_OK",
			gtk.ResponseType.OK,
		)
		###
		treev = gtk.TreeView()
		treeModel = gtk.ListStore(str)
		treev.set_headers_visible(False)
		treev.set_model(treeModel)
		treev.connect("row-activated", self.rowActivated)
		# treev.connect("response", self.onResponse)
		###
		self.treev = treev
		#############
		cell = gtk.CellRendererText(editable=False)
		col = gtk.TreeViewColumn(
			title="Description",
			cell_renderer=cell,
			text=0,
		)
		col.set_property("expand", True)
		col.set_resizable(True)
		treev.append_column(col)
		self.descCol = col
		############
		hbox = HBox(spacing=15)
		hbox.set_border_width(10)
		pack(hbox, gtk.Label("Search:"))
		entry = self.entry = gtk.Entry()
		pack(hbox, entry, 1, 1)
		pack(self.vbox, hbox)
		###
		entry.connect("changed", self.onEntryChange)
		############
		self.swin = swin = gtk.ScrolledWindow()
		swin.add(treev)
		swin.set_policy(gtk.PolicyType.NEVER, gtk.PolicyType.AUTOMATIC)
		pack(self.vbox, swin, 1, 1)
		self.vbox.show_all()
		##
		treev.set_can_focus(True)  # no need, just to be safe
		treev.set_can_default(True)
		treev.set_receives_default(True)
		# print("can_focus:", treev.get_can_focus())
		# print("can_default:", treev.get_can_default())
		# print("receives_default:", treev.get_receives_default())
		####
		self.updateTree()
		self.resize(400, 400)
		self.connect("realize", self.onRealize)

	def onRealize(self, widget=None):
		if self.activeDesc:
			self.treev.grab_focus()
		else:
			self.entry.grab_focus()

	def onEntryChange(self, entry):
		text = entry.get_text().strip()
		if not text:
			self.items = self.descList
			self.updateTree()
			return
		text = text.lower()
		descList = self.descList
		items1 = []
		items2 = []
		for desc in descList:
			if desc.lower().startswith(text):
				items1.append(desc)
			elif text in desc.lower():
				items2.append(desc)
		self.items = items1 + items2
		self.updateTree()

	def setCursor(self, desc: str):
		model = self.treev.get_model()
		_iter = model.iter_children(None)
		while _iter is not None:
			if model.get_value(_iter, 0) == desc:
				path = model.get_path(_iter)
				self.treev.set_cursor(path, self.descCol, False)
				self.treev.scroll_to_cell(path)
				return
			_iter = model.iter_next(_iter)

	def updateTree(self):
		model = self.treev.get_model()
		model.clear()
		for desc in self.items:
			model.append([desc])
		if self.activeDesc:
			self.setCursor(self.activeDesc)

	def getActive(self) -> "Optional[PluginProp]":
		_iter = self.treev.get_selection().get_selected()[1]
		if _iter is None:
			return
		model = self.treev.get_model()
		desc = model.get_value(_iter, 0)
		return pluginByDesc[desc]

	def setActive(self, plugin):
		if plugin is None:
			self.activeDesc = ""
			return
		desc = plugin.description
		self.activeDesc = desc
		self.setCursor(desc)

	def rowActivated(self, treev, path, col):
		model = treev.get_model()
		_iter = model.get_iter(path)
		desc = model.get_value(_iter, 0)
		self.activeDesc = desc
		self.response(gtk.ResponseType.OK)

	# def onResponse


class FormatButton(gtk.Button):
	noneLabel = "[Select Format]"
	dialogTitle = "Select Format"

	def __init__(self, descList: "List[str]", parent=None):
		gtk.Button.__init__(self)
		self.set_label(self.noneLabel)
		###
		self.descList = descList
		self._parent = parent
		self.activePlugin = None
		###
		self.connect("clicked", self.onClick)

	def onChanged(self, obj=None):
		pass

	def onClick(self, button=None):
		dialog = FormatDialog(
			descList=self.descList,
			parent=self._parent,
			title=self.dialogTitle,
		)
		dialog.setActive(self.activePlugin)
		if dialog.run() != gtk.ResponseType.OK:
			return
		plugin = dialog.getActive()
		self.activePlugin = plugin
		if plugin:
			self.set_label(plugin.description)
		else:
			self.set_label(self.noneLabel)
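		# Search ranking note: FormatDialog.onEntryChange above lists
		# prefix matches before substring matches, so typing "tab" would
		# rank a description like "Tabfile (.txt, .dic)" first (the
		# example string is illustrative).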
self.onChanged() def getActive(self): if self.activePlugin is None: return "" return self.activePlugin.name def setActive(self, _format): plugin = Glossary.plugins[_format] self.activePlugin = plugin self.set_label(plugin.description) self.onChanged() class FormatOptionsDialog(gtk.Dialog): def __init__( self, formatName: str, options: "List[str]", optionsValues: "Dict[str, Any]", parent=None, ): gtk.Dialog.__init__(self, parent=parent) optionsProp = Glossary.plugins[formatName].optionsProp self.optionsProp = optionsProp ## self.connect("response", lambda w, e: self.hide()) dialog_add_button( self, "gtk-cancel", "_Cancel", gtk.ResponseType.CANCEL, ) dialog_add_button( self, "gtk-ok", "_OK", gtk.ResponseType.OK, ) ### treev = gtk.TreeView() treeModel = gtk.ListStore( bool, # enable str, # name str, # comment str, # value ) treev.set_headers_clickable(True) treev.set_model(treeModel) treev.connect("row-activated", self.rowActivated) treev.connect("button-press-event", self.treeviewButtonPress) ### self.treev = treev ############# cell = gtk.CellRendererToggle() # cell.set_property("activatable", True) cell.connect("toggled", self.enableToggled) col = gtk.TreeViewColumn(title="Enable", cell_renderer=cell) col.add_attribute(cell, "active", 0) # cell.set_active(False) col.set_property("expand", False) col.set_resizable(True) treev.append_column(col) ### col = gtk.TreeViewColumn( title="Name", cell_renderer=gtk.CellRendererText(), text=1, ) col.set_property("expand", False) col.set_resizable(True) treev.append_column(col) ### cell = gtk.CellRendererText(editable=True) self.valueCell = cell self.valueCol = 3 cell.connect("edited", self.valueEdited) col = gtk.TreeViewColumn( title="Value", cell_renderer=cell, text=self.valueCol, ) col.set_property("expand", True) col.set_resizable(True) col.set_min_width(200) treev.append_column(col) ### col = gtk.TreeViewColumn( title="Comment", cell_renderer=gtk.CellRendererText(), text=2, ) col.set_property("expand", False) col.set_resizable(False) treev.append_column(col) ############# for name in options: prop = optionsProp[name] comment = prop.longComment if prop.typ != "bool" and not prop.values: comment += " (double-click to edit)" treeModel.append([ name in optionsValues, # enable name, # name comment, # comment str(optionsValues.get(name, "")), # value ]) ############ pack(self.vbox, treev, 1, 1) self.vbox.show_all() def enableToggled(self, cell, path): # enable is column 0 model = self.treev.get_model() active = not cell.get_active() itr = model.get_iter(path) model.set_value(itr, 0, active) def valueEdited(self, cell, path, rawValue): # value is column 3 model = self.treev.get_model() itr = model.get_iter(path) optName = model.get_value(itr, 1) prop = self.optionsProp[optName] if not prop.customValue: return enable = True if rawValue == "" and prop.typ != "str": enable = False elif not prop.validateRaw(rawValue): log.error(f"invalid {prop.typ} value: {optName} = {rawValue!r}") return model.set_value(itr, self.valueCol, rawValue) model.set_value(itr, 0, enable) def rowActivated(self, treev, path, col): # forceMenu=True because we can not enter edit mode # if double-clicked on a cell other than Value return self.valueCellClicked(path, forceMenu=True) def treeviewButtonPress(self, treev, gevent): if gevent.button != 1: return False pos_t = treev.get_path_at_pos(int(gevent.x), int(gevent.y)) if not pos_t: return False # pos_t == path, col, xRel, yRel path = pos_t[0] col = pos_t[1] # cell = col.get_cells()[0] if col.get_title() == "Value": return 
self.valueCellClicked(path) return False def valueItemActivate(self, item: gtk.MenuItem, itr: gtk.TreeIter): # value is column 3 value = item.get_label() model = self.treev.get_model() model.set_value(itr, self.valueCol, value) model.set_value(itr, 0, True) # enable it def valueCustomOpenDialog(self, itr: gtk.TreeIter, optName: str): model = self.treev.get_model() prop = self.optionsProp[optName] currentValue = model.get_value(itr, self.valueCol) optDesc = optName if prop.comment: optDesc += f" ({prop.comment})" label = gtk.Label(label=f"Value for {optDesc}") dialog = gtk.Dialog(parent=self, title="Option Value") dialog.connect("response", lambda w, e: dialog.hide()) dialog_add_button( dialog, "gtk-cancel", "_Cancel", gtk.ResponseType.CANCEL, ) dialog_add_button( dialog, "gtk-ok", "_OK", gtk.ResponseType.OK, ) pack(dialog.vbox, label, 0, 0) entry = gtk.Entry() entry.set_text(currentValue) entry.connect("activate", lambda w: dialog.response(gtk.ResponseType.OK)) pack(dialog.vbox, entry, 0, 0) dialog.vbox.show_all() if dialog.run() != gtk.ResponseType.OK: return value = entry.get_text() model.set_value(itr, self.valueCol, value) model.set_value(itr, 0, True) # enable it def valueItemCustomActivate(self, item: gtk.MenuItem, itr: gtk.TreeIter): model = self.treev.get_model() optName = model.get_value(itr, 1) self.valueCustomOpenDialog(itr, optName) def valueCellClicked(self, path, forceMenu=False) -> bool: """ returns True if event is handled, False if not handled (need to enter edit mode) """ model = self.treev.get_model() itr = model.get_iter(path) optName = model.get_value(itr, 1) prop = self.optionsProp[optName] if prop.typ == "bool": rawValue = model.get_value(itr, self.valueCol) if rawValue == "": value = False else: value, isValid = prop.evaluate(rawValue) if not isValid: log.error(f"invalid {optName} = {rawValue!r}") value = False model.set_value(itr, self.valueCol, str(not value)) model.set_value(itr, 0, True) # enable it return True propValues = prop.values if not propValues: if forceMenu: propValues = [] else: return False menu = gtk.Menu() if prop.customValue: item = gtk.MenuItem("[Custom Value]") item.connect("activate", self.valueItemCustomActivate, itr) item.show() menu.append(item) groupedValues = None if len(propValues) > 10: groupedValues = prop.groupValues() if groupedValues: for groupName, values in groupedValues.items(): item = gtk.MenuItem() item.set_label(groupName) if values is None: item.connect("activate", self.valueItemActivate, itr) else: subMenu = gtk.Menu() for subValue in values: subItem = gtk.MenuItem(label=str(subValue)) subItem.connect("activate", self.valueItemActivate, itr) subItem.show() subMenu.append(subItem) item.set_submenu(subMenu) item.show() menu.append(item) else: for value in propValues: item = gtk.MenuItem(value) item.connect("activate", self.valueItemActivate, itr) item.show() menu.append(item) etime = gtk.get_current_event_time() menu.popup(None, None, None, None, 3, etime) return True def getOptionsValues(self): model = self.treev.get_model() optionsValues = {} for row in model: if not row[0]: # not enable continue optName = row[1] rawValue = row[3] prop = self.optionsProp[optName] value, isValid = prop.evaluate(rawValue) if not isValid: log.error(f"invalid option value {optName} = {rawValue}") continue optionsValues[optName] = value return optionsValues class FormatBox(FormatButton): def __init__(self, descList: "List[str]", parent=None): FormatButton.__init__(self, descList, parent=parent) self.optionsValues = {} self.optionsButton = 
gtk.Button(label="Options") self.optionsButton.set_image(gtk.Image.new_from_icon_name( "gtk-preferences", gtk.IconSize.BUTTON, )) self.optionsButton.connect("clicked", self.optionsButtonClicked) self.dependsButton = gtk.Button(label="Install dependencies") self.dependsButton.pkgNames = [] self.dependsButton.connect("clicked", self.dependsButtonClicked) def setOptionsValues(self, optionsValues: "Dict[str, Any]"): self.optionsValues = optionsValues def kind(self): "returns 'r' or 'w'" raise NotImplementedError def getActiveOptions(self): raise NotImplementedError def optionsButtonClicked(self, button): formatName = self.getActive() options = self.getActiveOptions() dialog = FormatOptionsDialog( formatName, options, self.optionsValues, parent=self._parent, ) dialog.set_title("Options for " + formatName) if dialog.run() != gtk.ResponseType.OK: dialog.destroy() return self.optionsValues = dialog.getOptionsValues() dialog.destroy() def dependsButtonClicked(self, button): formatName = self.getActive() pkgNames = button.pkgNames if not pkgNames: print("All dependencies are stattisfied for " + formatName) return pkgNamesStr = " ".join(pkgNames) msg = ( "Run the following command:\n" f"{core.pip} install {pkgNamesStr}" ) showInfo( msg, title="Dependencies for " + formatName, selectable=True, parent=self._parent, ) self.onChanged(self) def onChanged(self, obj=None): name = self.getActive() if not name: self.optionsButton.set_visible(False) return self.optionsValues.clear() options = self.getActiveOptions() self.optionsButton.set_visible(bool(options)) kind = self.kind() plugin = Glossary.plugins[name] if kind == "r": depends = plugin.readDepends elif kind == "w": depends = plugin.writeDepends else: raise RuntimeError(f"invalid kind={kind}") uninstalled = checkDepends(depends) self.dependsButton.pkgNames = uninstalled self.dependsButton.set_visible(bool(uninstalled)) class InputFormatBox(FormatBox): dialogTitle = "Select Input Format" def __init__(self, **kwargs): FormatBox.__init__(self, readDesc, **kwargs) def kind(self): "returns 'r' or 'w'" return "r" def getActiveOptions(self): formatName = self.getActive() if not formatName: return return list(Glossary.formatsReadOptions[formatName].keys()) class OutputFormatBox(FormatBox): dialogTitle = "Select Output Format" def __init__(self, **kwargs): FormatBox.__init__(self, writeDesc, **kwargs) def kind(self): "returns 'r' or 'w'" return "w" def getActiveOptions(self): return list(Glossary.formatsWriteOptions[self.getActive()].keys()) class GtkTextviewLogHandler(logging.Handler): def __init__(self, treeview_dict): logging.Handler.__init__(self) self.buffers = {} for levelNameCap in log.levelNamesCap[:-1]: levelName = levelNameCap.upper() textview = treeview_dict[levelName] buff = textview.get_buffer() tag = gtk.TextTag.new(levelName) buff.get_tag_table().add(tag) self.buffers[levelName] = buff def getTag(self, levelname): return self.buffers[levelname].get_tag_table().lookup(levelname) def setColor(self, levelname: str, rgba: gdk.RGBA) -> None: self.getTag(levelname).set_property("foreground-rgba", rgba) # foreground-gdk is deprecated since Gtk 3.4 def emit(self, record): msg = "" if record.getMessage(): msg = self.format(record) # msg = msg.replace("\x00", "") if record.exc_info: _type, value, tback = record.exc_info tback_text = "".join( traceback.format_exception(_type, value, tback) ) if msg: msg += "\n" msg += tback_text buff = self.buffers[record.levelname] buff.insert_with_tags_by_name( buff.get_end_iter(), msg + "\n", record.levelname, ) class 
GtkSingleTextviewLogHandler(GtkTextviewLogHandler): def __init__(self, textview): GtkTextviewLogHandler.__init__(self, { "CRITICAL": textview, "ERROR": textview, "WARNING": textview, "INFO": textview, "DEBUG": textview, "TRACE": textview, }) class BrowseButton(gtk.Button): def __init__( self, setFilePathFunc, label="Browse", actionSave=False, title="Select File", ): gtk.Button.__init__(self) self.set_label(label) self.set_image(gtk.Image.new_from_icon_name( "document-save" if actionSave else "document-open", gtk.IconSize.BUTTON, )) self.actionSave = actionSave self.setFilePathFunc = setFilePathFunc self.title = title self.connect("clicked", self.onClick) def onClick(self, widget): fcd = gtk.FileChooserDialog( transient_for=self.get_toplevel(), action=gtk.FileChooserAction.SAVE if self.actionSave else gtk.FileChooserAction.OPEN, title=self.title, ) fcd.add_button(gtk.STOCK_CANCEL, gtk.ResponseType.CANCEL) fcd.add_button(gtk.STOCK_OK, gtk.ResponseType.OK) fcd.connect("response", lambda w, e: fcd.hide()) fcd.connect( "file-activated", lambda w: fcd.response(gtk.ResponseType.OK) ) if fcd.run() == gtk.ResponseType.OK: self.setFilePathFunc(fcd.get_filename()) fcd.destroy() sortKeyNameByDesc = { _sk.desc: _sk.name for _sk in namedSortKeyList } sortKeyNames = [ _sk.name for _sk in namedSortKeyList ] class SortOptionsBox(gtk.Box): def __init__(self, ui): gtk.Box.__init__(self, orientation=gtk.Orientation.VERTICAL) self.ui = ui ### hbox = gtk.HBox() sortCheck = gtk.CheckButton("Sort entries by") sortKeyCombo = gtk.ComboBoxText() for _sk in namedSortKeyList: sortKeyCombo.append_text(_sk.desc) sortKeyCombo.set_active(sortKeyNames.index(defaultSortKeyName)) sortKeyCombo.set_border_width(0) sortKeyCombo.set_sensitive(False) # sortKeyCombo.connect("changed", self.sortKeyComboChanged) self.sortCheck = sortCheck self.sortKeyCombo = sortKeyCombo sortCheck.connect("clicked", self.onSortCheckClicked) pack(hbox, sortCheck, 0, 0, padding=5) pack(hbox, sortKeyCombo, 0, 0, padding=5) pack(self, hbox, 0, 0, padding=5) ### hbox = self.encodingHBox = gtk.HBox() label = gtk.Label(label="Sort Encoding") encodingEntry = self.encodingEntry = gtk.Entry() encodingEntry.set_text("utf-8") encodingEntry.set_width_chars(15) pack(hbox, label, 0, 0, padding=10) pack(hbox, encodingEntry, 0, 0, padding=5) pack(self, hbox, 0, 0, padding=5) ### self.show_all() def onSortCheckClicked(self, check): sort = check.get_active() self.sortKeyCombo.set_sensitive(sort) self.encodingHBox.set_sensitive(sort) def updateWidgets(self): convertOptions = self.ui.convertOptions sort = convertOptions.get("sort") self.sortCheck.set_active(sort) self.sortKeyCombo.set_sensitive(sort) self.encodingHBox.set_sensitive(sort) sortKeyName = convertOptions.get("sortKeyName") if sortKeyName: self.sortKeyCombo.set_active(sortKeyNames.index(sortKeyName)) sortEncoding = convertOptions.get("sortEncoding", "utf-8") self.encodingEntry.set_text(sortEncoding) def applyChanges(self): convertOptions = self.ui.convertOptions sort = self.sortCheck.get_active() if not sort: for param in ("sort", "sortKeyName", "sortEncoding"): if param in convertOptions: del convertOptions[param] return sortKeyDesc = self.sortKeyCombo.get_active_text() convertOptions["sort"] = sort convertOptions["sortKeyName"] = sortKeyNameByDesc[sortKeyDesc] convertOptions["sortEncoding"] = self.encodingEntry.get_text() class GeneralOptionsDialog(gtk.Dialog): def onDeleteEvent(self, widget, event): self.hide() return True def onResponse(self, widget, event): self.applyChanges() self.hide() return True def 
__init__(self, ui, **kwargs): gtk.Dialog.__init__( self, transient_for=ui, **kwargs, ) self.set_title("General Options") self.ui = ui ## self.resize(600, 500) self.connect("delete-event", self.onDeleteEvent) ## self.connect("response", self.onResponse) dialog_add_button( self, "gtk-ok", "_OK", gtk.ResponseType.OK, ) ## hpad = 10 vpad = 5 ## self.sortOptionsBox = SortOptionsBox(ui) pack(self.vbox, self.sortOptionsBox, 0, 0, padding=vpad) ## hbox = gtk.HBox() self.sqliteCheck = gtk.CheckButton(label="SQLite mode") pack(hbox, self.sqliteCheck, 0, 0, padding=hpad) pack(self.vbox, hbox, 0, 0, padding=vpad) ## self.configParams = OrderedDict([ ("save_info_json", False), ("lower", False), ("skip_resources", False), ("rtl", False), ("enable_alts", True), ("cleanup", True), ("remove_html_all", True), ]) self.configCheckButtons = {} configDefDict = UIBase.configDefDict for param, default in self.configParams.items(): hbox = gtk.HBox() comment = configDefDict[param].comment checkButton = gtk.CheckButton( label=comment.split("\n")[0] ) self.configCheckButtons[param] = checkButton pack(hbox, checkButton, 0, 0, padding=hpad) pack(self.vbox, hbox, 0, 0, padding=vpad) ## self.updateWidgets() self.vbox.show_all() def getSQLite(self) -> bool: convertOptions = self.ui.convertOptions sqlite = convertOptions.get("sqlite") if sqlite is not None: return sqlite return self.ui.config.get("auto_sqlite", True) def updateWidgets(self): config = self.ui.config self.sortOptionsBox.updateWidgets() self.sqliteCheck.set_active(self.getSQLite()) for param, check in self.configCheckButtons.items(): default = self.configParams[param] check.set_active(config.get(param, default)) def applyChanges(self): # print("applyChanges") self.sortOptionsBox.applyChanges() convertOptions = self.ui.convertOptions config = self.ui.config convertOptions["sqlite"] = self.sqliteCheck.get_active() for param, check in self.configCheckButtons.items(): config[param] = check.get_active() class GeneralOptionsButton(gtk.Button): def __init__(self, ui): gtk.Button.__init__(self, label="General Options") self.ui = ui self.connect("clicked", self.onClick) self.dialog = None def onClick(self, widget): if self.dialog is None: self.dialog = GeneralOptionsDialog(self.ui) self.dialog.present() class UI(gtk.Dialog, MyDialog, UIBase): def status(self, msg): # try: # _id = self.statusMsgDict[msg] # except KeyError: # _id = self.statusMsgDict[msg] = self.statusNewId # self.statusNewId += 1 _id = self.statusBar.get_context_id(msg) self.statusBar.push(_id, msg) def __init__(self): gtk.Dialog.__init__(self) UIBase.__init__(self) self.set_title("PyGlossary (Gtk3)") ##### screenW, screenH = getWorkAreaSize() winSize = min(800, screenW - 50, screenH - 50) self.resize(winSize, winSize) # print(f"{screenW}x{screenH}, {'%sx%s' % getScreenSize()}") ##### self.connect("delete-event", self.onDeleteEvent) self.pages = [] # self.statusNewId = 0 # self.statusMsgDict = {}## message -> id ##### self.convertOptions = {} ##### self.styleProvider = gtk.CssProvider() gtk.StyleContext.add_provider_for_screen( gdk.Screen.get_default(), self.styleProvider, gtk.STYLE_PROVIDER_PRIORITY_APPLICATION, ) css = "check {min-width: 1.25em; min-height: 1.25em;}\n" self.styleProvider.load_from_data(css.encode("utf-8")) ##### self.assert_quit = False self.path = "" # ____________________ Tab 1 - Convert ____________________ # labelSizeGroup = gtk.SizeGroup(mode=gtk.SizeGroupMode.HORIZONTAL) buttonSizeGroup = gtk.SizeGroup(mode=gtk.SizeGroupMode.HORIZONTAL) #### vbox = VBox() vbox.label = 
_("Convert") vbox.icon = "" # "*.png" self.pages.append(vbox) ###### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Input File:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) hbox.label.set_property("xalign", 0) self.convertInputEntry = gtk.Entry() pack(hbox, self.convertInputEntry, 1, 1) button = BrowseButton( self.convertInputEntry.set_text, label="Browse", actionSave=False, title="Select Input File", ) pack(hbox, button) buttonSizeGroup.add_widget(button) pack(vbox, hbox) ## self.convertInputEntry.connect( "changed", self.convertInputEntryChanged, ) ### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Input Format:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) hbox.label.set_property("xalign", 0) self.convertInputFormatCombo = InputFormatBox(parent=self) buttonSizeGroup.add_widget(self.convertInputFormatCombo.optionsButton) pack(hbox, self.convertInputFormatCombo) pack(hbox, gtk.Label(), 1, 1) pack(hbox, self.convertInputFormatCombo.dependsButton) pack(hbox, self.convertInputFormatCombo.optionsButton) pack(vbox, hbox) ##### vbox.sep1 = gtk.Label(label="") vbox.sep1.show() pack(vbox, vbox.sep1) ##### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Output File:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) hbox.label.set_property("xalign", 0) self.convertOutputEntry = gtk.Entry() pack(hbox, self.convertOutputEntry, 1, 1) button = BrowseButton( self.convertOutputEntry.set_text, label="Browse", actionSave=True, title="Select Output File", ) pack(hbox, button) buttonSizeGroup.add_widget(button) pack(vbox, hbox) ## self.convertOutputEntry.connect( "changed", self.convertOutputEntryChanged, ) ### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Output Format:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) hbox.label.set_property("xalign", 0) self.convertOutputFormatCombo = OutputFormatBox(parent=self) buttonSizeGroup.add_widget(self.convertOutputFormatCombo.optionsButton) pack(hbox, self.convertOutputFormatCombo) pack(hbox, gtk.Label(), 1, 1) pack(hbox, self.convertOutputFormatCombo.dependsButton) pack(hbox, self.convertOutputFormatCombo.optionsButton) pack(vbox, hbox) ##### hbox = HBox(spacing=10) label = gtk.Label(label="") pack(hbox, label, 1, 1, 5) ## button = GeneralOptionsButton(self) button.set_size_request(300, 40) pack(hbox, button, 0, 0, 0) ## self.convertButton = gtk.Button() self.convertButton.set_label("Convert") self.convertButton.connect("clicked", self.convertClicked) self.convertButton.set_size_request(300, 40) pack(hbox, self.convertButton, 0, 0, 10) ## pack(vbox, hbox, 0, 0, 15) ##### self.convertConsoleTextview = textview = gtk.TextView() swin = gtk.ScrolledWindow() swin.set_policy(gtk.PolicyType.AUTOMATIC, gtk.PolicyType.AUTOMATIC) swin.set_border_width(0) swin.add(textview) pack(vbox, swin, 1, 1) # ____________________ Tab 2 - Reverse ____________________ # self.reverseStatus = "" #### labelSizeGroup = gtk.SizeGroup(mode=gtk.SizeGroupMode.HORIZONTAL) #### vbox = VBox() vbox.label = _("Reverse") vbox.icon = "" # "*.png" # self.pages.append(vbox) ###### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Input Format:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) hbox.label.set_property("xalign", 0) self.reverseInputFormatCombo = InputFormatBox() pack(hbox, self.reverseInputFormatCombo) pack(vbox, hbox) ### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Input File:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) 
hbox.label.set_property("xalign", 0) self.reverseInputEntry = gtk.Entry() pack(hbox, self.reverseInputEntry, 1, 1) button = BrowseButton( self.reverseInputEntry.set_text, label="Browse", actionSave=False, title="Select Input File", ) pack(hbox, button) pack(vbox, hbox) ## self.reverseInputEntry.connect( "changed", self.reverseInputEntryChanged, ) ##### vbox.sep1 = gtk.Label(label="") vbox.sep1.show() pack(vbox, vbox.sep1) ##### hbox = HBox(spacing=3) hbox.label = gtk.Label(label=_("Output Tabfile:")) pack(hbox, hbox.label) labelSizeGroup.add_widget(hbox.label) hbox.label.set_property("xalign", 0) self.reverseOutputEntry = gtk.Entry() pack(hbox, self.reverseOutputEntry, 1, 1) button = BrowseButton( self.reverseOutputEntry.set_text, label="Browse", actionSave=True, title="Select Output File", ) pack(hbox, button) pack(vbox, hbox) ## self.reverseOutputEntry.connect( "changed", self.reverseOutputEntryChanged, ) ##### hbox = HBox(spacing=3) label = gtk.Label(label="") pack(hbox, label, 1, 1, 5) ### self.reverseStartButton = gtk.Button() self.reverseStartButton.set_label(_("Start")) self.reverseStartButton.connect("clicked", self.reverseStartClicked) pack(hbox, self.reverseStartButton, 1, 1, 2) ### self.reversePauseButton = gtk.Button() self.reversePauseButton.set_label(_("Pause")) self.reversePauseButton.set_sensitive(False) self.reversePauseButton.connect("clicked", self.reversePauseClicked) pack(hbox, self.reversePauseButton, 1, 1, 2) ### self.reverseResumeButton = gtk.Button() self.reverseResumeButton.set_label(_("Resume")) self.reverseResumeButton.set_sensitive(False) self.reverseResumeButton.connect("clicked", self.reverseResumeClicked) pack(hbox, self.reverseResumeButton, 1, 1, 2) ### self.reverseStopButton = gtk.Button() self.reverseStopButton.set_label(_("Stop")) self.reverseStopButton.set_sensitive(False) self.reverseStopButton.connect("clicked", self.reverseStopClicked) pack(hbox, self.reverseStopButton, 1, 1, 2) ### pack(vbox, hbox, 0, 0, 5) ###### about = AboutWidget( logo=logo, header=f"PyGlossary\nVersion {core.VERSION}", # about=summary, about=f'{aboutText}\n{core.homePage}', authors="\n".join(authors), license=licenseText, ) about.label = _("About") about.icon = "" # "*.png" self.pages.append(about) ##### # ____________________________________________________________ # notebook = gtk.Notebook() self.notebook = notebook ######### for vbox in self.pages: label = gtk.Label(label=vbox.label) label.set_use_underline(True) vb = VBox(spacing=3) if vbox.icon: vbox.image = imageFromFile(vbox.icon) pack(vb, vbox.image) pack(vb, label) vb.show_all() notebook.append_page(vbox, vb) try: notebook.set_tab_reorderable(vbox, True) except AttributeError: pass ####################### pack(self.vbox, notebook, 1, 1) # for i in ui.pagesOrder: # try: # j = pagesOrder[i] # except IndexError: # continue # notebook.reorder_child(self.pages[i], j) # ____________________________________________________________ # handler = GtkSingleTextviewLogHandler(textview) log.addHandler(handler) ### textview.override_background_color( gtk.StateFlags.NORMAL, gdk.RGBA(0, 0, 0, 1), ) ### handler.setColor("CRITICAL", rgba_parse("red")) handler.setColor("ERROR", rgba_parse("red")) handler.setColor("WARNING", rgba_parse("yellow")) handler.setColor("INFO", rgba_parse("white")) handler.setColor("DEBUG", rgba_parse("white")) handler.setColor("TRACE", rgba_parse("white")) ### textview.get_buffer().set_text("Output & Error Console:\n") textview.set_editable(False) # ____________________________________________________________ # 
self.progressTitle = "" self.progressBar = pbar = gtk.ProgressBar() pbar.set_fraction(0) # pbar.set_text(_("Progress Bar")) # pbar.get_style_context() # pbar.set_property("height-request", 20) pack(self.vbox, pbar, 0, 0) ############ hbox = HBox(spacing=5) clearButton = gtk.Button( use_stock=gtk.STOCK_CLEAR, always_show_image=True, label=_("Clear"), ) clearButton.show_all() # image = gtk.Image() # image.set_from_stock(gtk.STOCK_CLEAR, gtk.IconSize.MENU) # clearButton.add(image) clearButton.set_border_width(0) clearButton.connect("clicked", self.consoleClearButtonClicked) set_tooltip(clearButton, "Clear Console") pack(hbox, clearButton, 0, 0) #### # hbox.sepLabel1 = gtk.Label(label="") # pack(hbox, hbox.sepLabel1, 1, 1) ###### hbox.verbosityLabel = gtk.Label(label=_("Verbosity:")) pack(hbox, hbox.verbosityLabel, 0, 0) ## self.verbosityCombo = combo = gtk.ComboBoxText() for level, levelName in enumerate(log.levelNamesCap): combo.append_text(f"{level} - {_(levelName)}") combo.set_active(log.getVerbosity()) combo.set_border_width(0) combo.connect("changed", self.verbosityComboChanged) pack(hbox, combo, 0, 0) #### # hbox.sepLabel2 = gtk.Label(label="") # pack(hbox, hbox.sepLabel2, 1, 1) #### self.statusBar = sbar = gtk.Statusbar() pack(hbox, self.statusBar, 1, 1) #### hbox.resizeButton = ResizeButton(self) pack(hbox, hbox.resizeButton, 0, 0) ###### pack(self.vbox, hbox, 0, 0) # ____________________________________________________________ # self.vbox.show_all() notebook.set_current_page(0) # Convert tab self.convertInputFormatCombo.dependsButton.hide() self.convertOutputFormatCombo.dependsButton.hide() self.convertInputFormatCombo.optionsButton.hide() self.convertOutputFormatCombo.optionsButton.hide() ######## self.status("Select input file") def run( self, inputFilename: str = "", outputFilename: str = "", inputFormat: str = "", outputFormat: str = "", reverse: bool = False, config: "Optional[Dict]" = None, readOptions: "Optional[Dict]" = None, writeOptions: "Optional[Dict]" = None, convertOptions: "Optional[Dict]" = None, glossarySetAttrs: "Optional[Dict]" = None, ): if glossarySetAttrs is None: glossarySetAttrs = {} self.config = config if inputFilename: self.convertInputEntry.set_text(abspath(inputFilename)) if outputFilename: self.convertOutputEntry.set_text(abspath(outputFilename)) if inputFormat: self.convertInputFormatCombo.setActive(inputFormat) if outputFormat: self.convertOutputFormatCombo.setActive(outputFormat) if reverse: log.error(f"Gtk interface does not support Reverse feature") if readOptions: self.convertInputFormatCombo.setOptionsValues(readOptions) if writeOptions: self.convertOutputFormatCombo.setOptionsValues(writeOptions) self.convertOptions = convertOptions if convertOptions: log.debug(f"Using convertOptions={convertOptions}") self._glossarySetAttrs = glossarySetAttrs gtk.Dialog.present(self) gtk.main() def onDeleteEvent(self, widget, event): self.destroy() # gtk.main_quit() # if callled while converting, main_quit does not exit program, # it keeps printing warnings, # and makes you close the terminal or force kill the process sys.exit(0) def consoleClearButtonClicked(self, widget=None): self.convertConsoleTextview.get_buffer().set_text("") def verbosityComboChanged(self, widget=None): verbosity = self.verbosityCombo.get_active() # or int(self.verbosityCombo.get_active_text()) log.setVerbosity(verbosity) def convertClicked(self, widget=None): inPath = self.convertInputEntry.get_text() if not inPath: self.status("Input file path is empty!") log.critical("Input file path 
is empty!") return inFormat = self.convertInputFormatCombo.getActive() if inFormat: inFormatDesc = Glossary.plugins[inFormat].description else: inFormatDesc = "" # log.critical("Input format is empty!");return outPath = self.convertOutputEntry.get_text() if not outPath: self.status("Output file path is empty!") log.critical("Output file path is empty!") return outFormat = self.convertOutputFormatCombo.getActive() if outFormat: outFormatDesc = Glossary.plugins[outFormat].description else: outFormatDesc = "" # log.critical("Output format is empty!");return while gtk.events_pending(): gtk.main_iteration_do(False) self.convertButton.set_sensitive(False) self.progressTitle = "Converting" readOptions = self.convertInputFormatCombo.optionsValues writeOptions = self.convertOutputFormatCombo.optionsValues glos = Glossary(ui=self) glos.config = self.config for attr, value in self._glossarySetAttrs.items(): setattr(glos, attr, value) log.debug(f"readOptions: {readOptions}") log.debug(f"writeOptions: {writeOptions}") log.debug(f"convertOptions: {self.convertOptions}") log.debug(f"config: {self.config}") try: finalOutputFile = glos.convert( inPath, inputFormat=inFormat, outputFilename=outPath, outputFormat=outFormat, readOptions=readOptions, writeOptions=writeOptions, **self.convertOptions, ) if finalOutputFile: self.status("Convert finished") else: self.status("Convert failed") return bool(finalOutputFile) finally: self.convertButton.set_sensitive(True) self.assert_quit = False self.progressTitle = "" return True def convertInputEntryChanged(self, widget=None): inPath = self.convertInputEntry.get_text() inFormat = self.convertInputFormatCombo.getActive() if inPath.startswith("file://"): inPath = urlToPath(inPath) self.convertInputEntry.set_text(inPath) if self.config["ui_autoSetFormat"] and not inFormat: inputArgs = Glossary.detectInputFormat(inPath, quiet=True) if inputArgs: inFormatNew = inputArgs[1] self.convertInputFormatCombo.setActive(inFormatNew) if not isfile(inPath): return self.status("Select output file") def convertOutputEntryChanged(self, widget=None): outPath = self.convertOutputEntry.get_text() outFormat = self.convertOutputFormatCombo.getActive() if not outPath: return if outPath.startswith("file://"): outPath = urlToPath(outPath) self.convertOutputEntry.set_text(outPath) if self.config["ui_autoSetFormat"] and not outFormat: outputArgs = Glossary.detectOutputFormat( filename=outPath, inputFilename=self.convertInputEntry.get_text(), quiet=True, ) if outputArgs: outFormat = outputArgs[1] self.convertOutputFormatCombo.setActive(outFormat) if outFormat: self.status("Press \"Convert\"") else: self.status("Select output format") def reverseLoad(self): pass def reverseStartLoop(self): pass def reverseStart(self): if not self.reverseLoad(): return ### self.reverseStatus = "doing" self.reverseStartLoop() ### self.reverseStartButton.set_sensitive(False) self.reversePauseButton.set_sensitive(True) self.reverseResumeButton.set_sensitive(False) self.reverseStopButton.set_sensitive(True) def reverseStartClicked(self, widget=None): self.waitingDo(self.reverseStart) def reversePause(self): self.reverseStatus = "pause" ### self.reverseStartButton.set_sensitive(False) self.reversePauseButton.set_sensitive(False) self.reverseResumeButton.set_sensitive(True) self.reverseStopButton.set_sensitive(True) def reversePauseClicked(self, widget=None): self.waitingDo(self.reversePause) def reverseResume(self): self.reverseStatus = "doing" ### self.reverseStartButton.set_sensitive(False) 
	def progressInit(self, title):
		self.progressTitle = title

	def progress(self, rat, text=None):
		if not text:
			text = "%" + str(int(rat * 100))
		text += " - " + self.progressTitle
		self.progressBar.set_fraction(rat)
		# self.progressBar.set_text(text)  # not working
		self.status(text)
		while gtk.events_pending():
			gtk.main_iteration_do(False)
pyglossary-4.5.0/pyglossary/ui/ui_qt.py000066400000000000000000000025351417733132500202730ustar00rootroot00000000000000
# -*- coding: utf-8 -*-
# ui_qt.py
#
# Copyright © 2010-2019 Saeed Rasooli (ilius)
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

from pyglossary.glossary import *
from .base import *
from os.path import join

from PyQt4 import QtGui as qt
from PyQt4 import QtCore as qc

noneItem = 'Not Selected'


class UI(qt.QWidget, UIBase):
	def __init__(self):
		qt.QWidget.__init__(self)
		UIBase.__init__(self)
		self.setWindowTitle('PyGlossary (Qt)')
		self.setWindowIcon(qt.QIcon(join(uiDir, 'pyglossary.png')))
		######################
		self.running = False
		self.glos = Glossary(ui=self)
		self.glos.config = self.config
		self.pathI = ''
		self.pathO = ''
		self.fcd_dir = join(homeDir, 'Desktop')
		######################
		vbox = qt.QVBoxLayout()
		self.setLayout(vbox)
pyglossary-4.5.0/pyglossary/ui/ui_tk.py000066400000000000000000001117471417733132500202710ustar00rootroot00000000000000
# -*- coding: utf-8 -*-
# ui_tk.py
#
# Copyright © 2009-2021 Saeed Rasooli (ilius)
#
# This program is a free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# You can get a copy of GNU General Public License along this program
# But you can always get it from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

from pyglossary import core
from pyglossary.core import homeDir, confDir
from pyglossary.glossary import (
	Glossary,
)
from .base import (
	UIBase,
	logo,
	aboutText,
	authors,
	licenseText,
)
from pyglossary.text_utils import urlToPath

import os
from os.path import join, isfile, abspath, splitext
import logging
import traceback

import tkinter as tk
from tkinter import filedialog
from tkinter import tix
from tkinter import ttk
from tkinter import font as tkFont

log = logging.getLogger("pyglossary")

pluginByDesc = {
	plugin.description: plugin
	for plugin in Glossary.plugins.values()
}
readDesc = [
	plugin.description
	for plugin in Glossary.plugins.values()
	if plugin.canRead
]
writeDesc = [
	plugin.description
	for plugin in Glossary.plugins.values()
	if plugin.canWrite
]


def set_window_icon(window):
	window.iconphoto(
		True,
		tk.PhotoImage(file=logo),
	)


def decodeGeometry(gs):
	"""
	example for gs: "253x252+30+684"
	returns (x, y, w, h)
	"""
	p = gs.split("+")
	w, h = p[0].split("x")
	return (int(p[1]), int(p[2]), int(w), int(h))


def encodeGeometry(x, y, w, h):
	return f"{w}x{h}+{x}+{y}"


def encodeLocation(x, y):
	return f"+{x}+{y}"


def centerWindow(win):
	"""
	centers a tkinter window
	:param win: the root or Toplevel window to center
	"""
	win.update_idletasks()
	width = win.winfo_width()
	frm_width = win.winfo_rootx() - win.winfo_x()
	win_width = width + 2 * frm_width
	height = win.winfo_height()
	titlebar_height = win.winfo_rooty() - win.winfo_y()
	win_height = height + titlebar_height + frm_width
	x = win.winfo_screenwidth() // 2 - win_width // 2
	y = win.winfo_screenheight() // 2 - win_height // 2
	win.geometry(encodeGeometry(x, y, width, height))
	win.deiconify()


def newButton(*args, **kwargs):
	button = tk.Button(*args, **kwargs)

	def onEnter(event=None):
		button.invoke()

	button.bind("<Return>", onEnter)
	button.bind("<KP_Enter>", onEnter)

	return button


def newTTKButton(*args, **kwargs):
	button = ttk.Button(*args, **kwargs)

	def onEnter(event=None):
		button.invoke()

	button.bind("<Return>", onEnter)
	button.bind("<KP_Enter>", onEnter)

	return button


def newLabelWithImage(parent, file=""):
	image = tk.PhotoImage(file=file)
	label = ttk.Label(parent, image=image)
	label.image = image  # keep a reference!
	return label
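# A quick round-trip of the two geometry helpers above, derived from the
# docstring example (added here for illustration):
#
#	decodeGeometry("253x252+30+684")  # -> (30, 684, 253, 252)
#	encodeGeometry(30, 684, 253, 252)  # -> "253x252+30+684"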
def newReadOnlyText(
	parent,
	text="",
	borderwidth=10,
	font=None,
):
	height = len(text.strip().split("\n"))
	widget = tk.Text(
		parent,
		height=height,
		borderwidth=borderwidth,
		font=font,
	)
	widget.insert(1.0, text)
	widget.pack()
	# widget.bind("<Button-1>", lambda e: "break")
	widget.configure(state="disabled")
	# if tkinter is 8.5 or above you'll want the selection background
	# to appear like it does when the widget is activated
	# comment this out for older versions of Tkinter
	widget.configure(
		inactiveselectbackground=widget.cget("selectbackground"),
		bg=parent.cget('bg'),
		relief="flat",
	)
	return widget


class TkTextLogHandler(logging.Handler):
	def __init__(self, tktext):
		logging.Handler.__init__(self)
		#####
		tktext.tag_config("CRITICAL", foreground="#ff0000")
		tktext.tag_config("ERROR", foreground="#ff0000")
		tktext.tag_config("WARNING", foreground="#ffff00")
		tktext.tag_config("INFO", foreground="#00ff00")
		tktext.tag_config("DEBUG", foreground="#ffffff")
		tktext.tag_config("TRACE", foreground="#ffffff")
		###
		self.tktext = tktext

	def emit(self, record):
		msg = ""
		if record.getMessage():
			msg = self.format(record)
		###
		if record.exc_info:
			_type, value, tback = record.exc_info
			tback_text = "".join(
				traceback.format_exception(_type, value, tback)
			)
			if msg:
				msg += "\n"
			msg += tback_text
		###
		self.tktext.insert(
			"end",
			msg + "\n",
			record.levelname,
		)


# Monkey-patch Tkinter
# http://stackoverflow.com/questions/5191830/python-exception-logging
def CallWrapper__call__(self, *args):
	"""
	Apply first function SUBST to arguments, then FUNC.
	"""
	if self.subst:
		args = self.subst(*args)
	try:
		return self.func(*args)
	except Exception:
		log.exception("Exception in Tkinter callback:")


tk.CallWrapper.__call__ = CallWrapper__call__


class ProgressBar(tix.Frame):
	"""
	This comes from John Grayson's book "Python and Tkinter programming"
	Edited by Saeed Rasooli
	"""
	def __init__(
		self,
		rootWin=None,
		orientation="horizontal",
		min_=0,
		max_=100,
		width=100,
		height=18,
		appearance="sunken",
		fillColor="blue",
		background="gray",
		labelColor="yellow",
		labelFont="Verdana",
		labelFormat="%d%%",
		value=0,
		bd=2,
	):
		# preserve various values
		self.rootWin = rootWin
		self.orientation = orientation
		self.min = min_
		self.max = max_
		self.width = width
		self.height = height
		self.fillColor = fillColor
		self.labelFont = labelFont
		self.labelColor = labelColor
		self.background = background
		self.labelFormat = labelFormat
		self.value = value
		tix.Frame.__init__(self, rootWin, relief=appearance, bd=bd)
		self.canvas = tix.Canvas(
			self,
			height=height,
			width=width,
			bd=0,
			highlightthickness=0,
			background=background,
		)
		self.scale = self.canvas.create_rectangle(
			0, 0, width, height,
			fill=fillColor,
		)
		self.label = self.canvas.create_text(
			width / 2,
			height / 2,
			text="",
			anchor="c",
			fill=labelColor,
			font=self.labelFont,
		)
		self.update()
		self.bind("<Configure>", self.update)
		self.canvas.pack(side="top", fill="x", expand="no")

	def updateProgress(self, value, _max=None, text=""):
		if _max:
			self.max = _max
		self.value = value
		self.update(None, text)

	def update(self, event=None, labelText=""):
		# Trim the values to be between min and max
		value = self.value
		if value > self.max:
			value = self.max
		if value < self.min:
			value = self.min
		# Adjust the rectangle
		width = int(self.winfo_width())
		# width = self.width
		ratio = float(value) / self.max
		if self.orientation == "horizontal":
			self.canvas.coords(
				self.scale,
				0, 0,
				width * ratio, self.height,
			)
		else:
			self.canvas.coords(
				self.scale,
				0,
				self.height * (1 - ratio),
				width,
				self.height,
			)
		# Now update the colors
		self.canvas.itemconfig(self.scale, fill=self.fillColor)
		self.canvas.itemconfig(self.label, fill=self.labelColor)
		# And update the label
		if not labelText:
			labelText = self.labelFormat % int(ratio * 100)
		self.canvas.itemconfig(self.label, text=labelText)
		# FIXME: resizing window causes problem in progressbar
		# self.canvas.move(self.label, width/2, self.height/2)
		# self.canvas.scale(self.label, 0, 0, float(width)/self.width, 1)
		self.canvas.update_idletasks()
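# FormatDialog below implements an incremental search over plugin
# descriptions: in onEntryKeyRelease(), prefix matches are ranked before
# substring matches, and updateTree() re-selects the previously active
# description whenever it survives the filtering.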
class FormatDialog(tix.Toplevel):
	def __init__(
		self,
		descList: "List[str]",
		title: str,
		onOk: "Callable",
		button: "FormatButton",
		activeDesc: str = "",
	):
		tix.Toplevel.__init__(self)  # bg="#0f0" does not work
		self.descList = descList
		self.items = self.descList
		self.onOk = onOk
		self.activeDesc = activeDesc
		self.lastSearch = None
		self.resizable(width=True, height=True)
		if title:
			self.title(title)
		set_window_icon(self)
		self.bind('<Escape>', lambda e: self.destroy())
		px, py, pw, ph = decodeGeometry(button.winfo_toplevel().geometry())
		width = 400
		height = 400
		self.geometry(encodeGeometry(
			px + pw // 2 - width // 2,
			py + ph // 2 - height // 2,
			width,
			height,
		))
		entryBox = tk.Frame(master=self)
		label = ttk.Label(master=entryBox, text="Search: ")
		label.pack(side="left")
		entry = self.entry = ttk.Entry(master=entryBox)
		entry.pack(fill="x", expand=True, side="left")
		entryBox.pack(fill="x", padx=5, pady=5)
		entry.bind("<KeyRelease>", self.onEntryKeyRelease)
		entry.focus()
		treevBox = tk.Frame(master=self)
		treev = self.treev = ttk.Treeview(
			master=treevBox,
			columns=["Description"],
			show="",
		)
		treev.bind("<Double-1>", self.onTreeDoubleClick)
		treev.pack(
			side="left",
			fill="both",
			expand=True,
		)
		vsb = ttk.Scrollbar(
			master=treevBox,
			orient="vertical",
			command=treev.yview,
		)
		vsb.pack(side="right", fill="y")
		treevBox.pack(
			fill="both",
			expand=True,
			padx=5,
			pady=5,
		)
		treev.configure(yscrollcommand=vsb.set)
		self.updateTree()
		buttonBox = tix.Frame(master=self)
		cancelButton = newTTKButton(
			buttonBox,
			text="Cancel",
			command=self.cancelClicked,
		)
		cancelButton.pack(side="right")
		okButton = newTTKButton(
			buttonBox,
			text=" OK ",
			command=self.okClicked,
			# bg="#ff0000",
			# activebackground="#ff5050",
		)
		okButton.pack(side="right")
		buttonBox.pack(fill="x")
		self.bind("<Return>", self.onReturnPress)
		self.bind("<KP_Enter>", self.onReturnPress)
		self.bind("<Down>", self.onDownPress)
		self.bind("<Up>", self.onUpPress)
		# self.bind("<KeyPress>", self.onKeyPress)

	def setActiveRow(self, desc):
		self.treev.selection_set(desc)
		self.treev.see(desc)

	def updateTree(self):
		treev = self.treev
		current = treev.get_children()
		if current:
			treev.delete(*current)
		for desc in self.items:
			treev.insert("", "end", values=[desc], iid=desc)  # iid should be rowId
		if self.activeDesc in self.items:
			self.setActiveRow(self.activeDesc)

	def onEntryKeyRelease(self, event):
		text = self.entry.get().strip()
		if text == self.lastSearch:
			return
		if not text:
			self.items = self.descList
			self.updateTree()
			self.lastSearch = text
			return
		text = text.lower()
		descList = self.descList
		items1 = []
		items2 = []
		for desc in descList:
			if desc.lower().startswith(text):
				items1.append(desc)
			elif text in desc.lower():
				items2.append(desc)
		self.items = items1 + items2
		self.updateTree()
		self.lastSearch = text

	def onTreeDoubleClick(self, event):
		self.okClicked()

	def cancelClicked(self):
		self.destroy()

	def onReturnPress(self, event):
		self.okClicked()

	def onDownPress(self, event):
		treev = self.treev
		selection = treev.selection()
		if selection:
			nextDesc = treev.next(selection[0])
			if nextDesc:
				self.setActiveRow(nextDesc)
		else:
			if self.items:
				self.setActiveRow(self.items[0])
		treev.focus()

	def onUpPress(self, event):
		treev = self.treev
		treev.focus()
		selection = treev.selection()
		if not selection:
			if self.items:
				self.setActiveRow(self.items[0])
			return
		nextDesc = treev.prev(selection[0])
		if nextDesc:
			self.setActiveRow(nextDesc)

	def onKeyPress(self, event):
		print(f"FormatDialog: onKeyPress: {event}")

	def okClicked(self):
		treev = self.treev
		selectedList = treev.selection()
		if selectedList:
			desc = selectedList[0]
		else:
			desc = ""
		self.onOk(desc)
		self.destroy()
class FormatButton(ttk.Button):
	noneLabel = "[Select Format]"

	def __init__(
		self,
		descList: "List[str]",
		dialogTitle: str,
		onChange: "Callable",
		master=None,
	):
		self.var = tk.StringVar()
		self.var.set(self.noneLabel)
		ttk.Button.__init__(
			self,
			master=master,
			textvariable=self.var,
			command=self.onClick,
		)
		self.descList = descList
		self.dialogTitle = dialogTitle
		self._onChange = onChange
		self.activeDesc = ""
		self.bind("<Return>", self.onEnter)
		self.bind("<KP_Enter>", self.onEnter)

	def onEnter(self, event=None):
		self.invoke()

	def onChange(self, desc):
		self.set(desc)
		self._onChange(desc)

	def get(self):
		return self.activeDesc

	def set(self, desc):
		if desc:
			self.var.set(desc)
		else:
			self.var.set(self.noneLabel)
		self.activeDesc = desc

	def onClick(self):
		dialog = FormatDialog(
			descList=self.descList,
			title=self.dialogTitle,
			onOk=self.onChange,
			button=self,
			activeDesc=self.activeDesc,
		)
		dialog.focus()


class FormatOptionsDialog(tix.Toplevel):
	kindFormatsOptions = {
		"Read": Glossary.formatsReadOptions,
		"Write": Glossary.formatsWriteOptions,
	}

	def __init__(self, format, kind, values, master=None):
		tix.Toplevel.__init__(self)  # bg="#0f0" does not work
		self.resizable(width=True, height=True)
		self.title(kind + " Options")
		set_window_icon(self)
		self.bind('<Escape>', lambda e: self.destroy())
		self.menu = None
		self.format = format
		self.kind = kind
		self.values = values
		self.options = list(self.kindFormatsOptions[kind][format].keys())
		self.optionsProp = Glossary.plugins[format].optionsProp
		self.createOptionsList()
		buttonBox = tix.Frame(self)
		okButton = newTTKButton(
			buttonBox,
			text=" OK ",
			command=self.okClicked,
			# bg="#ff0000",
			# activebackground="#ff5050",
		)
		okButton.pack(side="right")
		buttonBox.pack(fill="x")

	def createOptionsList(self):
		values = self.values
		self.valueCol = "#3"
		cols = [
			"Enable",  # bool
			"Name",  # str
			"Value",  # str
			"Comment",  # str
		]
		treev = self.treev = ttk.Treeview(
			master=self,
			columns=cols,
			show="headings",
		)
		for col in cols:
			treev.heading(
				col,
				text=col,
				# command=lambda c=col: sortby(treev, c, 0),
			)
			# adjust the column's width to the header string
			treev.column(
				col,
				width=tkFont.Font().measure(col.title()),
			)
		###
		treev.bind(
			"<Button-1>",
			# "<<TreeviewSelect>>",  # event.x and event.y are zero
			self.treeClicked,
		)
		treev.pack(fill="x", expand=True)
		###
		for optName in self.options:
			prop = self.optionsProp[optName]
			comment = prop.longComment
			row = [
				int(optName in values),
				optName,
				str(values.get(optName, "")),
				comment,
			]
			treev.insert("", "end", values=row, iid=optName)  # iid should be rowId
			# adjust column's width if necessary to fit each value
			for col_i, value in enumerate(row):
				value = str(value)
				if col_i == 3:
					value = value.zfill(20)
					# to reserve window width, because it's hard to resize it later
				col_w = tkFont.Font().measure(value)
				if treev.column(cols[col_i], width=None) < col_w:
					treev.column(cols[col_i], width=col_w)
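	# In the treeview above, column "#1" holds the Enable flag ("0"/"1")
	# and self.valueCol ("#3") holds the raw option value as text;
	# okClicked() evaluates and applies only the rows whose Enable flag
	# is set.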
	def valueMenuItemCustomSelected(self, treev, format, optName, menu=None):
		if menu:
			menu.destroy()
			self.menu = None
		value = treev.set(optName, self.valueCol)
		dialog = tix.Toplevel(master=treev)  # bg="#0f0" does not work
		dialog.resizable(width=True, height=True)
		dialog.title(optName)
		set_window_icon(dialog)
		dialog.bind('<Escape>', lambda e: dialog.destroy())
		px, py, pw, ph = decodeGeometry(treev.winfo_toplevel().geometry())
		width = 300
		height = 100
		dialog.geometry(encodeGeometry(
			px + pw // 2 - width // 2,
			py + ph // 2 - height // 2,
			width,
			height,
		))
		frame = tix.Frame(master=dialog)
		label = ttk.Label(master=frame, text="Value for " + optName)
		label.pack()
		entry = ttk.Entry(master=frame)
		entry.insert(0, value)
		entry.pack(fill="x")
		prop = Glossary.plugins[format].optionsProp[optName]

		def customOkClicked(event=None):
			rawValue = entry.get()
			if not prop.validateRaw(rawValue):
				log.error(f"invalid {prop.typ} value: {optName} = {rawValue!r}")
				return
			treev.set(optName, self.valueCol, rawValue)
			treev.set(optName, "#1", "1")  # enable it
			col_w = tkFont.Font().measure(rawValue)
			if treev.column("Value", width=None) < col_w:
				treev.column("Value", width=col_w)
			dialog.destroy()

		entry.bind("<Return>", customOkClicked)
		label = ttk.Label(master=frame)
		label.pack(fill="x")
		customOkbutton = newTTKButton(
			frame,
			text=" OK ",
			command=customOkClicked,
			# bg="#ff0000",
			# activebackground="#ff5050",
		)
		customOkbutton.pack(side="right")
		###
		frame.pack(fill="x")
		dialog.focus()

	def valueMenuItemSelected(self, optName, menu, value):
		treev = self.treev
		treev.set(optName, self.valueCol, value)
		treev.set(optName, "#1", "1")  # enable it
		col_w = tkFont.Font().measure(value)
		if treev.column("Value", width=None) < col_w:
			treev.column("Value", width=col_w)
		menu.destroy()
		self.menu = None

	def valueCellClicked(self, event, optName):
		if not optName:
			return
		treev = self.treev
		prop = self.optionsProp[optName]
		propValues = prop.values
		if not propValues:
			if prop.customValue:
				self.valueMenuItemCustomSelected(treev, self.format, optName, None)
			else:
				log.error(
					f"invalid option {optName}, values={propValues}"
					f", customValue={prop.customValue}"
				)
			return
		if prop.typ == "bool":
			rawValue = treev.set(optName, self.valueCol)
			if rawValue == "":
				value = False
			else:
				value, isValid = prop.evaluate(rawValue)
				if not isValid:
					log.error(f"invalid {optName} = {rawValue!r}")
					value = False
			treev.set(optName, self.valueCol, str(not value))
			treev.set(optName, "#1", "1")  # enable it
			return
		menu = tk.Menu(
			master=treev,
			title=optName,
			tearoff=False,
		)
		self.menu = menu  # to destroy it later
		if prop.customValue:
			menu.add_command(
				label="[Custom Value]",
				command=lambda: self.valueMenuItemCustomSelected(
					treev, self.format, optName, menu,
				),
			)
		groupedValues = None
		if len(propValues) > 10:
			groupedValues = prop.groupValues()
		maxItemW = 0

		def valueMenuItemSelectedCommand(value):
			def callback():
				self.valueMenuItemSelected(optName, menu, value)
			return callback

		if groupedValues:
			for groupName, subValues in groupedValues.items():
				if subValues is None:
					# the group name is itself a selectable value here
					menu.add_command(
						label=str(groupName),
						command=valueMenuItemSelectedCommand(groupName),
					)
					maxItemW = max(maxItemW, tkFont.Font().measure(str(groupName)))
				else:
					subMenu = tk.Menu(tearoff=False)
					for subValue in subValues:
						subMenu.add_command(
							label=str(subValue),
							command=valueMenuItemSelectedCommand(subValue),
						)
					menu.add_cascade(label=groupName, menu=subMenu)
					maxItemW = max(maxItemW, tkFont.Font().measure(groupName))
		else:
			for value in propValues:
				value = str(value)
				menu.add_command(
					label=value,
					command=valueMenuItemSelectedCommand(value),
				)

		def close():
			menu.destroy()
			self.menu = None

		menu.add_command(
			label="[Close]",
			command=close,
		)
		try:
			menu.tk_popup(
				event.x_root,
				event.y_root,
			)
			# do not pass the third argument (entry), so that the menu
			# appears where the pointer is on its top-left corner
		finally:
			# make sure to release the grab (Tk 8.0a1 only)
			menu.grab_release()
	def treeClicked(self, event):
		treev = self.treev
		if self.menu:
			self.menu.destroy()
			self.menu = None
			return
		optName = treev.identify_row(event.y)  # optName is rowId
		if not optName:
			return
		col = treev.identify_column(event.x)  # "#1" to self.valueCol
		if col == "#1":
			value = treev.set(optName, col)
			treev.set(optName, col, 1 - int(value))
			return
		if col == self.valueCol:
			self.valueCellClicked(event, optName)

	def okClicked(self):
		treev = self.treev
		for optName in self.options:
			enable = bool(int(treev.set(optName, "#1")))
			if not enable:
				if optName in self.values:
					del self.values[optName]
				continue
			rawValue = treev.set(optName, self.valueCol)
			prop = self.optionsProp[optName]
			value, isValid = prop.evaluate(rawValue)
			if not isValid:
				log.error(f"invalid option value {optName} = {rawValue}")
				continue
			self.values[optName] = value
		self.destroy()


class FormatOptionsButton(tk.Button):
	def __init__(
		self,
		kind: "Literal['Read', 'Write']",
		values: "Dict",
		formatInput: "FormatButton",
		master=None,
	):
		tk.Button.__init__(
			self,
			master=master,
			text="Options",
			command=self.buttonClicked,
			# bg="#f0f000",
			# activebackground="#f6f622",
			borderwidth=3,
		)
		self.kind = kind
		self.values = values
		self.formatInput = formatInput

	def setOptionsValues(self, values):
		self.values = values

	def buttonClicked(self):
		formatD = self.formatInput.get()
		if not formatD:
			return
		format = pluginByDesc[formatD].name
		dialog = FormatOptionsDialog(format, self.kind, self.values, master=self)
		# x, y, w, h = decodeGeometry(dialog.geometry())
		w, h = 380, 250
		# w and h are rough estimated width and height of `dialog`
		px, py, pw, ph = decodeGeometry(self.winfo_toplevel().geometry())
		# move dialog without changing the size
		dialog.geometry(encodeLocation(
			px + pw // 2 - w // 2,
			py + ph // 2 - h // 2,
		))
		dialog.focus()


class UI(tix.Frame, UIBase):
	fcd_dir_save_path = join(confDir, "ui-tk-fcd-dir")

	def __init__(self):
		rootWin = self.rootWin = tix.Tk()
		# a hack that hides the window until we move it to the center of screen
		if os.sep == "\\":  # Windows
			rootWin.attributes('-alpha', 0.0)
		else:  # Linux
			rootWin.withdraw()
		tix.Frame.__init__(self, rootWin)
		UIBase.__init__(self)
		rootWin.title("PyGlossary (Tkinter)")
		rootWin.resizable(True, False)
		########
		set_window_icon(rootWin)
		rootWin.bind('<Escape>', lambda e: rootWin.quit())
		#########
		# Linux: ('clam', 'alt', 'default', 'classic')
		# Windows: ('winnative', 'clam', 'alt', 'default', 'classic', 'vista',
		# 'xpnative')
		# style = ttk.Style()
		# style.theme_use("default")
		########
		self.pack(fill="x")
		# rootWin.bind("<Configure>", self.resized)
		#######################
		defaultFont = tkFont.nametofont('TkDefaultFont')
		if core.sysName in ("linux", "freebsd"):
			defaultFont.configure(size=int(defaultFont.cget("size") * 1.4))
		####
		self.biggerFont = defaultFont.copy()
		self.biggerFont.configure(size=int(defaultFont.cget("size") * 1.8))
		######################
		self.glos = Glossary(ui=self)
		self.glos.config = self.config
		self._convertOptions = {}
		self.pathI = ""
		self.pathO = ""
		fcd_dir = join(homeDir, "Desktop")
		if isfile(self.fcd_dir_save_path):
			try:
				with open(self.fcd_dir_save_path, encoding="utf-8") as fp:
					fcd_dir = fp.read().strip("\n")
			except Exception:
				log.exception("")
		self.fcd_dir = fcd_dir
		######################
		notebook = tix.NoteBook(self)
		notebook.add("tabConvert", label="Convert", underline=0)
		# notebook.add("tabReverse", label="Reverse", underline=0)
		notebook.add("tabAbout", label="About", underline=0)
		convertFrame = tix.Frame(notebook.tabConvert)
		aboutFrame = tix.Frame(notebook.tabAbout)
		###################
		row = 0
		label = ttk.Label(convertFrame, text="Input File: ")
		label.grid(
			row=row,
			column=0,
			sticky=tk.W,
			padx=5,
		)
		##
		entry = tix.Entry(convertFrame)
		entry.grid(
			row=row,
			column=1,
			columnspan=2,
			sticky=tk.W + tk.E,
			padx=0,
		)
		entry.bind_all("<KeyRelease>", self.anyEntryChanged)
		self.entryInputConvert = entry
		##
		button = newButton(
			convertFrame,
			text="Browse",
			command=self.browseInputConvert,
			# bg="#f0f000",
			# activebackground="#f6f622",
			borderwidth=3,
		)
		button.grid(
			row=row,
			column=3,
			sticky=tk.W + tk.E,
			padx=5,
		)
		######################
		row += 1
		label = ttk.Label(convertFrame, text="Input Format: ")
		label.grid(
			row=row,
			column=0,
			sticky=tk.W,
			padx=5,
		)
		##
		self.formatButtonInputConvert = FormatButton(
			master=convertFrame,
			descList=readDesc,
			dialogTitle="Select Input Format",
			onChange=self.inputFormatChanged,
		)
		self.formatButtonInputConvert.grid(
			row=row,
			column=1,
			columnspan=2,
			sticky=tk.W,
			padx=0,
		)
		##
		self.readOptions = {}  # type: Dict[str, Any]
		self.writeOptions = {}  # type: Dict[str, Any]
		##
		self.readOptionsButton = FormatOptionsButton(
			"Read",
			self.readOptions,
			self.formatButtonInputConvert,
			master=convertFrame,
		)
		self.inputFormatRow = row
		######################
		row += 1
		label = ttk.Label(convertFrame)
		label.grid(
			row=row,
			column=0,
			sticky=tk.W,
		)
		######################
		row += 1
		label = ttk.Label(convertFrame, text="Output Format: ")
		label.grid(
			row=row,
			column=0,
			sticky=tk.W,
			padx=5,
		)
		##
		self.formatButtonOutputConvert = FormatButton(
			master=convertFrame,
			descList=writeDesc,
			dialogTitle="Select Output Format",
			onChange=self.outputFormatChanged,
		)
		self.formatButtonOutputConvert.grid(
			row=row,
			column=1,
			columnspan=2,
			sticky=tk.W,
			padx=0,
		)
		##
		self.writeOptionsButton = FormatOptionsButton(
			"Write",
			self.writeOptions,
			self.formatButtonOutputConvert,
			master=convertFrame,
		)
		self.outputFormatRow = row
		###################
		row += 1
		label = ttk.Label(convertFrame, text="Output File: ")
		label.grid(
			row=row,
			column=0,
			sticky=tk.W,
			padx=5,
		)
		##
		entry = tix.Entry(convertFrame)
		entry.grid(
			row=row,
			column=1,
			columnspan=2,
			sticky=tk.W + tk.E,
			padx=0,
		)
		entry.bind_all("<KeyRelease>", self.anyEntryChanged)
		self.entryOutputConvert = entry
		##
		button = newButton(
			convertFrame,
			text="Browse",
			command=self.browseOutputConvert,
			# bg="#f0f000",
			# activebackground="#f6f622",
			borderwidth=3,
		)
		button.grid(
			row=row,
			column=3,
			sticky=tk.W + tk.E,
			padx=5,
		)
		###################
		row += 1
		button = newButton(
			convertFrame,
			text="Convert",
			command=self.convert,
			# background="#00e000",
			# activebackground="#22f022",
			borderwidth=7,
			font=self.biggerFont,
			padx=5,
			pady=5,
		)
		button.grid(
			row=row,
			column=2,
			columnspan=3,
			sticky=tk.W + tk.E + tk.S,
			padx=5,
			pady=5,
		)
		# print(f"row number for Convert button: {row}")
		######
		convertFrame.pack(fill="x")
		# convertFrame.grid(sticky=tk.W + tk.E + tk.N + tk.S)
		#################
		row += 1
		console = tix.Text(
			convertFrame,
			height=15,
			background="#000",
			foreground="#fff",
		)
		console.bind("<KeyPress>", self.consoleKeyPress)
		# self.consoleH = 15
		# sbar = Tix.Scrollbar(
		# 	convertFrame,
		# 	orien=Tix.VERTICAL,
		# 	command=console.yview
		# )
		# sbar.grid(row=row, column=1)
		# console["yscrollcommand"] = sbar.set
		console.grid(
			row=row,
			column=0,
			columnspan=4,
			sticky=tk.W + tk.E,
			padx=5,
			pady=0,
		)
		log.addHandler(
			TkTextLogHandler(console),
		)
		console.insert("end", "Console:\n")
		####
		self.console = console
		##################
		aboutFrame2 = tix.Frame(aboutFrame)
		##
		label = newLabelWithImage(aboutFrame2, file=logo)
		label.pack(side="left")
		##
		##
		label = ttk.Label(aboutFrame2, text=f"PyGlossary\nVersion {core.VERSION}")
		label.pack(side="left")
		##
		aboutFrame2.pack(side="top", fill="x")
label.pack(side="left") ## aboutFrame2.pack(side="top", fill="x") ## style = ttk.Style(self) style.configure("TNotebook", tabposition="wn") # ws => to the left (west) and to the bottom (south) # wn => to the left (west) and at top aboutNotebook = ttk.Notebook(aboutFrame, style="TNotebook") aboutFrame3 = tk.Frame(aboutNotebook) authorsFrame = tk.Frame(aboutNotebook) licenseFrame = tk.Frame(aboutNotebook) # tabImg = tk.PhotoImage(file=join(dataDir, "res", "dialog-information-22.png")) # tabImg = tk.PhotoImage(file=join(dataDir, "res", "author-22.png")) aboutNotebook.add( aboutFrame3, text="\n About \n", # image=tabImg, # not working # compound=tk.TOP, # padding=50, # not working ) aboutNotebook.add( authorsFrame, text="\nAuthors\n", ) aboutNotebook.add( licenseFrame, text="\nLicense\n", ) label = newReadOnlyText( aboutFrame3, text=f"{aboutText}\nHome page: {core.homePage}", font=("DejaVu Sans", 11, ""), ) label.pack(fill="x") authorsText = "\n".join(authors) authorsText = authorsText.replace("\t", " ") label = newReadOnlyText( authorsFrame, text=authorsText, font=("DejaVu Sans", 11, ""), ) label.pack(fill="x") label = newReadOnlyText( licenseFrame, text=licenseText, font=("DejaVu Sans", 11, ""), ) label.pack(fill="x") aboutNotebook.pack(fill="x") aboutFrame.pack(fill="x") ###################### tk.Grid.columnconfigure(convertFrame, 0, weight=1) tk.Grid.columnconfigure(convertFrame, 1, weight=30) tk.Grid.columnconfigure(convertFrame, 2, weight=20) tk.Grid.columnconfigure(convertFrame, 3, weight=1) tk.Grid.rowconfigure(convertFrame, 0, weight=50) tk.Grid.rowconfigure(convertFrame, 1, weight=50) tk.Grid.rowconfigure(convertFrame, 2, weight=1) tk.Grid.rowconfigure(convertFrame, 3, weight=50) tk.Grid.rowconfigure(convertFrame, 4, weight=50) tk.Grid.rowconfigure(convertFrame, 5, weight=1) tk.Grid.rowconfigure(convertFrame, 6, weight=50) # _________________________________________________________________ # notebook.pack(fill="both", expand=True) # _________________________________________________________________ # statusBarframe = tk.Frame(self, borderwidth=3) clearB = newButton( statusBarframe, text="Clear", command=self.console_clear, # bg="black", # fg="#ffff00", # activebackground="#333333", # activeforeground="#ffff00", borderwidth=3, height=2, ) clearB.pack(side="left") #### label = ttk.Label(statusBarframe, text="Verbosity") label.pack(side="left") ## comboVar = tk.StringVar() combo = ttk.OptionMenu( statusBarframe, comboVar, log.getVerbosity(), # default 0, 1, 2, 3, 4, 5, ) comboVar.trace("w", self.verbosityChanged) combo.pack(side="left") self.verbosityCombo = comboVar ##### pbar = ProgressBar(statusBarframe, width=700, height=28) pbar.pack(side="left", fill="x", expand=True) self.pbar = pbar statusBarframe.pack(fill="x") self.progressTitle = "" # _________________________________________________________________ # centerWindow(rootWin) # show the window if os.sep == "\\": # Windows rootWin.attributes('-alpha', 1.0) else: # Linux rootWin.deiconify() def textSelectAll(self, tktext): tktext.tag_add(tk.SEL, "1.0", tk.END) tktext.mark_set(tk.INSERT, "1.0") tktext.see(tk.INSERT) def consoleKeyPress(self, e): # print(e.state, e.keysym) if e.state > 0: if e.keysym == "c": return if e.keysym == "a": self.textSelectAll(self.console) return "break" if e.keysym == "Escape": return return "break" def verbosityChanged(self, index, value, op): log.setVerbosity( int(self.verbosityCombo.get()) ) def resized(self, event): dh = self.rootWin.winfo_height() - self.winfo_height() # log.debug(dh, self.consoleH) 
	def inputFormatChanged(self, *args):
		formatDesc = self.formatButtonInputConvert.get()
		if not formatDesc:
			return
		self.readOptions.clear()  # reset the options, DO NOT re-assign
		format = pluginByDesc[formatDesc].name
		if Glossary.formatsReadOptions[format]:
			self.readOptionsButton.grid(
				row=self.inputFormatRow,
				column=3,
				sticky=tk.W + tk.E,
				padx=5,
				pady=0,
			)
		else:
			self.readOptionsButton.grid_forget()

	def outputFormatChanged(self, *args):
		formatDesc = self.formatButtonOutputConvert.get()
		if not formatDesc:
			return
		format = pluginByDesc[formatDesc].name
		plugin = Glossary.plugins.get(format)
		if not plugin:
			log.error(f"plugin {format} not found")
			return
		self.writeOptions.clear()  # reset the options, DO NOT re-assign
		if Glossary.formatsWriteOptions[format]:
			self.writeOptionsButton.grid(
				row=self.outputFormatRow,
				column=3,
				sticky=tk.W + tk.E,
				padx=5,
				pady=0,
			)
		else:
			self.writeOptionsButton.grid_forget()
		pathI = self.entryInputConvert.get()
		if pathI and not self.entryOutputConvert.get():
			if self.formatButtonInputConvert.get() and plugin.extensionCreate:
				pathNoExt, ext = splitext(pathI)
				self.entryOutputConvert.insert(
					0,
					pathNoExt + plugin.extensionCreate,
				)

	def anyEntryChanged(self, event=None):
		self.inputEntryChanged()
		self.outputEntryChanged()

	def inputEntryChanged(self, event=None):
		# char = event.keysym
		pathI = self.entryInputConvert.get()
		if self.pathI == pathI:
			return
		if pathI.startswith("file://"):
			pathI = urlToPath(pathI)
			self.entryInputConvert.delete(0, "end")
			self.entryInputConvert.insert(0, pathI)
		if self.config["ui_autoSetFormat"]:
			formatDesc = self.formatButtonInputConvert.get()
			if not formatDesc:
				inputArgs = Glossary.detectInputFormat(pathI, quiet=True)
				if inputArgs:
					format = inputArgs[1]
					plugin = Glossary.plugins.get(format)
					if plugin:
						self.formatButtonInputConvert.set(plugin.description)
						self.inputFormatChanged()
		self.pathI = pathI

	def outputEntryChanged(self, event=None):
		pathO = self.entryOutputConvert.get()
		if self.pathO == pathO:
			return
		if pathO.startswith("file://"):
			pathO = urlToPath(pathO)
			self.entryOutputConvert.delete(0, "end")
			self.entryOutputConvert.insert(0, pathO)
		if self.config["ui_autoSetFormat"]:
			formatDesc = self.formatButtonOutputConvert.get()
			if not formatDesc:
				outputArgs = Glossary.detectOutputFormat(
					filename=pathO,
					inputFilename=self.entryInputConvert.get(),
					quiet=True,
				)
				if outputArgs:
					outputFormat = outputArgs[1]
					self.formatButtonOutputConvert.set(
						Glossary.plugins[outputFormat].description
					)
					self.outputFormatChanged()
		self.pathO = pathO

	def save_fcd_dir(self):
		if not self.fcd_dir:
			return
		with open(self.fcd_dir_save_path, mode="w", encoding="utf-8") as fp:
			fp.write(self.fcd_dir)

	def browseInputConvert(self):
		path = filedialog.askopenfilename(initialdir=self.fcd_dir)
		if path:
			self.entryInputConvert.delete(0, "end")
			self.entryInputConvert.insert(0, path)
			self.inputEntryChanged()
			self.fcd_dir = os.path.dirname(path)
			self.save_fcd_dir()

	def browseOutputConvert(self):
		path = filedialog.asksaveasfilename()
		if path:
			self.entryOutputConvert.delete(0, "end")
			self.entryOutputConvert.insert(0, path)
			self.outputEntryChanged()
			self.fcd_dir = os.path.dirname(path)
			self.save_fcd_dir()
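	# convert() below tolerates an empty input format (the conversion
	# pipeline can detect it from the input filename) but requires an
	# explicit output format, since the output path alone may be ambiguous.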
	def convert(self):
		inPath = self.entryInputConvert.get()
		if not inPath:
			log.critical("Input file path is empty!")
			return
		inFormatDesc = self.formatButtonInputConvert.get()
		if not inFormatDesc:
			# log.critical("Input format is empty!");return
			inFormat = ""
		else:
			inFormat = pluginByDesc[inFormatDesc].name
		outPath = self.entryOutputConvert.get()
		if not outPath:
			log.critical("Output file path is empty!")
			return
		outFormatDesc = self.formatButtonOutputConvert.get()
		if not outFormatDesc:
			log.critical("Output format is empty!")
			return
		outFormat = pluginByDesc[outFormatDesc].name
		for attr, value in self._glossarySetAttrs.items():
			setattr(self.glos, attr, value)
		finalOutputFile = self.glos.convert(
			inPath,
			inputFormat=inFormat,
			outputFilename=outPath,
			outputFormat=outFormat,
			readOptions=self.readOptions,
			writeOptions=self.writeOptions,
			**self._convertOptions
		)
		# if finalOutputFile:
		# 	self.status("Convert finished")
		# else:
		# 	self.status("Convert failed")
		return bool(finalOutputFile)

	def run(
		self,
		inputFilename: str = "",
		outputFilename: str = "",
		inputFormat: str = "",
		outputFormat: str = "",
		reverse: bool = False,
		config: "Optional[Dict]" = None,
		readOptions: "Optional[Dict]" = None,
		writeOptions: "Optional[Dict]" = None,
		convertOptions: "Optional[Dict]" = None,
		glossarySetAttrs: "Optional[Dict]" = None,
	):
		if glossarySetAttrs is None:
			glossarySetAttrs = {}
		self.config = config
		if inputFilename:
			self.entryInputConvert.insert(0, abspath(inputFilename))
			self.inputEntryChanged()
		if outputFilename:
			self.entryOutputConvert.insert(0, abspath(outputFilename))
			self.outputEntryChanged()
		if inputFormat:
			self.formatButtonInputConvert.set(
				Glossary.plugins[inputFormat].description,
			)
			self.inputFormatChanged()
		if outputFormat:
			self.formatButtonOutputConvert.set(
				Glossary.plugins[outputFormat].description,
			)
			self.outputFormatChanged()
		if reverse:
			log.error("Tkinter interface does not support Reverse feature")
		# must be before setting self.readOptions and self.writeOptions
		self.anyEntryChanged()
		if readOptions:
			self.readOptionsButton.setOptionsValues(readOptions)
			self.readOptions = readOptions
		if writeOptions:
			self.writeOptionsButton.setOptionsValues(writeOptions)
			self.writeOptions = writeOptions
		self._convertOptions = convertOptions
		if convertOptions:
			log.info(f"Using convertOptions={convertOptions}")
		self._glossarySetAttrs = glossarySetAttrs
		# inputFilename and readOptions are for DB Editor
		# which is not implemented
		self.mainloop()

	def progressInit(self, title):
		self.progressTitle = title

	def progress(self, rat, text=""):
		if not text:
			text = "%" + str(int(rat * 100))
		text += " - " + self.progressTitle
		self.pbar.updateProgress(rat * 100, None, text)
		# self.pbar.value = rat * 100
		# self.pbar.update()
		self.rootWin.update()

	def console_clear(self, event=None):
		self.console.delete("1.0", "end")
		self.console.insert("end", "Console:\n")

	# def reverseBrowseInput(self):
	# 	pass

	# def reverseBrowseOutput(self):
	# 	pass

	# def reverseLoad(self):
	# 	pass


if __name__ == "__main__":
	import sys
	if len(sys.argv) > 1:
		path = sys.argv[1]
	else:
		path = ""
	ui = UI()
	ui.run(inputFilename=path)
pyglossary-4.5.0/pyglossary/ui/wcwidth/000077500000000000000000000000001417733132500202445ustar00rootroot00000000000000pyglossary-4.5.0/pyglossary/ui/wcwidth/LICENSE000066400000000000000000000024521417733132500212540ustar00rootroot00000000000000
The MIT License (MIT)

Copyright (c) 2014 Jeff Quast

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit
persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Markus Kuhn -- 2007-05-26 (Unicode 5.0) Permission to use, copy, modify, and distribute this software for any purpose and without fee is hereby granted. The author disclaims all warranties with regard to this software. pyglossary-4.5.0/pyglossary/ui/wcwidth/__init__.py000066400000000000000000000003651417733132500223610ustar00rootroot00000000000000'\nwcwidth module.\n\nhttps://github.com/jquast/wcwidth\n' from .wcwidth import ZERO_WIDTH,WIDE_EASTASIAN,wcwidth,wcswidth,_bisearch,list_versions,_wcmatch_version,_wcversion_value __all__='wcwidth','wcswidth','list_versions' __version__='0.2.5'pyglossary-4.5.0/pyglossary/ui/wcwidth/table_wide.py000066400000000000000000000347601417733132500227270ustar00rootroot00000000000000'Wide_Eastasian table, created by bin/update-tables.py.' WIDE_EASTASIAN={'4.1.0':((4352,4441),(4447,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12588),(12593,12686),(12688,12727),(12736,12751),(12784,12830),(12832,12867),(12880,13054),(13056,19893),(19968,40891),(40960,42124),(42128,42182),(44032,55203),(63744,64045),(64048,64106),(64112,64217),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(131072,196605),(196608,262141)),'5.0.0':((4352,4441),(4447,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12588),(12593,12686),(12688,12727),(12736,12751),(12784,12830),(12832,12867),(12880,13054),(13056,19893),(19968,40891),(40960,42124),(42128,42182),(44032,55203),(63744,64045),(64048,64106),(64112,64217),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(131072,196605),(196608,262141)),'5.1.0':((4352,4441),(4447,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12727),(12736,12771),(12784,12830),(12832,12867),(12880,13054),(13056,19893),(19968,40899),(40960,42124),(42128,42182),(44032,55203),(63744,64045),(64048,64106),(64112,64217),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(131072,196605),(196608,262141)),'5.2.0':((4352,4447),(4515,4519),(4602,4607),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12727),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(55216,55238),(55243,55291),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(127488,127488),(127504,127537),(127552,127560),(131072,196605),(196608,262141)),'6.0.0':((4352,4447),(4515,4519),(4602,4607),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438
),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(55216,55238),(55243,55291),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(110592,110593),(127488,127490),(127504,127546),(127552,127560),(127568,127569),(131072,194367),(177984,196605),(196608,262141)),'6.1.0':((4352,4447),(4515,4519),(4602,4607),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(55216,55238),(55243,55291),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(110592,110593),(127488,127490),(127504,127546),(127552,127560),(127568,127569),(131072,196605),(196608,262141)),'6.2.0':((4352,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(110592,110593),(127488,127490),(127504,127546),(127552,127560),(127568,127569),(131072,196605),(196608,262141)),'6.3.0':((4352,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(110592,110593),(127488,127490),(127504,127546),(127552,127560),(127568,127569),(131072,196605),(196608,262141)),'7.0.0':((4352,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(110592,110593),(127488,127490),(127504,127546),(127552,127560),(127568,127569),(131072,196605),(196608,262141)),'8.0.0':((4352,4447),(9001,9002),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(110592,110593),(127488,127490),(127504,127546),(127552,127560),(127568,127569),(131072,196605),(196608,262141)),'9.0.0':((4352,4447),(8986,8987),(9001,9002),(9193,9196),(9200,9200),(9203,9203),(9725,9726),(9748,9749),(9800,9811),(9855,9855),(9875,9875),(9889,9889),(9898,9899),(9917,9918),(9924,9925),(9934,9934),(9940,9940),(9962,9962),(9970,9971),(9973,9973),(9978,9978),(9981,9981),(9989,9989),(9994,9995),(10024,10024),(10060,10060),(10062,10062),(10067,10069),(10071,10071),(10133,10135),(10160,10160),(10175,10175),(11035,11036),(11088,11088),(11093,11093),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,
12350),(12353,12438),(12441,12543),(12549,12589),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(94176,94176),(94208,100332),(100352,101106),(110592,110593),(126980,126980),(127183,127183),(127374,127374),(127377,127386),(127488,127490),(127504,127547),(127552,127560),(127568,127569),(127744,127776),(127789,127797),(127799,127868),(127870,127891),(127904,127946),(127951,127955),(127968,127984),(127988,127988),(127992,128062),(128064,128064),(128066,128252),(128255,128317),(128331,128334),(128336,128359),(128378,128378),(128405,128406),(128420,128420),(128507,128591),(128640,128709),(128716,128716),(128720,128722),(128747,128748),(128756,128758),(129296,129310),(129312,129319),(129328,129328),(129331,129342),(129344,129355),(129360,129374),(129408,129425),(129472,129472),(131072,196605),(196608,262141)),'10.0.0':((4352,4447),(8986,8987),(9001,9002),(9193,9196),(9200,9200),(9203,9203),(9725,9726),(9748,9749),(9800,9811),(9855,9855),(9875,9875),(9889,9889),(9898,9899),(9917,9918),(9924,9925),(9934,9934),(9940,9940),(9962,9962),(9970,9971),(9973,9973),(9978,9978),(9981,9981),(9989,9989),(9994,9995),(10024,10024),(10060,10060),(10062,10062),(10067,10069),(10071,10071),(10133,10135),(10160,10160),(10175,10175),(11035,11036),(11088,11088),(11093,11093),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12590),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(94176,94177),(94208,100332),(100352,101106),(110592,110878),(110960,111355),(126980,126980),(127183,127183),(127374,127374),(127377,127386),(127488,127490),(127504,127547),(127552,127560),(127568,127569),(127584,127589),(127744,127776),(127789,127797),(127799,127868),(127870,127891),(127904,127946),(127951,127955),(127968,127984),(127988,127988),(127992,128062),(128064,128064),(128066,128252),(128255,128317),(128331,128334),(128336,128359),(128378,128378),(128405,128406),(128420,128420),(128507,128591),(128640,128709),(128716,128716),(128720,128722),(128747,128748),(128756,128760),(129296,129342),(129344,129356),(129360,129387),(129408,129431),(129472,129472),(129488,129510),(131072,196605),(196608,262141)),'11.0.0':((4352,4447),(8986,8987),(9001,9002),(9193,9196),(9200,9200),(9203,9203),(9725,9726),(9748,9749),(9800,9811),(9855,9855),(9875,9875),(9889,9889),(9898,9899),(9917,9918),(9924,9925),(9934,9934),(9940,9940),(9962,9962),(9970,9971),(9973,9973),(9978,9978),(9981,9981),(9989,9989),(9994,9995),(10024,10024),(10060,10060),(10062,10062),(10067,10069),(10071,10071),(10133,10135),(10160,10160),(10175,10175),(11035,11036),(11088,11088),(11093,11093),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12591),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(94176,94177),(94208,100337),(100352,101106),(110592,110878),(110960,111355),(126980,126980),(127183,127183),(127374,127374),(127377,127386),(127488,127490),(127504,127547),(127552,12756
0),(127568,127569),(127584,127589),(127744,127776),(127789,127797),(127799,127868),(127870,127891),(127904,127946),(127951,127955),(127968,127984),(127988,127988),(127992,128062),(128064,128064),(128066,128252),(128255,128317),(128331,128334),(128336,128359),(128378,128378),(128405,128406),(128420,128420),(128507,128591),(128640,128709),(128716,128716),(128720,128722),(128747,128748),(128756,128761),(129296,129342),(129344,129392),(129395,129398),(129402,129402),(129404,129442),(129456,129465),(129472,129474),(129488,129535),(131072,196605),(196608,262141)),'12.0.0':((4352,4447),(8986,8987),(9001,9002),(9193,9196),(9200,9200),(9203,9203),(9725,9726),(9748,9749),(9800,9811),(9855,9855),(9875,9875),(9889,9889),(9898,9899),(9917,9918),(9924,9925),(9934,9934),(9940,9940),(9962,9962),(9970,9971),(9973,9973),(9978,9978),(9981,9981),(9989,9989),(9994,9995),(10024,10024),(10060,10060),(10062,10062),(10067,10069),(10071,10071),(10133,10135),(10160,10160),(10175,10175),(11035,11036),(11088,11088),(11093,11093),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12591),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,13054),(13056,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(94176,94179),(94208,100343),(100352,101106),(110592,110878),(110928,110930),(110948,110951),(110960,111355),(126980,126980),(127183,127183),(127374,127374),(127377,127386),(127488,127490),(127504,127547),(127552,127560),(127568,127569),(127584,127589),(127744,127776),(127789,127797),(127799,127868),(127870,127891),(127904,127946),(127951,127955),(127968,127984),(127988,127988),(127992,128062),(128064,128064),(128066,128252),(128255,128317),(128331,128334),(128336,128359),(128378,128378),(128405,128406),(128420,128420),(128507,128591),(128640,128709),(128716,128716),(128720,128722),(128725,128725),(128747,128748),(128756,128762),(128992,129003),(129293,129393),(129395,129398),(129402,129442),(129445,129450),(129454,129482),(129485,129535),(129648,129651),(129656,129658),(129664,129666),(129680,129685),(131072,196605),(196608,262141)),'12.1.0':((4352,4447),(8986,8987),(9001,9002),(9193,9196),(9200,9200),(9203,9203),(9725,9726),(9748,9749),(9800,9811),(9855,9855),(9875,9875),(9889,9889),(9898,9899),(9917,9918),(9924,9925),(9934,9934),(9940,9940),(9962,9962),(9970,9971),(9973,9973),(9978,9978),(9981,9981),(9989,9989),(9994,9995),(10024,10024),(10060,10060),(10062,10062),(10067,10069),(10071,10071),(10133,10135),(10160,10160),(10175,10175),(11035,11036),(11088,11088),(11093,11093),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12591),(12593,12686),(12688,12730),(12736,12771),(12784,12830),(12832,12871),(12880,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(94176,94179),(94208,100343),(100352,101106),(110592,110878),(110928,110930),(110948,110951),(110960,111355),(126980,126980),(127183,127183),(127374,127374),(127377,127386),(127488,127490),(127504,127547),(127552,127560),(127568,127569),(127584,127589),(127744,127776),(127789,127797),(127799,127868),(127870,127891),(127904,127946),(127951,127955),(127968,127984),(127988,127988),(127992,128062),(128064,128064),(128066,128252),(128255,128317),(128331,128334),(128336,128359),(128378,128378),(128405,128406),(128420,128420),(1
28507,128591),(128640,128709),(128716,128716),(128720,128722),(128725,128725),(128747,128748),(128756,128762),(128992,129003),(129293,129393),(129395,129398),(129402,129442),(129445,129450),(129454,129482),(129485,129535),(129648,129651),(129656,129658),(129664,129666),(129680,129685),(131072,196605),(196608,262141)),'13.0.0':((4352,4447),(8986,8987),(9001,9002),(9193,9196),(9200,9200),(9203,9203),(9725,9726),(9748,9749),(9800,9811),(9855,9855),(9875,9875),(9889,9889),(9898,9899),(9917,9918),(9924,9925),(9934,9934),(9940,9940),(9962,9962),(9970,9971),(9973,9973),(9978,9978),(9981,9981),(9989,9989),(9994,9995),(10024,10024),(10060,10060),(10062,10062),(10067,10069),(10071,10071),(10133,10135),(10160,10160),(10175,10175),(11035,11036),(11088,11088),(11093,11093),(11904,11929),(11931,12019),(12032,12245),(12272,12283),(12288,12350),(12353,12438),(12441,12543),(12549,12591),(12593,12686),(12688,12771),(12784,12830),(12832,12871),(12880,19903),(19968,42124),(42128,42182),(43360,43388),(44032,55203),(63744,64255),(65040,65049),(65072,65106),(65108,65126),(65128,65131),(65281,65376),(65504,65510),(94176,94180),(94192,94193),(94208,100343),(100352,101589),(101632,101640),(110592,110878),(110928,110930),(110948,110951),(110960,111355),(126980,126980),(127183,127183),(127374,127374),(127377,127386),(127488,127490),(127504,127547),(127552,127560),(127568,127569),(127584,127589),(127744,127776),(127789,127797),(127799,127868),(127870,127891),(127904,127946),(127951,127955),(127968,127984),(127988,127988),(127992,128062),(128064,128064),(128066,128252),(128255,128317),(128331,128334),(128336,128359),(128378,128378),(128405,128406),(128420,128420),(128507,128591),(128640,128709),(128716,128716),(128720,128722),(128725,128727),(128747,128748),(128756,128764),(128992,129003),(129292,129338),(129340,129349),(129351,129400),(129402,129483),(129485,129535),(129648,129652),(129656,129658),(129664,129670),(129680,129704),(129712,129718),(129728,129730),(129744,129750),(131072,196605),(196608,262141))}pyglossary-4.5.0/pyglossary/ui/wcwidth/table_zero.py000066400000000000000000001403701417733132500227510ustar00rootroot00000000000000'Zero_Width table, created by bin/update-tables.py.' 
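# The ZERO_WIDTH mapping below pairs each supported Unicode version string
# with a sorted tuple of inclusive (start, end) codepoint ranges whose
# characters occupy zero terminal cells (combining marks and similar).
# What follows is a minimal lookup sketch only: is_zero_width() is a
# hypothetical helper and not part of the vendored module; wcwidth.py
# performs the same membership test with its own _bisearch() function.
import bisect

def is_zero_width(codepoint, ranges):
    # Locate the last range whose start is <= codepoint (O(log n) via
    # bisection), then check that codepoint is not past that range's end.
    i = bisect.bisect_right(ranges, (codepoint, 0x10FFFF)) - 1
    return i >= 0 and ranges[i][0] <= codepoint <= ranges[i][1]

# Usage sketch: is_zero_width(0x0301, ZERO_WIDTH['4.1.0']) returns True,
# since U+0301 COMBINING ACUTE ACCENT falls in the (768, 879) range.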
ZERO_WIDTH={'4.1.0':((768,879),(1155,1158),(1160,1161),(1425,1465),(1467,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1557),(1611,1630),(1648,1648),(1750,1756),(1758,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2305,2306),(2364,2364),(2369,2376),(2381,2381),(2385,2388),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2672,2673),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2883),(2893,2893),(2902,2902),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3393,3395),(3405,3405),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3984,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4146),(4150,4151),(4153,4153),(4184,4185),(4959,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(7616,7619),(8400,8427),(12330,12335),(12441,12442),(43014,43014),(43019,43019),(43045,43046),(64286,64286),(65024,65039),(65056,65059),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'5.0.0':((768,879),(1155,1158),(1160,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1557),(1611,1630),(1648,1648),(1750,1756),(1758,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2305,2306),(2364,2364),(2369,2376),(2381,2381),(2385,2388),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2672,2673),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2883),(2893,2893),(2902,2902),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3395),(3405,3405),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3984,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4146),(4150,4151),(4153,4153),(4184,4185),(4959,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7616,7626),(7678,7679),(8400,8431),(12330,12335),(12441,12442),(43014,43014),(43019,43019),(43045,43046),(64286,64286),(65024,65039),(65056,65059),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'5.1.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1630),(1648,1648),(1750,1756),(1758,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2305,2306),(2364,2364),(2369,2376),(2381,2381),(2385,2388),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),
(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3984,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4959,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7212,7219),(7222,7223),(7616,7654),(7678,7679),(8400,8432),(11744,11775),(12330,12335),(12441,12442),(42607,42610),(42620,42621),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43302,43309),(43335,43345),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(64286,64286),(65024,65039),(65056,65062),(66045,66045),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'5.2.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1630),(1648,1648),(1750,1756),(1758,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2304,2306),(2364,2364),(2369,2376),(2381,2381),(2385,2389),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3984,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4959,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7616,7654),(7677,7679),(8400,8432),(11503,11505),(11744,11775),(12330,12335),(12441,12442),(42607,42610),(42620,42621),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(4344
6,43449),(43452,43452),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65062),(66045,66045),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(69760,69761),(69811,69814),(69817,69818),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'6.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2304,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7616,7654),(7676,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12335),(12441,12442),(42607,42610),(42620,42621),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65062),(66045,66045),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(69633,69633),(69688,69702),(69760,69761),(69811,69814),(69817,69818),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'6.1.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2276,2302),(2304,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),
(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7083),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7616,7654),(7676,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42655,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65062),(66045,66045),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(69633,69633),(69688,69702),(69760,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70016,70017),(70070,70078),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(94095,94098),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'6.2.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2276,2302),(2304,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6
077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7083),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7616,7654),(7676,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42655,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65062),(66045,66045),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(69633,69633),(69688,69702),(69760,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70016,70017),(70070,70078),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(94095,94098),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'6.3.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2276,2302),(2304,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7083),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7616,7654),(7676,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42655,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449
),(43452,43452),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65062),(66045,66045),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(69633,69633),(69688,69702),(69760,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70016,70017),(70070,70078),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(94095,94098),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(917760,917999)),'7.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2276,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3329,3329),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7669),(7676,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42655,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65069),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70191,70193),(70196,70196),(70198,70199),(70367,70367),(70371,70378),(70401,70401),(70460,70460),(70464,70464),(70502,70508),(70512,70516),(70835,7084
0),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(92912,92916),(92976,92982),(94095,94098),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(125136,125142),(917760,917999)),'8.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3329,3329),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7669),(7676,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43204),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70090,70092),(70191,70193),(70196,70196),(70198,70199),(70367,70367),(70371,70378),(70400,70401),(70460,70460),(70464,70464),(70502,70508),(70512,70516),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(71453,71455),(71458,71461),(71463,71467),(92912,92916),(92976,92982),(94095,94098),(11382
1,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(125136,125142),(917760,917999)),'9.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2260,2273),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3329,3329),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6277,6278),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7669),(7675,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43205),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70090,70092),(70191,70193),(70196,70196),(70198,70199),(70206,70206),(70367,70367),(70371,70378),(70400,70401),(70460,70460),(70464,70464),(70502,70508),(70512,70516),(70712,70719),(70722,70724),(70726,70726),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(71453,71455),(71458,71461),(71463,71467),(72752,72758),(72760,72765),(72767,72767),(72850,72871),(72874,72880),(72882,72883),(72885,72886),(92912,
92916),(92976,92982),(94095,94098),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(122880,122886),(122888,122904),(122907,122913),(122915,122916),(122918,122922),(125136,125142),(125252,125258),(917760,917999)),'10.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2260,2273),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2810,2815),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3328,3329),(3387,3388),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6277,6278),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7673),(7675,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43205),(43232,43249),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70090,70092),(70191,70193),(70196,70196),(70198,70199),(70206,70206),(70367,70367),(70371,70378),(70400,70401),(70460,70460),(70464,70464),(70502,70508),(70512,70516),(70712,70719),(70722,70724),(70726,70726),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349)
,(71351,71351),(71453,71455),(71458,71461),(71463,71467),(72193,72198),(72201,72202),(72243,72248),(72251,72254),(72263,72263),(72273,72278),(72281,72283),(72330,72342),(72344,72345),(72752,72758),(72760,72765),(72767,72767),(72850,72871),(72874,72880),(72882,72883),(72885,72886),(73009,73014),(73018,73018),(73020,73021),(73023,73029),(73031,73031),(92912,92916),(92976,92982),(94095,94098),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(122880,122886),(122888,122904),(122907,122913),(122915,122916),(122918,122922),(125136,125142),(125252,125258),(917760,917999)),'11.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2045,2045),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2259,2273),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2558,2558),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2810,2815),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3076,3076),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3328,3329),(3387,3388),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3769),(3771,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6277,6278),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7673),(7675,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43205),(43232,43249),(43263,43263),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43452),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(68900,68903),(69446,69456),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,7000
3),(70016,70017),(70070,70078),(70089,70092),(70191,70193),(70196,70196),(70198,70199),(70206,70206),(70367,70367),(70371,70378),(70400,70401),(70459,70460),(70464,70464),(70502,70508),(70512,70516),(70712,70719),(70722,70724),(70726,70726),(70750,70750),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(71453,71455),(71458,71461),(71463,71467),(71727,71735),(71737,71738),(72193,72202),(72243,72248),(72251,72254),(72263,72263),(72273,72278),(72281,72283),(72330,72342),(72344,72345),(72752,72758),(72760,72765),(72767,72767),(72850,72871),(72874,72880),(72882,72883),(72885,72886),(73009,73014),(73018,73018),(73020,73021),(73023,73029),(73031,73031),(73104,73105),(73109,73109),(73111,73111),(73459,73460),(92912,92916),(92976,92982),(94095,94098),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(122880,122886),(122888,122904),(122907,122913),(122915,122916),(122918,122922),(125136,125142),(125252,125258),(917760,917999)),'12.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2045,2045),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2259,2273),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2558,2558),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2810,2815),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3076,3076),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3328,3329),(3387,3388),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6277,6278),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7673),(7675,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43205),(43232,43249),(43263,43263),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43453),(43493,43493),(43561,43566),(43569,43570),(43573,
43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(68900,68903),(69446,69456),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70089,70092),(70191,70193),(70196,70196),(70198,70199),(70206,70206),(70367,70367),(70371,70378),(70400,70401),(70459,70460),(70464,70464),(70502,70508),(70512,70516),(70712,70719),(70722,70724),(70726,70726),(70750,70750),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(71453,71455),(71458,71461),(71463,71467),(71727,71735),(71737,71738),(72148,72151),(72154,72155),(72160,72160),(72193,72202),(72243,72248),(72251,72254),(72263,72263),(72273,72278),(72281,72283),(72330,72342),(72344,72345),(72752,72758),(72760,72765),(72767,72767),(72850,72871),(72874,72880),(72882,72883),(72885,72886),(73009,73014),(73018,73018),(73020,73021),(73023,73029),(73031,73031),(73104,73105),(73109,73109),(73111,73111),(73459,73460),(92912,92916),(92976,92982),(94031,94031),(94095,94098),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(122880,122886),(122888,122904),(122907,122913),(122915,122916),(122918,122922),(123184,123190),(123628,123631),(125136,125142),(125252,125258),(917760,917999)),'12.1.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2045,2045),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2259,2273),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2558,2558),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2810,2815),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2902,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3076,3076),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3328,3329),(3387,3388),(3393,3396),(3405,3405),(3426,3427),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3772),(3784,3789),(3864,3865),(3893,3893),(3895,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6277,6278),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6846),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(
7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7673),(7675,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43204,43205),(43232,43249),(43263,43263),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43453),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(68900,68903),(69446,69456),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70089,70092),(70191,70193),(70196,70196),(70198,70199),(70206,70206),(70367,70367),(70371,70378),(70400,70401),(70459,70460),(70464,70464),(70502,70508),(70512,70516),(70712,70719),(70722,70724),(70726,70726),(70750,70750),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(71453,71455),(71458,71461),(71463,71467),(71727,71735),(71737,71738),(72148,72151),(72154,72155),(72160,72160),(72193,72202),(72243,72248),(72251,72254),(72263,72263),(72273,72278),(72281,72283),(72330,72342),(72344,72345),(72752,72758),(72760,72765),(72767,72767),(72850,72871),(72874,72880),(72882,72883),(72885,72886),(73009,73014),(73018,73018),(73020,73021),(73023,73029),(73031,73031),(73104,73105),(73109,73109),(73111,73111),(73459,73460),(92912,92916),(92976,92982),(94031,94031),(94095,94098),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(122880,122886),(122888,122904),(122907,122913),(122915,122916),(122918,122922),(123184,123190),(123628,123631),(125136,125142),(125252,125258),(917760,917999)),'13.0.0':((768,879),(1155,1161),(1425,1469),(1471,1471),(1473,1474),(1476,1477),(1479,1479),(1552,1562),(1611,1631),(1648,1648),(1750,1756),(1759,1764),(1767,1768),(1770,1773),(1809,1809),(1840,1866),(1958,1968),(2027,2035),(2045,2045),(2070,2073),(2075,2083),(2085,2087),(2089,2093),(2137,2139),(2259,2273),(2275,2306),(2362,2362),(2364,2364),(2369,2376),(2381,2381),(2385,2391),(2402,2403),(2433,2433),(2492,2492),(2497,2500),(2509,2509),(2530,2531),(2558,2558),(2561,2562),(2620,2620),(2625,2626),(2631,2632),(2635,2637),(2641,2641),(2672,2673),(2677,2677),(2689,2690),(2748,2748),(2753,2757),(2759,2760),(2765,2765),(2786,2787),(2810,2815),(2817,2817),(2876,2876),(2879,2879),(2881,2884),(2893,2893),(2901,2902),(2914,2915),(2946,2946),(3008,3008),(3021,3021),(3072,3072),(3076,3076),(3134,3136),(3142,3144),(3146,3149),(3157,3158),(3170,3171),(3201,3201),(3260,3260),(3263,3263),(3270,3270),(3276,3277),(3298,3299),(3328,3329),(3387,3388),(3393,3396),(3405,3405),(3426,3427),(3457,3457),(3530,3530),(3538,3540),(3542,3542),(3633,3633),(3636,3642),(3655,3662),(3761,3761),(3764,3772),(3784,3789),(3864,3865),(3893,3893),(3895
,3895),(3897,3897),(3953,3966),(3968,3972),(3974,3975),(3981,3991),(3993,4028),(4038,4038),(4141,4144),(4146,4151),(4153,4154),(4157,4158),(4184,4185),(4190,4192),(4209,4212),(4226,4226),(4229,4230),(4237,4237),(4253,4253),(4957,4959),(5906,5908),(5938,5940),(5970,5971),(6002,6003),(6068,6069),(6071,6077),(6086,6086),(6089,6099),(6109,6109),(6155,6157),(6277,6278),(6313,6313),(6432,6434),(6439,6440),(6450,6450),(6457,6459),(6679,6680),(6683,6683),(6742,6742),(6744,6750),(6752,6752),(6754,6754),(6757,6764),(6771,6780),(6783,6783),(6832,6848),(6912,6915),(6964,6964),(6966,6970),(6972,6972),(6978,6978),(7019,7027),(7040,7041),(7074,7077),(7080,7081),(7083,7085),(7142,7142),(7144,7145),(7149,7149),(7151,7153),(7212,7219),(7222,7223),(7376,7378),(7380,7392),(7394,7400),(7405,7405),(7412,7412),(7416,7417),(7616,7673),(7675,7679),(8400,8432),(11503,11505),(11647,11647),(11744,11775),(12330,12333),(12441,12442),(42607,42610),(42612,42621),(42654,42655),(42736,42737),(43010,43010),(43014,43014),(43019,43019),(43045,43046),(43052,43052),(43204,43205),(43232,43249),(43263,43263),(43302,43309),(43335,43345),(43392,43394),(43443,43443),(43446,43449),(43452,43453),(43493,43493),(43561,43566),(43569,43570),(43573,43574),(43587,43587),(43596,43596),(43644,43644),(43696,43696),(43698,43700),(43703,43704),(43710,43711),(43713,43713),(43756,43757),(43766,43766),(44005,44005),(44008,44008),(44013,44013),(64286,64286),(65024,65039),(65056,65071),(66045,66045),(66272,66272),(66422,66426),(68097,68099),(68101,68102),(68108,68111),(68152,68154),(68159,68159),(68325,68326),(68900,68903),(69291,69292),(69446,69456),(69633,69633),(69688,69702),(69759,69761),(69811,69814),(69817,69818),(69888,69890),(69927,69931),(69933,69940),(70003,70003),(70016,70017),(70070,70078),(70089,70092),(70095,70095),(70191,70193),(70196,70196),(70198,70199),(70206,70206),(70367,70367),(70371,70378),(70400,70401),(70459,70460),(70464,70464),(70502,70508),(70512,70516),(70712,70719),(70722,70724),(70726,70726),(70750,70750),(70835,70840),(70842,70842),(70847,70848),(70850,70851),(71090,71093),(71100,71101),(71103,71104),(71132,71133),(71219,71226),(71229,71229),(71231,71232),(71339,71339),(71341,71341),(71344,71349),(71351,71351),(71453,71455),(71458,71461),(71463,71467),(71727,71735),(71737,71738),(71995,71996),(71998,71998),(72003,72003),(72148,72151),(72154,72155),(72160,72160),(72193,72202),(72243,72248),(72251,72254),(72263,72263),(72273,72278),(72281,72283),(72330,72342),(72344,72345),(72752,72758),(72760,72765),(72767,72767),(72850,72871),(72874,72880),(72882,72883),(72885,72886),(73009,73014),(73018,73018),(73020,73021),(73023,73029),(73031,73031),(73104,73105),(73109,73109),(73111,73111),(73459,73460),(92912,92916),(92976,92982),(94031,94031),(94095,94098),(94180,94180),(113821,113822),(119143,119145),(119163,119170),(119173,119179),(119210,119213),(119362,119364),(121344,121398),(121403,121452),(121461,121461),(121476,121476),(121499,121503),(121505,121519),(122880,122886),(122888,122904),(122907,122913),(122915,122916),(122918,122922),(123184,123190),(123628,123631),(125136,125142),(125252,125258),(917760,917999))}pyglossary-4.5.0/pyglossary/ui/wcwidth/unicode_versions.py000066400000000000000000000011671417733132500242010ustar00rootroot00000000000000'\nExports function list_versions() for unicode version level support.\n\nThis code generated by bin/update-tables.py on 2020-06-23 15:58:44.035540.\n' def list_versions():'\n Return Unicode version levels supported by this module release.\n\n Any of the version strings returned may 
be used as keyword argument\n ``unicode_version`` to the ``wcwidth()`` family of functions.\n\n :returns: Supported Unicode version numbers in ascending sorted order.\n :rtype: list[str]\n ';return'4.1.0','5.0.0','5.1.0','5.2.0','6.0.0','6.1.0','6.2.0','6.3.0','7.0.0','8.0.0','9.0.0','10.0.0','11.0.0','12.0.0','12.1.0','13.0.0'pyglossary-4.5.0/pyglossary/ui/wcwidth/wcwidth.py000066400000000000000000000233511417733132500222730ustar00rootroot00000000000000'\nThis is a python implementation of wcwidth() and wcswidth().\n\nhttps://github.com/jquast/wcwidth\n\nfrom Markus Kuhn\'s C code, retrieved from:\n\n http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n\nThis is an implementation of wcwidth() and wcswidth() (defined in\nIEEE Std 1002.1-2001) for Unicode.\n\nhttp://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html\nhttp://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html\n\nIn fixed-width output devices, Latin characters all occupy a single\n"cell" position of equal width, whereas ideographic CJK characters\noccupy two such cells. Interoperability between terminal-line\napplications and (teletype-style) character terminals using the\nUTF-8 encoding requires agreement on which character should advance\nthe cursor by how many cell positions. No established formal\nstandards exist at present on which Unicode character shall occupy\nhow many cell positions on character terminals. These routines are\na first attempt of defining such behavior based on simple rules\napplied to data provided by the Unicode Consortium.\n\nFor some graphical characters, the Unicode standard explicitly\ndefines a character-cell width via the definition of the East Asian\nFullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.\nIn all these cases, there is no ambiguity about which width a\nterminal shall use. For characters in the East Asian Ambiguous (A)\nclass, the width choice depends purely on a preference of backward\ncompatibility with either historic CJK or Western practice.\nChoosing single-width for these characters is easy to justify as\nthe appropriate long-term solution, as the CJK practice of\ndisplaying these characters as double-width comes from historic\nimplementation simplicity (8-bit encoded characters were displayed\nsingle-width and 16-bit ones double-width, even for Greek,\nCyrillic, etc.) and not any typographic considerations.\n\nMuch less clear is the choice of width for the Not East Asian\n(Neutral) class. Existing practice does not dictate a width for any\nof these characters. It would nevertheless make sense\ntypographically to allocate two character cells to characters such\nas for instance EM SPACE or VOLUME INTEGRAL, which cannot be\nrepresented adequately with a single-width glyph. The following\nroutines at present merely assign a single-cell width to all\nneutral characters, in the interest of simplicity. This is not\nentirely satisfactory and should be reconsidered before\nestablishing a formal standard in this area. At the moment, the\ndecision which Not East Asian (Neutral) characters should be\nrepresented by double-width glyphs cannot yet be answered by\napplying a simple rule from the Unicode database content. 
Setting\nup a proper standard for the behavior of UTF-8 character terminals\nwill require a careful analysis not only of each Unicode character,\nbut also of each presentation form, something the author of these\nroutines has avoided to do so far.\n\nhttp://www.unicode.org/unicode/reports/tr11/\n\nLatest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n' from __future__ import division _A='auto' import os,sys,warnings from .table_wide import WIDE_EASTASIAN from .table_zero import ZERO_WIDTH from .unicode_versions import list_versions try:from functools import lru_cache except ImportError:from backports.functools_lru_cache import lru_cache _UNICODE_CMPTABLE=None _PY3=sys.version_info[0]>=3 ZERO_WIDTH_CF=set([0,847,8203,8204,8205,8206,8207,8232,8233,8234,8235,8236,8237,8238,8288,8289,8290,8291]) def _bisearch(ucs,table): '\n Auxiliary function for binary search in interval table.\n\n :arg int ucs: Ordinal value of unicode character.\n :arg list table: List of starting and ending ranges of ordinal values,\n in form of ``[(start, end), ...]``.\n :rtype: int\n :returns: 1 if ordinal value ucs is found within lookup table, else 0.\n ';B=ucs;A=table;E=0;C=len(A)-1 if B<A[E][0] or B>A[C][1]:return 0 while C>=E: D=(E+C)//2 if B>A[D][1]:E=D+1 elif B<A[D][0]:C=D-1 else:return 1 return 0 def _wcmatch_version(given_version): "\n >>> _wcmatch_version('4.9.9')\n '4.1.0'\n >>> _wcmatch_version('8.0')\n '8.0.0'\n >>> _wcmatch_version('1')\n '4.1.0'\n\n :param str given_version: given version for compare, may be ``auto``\n (default), to select Unicode Version from Environment Variable,\n ``UNICODE_VERSION``. If the environment variable is not set, then the\n latest is used.\n :rtype: str\n :returns: unicode string, or non-unicode ``str`` type for python 2\n when given ``version`` is also type ``str``.\n ";G='latest';A=given_version;D=not _PY3 and isinstance(A,str) if D:B=[A.encode()for A in list_versions()] else:B=list_versions() C=B[-1] if A in(_A,_A):A=os.environ.get('UNICODE_VERSION',G if not D else C.encode()) if A in(G,G):return C if not D else C.encode() if A in B:return A if not D else A.encode() try:E=_wcversion_value(A) except ValueError:warnings.warn("UNICODE_VERSION value, {given_version!r}, is invalid. Value should be in form of `integer[.]+', the latest supported unicode version {latest_version!r} has been inferred.".format(given_version=A,latest_version=C));return C if not D else C.encode() F=B[0];J=_wcversion_value(F) if E<=J:warnings.warn('UNICODE_VERSION value, {given_version!r}, is lower than any available unicode version. 
Returning lowest version level, {earliest_version!r}'.format(given_version=A,earliest_version=F));return F if not D else F.encode() for (H,K) in enumerate(B): try:I=_wcversion_value(B[H+1]) except IndexError:return C if not D else C.encode() if E==I[:len(E)]:return B[H+1] if I>E:return K assert False,('Code path unreachable',A,B)pyglossary-4.5.0/pyglossary/xdxf_transform.py000066400000000000000000000155701417733132500216040ustar00rootroot00000000000000from pyglossary import core from pyglossary.core import rootDir import logging from io import BytesIO from io import StringIO log = logging.getLogger("pyglossary") class XdxfTransformer(object): _gram_color: str = "green" _example_padding: int = 10 def __init__(self, encoding="utf-8"): self._encoding = encoding def tostring(self, elem: "lxml.etree.Element") -> str: from lxml import etree as ET return ET.tostring( elem, method="html", pretty_print=True, ).decode("utf-8").strip() def hasPrevText(self, prev: "Union[None, str, lxml.etree.Element]"): if isinstance(prev, str): return True if prev is None: return False if prev.tag == "k": return False if prev.tag in ( "dtrn", "def", "span", "co", "i", "b", "sub", "sup", "tt", "big", "small", ): return True if prev.text: return True # print(prev) return False def writeString( self, hf: "lxml.etree.htmlfile", child: str, parent: "lxml.etree.Element", prev: "Union[None, str, lxml.etree.Element]", ): from lxml import etree as ET hasPrev = self.hasPrevText(prev) trailNL = False if parent.tag in ("ar", "font"): if child.startswith("\n"): child = child.lstrip("\n") if hasPrev: hf.write(ET.Element("br")) elif child.endswith("\n"): child = child.rstrip("\n") trailNL = True if not hasPrev: child = child.lstrip() elif child.startswith("\n"): child = child.lstrip() hf.write(ET.Element("br")) for index, parag in enumerate(child.split("\n")): if index > 0: hf.write(ET.Element("br")) hf.write(parag) if trailNL: hf.write(ET.Element("br")) return def writeExample( self, hf: "lxml.etree.htmlfile", elem: "Union[str, lxml.etree.Element]", ): with hf.element("div", **{ "class": "example", "style": f"padding: {self._example_padding}px 0px;", }): for child in elem.xpath("child::node()"): if isinstance(child, str): continue if child.tag == "iref": with hf.element("div"): self.writeIRef(hf, child) continue if child.tag in ("ex_orig", "ex_tran"): with hf.element("div"): self.writeChildrenOf(hf, child) continue log.warning(f"unknown tag {child.tag} inside ") def writeIRef( self, hf: "lxml.etree.htmlfile", child: "Union[str, lxml.etree.Element]", ): iref_url = child.attrib.get("href", "") if child.text: with hf.element("a", **{ "class": "iref", "href": child.attrib.get("href", child.text), }): hf.write(child.text) elif any(iref_url.endswith(ext) for ext in ("mp3", "wav", "aac", "ogg")): # with hf.element("audio", src=iref_url): with hf.element("a", **{ "class": "iref", "href": iref_url, }): hf.write("🔊") return elif iref_url: with hf.element("a", **{ "class": "iref", "href": iref_url, }): hf.write("⎋") else: log.warning(f"iref with no text and no url: {self.tostring(child)}") def writeChild( self, hf: "lxml.etree.htmlfile", child: "Union[str, lxml.etree.Element]", parent: "lxml.etree.Element", prev: "Union[None, str, lxml.etree.Element]", ): from lxml import etree as ET if isinstance(child, str): self.writeString(hf, child, parent, prev) return if child.tag == f"br": hf.write(ET.Element("br")) return if child.tag in ("i", "b", "sub", "sup", "tt", "big", "small"): with hf.element(child.tag): self.writeChildrenOf(hf, child) # if 
child.text is not None: # hf.write(child.text.strip("\n")) return if child.tag == "blockquote": with hf.element("div", **{"class": "m"}): self.writeChildrenOf(hf, child) return if child.tag == "tr": hf.write("[") self.writeChildrenOf(hf, child) hf.write("]") return if child.tag in ("k", "sr"): with hf.element("div", **{"class": child.tag}): self.writeChildrenOf(hf, child) return if child.tag == "ex": self.writeExample(hf, child) return if child.tag == "mrkd": if not child.text: return with hf.element("span", **{"class": child.tag}): with hf.element("b"): hf.write(child.text) return if child.tag in ("pos", "abr"): with hf.element("span", **{"class": "abr"}): with hf.element("font", color="green"): with hf.element("i"): self.writeChildrenOf(hf, child) return if child.tag in ("dtrn", "co"): self.writeChildrenOf(hf, child) return if child.tag == "c": color = child.attrib.get("c", "green") with hf.element("font", color=color): self.writeChildrenOf(hf, child) return if child.tag == "kref": if not child.text: log.warning(f"kref with no text: {self.tostring(child)}") return with hf.element("a", **{ "class": "kref", "href": f"bword://{child.attrib.get('k', child.text)}", }): hf.write(child.text) return if child.tag == "iref": self.writeIRef(hf, child) return if child.tag == "rref": if not child.text: log.warning(f"rref with no text: {self.tostring(child)}") return if child.tag == "def": self.writeChildrenOf(hf, child) return if child.tag == "deftext": self.writeChildrenOf(hf, child) return if child.tag == "span": with hf.element("span"): self.writeChildrenOf(hf, child) return if child.tag == "abbr_def": # _type = child.attrib.get("type", "") # {"": "", "grm": "grammatical", "stl": "stylistical", # "knl": "area/field of knowledge", "aux": "subsidiary" # "oth": "others"}[_type] self.writeChildrenOf(hf, child) return if child.tag == "gr": with hf.element("font", color=self._gram_color): hf.write(child.text) return if child.tag == "ex_orig": with hf.element("i"): hf.write(child.text) return if child.tag == "ex_transl" and prev.tag == "ex_orig": if child.text != prev.text: with hf.element("i"): hf.write(child.text) return if child.tag == "opt": if child.text: hf.write(" (") hf.write(child.text) hf.write(")") return if child.tag == "img": with hf.element("img", **child.attrib): pass return if child.tag == "abbr": with hf.element("i"): hf.write(f"{child.text}") return log.warning(f"unknown tag {child.tag}") self.writeChildrenOf(hf, child) def writeChildrenOf( self, hf: "lxml.etree.htmlfile", elem: "lxml.etree.Element", ): prev = None for child in elem.xpath("child::node()"): self.writeChild(hf, child, elem, prev) prev = child def transform(self, article: "lxml.etree.Element") -> str: from lxml import etree as ET encoding = self._encoding f = BytesIO() with ET.htmlfile(f, encoding="utf-8") as hf: with hf.element("div", **{"class": "article"}): self.writeChildrenOf(hf, article) text = f.getvalue().decode("utf-8") text = text.replace("<br>", "<br/>") # for compatibility return text def transformByInnerString(self, articleInnerStr: str) -> str: from lxml import etree as ET return self.transform( ET.fromstring(f"<ar>{articleInnerStr}</ar>") ) pyglossary-4.5.0/pyglossary/xml_utils.py000066400000000000000000000006641417733132500205560ustar00rootroot00000000000000# from xml.sax.saxutils import escape as xml_escape # from xml.sax.saxutils import unescape as xml_unescape def xml_escape(data: str, quotation: bool = True) -> str: """Escape &, <, and > in a string of data. """ # must do ampersand first data = data.replace("&", "&amp;") data = data.replace(">", "&gt;") data = data.replace("<", "&lt;") if quotation: data = data.replace("\"", "&quot;").replace("'", "&#x27;") return data pyglossary-4.5.0/res/000077500000000000000000000000001417733132500145335ustar00rootroot00000000000000pyglossary-4.5.0/res/author-22.png000066400000000000000000000017271417733132500167720ustar00rootroot00000000000000pyglossary-4.5.0/res/pyglossary.ico000066400000000000000000000226761417733132500174560ustar00rootroot00000000000000pyglossary-4.5.0/res/pyglossary.png000066400000000000000000000067241417733132500174660ustar00rootroot00000000000000
pyglossary-4.5.0/res/resize-16.png000066400000000000000000000012471417733132500167720ustar00rootroot00000000000000pyglossary-4.5.0/res/resize.png000066400000000000000000000132371417733132500165500ustar00rootroot00000000000000
pyglossary-4.5.0/res/resources.xml000066400000000000000000000011601417733132500172650ustar00rootroot00000000000000 >Oxygen Icon Theme LGPLv3+ scalable/status/dialog-information.svg www.svgrepo.com Public Domain https://www.svgrepo.com/svg/24604/user www.svgrepo.com Public Domain https://www.svgrepo.com/svg/112407/license pyglossary-4.5.0/run-with-docker.sh000077500000000000000000000005621417733132500173260ustar00rootroot00000000000000#!/bin/bash set -e myPath=$(realpath "$0") myDir=$(dirname "$myPath") cd "$myDir" version=$(./scripts/version) echo $version aptCacheDir=$HOME/.minideb-apt mkdir -p "$aptCacheDir" docker build . \ -f Dockerfile \ -t pyglossary:$version \ -t pyglossary:latest docker run -it \ --volume $HOME:/root/ \ --volume $aptCacheDir:/var/cache/apt \ pyglossary:$version pyglossary-4.5.0/scripts/000077500000000000000000000000001417733132500154315ustar00rootroot00000000000000pyglossary-4.5.0/scripts/check-style000077500000000000000000000012711417733132500175730ustar00rootroot00000000000000#!/bin/bash IGNORE= function ignore() { IGNORE="$IGNORE,$1" } ignore W191 "indentation contains tabs" #ignore W503 "line break occurred before a binary operator" ignore W504 "line break after binary operator" #ignore E117 "over-indented" #ignore E261 "at least two spaces before inline comment" #ignore E262 "inline comment should start with '# '" #ignore E265 "block comment should start with '# '" #ignore E402 "module level import not at top of file" #ignore E702 "multiple statements on one line (semicolon)" #if [ -z $2 ] ; then pycodestyle --ignore=$IGNORE "$@" | grep --color=always -P ':\d*:' 2>&1 | less -RU #else # pycodestyle --select "$@" | grep --color=always -P ':\d*:' 2>&1 #fi pyglossary-4.5.0/scripts/config-doc.py000077500000000000000000000102441417733132500200170ustar00rootroot00000000000000#!/usr/bin/python3 import sys import json import re from os.path import join, dirname, abspath from pprint import pprint from mako.template import Template rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.core import userPluginsDir from pyglossary.ui.base import UIBase ui = UIBase() ui.loadConfig(user=False) # ui.configDefDict re_flag = re.compile("(\\s)(--[a-z\\-]+)") template = Template("""${paramsTable} 
code:: python glos = Glossary() glos.config = { "lower": True, } """) with open(join(rootDir, "scripts/term-colors.json")) as _file: termColors = json.load(_file) def codeValue(x): s = str(x) if s: return "``" + s + "``" return "" def tableRowSep(width, c="-"): return "+" + c + f"{c}+{c}".join([ c * w for w in width ]) + c + "+" def renderTable(rows): """ rows[0] must be headers """ colN = len(rows[0]) width = [ max( max(len(line) for line in row[i].split("\n")) for row in rows ) for i in range(colN) ] rowSep = tableRowSep(width, "-") headerSep = tableRowSep(width, "=") lines = [rowSep] for rowI, row in enumerate(rows): newRows = [] for colI, cell in enumerate(row): for lineI, line in enumerate(cell.split("\n")): if lineI >= len(newRows): newRows.append([ " " * width[colI] for colI in range(colN) ]) newRows[lineI][colI] = line.ljust(width[colI], " ") for row in newRows: lines.append("| " + " | ".join(row) + " |") if rowI == 0: lines.append(headerSep) else: lines.append(rowSep) # widthsStr = ", ".join([str(w) for w in width]) # header = f".. table:: my table\n\t:widths: {widthsStr}\n\n" # return header + "\n".join(["\t" + line for line in lines]) return "\n".join(lines) def getCommandFlagsMD(name, opt): if name.startswith("color.enable.cmd."): return f"``--no-color``" if not opt.hasFlag: return "" flag = opt.customFlag if not flag: flag = name.replace('_', '-') if opt.falseComment: return f"| ``--{flag}``\n| ``--no-{flag}``" # return f"- ``--{flag}``\n- ``--no-{flag}``" return f"``--{flag}``" def optionComment(name, opt): comment = opt.comment comment = re_flag.sub("\\1``\\2``", comment) if name.startswith("color.cmd."): comment = f"| {comment}\n| See `term-colors.md <./term-colors.md/>`_" return comment def jsonCodeValue(value): # if isinstance(value, str): # return codeValue(value) return codeValue(json.dumps(value)) def defaultOptionValue(name, opt, images): value = ui.config[name] valueMD = jsonCodeValue(value) if name.startswith("color.cmd."): _hex = termColors[str(value)].lstrip("#") imageI = f"image{len(images)}" images.append( f".. 
|{imageI}| image:: https://via.placeholder.com/20/{_hex}/000000?text=+" ) valueMD += f"\n|{imageI}|" return valueMD title = "Configuration Parameters" title += "\n" + len(title) * "-" + "\n" images = [] paramsTable = title + renderTable( [( "Name", "Command Flags", "Type", "Default", "Comment", )] + [ ( codeValue(name), getCommandFlagsMD(name, opt), opt.typ, defaultOptionValue(name, opt, images), optionComment(name, opt), ) for name, opt in ui.configDefDict.items() if not opt.disabled ], ) text = template.render( codeValue=codeValue, ui=ui, paramsTable=paramsTable, ) text += "\n" for image in images: text += "\n" + image with open(join(rootDir, "doc", "config.rst"), mode="w") as _file: _file.write(text) pyglossary-4.5.0/scripts/doc-pypi-links.sh000077500000000000000000000001251417733132500206300ustar00rootroot00000000000000#!/bin/bash grep -roh 'https://pypi.org/project/[^)]*' doc/p/ | sort | uniq --count pyglossary-4.5.0/scripts/dump.py000077500000000000000000000004141417733132500167520ustar00rootroot00000000000000import sys from pprint import pformat from pyglossary.glossary import Glossary glos = Glossary() glos.read(sys.argv[1]) for entry in glos: print('Words: ' + pformat(entry.l_word)) print('Definitions: ' + pformat(entry.defis)) print('-------------------------') pyglossary-4.5.0/scripts/entry-filters-doc.py000077500000000000000000000040611417733132500213610ustar00rootroot00000000000000#!/usr/bin/python3 import sys import json from os.path import join, dirname, abspath from pprint import pprint from mako.template import Template rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.glossary import Glossary from pyglossary.core import userPluginsDir from pyglossary.ui.base import UIBase ui = UIBase() ui.loadConfig(user=False) template = Template("""${entryFiltersTable} """) def codeValue(x): s = str(x) if s: return "`" + s + "`" return "" def yesNo(x): if x is True: return "Yes" if x is False: return "No" return "" def renderCell(value): return str(value).replace("\n", "\\n").replace("\t", "\\t") def renderTable(rows): """ rows[0] must be headers """ rows = [ [ renderCell(cell) for cell in row ] for row in rows ] width = [ max(len(row[i]) for row in rows) for i in range(len(rows[0])) ] rows = [ [ cell.ljust(width[i], " ") for i, cell in enumerate(row) ] for rowI, row in enumerate(rows) ] rows.insert(1, [ "-" * colWidth for colWidth in width ]) return "\n".join([ "| " + " | ".join(row) + " |" for row in rows ]) def getCommandFlagsMD(configRule): if configRule is None: return "" name = configRule[0] opt = ui.configDefDict[name] flag = name.replace("_", "-") if opt.falseComment: return f"`--{flag}`
              `--no-{flag}`" return f"`--{flag}`" for configRule, filterClass in Glossary.entryFiltersRules: if configRule is None: continue name, default = configRule assert ui.config[name] == default assert filterClass.name == name entryFiltersTable = "## Entry Filters\n\n" + renderTable( [( "Name", "Default Enabled", "Command Flags", "Description", )] + [ ( codeValue(filterClass.name), yesNo(configRule is None or bool(configRule[1])), getCommandFlagsMD(configRule), filterClass.desc, ) for configRule, filterClass in Glossary.entryFiltersRules ], ) text = template.render( entryFiltersTable=entryFiltersTable, ) with open(join(rootDir, "doc", "entry-filters.md"), mode="w") as _file: _file.write(text) pyglossary-4.5.0/scripts/gen.sh000077500000000000000000000003041417733132500165360ustar00rootroot00000000000000#!/usr/bin/env bash set -e myPath=$(realpath "$0") myDir1=$(dirname "$myPath") set -x "$myDir1/plugin-index.py" "$myDir1/plugin-doc.py" "$myDir1/config-doc.py" "$myDir1/entry-filters-doc.py" pyglossary-4.5.0/scripts/glos-find-bar-words.py000077500000000000000000000012371417733132500215710ustar00rootroot00000000000000#!/usr/bin/python3 import sys from os.path import join, dirname, abspath rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary import Glossary def hasBar(entry): for word in entry.l_word: if "|" in word: return True return False Glossary.init( # usePluginsJson=False, ) for direct in (True, False): print(f"\n-------- direct={direct}") glos = Glossary() glos.config = { "enable_alts": True, } glos.read( filename=sys.argv[1], direct=direct, ) for entry in glos: if hasBar(entry): print(f"+++ {entry.l_word!r} -> {entry.defi[:60]}") continue #print(f"--- {entry.l_word!r} -> {entry.defi[:60]}") pyglossary-4.5.0/scripts/plugin-doc.py000077500000000000000000000153411417733132500200530ustar00rootroot00000000000000#!/usr/bin/python3 import sys import json from os.path import join, dirname, abspath from pathlib import Path from pprint import pprint from collections import OrderedDict from mako.template import Template import toml rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.glossary import Glossary, defaultSortKeyName from pyglossary.core import userPluginsDir Glossary.init( # usePluginsJson=False, ) """ Mako template engine: https://docs.makotemplates.org/en/latest/ https://github.com/sqlalchemy/mako https://pypi.org/project/Mako/ Package python3-mako in Debian repos """ template = Template("""${"##"} ${description} ${topTables} % if readDependsLinks and readDependsLinks == writeDependsLinks: ${"### Dependencies for reading and writing"} PyPI Links: ${readDependsLinks} To install, run: ```sh ${readDependsCmd} ``` % else: % if readDependsLinks: ${"### Dependencies for reading"} PyPI Links: ${readDependsLinks} To install, run: ```sh ${readDependsCmd} ``` % endif % if writeDependsLinks: ${"### Dependencies for writing"} PyPI Links: ${writeDependsLinks} To install, run ```sh ${writeDependsCmd} ``` % endif % endif % if extraDocs: % for title, text in extraDocs: ${f"### {title}"} ${text.replace('(./doc/', '(../')} % endfor % endif ${toolsTable} """) def codeValue(x): s = str(x) if s: return "`" + s + "`" return "" def yesNo(x): if x is True: return "Yes" if x is False: return "No" return "" def kindEmoji(kind): if not kind: return "" return { "text": "📝", "binary": "🔢", "directory": "📁", "package": "📦", }[kind] def renderLink(title, url): if "(" in title or ")" in title: url = f"<{url}>" title = 
title.replace("|", "\\|") return f"[{title}]({url})" def pypiLink(pypiName): return renderLink( pypiName.replace('==', ' '), f"https://pypi.org/project/{pypiName.replace('==', '/')}", ) def makeDependsDoc(cls): if not (cls and getattr(cls, "depends", None)): return "", "" links = ", ".join([ pypiLink(pypiName) for pypiName in cls.depends.values() ]) cmd = "pip3 install " + " ".join( cls.depends.values() ) return links, cmd def sortKeyName(p): value = p.sortKeyName if value: return codeValue(value) return "(" + codeValue(defaultSortKeyName) + ")" def renderCell(value): return str(value).replace("\n", "\\n").replace("\t", "\\t") def renderTable(rows): """ rows[0] must be headers """ rows = [ [ renderCell(cell) for cell in row ] for row in rows ] width = [ max(len(row[i]) for row in rows) for i in range(len(rows[0])) ] rows = [ [ cell.ljust(width[i], " ") for i, cell in enumerate(row) ] for rowI, row in enumerate(rows) ] rows.insert(1, [ "-" * colWidth for colWidth in width ]) return "\n".join([ "| " + " | ".join(row) + " |" for row in rows ]) def renderRWOptions(options): return renderTable( [("Name", "Default", "Type", "Comment")] + [ ( optName, codeValue(default), optionsType[optName], optionsComment[optName], ) for optName, default in options.items() ] ) def pluginIsActive(p): if not p.enable: return False if not (p.canRead or p.canWrite): return False if userPluginsDirPath in p.path.parents: return False return True userPluginsDirPath = Path(userPluginsDir) plugins = [ p for p in Glossary.plugins.values() if pluginIsActive(p) ] toolsDir = join(rootDir, "plugins-meta", "tools") for p in plugins: module = p.module optionsProp = p.optionsProp wiki = module.wiki wiki_md = "―" if wiki: if wiki.startswith("https://github.com/"): wiki_title = "@" + wiki[len("https://github.com/"):] else: wiki_title = wiki.split("/")[-1].replace("_", " ") wiki_md = renderLink(wiki_title, wiki) website_md = "―" website = module.website if website: if isinstance(website, str): website_md = website else: try: url, title = website except ValueError: raise ValueError(f"website = {website!r}") website_md = renderLink(title, url) ( readDependsLinks, readDependsCmd, ) = makeDependsDoc(getattr(module, "Reader", None)) ( writeDependsLinks, writeDependsCmd, ) = makeDependsDoc(getattr(module, "Writer", None)) extraDocs = getattr(module, "extraDocs", []) toolsFile = join(toolsDir, f"{p.lname}.toml") try: with open(toolsFile) as _file: tools_toml = toml.load(_file, _dict=OrderedDict) except FileNotFoundError: tools = [] except Exception as e: print(f"\nFile: {toolsFile}") raise e else: for toolName, tool in tools_toml.items(): tool.update({"name": toolName}) tools = tools_toml.values() generalInfoTable = "### General Information\n\n" + renderTable([ ("Attribute", "Value"), ("Name", p.name), ("snake_case_name", p.lname), ("Description", p.description), ("Extensions", ", ".join([ codeValue(ext) for ext in p.extensions ])), ("Read support", yesNo(p.canRead)), ("Write support", yesNo(p.canWrite)), ("Single-file", yesNo(p.singleFile)), ("Kind", f"{kindEmoji(module.kind)} {module.kind}"), ("Sort-on-write", p.sortOnWrite), ("Sort key", sortKeyName(p)), ("Wiki", wiki_md), ("Website", website_md), ]) topTables = generalInfoTable try: optionsType = { optName: opt.typ for optName, opt in optionsProp.items() } except: print(f"optionsProp = {optionsProp}") raise optionsComment = { optName: opt.comment.replace("\n", "
              ") for optName, opt in optionsProp.items() } readOptions = p.getReadOptions() if readOptions: topTables += "\n\n### Read options\n\n" + renderRWOptions(readOptions) writeOptions = p.getWriteOptions() if writeOptions: topTables += "\n\n### Write options\n\n" + renderRWOptions(writeOptions) toolsTable = "" if tools: toolsTable = "### Dictionary Applications/Tools\n\n" + renderTable( [("Name & Website", "License", "Platforms")] + [ ( f"[{tool['name']}]({tool['web']})", tool["license"], ", ".join(tool["platforms"]), ) for tool in tools ], ) text = template.render( description=p.description, codeValue=codeValue, yesNo=yesNo, topTables=topTables, optionsProp=optionsProp, readOptions=readOptions, writeOptions=writeOptions, optionsComment=optionsComment, optionsType=optionsType, readDependsLinks=readDependsLinks, readDependsCmd=readDependsCmd, writeDependsLinks=writeDependsLinks, writeDependsCmd=writeDependsCmd, extraDocs=extraDocs, toolsTable=toolsTable, ) with open(join(rootDir, "doc", "p", f"{p.lname}.md"), mode="w") as _file: _file.write(text) indexText = renderTable( [("Description", "Name", "Doc Link")] + [ ( p.description, p.name, renderLink(f"{p.lname}.md", f"./{p.lname}.md"), ) for p in plugins ] ) with open(join(rootDir, "doc", "p", f"__index__.md"), mode="w") as _file: _file.write(indexText + "\n") pyglossary-4.5.0/scripts/plugin-index.py000077500000000000000000000032141417733132500204110ustar00rootroot00000000000000#!/usr/bin/python3 import sys import json from os.path import join, dirname, abspath from collections import OrderedDict as odict from pathlib import Path rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.glossary import Glossary from pyglossary.core import userPluginsDir from pyglossary.flags import DEFAULT_NO Glossary.init( usePluginsJson=False, skipDisabledPlugins=False, ) userPluginsDirPath = Path(userPluginsDir) plugins = [ p for p in Glossary.plugins.values() if userPluginsDirPath not in p.path.parents ] data = [] for p in plugins: canRead = p.canRead canWrite = p.canWrite item = odict([ ("module", p.module.__name__), ("lname", p.lname), ("name", p.name), ("description", p.description), ("extensions", p.extensions), ("singleFile", p.singleFile), ("optionsProp", { name: opt.toDict() for name, opt in p.optionsProp.items() }), ("canRead", canRead), ("canWrite", canWrite), ]) if p.sortOnWrite != DEFAULT_NO: item["sortOnWrite"] = p.sortOnWrite if p.sortKeyName: item["sortKeyName"] = p.sortKeyName if canRead: item["readOptions"] = p.getReadOptions() if canWrite: item["writeOptions"] = p.getWriteOptions() if not p.enable: item["enable"] = False if p.readDepends: item["readDepends"] = p.readDepends if p.writeDepends: item["writeDepends"] = p.writeDepends if p.readCompressions: item["readCompressions"] = p.readCompressions data.append(item) jsonText = json.dumps( data, sort_keys=False, indent="\t", ensure_ascii=False, ) with open( join(rootDir, "plugins-meta", "index.json"), mode="w", encoding="utf-8", ) as _file: _file.write(jsonText) pyglossary-4.5.0/scripts/term-colors.json000066400000000000000000000112241417733132500205720ustar00rootroot00000000000000{ "0": "#000000", "1": "#aa0000", "2": "#00aa00", "3": "#aa5500", "4": "#0000aa", "5": "#aa00aa", "6": "#00aaaa", "7": "#b9b9b9", "8": "#555555", "9": "#ff5555", "10": "#55ff55", "11": "#ffff55", "12": "#5555ff", "13": "#ff55ff", "14": "#55ffff", "15": "#ffffff", "16": "#000000", "17": "#00005f", "18": "#000087", "19": "#0000af", "20": "#0000d7", "21": "#0000ff", "22": 
"#005f00", "23": "#005f5f", "24": "#005f87", "25": "#005faf", "26": "#005fd7", "27": "#005fff", "28": "#008700", "29": "#00875f", "30": "#008787", "31": "#0087af", "32": "#0087d7", "33": "#0087ff", "34": "#00af00", "35": "#00af5f", "36": "#00af87", "37": "#00afaf", "38": "#00afd7", "39": "#00afff", "40": "#00d700", "41": "#00d75f", "42": "#00d787", "43": "#00d7af", "44": "#00d7d7", "45": "#00d7ff", "46": "#00ff00", "47": "#00ff5f", "48": "#00ff87", "49": "#00ffaf", "50": "#00ffd7", "51": "#00ffff", "52": "#5f0000", "53": "#5f005f", "54": "#5f0087", "55": "#5f00af", "56": "#5f00d7", "57": "#5f00ff", "58": "#5f5f00", "59": "#5f5f5f", "60": "#5f5f87", "61": "#5f5faf", "62": "#5f5fd7", "63": "#5f5fff", "64": "#5f8700", "65": "#5f875f", "66": "#5f8787", "67": "#5f87af", "68": "#5f87d7", "69": "#5f87ff", "70": "#5faf00", "71": "#5faf5f", "72": "#5faf87", "73": "#5fafaf", "74": "#5fafd7", "75": "#5fafff", "76": "#5fd700", "77": "#5fd75f", "78": "#5fd787", "79": "#5fd7af", "80": "#5fd7d7", "81": "#5fd7ff", "82": "#5fff00", "83": "#5fff5f", "84": "#5fff87", "85": "#5fffaf", "86": "#5fffd7", "87": "#5fffff", "88": "#870000", "89": "#87005f", "90": "#870087", "91": "#8700af", "92": "#8700d7", "93": "#8700ff", "94": "#875f00", "95": "#875f5f", "96": "#875f87", "97": "#875faf", "98": "#875fd7", "99": "#875fff", "100": "#878700", "101": "#87875f", "102": "#878787", "103": "#8787af", "104": "#8787d7", "105": "#8787ff", "106": "#87af00", "107": "#87af5f", "108": "#87af87", "109": "#87afaf", "110": "#87afd7", "111": "#87afff", "112": "#87d700", "113": "#87d75f", "114": "#87d787", "115": "#87d7af", "116": "#87d7d7", "117": "#87d7ff", "118": "#87ff00", "119": "#87ff5f", "120": "#87ff87", "121": "#87ffaf", "122": "#87ffd7", "123": "#87ffff", "124": "#af0000", "125": "#af005f", "126": "#af0087", "127": "#af00af", "128": "#af00d7", "129": "#af00ff", "130": "#af5f00", "131": "#af5f5f", "132": "#af5f87", "133": "#af5faf", "134": "#af5fd7", "135": "#af5fff", "136": "#af8700", "137": "#af875f", "138": "#af8787", "139": "#af87af", "140": "#af87d7", "141": "#af87ff", "142": "#afaf00", "143": "#afaf5f", "144": "#afaf87", "145": "#afafaf", "146": "#afafd7", "147": "#afafff", "148": "#afd700", "149": "#afd75f", "150": "#afd787", "151": "#afd7af", "152": "#afd7d7", "153": "#afd7ff", "154": "#afff00", "155": "#afff5f", "156": "#afff87", "157": "#afffaf", "158": "#afffd7", "159": "#afffff", "160": "#d70000", "161": "#d7005f", "162": "#d70087", "163": "#d700af", "164": "#d700d7", "165": "#d700ff", "166": "#d75f00", "167": "#d75f5f", "168": "#d75f87", "169": "#d75faf", "170": "#d75fd7", "171": "#d75fff", "172": "#d78700", "173": "#d7875f", "174": "#d78787", "175": "#d787af", "176": "#d787d7", "177": "#d787ff", "178": "#d7af00", "179": "#d7af5f", "180": "#d7af87", "181": "#d7afaf", "182": "#d7afd7", "183": "#d7afff", "184": "#d7d700", "185": "#d7d75f", "186": "#d7d787", "187": "#d7d7af", "188": "#d7d7d7", "189": "#d7d7ff", "190": "#d7ff00", "191": "#d7ff5f", "192": "#d7ff87", "193": "#d7ffaf", "194": "#d7ffd7", "195": "#d7ffff", "196": "#ff0000", "197": "#ff005f", "198": "#ff0087", "199": "#ff00af", "200": "#ff00d7", "201": "#ff00ff", "202": "#ff5f00", "203": "#ff5f5f", "204": "#ff5f87", "205": "#ff5faf", "206": "#ff5fd7", "207": "#ff5fff", "208": "#ff8700", "209": "#ff875f", "210": "#ff8787", "211": "#ff87af", "212": "#ff87d7", "213": "#ff87ff", "214": "#ffaf00", "215": "#ffaf5f", "216": "#ffaf87", "217": "#ffafaf", "218": "#ffafd7", "219": "#ffafff", "220": "#ffd700", "221": "#ffd75f", "222": "#ffd787", "223": "#ffd7af", 
"224": "#ffd7d7", "225": "#ffd7ff", "226": "#ffff00", "227": "#ffff5f", "228": "#ffff87", "229": "#ffffaf", "230": "#ffffd7", "231": "#ffffff", "232": "#080808", "233": "#121212", "234": "#1c1c1c", "235": "#262626", "236": "#303030", "237": "#3a3a3a", "238": "#444444", "239": "#4e4e4e", "240": "#585858", "241": "#626262", "242": "#6c6c6c", "243": "#767676", "244": "#808080", "245": "#8a8a8a", "246": "#949494", "247": "#9e9e9e", "248": "#a8a8a8", "249": "#b2b2b2", "250": "#bcbcbc", "251": "#c6c6c6", "252": "#d0d0d0", "253": "#dadada", "254": "#e4e4e4", "255": "#eeeeee" }pyglossary-4.5.0/scripts/test-cover-html.sh000077500000000000000000000003771417733132500210340ustar00rootroot00000000000000#!/usr/bin/env bash set -e myPath=$(realpath "$0") myDir1=$(dirname "$myPath") rootDir=$(dirname "$myDir1") cd "$rootDir/tests" coverage run -m unittest *_test.py coverage html --include="$rootDir/pyglossary/*" --omit="$rootDir/pyglossary/plugin_lib/*" pyglossary-4.5.0/scripts/test-glossary.sh000077500000000000000000000004071417733132500206110ustar00rootroot00000000000000#!/usr/bin/env bash set -e myPath=$(realpath "$0") myDir1=$(dirname "$myPath") rootDir=$(dirname "$myDir1") echo "$rootDir/tests/glossary_test.py" python3 "$rootDir/tests/glossary_test.py" find "$rootDir/tests" -name "g_*_test.py" -print -exec python3 '{}' \; pyglossary-4.5.0/scripts/test.sh000077500000000000000000000004431417733132500167500ustar00rootroot00000000000000#!/usr/bin/env bash set -e myPath=$(realpath "$0") myDir1=$(dirname "$myPath") rootDir=$(dirname "$myDir1") echo "$rootDir/tests" cd "$rootDir/tests" python -m unittest *_test.py echo echo "$rootDir/pyglossary/plugin_lib" cd "$rootDir/pyglossary/plugin_lib" python -m unittest *_test.py pyglossary-4.5.0/scripts/tools-py2toml.py000077500000000000000000000015751417733132500205620ustar00rootroot00000000000000#!/usr/bin/python3 import sys import json from os.path import join, dirname, abspath from pathlib import Path from collections import OrderedDict import toml from pprint import pprint rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.glossary import Glossary from pyglossary.core import userPluginsDir Glossary.init( # usePluginsJson=False, ) userPluginsDirPath = Path(userPluginsDir) plugins = [ p for p in Glossary.plugins.values() if userPluginsDirPath not in p.path.parents ] toolsDir = join(rootDir, "plugins-meta", "tools") for p in plugins: module = p.module optionsProp = p.optionsProp tools = OrderedDict() for tool in getattr(p.module, "tools", []): tools[tool.pop("name")] = tool # if not tools: # continue # pprint(tools) with open(join(toolsDir, f"{p.lname}.toml"), mode="w") as _file: toml.dump(tools, _file) pyglossary-4.5.0/scripts/version000077500000000000000000000005671417733132500170540ustar00rootroot00000000000000#!/bin/bash set -e myPath=$(realpath "$0") myDir=$(dirname "$myPath") sourceDir=$(dirname "$myDir") gitDir="$sourceDir/.git" if [ -d "$gitDir" ] ; then git --git-dir "$gitDir" describe --always exit 0 fi cat "$sourceDir/pyglossary/core.py" | while read line ; do if [[ $line = VERSION* ]] ; then echo $line | sed 's/VERSION\s*=\s*//' | sed 's/"//g' exit 0 fi done pyglossary-4.5.0/scripts/wiki-formats.py000077500000000000000000000051711417733132500204260ustar00rootroot00000000000000#!/usr/bin/python3 import sys import os import json from os.path import join, dirname, abspath from pprint import pprint from mako.template import Template rootDir = join( os.getenv("HOME"), "pyglossary", ) sys.path.insert(0, rootDir) from 
pyglossary.glossary import Glossary Glossary.init( # usePluginsJson=False, ) """ Mako template engine: https://docs.makotemplates.org/en/latest/ https://github.com/sqlalchemy/mako https://pypi.org/project/Mako/ Package python3-mako in Debian repos """ hasIconSet = set([ "aard2_slob", "appledict_bin", "appledict", "babylon_bgl", "cc_cedict", "csv", "dicformids", "dict_cc", "dict_cc_split", "digitalnk", "dsl", "epub2", "jmdict", "kobo", "lingoes_ldf", "octopus_mdict", "sql", "stardict", "tabfile", "wiktionary_dump", "zim", ]) def codeValue(x): s = str(x) if s: return "`" + s + "`" return "" def yesNo(x): if x is True: return "Yes" if x is False: return "No" return "" def iconImg(p): if p.lname not in hasIconSet: return "" return f'' def kindEmoji(p): kind = p.module.kind if not kind: return "" return { "text": "📝", "binary": "🔢", "directory": "📁", "package": "📦", }[kind] willNotSupportRead = set([ "epub2", "kobo", "mobi", # "html_dir", "info", "sql", ]) willNotSupportWrite = set([ "appledict_bin", "babylon_bgl", "cc_cedict", "cc_kedict", "freedict", "jmdict", "octopus_mdict", "wiktionary_dump", "xdxf", ]) def readCheck(p): if p.lname in willNotSupportRead: return "❌" return "✔" if p.canRead else "" def writeCheck(p): if p.lname in willNotSupportWrite: return "❌" return "✔" if p.canRead else "" template = Template(""" | | Description | | Read | Write| Doc Link | |:-:| ----------- |:-:|:----:|:----:| -------- | % for p in plugins: | ${iconImg(p)} | ${p.description} | ${kindEmoji(p)} | ${readCheck(p)} | ${writeCheck(p)} | [${p.lname}.md](https://github.com/ilius/pyglossary/blob/master/doc/p/${p.lname}.md) | % endfor Legend: - 📁 Directory - 📝 Text file - 📦 Package/archive file - 🔢 Binary file - ✔ Supported - ❌ Will not be supported """) # wiki = module.wiki # wiki_md = "―" # if module.wiki: # wiki_title = wiki.split("/")[-1].replace("_", " ") # wiki_md = f"[{wiki_title}]({wiki})" # website_md = "―" # if module.website: # website_md = module.website text = template.render( plugins=Glossary.plugins.values(), iconImg=iconImg, kindEmoji=kindEmoji, readCheck=readCheck, writeCheck=writeCheck, ) with open("Formats.md", mode="w") as _file: _file.write(text) pyglossary-4.5.0/setup.py000077500000000000000000000057261417733132500154710ustar00rootroot00000000000000#!/usr/bin/env python3 import glob import sys import os from os.path import join, dirname, exists, isdir import re import logging import setuptools from setuptools import setup from setuptools.command.install import install from pyglossary.core import VERSION log = logging.getLogger("root") relRootDir = "share/pyglossary" class my_install(install): def run(self): install.run(self) if os.sep == "/": binPath = join(self.install_scripts, "pyglossary") log.info("creating script file \"%s\"", binPath) if not exists(self.install_scripts): os.makedirs(self.install_scripts) # let it fail on wrong permissions. 
else: if not isdir(self.install_scripts): raise OSError( "installation path already exists " + "but is not a directory: %s" % self.install_scripts ) open(binPath, "w").write("""#!/usr/bin/env python3 import sys from os.path import dirname sys.path.insert(0, dirname(__file__)) from pyglossary.ui.main import main main()""") os.chmod(binPath, 0o755) root_data_file_names = [ "about", "license.txt", "license-dialog", "help", "AUTHORS", "config.json", ] package_data = { "": root_data_file_names, "plugins-meta": [ "index.json", ], "pyglossary": [ "*.py", "xdxf.xsl", "res/*", "plugins/*.py", "langs/*", "plugin_lib/*.py", "plugin_lib/py*/*.py", "ui/*.py", "ui/progressbar/*.py", "ui/gtk3_utils/*.py", "ui/wcwidth/*.py", ] + [ # safest way found so far to include every resource of plugins # producing plugins/pkg/*, plugins/pkg/sub1/*, ... except .pyc/.pyo re.sub( r"^.*?pyglossary%s(?=plugins)" % ("\\\\" if os.sep == "\\" else os.sep), "", join(dirpath, f), ) for top in glob.glob( join(dirname(__file__), "pyglossary", "plugins") ) for dirpath, _, files in os.walk(top) for f in files if not (f.endswith(".pyc") or f.endswith(".pyo")) ], } def files(folder): for path in glob.glob(folder + "/*"): if os.path.isfile(path): yield path with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() setup( name="pyglossary", version=VERSION, python_requires=">=3.7.0", cmdclass={ "install": my_install, }, description="A tool for converting dictionary files aka glossaries.", long_description_content_type="text/markdown", long_description=long_description, author="Saeed Rasooli", author_email="saeed.gnu@gmail.com", license="GPLv3+", url="https://github.com/ilius/pyglossary", packages=[ "pyglossary", ], entry_points={ 'console_scripts': [ 'pyglossary = pyglossary.ui.main:main', ], }, package_data=package_data, # FIXME: data_files is deprecated, but without it # `pip install --user` does not work data_files=[ (relRootDir, root_data_file_names), (f"{relRootDir}/plugins-meta", ["plugins-meta/index.json"]), (f"{relRootDir}/res", glob.glob("res/*")), ], extras_require={ "full": [ "lxml", "beautifulsoup4", "PyICU", "PyYAML", "marisa-trie", "libzim", "python-lzo", "html5lib", ], }, ) pyglossary-4.5.0/tests/000077500000000000000000000000001417733132500151045ustar00rootroot00000000000000pyglossary-4.5.0/tests/dsl_test.py000066400000000000000000000330111417733132500172750ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Copyright © 2016 Ratijas # Copyright © 2016-2017 Saeed Rasooli # # This program is a free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, version 3 of the License. # # You can get a copy of GNU General Public License along this program # But you can always get it from http://www.gnu.org/licenses/gpl.txt # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. """ test everything. 
""" import unittest import os from os.path import dirname, realpath import sys from functools import partial rootDir = dirname(dirname(realpath(__file__))) sys.path.insert(0, rootDir) from pyglossary.plugins.dsl import layer, tag from pyglossary.plugins.dsl.main import ( process_closing_tags, DSLParser, BRACKET_L, BRACKET_R, ) tag_i = tag.Tag("i", "i") tag_m = tag.Tag("m1", "m") tag_p = tag.Tag("p", "p") tag_s = tag.Tag("s", "s") def parse(line, tags=None): """parse DSL markup. WARNING! `parse` function is not optimal because it creates new parser instance on each call. consider cache one [per thread] instance of DSLParser in your code. """ if tags: parser = DSLParser(tags) else: parser = DSLParser() return parser.parse(line) class LayerTestCase(unittest.TestCase): def setUp(self): pass def test_new_layer(self): stack = [] lay = layer.Layer(stack) self.assertEqual(1, len(stack)) self.assertEqual(lay, stack[0]) def test_was_opened_AND_close_tags(self): stack = [] l1, l2 = layer.Layer(stack), layer.Layer(stack) l1.text = "..." l2.tags, l2.text = {tag_i}, ",,," self.assertTrue(tag.was_opened(stack, tag_i)) self.assertFalse(tag.was_opened(stack, tag.Tag("c green", "c"))) layer.close_tags(stack, {tag_i}, len(stack) - 1) expected = [] lay = layer.Layer(expected) lay.text = "...[i],,,[/i]" self.assertEqual(expected, stack) def test_close_layer(self): stack = [] l1, l2, l3 = layer.Layer(stack), layer.Layer(stack), layer.Layer(stack) l1.tags, l1.text = {tag_m}, "..." l2.tags, l2.text = {tag_i}, ",,," l3.tags, l3.text = {tag_p, tag_s}, "+++" expected = [] l1, l2 = layer.Layer(expected), layer.Layer(expected) l1.tags, l1.text = {tag_m}, "..." l2.tags = {tag_i} l2.text = f",,,[{tag_p.opening}][{tag_s.opening}]" \ f"+++[/{tag_s.closing}][/{tag_p.closing}]" layer.close_layer(stack) self.assertEqual(expected, stack) class CanonicalOrderTestCase(unittest.TestCase): def setUp(self): pass def test_no_tags(self): tags = {} expected = [] result = tag.canonical_order(tags) self.assertEqual(expected, result) def test_one_tag_not_predefined(self): tags = {tag_p} expected = [tag_p] result = tag.canonical_order(tags) self.assertEqual(expected, result) def test_one_tag_predefined(self): tags = {tag_i} expected = [tag_i] result = tag.canonical_order(tags) self.assertEqual(expected, result) def test_many_tags_not_predefined(self): tags = {tag_p, tag_s} expected = [tag_p, tag_s] result = tag.canonical_order(tags) self.assertEqual(expected, result) def test_many_tags_predefined(self): tags = {tag_m, tag_p} expected = [tag_m, tag_p] result = tag.canonical_order(tags) self.assertEqual(expected, result) def test_many_tags_mixed(self): tags = {tag_m, tag_i, tag_s, tag_p} expected = [tag_m, tag_i, tag_p, tag_s] result = tag.canonical_order(tags) self.assertEqual(expected, result) class ProcessClosingTagsTestCase(unittest.TestCase): def setUp(self): pass def test_index_of_layer_containing_tag(self): stack = [] l1, l2, l3 = layer.Layer(stack), layer.Layer(stack), layer.Layer(stack) l1.tags, l1.text = {tag_m}, "..." l2.tags, l2.text = {tag_i, tag_s}, ",,," l3.tags, l3.text = {tag_p}, "---" fn = partial(tag.index_of_layer_containing_tag, stack) self.assertEqual(0, fn(tag_m.closing)) self.assertEqual(1, fn(tag_i.closing)) self.assertEqual(1, fn(tag_s.closing)) self.assertEqual(2, fn(tag_p.closing)) def test_close_one(self): stack = [] l1, l2 = layer.Layer(stack), layer.Layer(stack) l1.tags, l1.text = (), "..." 
l2.tags, l2.text = {tag_p}, ",,," expected = [] lay = layer.Layer(expected) lay.text = f"...[{tag_p.opening}],,,[/{tag_p.closing}]" lay.tags = () closings = {tag_p.closing} process_closing_tags(stack, closings) self.assertEqual(expected, stack) class PutBracketsAwayTestCase(unittest.TestCase): def setUp(self): tags = frozenset({ "b", "'", "c", "i", "sup", "sub", "ex", "p", "*", ("m", r"\d"), }) parser = DSLParser(tags) self.put_brackets_away = parser.put_brackets_away def testStandaloneLeftEscapedAtTheBeginning(self): before = "[..." after = f"{BRACKET_L}..." self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneRightEscapedAtTheBeginning(self): before = "]..." after = f"{BRACKET_R}..." self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneLeftEscaped(self): before = r"...\[,,," after = fr"...\{BRACKET_L},,," self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneRightEscaped(self): before = r"...\],,," after = fr"...\{BRACKET_R},,," self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneLeftNonEscaped(self): before = "...[,,," after = f"...{BRACKET_L},,," self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneRightNonEscaped(self): before = "...],,," after = f"...{BRACKET_R},,," self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneLeftNonEscapedBeforeTagName(self): before = "...[p ,,," after = f"...{BRACKET_L}p ,,," self.assertEqual(after, self.put_brackets_away(before)) def testStandaloneRightNonEscapedAfterTagName(self): before = "c]..." after = f"c{BRACKET_R}..." self.assertEqual(after, self.put_brackets_away(before)) def testPairEscaped(self): before = r"...\[the\],,," after = fr"...\{BRACKET_L}the\{BRACKET_R},,," self.assertEqual(after, self.put_brackets_away(before)) def testPairEscapedAroundTagName(self): before = r"...\[i\],,," after = fr"...\{BRACKET_L}i\{BRACKET_R},,," self.assertEqual(after, self.put_brackets_away(before)) def testPairEscapedAroundClosingTagName(self): before = r"...\[/i\],,," after = fr"...\{BRACKET_L}/i\{BRACKET_R},,," self.assertEqual(after, self.put_brackets_away(before)) def testMixed(self): L, R = BRACKET_L, BRACKET_R before = r"[i]...\[on \]\[the] to[p][/i]" after = fr"[i]...\{L}on \{R}\{L}the{R} to[p][/i]" self.assertEqual(after, self.put_brackets_away(before)) def testEverythingEscaped(self): before = r" change it to \[b\]...\[c\]...\[/c\]\[/b\]\[c\]...\[/c\]" after = before self.assertEqual(after, parse(before)) class DSLParserTestCase(unittest.TestCase): def setUp(self): self.split_join = lambda x: DSLParser.join_paragraphs( *DSLParser.split_line_by_paragraphs(x)) def testStartsWithStandaloneClosed(self): before = """[/p]...""" after = """...""" self.assertEqual(after, parse(before)) def testStandaloneClosedAtTheBeginning(self): before = """...[/p],,,""" after = """...,,,""" self.assertEqual(after, parse(before)) def testStandaloneClosedAtTheBeginningBeforeMarkup(self): before = """...[/p],,,[i][b]+++[/b][/i]---""" after = """...,,,[i][b]+++[/b][/i]---""" self.assertEqual(after, parse(before)) def testEndsWithStandaloneOpened(self): before = """...[i]""" after = """...""" self.assertEqual(after, parse(before)) def testStandaloneOpenedAtTheEnd(self): before = """...[i],,,""" after = """...,,,""" self.assertEqual(after, parse(before)) def testStandaloneOpenedAtTheEndAfterMarkup(self): before = """...[i][b],,,[/b][/i]+++[i]---""" after = """...[i][b],,,[/b][/i]+++---""" self.assertEqual(after, parse(before)) def testWrongOrder2(self): 
before = """...[i][b],,,[/i][/b]+++""" after = """...[i][b],,,[/b][/i]+++""" self.assertEqual(after, parse(before)) def testWrongOrder3(self): before = """...[i][c],,,[b]+++[/i][/c][/b]---""" after = """...[p],,,[b]+++[/b][/p]---""" self.assertEqual(after, parse(before)) def testOpenOneCloseAnother(self): before = """...[i],,,[/p]+++""" after = """...,,,+++""" self.assertEqual(after, parse(before)) def testStartsWtihClosingAndEndsWithOpening(self): before = """[/c]...[i]""" after = """...""" self.assertEqual(after, parse(before)) def testValidEmptyTagsDestructionOne(self): before = """...[i][/i],,,""" after = """...,,,""" self.assertEqual(after, parse(before)) def testValidEmptyTagsDestructionMany(self): before = """...[b][c][i][/i][/c][/b],,,""" after = """...,,,""" self.assertEqual(after, parse(before)) def testBrokenEmptyTagsDestructionMany(self): before = """...[b][i][c][/b][/c][/i],,,""" after = """...,,,""" self.assertEqual(after, parse(before)) def testNestedWithBrokenOutter(self): before = """[i][p]...[/p][/c]""" after = """[p]...[/p]""" self.assertEqual(after, parse(before)) def testHorriblyBrokenTags(self): before = """[/c]...[i][/p],,,[/i]+++[b]""" after = """...[i],,,[/i]+++""" self.assertEqual(after, parse(before)) def testWrongOrder2_WithConent(self): before = """[b]...[c red]...[/b]...[/c]""" after = """[b]...[c red]...[/c][/b][c red]...[/c]""" self.assertEqual(after, parse(before)) def testWrongOrderWithTextBefore(self): before = "[c]...[i],,,[/c][/i]" after = "[c]...[i],,,[/i][/c]" self.assertEqual(after, parse(before)) def testRespect_m_TagsProperly(self): before = " [m1]for tags like: [p]n[/c][/i][/p]" \ ", the line needs scan again[/m]" after = " [m1]for tags like: [p]n[/p], the line needs scan again[/m]" self.assertEqual(after, parse(before)) def testNoTagsDoNothing(self): before = after = """no tags, do nothing""" self.assertEqual(after, parse(before)) def testValidNestedTags(self): before = """...[i][c][b]...[/b][/c][/i]...""" after = """...[b][p]...[/p][/b]...""" self.assertEqual(after, parse(before)) def testBrokenNestedTags(self): before = """...[b][i][c]...[/b][/c][/i]...""" after = """...[b][p]...[/p][/b]...""" self.assertEqual(after, parse(before)) def testEscapedBrackets(self): before = after = r"""on \[the\] top""" self.assertEqual(after, parse(before)) def testPoorlyEscapedBracketsWithTags(self): before = r"""...\[c],,,[/c]+++""" after = r"""...\[c],,,+++""" self.assertEqual(after, parse(before)) def testPoorlyEscapedBracketsWithTags2(self): before = r"""on \[the\] [b]roof[/b]]""" after = r"""on \[the\] [b]roof[/b]]""" self.assertEqual(after, parse(before)) def testValidRealDictionaryArticle(self): # zh => ru, http://bkrs.info/slovo.php?ch=和田 before = after = "和田\n" \ "[m1][p]г. 
и уезд[/p] Хотан ([i]Синьцзян-Уйгурский[c] авт.[/c]" \ " р-н, КНР[/i])[/m]" \ "[m2][*][ex]和田玉 Хотанский нефрит[/ex][/*][/m]" self.assertEqual(after, parse(before)) def testBrokenRealDictionaryArticle(self): # zh => ru, http://bkrs.info/slovo.php?ch=一一相应 before = """一一相应 yīyī xiāngyìng [m1][c][i]мат.[/c][/i] взаимнооднозначное соответствие[/m]""" after = """一一相应 yīyī xiāngyìng [m1][p]мат.[/p] взаимнооднозначное соответствие[/m]""" self.assertEqual(after, parse(before)) def testBrokenManyRealDictionaryArticle(self): # zh => ru, http://bkrs.info/slovo.php?ch=一轮 before = "一轮\nyīlún\n" \ "[m1]1) одна очередь[/m][m1]2) цикл ([i]в 12 лет[/i])[/m][m1]" \ "3) диск ([c][i]напр.[/c] луны[/i])[/m]" \ "[m1]4) [c] [i]спорт[/c][/i] раунд, круг" \ " ([i]встречи спортсменов[/i])[/m]" \ "[m1]5) [c] [i]дипл.[/c][/i] раунд ([i]переговоров[/i])[/m]" after = "一轮\nyīlún\n" \ "[m1]1) одна очередь[/m][m1]2) цикл ([i]в 12 лет[/i])[/m][m1]3)" \ " диск ([i][c]напр.[/c] луны[/i])[/m]" \ "[m1]4) [c] [i]спорт[/i][/c] раунд, круг" \ " ([i]встречи спортсменов[/i])[/m]" \ "[m1]5) [c] [i]дипл.[/i][/c] раунд ([i]переговоров[/i])[/m]" self.assertEqual(after, parse(before)) def testSameTagsNested(self): before = "...[p],,,[p]+++[/p]---[/p]```" after = "...[p],,,+++[/p]---```" self.assertEqual(after, parse(before)) def testOneLastTextLetter(self): before = after = "b" self.assertEqual(after, parse(before)) def testOneLastTextLetterAfterTag(self): before = after = "...[b],,,[/b]b" self.assertEqual(after, parse(before)) def testTagMInsideAnotherTag(self): # tag order. before = "[c][m1]...[/m][/c]" after = "[m1][c]...[/c][/m]" self.assertEqual(after, parse(before)) def testTagMInsideAnotherTagAfterText(self): before = "[c]...[m1],,,[/m][/c]" after = "[c]...[/c][m1][c],,,[/c][/m]" self.assertEqual(after, parse(before)) def testTagMDeepInside(self): before = "...[i],,,[b]+++[c green][/b]---[m1]```[/i][/c][/m]..." after = "...[i],,,[b]+++[/b][c green]---[/c][/i][m1][i][c green]" \ "```[/c][/i][/m]..." 
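		# Note how the expected output closes [c green] at the [m1] boundary
		# and reopens it inside, so every layer stays properly nested.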
self.assertEqual(after, parse(before)) def testTagMInsideBroken(self): before = "[m1][*]- [ref]...[/ref][/m][m1]- [ref],,,[/ref][/*][/m]" after = "[m1][*]- [ref]...[/ref][/*][/m][m1][*]- [ref],,,[/ref][/*][/m]" self.assertEqual(after, parse(before)) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/ebook_kobo_test.py000066400000000000000000000063471417733132500206400ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- from os.path import join, dirname, abspath import sys import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, join(rootDir, "pyglossary", "plugins")) sys.path.insert(0, rootDir) from pyglossary.plugins.ebook_kobo import ( Writer, ) class GetPrefixTest(unittest.TestCase): def case(self, word, prefix): w = Writer(None) self.assertEqual( w.get_prefix(word), prefix, ) def test_examples(self): # examples from https://pgaskin.net/dictutil/dicthtml/prefixes.html self.case("test", "te") self.case("a", "aa") self.case("Èe", "èe") self.case("multiple words", "mu") self.case("àççèñts", "àç") self.case("à", "àa") self.case("ç", "ça") self.case("", "11") self.case(" ", "11") self.case(" x", "xa") self.case(" 123", "11") self.case("x 23", "xa") self.case("д ", "д") self.case("дaд", "дa") self.case("未未", "未未") self.case("未", "未a") self.case(" 未", "11") self.case(" 未", "未a") # the rest of test cases are from # https://github.com/pgaskin/dictutil/blob/master/kobodict/util_test.go def test_dicthtml_en(self): self.case("a-", "11") self.case("-an", "11") self.case("GB", "gb") def test_dicthtml_fr(self): self.case("ébahir", "éb") self.case("a1", "11") self.case("ô", "ôa") self.case("kébab", "ké") self.case("aérer", "aé") self.case("living-room", "li") # dicthtml-ja # Note, Kanji not currently implemented, so not testing (note, the logic # is in a separate function, anyways). 
# self.case("あ", "あ") # self.case("アークとう", "アー") def test_dictword_spaces(self): # generated by dictword-test: spaces self.case(" ", "11") self.case(" ", "11") self.case("\t\t", "11") self.case("\t\f\t", "11") self.case("x ", "xa") self.case(" xx", "xa") # generated by dictword-test: spaces where trim/prefix order matters self.case(" x", "11") self.case(" xy", "11") self.case(" xyz", "11") self.case("x z", "xa") def test_dictword_cyrillic(self): # generated by dictword-test: cyrillic self.case(" д", "д") self.case(" дд", "д") self.case("д", "д") self.case("aд", "aд") self.case("дa", "дa") self.case("aдa", "aд") def test_dictword_uppercase_accented(self): # generated by dictword-test: uppercase accented letters self.case("Ȅe", "ȅe") self.case("eȄ", "eȅ") self.case("Ȅ", "ȅa") self.case("Ȅ!", "11") def test_dictword_cjk(self): # generated by dictword-test: cjk self.case("x未", "x未") self.case("未x", "未x") self.case("xy未", "xy") self.case("还没", "还没") def test_dictword_misc(self): # generated by dictword-test: misc self.case("!", "11") self.case("!!", "11") self.case("!!!", "11") self.case("x!", "11") self.case("x!!", "11") self.case("xx!", "xx") self.case("xxx!", "xx") self.case(" !", "11") self.case(" !!", "11") self.case(" !!!", "11") self.case(" !", "11") self.case(" !!", "11") self.case(" !!!", "11") self.case(" x!", "xa") self.case(" x!!", "xa") self.case(" xx!", "xa") self.case(" xxx!", "xa") def test_synthetic(self): self.case("x\x00y", "xa") self.case("\x00xy", "11") if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/entry_test.py000066400000000000000000000123211417733132500176550ustar00rootroot00000000000000#!/usr/bin/python3 import sys from os.path import join, dirname, abspath import unittest import logging rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.entry import * from pyglossary.core_test import getMockLogger class TestEntryBasic(unittest.TestCase): def test_exc_1(self): try: Entry(b"word", "defi") except TypeError as e: self.assertEqual(str(e), "invalid word type ") else: self.fail("must raise TypeError") def test_exc_2(self): try: Entry(("word",), "defi") except TypeError as e: self.assertEqual(str(e), "invalid word type ") else: self.fail("must raise TypeError") def test_exc_3(self): try: Entry("word", b"defi") except TypeError as e: self.assertEqual(str(e), "invalid defi type ") else: self.fail("must raise TypeError") def test_exc_4(self): try: Entry("word", ("defi",)) except TypeError as e: self.assertEqual(str(e), "invalid defi type ") else: self.fail("must raise TypeError") def test_exc_5(self): try: Entry("word", "defi", "b") except ValueError as e: self.assertEqual(str(e), "invalid defiFormat 'b'") else: self.fail("must raise ValueError") def test_1(self): entry = Entry("test1", "something") self.assertEqual(entry.l_word, ["test1"]) self.assertEqual(entry.defi, "something") def test_2(self): entry = Entry(["test1"], "something") self.assertEqual(entry.l_word, ["test1"]) self.assertEqual(entry.defi, "something") def test_3(self): entry = Entry("test1", ["something"]) self.assertEqual(entry.l_word, ["test1"]) self.assertEqual(entry.defi, "something") def test_repr_1(self): entry = Entry("test1", "something") self.assertEqual( repr(entry), "Entry('test1', 'something', defiFormat='m')", ) def test_repr_1(self): entry = Entry("test1", "something", defiFormat="h") self.assertEqual( repr(entry), "Entry('test1', 'something', defiFormat='h')", ) def test_defiFormat_1(self): entry = Entry("test1", "something") 
		self.assertEqual(entry.defiFormat, "m")

	def test_defiFormat_2(self):
		entry = Entry("test1", "something", defiFormat="h")
		self.assertEqual(entry.defiFormat, "h")

	def test_defiFormat_3(self):
		entry = Entry("test1", "something", defiFormat="h")
		entry.defiFormat = "x"
		self.assertEqual(entry.defiFormat, "x")

	def test_addAlt_1(self):
		entry = Entry("test1", "something")
		self.assertEqual(entry.l_word, ["test1"])
		entry.addAlt("test 1")
		self.assertEqual(entry.l_word, ["test1", "test 1"])


class TestEntryDetectDefiFormat(unittest.TestCase):
	def test_1(self):
		entry = Entry("test1", "something")
		entry.detectDefiFormat()
		self.assertEqual(entry.defiFormat, "m")

	def test_2(self):
		entry = Entry("test1", "something", defiFormat="h")
		entry.detectDefiFormat()
		self.assertEqual(entry.defiFormat, "h")

	def test_3(self):
		entry = Entry("test1", "something", defiFormat="x")
		entry.detectDefiFormat()
		self.assertEqual(entry.defiFormat, "x")

	def test_4(self):
		entry = Entry("test1", "<b>something</b>")
		entry.detectDefiFormat()
		self.assertEqual(entry.defiFormat, "h")

	def test_5(self):
		entry = Entry("test1", "<k>title</k>something")
		entry.detectDefiFormat()
		self.assertEqual(entry.defiFormat, "x")


class TestEntryStripFullHtml(unittest.TestCase):
	def __init__(self, *args, **kwargs):
		unittest.TestCase.__init__(self, *args, **kwargs)
		self.mockLog = getMockLogger()

	def setUp(self):
		self.mockLog.clear()

	def tearDown(self):
		self.assertEqual(0, self.mockLog.printRemainingErrors())

	def case(
		self,
		word: str,
		origDefi: str,
		fixedDefi: str,
		logMsg: str = "",
		logLevel: int = logging.ERROR,
	):
		entry = Entry(word, origDefi)
		entry.stripFullHtml()
		self.assertEqual(entry.defi, fixedDefi)
		if logMsg:
			record = self.mockLog.popLog(logLevel, logMsg)
			self.assertIsNotNone(record, msg=f"logMsg={logMsg!r}")

	def test_1(self):
		self.case(
			word="test1",
			origDefi="plain text",
			fixedDefi="plain text",
			logMsg="",
		)

	def test_2(self):
		self.case(
			word="test2",
			origDefi="

              simple html text

              ", fixedDefi="

              simple html text

              ", logMsg="", ) def test_3(self): self.case( word="test3", origDefi="simple html", fixedDefi="simple html", logMsg="", ) def test_4(self): self.case( word="test4", origDefi="simple html", fixedDefi="simple html", logMsg="", ) def test_5(self): self.case( word="test5", origDefi="simple html", fixedDefi="simple html", logMsg="', b'', data, ) def remove_content_extra(self, data): data = re.sub( b'[0-9a-f]{32}', b'', data, ) data = re.sub( b'[0-9-]{10}', b'', data, ) return data def convert_to_epub( self, inputFname, ouputFname, testId, **convertArgs ): inputFilename = self.downloadFile(f"{inputFname}") outputFilename = self.newTempFilePath( f"{inputFname.replace('.', '_')}-{testId}.epub" ) expectedFilename = self.downloadFile(f"{ouputFname}.epub") glos = self.glos = Glossary() res = glos.convert( inputFilename=inputFilename, outputFilename=outputFilename, **convertArgs ) self.assertEqual(outputFilename, res) self.compareZipFiles( outputFilename, expectedFilename, { "OEBPS/toc.ncx": self.remove_toc_uid, "OEBPS/content.opf": self.remove_content_extra, }, ) def test_convert_to_epub_1(self): self.convert_to_epub( "100-en-fa-res.slob", "100-en-fa-res-slob", "1", ) def test_convert_to_epub_2(self): for sort in (True, False): self.convert_to_epub( "100-en-fa-res.slob", "100-en-fa-res-slob", "2", sort=sort, ) def test_convert_to_epub_3(self): for sqlite in (True, False): self.convert_to_epub( "100-en-fa-res.slob", "100-en-fa-res-slob", "3", sqlite=sqlite, ) def test_convert_to_epub_4(self): for direct in (True, False): self.convert_to_epub( "100-en-fa-res.slob", "100-en-fa-res-slob", "4", direct=direct, ) def test_convert_to_epub_5(self): for sqlite in (True, False): self.convert_to_epub( "100-en-fa.txt", "100-en-fa-prefix3", "5", sqlite=sqlite, writeOptions={"group_by_prefix_length": 3}, ) def test_convert_to_epub_6(self): self.convert_to_epub( "300-rand-en-fa.txt", "300-rand-en-fa-prefix3", "6", sqlite=True, writeOptions={"group_by_prefix_length": 3}, ) def test_convert_to_epub_7(self): self.convert_to_epub( "300-rand-en-fa.txt", "300-rand-en-fa-prefix3", "7", sqlite=False, writeOptions={"group_by_prefix_length": 3}, ) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/g_freedict_test.py000066400000000000000000000013011417733132500206030ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase class TestGlossaryFreeDict(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "100-en-de.tei": "542c210e", }) def convert_tei_txt(self, fname, fname2, **convertArgs): self.convert( f"{fname}.tei", f"{fname}-2.txt", compareText=f"{fname2}.txt", **convertArgs ) def test_convert_tei_txt_1(self): self.convert_tei_txt( "100-en-de", "100-en-de-v2", infoOverride={"input_file_size": None}, ) pyglossary-4.5.0/tests/g_jmdict_test.py000066400000000000000000000014271417733132500203010ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase class TestGlossaryJMdict(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "050-JMdict-English": "aec9ad8c", "050-JMdict-English.txt": "edd13a27", }) def convert_jmdict_txt(self, 
fname, fname2, **convertArgs): self.convert( fname, f"{fname}-2.txt", compareText=f"{fname2}.txt", inputFormat="JMDict", **convertArgs ) def test_convert_jmdict_txt_1(self): self.convert_jmdict_txt( "050-JMdict-English", "050-JMdict-English", ) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/g_json_test.py000066400000000000000000000016761417733132500200060ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase class TestGlossaryJSON(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "004-bar.json": "7e4b2663", "100-en-de.json": "6fa8e159", "100-en-fa.json": "8d29c1be", "100-ja-en.json": "fab2c106", }) def convert_txt_json(self, fname): self.convert( f"{fname}.txt", f"{fname}-2.json", compareText=f"{fname}.json", ) def test_convert_txt_json_0(self): self.convert_txt_json("004-bar") def test_convert_txt_json_1(self): self.convert_txt_json("100-en-fa") def test_convert_txt_json_2(self): self.convert_txt_json("100-en-de") def test_convert_txt_json_3(self): self.convert_txt_json("100-ja-en") if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/g_kobo_dictfile_test.py000066400000000000000000000030521417733132500216200ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase from pyglossary.entry import Entry class TestGlossaryDictfile(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "022-en-en.df": "edff6de1", "022-en-en.df.txt": "93a2450f", "022-en-en.df.txt.df": "8e952e56", "res/01cf5b41.gif": "01cf5b41", "res/1f3c1a36.gif": "1f3c1a36", "res/3af9fd5d.gif": "3af9fd5d", "res/6684158d.gif": "6684158d", }) def convert_df_txt(self, fname, fname2, resFiles, **convertArgs): resFilesPath = { resFileName: self.newTempFilePath(f"{fname}-2.txt_res/{resFileName}") for resFileName in resFiles } self.convert( f"{fname}.df", f"{fname}-2.txt", compareText=f"{fname2}.txt", **convertArgs ) for resFileName in resFiles: fpath1 = self.downloadFile(f"res/{resFileName}") fpath2 = resFilesPath[resFileName] self.compareBinaryFiles(fpath1, fpath2) def convert_txt_df(self, fname, fname2, **convertArgs): self.convert( f"{fname}.txt", f"{fname}-2.df", compareText=f"{fname2}.df", **convertArgs ) def test_convert_df_txt_1(self): self.convert_df_txt( "022-en-en", "022-en-en.df", resFiles=[ "01cf5b41.gif", "1f3c1a36.gif", "3af9fd5d.gif", "6684158d.gif", ], ) def test_convert_txt_df_1(self): self.convert_txt_df( "022-en-en.df", "022-en-en.df.txt", ) pyglossary-4.5.0/tests/g_kobo_test.py000066400000000000000000000024261417733132500177610ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest import gzip rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase from pyglossary.glossary import Glossary class TestGlossaryKobo(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) # self.dataFileCRC32.update({}) def convert_txt_kobo(self, fname, sha1sumDict, **convertArgs): outputFname = f"{fname}-2.kobo.zip" outputFpath = self.newTempFilePath(outputFname) # expectedFpath = 
self.downloadFile(f"{fname}.kobo.zip") self.convert( f"{fname}.txt", outputFname, **convertArgs ) dataReplaceFuncs = { _zfname: gzip.decompress for _zfname in sha1sumDict if _zfname != "words" } self.checkZipFileSha1sum( outputFpath, sha1sumDict=sha1sumDict, dataReplaceFuncs=dataReplaceFuncs ) def test_convert_txt_kobo_1(self): sha1sumDict = { "11.html": "39f0f46560da7398ab0d3b19cc1c2387ecd201dd", "aa.html": "df9460450e8b46e913c57bf39dcc799ffdc2fb33", "ab.html": "be4271a8508dbb499bafd439810af621a7b3474f", "words": "d0f74e854f090fbaa8211bcfd162ad99ec4da0a3", } self.convert_txt_kobo("100-en-fa", sha1sumDict) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/g_lingoes_ldf_test.py000066400000000000000000000020031417733132500213030ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase from pyglossary.glossary import Glossary class TestGlossaryLingoesLDF(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "004-bar.ldf": "b1aa776d", }) def convert_txt_ldf(self, fname, fname2, **convertArgs): self.convert( f"{fname}.txt", f"{fname}-2.ldf", compareText=f"{fname2}.ldf", **convertArgs ) def convert_ldf_txt(self, fname, fname2, **convertArgs): self.convert( f"{fname}.ldf", f"{fname}-2.txt", compareText=f"{fname2}.txt", **convertArgs ) def test_convert_txt_ldf_1(self): self.convert_txt_ldf( "004-bar", "004-bar", ) def test_convert_ldf_txt_1(self): self.convert_ldf_txt( "004-bar", "004-bar", infoOverride={ "name": None, "input_file_size": None, }, ) pyglossary-4.5.0/tests/g_stardict_test.py000066400000000000000000000124531417733132500206450ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase from pyglossary.glossary import Glossary class TestGlossaryStarDict(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "004-bar.sd/004-bar.dict": "9ea397f8", "004-bar.sd/004-bar.idx": "cf9440cf", "004-bar.sd/004-bar.ifo": "ada870e4", "004-bar.sd/004-bar.syn": "286b17bf", "100-en-de.sd/100-en-de.dict": "d74bf277", "100-en-de.sd/100-en-de.idx": "945b303c", "100-en-de.sd/100-en-de.ifo": "6529871f", "100-en-fa.sd/100-en-fa.dict": "223a0d1d", "100-en-fa.sd/100-en-fa.idx": "6df43378", "100-en-fa.sd/100-en-fa.ifo": "3f2086cd", "100-en-fa.sd/100-en-fa.syn": "1160fa0b", "100-en-fa-sd.txt": "85f9d3fc", "100-ja-en.sd/100-ja-en.dict": "39715f01", "100-ja-en.sd/100-ja-en.idx": "adf0e552", "100-ja-en.sd/100-ja-en.ifo": "b01e368c", "100-ja-en.sd/100-ja-en.syn": "76e6df95", "300-ru-en.txt": "77cfee2f", "300-ru-en.sd/300-ru-en.dict": "8be7fa4c", "300-ru-en.sd/300-ru-en.idx": "1cd30f1a", "300-ru-en.sd/300-ru-en.ifo": "0b135812", "300-ru-en.sd/300-ru-en.syn": "87ee3372", }) def convert_txt_stardict( self, fname, syn=True, dictzip=False, config=None, rawEntryCompress=None, **kwargs ): binExtList = ["idx", "dict"] if syn: binExtList.append("syn") inputFilename = self.downloadFile(f"{fname}.txt") outputFilename = self.newTempFilePath(f"{fname}.ifo") otherFiles = { ext: self.newTempFilePath(f"{fname}.{ext}") for ext in binExtList } glos = self.glos = Glossary() if config is not None: glos.config = config if rawEntryCompress is not None: 
			glos.setRawEntryCompress(rawEntryCompress)
		res = glos.convert(
			inputFilename=inputFilename,
			outputFilename=outputFilename,
			writeOptions={
				"dictzip": dictzip,
			},
			**kwargs
		)
		self.assertEqual(outputFilename, res)

		self.compareTextFiles(
			outputFilename,
			self.downloadFile(f"{fname}.sd/{fname}.ifo"),
		)
		for ext in binExtList:
			self.compareBinaryFiles(
				otherFiles[ext],
				self.downloadFile(f"{fname}.sd/{fname}.{ext}")
			)

	def convert_txt_stardict_zip(
		self,
		fname,
		sha1sumDict,
		dictzip=False,
		config=None,
		rawEntryCompress=None,
		**kwargs
	):
		inputFilename = self.downloadFile(f"{fname}.txt")
		outputFilename = self.newTempFilePath(f"{fname}.zip")

		glos = self.glos = Glossary()
		if config is not None:
			glos.config = config
		if rawEntryCompress is not None:
			glos.setRawEntryCompress(rawEntryCompress)

		res = glos.convert(
			inputFilename=inputFilename,
			outputFilename=outputFilename,
			outputFormat="Stardict",
			writeOptions={
				"dictzip": dictzip,
			},
			**kwargs
		)
		self.assertEqual(outputFilename, res)

		self.checkZipFileSha1sum(
			outputFilename,
			sha1sumDict=sha1sumDict,
		)

	def convert_stardict_txt(
		self,
		inputFname: str,
		outputFname: str,
		testId: str,
	):
		inputFilename = self.downloadFile(f"{inputFname}.sd/{inputFname}.ifo")
		outputFilename = self.newTempFilePath(
			f"{inputFname}-{testId}.txt"
		)
		expectedFilename = self.downloadFile(f"{outputFname}.txt")
		glos = self.glos = Glossary()

		res = glos.convert(
			inputFilename=inputFilename,
			outputFilename=outputFilename,
		)
		self.assertEqual(outputFilename, res)
		self.compareTextFiles(outputFilename, expectedFilename)

	def test_convert_txt_stardict_0(self):
		self.convert_txt_stardict(
			"100-en-fa",
			config={"auto_sqlite": True},
			direct=True,
		)

	def test_convert_txt_stardict_1(self):
		for sqlite in (None, False, True):
			for rawEntryCompress in (None, True, False):
				self.convert_txt_stardict(
					"100-en-fa",
					rawEntryCompress=rawEntryCompress,
					sqlite=sqlite,
				)

	def test_convert_txt_stardict_1_zip(self):
		sha1sumDict = {
			"100-en-fa.dict": "1e462e829f9e2bf854ceac2ef8bc55911460c79e",
			"100-en-fa.idx": "943005945b35abf3a3e7b80375c76daa87e810f0",
			"100-en-fa.ifo": "3e982a76f83eef66a8d4915e7a0018746f4180bc",
			"100-en-fa.syn": "fcefc76628fed18b84b9aa83cd7139721b488545",
		}
		for sqlite in (None, False, True):
			self.convert_txt_stardict_zip(
				"100-en-fa",
				sha1sumDict=sha1sumDict,
				sqlite=sqlite,
			)

	def test_convert_txt_stardict_2(self):
		for sqlite in (None, False, True):
			for rawEntryCompress in (None, True, False):
				self.convert_txt_stardict(
					"004-bar",
					rawEntryCompress=rawEntryCompress,
					sqlite=sqlite,
				)

	def test_convert_txt_stardict_3(self):
		for sqlite in (None, False, True):
			self.convert_txt_stardict(
				"100-en-de",
				syn=False,
				sqlite=sqlite,
			)

	def test_convert_txt_stardict_4(self):
		for sqlite in (None, False, True):
			self.convert_txt_stardict(
				"100-ja-en",
				sqlite=sqlite,
			)

	def test_convert_txt_stardict_5(self):
		for sqlite in (None, False, True):
			self.convert_txt_stardict(
				"300-ru-en",
				syn=True,
				sqlite=sqlite,
			)

	def test_convert_txt_stardict_sqlite_no_alts(self):
		self.convert_txt_stardict(
			"100-en-fa",
			config={"enable_alts": False},
			sqlite=True,
		)

	def test_convert_stardict_txt_1(self):
		self.convert_stardict_txt(
			"100-en-fa",
			"100-en-fa-sd",
			"1",
		)


if __name__ == "__main__":
	unittest.main()


pyglossary-4.5.0/tests/g_xdxf_test.py
import sys
from os.path import dirname, abspath
import unittest

rootDir = dirname(dirname(abspath(__file__)))
sys.path.insert(0, rootDir)

from tests.glossary_test import TestGlossaryBase


class TestGlossaryXDXF(TestGlossaryBase):
	def
__init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "100-cyber_lexicon_en-es.txt": "8571e444", "100-cyber_lexicon_en-es.xdxf": "8d9ba394" }) def convert_xdxf_txt(self, fname, fname2, **convertArgs): self.convert( f"{fname}.xdxf", f"{fname}-2.txt", compareText=f"{fname2}.txt", **convertArgs ) def test_convert_xdxf_txt_1(self): self.convert_xdxf_txt( "100-cyber_lexicon_en-es", "100-cyber_lexicon_en-es", ) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/glossary_errors_test.py000066400000000000000000000270101417733132500217540ustar00rootroot00000000000000#!/usr/bin/python3 import sys import os from os.path import join, dirname, abspath, isdir, isfile import unittest import logging rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from tests.glossary_test import TestGlossaryBase, appTmpDir from pyglossary.glossary import Glossary from pyglossary.core_test import getMockLogger from pyglossary.os_utils import rmtree Glossary.init() class MyStr(str): pass class TestGlossaryErrors(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.mockLog = getMockLogger() def setUp(self): TestGlossaryBase.setUp(self) self.mockLog.clear() def tearDown(self): TestGlossaryBase.tearDown(self) self.assertEqual(0, self.mockLog.printRemainingErrors()) def assertLogCritical(self, errorMsg): self.assertIsNotNone(self.mockLog.popLog( logging.CRITICAL, errorMsg, ), msg=f"did not find critical log {errorMsg!r}") def assertLogError(self, errorMsg): self.assertIsNotNone(self.mockLog.popLog( logging.ERROR, errorMsg, ), msg=f"did not find error log {errorMsg!r}") def assertLogWarning(self, errorMsg): self.assertIsNotNone(self.mockLog.popLog( logging.WARNING, errorMsg, ), msg=f"did not find warning log {errorMsg!r}") def test_loadPlugins_invalidDir(self): Glossary.loadPlugins("/abc/def/ghe") self.assertLogCritical("Invalid plugin directory: '/abc/def/ghe'") def test_loadPlugin_moduleNotFound(self): Glossary.loadPlugin("abc.def.ghe") self.assertLogWarning( "Module 'abc.def' not found, skipping plugin 'abc.def.ghe'" ) def test_detectInputFormat_err1(self): res = Glossary.detectInputFormat( filename="", format="", ) self.assertIsNone(res) self.assertLogCritical("Unable to detect input format!") def test_detectInputFormat_err2(self): res = Glossary.detectInputFormat( filename="test.abcd", format="", ) self.assertIsNone(res) self.assertLogCritical("Unable to detect input format!") def test_detectInputFormat_err3(self): res = Glossary.detectInputFormat( filename="test.sql", format="", ) self.assertIsNone(res) self.assertLogCritical("plugin Sql does not support reading") def test_detectInputFormat_err4(self): res = Glossary.detectInputFormat( filename="test", format="FooBar", ) self.assertIsNone(res) self.assertLogCritical("Invalid format 'FooBar'") def test_detectInputFormat_ok1(self): res = Glossary.detectInputFormat( filename="test1.txt.gz", format="", ) self.assertEqual(res, ("test1.txt.gz", "Tabfile", "")) def test_detectInputFormat_ok2(self): res = Glossary.detectInputFormat( filename="test2.txt.zip", format="", ) self.assertEqual(res, ("test2.txt", "Tabfile", "zip")) def test_detectOutputFormat_err1(self): res = Glossary.detectOutputFormat( filename="", format="", inputFilename="" ) self.assertIsNone(res) self.assertLogCritical("Invalid filename ''") def test_detectOutputFormat_err2(self): res = Glossary.detectOutputFormat( filename="test", format="FooBar", inputFilename="" 
) self.assertIsNone(res) self.assertLogCritical("Invalid format FooBar") def test_detectOutputFormat_err3(self): res = Glossary.detectOutputFormat( filename="", format="", inputFilename="test" ) self.assertIsNone(res) self.assertLogCritical("No filename nor format is given for output file") def test_detectOutputFormat_err4_1(self): res = Glossary.detectOutputFormat( filename="", format="BabylonBgl", inputFilename="test3.txt" ) self.assertIsNone(res) self.assertLogCritical("plugin BabylonBgl does not support writing") def test_detectOutputFormat_err4_2(self): res = Glossary.detectOutputFormat( filename="test.bgl", format="", inputFilename="" ) self.assertIsNone(res) self.assertLogCritical("plugin BabylonBgl does not support writing") def test_detectOutputFormat_err5(self): res = Glossary.detectOutputFormat( filename="test", format="", inputFilename="", ) self.assertIsNone(res) self.assertLogCritical("Unable to detect output format!") def test_detectOutputFormat_err6(self): res = Glossary.detectOutputFormat( filename="test", format="Tabfile", inputFilename="", addExt=True, ) self.assertEqual(res, ("test", "Tabfile", "")) self.assertLogError("inputFilename is empty") def test_init_infoBadType(self): try: Glossary(info=["a"]) except Exception as e: self.assertEqual(str(type(e)), "") self.assertEqual( str(e), "Glossary: `info` has invalid type, dict or OrderedDict expected", ) else: self.fail("did not raise an exception") def test_cleanup_removed(self): glos = Glossary() tmpFname = "test_cleanup_removed" entry = glos.newDataEntry(tmpFname, b"test") tmpFpath = entry._tmpPath self.assertTrue(bool(tmpFpath), msg="entry tmpPath is empty") self.assertTrue(isfile(tmpFpath), msg=f"tmp file does not exist: {tmpFpath}") rmtree(appTmpDir) glos.cleanup() self.assertLogError(f"no such file or directory: {appTmpDir}") def test_lang_err_get_source(self): glos = Glossary() glos.setInfo("sourcelang", "test") self.assertEqual(glos.sourceLangName, "") self.assertLogError("unknown language 'test'") def test_lang_err_get_target(self): glos = Glossary() glos.setInfo("targetlang", "test") self.assertEqual(glos.targetLangName, "") self.assertLogError("unknown language 'test'") def test_lang_err_set_source(self): glos = Glossary() glos.sourceLangName = "foobar" self.assertLogError("unknown language 'foobar'") self.assertEqual(glos.sourceLangName, "") def test_lang_err_set_target(self): glos = Glossary() glos.targetLangName = "foobar" self.assertLogError("unknown language 'foobar'") self.assertEqual(glos.targetLangName, "") def test_lang_err_setObj_source(self): glos = Glossary() try: glos.sourceLang = "foobar" except TypeError as e: self.assertEqual(str(e), "invalid lang=foobar, must be a Lang object") else: self.fail("must raise a TypeError") def test_lang_err_setObj_target(self): glos = Glossary() try: glos.targetLang = "foobar" except TypeError as e: self.assertEqual(str(e), "invalid lang=foobar, must be a Lang object") else: self.fail("must raise a TypeError") def test_config_attr_set_twice(self): glos = Glossary() glos.config = {"lower": True} self.assertEqual(glos.getConfig("lower", False), True) glos.config = {"lower": False} self.assertLogError("glos.config is set more than once") self.assertEqual(glos.getConfig("lower", False), True) def test_iter_empty(self): glos = Glossary() self.assertEqual(list(glos), []) self.assertLogError( "Trying to iterate over a blank Glossary, must call `glos.read` first" ) def test_convert_typeErr_1(self): glos = Glossary() try: glos.convert( inputFilename=MyStr(""), ) except 
TypeError as e: self.assertEqual(str(e), "inputFilename must be str") else: self.fail("must raise TypeError") def test_convert_typeErr_2(self): glos = Glossary() try: glos.convert( inputFilename="", outputFilename=MyStr(""), ) except TypeError as e: self.assertEqual(str(e), "outputFilename must be str") else: self.fail("must raise TypeError") def test_convert_typeErr_3(self): glos = Glossary() try: glos.convert( inputFilename="", outputFilename="", inputFormat=MyStr(""), ) except TypeError as e: self.assertEqual(str(e), "inputFormat must be str") else: self.fail("must raise TypeError") def test_convert_typeErr_4(self): glos = Glossary() try: glos.convert( inputFilename="", outputFilename="", inputFormat="", outputFormat=MyStr(""), ) except TypeError as e: self.assertEqual(str(e), "outputFormat must be str") else: self.fail("must raise TypeError") def test_read_typeErr_1(self): glos = Glossary() try: glos.read( filename=MyStr(""), ) except TypeError as e: self.assertEqual(str(e), "filename must be str") else: self.fail("must raise TypeError") def test_read_typeErr_2(self): glos = Glossary() try: glos.read( filename="", format=MyStr(""), ) except TypeError as e: self.assertEqual(str(e), "format must be str") else: self.fail("must raise TypeError") def test_write_typeErr_1(self): glos = Glossary() try: glos.write( filename=MyStr(""), format="" ) except TypeError as e: self.assertEqual(str(e), "filename must be str") else: self.fail("must raise TypeError") def test_write_typeErr_2(self): glos = Glossary() try: glos.write( filename="", format=MyStr(""), ) except TypeError as e: self.assertEqual(str(e), "format must be str") else: self.fail("must raise TypeError") def test_convert_sameFilename(self): glos = Glossary() res = glos.convert( inputFilename="test4.txt", outputFilename="test4.txt", ) self.assertIsNone(res) self.assertLogCritical("Input and output files are the same") def test_convert_dirExists(self): glos = Glossary() res = glos.convert( inputFilename="test5.txt", outputFilename=self.tempDir, outputFormat="Stardict", ) self.assertIsNone(res) self.assertLogCritical(f"Directory already exists: {self.tempDir}") def test_convert_fileNotFound(self): glos = Glossary() res = glos.convert( inputFilename="/abc/def/test6.txt", outputFilename="test2.txt", ) self.assertIsNone(res) self.assertLogCritical( "[Errno 2] No such file or directory: '/abc/def/test6.txt'" ) self.assertLogCritical("Reading file '/abc/def/test6.txt' failed.") def test_convert_unableDetectOutputFormat(self): glos = Glossary() res = glos.convert( inputFilename="test7.txt", outputFilename="test", outputFormat="", ) self.assertIsNone(res) self.assertLogCritical("Unable to detect output format!") self.assertLogCritical("Writing file 'test' failed.") def test_convert_writeFileNotFound_txt(self): outputFilename = "/test/7de8cf6f17bc4c9abb439e71adbec95d.txt" glos = Glossary() res = glos.convert( inputFilename=self.downloadFile("100-en-fa.txt"), outputFilename=outputFilename, ) self.assertIsNone(res) self.assertLogCritical( f"[Errno 2] No such file or directory: '{outputFilename}'" ) self.assertLogCritical(f"Writing file '{outputFilename}' failed.") def test_convert_writeFileNotFound_hdir(self): outputFilename = "/test/40e20107f5b04087bfc0ec0d61510017.hdir" glos = Glossary() res = glos.convert( inputFilename=self.downloadFile("100-en-fa.txt"), outputFilename=outputFilename, ) self.assertIsNone(res) self.assertLogCritical( f"[Errno 2] No such file or directory: '{outputFilename}'" ) self.assertLogCritical(f"Writing file 
			'{outputFilename}' failed.")

	def test_convert_invalidSortKeyName(self):
		glos = self.glos = Glossary()
		outputFilename = self.newTempFilePath("none.txt")
		res = glos.convert(
			inputFilename=self.downloadFile("100-en-fa.txt"),
			outputFilename=outputFilename,
			sort=True,
			sortKeyName="blah",
		)
		self.assertIsNone(res)
		self.assertLogCritical("invalid sortKeyName = 'blah'")

	def test_collectDefiFormat_direct(self):
		fname = "100-en-fa.txt"
		glos = self.glos = Glossary()
		glos.read(self.downloadFile(fname), direct=True)
		res = glos.collectDefiFormat(10)
		self.assertIsNone(res)
		self.assertLogError("collectDefiFormat: not supported in direct mode")

	def test_sortWords_invalidSortKeyName(self):
		glos = self.glos = Glossary()
		glos.sortWords(
			sortKeyName="blah",
		)
		self.assertLogCritical("invalid sortKeyName = 'blah'")


if __name__ == "__main__":
	unittest.main()


pyglossary-4.5.0/tests/glossary_security_test.py
#!/usr/bin/python3

import sys
import os
from os.path import join, dirname, abspath, isdir, isfile
import unittest
import logging

rootDir = dirname(dirname(abspath(__file__)))
sys.path.insert(0, rootDir)

from tests.glossary_errors_test import TestGlossaryErrors
from tests.glossary_test import dataDir
from pyglossary.glossary import Glossary


class TestGlossarySecurity(TestGlossaryErrors):
	def __init__(self, *args, **kwargs):
		TestGlossaryErrors.__init__(self, *args, **kwargs)
		self.mockLog.setLevel(logging.INFO)

	def test_convert_1(self):
		glos = Glossary()
		res = glos.convert(
			inputFilename="os.system('abcd')",
			outputFilename="os.system('abcd -l')",
		)
		self.assertLogCritical("Unable to detect output format!")
		self.assertLogCritical(
			'Writing file "os.system(\'abcd -l\')" failed.'
		)

	def test_convert_2(self):
		glos = Glossary()
		res = glos.convert(
			inputFilename="os.system('abcd');test.txt",
			outputFilename="os.system('abcd -l')",
		)
		self.assertLogCritical("Unable to detect output format!")
		self.assertLogCritical(
			'Writing file "os.system(\'abcd -l\')" failed.'
		)

	def test_convert_3(self):
		glos = Glossary()
		res = glos.convert(
			inputFilename="os.system('abcd');test.txt",
			outputFilename="os.system('abcd -l');test.csv",
		)
		self.assertLogCritical(
			f'[Errno 2] No such file or directory: '
			f'"{dataDir}/os.system(\'abcd\');test.txt"'
		)
		self.assertLogCritical(
			'Reading file "os.system(\'abcd\');test.txt" failed.'
		)

	def test_convert_4(self):
		glos = Glossary()
		res = glos.convert(
			inputFilename="test.txt\nos.system('abcd')",
			outputFilename="test.csv\nos.system('abcd -l')",
		)
		self.assertLogCritical("Unable to detect output format!")
		self.assertLogCritical(
			'Writing file "test.csv\\nos.system(\'abcd -l\')" failed.'
) pyglossary-4.5.0/tests/glossary_test.py000066400000000000000000000627711417733132500203750ustar00rootroot00000000000000#!/usr/bin/python3 import sys import os from os.path import join, dirname, abspath, isdir, isfile import unittest import tempfile import logging from urllib.request import urlopen import zipfile import random import hashlib import tracemalloc import os rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.glossary import Glossary, log from pyglossary.entry import Entry from pyglossary.core import cacheDir from pyglossary.os_utils import rmtree from pyglossary.text_utils import crc32hex tracemalloc.start() Glossary.init() dataURL = ( "https://raw.githubusercontent.com/" "ilius/pyglossary-test/main/{filename}" ) dataDir = join(cacheDir, "test") appTmpDir = join(cacheDir, "tmp") if not isdir(dataDir): os.makedirs(dataDir) os.chdir(dataDir) class TestGlossaryBase(unittest.TestCase): def __init__(self, *args, **kwargs): unittest.TestCase.__init__(self, *args, **kwargs) self.maxDiff = None self.dataFileCRC32 = { "004-bar.txt": "6775e590", "004-bar-sort.txt": "fe861123", "006-empty.txt": "07ff224b", "006-empty-filtered.txt": "2b3c1c0f", "100-en-de.txt": "f22fc392", "100-en-de-v2.txt": "70eff46c", "100-en-fa.txt": "f5c53133", "100-ja-en.txt": "93542e89", "100-en-de.info": "718adeef", "100-en-fa.info": "9bddb7bb", "100-ja-en.info": "8cf5403c", "300-rand-en-fa.txt": "586617c8", "res/stardict.png": "7e1447fa", "res/test.json": "41f8cf31", } # The setUp() and tearDown() methods allow you to define instructions that # will be executed before and after each test method. def setUp(self): self.glos = None self.tempDir = tempfile.mkdtemp(dir=dataDir) def tearDown(self): if self.glos is not None: self.glos.cleanup() self.glos.clear() if os.getenv("NO_CLEANUP"): return for direc in [ self.tempDir, appTmpDir, ]: if isdir(direc): rmtree(direc) def downloadFile(self, filename): _crc32 = self.dataFileCRC32[filename] fpath = join(dataDir, filename.replace("/", "__")) if isfile(fpath): with open(fpath, mode="rb") as _file: data = _file.read() if crc32hex(data) != _crc32: raise RuntimeError(f"CRC32 check failed for existing file: {fpath}") return fpath try: with urlopen(dataURL.format(filename=filename)) as res: data = res.read() except Exception as e: e.msg += f", filename={filename}" raise e if crc32hex(data) != _crc32: raise RuntimeError(f"CRC32 check failed for downloaded file: {filename}") with open(fpath, mode="wb") as _file: _file.write(data) return fpath def newTempFilePath(self, filename): fpath = join(self.tempDir, filename) if isfile(fpath): os.remove(fpath) return fpath def compareTextFiles(self, fpath1, fpath2): self.assertTrue(isfile(fpath1)) self.assertTrue(isfile(fpath2)) with open(fpath1) as file1: text1 = file1.read().rstrip("\n") with open(fpath2) as file2: text2 = file2.read().rstrip("\n") self.assertEqual( len(text1), len(text2), msg=f"{fpath1} differs from {fpath2}", ) self.assertEqual( text1, text2, msg=f"{fpath1} differs from {fpath2}", ) def compareBinaryFiles(self, fpath1, fpath2): self.assertTrue(isfile(fpath1), f"File {fpath1} does not exist") self.assertTrue(isfile(fpath2), f"File {fpath2} does not exist") with open(fpath1, mode="rb") as file1: data1 = file1.read() with open(fpath2, mode="rb") as file2: data2 = file2.read() self.assertEqual(len(data1), len(data2), msg=f"{fpath1}") self.assertTrue( data1 == data2, msg=f"{fpath1} differs from {fpath2}", ) def compareZipFiles( self, fpath1, fpath2, dataReplaceFuncs: "Dict[str: 
Callable", ): zf1 = zipfile.ZipFile(fpath1) zf2 = zipfile.ZipFile(fpath2) pathList1 = zf1.namelist() pathList2 = zf1.namelist() self.assertEqual(pathList1, pathList2) for zfpath in pathList1: data1 = zf1.read(zfpath) data2 = zf2.read(zfpath) func = dataReplaceFuncs.get(zfpath) if func is not None: data1 = func(data1) data2 = func(data2) self.assertEqual(len(data1), len(data2), msg=f"zfpath={zfpath!r}") self.assertTrue( data1 == data2, msg=f"zfpath={zfpath!r}", ) def checkZipFileSha1sum( self, fpath, sha1sumDict: "Dict[str, str]", dataReplaceFuncs: "Optional[Dict[str, Callable]]" = None, ): if dataReplaceFuncs is None: dataReplaceFuncs = {} zf = zipfile.ZipFile(fpath) pathList = zf.namelist() for zfpath in pathList: expectedSha1 = sha1sumDict[zfpath] data = zf.read(zfpath) func = dataReplaceFuncs.get(zfpath) if func is not None: data = func(data) actualSha1 = hashlib.sha1(data).hexdigest() self.assertEqual(actualSha1, expectedSha1, msg=f"file: {zfpath}") def convert( self, fname, # input file with extension fname2, # output file with extension testId="tmp", compareText="", compareBinary="", sha1sum=None, md5sum=None, config=None, **convertArgs, ): inputFilename = self.downloadFile(fname) outputFilename = self.newTempFilePath(fname2) glos = self.glos = Glossary() if config is not None: glos.config = config res = glos.convert( inputFilename=inputFilename, outputFilename=outputFilename, **convertArgs ) self.assertEqual(outputFilename, res) if compareText: self.compareTextFiles(outputFilename, self.downloadFile(compareText)) elif compareBinary: self.compareBinaryFiles(outputFilename, self.downloadFile(compareBinary)) elif sha1sum: with open(outputFilename, mode="rb") as _file: actualSha1 = hashlib.sha1(_file.read()).hexdigest() self.assertEqual(actualSha1, sha1sum) elif md5sum: with open(outputFilename, mode="rb") as _file: actualMd5 = hashlib.md5(_file.read()).hexdigest() self.assertEqual(actualMd5, md5sum) class TestGlossary(TestGlossaryBase): def __init__(self, *args, **kwargs): TestGlossaryBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update({ "100-en-fa-sort.txt": "d7a82dc8", "100-en-fa-sort-headword.txt": "4067a29f", "100-en-fa-sort-ebook.txt": "aa620d07", "100-en-fa-sort-ebook3.txt": "5a20f140", "100-en-fa-lower.txt": "62178940", "100-en-fa-remove_html_all.txt": "d611c978", "100-en-fa-rtl.txt": "25ede1e8", "100-en-de-remove_font_b.txt": "727320ac", "300-rand-en-fa-sort-headword-w1256.txt": "06d83bac", "300-rand-en-fa-sort-headword.txt": "df0f8020", "300-rand-en-fa-sort-w1256.txt": "9594aab3", }) def setUp(self): TestGlossaryBase.setUp(self) self.prevLogLevel = log.level log.setLevel(logging.ERROR) def tearDown(self): TestGlossaryBase.tearDown(self) log.setLevel(self.prevLogLevel) def test__str__1(self): glos = self.glos = Glossary() self.assertEqual(str(glos), "Glossary{filename: '', name: None}") def test__str__2(self): glos = self.glos = Glossary() glos._filename = "test.txt" self.assertEqual(str(glos), "Glossary{filename: 'test.txt', name: None}") def test__str__3(self): glos = self.glos = Glossary() glos.setInfo("title", "Test Title") self.assertEqual( str(glos), "Glossary{filename: '', name: 'Test Title'}", ) def test__str__4(self): glos = self.glos = Glossary() glos._filename = "test.txt" glos.setInfo("title", "Test Title") self.assertEqual( str(glos), "Glossary{filename: 'test.txt', name: 'Test Title'}", ) def test_info_1(self): glos = self.glos = Glossary() glos.setInfo("test", "ABC") self.assertEqual(glos.getInfo("test"), "ABC") def test_info_2(self): glos = self.glos 
= Glossary() glos.setInfo("bookname", "Test Glossary") self.assertEqual(glos.getInfo("title"), "Test Glossary") def test_info_3(self): glos = self.glos = Glossary() glos.setInfo("bookname", "Test Glossary") glos.setInfo("title", "Test 2") self.assertEqual(glos.getInfo("name"), "Test 2") self.assertEqual(glos.getInfo("bookname"), "Test 2") self.assertEqual(glos.getInfo("title"), "Test 2") def test_info_4(self): glos = self.glos = Glossary() glos.setInfo("test", 123) self.assertEqual(glos.getInfo("test"), "123") def test_info_del_1(self): glos = self.glos = Glossary() glos.setInfo("test", "abc") self.assertEqual(glos.getInfo("test"), "abc") glos.setInfo("test", None) self.assertEqual(glos.getInfo("test"), "") def test_info_del_2(self): glos = self.glos = Glossary() glos.setInfo("test", None) self.assertEqual(glos.getInfo("test"), "") def test_setInfo_err1(self): glos = self.glos = Glossary() try: glos.setInfo(1, "a") except TypeError as e: self.assertEqual(str(e), "invalid key=1, must be str") else: self.fail("must raise a TypeError") def test_getInfo_err1(self): glos = self.glos = Glossary() try: glos.getInfo(1) except TypeError as e: self.assertEqual(str(e), "invalid key=1, must be str") else: self.fail("must raise a TypeError") def test_getExtraInfos_1(self): glos = self.glos = Glossary() glos.setInfo("a", "test 1") glos.setInfo("b", "test 2") glos.setInfo("c", "test 3") glos.setInfo("d", "test 4") glos.setInfo("name", "my name") self.assertEqual( glos.getExtraInfos(["b", "c", "title"]), {"a": "test 1", "d": "test 4"}, ) def test_infoKeys_1(self): glos = self.glos = Glossary() glos.setInfo("a", "test 1") glos.setInfo("b", "test 2") glos.setInfo("name", "test name") glos.setInfo("title", "test title") self.assertEqual( glos.infoKeys(), ["a", "b", "name"], ) def test_config_attr_get(self): glos = self.glos = Glossary() try: glos.config except NotImplementedError: pass else: self.fail("must raise NotImplementedError") def test_config_attr_set(self): glos = self.glos = Glossary() glos.config = {"lower": True} self.assertEqual(glos.getConfig("lower", False), True) def test_read_txt_1(self): inputFilename = self.downloadFile("100-en-fa.txt") glos = self.glos = Glossary() res = glos.read(filename=inputFilename) self.assertTrue(res) self.assertEqual(glos.sourceLangName, "English") self.assertEqual(glos.targetLangName, "Persian") self.assertIn("Sample: ", glos.getInfo("name")) self.assertEqual(len(glos), 100) def test_read_txt_direct_1(self): inputFilename = self.downloadFile("100-en-fa.txt") glos = self.glos = Glossary() res = glos.read(filename=inputFilename, direct=True) self.assertTrue(res) self.assertEqual(glos.sourceLangName, "English") self.assertEqual(glos.targetLangName, "Persian") self.assertIn("Sample: ", glos.getInfo("name")) self.assertEqual(len(glos), 0) def test_init_infoDict(self): glos = self.glos = Glossary(info={"a": "b"}) self.assertEqual(list(glos.iterInfo()), [('a', 'b')]) def test_init_infoOrderedDict(self): from collections import OrderedDict glos = self.glos = Glossary(info=OrderedDict([ ("y", "z"), ("a", "b"), ("1", "2"), ])) self.assertEqual(list(glos.iterInfo()), [('y', 'z'), ('a', 'b'), ('1', '2')]) def test_lang_1(self): glos = self.glos = Glossary() self.assertEqual(glos.sourceLangName, "") self.assertEqual(glos.targetLangName, "") glos.sourceLangName = "ru" glos.targetLangName = "de" self.assertEqual(glos.sourceLangName, "Russian") self.assertEqual(glos.targetLangName, "German") def test_lang_get_source(self): glos = self.glos = Glossary() glos.setInfo("sourcelang", 
"farsi") self.assertEqual(glos.sourceLangName, "Persian") def test_lang_get_target(self): glos = self.glos = Glossary() glos.setInfo("targetlang", "malay") self.assertEqual(glos.targetLangName, "Malay") def test_lang_set_source(self): glos = self.glos = Glossary() glos.sourceLangName = "en" self.assertEqual(glos.sourceLangName, "English") def test_lang_set_source_empty(self): glos = self.glos = Glossary() glos.sourceLangName = "" self.assertEqual(glos.sourceLangName, "") def test_lang_set_target(self): glos = self.glos = Glossary() glos.targetLangName = "fa" self.assertEqual(glos.targetLangName, "Persian") def test_lang_set_target_empty(self): glos = self.glos = Glossary() glos.targetLangName = "" self.assertEqual(glos.targetLangName, "") def test_lang_getObj_source(self): glos = self.glos = Glossary() glos.setInfo("sourcelang", "farsi") self.assertEqual(glos.sourceLang.name, "Persian") def test_lang_getObj_target(self): glos = self.glos = Glossary() glos.setInfo("targetlang", "malay") self.assertEqual(glos.targetLang.name, "Malay") def test_lang_detect_1(self): glos = self.glos = Glossary() glos.setInfo("name", "en-fa") glos.detectLangsFromName() self.assertEqual( (glos.sourceLangName, glos.targetLangName), ("English", "Persian"), ) def test_lang_detect_2(self): glos = self.glos = Glossary() glos.setInfo("name", "test-en-fa") glos.detectLangsFromName() self.assertEqual( (glos.sourceLangName, glos.targetLangName), ("English", "Persian"), ) def test_lang_detect_3(self): glos = self.glos = Glossary() glos.setInfo("name", "eng to per") glos.detectLangsFromName() self.assertEqual( (glos.sourceLangName, glos.targetLangName), ("English", "Persian"), ) def test_lang_detect_4(self): glos = self.glos = Glossary() glos.setInfo("name", "Test english to farsi") glos.detectLangsFromName() self.assertEqual( (glos.sourceLangName, glos.targetLangName), ("English", "Persian"), ) def test_lang_detect_5(self): glos = self.glos = Glossary() glos.setInfo("name", "freedict-eng-deu.index") glos.detectLangsFromName() self.assertEqual( (glos.sourceLangName, glos.targetLangName), ("English", "German"), ) def convert_txt_txt( self, fname, # input txt file without extension fname2, # expected output txt file without extension testId="tmp", config=None, **convertArgs, ): self.convert( f"{fname}.txt", f"{fname2}-{testId}.txt", compareText=f"{fname2}.txt", testId=testId, config=config, **convertArgs, ) def convert_to_txtZip( self, fname, # input file with extension fname2, # expected output file without extensions testId="tmp", config=None, **convertArgs, ): inputFilename = self.downloadFile(fname) outputTxtName = f"{fname2}-{testId}.txt" outputFilename = self.newTempFilePath(f"{outputTxtName}.zip") expectedFilename = self.downloadFile(f"{fname2}.txt") glos = self.glos = Glossary() if config is not None: glos.config = config res = glos.convert( inputFilename=inputFilename, outputFilename=outputFilename, **convertArgs ) self.assertEqual(outputFilename, res) zf = zipfile.ZipFile(outputFilename) self.assertTrue( outputTxtName in zf.namelist(), msg=f"{outputTxtName} not in {zf.namelist()}", ) with open(expectedFilename, encoding="utf-8") as expectedFile: expectedText = expectedFile.read() actualText = zf.read(outputTxtName).decode("utf-8") self.assertEqual(len(actualText), len(expectedText)) self.assertEqual(actualText, expectedText) def test_txt_txtZip_1(self): self.convert_to_txtZip( "100-en-fa.txt", "100-en-fa", testId="txt_txtZip_1", infoOverride={"input_file_size": None}, ) def test_sort_1(self): self.convert_txt_txt( 
"100-en-fa", "100-en-fa-sort", testId="sort_1", sort=True, ) def test_sort_2(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-sort", testId="sort_2", sort=True, sortKeyName="headword_lower", ) def test_sort_3(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-sort-headword", testId="sort_3", sort=True, sortKeyName="headword", ) def test_sort_4(self): self.convert_txt_txt( "300-rand-en-fa", "300-rand-en-fa-sort-headword", testId="sort_4", sort=True, sortKeyName="headword", ) def test_sort_5(self): self.convert_txt_txt( "300-rand-en-fa", "300-rand-en-fa-sort-headword-w1256", testId="sort_5", sort=True, sortKeyName="headword", sortEncoding="windows-1256", ) def test_sort_6(self): self.convert_txt_txt( "300-rand-en-fa", "300-rand-en-fa-sort-w1256", testId="sort_6", sort=True, sortKeyName="headword_lower", sortEncoding="windows-1256", ) def test_sort_7(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-sort-ebook", testId="sort_7", sort=True, sortKeyName="ebook", ) def test_sort_8(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-sort-ebook3", testId="sort_8", sort=True, sortKeyName="ebook_length3", ) def test_lower_1(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-lower", testId="lower_1", config={"lower": True}, ) def test_rtl_1(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-rtl", testId="rtl_1", config={"rtl": True}, ) def test_remove_html_all_1(self): self.convert_txt_txt( "100-en-fa", "100-en-fa-remove_html_all", testId="remove_html_all_1", config={"remove_html_all": True}, ) def test_remove_html_1(self): self.convert_txt_txt( "100-en-de", "100-en-de-remove_font_b", testId="remove_html_1", config={"remove_html": "font,b"}, ) def test_save_info_json(self): fname = "100-en-fa" testId = "save_info_json" infoPath = self.newTempFilePath(f"{fname}-{testId}.info") self.convert_txt_txt( fname, fname, testId=testId, config={"save_info_json": True}, infoOverride={"input_file_size": None}, ) self.compareTextFiles( infoPath, self.downloadFile(f"{fname}.info"), ) def test_convert_sqlite_direct_error(self): glos = self.glos = Glossary() try: res = glos.convert( inputFilename="foo.txt", outputFilename="bar.txt", direct=True, sqlite=True, ) except ValueError as e: self.assertEqual(str(e), "Conflictng arguments: direct=True, sqlite=True") else: self.fail("must raise a ValueError") def test_txt_txt_bar(self): for direct in (None, False, True): self.convert_txt_txt( "004-bar", "004-bar", testId="bar", direct=direct, infoOverride={ "name": None, "input_file_size": None, }, ) def test_txt_txt_bar_sort(self): for sqlite in (None, False, True): self.convert_txt_txt( "004-bar", "004-bar-sort", testId="bar_sort", sort=True, sqlite=sqlite, ) def test_txt_txt_empty_filtered(self): for direct in (None, False, True): self.convert_txt_txt( "006-empty", "006-empty-filtered", testId="empty_filtered", direct=direct, ) def test_txt_txt_empty_filtered_sqlite(self): for sqlite in (None, False, True): self.convert_txt_txt( "006-empty", "006-empty-filtered", testId="empty_filtered_sqlite", sqlite=sqlite, ) def test_dataEntry_save(self): glos = self.glos = Glossary() tmpFname = "test_dataEntry_save" entry = glos.newDataEntry(tmpFname, b"test") saveFpath = entry.save(self.tempDir) self.assertTrue( isfile(saveFpath), msg=f"saved file does not exist: {saveFpath}", ) def test_dataEntry_getFileName(self): glos = self.glos = Glossary() tmpFname = "test_dataEntry_getFileName" entry = glos.newDataEntry(tmpFname, b"test") self.assertEqual(entry.getFileName(), tmpFname) def test_cleanup_noFile(self): glos = 
self.glos = Glossary() glos.cleanup() def test_cleanup_cleanup(self): glos = self.glos = Glossary() tmpFname = "test_cleanup_cleanup" entry = glos.newDataEntry(tmpFname, b"test") tmpFpath = entry._tmpPath self.assertTrue(bool(tmpFpath), msg="entry tmpPath is empty") self.assertTrue( isfile(tmpFpath), msg=f"tmp file does not exist: {tmpFpath}", ) glos.cleanup() self.assertTrue( not isfile(tmpFpath), msg=f"tmp file still exists: {tmpFpath}", ) def test_cleanup_noCleanup(self): glos = self.glos = Glossary() tmpFname = "test_cleanup_noCleanup" entry = glos.newDataEntry(tmpFname, b"test") tmpFpath = entry._tmpPath self.assertTrue(bool(tmpFpath), msg="entry tmpPath is empty") self.assertTrue(isfile(tmpFpath), msg=f"tmp file does not exist: {tmpFpath}") glos.config = {"cleanup": False} glos.cleanup() self.assertTrue(isfile(tmpFpath), msg=f"tmp file does not exist: {tmpFpath}") def test_rawEntryCompress(self): glos = self.glos = Glossary() glos.setRawEntryCompress(True) self.assertTrue(glos.rawEntryCompress) glos.setRawEntryCompress(False) self.assertFalse(glos.rawEntryCompress) def addWordsList(self, glos, words, newDefiFunc=str, defiFormat=""): wordsList = [] for index, line in enumerate(words): words = line.rstrip().split("|") wordsList.append(words) glos.addEntryObj(glos.newEntry( words, newDefiFunc(index), defiFormat=defiFormat, )) glos.updateIter() return wordsList def addWords(self, glos, wordsStr, **kwargs): return self.addWordsList(glos, wordsStr.split("\n"), **kwargs) tenWordsStr = """comedic tubenose organosol adipocere gid next friend bitter apple caca|ca-ca darkling beetle japonica""" tenWordsStr2 = """comedic Tubenose organosol Adipocere gid Next friend bitter apple Caca|ca-ca darkling beetle Japonica""" tenWordsStrFa = "بیمارانه\nگالوانومتر\nنقاهت\nرشکمندی\nناکاستنی\nشگفتآفرینی\nچندپاری\nنامبارکی\nآماسش\nانگیزنده" def test_addEntries_1(self): glos = self.glos = Glossary() wordsList = self.addWords( glos, self.tenWordsStr, newDefiFunc=lambda i: str(random.randint(0, 10000)), ) self.assertEqual(wordsList, [entry.l_word for entry in glos]) def test_addEntries_2(self): # entry filters don't apply to loaded entries (added with addEntryObj) glos = self.glos = Glossary() glos.addEntryObj(glos.newEntry(["a"], "test 1")) glos.addEntryObj(glos.newEntry([""], "test 2")) glos.addEntryObj(glos.newEntry(["b"], "test 3")) glos.addEntryObj(glos.newEntry([], "test 4")) glos.updateEntryFilters() glos.updateIter() self.assertEqual( [['a'], [''], ['b'], []], [entry.l_word for entry in glos], ) def test_sortWords_1(self): glos = self.glos = Glossary() wordsList = self.addWords( glos, self.tenWordsStr, newDefiFunc=lambda i: str(random.randint(0, 10000)), ) self.assertEqual(wordsList, [entry.l_word for entry in glos]) glos.sortWords() self.assertEqual(sorted(wordsList), [entry.l_word for entry in glos]) def test_sortWords_2(self): glos = self.glos = Glossary() wordsList = self.addWords( glos, self.tenWordsStr2, newDefiFunc=lambda i: str(random.randint(0, 10000)), ) self.assertEqual(wordsList, [entry.l_word for entry in glos]) glos.sortWords(sortKeyName="headword") self.assertEqual( [entry.l_word for entry in glos], [ ['Adipocere'], ['Caca', 'ca-ca'], ['Japonica'], ['Next friend'], ['Tubenose'], ['bitter apple'], ['comedic'], ['darkling beetle'], ['gid'], ['organosol'], ], ) def test_sortWords_3(self): glos = self.glos = Glossary() wordsList = self.addWords( glos, self.tenWordsStrFa, newDefiFunc=lambda i: str(random.randint(0, 10000)), ) self.assertEqual(wordsList, [entry.l_word for entry in glos]) 
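# Note on the expected order below: the "headword" sort key (unlike the
# default "headword_lower") compares the encoded headword bytes, so with
# the default utf-8 sortEncoding the Persian entries come out in Unicode
# code point order. A minimal sketch of the idea, assuming a hypothetical
# helper (not the actual pyglossary internals):
#     sorted(["نقاهت", "آماسش"], key=lambda w: w.encode("utf-8"))
#     # -> ["آماسش", "نقاهت"], since U+0622 sorts before U+0646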
glos.sortWords(sortKeyName="headword") ls1 = ['آماسش', 'انگیزنده', 'بیمارانه', 'رشکمندی', 'شگفتآفرینی'] ls2 = ['نامبارکی', 'ناکاستنی', 'نقاهت', 'چندپاری', 'گالوانومتر'] self.assertEqual( [entry.s_word for entry in glos], ls1 + ls2, ) def test_sortWords_4(self): glos = self.glos = Glossary() wordsList = self.addWords( glos, self.tenWordsStrFa, newDefiFunc=lambda i: str(random.randint(0, 10000)), ) self.assertEqual(wordsList, [entry.l_word for entry in glos]) glos.sortWords( sortKeyName="headword", sortEncoding="windows-1256", ) ls1 = ['چندپاری', 'گالوانومتر', 'آماسش', 'انگیزنده', 'بیمارانه'] ls2 = ['رشکمندی', 'شگفتآفرینی', 'ناکاستنی', 'نامبارکی', 'نقاهت'] self.assertEqual( [entry.s_word for entry in glos], ls1 + ls2, ) def test_sortWords_5(self): glos = self.glos = Glossary() alphabetW1256 = "ءآأئابتثجحخدذرزسشصضطظعغـفقكلمنهوىي" alphabetW1256_shuf = "مفزنصـذرخوآظسقلدغطيعحءأتىئاجهضثشكب" wordsList = self.addWordsList( glos, list(alphabetW1256_shuf), newDefiFunc=lambda i: str(random.randint(0, 10000)), ) self.assertEqual(wordsList, [entry.l_word for entry in glos]) glos.sortWords( sortKeyName="headword", sortEncoding="windows-1256", ) self.assertEqual( [entry.s_word for entry in glos], list(alphabetW1256), ) def test_sortWords_exc_1(self): fname = "100-en-fa.txt" glos = self.glos = Glossary() glos.read(self.downloadFile(fname), direct=True) try: glos.sortWords() except NotImplementedError as e: self.assertEqual(str(e), "can not use sortWords in direct mode") else: self.fail("must raise NotImplementedError") def test_read_filename(self): glos = self.glos = Glossary() glos.read(self.downloadFile("004-bar.txt")) self.assertEqual(glos.filename, join(dataDir, "004-bar")) def test_wordTitleStr_em1(self): glos = self.glos = Glossary() self.assertEqual(glos.wordTitleStr(""), "") def test_wordTitleStr_em2(self): glos = self.glos = Glossary() glos._defiHasWordTitle = True self.assertEqual(glos.wordTitleStr("test1"), "") def test_wordTitleStr_b1(self): glos = self.glos = Glossary() self.assertEqual(glos.wordTitleStr("test1"), "<b>test1</b><br>") def test_wordTitleStr_b2(self): glos = self.glos = Glossary() self.assertEqual( glos.wordTitleStr("test1", _class="headword"), '<b class="headword">test1</b><br>', ) def test_wordTitleStr_cjk1(self): glos = self.glos = Glossary() self.assertEqual( glos.wordTitleStr("test1", sample="くりかえし"), "<big>test1</big><br>", ) def test_wordTitleStr_cjk2(self): glos = self.glos = Glossary() self.assertEqual( glos.wordTitleStr("くりかえし"), "<big>くりかえし</big><br>
              ", ) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/gregorian_test.py000066400000000000000000000152511417733132500204760ustar00rootroot00000000000000#!/usr/bin/env python3 import unittest import sys from os.path import dirname, abspath rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary import gregorian def getMonthLen(y: int, m: int) -> int: if m == 12: return gregorian.to_jd(y + 1, 1, 1) - gregorian.to_jd(y, 12, 1) else: return gregorian.to_jd(y, m + 1, 1) - gregorian.to_jd(y, m, 1) class Testgregorian(unittest.TestCase): def notest_isLeap_negativeYear(self): print() isLeapFunc = gregorian.isLeap for year in range(10, -101, -1): isLeap = isLeapFunc(year) # print(f"{str(year).center(10)} {'L' if isLeap1 else ' '}") print(f"{year}: \"{'L' if isLeap else ' '}\",") # year -> f"{'L' if isLeap33 else ' '}{'L' if isLeap2820 else ' '}" isLeapDict = { -50: " ", -49: " ", -48: "L", -47: " ", -46: " ", -45: " ", -44: "L", -43: " ", -42: " ", -41: " ", -40: "L", -39: " ", -38: " ", -37: " ", -36: "L", -35: " ", -34: " ", -33: " ", -32: "L", -31: " ", -30: " ", -29: " ", -28: "L", -27: " ", -26: " ", -25: " ", -24: "L", -23: " ", -22: " ", -21: " ", -20: "L", -19: " ", -18: " ", -17: " ", -16: "L", -15: " ", -14: " ", -13: " ", -12: "L", -11: " ", -10: " ", -9: " ", -8: "L", -7: " ", -6: " ", -5: " ", -4: "L", -3: " ", -2: " ", -1: " ", 0: "L", 1: " ", 2: " ", 3: " ", 4: "L", 5: " ", 6: " ", 7: " ", 8: "L", 9: " ", 10: " ", 11: " ", 12: "L", 13: " ", 14: " ", 15: " ", 16: "L", 17: " ", 18: " ", 19: " ", 20: "L", 21: " ", 22: " ", 23: " ", 24: "L", 25: " ", 26: " ", 27: " ", 28: "L", 29: " ", 30: " ", 31: " ", 32: "L", 33: " ", 34: " ", 35: " ", 36: "L", 37: " ", 38: " ", 39: " ", 40: "L", 41: " ", 42: " ", 43: " ", 44: "L", 45: " ", 46: " ", 47: " ", 48: "L", 49: " ", 50: " ", 1990: " ", 1991: " ", 1992: "L", 1993: " ", 1994: " ", 1995: " ", 1996: "L", 1997: " ", 1998: " ", 1999: " ", 2000: "L", 2001: " ", 2002: " ", 2003: " ", 2004: "L", 2005: " ", 2006: " ", 2007: " ", 2008: "L", 2009: " ", 2010: " ", 2011: " ", 2012: "L", 2013: " ", 2014: " ", 2015: " ", 2016: "L", 2017: " ", 2018: " ", 2019: " ", 2020: "L", 2021: " ", 2022: " ", 2023: " ", 2024: "L", 2025: " ", 2026: " ", 2027: " ", 2028: "L", 2029: " ", } dateToJdDict = { (-50, 1, 1): 1702798, (-49, 1, 1): 1703163, (-48, 1, 1): 1703528, (-47, 1, 1): 1703894, (-46, 1, 1): 1704259, (-45, 1, 1): 1704624, (-44, 1, 1): 1704989, (-43, 1, 1): 1705355, (-42, 1, 1): 1705720, (-41, 1, 1): 1706085, (-40, 1, 1): 1706450, (-39, 1, 1): 1706816, (-38, 1, 1): 1707181, (-37, 1, 1): 1707546, (-36, 1, 1): 1707911, (-35, 1, 1): 1708277, (-34, 1, 1): 1708642, (-33, 1, 1): 1709007, (-32, 1, 1): 1709372, (-31, 1, 1): 1709738, (-30, 1, 1): 1710103, (-29, 1, 1): 1710468, (-28, 1, 1): 1710833, (-27, 1, 1): 1711199, (-26, 1, 1): 1711564, (-25, 1, 1): 1711929, (-24, 1, 1): 1712294, (-23, 1, 1): 1712660, (-22, 1, 1): 1713025, (-21, 1, 1): 1713390, (-20, 1, 1): 1713755, (-19, 1, 1): 1714121, (-18, 1, 1): 1714486, (-17, 1, 1): 1714851, (-16, 1, 1): 1715216, (-15, 1, 1): 1715582, (-14, 1, 1): 1715947, (-13, 1, 1): 1716312, (-12, 1, 1): 1716677, (-11, 1, 1): 1717043, (-10, 1, 1): 1717408, (-9, 1, 1): 1717773, (-8, 1, 1): 1718138, (-7, 1, 1): 1718504, (-6, 1, 1): 1718869, (-5, 1, 1): 1719234, (-4, 1, 1): 1719599, (-3, 1, 1): 1719965, (-2, 1, 1): 1720330, (-1, 1, 1): 1720695, (0, 1, 1): 1721060, (1, 1, 1): 1721426, (2, 1, 1): 1721791, (3, 1, 1): 1722156, (4, 1, 1): 1722521, (5, 1, 1): 1722887, (6, 1, 1): 
1723252, (7, 1, 1): 1723617, (8, 1, 1): 1723982, (9, 1, 1): 1724348, (10, 1, 1): 1724713, (11, 1, 1): 1725078, (12, 1, 1): 1725443, (13, 1, 1): 1725809, (14, 1, 1): 1726174, (15, 1, 1): 1726539, (16, 1, 1): 1726904, (17, 1, 1): 1727270, (18, 1, 1): 1727635, (19, 1, 1): 1728000, (20, 1, 1): 1728365, (21, 1, 1): 1728731, (22, 1, 1): 1729096, (23, 1, 1): 1729461, (24, 1, 1): 1729826, (25, 1, 1): 1730192, (26, 1, 1): 1730557, (27, 1, 1): 1730922, (28, 1, 1): 1731287, (29, 1, 1): 1731653, (30, 1, 1): 1732018, (31, 1, 1): 1732383, (32, 1, 1): 1732748, (33, 1, 1): 1733114, (34, 1, 1): 1733479, (35, 1, 1): 1733844, (36, 1, 1): 1734209, (37, 1, 1): 1734575, (38, 1, 1): 1734940, (39, 1, 1): 1735305, (40, 1, 1): 1735670, (41, 1, 1): 1736036, (42, 1, 1): 1736401, (43, 1, 1): 1736766, (44, 1, 1): 1737131, (45, 1, 1): 1737497, (46, 1, 1): 1737862, (47, 1, 1): 1738227, (48, 1, 1): 1738592, (49, 1, 1): 1738958, (50, 1, 1): 1739323, (2015, 1, 1): 2457024, (2015, 2, 1): 2457055, (2015, 3, 1): 2457083, (2015, 4, 1): 2457114, (2015, 5, 1): 2457144, (2015, 6, 1): 2457175, (2015, 7, 1): 2457205, (2015, 8, 1): 2457236, (2015, 9, 1): 2457267, (2015, 10, 1): 2457297, (2015, 11, 1): 2457328, (2015, 12, 1): 2457358, (2016, 1, 1): 2457389, (2016, 2, 1): 2457420, (2016, 3, 1): 2457449, (2016, 4, 1): 2457480, (2016, 5, 1): 2457510, (2016, 6, 1): 2457541, (2016, 7, 1): 2457571, (2016, 8, 1): 2457602, (2016, 9, 1): 2457633, (2016, 10, 1): 2457663, (2016, 11, 1): 2457694, (2016, 12, 1): 2457724, (2017, 1, 1): 2457755, (2017, 2, 1): 2457786, (2017, 3, 1): 2457814, (2017, 4, 1): 2457845, (2017, 5, 1): 2457875, (2017, 6, 1): 2457906, (2017, 7, 1): 2457936, (2017, 8, 1): 2457967, (2017, 9, 1): 2457998, (2017, 10, 1): 2458028, (2017, 11, 1): 2458059, (2017, 12, 1): 2458089, } def test_isLeap(self): for year, isLeapStr in self.isLeapDict.items(): isLeap = isLeapStr == "L" isLeapActual = gregorian.isLeap(year) self.assertEqual( isLeapActual, isLeap, f"year={year}, isLeap={isLeap}, isLeapActual={isLeapActual}", ) def test_to_jd(self): for date, jd in self.dateToJdDict.items(): jdActual = gregorian.to_jd(*date) self.assertEqual( jdActual, jd, f"date={date}, jd={jd}, jdActual={jdActual}", ) def test_convert(self): startYear = 1950 endYear = 2050 for year in range(startYear, endYear): for month in range(1, 13): monthLen = getMonthLen(year, month) for day in range(1, monthLen + 1): date = (year, month, day) jd = gregorian.to_jd(*date) ndate = gregorian.jd_to(jd) self.assertEqual( ndate, date, f"jd={jd}, date={date}, ndate={ndate}", ) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/html_utils_test.py000066400000000000000000000037131417733132500207050ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- from os.path import join, dirname, abspath import sys import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.html_utils import * class UnescapeUnicodeTest(unittest.TestCase): def case(self, text, expected): actual = unescape_unicode(text) self.assertEqual(actual, expected) def test(self): self.case("&lt;", "&lt;") self.case("&gt;", "&gt;") self.case("&amp;", "&amp;") self.case("&quot;", "&quot;") self.case("&#x27;", "&#x27;") self.case("&#160;", "&#160;") self.case("&nbsp;", "&nbsp;") self.case("&lt;&aacute;&gt;", "&lt;á&gt;") self.case("/w&#601;&#720;ki&#331;ti&#720;m/", "/wəːkiŋtiːm/") # Babylon dictionaries contain a lot of non-standard entity # references, for example: csdot, fllig, nsm, cancer, thlig, # tsdot, upslur...
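# A minimal sketch of how an unescape_unicode-style function can keep
# "&lt;", "&gt;", "&amp;" and friends intact while unescaping everything
# else (illustrative only, not the actual pyglossary.html_utils code):
#     import re
#     from html import unescape
#     def unescape_unicode_sketch(text):
#         def sub(m):
#             res = unescape(m.group(0))
#             return m.group(0) if res in '<>&"\'\xa0' else res
#         return re.sub(r"&#?[0-9a-zA-Z]+;", sub, text)
# The non-standard Babylon entities below would additionally need a
# lookup table, e.g. {"etilde": "ẽ", "ldash": "–"}, tried before the
# standard unescape.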
self.case("<&etilde;", "<ẽ") self.case("<⅓", "<⅓") self.case("<⅔", "<⅔") self.case("<ĩ", "<ĩ") self.case("<&ldash;", "<–") self.case("<ů", "<ů") self.case("<ũ", "<ũ") self.case("<&wring;", "<ẘ") self.case("<&xfrac13;", "<⅓") self.case("<ŷ", "<ŷ") self.case("<&ygrave;", "<ỳ") self.case("<&yring;", "<ẙ") self.case("<&ytilde;", "<ỹ") def benchmark_main(): import timeit from random import choice from english_words import english_words_set english_words_list = list(english_words_set) textList = [] for i in range(20): text = "" for j in range(10): text += choice(english_words_list) + " " textList.append(text) print("avg length:", sum(len(text) for text in textList) / len(textList)) def run_benchmark1(): for text in textList: unescape_unicode(text) print("benchmark 1:", timeit.timeit("run_benchmark1()", globals=locals())) if __name__ == "__main__": if "-b" in sys.argv: benchmark_main() else: unittest.main() pyglossary-4.5.0/tests/option_test.py000066400000000000000000000144101417733132500200250ustar00rootroot00000000000000#!/usr/bin/python3 import sys from os.path import join, dirname, abspath import unittest import random rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.option import * class TestOptionValidateBoolNumber(unittest.TestCase): def caseOK(self, cls, raw: str, value: "Optional[bool]"): opt = cls() valueActual, ok = opt.evaluate(raw) self.assertTrue(ok, "evaluate failed") self.assertEqual(valueActual, value) ok2 = opt.validate(valueActual) self.assertEqual(ok2, True, "validate failed") def caseFailed(self, cls, raw: str, value: "Optional[bool]"): opt = cls() valueActual, ok = opt.evaluate(raw) self.assertFalse(ok) self.assertEqual(valueActual, value) def test_bool_ok(self): self.caseOK(BoolOption, "True", True) self.caseOK(BoolOption, "False", False) self.caseOK(BoolOption, "true", True) self.caseOK(BoolOption, "false", False) self.caseOK(BoolOption, "TRUE", True) self.caseOK(BoolOption, "FALSE", False) self.caseOK(BoolOption, "1", True) self.caseOK(BoolOption, "0", False) self.caseOK(BoolOption, "yes", True) self.caseOK(BoolOption, "no", False) self.caseOK(BoolOption, "YES", True) self.caseOK(BoolOption, "NO", False) def test_bool_failed(self): self.caseFailed(BoolOption, "Y", None) self.caseFailed(BoolOption, "N", None) self.caseFailed(BoolOption, "YESS", None) self.caseFailed(BoolOption, "NOO", None) self.caseFailed(BoolOption, "123", None) self.caseFailed(BoolOption, "a", None) def test_int_ok(self): self.caseOK(IntOption, "0", 0) self.caseOK(IntOption, "1", 1) self.caseOK(IntOption, "-1", -1) self.caseOK(IntOption, "1234", 1234) def test_int_failed(self): self.caseFailed(IntOption, "abc", None) self.caseFailed(IntOption, "12f", None) self.caseFailed(IntOption, "fff", None) def test_file_size_ok(self): self.caseOK(FileSizeOption, "0", 0) self.caseOK(FileSizeOption, "1", 1) self.caseOK(FileSizeOption, "1234", 1234) self.caseOK(FileSizeOption, "123k", 123000) self.caseOK(FileSizeOption, "123m", 123000000) self.caseOK(FileSizeOption, "1.7g", 1700000000) self.caseOK(FileSizeOption, "123kib", 123 * 1024) self.caseOK(FileSizeOption, "123KiB", 123 * 1024) self.caseOK(FileSizeOption, "123ki", 123 * 1024) self.caseOK(FileSizeOption, "123Ki", 123 * 1024) self.caseOK(FileSizeOption, "123mib", 123 * 1024 ** 2) self.caseOK(FileSizeOption, "123MiB", 123 * 1024 ** 2) self.caseOK(FileSizeOption, "123mi", 123 * 1024 ** 2) self.caseOK(FileSizeOption, "123Mi", 123 * 1024 ** 2) self.caseOK(FileSizeOption, "1.7gib", int(1.7 * 1024 ** 3)) 
self.caseOK(FileSizeOption, "1.7GiB", int(1.7 * 1024 ** 3)) self.caseOK(FileSizeOption, "1.7gi", int(1.7 * 1024 ** 3)) self.caseOK(FileSizeOption, "1.7Gi", int(1.7 * 1024 ** 3)) def test_file_size_failed(self): self.caseFailed(FileSizeOption, "-1", None) self.caseFailed(FileSizeOption, "123kg", None) self.caseFailed(FileSizeOption, "123k.1", None) def test_float_ok(self): self.caseOK(FloatOption, "0", 0.0) self.caseOK(FloatOption, "1", 1.0) self.caseOK(FloatOption, "-1", -1.0) self.caseOK(FloatOption, "1234", 1234.0) self.caseOK(FloatOption, "1.5", 1.5) self.caseOK(FloatOption, "-7.9", -7.9) def test_float_failed(self): self.caseFailed(FloatOption, "abc", None) self.caseFailed(FloatOption, "12f", None) self.caseFailed(FloatOption, "fff", None) class TestOptionValidateStr(unittest.TestCase): def newTester(self, customValue: bool, values: "List[str]"): def test(raw: str, valid: bool): opt = StrOption(customValue=customValue, values=values) valueActual, evalOkActual = opt.evaluate(raw) self.assertEqual(evalOkActual, True, "evaluate failed") self.assertEqual(valueActual, raw) validActual = opt.validate(valueActual) self.assertEqual(validActual, valid, "validate failed") return test def test_1(self): test = self.newTester(False, ["a", "b", "c"]) test("a", True) test("b", True) test("c", True) test("d", False) test("123", False) def test_2(self): test = self.newTester(True, ["a", "b", "3"]) test("a", True) test("b", True) test("c", True) test("d", True) test("123", True) class TestOptionValidateDict(unittest.TestCase): def caseOK(self, raw: str, value: "Optional[Dict]"): opt = DictOption() valueActual, ok = opt.evaluate(raw) self.assertTrue(ok, "evaluate failed") self.assertEqual(valueActual, value) ok2 = opt.validate(valueActual) self.assertEqual(ok2, True, "validate failed") def caseEvalFail(self, raw: str): opt = DictOption() valueActual, ok = opt.evaluate(raw) self.assertFalse(ok) self.assertEqual(valueActual, None) def test_dict_ok(self): self.caseOK("", None) self.caseOK("{}", {}) self.caseOK('{"a": 1}', {"a": 1}) self.caseOK('{"a": "b", "123":456}', {"a": "b", "123": 456}) def test_dict_syntaxErr(self): self.caseEvalFail("123abc") self.caseEvalFail('{') self.caseEvalFail("(") self.caseEvalFail('{"a": 1') self.caseEvalFail('{"a": 1]') self.caseEvalFail('][') def test_dict_notDict(self): self.caseEvalFail("123") self.caseEvalFail("[]") self.caseEvalFail("[1, 2, 3]") self.caseEvalFail('["a", 2, 3.5]') self.caseEvalFail('{10, 20, 30}') class TestOptionValidateList(unittest.TestCase): def caseOK(self, raw: str, value: "Optional[Dict]"): opt = ListOption() valueActual, ok = opt.evaluate(raw) self.assertTrue(ok, "evaluate failed") self.assertEqual(valueActual, value) ok2 = opt.validate(valueActual) self.assertEqual(ok2, True, "validate failed") def caseEvalFail(self, raw: str): opt = ListOption() valueActual, ok = opt.evaluate(raw) self.assertFalse(ok, f"evaluale did not fail, valueActual={valueActual!r}") self.assertEqual(valueActual, None) def test_list_ok(self): self.caseOK("", None) self.caseOK("[]", []) self.caseOK('["a", "b"]', ["a", "b"]) self.caseOK("[1, 2, 3]", [1, 2, 3]) self.caseOK('["a", 2, 3.5]', ["a", 2, 3.5]) def test_list_syntaxErr(self): self.caseEvalFail("123abc") self.caseEvalFail('{') self.caseEvalFail("(") self.caseEvalFail('{"a": 1') self.caseEvalFail('{"a": 1]') self.caseEvalFail('][') def test_list_notList(self): self.caseEvalFail("123") self.caseEvalFail('{10, 20, 30}') self.caseEvalFail('{"a": 1}') self.caseEvalFail('{"a": "b", "123":456}') if __name__ == "__main__": 
unittest.main() pyglossary-4.5.0/tests/stardict_test.py000066400000000000000000000107231417733132500203350ustar00rootroot00000000000000import unittest import locale import random from functools import cmp_to_key def toBytes(s): return bytes(s, "utf-8") if isinstance(s, str) else bytes(s) def sortKeyBytes(ba: bytes): assert isinstance(ba, bytes) # ba.lower() + ba is wrong return ( ba.lower(), ba, ) def stardictStrCmp(s1, s2): """ Use this function to sort index items in a StarDict dictionary. s1 and s2 must be utf-8 encoded strings. """ s1 = toBytes(s1) s2 = toBytes(s2) a = asciiStrCaseCmp(s1, s2) if a == 0: return strCmp(s1, s2) return a # the slow way in Python 3 (where there is no cmp arg in list.sort) sortKeyOld = cmp_to_key(stardictStrCmp) # TOO SLOW def asciiStrCaseCmp(ba1, ba2): """ ba1 and ba2 are instances of bytes. Imitates the g_ascii_strcasecmp function from glib's gstrfuncs.c. """ commonLen = min(len(ba1), len(ba2)) for i in range(commonLen): c1 = asciiLower(ba1[i]) c2 = asciiLower(ba2[i]) if c1 != c2: return c1 - c2 return len(ba1) - len(ba2) def strCmp(ba1, ba2): """ ba1 and ba2 are instances of bytes. Imitates strcmp from the standard C library. Attention! You may be tempted to replace this function with the built-in cmp() function. Hold on! The two most probably behave identically now, but cmp() does not document how it compares strings, and there is no guarantee that will not change in the future. Since we need a predictable sort order in StarDict dictionaries, we preserve this function despite the fact that there are other ways to implement it. """ commonLen = min(len(ba1), len(ba2)) for i in range(commonLen): c1 = ba1[i] c2 = ba2[i] if c1 != c2: return c1 - c2 return len(ba1) - len(ba2) def isAsciiAlpha(c): """ c is an int """ return ord("A") <= c <= ord("Z") or ord("a") <= c <= ord("z") def isAsciiLower(c): return ord("a") <= c <= ord("z") def isAsciiUpper(c): """ c is an int. Imitates the ISUPPER macro from glib's gstrfuncs.c. """ return ord("A") <= c <= ord("Z") def asciiLower(c): """ c is an int; returns an int (ascii character code). Imitates the TOLOWER macro from glib's gstrfuncs.c. This function converts upper case Latin letters to the corresponding lower case letters; other characters are not changed. You may apply this function to the individual bytes of an encoded (non-Unicode) string. The following encodings are allowed: single-byte encodings like koi8-r, cp1250, cp1251, cp1252, etc., and utf-8. Attention! The str.lower() method from the standard library is not a correct replacement for this function: for a non-Unicode string, str.lower() is locale dependent; it not only converts Latin letters to lower case, but locale-specific letters as well.
""" return c - ord("A") + ord("a") if isAsciiUpper(c) else c def getRandomBytes(avgLen, sigma): length = round(random.gauss(avgLen, sigma)) return bytes([ random.choice(range(256)) for _ in range(length) ]) class AsciiLowerUpperTest(unittest.TestCase): def set_locale_iter(self): for localeName in locale.locale_alias.values(): try: locale.setlocale(locale.LC_ALL, localeName) except Exception as e: if "unsupported locale setting" not in str(e): print(e) continue yield localeName def test_isalpha(self): for _ in self.set_locale_iter(): for code in range(256): self.assertEqual( isAsciiAlpha(code), bytes([code]).isalpha(), ) def test_islower(self): for _ in self.set_locale_iter(): for code in range(256): self.assertEqual( isAsciiLower(code), bytes([code]).islower(), ) def test_isupper(self): for _ in self.set_locale_iter(): for code in range(256): self.assertEqual( isAsciiUpper(code), bytes([code]).isupper(), ) def test_lower(self): for _ in self.set_locale_iter(): for code in range(256): self.assertEqual( asciiLower(code), ord(bytes([code]).lower()), ) class SortRandomTest(unittest.TestCase): def set_locale_iter(self): for localeName in locale.locale_alias.values(): try: locale.setlocale(locale.LC_ALL, localeName) except Exception as e: if "unsupported locale setting" not in str(e): raise e continue # print(localeName) yield localeName def test_sort_1(self): bsList = [ getRandomBytes(30, 10) for _ in range(100) ] for _ in self.set_locale_iter(): self.assertEqual( sorted( bsList, key=sortKeyOld, ), sorted( bsList, key=sortKeyBytes, ) ) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/text_utils_test.py000066400000000000000000000175761417733132500207410ustar00rootroot00000000000000#!/usr/bin/python3 import sys from os.path import join, dirname, abspath import unittest import struct rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.text_utils import * class TestTextUtils(unittest.TestCase): def test_fixUtf8(self): f = fixUtf8 # Since entries already keep words and defi as string, fixUtf8 does not # do much. It just removes zero bytes between valid characters # (and not within characters) # If there were encoding errors in input file, Reader class would # most likely fail to read and raise exception. # This feature was useful in Python 2.x, but not much anymore! 
self.assertEqual(f("\x00س\x00لام"), "سلام") def test_unescapeNTB(self): self.assertEqual("a", unescapeNTB("a", bar=False)) self.assertEqual("a\t", unescapeNTB("a\\t", bar=False)) self.assertEqual("a\n", unescapeNTB("a\\n", bar=False)) self.assertEqual("\ta", unescapeNTB("\\ta", bar=False)) self.assertEqual("\na", unescapeNTB("\\na", bar=False)) self.assertEqual("a\tb\n", unescapeNTB("a\\tb\\n", bar=False)) self.assertEqual("a\\b", unescapeNTB("a\\\\b", bar=False)) self.assertEqual("a\\\tb", unescapeNTB("a\\\\\\tb", bar=False)) self.assertEqual("a|b\tc", unescapeNTB("a|b\\tc", bar=False)) self.assertEqual("a\\|b\tc", unescapeNTB("a\\|b\\tc", bar=False)) self.assertEqual("a\\|b\tc", unescapeNTB("a\\\\|b\\tc", bar=False)) self.assertEqual("|", unescapeNTB("\\|", bar=True)) self.assertEqual("a|b", unescapeNTB("a\\|b", bar=True)) self.assertEqual("a|b\tc", unescapeNTB("a\\|b\\tc", bar=True)) def test_escapeNTB(self): self.assertEqual(escapeNTB("a", bar=False), "a") self.assertEqual(escapeNTB("a\t", bar=False), "a\\t") self.assertEqual(escapeNTB("a\n", bar=False), "a\\n") self.assertEqual(escapeNTB("\ta", bar=False), "\\ta") self.assertEqual(escapeNTB("\na", bar=False), "\\na") self.assertEqual(escapeNTB("a\tb\n", bar=False), "a\\tb\\n") self.assertEqual(escapeNTB("a\\b", bar=False), "a\\\\b") self.assertEqual(escapeNTB("a\\\tb", bar=False), "a\\\\\\tb") self.assertEqual(escapeNTB("a|b\tc", bar=False), "a|b\\tc") self.assertEqual(escapeNTB("a\\|b\tc", bar=False), "a\\\\|b\\tc") self.assertEqual(escapeNTB("|", bar=True), "\\|") self.assertEqual(escapeNTB("a|b", bar=True), "a\\|b") self.assertEqual(escapeNTB("a|b\tc", bar=True), "a\\|b\\tc") def test_splitByBarUnescapeNTB(self): f = splitByBarUnescapeNTB self.assertEqual(f(""), [""]) self.assertEqual(f("|"), ["", ""]) self.assertEqual(f("a"), ["a"]) self.assertEqual(f("a|"), ["a", ""]) self.assertEqual(f("|a"), ["", "a"]) self.assertEqual(f("a|b"), ["a", "b"]) self.assertEqual(f("a\\|b|c"), ["a|b", "c"]) self.assertEqual(f("a\\\\1|b|c"), ["a\\1", "b", "c"]) # self.assertEqual(f("a\\\\|b|c"), ["a\\", "b", "c"]) # FIXME self.assertEqual(f("a\\\\1|b\\n|c\\t"), ["a\\1", "b\n", "c\t"]) def test_unescapeBar(self): f = unescapeBar self.assertEqual("", f("")) self.assertEqual("|", f("\\|")) self.assertEqual("a|b", f("a\\|b")) self.assertEqual("a|b\tc", f("a\\|b\tc")) self.assertEqual("a|b\\t\\nc", f("a\\|b\\t\\nc")) self.assertEqual("\\", f("\\\\")) self.assertEqual("\\|", f("\\\\\\|")) def test_splitByBar(self): f = splitByBar self.assertEqual(f(""), [""]) self.assertEqual(f("|"), ["", ""]) self.assertEqual(f("a"), ["a"]) self.assertEqual(f("a|"), ["a", ""]) self.assertEqual(f("|a"), ["", "a"]) self.assertEqual(f("a|b"), ["a", "b"]) self.assertEqual(f("a\\|b"), ["a|b"]) self.assertEqual(f("a\\|b|c"), ["a|b", "c"]) self.assertEqual(f("a\\\\1|b|c"), ["a\\1", "b", "c"]) # self.assertEqual(f("a\\\\|b|c"), ["a\\", "b", "c"]) # FIXME def test_joinByBar(self): f = joinByBar self.assertEqual("", f([""])) self.assertEqual("|", f(["", ""])) self.assertEqual("a", f(["a"])) self.assertEqual("a|", f(["a", ""])) self.assertEqual("|a", f(["", "a"])) self.assertEqual("a|b", f(["a", "b"])) self.assertEqual("a\\|b", f(["a|b"])) self.assertEqual("a\\|b|c", f(["a|b", "c"])) self.assertEqual("a\\\\1|b|c", f(["a\\1", "b", "c"])) def test_unescapeBarBytes(self): f = unescapeBarBytes self.assertEqual(b"", f(b"")) self.assertEqual(b"|", f(b"\\|")) self.assertEqual(b"a|b", f(b"a\\|b")) self.assertEqual(b"a|b\tc", f(b"a\\|b\tc")) self.assertEqual(b"a|b\\t\\nc", f(b"a\\|b\\t\\nc")) 
self.assertEqual(b"\\", f(b"\\\\")) self.assertEqual(b"\\|", f(b"\\\\\\|")) def test_formatHMS(self): f = formatHMS self.assertEqual(f(0, 0, 0), "00") self.assertEqual(f(0, 0, 9), "09") self.assertEqual(f(0, 0, 10), "10") self.assertEqual(f(0, 0, 59), "59") self.assertEqual(f(0, 1, 0), "01:00") self.assertEqual(f(0, 1, 5), "01:05") self.assertEqual(f(0, 5, 7), "05:07") self.assertEqual(f(0, 59, 0), "59:00") self.assertEqual(f(0, 59, 59), "59:59") self.assertEqual(f(1, 0, 0), "01:00:00") self.assertEqual(f(123, 5, 7), "123:05:07") self.assertEqual(f(123, 59, 59), "123:59:59") def test_uint32ToBytes(self): f = uint32ToBytes self.assertEqual(f(0), bytes([0, 0, 0, 0])) self.assertEqual(f(0x3e8), bytes([0, 0, 0x03, 0xe8])) self.assertEqual(f(0x186a0), bytes([0, 1, 0x86, 0xa0])) self.assertEqual(f(0x3b9aca00), bytes([0x3b, 0x9a, 0xca, 0x00])) self.assertEqual(f(0xffffffff), bytes([0xff, 0xff, 0xff, 0xff])) with self.assertRaises(struct.error) as ctx: f(0xffffffff + 1) self.assertEqual( str(ctx.exception), "'I' format requires 0 <= number <= 4294967295", ) with self.assertRaises(struct.error) as ctx: f(10000000000) self.assertEqual( str(ctx.exception), "'I' format requires 0 <= number <= 4294967295", ) with self.assertRaises(struct.error) as ctx: f(-1) self.assertEqual(str(ctx.exception), "argument out of range") def test_uint32FromBytes(self): f = uint32FromBytes self.assertEqual(0, f(bytes([0, 0, 0, 0]))) self.assertEqual(0x3e8, f(bytes([0, 0, 0x03, 0xe8]))) self.assertEqual(0x186a0, f(bytes([0, 1, 0x86, 0xa0]))) self.assertEqual(0x3b9aca00, f(bytes([0x3b, 0x9a, 0xca, 0x00]))) self.assertEqual(0xffffffff, f(bytes([0xff, 0xff, 0xff, 0xff]))) with self.assertRaises(struct.error) as ctx: f(bytes([0x01, 0xff, 0xff, 0xff, 0xff])) self.assertEqual(str(ctx.exception), "unpack requires a buffer of 4 bytes") def test_uintFromBytes(self): f = uintFromBytes self.assertEqual(0, f(bytes([0, 0, 0, 0]))) self.assertEqual(0x3e8, f(bytes([0, 0, 0x03, 0xe8]))) self.assertEqual(0x186a0, f(bytes([0, 1, 0x86, 0xa0]))) self.assertEqual(0x3b9aca00, f(bytes([0x3b, 0x9a, 0xca, 0x00]))) self.assertEqual(0xffffffff, f(bytes([0xff, 0xff, 0xff, 0xff]))) self.assertEqual( 0xffabcdef5542, f(bytes([0xff, 0xab, 0xcd, 0xef, 0x55, 0x42])), ) def test_crc32hex(self): f = crc32hex self.assertEqual(f(b""), "00000000") self.assertEqual(f(b"\x00"), "d202ef8d") self.assertEqual(f(b"\x00\x00"), "41d912ff") self.assertEqual( f(bytes.fromhex("73c3bbc38b7459360ac3a9c2b3c2a2")), "bbfb1610", ) def test_urlToPath(self): f = urlToPath self.assertEqual( f("https://github.com/ilius/pyglossary"), "https://github.com/ilius/pyglossary", ) self.assertEqual( f("file:///home/test/abc.txt"), "/home/test/abc.txt", ) self.assertEqual( f("file:///home/test/%D8%AA%D8%B3%D8%AA.txt"), "/home/test/تست.txt", ) def test_replacePostSpaceChar(self): f = replacePostSpaceChar self.assertEqual( f("First sentence .Second sentence.", "."), "First sentence. 
Second sentence.", ) self.assertEqual( f("First ,second.", ","), "First, second.", ) def test_isASCII(self): f = isASCII self.assertEqual(f(""), True) self.assertEqual(f("abc"), True) self.assertEqual(f("xyz"), True) self.assertEqual(f("ABC"), True) self.assertEqual(f("XYZ"), True) self.assertEqual(f("1234567890"), True) self.assertEqual(f("\n\r\t"), True) self.assertEqual(f("\x80"), False) self.assertEqual(f("abc\x80"), False) self.assertEqual(f("abc\xff"), False) if __name__ == "__main__": unittest.main() pyglossary-4.5.0/tests/xml_utils_test.py000066400000000000000000000012201417733132500205300ustar00rootroot00000000000000import sys from os.path import dirname, abspath import unittest rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) from pyglossary.xml_utils import xml_escape class Test_xml_escape(unittest.TestCase): def test(self): f = xml_escape self.assertEqual(f(''), '') self.assertEqual(f('abc'), 'abc') self.assertEqual(f('"a"'), '"a"') self.assertEqual(f("'a'"), "'a'") self.assertEqual(f('"a"', quotation=False), '"a"') self.assertEqual(f("'a'", quotation=False), "'a'") self.assertEqual(f('R&D'), 'R&D') self.assertEqual(f('<-->'), '<-->') if __name__ == "__main__": unittest.main()