pax_global_header00006660000000000000000000000064132334243650014517gustar00rootroot0000000000000052 comment=5f84fdcfecd48b719779be251e06863a69ab2452 urlwatch-2.8/000077500000000000000000000000001323342436500132215ustar00rootroot00000000000000urlwatch-2.8/.gitignore000066400000000000000000000000271323342436500152100ustar00rootroot00000000000000__pycache__ .idea buildurlwatch-2.8/.travis.yml000066400000000000000000000002461323342436500153340ustar00rootroot00000000000000language: python python: - "3.4" - "3.5" - "3.6" - "nightly" install: - pip install pyyaml minidb requests keyring pycodestyle appdirs script: nosetests -v urlwatch-2.8/COPYING000066400000000000000000000026001323342436500142520ustar00rootroot00000000000000Copyright (c) 2008-2018 Thomas Perl All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. urlwatch-2.8/ChangeLog000066400000000000000000000213121323342436500147720ustar00rootroot000000000000002008-03-04 Thomas Perl * Initial Version 2008-03-17 Thomas Perl * Release version 1.0 2008-03-20 Lukas Vana * Add support for error handling missing URLs * Notify users when NEW sites appear * Option "display_errors" can be set in watch.py 2008-03-22 Thomas Perl * Release version 1.1 2008-05-09 Lukas Upton * Fix problem with Mac OS X 10.5.2 and Ubuntu 8.04 2008-05-10 Thomas Perl * Release version 1.2 2008-05-15 Craig Hoffman * Add support for sending a User-Agent header 2008-05-16 Thomas Perl * Release version 1.3 2008-11-14 Thomas Perl + Add example for using HTML Tidy (needs python-utidylib) + Add example for using the ical2txt module (needs python-vobject) + Add ical2txt.py module for converting ics to plaintext * More comments in hooks.py for better user documentation * Release version 1.4 2008-11-18 Thomas Perl * Support for installing into the system * Use ~/.urlwatch/ for config, cache and hooks * Apply BSD license * Add setup.py (and remove makefile) * Command-line options * Verbose logging mode * Example urls.txt and hooks.py * Update README * Add manpage (urlwatch.1) * Release version 1.5 2008-12-23 Thomas Perl * Use hashlib in Python 2.5 and above for SHA-1 generation * Release version 1.6 2009-01-03 Thomas Perl * Add urlwatch.html2txt module to convert/format HTML to plaintext * Add example of using html2txt in the example hooks file * The html-to-plaintext feature has been suggested by Evert Meulie * Release version 1.7 2009-01-05 Thomas Perl * Fix a problem with relative links in Lynx' "-dump" mode 2009-01-07 Thomas Perl * Fix another problem with file-relative links in html2text w/ Lynx 2009-01-12 Thomas Perl * Describe ical2txt and html2txt with examples in manpage 2009-01-15 Thomas Perl * Add TODO list 2009-01-20 Thomas Perl * Set the socket timeout to one minute to avoid hangs 2009-07-27 Thomas Perl * Catch and handle IOErrors from FTP timeouts 2009-08-01 Thomas Perl * Add error handling for socket timeouts (HTTP mode) 2009-08-10 Thomas Perl * Handle httplib errors (Debian bug 529740) (Thanks to Bastian Kleineidam and Franck Joncourt) * urlwatch 1.8 released 2009-09-29 Thomas Perl * Support for shell pipe (|) in urls.txt * Support for If-Modified-Since header + HTTP 304 * Show previous/current timestamp in diff output * Remove TODO list * urlwatch 1.9 released 2010-05-10 Thomas Perl * Get encoding from headers and convert to UTF-8 (suggested by Ján Ondrej) * urlwatch 1.10 released 2010-07-30 Thomas Perl * Detect non-zero shell command exit codes and raise an error * urlwatch 1.11 released 2011-02-10 Thomas Perl * Allow None as return value for filters (if a filter returns None, interpret it as "don't filter") * Update website URL, contact info and copyright years * urlwatch 1.12 released 2011-08-22 Thomas Perl * Support for POST requests (suggested by Sébastien Fricker) * Use concurrent.futures for parallel execution (needs Python 3.2 or "futures" from PyPI for older Python versions, including 2.x) * Various code changes to enhance compatibility with Python 3 * Add convert-to-python3.sh script to convert the codebase into Python 3 format using the "2to3" utility included with Python * urlwatch 1.13 released 2011-11-15 Thomas Perl * Fix an encoding issue related to the html2txt module (thanks to Thomas Dziedzic for reporting this issue and testing the patch) * urlwatch 1.14 released 2012-08-30 Thomas Perl * Merge changes from Slavko related to UTF-8 and html2txt, this has been tested on Debian-based systems * urlwatch 1.15 released 2012-09-13 Xavier Izard * Added basic support for email delivery, using internal SMTP lib. (see options --mailto, --mailfrom and --smtp) 2013-03-11 Thomas Perl * Minimalistic, automatic setup.py script (based on jabberbot) * Move files around ({examples,urlwatch.1} -> share/...) * Update Python 3 migration script and MANIFEST.in with new paths 2013-11-23 Thomas Perl * Fix a bug with parsing content-encoding headers 2014-01-29 Thomas Perl * Update manpage * urlwatch 1.16 released 2014-08-01 Thomas Perl * Handle invalid encoding sent by server (fixes Debian bug 731931) * Fix lynx handing for relative URLs (fixes Debian bug 732112) * Fix resolving of relative URL filenames (fixes Debian bug 748905) * urlwatch 1.17 released 2015-02-27 Thomas Perl * Fallback to using pwd if os.getlogin() fails (fixes #2) * Handle HTTP compression (Content-encoding: gzip/deflate) * Add option to suppress output on stdout (-q/--quiet) * Allow customizing subject when sending e-mail (-S/--subject) * Added support for TLS and SMTP auth (-p/--pass, -T/--tls, -A/--auth) * Added support for specifying cache directory (-c/--cache) * Add support for HTTP Auth to urlwatch.handler (fixes #10) 2016-01-16 Thomas Perl * Version 2.0 with lots of changes, only a few listed here * Requires Python 3, support for Python 2 dropped * Uses SQLite 3 / minidb for cache storage * Uses PyYAML for the URL list and configuration file * Subclass-based hooking features * Custom job types by subclassing Job * Custom reporters by subclassing ReporterBase * Custom filters by subclassing FilterBase * Old data will be migrated as good as possible to the new formats 2016-02-03 Thomas Perl * Replace urllib usage with requests (by Louis Sautier) * Add cookies support (by Louis Sautier) * Convert README to Markdown (README.md, by Louis Sautier) * Add a new auto-applying filter that uses regexes, fixes #37 (by Louis Sautier) * Use setuptools, install dependencies (Fixes #33) * Fix HTTP basic authentication (Fixes #26) * Add ssl_no_verify option for UrlJob * Update list of dependencies (add requests) * Fix unit tests for files only in source tree (Fixes #34) * Add test/data to source tarball (#34) * Workaround a requests shortcoming related to encoding 2016-06-14 Thomas Perl * Add support for pushover (by Richard Palmer) * html2txt: Use -nonumbers and UTF-8 output for Lynx * Fix SMTP server connection setup (fixes #50) * setup.py: Allow running from non-source directory (Fixes #52) * Fix adding URLs with = in them (Fixes #59) * Add option to use sendmail instead of SMTP (by e-dschungel) * Add InverseGrepFilter which removes lines matching a regex (by e-dschungel) * New html2text method "pyhtml2text" using the Python module "html2text" (by e-dschungel) 2016-07-12 Thomas Perl * Check current directory and use os.path.relpath (Fixes #73) * Add link to watched location in email report (by Guillaume Maudoux) * setup.py: Remove the discovery logic that fails with pip, just hardcode most things * Windows compatibility fixes (os.rename, shelljob checks) * Do not copy example files if they do not exist * Handle SIGPIPE (fixes #77) 2016-12-04 Thomas Perl * New filters: sha1sum, hexdump, element-by-class * New reporters: pushbullet (by R0nd); mailgun (by lechuckcaptain) * Improved filters: BeautifulSoup support for html2txt (by lechuckcaptain) * Improved handlers: HTTP Proxy (by lechuckcaptain); support for file:// URIs * CI Integration: Build configuration for Travis CI (by lechuckcaptain) * Consistency: Feature list is now sorted by name * Issue #108: Fix creation of example files on first startup * Issue #118: Fix match filters for missing keys * Small fixes by: Jakub Wilk, Marc Urben, Adam Dobrawy and Louis Sautier 2017-11-08 Thomas Perl * Issue #127: Fix error reporting * ElementsByAttribute: look for matching tag in handle_endtag (by Gaetan Leurent) * Paths: Add XDG_CONFIG_DIR support (by Jelle van der Waa) * E-Mail: Fix encodings (by Seokjin Han), Allow 'user' parameter for SMTP (by Jay Sitter) * HTTP: Option to avoid 304 responses, Content-Type header (by Vinicius Massuchetto) * html2text: Configuration options (by Vinicius Massuchetto) * Filtering: style (by gvandenbroucke), tag (by cmichi) * New reporter: Telegram support (by gvandenbroucke) 2018-01-28 Thomas Perl * Documentation: Mention appdirs (by e-dschungel) * SMTP: Fix handling of missing user field (by e-dschungel) * Manpage: Fix documentation of XDG environment variables (by Jelle van der Waa) * Unit tests: Fix imports for out-of-source-tree tests (by Maxime Werlen) urlwatch-2.8/MANIFEST.in000066400000000000000000000001341323342436500147550ustar00rootroot00000000000000include ChangeLog COPYING README.md recursive-include share * recursive-include test/data * urlwatch-2.8/README.md000066400000000000000000000125431323342436500145050ustar00rootroot00000000000000[![Build Status](https://travis-ci.org/thp/urlwatch.svg)](https://travis-ci.org/thp/urlwatch) ``` _ _ _ ____ _ _ _ __| |_ ____ _| |_ ___| |__ |___ \ | | | | '__| \ \ /\ / / _` | __/ __| '_ \ __) | | |_| | | | |\ V V / (_| | || (__| | | | / __/ \__,_|_| |_| \_/\_/ \__,_|\__\___|_| |_| |_____| A tool for monitoring webpages for updates ``` urlwatch is intended to help you watch changes in webpages and get notified (via email, in your terminal or with a custom-written reporter class) of any changes. The change notification will include the URL that has changed and a unified diff of what has changed. DEPENDENCIES ------------ urlwatch 2 requires: * Python 3.3 or newer * [PyYAML](http://pyyaml.org/) * [minidb](https://thp.io/2010/minidb/) * [requests](http://python-requests.org/) * [keyring](https://github.com/jaraco/keyring/) * [appdirs](https://github.com/ActiveState/appdirs) * [chump](https://github.com/karanlyons/chump/) (for Pushover support) * [pushbullet.py](https://github.com/randomchars/pushbullet.py) (for Pushbullet support) The dependencies can be installed with (add `--user` to install to `$HOME`): `python3 -m pip install pyyaml minidb requests keyring appdirs` For optional pushover support the chump package is required: `python3 -m pip install chump` For optional pushbullet support the pushbullet.py package is required: `python3 -m pip install pushbullet.py` For unit tests, you also need to install pycodestyle: `python3 -m pip install pycodestyle` MIGRATION FROM URLWATCH 1.x --------------------------- Migration from urlwatch 1.x should be automatic on first start. Here is a quick rundown of changes in 2.0: * URLs are stored in a YAML file now, with direct support for specifying names for jobs, different job kinds, directly applying filters, selecting the HTTP request method, specifying POST data as dictionary and much more * The cache directory has been replaced with a SQLite 3 database file "cache.db" in minidb format, storing all change history (use `--gc-cache` to remove old changes if you don't need them anymore) for further analysis * The hooks mechanism has been replaced with support for creating new job kinds by subclassing, new filters (also by subclassing) as well as new reporters (pieces of code that put the results somewhere, for example the default installation contains the "stdout" reporter that writes to the console and the "email" reporter that can send HTML and text e-mails) * A configuration file - urlwatch.yaml - has been added for specifying user preferences instead of having to supply everything via the command line QUICK START ----------- 1. Start `urlwatch` to migrate your old data or start fresh 2. Use `urlwatch --edit` to customize your job list 3. Use `urlwatch --edit-config` if you want to set up e-mail sending 4. Use `urlwatch --edit-hooks` if you want to write custom subclasses 5. Add `urlwatch` to your crontab (`crontab -e`) TIPS AND TRICKS --------------- Quickly adding new URLs to the job list from the command line: ```urlwatch --add url=http://example.org,name=Example``` You can pick only a given HTML element with the built-in filter, for example to extract ```
.../
``` from a page, you can use the following in your urls.yaml: ```yaml url: http://example.org/ filter: element-by-id:something ``` Also, you can chain filters, so you can run html2text on the result: ```yaml url: http://example.net/ filter: element-by-id:something,html2text ``` The example urls.yaml file also demonstrates the use of built-in filters, here 3 filters are used: html2text, line-grep and whitespace removal to get just a certain info field from a webpage: ```yaml url: http://thp.io/2008/urlwatch/ filter: html2text,grep:Current.*version,strip ``` For most cases, this means that you can specify a filter chain in your urls.yaml page without requiring a custom hook where previously you would have needed to write custom filtering code in Python. If you want to extract only the body tag you can use this filer: ```yaml url: http://thp.io/2008/urlwatch/ filter: element-by-tag:body ``` PUSHOVER -------- You can configure urlwatch to send real time notifications about changes via Pushover(https://pushover.net/). To enable this, ensure you have the chump python package installed (see DEPENDENCIES). Then edit your config (`urlwatch --edit-config`) and enable pushover. You will also need to add to the config your Pushover user key and a unique app key (generated by registering urlwatch as an application on your Pushover account(https://pushover.net/apps/build) PUSHBULLET -------- Pushbullet notification are configured similarly to Pushover (see above). You'll need to add to the config your Pushbullet Access Token, which you can generate at https://www.pushbullet.com/#settings TELEGRAM -------- Telegram notifications are configured using the Telegram Bot API. For this, you'll need a Bot API token and a chat id (see https://core.telegram.org/bots). Sample configuration: ```yaml telegram: bot_token: '999999999:3tOhy2CuZE0pTaCtszRfKpnagOG8IQbP5gf' # your bot api token chat_id: '88888888' # the chat id where the messages should be sent enabled: true ``` CONTACT ------- Website: http://thp.io/2008/urlwatch/ E-Mail: m@thp.io urlwatch-2.8/lib/000077500000000000000000000000001323342436500137675ustar00rootroot00000000000000urlwatch-2.8/lib/urlwatch/000077500000000000000000000000001323342436500156205ustar00rootroot00000000000000urlwatch-2.8/lib/urlwatch/__init__.py000066400000000000000000000011261323342436500177310ustar00rootroot00000000000000"""A tool for monitoring webpages for updates urlwatch is intended to help you watch changes in webpages and get notified (via email, in your terminal or with a custom-written reporter class) of any changes. The change notification will include the URL that has changed and a unified diff of what has changed. """ pkgname = 'urlwatch' __copyright__ = 'Copyright 2008-2018 Thomas Perl' __author__ = 'Thomas Perl ' __license__ = 'BSD' __url__ = 'http://thp.io/2008/urlwatch/' __version__ = '2.8' __user_agent__ = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__) urlwatch-2.8/lib/urlwatch/command.py000066400000000000000000000144071323342436500176160ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import imp import logging import os import shutil import subprocess import sys from .filters import FilterBase from .jobs import JobBase from .reporters import ReporterBase from .util import atomic_rename logger = logging.getLogger(__name__) class UrlwatchCommand: def __init__(self, urlwatcher): self.urlwatcher = urlwatcher self.urlwatch_config = urlwatcher.urlwatch_config def edit_hooks(self): editor = os.environ.get('EDITOR', None) if editor is None: editor = os.environ.get('VISUAL', None) if editor is None: print('Please set $VISUAL or $EDITOR.') return 1 fn_base, fn_ext = os.path.splitext(self.urlwatch_config.hooks) hooks_edit = fn_base + '.edit' + fn_ext try: if os.path.exists(self.urlwatch_config.hooks): shutil.copy(self.urlwatch_config.hooks, hooks_edit) elif self.urlwatch_config.hooks_py_example is not None and os.path.exists( self.urlwatch_config.hooks_py_example): shutil.copy(self.urlwatch_config.hooks_py_example, hooks_edit) subprocess.check_call([editor, hooks_edit]) imp.load_source('hooks', hooks_edit) atomic_rename(hooks_edit, self.urlwatch_config.hooks) print('Saving edit changes in', self.urlwatch_config.hooks) except Exception as e: print('Parsing failed:') print('======') print(e) print('======') print('') print('The file', self.urlwatch_config.hooks, 'was NOT updated.') print('Your changes have been saved in', hooks_edit) return 1 return 0 def show_features(self): print() print('Supported jobs:\n') print(JobBase.job_documentation()) print('Supported filters:\n') print(FilterBase.filter_documentation()) print() print('Supported reporters:\n') print(ReporterBase.reporter_documentation()) print() return 0 def list_urls(self): for idx, job in enumerate(self.urlwatcher.jobs): if self.urlwatch_config.verbose: print('%d: %s' % (idx + 1, repr(job))) else: pretty_name = job.pretty_name() location = job.get_location() if pretty_name != location: print('%d: %s (%s)' % (idx + 1, pretty_name, location)) else: print('%d: %s' % (idx + 1, pretty_name)) return 0 def modify_urls(self): save = True if self.urlwatch_config.delete is not None: try: index = int(self.urlwatch_config.delete) - 1 try: job = self.urlwatcher.jobs.pop(index) print('Removed %r' % (job,)) except IndexError: print('Not found: %r' % (index,)) save = False except ValueError: job = next((job for job in self.urlwatcher.jobs if job.get_location() == self.urlwatch_config.delete), None) try: self.urlwatcher.jobs.remove(job) print('Removed %r' % (job,)) except ValueError: print('Not found: %r' % (self.urlwatch_config.delete,)) save = False if self.urlwatch_config.add is not None: d = {k: v for k, v in (item.split('=', 1) for item in self.urlwatch_config.add.split(','))} job = JobBase.unserialize(d) print('Adding %r' % (job,)) self.urlwatcher.jobs.append(job) if save: self.urlwatcher.urls_storage.save(self.urlwatcher.jobs) return 0 def handle_actions(self): if self.urlwatch_config.features: sys.exit(self.show_features()) if self.urlwatch_config.gc_cache: self.urlwatcher.cache_storage.gc([job.get_guid() for job in self.urlwatcher.jobs]) sys.exit(0) if self.urlwatch_config.edit: sys.exit(self.urlwatcher.urls_storage.edit(self.urlwatch_config.urls_yaml_example)) if self.urlwatch_config.edit_hooks: sys.exit(self.edit_hooks()) if self.urlwatch_config.list: sys.exit(self.list_urls()) if self.urlwatch_config.add is not None or self.urlwatch_config.delete is not None: sys.exit(self.modify_urls()) def check_edit_config(self): if self.urlwatch_config.edit_config: sys.exit(self.urlwatcher.config_storage.edit()) def run(self): self.check_edit_config() self.handle_actions() self.urlwatcher.run_jobs() self.urlwatcher.close() urlwatch-2.8/lib/urlwatch/config.py000066400000000000000000000116351323342436500174450ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import argparse import logging import os import urlwatch from .migration import migrate_cache, migrate_urls logger = logging.getLogger(__name__) class BaseConfig(object): def __init__(self, pkgname, urlwatch_dir, config, urls, cache, hooks, verbose): self.pkgname = pkgname self.urlwatch_dir = urlwatch_dir self.config = config self.urls = urls self.cache = cache self.hooks = hooks self.verbose = verbose class CommandConfig(BaseConfig): def __init__(self, pkgname, urlwatch_dir, bindir, prefix, config, urls, hooks, cache, verbose): super().__init__(pkgname, urlwatch_dir, config, urls, cache, hooks, verbose) self.bindir = bindir self.prefix = prefix self.migrate_cache = migrate_cache self.migrate_urls = migrate_urls if self.bindir == 'bin': # Installed system-wide self.examples_dir = os.path.join(prefix, 'share', self.pkgname, 'examples') else: # Assume we are not yet installed self.examples_dir = os.path.join(prefix, bindir, 'share', self.pkgname, 'examples') self.urls_yaml_example = os.path.join(self.examples_dir, 'urls.yaml.example') self.hooks_py_example = os.path.join(self.examples_dir, 'hooks.py.example') self.parse_args() def parse_args(self): parser = argparse.ArgumentParser(description=urlwatch.__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--version', action='version', version='%(prog)s {}'.format(urlwatch.__version__)) parser.add_argument('-v', '--verbose', action='store_true', help='show debug output') group = parser.add_argument_group('files and directories') group.add_argument('--urls', metavar='FILE', help='read job list (URLs) from FILE', default=self.urls) group.add_argument('--config', metavar='FILE', help='read configuration from FILE', default=self.config) group.add_argument('--hooks', metavar='FILE', help='use FILE as hooks.py module', default=self.hooks) group.add_argument('--cache', metavar='FILE', help='use FILE as cache database', default=self.cache) group = parser.add_argument_group('job list management') group.add_argument('--list', action='store_true', help='list jobs') group.add_argument('--add', metavar='JOB', help='add job (key1=value1,key2=value2,...)') group.add_argument('--delete', metavar='JOB', help='delete job by location or index') group = parser.add_argument_group('interactive commands ($EDITOR/$VISUAL)') group.add_argument('--edit', action='store_true', help='edit URL/job list') group.add_argument('--edit-config', action='store_true', help='edit configuration file') group.add_argument('--edit-hooks', action='store_true', help='edit hooks script') group = parser.add_argument_group('miscellaneous') group.add_argument('--features', action='store_true', help='list supported jobs/filters/reporters') group.add_argument('--gc-cache', action='store_true', help='remove old cache entries') args = parser.parse_args() for i, arg in enumerate(vars(args)): argval = getattr(args, arg) setattr(self, arg, argval) urlwatch-2.8/lib/urlwatch/filters.py000066400000000000000000000256401323342436500176510ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import re import logging import itertools import os import imp import html.parser import hashlib from enum import Enum from .util import TrackSubClasses logger = logging.getLogger(__name__) class FilterBase(object, metaclass=TrackSubClasses): __subclasses__ = {} __anonymous_subclasses__ = [] def __init__(self, job, state): self.job = job self.state = state def _no_subfilters(self, subfilter): if subfilter is not None: raise ValueError('No subfilters supported for {}'.format(self.__kind__)) @classmethod def filter_documentation(cls): result = [] for sc in TrackSubClasses.sorted_by_kind(cls): result.extend(( ' * %s - %s' % (sc.__kind__, sc.__doc__), )) return '\n'.join(result) @classmethod def auto_process(cls, state, data): filters = itertools.chain((filtercls for _, filtercls in sorted(cls.__subclasses__.items(), key=lambda k_v: k_v[0])), cls.__anonymous_subclasses__) for filtercls in filters: filter_instance = filtercls(state.job, state) if filter_instance.match(): logger.info('Auto-applying filter %r to %s', filter_instance, state.job.get_location()) data = filter_instance.filter(data) return data @classmethod def process(cls, filter_kind, subfilter, state, data): logger.info('Applying filter %r, subfilter %r to %s', filter_kind, subfilter, state.job.get_location()) filtercls = cls.__subclasses__.get(filter_kind, None) if filtercls is None: raise ValueError('Unknown filter kind: %s:%s' % (filter_kind, subfilter)) return filtercls(state.job, state).filter(data, subfilter) def match(self): return False def filter(self, data, subfilter=None): raise NotImplementedError() class AutoMatchFilter(FilterBase): """Automatically matches subclass filters with a given location""" MATCH = None def match(self): if self.MATCH is None: return False d = self.job.to_dict() result = all(d.get(k, None) == v for k, v in self.MATCH.items()) logger.debug('Matching %r with %r result: %r', self, self.job, result) return result class RegexMatchFilter(FilterBase): """Same as AutoMatchFilter but matching is done with regexes""" MATCH = None def match(self): if self.MATCH is None: return False d = self.job.to_dict() # It's a match if we have at least one key/value pair that matches, # and no key/value pairs that do not match matches = [v.match(d[k]) for k, v in self.MATCH.items() if k in d] result = len(matches) > 0 and all(matches) logger.debug('Matching %r with %r result: %r', self, self.job, result) return result class LegacyHooksPyFilter(FilterBase): FILENAME = os.path.expanduser('~/.urlwatch/lib/hooks.py') def __init__(self, job, state): super().__init__(job, state) self.hooks = None if os.path.exists(self.FILENAME): try: self.hooks = imp.load_source('legacy_hooks', self.FILENAME) except Exception as e: logger.error('Could not load legacy hooks file: %s', e) def match(self): return self.hooks is not None def filter(self, data, subfilter=None): try: result = self.hooks.filter(self.job.get_location(), data) if result is None: result = data return result except Exception as e: logger.warn('Could not apply legacy hooks filter: %s', e) return data class Html2TextFilter(FilterBase): """Convert HTML to plaintext""" __kind__ = 'html2text' def filter(self, data, subfilter=None): if subfilter is None: method = 're' options = {} elif isinstance(subfilter, dict): method = subfilter.pop('method') options = subfilter elif isinstance(subfilter, str): method = subfilter options = {} from .html2txt import html2text return html2text(data, method=method, options=options) class Ical2TextFilter(FilterBase): """Convert iCalendar to plaintext""" __kind__ = 'ical2text' def filter(self, data, subfilter=None): self._no_subfilters(subfilter) from .ical2txt import ical2text return ical2text(data) class GrepFilter(FilterBase): """Filter only lines matching a regular expression""" __kind__ = 'grep' def filter(self, data, subfilter=None): if subfilter is None: raise ValueError('The grep filter needs a regular expression') return '\n'.join(line for line in data.splitlines() if re.search(subfilter, line) is not None) class InverseGrepFilter(FilterBase): """Filter which removes lines matching a regular expression""" __kind__ = 'grepi' def filter(self, data, subfilter=None): if subfilter is None: raise ValueError('The inverse grep filter needs a regular expression') return '\n'.join(line for line in data.splitlines() if re.search(subfilter, line) is None) class StripFilter(FilterBase): """Strip leading and trailing whitespace""" __kind__ = 'strip' def filter(self, data, subfilter=None): self._no_subfilters(subfilter) return data.strip() class FilterBy(Enum): ATTRIBUTE = 1 TAG = 2 class ElementsBy(html.parser.HTMLParser): def __init__(self, filter_by, name, value=None): super().__init__() self._filter_by = filter_by if self._filter_by == FilterBy.ATTRIBUTE: self._attributes = {name: value} else: self._name = name self._result = [] self._inside = False self._elts = [] def get_html(self): return ''.join(self._result) def handle_starttag(self, tag, attrs): ad = dict(attrs) if self._filter_by == FilterBy.ATTRIBUTE and all(ad.get(k, None) == v for k, v in self._attributes.items()): self._inside = True elif self._filter_by == FilterBy.TAG and tag == self._name: self._inside = True if self._inside: self._result.append('<%s%s%s>' % (tag, ' ' if attrs else '', ' '.join('%s="%s"' % (k, v) for k, v in attrs))) self._elts.append(tag) def handle_endtag(self, tag): if self._inside: self._result.append('' % (tag,)) if tag in self._elts: t = self._elts.pop() while t != tag and self._elts: t = self._elts.pop() if not self._elts: self._inside = False def handle_data(self, data): if self._inside: self._result.append(data) class GetElementById(FilterBase): """Get an HTML element by its ID""" __kind__ = 'element-by-id' def filter(self, data, subfilter=None): if subfilter is None: raise ValueError('Need an element ID for filtering') element_by_id = ElementsBy(FilterBy.ATTRIBUTE, 'id', subfilter) element_by_id.feed(data) return element_by_id.get_html() class GetElementByClass(FilterBase): """Get all HTML elements by class""" __kind__ = 'element-by-class' def filter(self, data, subfilter=None): if subfilter is None: raise ValueError('Need an element class for filtering') element_by_class = ElementsBy(FilterBy.ATTRIBUTE, 'class', subfilter) element_by_class.feed(data) return element_by_class.get_html() class GetElementByStyle(FilterBase): """Get all HTML elements by style""" __kind__ = 'element-by-style' def filter(self, data, subfilter=None): if subfilter is None: raise ValueError('Need an element style for filtering') element_by_style = ElementsBy(FilterBy.ATTRIBUTE, 'style', subfilter) element_by_style.feed(data) return element_by_style.get_html() class GetElementByTag(FilterBase): """Get an HTML element by its tag""" __kind__ = 'element-by-tag' def filter(self, data, subfilter=None): if subfilter is None: raise ValueError('Need a tag for filtering') element_by_tag = ElementsBy(FilterBy.TAG, subfilter) element_by_tag.feed(data) return element_by_tag.get_html() class Sha1Filter(FilterBase): """Calculate the SHA-1 checksum of the content""" __kind__ = 'sha1sum' def filter(self, data, subfilter=None): self._no_subfilters(subfilter) sha = hashlib.sha1() sha.update(data.encode('utf-8', 'ignore')) return sha.hexdigest() class HexdumpFilter(FilterBase): """Convert binary data to hex dump format""" __kind__ = 'hexdump' def filter(self, data, subfilter=None): self._no_subfilters(subfilter) data = bytearray(data.encode('utf-8', 'ignore')) blocks = [data[i * 16:(i + 1) * 16] for i in range(int((len(data) + (16 - 1)) / 16))] return '\n'.join('%s %s' % (' '.join('%02x' % c for c in block), ''.join((chr(c) if (c > 31 and c < 127) else '.') for c in block)) for block in blocks) urlwatch-2.8/lib/urlwatch/handler.py000066400000000000000000000111221323342436500176040ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import datetime import logging import time import traceback from .filters import FilterBase from .reporters import ReporterBase logger = logging.getLogger(__name__) class JobState(object): def __init__(self, cache_storage, job): self.cache_storage = cache_storage self.job = job self.verb = None self.old_data = None self.new_data = None self.timestamp = None self.exception = None self.traceback = None def load(self): self.old_data, self.timestamp = self.cache_storage.load(self.job, self.job.get_guid()) def save(self): self.cache_storage.save(self.job, self.job.get_guid(), self.new_data, time.time()) def process(self): logger.info('Processing: %s', self.job) try: self.load() data = self.job.retrieve(self) # Apply automatic filters first data = FilterBase.auto_process(self, data) # Apply any specified filters filter_list = self.job.filter if filter_list is not None: if isinstance(filter_list, list): for item in filter_list: key = next(iter(item)) filter_kind, subfilter = key, item[key] data = FilterBase.process(filter_kind, subfilter, self, data) elif isinstance(filter_list, str): for filter_kind in filter_list.split(','): if ':' in filter_kind: filter_kind, subfilter = filter_kind.split(':', 1) else: subfilter = None data = FilterBase.process(filter_kind, subfilter, self, data) self.new_data = data except Exception as e: self.exception = e self.traceback = traceback.format_exc() return self class Report(object): def __init__(self, urlwatch_config): self.config = urlwatch_config.config_storage.config self.job_states = [] self.start = datetime.datetime.now() def _result(self, verb, job_state): if job_state.exception is not None: logger.debug('Got exception while processing %r: %s', job_state.job, job_state.exception) job_state.verb = verb self.job_states.append(job_state) def new(self, job_state): self._result('new', job_state) def changed(self, job_state): self._result('changed', job_state) def unchanged(self, job_state): self._result('unchanged', job_state) def error(self, job_state): self._result('error', job_state) def get_filtered_job_states(self, job_states): for job_state in job_states: if not any(job_state.verb == verb and not self.config['display'][verb] for verb in ('unchanged', 'new', 'error')): yield job_state def finish(self): end = datetime.datetime.now() duration = (end - self.start) ReporterBase.submit_all(self, self.job_states, duration) urlwatch-2.8/lib/urlwatch/html2txt.py000066400000000000000000000115771323342436500177730ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import re import os import subprocess import logging logger = logging.getLogger(__name__) def html2text(data, method='lynx', options=None): """ Convert a string consisting of HTML to plain text for easy difference checking. Method may be one of: 'lynx' (default) - Use "lynx -dump" for conversion options: see "lynx -help" output for options that work with "-dump" 'html2text' - Use "html2text -nobs" for conversion options: https://linux.die.net/man/1/html2text 'bs4' - Use Beautiful Soup library to prettify the HTML options: "parser" only, bs4 supports "lxml", "html5lib", and "html.parser" http://beautiful-soup-4.readthedocs.io/en/latest/#specifying-the-parser-to-use 're' - A simple regex-based HTML tag stripper 'pyhtml2text' - Use Python module "html2text" options: https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options """ if options is None: options = {} if method == 're': stripped_tags = re.sub(r'<[^>]*>', '', data) d = '\n'.join((l.rstrip() for l in stripped_tags.splitlines() if l.strip() != '')) return d if method == 'pyhtml2text': import html2text parser = html2text.HTML2Text() for k, v in options.items(): setattr(parser, k.lower(), v) d = parser.handle(data) return d if method == 'bs4': from bs4 import BeautifulSoup parser = options.pop('parser', 'html.parser') soup = BeautifulSoup(data, parser) d = soup.prettify() return d if method == 'lynx': cmd = ['lynx', '-nonumbers', '-dump', '-stdin', '-assume_charset UTF-8', '-display_charset UTF-8'] elif method == 'html2text': cmd = ['html2text', '-nobs', '-utf8'] else: raise ValueError('Unknown html2text method: %r' % (method,)) stdout_encoding = 'utf-8' for k, v in options.items(): cmd.append('-%s %s' % (k, v) if v is True else '-%s' % k) logger.debug('Command: %r, stdout encoding: %s', cmd, stdout_encoding) env = {} env.update(os.environ) env['LANG'] = 'en_US.utf-8' env['LC_ALL'] = 'en_US.utf-8' html2text = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, env=env) stdout, stderr = html2text.communicate(data.encode('utf-8')) stdout = stdout.decode(stdout_encoding) if method == 'lynx': # Lynx translates relative links in the mode we use it to: # file://localhost/tmp/[RANDOM STRING]/[RELATIVE LINK] # Recent versions of lynx (seen in 2.8.8pre1-1) do not include the # "localhost" in the file:// URLs; see Debian bug 732112 stdout = re.sub(r'file://%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),), '', stdout) # Use the following regular expression to remove the unnecessary # parts, so that [RANDOM STRING] (changing on each call) does not # expose itself as change on the website (it's a Lynx-related thing # Thanks to Evert Meulie for pointing that out stdout = re.sub(r'file://localhost%s/[^/]*/' % (os.environ.get('TMPDIR', '/tmp'),), '', stdout) # Also remove file names like L9816-5928TMP.html stdout = re.sub(r'L\d+-\d+TMP.html', '', stdout) return stdout.strip() urlwatch-2.8/lib/urlwatch/ical2txt.py000066400000000000000000000047211323342436500177300ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. def ical2text(ical_string): import vobject result = [] if isinstance(ical_string, str): parsedCal = vobject.readOne(ical_string) else: try: parsedCal = vobject.readOne(ical_string) except Exception as e: parsedCal = vobject.readOne(ical_string.decode('utf-8', 'ignore')) for event in parsedCal.getChildren(): if event.name == 'VEVENT': if hasattr(event, 'dtstart'): start = event.dtstart.value.strftime('%F %H:%M') else: start = 'unknown start date' if hasattr(event, 'dtend'): end = event.dtend.value.strftime('%F %H:%M') else: end = start if start == end: date_str = start else: date_str = '%s -- %s' % (start, end) result.append('%s: %s' % (date_str, event.summary.value)) return '\n'.join(result) urlwatch-2.8/lib/urlwatch/jobs.py000066400000000000000000000212071323342436500171310ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import email.utils import hashlib import logging import os import re import subprocess import requests import urlwatch from requests.packages.urllib3.exceptions import InsecureRequestWarning from .util import TrackSubClasses requests.packages.urllib3.disable_warnings(InsecureRequestWarning) logger = logging.getLogger(__name__) class ShellError(Exception): """Exception for shell commands with non-zero exit code""" def __init__(self, result): Exception.__init__(self) self.result = result def __str__(self): return '%s: Exit status %d' % (self.__class__.__name__, self.result) class NotModifiedError(Exception): """Exception raised on HTTP 304 responses""" ... class JobBase(object, metaclass=TrackSubClasses): __subclasses__ = {} __required__ = () __optional__ = () def __init__(self, **kwargs): # Set optional keys to None for k in self.__optional__: if k not in kwargs: setattr(self, k, None) # Fail if any required keys are not provided for k in self.__required__: if k not in kwargs: raise ValueError('Required field %s missing: %r' % (k, kwargs)) for k, v in list(kwargs.items()): setattr(self, k, v) @classmethod def job_documentation(cls): result = [] for sc in TrackSubClasses.sorted_by_kind(cls): result.extend(( ' * %s - %s' % (sc.__kind__, sc.__doc__), ' Required keys: %s' % (', '.join(sc.__required__),), ' Optional keys: %s' % (', '.join(sc.__optional__),), '', )) return '\n'.join(result) def get_location(self): raise NotImplementedError() def pretty_name(self): raise NotImplementedError() def serialize(self): d = {'kind': self.__kind__} d.update(self.to_dict()) return d @classmethod def unserialize(cls, data): if 'kind' not in data: # Try to auto-detect the kind of job based on the available keys kinds = [subclass.__kind__ for subclass in list(cls.__subclasses__.values()) if all(required in data for required in subclass.__required__) and not any(key not in subclass.__required__ and key not in subclass.__optional__ for key in data)] if len(kinds) == 1: kind = kinds[0] elif len(kinds) == 0: raise ValueError('Kind is not specified, and no job matches: %r' % (data,)) else: raise ValueError('Multiple kinds of jobs match %r: %r' % (data, kinds)) else: kind = data['kind'] return cls.__subclasses__[kind].from_dict(data) def to_dict(self): return {k: getattr(self, k) for keys in (self.__required__, self.__optional__) for k in keys if getattr(self, k) is not None} @classmethod def from_dict(cls, data): return cls(**{k: v for k, v in list(data.items()) if k in cls.__required__ or k in cls.__optional__}) def __repr__(self): return '<%s %s>' % (self.__kind__, ' '.join('%s=%r' % (k, v) for k, v in list(self.to_dict().items()))) def get_guid(self): location = self.get_location() sha_hash = hashlib.new('sha1') sha_hash.update(location.encode('utf-8')) return sha_hash.hexdigest() def retrieve(self, job_state): raise NotImplementedError() class Job(JobBase): __required__ = () __optional__ = ('name', 'filter') def pretty_name(self): return self.name if self.name else self.get_location() class ShellJob(Job): """Run a shell command and get its standard output""" __kind__ = 'shell' __required__ = ('command',) __optional__ = () def get_location(self): return self.command def retrieve(self, job_state): process = subprocess.Popen(self.command, stdout=subprocess.PIPE, shell=True) stdout_data, stderr_data = process.communicate() result = process.wait() if result != 0: raise ShellError(result) return stdout_data.decode('utf-8') class UrlJob(Job): """Retrieve an URL from a web server""" __kind__ = 'url' __required__ = ('url',) __optional__ = ('cookies', 'data', 'method', 'ssl_no_verify', 'ignore_cached', 'http_proxy', 'https_proxy') CHARSET_RE = re.compile('text/(html|plain); charset=([^;]*)') def get_location(self): return self.url def retrieve(self, job_state): headers = { 'User-agent': urlwatch.__user_agent__, } proxies = { 'http': os.getenv('HTTP_PROXY'), 'https': os.getenv('HTTPS_PROXY'), } if job_state.timestamp is not None: headers['If-Modified-Since'] = email.utils.formatdate(job_state.timestamp) if self.ignore_cached: headers['If-Modified-Since'] = email.utils.formatdate(0) headers['Cache-Control'] = 'max-age=172800' headers['Expires'] = email.utils.formatdate() if self.method is None: self.method = "GET" if self.data is not None: self.method = "POST" headers['Content-type'] = 'application/x-www-form-urlencoded' logger.info('Sending POST request to %s', self.url) if self.http_proxy is not None: proxies['http'] = self.http_proxy if self.https_proxy is not None: proxies['https'] = self.https_proxy file_scheme = 'file://' if self.url.startswith(file_scheme): logger.info('Using local filesystem (%s URI scheme)', file_scheme) return open(self.url[len(file_scheme):], 'rt').read() response = requests.request(url=self.url, data=self.data, headers=headers, method=self.method, verify=(not self.ssl_no_verify), cookies=self.cookies, proxies=proxies) response.raise_for_status() if response.status_code == 304: raise NotModifiedError() # If we can't find the encoding in the headers, requests gets all # old-RFC-y and assumes ISO-8859-1 instead of UTF-8. Use the old # urlwatch behavior and try UTF-8 decoding first. content_type = response.headers.get('Content-type', '') content_type_match = self.CHARSET_RE.match(content_type) if not content_type_match: try: try: try: return response.content.decode('utf-8') except UnicodeDecodeError: return response.content.decode('latin1') except UnicodeDecodeError: return response.content.decode('utf-8', 'ignore') except LookupError: # If this is an invalid encoding, decode as ascii (Debian bug 731931) return response.content.decode('ascii', 'ignore') return response.text urlwatch-2.8/lib/urlwatch/mailer.py000066400000000000000000000103251323342436500174440ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import smtplib import getpass import subprocess import logging try: import keyring except ImportError: keyring = None import email.mime.multipart import email.mime.text import email.utils logger = logging.getLogger(__name__) class Mailer(object): def send(self, msg): raise NotImplementedError def msg_plain(self, from_email, to_email, subject, body): msg = email.mime.text.MIMEText(body, 'plain', 'utf-8') msg['Subject'] = subject msg['From'] = from_email msg['To'] = to_email msg['Date'] = email.utils.formatdate() return msg def msg_html(self, from_email, to_email, subject, body_text, body_html): msg = email.mime.multipart.MIMEMultipart('alternative') msg['Subject'] = subject msg['From'] = from_email msg['To'] = to_email msg['Date'] = email.utils.formatdate() msg.attach(email.mime.text.MIMEText(body_text, 'plain', 'utf-8')) msg.attach(email.mime.text.MIMEText(body_html, 'html', 'utf-8')) return msg class SMTPMailer(Mailer): def __init__(self, smtp_user, smtp_server, smtp_port, tls, auth): self.smtp_server = smtp_server self.smtp_user = smtp_user self.smtp_port = smtp_port self.tls = tls self.auth = auth def send(self, msg): s = smtplib.SMTP(self.smtp_server, self.smtp_port) s.ehlo() if self.tls: s.starttls() if self.auth and keyring is not None: passwd = keyring.get_password(self.smtp_server, self.smtp_user) if passwd is None: raise ValueError('No password available in keyring for {}, {}'.format(self.smtp_server, self.smtp_user)) s.login(self.smtp_user, passwd) s.sendmail(msg['From'], [msg['To']], msg.as_string()) s.quit() class SendmailMailer(Mailer): def __init__(self, sendmail_path): self.sendmail_path = sendmail_path def send(self, msg): p = subprocess.Popen([self.sendmail_path, '-t', '-oi'], stdin=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) result = p.communicate(msg.as_string()) if p.returncode: logger.error('Sendmail failed with {result}'.format(result=result)) def set_password(smtp_server, from_email): ''' Set the keyring password for the mail connection. Interactive.''' if keyring is None: raise ImportError('keyring module missing - service unsupported') password = getpass.getpass(prompt='Enter password for {} using {}: '.format(from_email, smtp_server)) keyring.set_password(smtp_server, from_email, password) urlwatch-2.8/lib/urlwatch/main.py000066400000000000000000000071251323342436500171230ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import imp import logging import os from .handler import Report from .worker import run_jobs logger = logging.getLogger(__name__) class Urlwatch(object): def __init__(self, urlwatch_config, config_storage, cache_storage, urls_storage): self.urlwatch_config = urlwatch_config logger.info('Using %s as URLs file', self.urlwatch_config.urls) logger.info('Using %s for hooks', self.urlwatch_config.hooks) logger.info('Using %s as cache directory', self.urlwatch_config.cache) self.config_storage = config_storage self.cache_storage = cache_storage self.urls_storage = urls_storage self.report = Report(self) self.jobs = None self.check_directories() if hasattr(self.urlwatch_config, 'migrate_urls'): self.urlwatch_config.migrate_urls(self) self.load_hooks() self.load_jobs() if hasattr(self.urlwatch_config, 'migrate_urls'): self.urlwatch_config.migrate_cache(self) def check_directories(self): if not os.path.isdir(self.urlwatch_config.urlwatch_dir): os.makedirs(self.urlwatch_config.urlwatch_dir) if not os.path.exists(self.urlwatch_config.config): self.config_storage.write_default_config(self.urlwatch_config.config) print(""" A default config has been written to {config_yaml}. Use "{pkgname} --edit-config" to customize it. """.format(config_yaml=self.urlwatch_config.config, pkgname=self.urlwatch_config.pkgname)) def load_hooks(self): if os.path.exists(self.urlwatch_config.hooks): imp.load_source('hooks', self.urlwatch_config.hooks) def load_jobs(self): if os.path.isfile(self.urlwatch_config.urls): jobs = self.urls_storage.load_secure() logger.info('Found {0} jobs'.format(len(jobs))) else: logger.warn('No jobs file found') jobs = [] self.jobs = jobs def run_jobs(self): run_jobs(self) def close(self): self.report.finish() self.cache_storage.close() urlwatch-2.8/lib/urlwatch/migration.py000066400000000000000000000064311323342436500201670ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import logging import os.path import sys from .util import atomic_rename from .storage import UrlsYaml, UrlsTxt, CacheDirStorage logger = logging.getLogger(__name__) def migrate_urls(urlwatcher): # Migrate urlwatch 1.x URLs to urlwatch 2.x urlwatch_config = urlwatcher.urlwatch_config pkgname = urlwatch_config.pkgname urls = urlwatch_config.urls urls_txt = os.path.join(urlwatch_config.urlwatch_dir, 'urls.txt') edit = urlwatch_config.edit add = urlwatch_config.add if os.path.isfile(urls_txt) and not os.path.isfile(urls): print(""" Migrating URLs: {urls_txt} -> {urls_yaml} Use "{pkgname} --edit" to customize it. """.format(urls_txt=urls_txt, urls_yaml=urls, pkgname=pkgname)) UrlsYaml(urls).save(UrlsTxt(urls_txt).load_secure()) atomic_rename(urls_txt, urls_txt + '.migrated') if not os.path.isfile(urls) and not edit and not add: print(""" You need to create {urls_yaml} in order to use {pkgname}. Use "{pkgname} --edit" to open the file with your editor. """.format(urls_yaml=urls, pkgname=pkgname)) sys.exit(1) def migrate_cache(urlwatcher): # Migrate urlwatch 1.x cache to urlwatch 2.x urlwatch_config = urlwatcher.urlwatch_config cache = urlwatch_config.cache cache_dir = os.path.join(urlwatch_config.urlwatch_dir, 'cache') if not os.path.isfile(cache) and os.path.isdir(cache_dir): print(""" Migrating cache: {cache_dir} -> {cache_db} """.format(cache_dir=cache_dir, cache_db=cache)) old_cache_storage = CacheDirStorage(cache_dir) urlwatcher.cache_storage.restore(old_cache_storage.backup()) urlwatcher.cache_storage.gc([job.get_guid() for job in urlwatcher.jobs]) atomic_rename(cache_dir, cache_dir + '.migrated') urlwatch-2.8/lib/urlwatch/reporters.py000066400000000000000000000440111323342436500202170ustar00rootroot00000000000000# # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import difflib import email.utils import itertools import logging import sys import time import cgi import requests import urlwatch from .mailer import SMTPMailer from .mailer import SendmailMailer from .util import TrackSubClasses try: import chump except ImportError: chump = None try: from pushbullet import Pushbullet except ImportError: Pushbullet = None logger = logging.getLogger(__name__) class ReporterBase(object, metaclass=TrackSubClasses): __subclasses__ = {} def __init__(self, report, config, job_states, duration): self.report = report self.config = config self.job_states = job_states self.duration = duration def convert(self, othercls): if hasattr(othercls, '__kind__'): config = self.report.config['report'][othercls.__kind__] else: config = {} return othercls(self.report, config, self.job_states, self.duration) @classmethod def reporter_documentation(cls): result = [] for sc in TrackSubClasses.sorted_by_kind(cls): result.extend(( ' * %s - %s' % (sc.__kind__, sc.__doc__), )) return '\n'.join(result) @classmethod def submit_all(cls, report, job_states, duration): any_enabled = False for name, subclass in cls.__subclasses__.items(): cfg = report.config['report'].get(name, {'enabled': False}) if cfg['enabled']: any_enabled = True logger.info('Submitting with %s (%r)', name, subclass) subclass(report, cfg, job_states, duration).submit() if not any_enabled: logger.warn('No reporters enabled.') def submit(self): raise NotImplementedError() def unified_diff(self, job_state): timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) timestamp_new = email.utils.formatdate(time.time(), localtime=1) return ''.join(difflib.unified_diff([l + '\n' for l in job_state.old_data.splitlines()], [l + '\n' for l in job_state.new_data.splitlines()], '@', '@', timestamp_old, timestamp_new)) class SafeHtml(object): def __init__(self, s): self.s = s def __str__(self): return self.s def format(self, *args, **kwargs): return str(self).format(*(cgi.escape(str(arg)) for arg in args), **{k: cgi.escape(str(v)) for k, v in kwargs.items()}) class HtmlReporter(ReporterBase): def submit(self): yield from (str(part) for part in self._parts()) def _parts(self): cfg = self.report.config['report']['html'] yield SafeHtml(""" urlwatch """) for job_state in self.report.get_filtered_job_states(self.job_states): job = job_state.job if job.__kind__ == 'url': title = '{pretty_name}' elif job.pretty_name() != job.get_location(): title = '{pretty_name}' else: title = '{location}' title = '

{verb}: ' + title + '

' yield SafeHtml(title).format(verb=job_state.verb, location=job.get_location(), pretty_name=job.pretty_name()) content = self._format_content(job_state, cfg['diff']) if content is not None: yield content yield SafeHtml('
') yield SafeHtml("""
{pkgname} {version}, {copyright}
Website: {url}
watched {count} URLs in {duration} seconds
""").format(pkgname=urlwatch.pkgname, version=urlwatch.__version__, copyright=urlwatch.__copyright__, url=urlwatch.__url__, count=len(self.job_states), duration=self.duration.seconds) def _diff_to_html(self, unified_diff): for line in unified_diff.splitlines(): if line.startswith('+'): yield SafeHtml('{line}').format(line=line) elif line.startswith('-'): yield SafeHtml('{line}').format(line=line) else: yield SafeHtml('{line}').format(line=line) def _format_content(self, job_state, difftype): if job_state.verb == 'error': return SafeHtml('
{error}
').format(error=job_state.traceback.strip()) if job_state.verb == 'unchanged': return SafeHtml('
{old_data}
').format(old_data=job_state.old_data) if job_state.old_data in (None, job_state.new_data): return SafeHtml('...') if difftype == 'table': timestamp_old = email.utils.formatdate(job_state.timestamp, localtime=1) timestamp_new = email.utils.formatdate(time.time(), localtime=1) html_diff = difflib.HtmlDiff() return SafeHtml(html_diff.make_table(job_state.old_data.splitlines(1), job_state.new_data.splitlines(1), timestamp_old, timestamp_new, True, 3)) elif difftype == 'unified': return ''.join(( '
',
                '\n'.join(self._diff_to_html(self.unified_diff(job_state))),
                '
', )) else: raise ValueError('Diff style not supported: %r' % (difftype,)) class TextReporter(ReporterBase): def submit(self): cfg = self.report.config['report']['text'] line_length = cfg['line_length'] show_details = cfg['details'] show_footer = cfg['footer'] summary = [] details = [] for job_state in self.report.get_filtered_job_states(self.job_states): summary_part, details_part = self._format_output(job_state, line_length) summary.extend(summary_part) details.extend(details_part) if summary: sep = line_length * '=' yield from itertools.chain( (sep,), ('%02d. %s' % (idx + 1, line) for idx, line in enumerate(summary)), (sep, ''), ) if show_details: yield from details if summary and show_footer: yield from ('-- ', '%s %s, %s' % (urlwatch.pkgname, urlwatch.__version__, urlwatch.__copyright__), 'Website: %s' % (urlwatch.__url__,), 'watched %d URLs in %d seconds' % (len(self.job_states), self.duration.seconds)) def _format_content(self, job_state): if job_state.verb == 'error': return job_state.traceback.strip() if job_state.verb == 'unchanged': return job_state.old_data if job_state.old_data in (None, job_state.new_data): return None return self.unified_diff(job_state) def _format_output(self, job_state, line_length): summary_part = [] details_part = [] pretty_name = job_state.job.pretty_name() location = job_state.job.get_location() if pretty_name != location: location = '%s (%s)' % (pretty_name, location) pretty_summary = ': '.join((job_state.verb.upper(), pretty_name)) summary = ': '.join((job_state.verb.upper(), location)) content = self._format_content(job_state) summary_part.append(pretty_summary) sep = line_length * '-' details_part.extend((sep, summary, sep)) if content is not None: details_part.extend((content, sep)) details_part.extend(('', '')) return summary_part, details_part class StdoutReporter(TextReporter): """Print summary on stdout (the console)""" __kind__ = 'stdout' def _incolor(self, color_id, s): if sys.stdout.isatty() and self.config.get('color', False): return '\033[9%dm%s\033[0m' % (color_id, s) return s def _red(self, s): return self._incolor(1, s) def _green(self, s): return self._incolor(2, s) def _yellow(self, s): return self._incolor(3, s) def _blue(self, s): return self._incolor(4, s) def submit(self): cfg = self.report.config['report']['text'] line_length = cfg['line_length'] separators = (line_length * '=', line_length * '-', '-- ') body = '\n'.join(super().submit()) for line in body.splitlines(): # FIXME: This isn't ideal, but works for now... if line in separators: print(line) elif line.startswith('+'): print(self._green(line)) elif line.startswith('-'): print(self._red(line)) elif any(line.startswith(prefix) for prefix in ('NEW:', 'CHANGED:', 'UNCHANGED:', 'ERROR:')): first, second = line.split(' ', 1) if line.startswith('ERROR:'): print(first, self._red(second)) else: print(first, self._blue(second)) else: print(line) class EMailReporter(TextReporter): """Send summary via e-mail / SMTP""" __kind__ = 'email' def submit(self): filtered_job_states = list(self.report.get_filtered_job_states(self.job_states)) subject_args = { 'count': len(filtered_job_states), 'jobs': ', '.join(job_state.job.pretty_name() for job_state in filtered_job_states), } subject = self.config['subject'].format(**subject_args) body_text = '\n'.join(super().submit()) if not body_text: logger.debug('Not sending e-mail (no changes)') return if self.config['method'] == "smtp": smtp_user = self.config['smtp'].get('user', self.config['from']) mailer = SMTPMailer(smtp_user, self.config['smtp']['host'], self.config['smtp']['port'], self.config['smtp']['starttls'], self.config['smtp']['keyring']) elif self.config['method'] == "sendmail": mailer = SendmailMailer(self.config['sendmail']['path']) else: logger.error('Invalid entry for method {method}'.format(method=self.config['method'])) # TODO set_password(options.email_smtp, options.email_from) if self.config['html']: body_html = '\n'.join(self.convert(HtmlReporter).submit()) msg = mailer.msg_html(self.config['from'], self.config['to'], subject, body_text, body_html) else: msg = mailer.msg_plain(self.config['from'], self.config['to'], subject, body_text) mailer.send(msg) class WebServiceReporter(TextReporter): MAX_LENGTH = 1024 def web_service_get(self): raise NotImplementedError def web_service_submit(self, service, title, body): raise NotImplementedError def submit(self): body_text = '\n'.join(super().submit()) if not body_text: logger.debug('Not sending %s (no changes)', self.__kind__) return if len(body_text) > self.MAX_LENGTH: body_text = body_text[:self.MAX_LENGTH] try: service = self.web_service_get() except Exception as e: logger.error('Failed to load or connect to %s - are the dependencies installed and configured?', self.__kind__, exc_info=True) return self.web_service_submit(service, 'Website Change Detected', body_text) class PushoverReport(WebServiceReporter): """Send summary via pushover.net""" __kind__ = 'pushover' def web_service_get(self): app = chump.Application(self.config['app']) return app.get_user(self.config['user']) def web_service_submit(self, service, title, body): msg = service.create_message(title=title, message=body, html=True, sound='spacealarm') msg.send() class PushbulletReport(WebServiceReporter): """Send summary via pushbullet.com""" __kind__ = 'pushbullet' def web_service_get(self): return Pushbullet(self.config['api_key']) def web_service_submit(self, service, title, body): service.push_note(title, body) class MailGunReporter(TextReporter): """Custom email reporter that use mailgun service""" __kind__ = 'mailgun' def submit(self): domain = self.config['domain'] api_key = self.config['api_key'] from_name = self.config['from_name'] from_mail = self.config['from_mail'] to = self.config['to'] filtered_job_states = list(self.report.get_filtered_job_states(self.job_states)) subject_args = { 'count': len(filtered_job_states), 'jobs': ', '.join(job_state.job.pretty_name() for job_state in filtered_job_states), } subject = self.config['subject'].format(**subject_args) body_text = '\n'.join(super().submit()) body_html = '\n'.join(self.convert(HtmlReporter).submit()) if not body_text: logger.debug('Not calling mailgun API (no changes)') return logger.debug("Sending mailgun request for domain:'{0}'".format(domain)) result = requests.post( "https://api.mailgun.net/v3/{0}/messages".format(domain), auth=("api", api_key), data={"from": "{0} <{1}>".format(from_name, from_mail), "to": to, "subject": subject, "text": body_text, "html": body_html}) try: json_res = result.json() if (result.status_code == 200): logger.info("Mailgun response: id '{0}'. {1}".format(json_res['id'], json_res['message'])) else: logger.error("Mailgun error: {0}".format(json_res['message'])) except ValueError: logger.error( "Failed to parse Mailgun response. HTTP status code: {0}, content: {1}".format(result.status_code, result.content)) return result class TelegramReporter(TextReporter): """Custom Telegram reporter""" MAX_LENGTH = 4096 __kind__ = 'telegram' def submit(self): bot_token = self.config['bot_token'] chat_id = self.config['chat_id'] text = '\n'.join(super().submit()) if not text: logger.debug('Not calling telegram API (no changes)') return result = None for chunk in self.chunkstring(text, self.MAX_LENGTH): result = self.submitToTelegram(bot_token, chat_id, chunk) return result def submitToTelegram(self, bot_token, chat_id, text): logger.debug("Sending telegram request to chat id:'{0}'".format(chat_id)) result = requests.post( "https://api.telegram.org/bot{0}/sendMessage".format(bot_token), data={"chat_id": chat_id, "text": text, "disable_web_page_preview": "true"}) try: json_res = result.json() if (result.status_code == 200): logger.info("Telegram response: ok '{0}'. {1}".format(json_res['ok'], json_res['result'])) else: logger.error("Telegram error: {0}".format(json_res['description'])) except ValueError: logger.error( "Failed to parse telegram response. HTTP status code: {0}, content: {1}".format(result.status_code, result.content)) return result def chunkstring(self, string, length): return (string[0 + i:length + i] for i in range(0, len(string), length)) urlwatch-2.8/lib/urlwatch/storage.py000066400000000000000000000357751323342436500176570ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os import stat import copy import platform from abc import ABCMeta, abstractmethod import shutil import subprocess import shlex import yaml import json import minidb import logging from .util import atomic_rename from .jobs import JobBase, UrlJob, ShellJob logger = logging.getLogger(__name__) DEFAULT_CONFIG = { 'display': { 'new': True, 'error': True, 'unchanged': False, }, 'report': { 'text': { 'line_length': 75, 'details': True, 'footer': True, }, 'html': { 'diff': 'unified', # "unified" or "table" }, 'stdout': { 'enabled': True, 'color': True, }, 'email': { 'enabled': False, 'html': False, 'to': '', 'from': '', 'subject': '{count} changes: {jobs}', 'method': 'smtp', 'smtp': { 'host': 'localhost', 'port': 25, 'starttls': True, 'keyring': True, }, 'sendmail': { 'path': 'sendmail', } }, 'pushover': { 'enabled': False, 'app': '', 'user': '', }, 'pushbullet': { 'enabled': False, 'api_key': '', }, 'mailgun': { 'enabled': False, 'api_key': '', 'domain': '', 'from_mail': '', 'from_name': '', 'to': '', 'subject': '{count} changes: {jobs}' }, }, } def merge(source, destination): # http://stackoverflow.com/a/20666342 for key, value in source.items(): if isinstance(value, dict): # get node or create one node = destination.setdefault(key, {}) merge(value, node) else: destination[key] = value return destination def get_current_user(): try: return os.getlogin() except OSError: # If there is no controlling terminal, because urlwatch is launched by # cron, or by a systemd.service for example, os.getlogin() fails with: # OSError: [Errno 25] Inappropriate ioctl for device import pwd return pwd.getpwuid(os.getuid()).pw_name class BaseStorage(metaclass=ABCMeta): @abstractmethod def load(self, *args): ... @abstractmethod def save(self, *args): ... class BaseFileStorage(BaseStorage, metaclass=ABCMeta): def __init__(self, filename): self.filename = filename class BaseTextualFileStorage(BaseFileStorage, metaclass=ABCMeta): def __init__(self, filename): super().__init__(filename) self.config = {} self.load() @classmethod @abstractmethod def parse(cls, *args): ... def edit(self, example_file=None): editor = os.environ.get('EDITOR', None) if editor is None: editor = os.environ.get('VISUAL', None) if editor is None: print('Please set $VISUAL or $EDITOR.') return 1 fn_base, fn_ext = os.path.splitext(self.filename) file_edit = fn_base + '.edit' + fn_ext if os.path.exists(self.filename): shutil.copy(self.filename, file_edit) elif example_file is not None and os.path.exists(example_file): shutil.copy(example_file, file_edit) while True: try: editor = shlex.split(editor) editor.append(file_edit) subprocess.check_call(editor) # Check if we can still parse it if self.parse is not None: self.parse(file_edit) break # stop if no exception on parser except Exception as e: print('Parsing failed:') print('======') print(e) print('======') print('') print('The file', file_edit, 'was NOT updated.') user_input = input("Do you want to retry the same edit? (y/n)") if user_input.lower()[0] == 'y': continue print('Your changes have been saved in', file_edit) return 1 atomic_rename(file_edit, self.filename) print('Saving edit changes in', self.filename) return 0 @classmethod def write_default_config(cls, filename): config_storage = cls(None) config_storage.filename = filename config_storage.save() class UrlsBaseFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): def __init__(self, filename): self.filename = filename def shelljob_security_checks(self): if platform.system() == 'Windows': return [] shelljob_errors = [] current_uid = os.getuid() dirname = os.path.dirname(self.filename) or '.' dir_st = os.stat(dirname) if (dir_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0: shelljob_errors.append('%s is group/world-writable' % dirname) if dir_st.st_uid != current_uid: shelljob_errors.append('%s not owned by %s' % (dirname, get_current_user())) file_st = os.stat(self.filename) if (file_st.st_mode & (stat.S_IWGRP | stat.S_IWOTH)) != 0: shelljob_errors.append('%s is group/world-writable' % self.filename) if file_st.st_uid != current_uid: shelljob_errors.append('%s not owned by %s' % (self.filename, get_current_user())) return shelljob_errors def load_secure(self): jobs = self.load() # Security checks for shell jobs - only execute if the current UID # is the same as the file/directory owner and only owner can write shelljob_errors = self.shelljob_security_checks() if shelljob_errors and any(isinstance(job, ShellJob) for job in jobs): print(('Removing shell jobs, because %s' % (' and '.join(shelljob_errors),))) jobs = [job for job in jobs if not isinstance(job, ShellJob)] return jobs class BaseTxtFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): @classmethod def parse(cls, *args): filename = args[0] if filename is not None and os.path.exists(filename): with open(filename) as fp: for line in fp: line = line.strip() if not line or line.startswith('#'): continue if line.startswith('|'): yield ShellJob(command=line[1:]) else: args = line.split(None, 2) if len(args) == 1: yield UrlJob(url=args[0]) elif len(args) == 2: yield UrlJob(url=args[0], post=args[1]) else: raise ValueError('Unsupported line format: %r' % (line,)) class BaseYamlFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): @classmethod def parse(cls, *args): filename = args[0] if filename is not None and os.path.exists(filename): with open(filename) as fp: return yaml.load(fp) class BaseJsonFileStorage(BaseTextualFileStorage, metaclass=ABCMeta): @classmethod def parse(cls, *args): filename = args[0] if filename is not None and os.path.exists(filename): with open(filename) as fp: return json.load(fp) class YamlConfigStorage(BaseYamlFileStorage): def load(self, *args): self.config = merge(self.parse(self.filename) or {}, copy.deepcopy(DEFAULT_CONFIG)) def save(self, *args): with open(self.filename, 'w') as fp: yaml.dump(self.config, fp, default_flow_style=False) class JsonConfigStorage(BaseJsonFileStorage): def load(self, *args): self.config = merge(self.parse(self.filename) or {}, copy.deepcopy(DEFAULT_CONFIG)) def save(self, *args): with open(self.filename, 'w') as fp: json.dump(self.config, fp, default_flow_style=False) class UrlsYaml(BaseYamlFileStorage, UrlsBaseFileStorage): @classmethod def parse(cls, *args): filename = args[0] if filename is not None and os.path.exists(filename): with open(filename) as fp: return yaml.load_all(fp) def save(self, *args): jobs = args[0] print('Saving updated list to %r' % self.filename) with open(self.filename, 'w') as fp: yaml.dump_all([job.serialize() for job in jobs], fp, default_flow_style=False) def load(self, *args): with open(self.filename) as fp: return [JobBase.unserialize(job) for job in yaml.load_all(fp) if job is not None] class UrlsJson(BaseJsonFileStorage, UrlsBaseFileStorage): def save(self, *args): jobs = args[0] print('Saving updated list to %r' % self.filename) with open(self.filename, 'w') as fp: yaml.dump_all([job.serialize() for job in jobs], fp, default_flow_style=False) def load(self, *args): with open(self.filename) as fp: json_data = fp.read() return [JobBase.unserialize(job) for job in json.loads(json_data)['urls'] if job is not None] class UrlsTxt(BaseTxtFileStorage, UrlsBaseFileStorage): def load(self): return list(self.parse(self.filename)) def save(self, jobs): print(jobs) raise NotImplementedError() class CacheStorage(BaseFileStorage, metaclass=ABCMeta): @abstractmethod def close(self): ... @abstractmethod def get_guids(self): ... @abstractmethod def load(self, job, guid): ... @abstractmethod def save(self, job, guid, data, timestamp): ... @abstractmethod def delete(self, guid): ... @abstractmethod def clean(self, guid): ... def backup(self): for guid in self.get_guids(): data, timestamp = self.load(None, guid) yield guid, data, timestamp def restore(self, entries): for guid, data, timestamp in entries: self.save(None, guid, data, timestamp) def gc(self, known_guids): for guid in set(self.get_guids()) - set(known_guids): print('Removing: {guid}'.format(guid=guid)) self.delete(guid) for guid in known_guids: count = self.clean(guid) if count > 0: print('Removed {count} old versions of {guid}'.format(count=count, guid=guid)) class CacheDirStorage(CacheStorage): def __init__(self, filename): super().__init__(filename) if not os.path.exists(filename): os.makedirs(filename) def close(self): # No need to close return 0 def _get_filename(self, guid): return os.path.join(self.filename, guid) def get_guids(self): return os.listdir(self.filename) def load(self, job, guid): filename = self._get_filename(guid) if not os.path.exists(filename): return None, None try: with open(filename) as fp: data = fp.read() except UnicodeDecodeError: with open(filename, 'rb') as fp: data = fp.read().decode('utf-8', 'ignore') timestamp = os.stat(filename)[stat.ST_MTIME] return data, timestamp def save(self, job, guid, data, timestamp): # Timestamp is always ignored filename = self._get_filename(guid) with open(filename, 'w+') as fp: fp.write(data) def delete(self, guid): filename = self._get_filename(guid) if os.path.exists(filename): os.unlink(filename) def clean(self, guid): # We only store the latest version, no need to clean return 0 class CacheEntry(minidb.Model): guid = str timestamp = int data = str class CacheMiniDBStorage(CacheStorage): def __init__(self, filename): super().__init__(filename) dirname = os.path.dirname(filename) if dirname and not os.path.isdir(dirname): os.makedirs(dirname) self.db = minidb.Store(self.filename, debug=True) self.db.register(CacheEntry) def close(self): self.db.close() self.db = None def get_guids(self): return (guid for guid, in CacheEntry.query(self.db, minidb.Function('distinct', CacheEntry.c.guid))) def load(self, job, guid): for data, timestamp in CacheEntry.query(self.db, CacheEntry.c.data // CacheEntry.c.timestamp, order_by=CacheEntry.c.timestamp.desc, where=CacheEntry.c.guid == guid, limit=1): return data, timestamp return None, None def save(self, job, guid, data, timestamp): self.db.save(CacheEntry(guid=guid, timestamp=timestamp, data=data)) self.db.commit() def delete(self, guid): CacheEntry.delete_where(self.db, CacheEntry.c.guid == guid) self.db.commit() def clean(self, guid): keep_id = next((CacheEntry.query(self.db, CacheEntry.c.id, where=CacheEntry.c.guid == guid, order_by=CacheEntry.c.timestamp.desc, limit=1)), (None,))[0] if keep_id is not None: result = CacheEntry.delete_where(self.db, (CacheEntry.c.guid == guid) & (CacheEntry.c.id != keep_id)) self.db.commit() return result return 0 urlwatch-2.8/lib/urlwatch/util.py000066400000000000000000000070141323342436500171510ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import logging import os import platform logger = logging.getLogger(__name__) class TrackSubClasses(type): """A metaclass that stores subclass name-to-class mappings in the base class""" @staticmethod def sorted_by_kind(cls): return [item for _, item in sorted((it.__kind__, it) for it in cls.__subclasses__.values())] def __init__(cls, name, bases, namespace): for base in bases: if base == object: continue for attr in ('__required__', '__optional__'): if not hasattr(base, attr): continue inherited = getattr(base, attr, ()) new_value = tuple(namespace.get(attr, ())) + tuple(inherited) namespace[attr] = new_value setattr(cls, attr, new_value) for base in bases: if base == object: continue if hasattr(cls, '__kind__'): subclasses = getattr(base, '__subclasses__', None) if subclasses is not None: logger.info('Registering %r as %s', cls, cls.__kind__) subclasses[cls.__kind__] = cls break else: anonymous_subclasses = getattr(base, '__anonymous_subclasses__', None) if anonymous_subclasses is not None: logger.info('Registering %r', cls) anonymous_subclasses.append(cls) break super().__init__(name, bases, namespace) def atomic_rename(old_filename, new_filename): if platform.system() == 'Windows' and os.path.exists(new_filename): new_old_filename = new_filename + '.bak' if os.path.exists(new_old_filename): os.remove(new_old_filename) os.rename(new_filename, new_old_filename) os.rename(old_filename, new_filename) if os.path.exists(new_old_filename): os.remove(new_old_filename) else: os.rename(old_filename, new_filename) urlwatch-2.8/lib/urlwatch/worker.py000066400000000000000000000064371323342436500175150ustar00rootroot00000000000000# -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import concurrent.futures import logging import requests from .handler import JobState from .jobs import NotModifiedError logger = logging.getLogger(__name__) MAX_WORKERS = 10 def run_parallel(func, items): executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) for future in concurrent.futures.as_completed(executor.submit(func, item) for item in items): exception = future.exception() if exception is not None: raise exception yield future.result() def run_jobs(urlwatcher): cache_storage = urlwatcher.cache_storage jobs = urlwatcher.jobs report = urlwatcher.report logger.debug('Processing %d jobs', len(jobs)) for job_state in run_parallel(lambda job_state: job_state.process(), (JobState(cache_storage, job) for job in jobs)): logger.debug('Job finished: %s', job_state.job) if job_state.exception is not None: if isinstance(job_state.exception, NotModifiedError): logger.info('Job %s has not changed (HTTP 304)', job_state.job) report.unchanged(job_state) elif isinstance(job_state.exception, requests.exceptions.RequestException): # Instead of a full traceback, just show the HTTP error job_state.traceback = str(job_state.exception) report.error(job_state) else: report.error(job_state) elif job_state.old_data is not None: if job_state.old_data.splitlines() != job_state.new_data.splitlines(): report.changed(job_state) job_state.save() else: report.unchanged(job_state) else: report.new(job_state) job_state.save() urlwatch-2.8/setup.cfg000066400000000000000000000000351323342436500150400ustar00rootroot00000000000000[pep8] max-line-length = 120 urlwatch-2.8/setup.py000066400000000000000000000016301323342436500147330ustar00rootroot00000000000000#!/usr/bin/env python3 from setuptools import setup import os import re main_py = open(os.path.join('lib', 'urlwatch', '__init__.py')).read() m = dict(re.findall("\n__([a-z]+)__ = '([^']+)'", main_py)) docs = re.findall('"""(.*?)"""', main_py, re.DOTALL) m['name'] = 'urlwatch' m['author'], m['author_email'] = re.match(r'(.*) <(.*)>', m['author']).groups() m['description'], m['long_description'] = docs[0].strip().split('\n\n', 1) m['download_url'] = '{url}urlwatch-{version}.tar.gz'.format(**m) m['install_requires'] = ['minidb', 'PyYAML', 'requests', 'keyring', 'pycodestyle', 'appdirs'] m['scripts'] = ['urlwatch'] m['package_dir'] = {'': 'lib'} m['packages'] = ['urlwatch'] m['data_files'] = [ ('share/man/man1', ['share/man/man1/urlwatch.1']), ('share/urlwatch/examples', [ 'share/urlwatch/examples/hooks.py.example', 'share/urlwatch/examples/urls.yaml.example', ]), ] setup(**m) urlwatch-2.8/share/000077500000000000000000000000001323342436500143235ustar00rootroot00000000000000urlwatch-2.8/share/man/000077500000000000000000000000001323342436500150765ustar00rootroot00000000000000urlwatch-2.8/share/man/man1/000077500000000000000000000000001323342436500157325ustar00rootroot00000000000000urlwatch-2.8/share/man/man1/urlwatch.1000066400000000000000000000037271323342436500176560ustar00rootroot00000000000000.TH URLWATCH "1" "January 2016" "urlwatch 2.0" "User Commands" .SH NAME urlwatch \- a tool for monitoring webpages for updates .SH USAGE .B urlwatch [\-h] [\-\-version] [\-v] .IP [\-\-urls FILE] [\-\-config FILE] [\-\-hooks FILE] [\-\-cache FILE] .IP [\-\-list] [\-\-add JOB] [\-\-delete JOB] .IP [\-\-edit] [\-\-edit\-config] [\-\-edit\-hooks] .IP [\-\-features] [\-\-gc\-cache] .PP .SH DESCRIPTION .PP urlwatch is intended to help you watch changes in webpages and get notified (via email, in your terminal or with a custom-written reporter class) of any changes. The change notification will include the URL that has changed and a unified diff of what has changed. .SS "optional arguments:" .TP \fB\-h\fR, \fB\-\-help\fR show this help message and exit .TP \fB\-\-version\fR show program's version number and exit .TP \fB\-v\fR, \fB\-\-verbose\fR show debug output .SS "files and directories:" .TP \fB\-\-urls\fR FILE read job list (URLs) from FILE .TP \fB\-\-config\fR FILE read configuration from FILE .TP \fB\-\-hooks\fR FILE use FILE as hooks.py module .TP \fB\-\-cache\fR FILE use FILE as cache database .SS "job list management:" .TP \fB\-\-list\fR list jobs .TP \fB\-\-add\fR JOB add job (key1=value1,key2=value2,...) .TP \fB\-\-delete\fR JOB delete job by location or index .SS "interactive commands ($EDITOR/$VISUAL):" .TP \fB\-\-edit\fR edit URL/job list .TP \fB\-\-edit\-config\fR edit configuration file .TP \fB\-\-edit\-hooks\fR edit hooks script .SS "miscellaneous:" .TP \fB\-\-features\fR list supported jobs/filters/reporters .TP \fB\-\-gc\-cache\fR remove old cache entries .SH "FILES" .TP .B $XDG_CONFIG_HOME/urlwatch/urls.yaml A list of URLs, commands and other jobs to watch .TP .B $XDG_CONFIG_HOME/urlwatch/hooks.py A Python module that can implement new job types, filters and reporters .TP .B $XDG_CONFIG_HOME/urlwatch/cache.db A SQLite 3 database that contains the state history of jobs (for diffing) .SH AUTHOR Thomas Perl .SH WEBSITE http://thp.io/2008/urlwatch/ urlwatch-2.8/share/urlwatch/000077500000000000000000000000001323342436500161545ustar00rootroot00000000000000urlwatch-2.8/share/urlwatch/examples/000077500000000000000000000000001323342436500177725ustar00rootroot00000000000000urlwatch-2.8/share/urlwatch/examples/hooks.py.example000066400000000000000000000076561323342436500231370ustar00rootroot00000000000000# # Example hooks file for urlwatch # # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # import re from urlwatch import filters from urlwatch import jobs from urlwatch import reporters #class CustomLoginJob(jobs.UrlJob): # """Custom login for my webpage""" # # __kind__ = 'custom-login' # __required__ = ('username', 'password') # # def retrieve(self, job_state): # return 'Would log in to {} with {} and {}\n'.format(self.url, self.username, self.password) #class CaseFilter(filters.FilterBase): # """Custom filter for changing case, needs to be selected manually""" # # __kind__ = 'case' # # def filter(self, data, subfilter=None): # # The subfilter is specified using a colon, for example the "case" # # filter here can be specified as "case:upper" and "case:lower" # # if subfilter is None: # subfilter = 'upper' # # if subfilter == 'upper': # return data.upper() # elif subfilter == 'lower': # return data.lower() # else: # raise ValueError('Unknown case subfilter: %r' % (subfilter,)) #class IndentFilter(filters.FilterBase): # """Custom filter for indenting, needs to be selected manually""" # # __kind__ = 'indent' # # def filter(self, data, subfilter=None): # # The subfilter here is a number of characters to indent # # if subfilter is None: # indent = 8 # else: # indent = int(subfilter) # # return '\n'.join((' '*indent) + line for line in data.splitlines()) class CustomMatchUrlFilter(filters.AutoMatchFilter): # The AutoMatchFilter will apply automatically to all filters # that have the given properties set MATCH = {'url': 'http://example.org/'} def filter(self, data): return data.replace('foo', 'bar') class CustomRegexMatchUrlFilter(filters.RegexMatchFilter): # Similar to AutoMatchFilter MATCH = {'url': re.compile('http://example.org/.*')} def filter(self, data): return data.replace('foo', 'bar') class CustomTextFileReporter(reporters.TextReporter): """Custom reporter that writes the text-only report to a file""" __kind__ = 'custom_file' def submit(self): with open(self.config['filename'], 'w') as fp: fp.write('\n'.join(super().submit())) class CustomHtmlFileReporter(reporters.HtmlReporter): """Custom reporter that writes the HTML report to a file""" __kind__ = 'custom_html' def submit(self): with open(self.config['filename'], 'w') as fp: fp.write('\n'.join(super().submit())) urlwatch-2.8/share/urlwatch/examples/urls.yaml.example000066400000000000000000000027201323342436500232760ustar00rootroot00000000000000# This is an example urls.yaml file for urlwatch # A basic URL job just needs a URL name: "urlwatch webpage" url: "https://thp.io/2008/urlwatch/" # You can use a pre-supplied filter for this, here we apply two: # the html2text filter that converts the HTML to plaintext and # the grep filter that filters lines based on a regular expression filter: html2text,grep:Current.*version,strip --- # Built-in job kind "shell" needs a command specified name: "Home Listing" command: "ls -al ~" #--- #name: "Login to some webpage (custom job)" #url: "http://example.org/" # This job kind is defined in hooks.py, so you need to enable it #kind: custom-login # Additional parameters for the custom-login job kind can be specified here #username: "myuser" #password: "secret" # Filters can be specified here, separated by comma (these are also from hooks.py) #filter: case:upper,indent:5 --- # If you want to use spaces in URLs, you have to URL-encode them (e.g. %20) url: "http://example.org/With%20Spaces/" --- # POST requests are done by providing a post parameter url: "http://example.com/search.cgi" data: "button=Search&q=something&category=4" --- # You can use a custom HTTP method, this might be useful for cache invalidation url: "http://example.com/foo" method: "PURGE" --- # You can do POST requests by providing data parameter. # POST data can be a URL-encoded string (see last example) or a dict. url: "http://example.com/search.cgi" data: button: Search q: something category: 4 urlwatch-2.8/test/000077500000000000000000000000001323342436500142005ustar00rootroot00000000000000urlwatch-2.8/test/data/000077500000000000000000000000001323342436500151115ustar00rootroot00000000000000urlwatch-2.8/test/data/urls.json000066400000000000000000000017541323342436500170000ustar00rootroot00000000000000{ "urls": [ { "filter": "html2text,grep:Current.*version,strip", "kind": "url", "name": "urlwatch webpage", "url": "https://thp.io/2008/urlwatch/" }, { "kind": "shell", "name": "Home Listing", "command": "ls -al ~" }, { "kind": "url", "name": "Example.org with spaces", "url": "http://example.org/With%20Spaces/" }, { "kind": "url", "name": "Example.org with POST", "url": "http://example.com/search.cgi", "method": "POST", "data": "button=Search&q=something&category=4" }, { "kind": "url", "name": "Example.org with POST with exploded parameters", "url": "http://example.com/search.cgi", "method": "POST", "data": { "button": "Search", "q": "something", "category": 4 } }, { "kind": "url", "name": "Custom HTTP method", "url": "http://example.com/foo", "method": "PURGE" } ] } urlwatch-2.8/test/data/urls.txt000066400000000000000000000023021323342436500166340ustar00rootroot00000000000000 # This is an example urls.txt file for urlwatch # Empty lines and lines starting with "#" are ignored http://www.dubclub-vienna.com/ http://www.openpandora.org/developers.php #http://www.statistik.tuwien.ac.at/lv-guide/u107.369/info.html #http://www.statistik.tuwien.ac.at/lv-guide/u107.369/blatter.html #http://www.dbai.tuwien.ac.at/education/dbs/current/index.html #http://www.dbai.tuwien.ac.at/education/dbs/current/uebung.html http://ti.tuwien.ac.at/rts/teaching/courses/systems_programming http://ti.tuwien.ac.at/rts/teaching/courses/systems_programming/labor http://ti.tuwien.ac.at/rts/teaching/courses/betriebssysteme #http://www.complang.tuwien.ac.at/anton/lvas/effiziente-programme.html #http://www.complang.tuwien.ac.at/anton/lvas/effizienz-aufgabe08/ http://www.kukuk.at/ical/events http://guckes.net/cal/ # You can use the pipe character to "watch" the output of shell commands |ls -al ~ # If you want to use spaces in URLs, you have to URL-encode them (e.g. %20) http://example.org/With%20Spaces/ # You can do POST requests by writing the POST data behind the URL, # separated by a single space character. POST data is URL-encoded. http://example.com/search.cgi button=Search&q=something&category=4 urlwatch-2.8/test/data/urlwatch.json000066400000000000000000000013331323342436500176350ustar00rootroot00000000000000{ "display": { "error": true, "new": true, "unchanged": false }, "report": { "email": { "enabled": false, "from": "", "html": false, "method": "smtp", "sendmail": { "path": "sendmail" }, "smtp": { "host": "localhost", "keyring": true, "port": 25, "starttls": true }, "subject": "{count} changes: {jobs}", "to": "" }, "html": { "diff": "unified" }, "pushover": { "app": "", "enabled": false, "user": "" }, "stdout": { "color": true, "enabled": true }, "text": { "details": true, "footer": true, "line_length": 75 } } }urlwatch-2.8/test/data/urlwatch.yaml000066400000000000000000000007621323342436500176330ustar00rootroot00000000000000display: error: true new: true unchanged: false report: email: enabled: false from: '' html: false method: smtp sendmail: path: sendmail smtp: host: localhost keyring: true port: 25 starttls: true subject: '{count} changes: {jobs}' to: '' html: diff: unified pushover: app: '' enabled: false user: '' stdout: color: true enabled: true text: details: true footer: true line_length: 75 urlwatch-2.8/test/test_filters.py000066400000000000000000000020241323342436500172570ustar00rootroot00000000000000from urlwatch.filters import GetElementById from urlwatch.filters import GetElementByTag from nose.tools import eq_ def test_get_element_by_id(): get_element_by_id = GetElementById(None, None) result = get_element_by_id.filter("""
asdf bar
asdf bar hoho
""", 'bar') print(result) eq_(result, '
asdf bar hoho
') def test_get_element_by_tag(): get_element_by_tag = GetElementByTag(None, None) result = get_element_by_tag.filter(""" foo """, 'body') print(result) eq_(result, 'foo') def test_get_element_by_tag_nested(): get_element_by_tag = GetElementByTag(None, None) result = get_element_by_tag.filter("""
foo
bar
""", 'div') print(result) eq_(result, """
foo
bar
""") urlwatch-2.8/test/test_handler.py000066400000000000000000000106351323342436500172330ustar00rootroot00000000000000import sys from glob import glob import pycodestyle as pycodestyle from urlwatch.jobs import UrlJob, JobBase, ShellJob from urlwatch.storage import UrlsYaml, UrlsTxt from nose.tools import raises, with_setup import tempfile import os import imp from urlwatch import storage from urlwatch.config import BaseConfig from urlwatch.storage import JsonConfigStorage, YamlConfigStorage, UrlsJson, CacheMiniDBStorage from urlwatch.main import Urlwatch def test_required_classattrs_in_subclasses(): for kind, subclass in JobBase.__subclasses__.items(): assert hasattr(subclass, '__kind__') assert hasattr(subclass, '__required__') assert hasattr(subclass, '__optional__') def test_save_load_jobs(): jobs = [ UrlJob(name='news', url='http://news.orf.at/'), ShellJob(name='list homedir', command='ls ~'), ShellJob(name='list proc', command='ls /proc'), ] with tempfile.NamedTemporaryFile() as tmp: UrlsYaml(tmp.name).save(jobs) jobs2 = UrlsYaml(tmp.name).load() os.chmod(tmp.name, 0o777) jobs3 = UrlsYaml(tmp.name).load_secure() assert len(jobs2) == len(jobs) # Assert that the shell jobs have been removed due to secure loading assert len(jobs3) == 1 def test_load_config_yaml(): config_json = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml') if os.path.exists(config_json): config = YamlConfigStorage(config_json) assert config is not None assert config.config is not None assert config.config == storage.DEFAULT_CONFIG def test_load_config_json(): config_json = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.json') if os.path.exists(config_json): config = JsonConfigStorage(config_json) assert config is not None assert config.config is not None assert config.config == storage.DEFAULT_CONFIG def test_load_urls_txt(): urls_txt = os.path.join(os.path.dirname(__file__), 'data', 'urls.txt') if os.path.exists(urls_txt): assert len(UrlsTxt(urls_txt).load_secure()) > 0 def test_load_urls_json(): urls_txt = os.path.join(os.path.dirname(__file__), 'data', 'urls.json') if os.path.exists(urls_txt): assert len(UrlsJson(urls_txt).load_secure()) > 0 def test_load_urls_yaml(): urls_yaml = 'share/urlwatch/examples/urls.yaml.example' if os.path.exists(urls_yaml): assert len(UrlsYaml(urls_yaml).load_secure()) > 0 def test_load_hooks_py(): hooks_py = 'share/urlwatch/examples/hooks.py.example' if os.path.exists(hooks_py): imp.load_source('hooks', hooks_py) def test_pep8_conformance(): """Test that we conform to PEP-8.""" style = pycodestyle.StyleGuide(ignore=['E501', 'E402']) py_files = [y for x in os.walk(os.path.abspath('.')) for y in glob(os.path.join(x[0], '*.py'))] py_files.append(os.path.abspath('urlwatch')) result = style.check_files(py_files) assert result.total_errors == 0, "Found #{0} code style errors".format(result.total_errors) class TestConfig(BaseConfig): def __init__(self, config, urls, cache, hooks, verbose): (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0]))) super().__init__('urlwatch', os.path.dirname(__file__), config, urls, cache, hooks, verbose) def teardown_func(): "tear down test fixtures" cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db') if os.path.exists(cache): os.remove(cache) @with_setup(teardown=teardown_func) def test_run_watcher(): urls = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'share', 'urlwatch', 'examples', 'urls.yaml.example') config = os.path.join(os.path.dirname(__file__), 'data', 'urlwatch.yaml') cache = os.path.join(os.path.dirname(__file__), 'data', 'cache.db') hooks = '' config_storage = YamlConfigStorage(config) cache_storage = CacheMiniDBStorage(cache) urls_storage = UrlsYaml(urls) urlwatch_config = TestConfig(config, urls, cache, hooks, True) urlwatcher = Urlwatch(urlwatch_config, config_storage, cache_storage, urls_storage) urlwatcher.run_jobs() def test_unserialize_shell_job_without_kind(): job = JobBase.unserialize({ 'name': 'hoho', 'command': 'ls', }) assert isinstance(job, ShellJob) @raises(ValueError) def test_unserialize_with_unknown_key(): JobBase.unserialize({ 'unknown_key': 123, 'name': 'hoho', }) urlwatch-2.8/urlwatch000077500000000000000000000076001323342436500150030ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # This file is part of urlwatch (https://thp.io/2008/urlwatch/). # Copyright (c) 2008-2018 Thomas Perl # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. The name of the author may not be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # File and folder paths import logging import os.path import signal import socket import sys from appdirs import AppDirs pkgname = 'urlwatch' urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname)) if not os.path.exists(urlwatch_dir): urlwatch_dir = AppDirs(pkgname).user_config_dir # Check if we are installed in the system already (prefix, bindir) = os.path.split(os.path.dirname(os.path.abspath(sys.argv[0]))) if bindir != 'bin': sys.path.insert(0, os.path.join(prefix, bindir, 'lib')) from urlwatch.command import UrlwatchCommand from urlwatch.config import CommandConfig from urlwatch.main import Urlwatch from urlwatch.storage import YamlConfigStorage, CacheMiniDBStorage, UrlsYaml # One minute (=60 seconds) timeout for each request to avoid hanging socket.setdefaulttimeout(60) # Ignore SIGPIPE for stdout (see https://github.com/thp/urlwatch/issues/77) try: signal.signal(signal.SIGPIPE, signal.SIG_DFL) except AttributeError: # Windows does not have signal.SIGPIPE ... logger = logging.getLogger(pkgname) CONFIG_FILE = 'urlwatch.yaml' URLS_FILE = 'urls.yaml' CACHE_FILE = 'cache.db' HOOKS_FILE = 'hooks.py' def setup_logger(verbose): if verbose: root_logger = logging.getLogger('') console = logging.StreamHandler() console.setFormatter(logging.Formatter('%(asctime)s %(module)s %(levelname)s: %(message)s')) root_logger.addHandler(console) root_logger.setLevel(logging.DEBUG) root_logger.info('turning on verbose logging mode') if __name__ == '__main__': config_file = os.path.join(urlwatch_dir, CONFIG_FILE) urls_file = os.path.join(urlwatch_dir, URLS_FILE) cache_file = os.path.join(urlwatch_dir, CACHE_FILE) hooks_file = os.path.join(urlwatch_dir, HOOKS_FILE) command_config = CommandConfig(pkgname, urlwatch_dir, bindir, prefix, config_file, urls_file, hooks_file, cache_file, False) setup_logger(command_config.verbose) # setup storage API config_storage = YamlConfigStorage(command_config.config) cache_storage = CacheMiniDBStorage(command_config.cache) urls_storage = UrlsYaml(command_config.urls) # setup urlwatcher urlwatch = Urlwatch(command_config, config_storage, cache_storage, urls_storage) urlwatch_command = UrlwatchCommand(urlwatch) # run urlwatcher urlwatch_command.run()