debiancontributors-0.6/0000755000175000017500000000000012264333700016037 5ustar enricoenrico00000000000000debiancontributors-0.6/debiancontributors/0000755000175000017500000000000012264333700021737 5ustar enricoenrico00000000000000debiancontributors-0.6/debiancontributors/parser.py0000644000175000017500000001734012256052345023616 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source parser from untrusted data # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import datetime import json class Fail(BaseException): """ Exception raised when a validation or lookup fails """ def __init__(self, code, msg): self.code = code self.msg = msg class ClusterFail(Fail): """ Exception raised to report a number of errors of the same kind """ def __init__(self, code, msg, errors): super(ClusterFail, self).__init__(code, msg) self.errors = errors def get_key(d, key): "Get a key from a dict" try: return d[key] except KeyError: raise Fail(400, "Key '{}' not found".format(key)) def get_key_int(d, key): "Get a key from a dict, as an int" try: return int(get_key(d, key)) except ValueError: raise Fail(400, "Key '{}' does not contain an integer value".format(key)) def get_key_string(d, key, empty=False): "Get a key from a dict, as a string" if empty: res = d.get(key, "") if not res: return "" else: res = get_key(d, key) try: res = str(res) except ValueError: raise Fail(400, "Key '{}' does not contain a string value".format(key)) if not empty and not res: raise Fail(400, "Key '{}' contains an empty string".format(key)) return res def get_key_unicode(d, key, empty=False): "Get a key from a dict, as a unicode, decoded from utf8 if necessary" if empty: res = d.get(key, "") if not res: return "" else: res = get_key(d, key) if not res: raise Fail(400, "Key '{}' contains an empty string".format(key)) if isinstance(res, unicode): return res if not isinstance(res, str): raise Fail(400, "Key '{}' does not contain a string value".format(key)) try: return res.decode("utf8") except (UnicodeEncodeError, UnicodeDecodeError): escaped = res.decode(encoding="utf8", errors="replace") raise Fail(400, "Key '{}' contain {} which is not a valid UTF8 string".format(key, escaped)) def get_key_sequence(d, key): "Get a key from a dict, ensuring it is a list or tuple" res = get_key(d, key) if not isinstance(res, (list, tuple)): raise Fail(400, "Key '{}' does not contain an array".format(key)) return res def get_key_sequence_or_object(d, key): """ Get a key from a dict, ensuring it is a list or tuple, allowing singleton lists of objects to be just the object itself """ res = get_key(d, key) if isinstance(res, (list, tuple)): return res elif isinstance(res, dict): return [res] else: raise Fail(400, "Key '{}' does not contain an array or object".format(key)) def get_key_date_or_none(d, key): "Get a key from a dict, as a date, allowing None" res = get_key_string(d, key, empty=True) if not res: return None try: return datetime.datetime.strptime(res, "%Y-%m-%d").date() except ValueError: raise Fail(400, "Key '{}' does not contain a YYYY-MM-DD date".format(key)) def get_json(f, compression=None): """ Parse JSON from data from a file-like object, with optional decompression """ if compression: if compression == "gzip": import gzip try: with gzip.GzipFile(mode="rb", fileobj=f) as fd: try: return json.load(fd) except (ValueError, UnicodeDecodeError): raise Fail(400, "invalid JSON data") except IOError: raise Fail(400, "invalid gzip compressed data") elif compression == "xz": try: import lzma except ImportError: raise Fail(500, "but python-lzma is not installed to decode xz-compressed data") try: return json.loads(lzma.decompress(f.read())) except IOError: raise Fail(400, "invalid xz compressed data") except (ValueError, UnicodeDecodeError): raise Fail(400, "invalid JSON data") else: raise Fail(500, "{} compression is not supported".format(compression)) else: try: return json.load(f) except (ValueError, UnicodeDecodeError): raise Fail(400, "invalid JSON data") class Parser(object): def parse_identifier(self, d): """ Parse a dict as an Identifier """ from .types import Identifier i_type = get_key_string(d, "type") i_id = get_key_unicode(d, "id") i_desc = get_key_unicode(d, "desc", True) res = Identifier(i_type, i_id, i_desc) res.validate() return res def parse_contribution(self, d): """ Parse a dict as a Contribution """ from .types import Contribution c_type = get_key_string(d, "type") c_begin = get_key_date_or_none(d, "begin") c_until = get_key_date_or_none(d, "end") c_url = get_key_unicode(d, "url", True) or None res = Contribution(c_type, c_begin, c_until, c_url) res.validate() return res def parse_submission(self, seq): """ Parse a sequence as a submission generate a sequence of (ids, contributions) """ if not isinstance(seq, (list, tuple)): raise Fail(400, "Submission is not an Array") errors = [] total_count = 0 for idx, rec in enumerate(seq): total_count += 1 if not isinstance(rec, dict): errors.append("#{}: submission is not an Array") continue # Parse identifiers try: s_ids = [self.parse_identifier(d) for d in get_key_sequence_or_object(rec, "id")] except Fail, f: errors.append("#{}: cannot parse identifier(s): {}".format(idx, f.msg)) continue if not s_ids: errors.append("#{}: identifier list is empty".format(idx)) continue # Parse contributions try: s_contribs = [self.parse_contribution(d) for d in get_key_sequence_or_object(rec, "contributions")] except Fail, f: errors.append("#{} for {}: cannot parse contribution(s): {}".format(idx, s_id[0].id, f.msg)) continue if not s_contribs: errors.append("#{} for {}: contribution list is empty".format(idx, s_id[0].id)) continue yield s_ids, s_contribs if errors: if len(errors) == 1: raise Fail(400, errors[0]) elif len(errors) == total_count: raise ClusterFail(400, "All submissions failed", errors) else: raise ClusterFail(400, "Some submission failed", errors) debiancontributors-0.6/debiancontributors/types.py0000644000175000017500000001714112264246423023466 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source core data structures # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .parser import Fail import re __all__ = ["Identifier", "Contribution"] class Identifier(object): """ Information about a user identifier """ # Validator regexps TYPE_VALIDATORS = { "login": re.compile("^[a-z0-9._-]+$"), # From http://www.regular-expressions.info/email.html "email": re.compile("^[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$", re.I), "fpr": re.compile("^[A-F0-9]{32,40}$"), #"wiki": re.compile("^[A-Za-z]+$"), } def __init__(self, type, id, desc=None): self.type = type self.id = id self.desc = desc def __hash__(self): return hash(self.type) + hash(self.id) def __eq__(self, other): return (self.type == other.type and self.id == other.id) def to_json(self): """ Return a JSON-serializable structure for this identifier """ if self.desc: return { "type": self.type, "id": self.id, "desc": self.desc, } else: return { "type": self.type, "id": self.id, } def validate(self): """ Validate the contents of this Identifier, raising parser.Fail if anything fails. """ # Validate member types if not isinstance(self.type, basestring): raise Fail(400, "Identifier type is '{}' instead of a string".format(type(self.type))) if not isinstance(self.id, basestring): raise Fail(400, "Identifier id is '{}' instead of a string".format(type(self.id))) if self.desc is not None and not isinstance(self.desc, basestring): raise Fail(400, "Identifier desc is '{}' instead of None or a string".format(type(self.desc))) # Validator for this type type_validator = self.TYPE_VALIDATORS.get(self.type, None) if type_validator is None: raise Fail(400, "Invalid identifier type '{}'".format(self.type)) # Parse the ID and validate it if not type_validator.match(self.id): raise Fail(400, "{} '{}' is not a valid identifier".format(self.type, self.id)) @classmethod def create_auto(cls, s, default_desc=None): """ Autodetect identifier type and value from a string. 'desc' is the default description to use if not inferred automatically. """ from email.utils import getaddresses if "<" in s: # Use getaddresses instead of parseaddr because # parseaddr truncates the string at a stray command, instead of # declaring a failed parse: # parseaddr("a, ") gives ('', 'a') results = getaddresses((s,)) if len(results) == 1: # Parsing was ok desc, ident = results[0] else: # Something went wrong, possibly a stray comma. Trying again # wtih a regexp mo = re.match(r"^\s*(?:(.+)\s+)?<([^>]+)>\s*$", s) if mo: desc, ident = mo.group(1, 2) else: desc, ident = default_desc, s else: desc, ident = default_desc, s ident = ident.replace(" ", "") for type, regexp in cls.TYPE_VALIDATORS.iteritems(): if regexp.match(ident): return cls(type, ident, desc) raise ValueError("cannot infer a valid Identifier from '{}'".format(s)) class Contribution(object): """ Information about a contribution. """ def __init__(self, type, begin=None, end=None, url=None): """ type: contribution type (as configured in contrbutors.debian.org for a source) begin: start time of this contribution. None to reuse the last start time. end: end time of this contribution. None to mean 'now'. url: URL used to list all contributions of this type from this person, if available. """ self.type = type self.begin = begin self.end = end self.url = url def __hash__(self): return hash(self.type) + hash(self.begin) + hash(self.end) def __eq__(self, other): return (self.type == other.type and self.begin == other.begin and self.end == other.end) def extend_by_date(self, date): """ Extend the date range to include the given date "Extend" is a bit imprecise: if the current end date is None (meaning 'today'), then it is set to 'date' (which could be before than today) """ if self.begin is None: self.begin = date else: self.begin = min(self.begin, date) if self.end is None: self.end = date else: self.end = max(self.end, date) def to_json(self): """ Return a JSON-serializable structure for this contribution """ res = { "type": self.type } if self.begin: res["begin"] = self.begin.strftime("%Y-%m-%d") if self.end: res["end"] = self.end.strftime("%Y-%m-%d") if self.url: res["url"] = self.url return res @classmethod def merged(cls, first, second): """ Build a Contribution with a merge of two existing ones """ if second.begin is None: begin = first.begin elif first.begin is None: begin = second.begin else: begin = min(first.begin, second.begin) if second.end is None: end = first.end elif first.end is None: end = second.end else: end = min(first.end, second.end) if first.url is None: url = second.url else: url = first.url return cls(first.type, begin, end, url) def validate(self): """ Validate the contents of this Identifier, raising parser.Fail if anything fails. """ # Validate member types if not isinstance(self.type, basestring): raise Fail(400, "Contribution type is '{}' instead of a string".format(type(self.type))) if self.begin is not None and not hasattr(self.begin, "strftime"): raise Fail(400, "Contribution begin is '{}' and does not look like a date or datetime".format(type(self.begin))) if self.end is not None and not hasattr(self.end, "strftime"): raise Fail(400, "Contribution end is '{}' and does not look like a date or datetime".format(type(self.end))) if self.url is not None and not isinstance(self.url, basestring): raise Fail(400, "Contribution URL is '{}' instead of None or a string".format(type(self.url))) debiancontributors-0.6/debiancontributors/datamine.py0000644000175000017500000002005412264333527024103 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .submission import DEFAULT_BASE_URL, Submission from debian import deb822 import os.path import io import re import sys __all__ = ["Fail", "DataMine"] class Fail(BaseException): pass def read_config(fname): with io.open(fname, encoding="utf8") as fp: for par in deb822.Deb822.iter_paragraphs(fp): yield par def read_configstr(s): if not isinstance(s, unicode): raise TypeError("configuration contents must be a unicode string") with io.StringIO(s) as fp: for par in deb822.Deb822.iter_paragraphs(fp): yield par def load_scanners(): """ Load all scanners as a sequence of scanner classes """ from . import scanners from .scanner import Scanner import inspect for name, cls in inspect.getmembers(scanners, inspect.isclass): if not issubclass(cls, Scanner): continue # Make sure that NAME is set if cls.NAME is None: cls.NAME = name.lower() yield cls class DataMine(object): def __init__(self, configfname=None, configstr=None, source_name=None): """ Create a data miner for a data source reading a configuration file. If the first paragraph does not have a "contribution:" field, it is used for general data source configuration, like auth key, source name (if not the same as the file name), and base url (if the default is not ok) The source name is the value of source_name, if given. Else it is the value in general/name. Else it is the basename of configfname, with .conf or .cfg extension stripped, if present. """ # Read all the configuration as a dict { section: { key: value } } if configfname is not None: config = list(read_config(configfname)) elif configstr is not None: config = list(read_configstr(configstr)) else: raise TypeError("one of configfname or configstr should be provided") if not config: raise Fail("the configuration is empty") # Extract the general configuration name = source_name auth_token = None baseurl = None general = config[0] if "contribution" not in general: config = config[1:] if not name: name = general.get("source", None) auth_token = general.get("auth_token", None) baseurl = general.get("baseurl", DEFAULT_BASE_URL) # Default source with the config file name, without config-like # extensions if name is None: name = os.path.basename(configfname) name = re.sub(r".(?:cfg,conf)$", "", name) # Instantiate the submission that we are going to build self.submission = Submission(name, auth_token=auth_token, baseurl=baseurl) # Instantiate scanners self.scanners = [] scanner_factories = { x.NAME: x for x in load_scanners() } for cfg in config: # Contribution type ctype = cfg.get("contribution", None) if ctype is None: raise Fail("'contribution' field not found in data miner configuration") # Get scanner class 'method' configuration method = cfg.get("method", None) if method is None: raise Fail("'method' field not found in data miner configuration") scanner_cls = scanner_factories.get(method, None) if scanner_cls is None: raise Fail("'{}' configuration requests unsupported method: '{}'".format(ctype, method)) # Instantiate scanner self.scanners.append({ "ctype": ctype, "method": method, "scanner": scanner_cls(cfg), }) def scan(self): """ Run all data miners and add their output to the submission """ for s in self.scanners: ctype = s["ctype"] for ident, begin, until, url in s["scanner"].scan(): self.submission.add_contribution_data( ident, ctype, begin, until, url) @classmethod def print_documentation(cls, file=sys.stdout): print(""" =================== dc-tool data mining =================== dc-tool has several methods of data mining that can be controlled via a configuration file. It works like this: 1. Read this documentation and create a configuration file to test. 2. Run ``dc-tool --mine=mysource.conf`` to perform data mining and print results to standard output. 3. When you are satisfied of the results, run ``dc-tool --mine=mysource.conf --post`` to post data to contributors.debian.org. Run that via cron and you have a full working data source. ------------------------- Configuration file syntax ------------------------- The configuration file follows the usual Debian RFC822/Yaml-like syntax. If the first group of options does not have a "contribution:" field, it is used for general configuration of the data source. All other sections define methods of mining the data you want. The data source configuration section ===================================== Example:: # You don't need this option if you call this file nm.debian.org.conf #source: nm.debian.org # Auhentication token used to post data. Use a leading '@' as in '@filename' # to use the contents of another file as auth token. Do not make this file # world-readable! auth_token: @secrets/auth_token.txt The general configuration section has three configurable keywords: ``name`` Data source name, as configured in contributors.debian.org. If omitted, dc-tool will use the configuration file name. If the file name ends in ``.ini``, ``.conf`` or ``.cfg``, the extension will be removed. ``auth_token`` The authentication token used for posting data to the site. Anyone with this authentication token can post data for this data source, so be careful not to give this file world-readable permissions. ``baseurl`` You never need this unless you want to test a local copy of the contributors.debian.org codebase: it defaults to ``{DEFAULT_BASE_URL}`` but you can change it to submit data to your local development version. Data mining sections ==================== Example:: contribution: committer # Data mining method method: gitdirs # Configuration specific to this method dirs: /srv/git.debian.org/git/collab-maint/*.git url: https://alioth.debian.org/users/{{user}}/ Each data mining section has at least two configurable keywords: ``contribution`` Contribution type for this data source, as configured in contributors.debian.org. You can have many sections with the same contribution types, and the results of their data mining will all be merged. ``method`` The mining method. There are several mining method available, each with its own configuration options, documented below. The rest of the options are specific to each data mining method. Below is a full documentation of them. Data mining methods =================== """.format(DEFAULT_BASE_URL=DEFAULT_BASE_URL), file=file) for scanner in sorted(load_scanners(), key=lambda x:x.NAME): scanner.print_documentation(file=file) debiancontributors-0.6/debiancontributors/scanners/0000755000175000017500000000000012264333700023553 5ustar enricoenrico00000000000000debiancontributors-0.6/debiancontributors/scanners/mailbox.py0000644000175000017500000001014012264261102025550 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining on emails # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from ..types import Identifier from .utils.mine import Aggregate from .utils.email import get_mailbox from .. import scanner import email.utils import datetime import time __all__ = ["MailFrom"] class MailFrom(scanner.Scanner): """ Scan email address from From: headers in mailboxes Example:: contribution: developer method: mailfrom folders: /home/debian/lists/debian-devel-announce/* url: http://www.example.com/{email} """ folders = scanner.GlobField(blank=False, help_text=""" mail folders to scan. You can give one or more, and even use shell-style globbing. Mailbox, mailbox.gz and Maildir folders are supported. """) whitelist = scanner.EmailsField(help_text=""" if present, only emails from this list will be considered as contributors. """) blacklist = scanner.EmailsField(help_text=""" if present, emails from this list will not be considered as contributors. """) url = scanner.CharField(help_text=""" template used to build URLs to link to people's contributions. ``{email}`` will be replaced with the email address """) def scan(self): # Build a filter function from whitelist and blacklist whitelist = frozenset(self.whitelist) blacklist = frozenset(self.blacklist) if whitelist and blacklist: filter_func = lambda x: x in whitelist and x not in blacklist elif whitelist: filter_func = lambda x: x in whitelist elif blacklist: filter_func = lambda x: x not in blacklist else: filter_func = lambda x: True contribs = Aggregate() desc_by_email = {} for pathname in self.folders: folder = get_mailbox(pathname) try: for msg in folder: # Extract From address addr = msg.get("From", None) if addr is None: continue name, addr = email.utils.parseaddr(addr) if not filter_func(addr): continue if name: desc_by_email[addr] = name # Extract date date = msg.get("Date", None) if date is None: continue date = email.utils.parsedate(date) if date is None: continue ts = time.mktime(date) contribs.add(addr, ts) finally: folder.close() for addr, (begin, end) in contribs.iteritems(): ident = Identifier("email", addr, desc_by_email.get(addr, None)) begin = datetime.date.fromtimestamp(begin) end = datetime.date.fromtimestamp(end) if self.url: yield ident, begin, end, self.url.format(email=addr) else: yield ident, begin, end, None debiancontributors-0.6/debiancontributors/scanners/git.py0000644000175000017500000000707012264267500024720 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .. import scanner from ..types import * import datetime __all__ = ["GitDirs", "GitLogs"] class GitDirs(scanner.Scanner): """ Scan git directories using file attributes to detect contributions. Generates `login` types of identifiers, using the usernames of the system where it is run. Example:: contribution: committer method: gitdirs dirs: /srv/git.debian.org/git/collab-maint/*.git url: https://alioth.debian.org/users/{user}/ """ dirs = scanner.GlobField(blank=False, help_text=""" ``.git`` directories to scan. You can give one or more, and even use shell-style globbing. """) url = scanner.CharField(help_text=""" template used to build URLs to link to people's contributions. ``{user}`` will be replaced with the username """) def scan(self): from .utils.filesystem import Filesystem scan = Filesystem() for d in self.dirs: scan.scan_git_repo(d) if self.url: tpl = self.url for ident, begin, end in scan.contributions(): yield ident, begin, end, tpl.format(user=ident.id) else: for ident, begin, end in scan.contributions(): yield ident, begin, end, None class GitLogs(scanner.Scanner): """ Scan git logs, taking note of committer and author activity Generates `email` types of identifiers, trusting whatever is in the git log. Example:: contribution: committer method: gitlogs dirs: /srv/git.debian.org/git/collab-maint/*.git """ dirs = scanner.GlobField(blank=False, help_text=""" ``.git`` directories to scan. You can give one or more, and even use shell-style globbing. """) def scan(self): from .utils.proc import stream_command_stdout from .utils.mine import Aggregate contribs = Aggregate() for d in self.dirs: cmd = ["git", "--git-dir", d, "log", "--all", "--pretty=tformat:%ae %at %ce %ct"] for line in stream_command_stdout(cmd): ae, at, ce, ct = line.split() contribs.add(ae, int(at)) contribs.add(ce, int(ct)) for email, (begin, end) in contribs.iteritems(): ident = Identifier("email", email) begin = datetime.date.fromtimestamp(begin) end = datetime.date.fromtimestamp(end) yield ident, begin, end, None debiancontributors-0.6/debiancontributors/scanners/files.py0000644000175000017500000000436512264111143025232 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools # # Copyright (C) 2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .. import scanner __all__ = ["Files"] class Files(scanner.Scanner): """ Recursively scan directories using file attributes to detect contributions. Generates `login` types of identifiers, using the usernames of the system where it is run. Example:: contribution: committer method: files dirs: /srv/cvs.debian.org/cvs/webwml url: https://alioth.debian.org/users/{user}/ """ dirs = scanner.GlobField(blank=False, help_text=""" directories to scan. You can give one or more, and even use shell-style globbing. """) url = scanner.CharField(help_text=""" template used to build URLs to link to people's contributions. ``{user}`` will be replaced with the username """) def scan(self): from .utils.filesystem import Filesystem scan = Filesystem() for d in self.dirs: scan.scan_all_files(d) if self.url: tpl = self.url for ident, begin, end in scan.contributions(): yield ident, begin, end, tpl.format(user=ident.id) else: for ident, begin, end in scan.contributions(): yield ident, begin, end, None debiancontributors-0.6/debiancontributors/scanners/utils/0000755000175000017500000000000012264333700024713 5ustar enricoenrico00000000000000debiancontributors-0.6/debiancontributors/scanners/utils/stats.py0000644000175000017500000000703512256030766026437 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining on emails # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import math def _smooth(days, decay_days): """ Return the smooth factor after 'days' days of decay, considering a maximum decay timespan of 'decay_days' """ # http://en.wikipedia.org/wiki/Bump_function if days >= decay_days: return 0 return math.e * math.exp(-1/(1-(days/decay_days)**2)) class ContributorFrequencyCheck(object): def __init__(self, ident, time_unit_length=86400, contribution_age=7, min_activity_time=60): self.ident = ident self.time_unit_length = time_unit_length self.decay_days = contribution_age self.min_activity_time = min_activity_time self.stamp_set = set() self.stamp_min = None self.stamp_max = None def add_stamp(self, ts): ts = ts // self.time_unit_length self.stamp_set.add(ts) if self.stamp_min is None: self.stamp_min = ts self.stamp_max = ts elif ts < self.stamp_min: self.stamp_min = ts elif ts > self.stamp_max: self.stamp_max = ts @property def contrib_range(self): return self.stamp_min * self.time_unit_length, self.stamp_max * self.time_unit_length def heat_function(self): # Precompute the smooth factors smooth_factors = [ _smooth(i, self.decay_days) for i in xrange(self.decay_days) ] # Compute the heat function over the whole activity timespan # TODO: can be optimizing using a deque of decay_days items, as a # moving window of the alst decay_days days as we progress on the time # axis for ts in xrange(int(self.stamp_min), int(self.stamp_max) + 1): val = 0.0 if ts in self.stamp_set: val += 1 for i in xrange(1, self.decay_days): if (ts - i) in self.stamp_set: val += smooth_factors[i] yield ts, val def is_contributor(self): threshold = 1 first_ts_above_threshold = None for ts, val in self.heat_function(): if val > threshold: if first_ts_above_threshold is None: first_ts_above_threshold = ts else: if ts - first_ts_above_threshold > self.min_activity_time: return True else: first_ts_above_threshold = None return False def test_dump_heat_function(self, fname=None): if fname is None: fname = self.ident.type + "-" + self.ident.id with open(fname, "w") as fd: for ts, val in self.heat_function(): print("{} {}".format(ts, val), file=fd) debiancontributors-0.6/debiancontributors/scanners/utils/doc.py0000644000175000017500000000442212263645236026045 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools # # Copyright (C) 2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import sys def docstring_trim(docstring): """ Deindent a docstring. This code has been taken from http://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation and a docstring has been added because, hyronically, it did not have one. Also interesting is that the PEP has status Active since many years but this code is not in Python's standard library. """ if not docstring: return '' # Convert tabs to spaces (following the normal Python rules) # and split into a list of lines: lines = docstring.expandtabs().splitlines() # Determine minimum indentation (first line doesn't count): indent = sys.maxint for line in lines[1:]: stripped = line.lstrip() if stripped: indent = min(indent, len(line) - len(stripped)) # Remove indentation (first line is special): trimmed = [lines[0].strip()] if indent < sys.maxint: for line in lines[1:]: trimmed.append(line[indent:].rstrip()) # Strip off trailing and leading blank lines: while trimmed and not trimmed[-1]: trimmed.pop() while trimmed and not trimmed[0]: trimmed.pop(0) # Return a single string: return '\n'.join(trimmed) def print_indented(s, indent=4, file=file): indent = " " * indent for line in s.split("\n"): print(indent, line, sep="", file=file) debiancontributors-0.6/debiancontributors/scanners/utils/filesystem.py0000644000175000017500000000623612264111100027444 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining on file systems # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from ...submission import Identifier from .mine import Aggregate import os import os.path import pwd import datetime import logging log = logging.getLogger(__name__) __all__ = ["Filesystem"] class Filesystem(object): """ Collect and aggregate contribution data from file inode information, and build contribution informations out of it """ def __init__(self): self.contribs = Aggregate() def scan_file(self, pathname): """ Add an information from the inode information of a file """ self.scan_stat(os.stat(pathname)) def scan_stat(self, st): """ Add an information from a stat structure """ self.contribs.add(st.st_uid, st.st_mtime) def scan_git_repo(self, gitdir): """ Add information from refs files in the given git repo gitdir: pathname to the bare repository or the .git directory """ scanroot = os.path.join(gitdir, "refs") log.debug("Starting git scanning at %s", scanroot) for root, dirs, files in os.walk(scanroot): for f in files: self.scan_file(os.path.join(root, f)) def scan_svn_repo(self, svnroot): """ Add information from commits in the given svn repo svnroot: pathname to the svn repository root dir """ root = os.path.join(svnroot, "db/revs") for f in os.listdir(root): self.scan_file(os.path.join(root, f)) def scan_all_files(self, root): """ Add information from commits in the given svn repo svnroot: pathname to the svn repository root dir """ for dirpath, dirnames, fnames in os.walk(root): for fname in fnames: self.scan_file(os.path.join(dirpath, fname)) def contributions(self): """ Generate (ident, begin, end) contributions """ for uid, stats in self.contribs.iteritems(): try: pw = pwd.getpwuid(uid) ident = Identifier("login", pw.pw_name) begin = datetime.date.fromtimestamp(stats[0]) end = datetime.date.fromtimestamp(stats[1]) yield ident, begin, end except KeyError: pass debiancontributors-0.6/debiancontributors/scanners/utils/proc.py0000644000175000017500000000616112264266016026241 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining using subprocesses # # Copyright (C) 2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from cStringIO import StringIO import subprocess import os import fcntl import select def stream_output(proc): """ Take a subprocess.Popen object and generate its output, as pairs of (tag, line) couples. Tag can be O for stdout, E for stderr and R for return value. Note that the output is not line-split. R is always the last bit that gets generated. """ fds = [proc.stdout, proc.stderr] tags = ["O", "E"] # Set both pipes as non-blocking for fd in fds: fcntl.fcntl(fd, fcntl.F_SETFL, os.O_NONBLOCK) # Multiplex stdout and stderr with different tags while len(fds) > 0: s = select.select(fds, (), ()) for fd in s[0]: idx = fds.index(fd) buf = fd.read() if buf: yield tags[idx], buf else: fds.pop(idx) tags.pop(idx) res = proc.wait() yield "R", res class StreamStdoutKeepStderr(object): """ Stream lines of standard output from a Popen object, keeping all of its stderr inside a StringIO """ def __init__(self, proc): self.proc = proc self.stderr = StringIO() def __iter__(self): last_line = None for tag, buf in stream_output(self.proc): if tag == "O": for l in buf.splitlines(True): if last_line is not None: l = last_line + l last_line = None if l.endswith("\n"): yield l else: last_line = l elif tag == "E": self.stderr.write(buf) if last_line is not None: yield last_line def stream_command_stdout(cmd, **kw): try: proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kw) proc.stdin.close() lines = StreamStdoutKeepStderr(proc) for line in lines: yield line result = proc.wait() except: proc.terminate() raise if result != 0: raise RuntimeError("{} exited with status {}: {}".format(cmd[0], result, lines.stderr.getvalue().strip())) debiancontributors-0.6/debiancontributors/scanners/utils/mine.py0000644000175000017500000000254012264103123026210 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining utilities # # Copyright (C) 2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import logging log = logging.getLogger(__name__) __all__ = ["Aggregate"] class Aggregate(dict): """ Aggregate pairs of (key, val) in a dict { key: (minval, maxval) } """ def add(self, key, val): """ Add a (key, val) pair to the aggregation """ old = self.get(key, None) if old is None: self[key] = (val, val) else: self[key] = (min(old[0], val), max(old[1], val)) debiancontributors-0.6/debiancontributors/scanners/utils/__init__.py0000644000175000017500000000161712264102112027020 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals debiancontributors-0.6/debiancontributors/scanners/utils/email.py0000644000175000017500000000355212264262105026361 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data mining utilities # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import tempfile import mailbox import gzip import shutil import os.path import logging log = logging.getLogger(__name__) class CompressedMbox(mailbox.mbox): """ Read-only access of a compressed mbox using a temporary file for the uncompressed version """ def __init__(self, pathname): self.tempfile = tempfile.NamedTemporaryFile() with gzip.open(pathname) as fd: shutil.copyfileobj(fd, self.tempfile) self.tempfile.flush() mailbox.mbox.__init__(self, self.tempfile.name) def close(self): # mailbox.mbox is not a new-style object :'( mailbox.mbox.close(self) self.tempfile.close() def get_mailbox(pathname): """ Create the right Mailbox object for a pathname """ if os.path.isdir(pathname): return mailbox.Maildir(pathname) elif pathname.endswith(".gz"): return CompressedMbox(pathname) else: return mailbox.mbox(pathname) debiancontributors-0.6/debiancontributors/scanners/postgres.py0000644000175000017500000001201212264246566026004 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools for dak # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from ..types import * from .. import scanner import datetime import logging log = logging.getLogger(__name__) __all__ = ["Postgres"] class Postgres(scanner.Scanner): """ Perform data mining using a SQL query on a Postgres database. This requires python-psycopg2 to be installed. Example:: contribution: uploader method: postgres db: service=projectb identifier: login query: SELECT s.install_date as date, u.uid as id, u.name as desc FROM source s JOIN fingerprint f ON s.sig_fpr = f.id JOIN uid u ON f.uid = u.id url: http://qa.debian.org/developer.php?login={id}&comaint=yes """ db = scanner.CharField(blank=False, help_text=""" database connection string. See `psycopg2.connect `_ for details.""") identifier = scanner.IdentifierTypeField(default="auto", help_text=""" type of identifier that is found by this SQL query. """) query = scanner.CharField(blank=False, help_text=""" SQL query used to list contributions. SELECT column field names are significant: ``id`` is the contributor name, email, or fingerprint, depending on how ``identifier`` is configured. ``date`` is the contribution date, as a date or datetime. ``desc`` (optional) is a human-readable description for this ``id``, like a person's name. All other SELECT columns are ignored, but can be useful to provide values for the ``url`` template. """) url = scanner.CharField(help_text=""" template used to build URLs to link to people's contributions. Words in curly braces (like ``{id}``) will be expanded with the SELECT column of the same name. """) def scan(self): from .utils.mine import Aggregate import psycopg2 import psycopg2.extensions import psycopg2.extras psycopg2.extensions.register_type(psycopg2.extensions.UNICODE) psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY) db = psycopg2.connect(self.db) db.set_client_encoding('UTF8') cur = db.cursor(cursor_factory=psycopg2.extras.RealDictCursor) cur.execute(self.query) contribs = Aggregate() if self.url: url_by_id = {} else: url_by_id = None desc_by_id = {} for row in cur: # Validate id id = row.get("id", None) if not id: log.info("id is empty, skipping result row %r", row) continue # Validate date and turn it into a datetime.date date = row.get("date", None) if not date: log.info("date is empty, skipping result row %r", row) continue if isinstance(date, datetime.datetime): date = date.date() elif not isinstance(date, datetime.date): log.info("date is not a date I can understand, skipping result row %r", row) continue # Generate the URL if we didn't have one already if url_by_id is not None and id not in url_by_id: url_by_id[id] = self.url.format(**row) # Take note of desc if present desc = row.get("desc", None) if desc is not None: desc_by_id[id] = desc contribs.add(id, date) for id, (begin, end) in contribs.iteritems(): if self.identifier == "auto": try: ident = Identifier.create_auto(id, default_desc=desc_by_id.get(id, None)) except ValueError as e: log.info("skipping identifier %s: %s", id, e) else: ident = Identifier(self.identifier, id, desc_by_id.get(id, None)) yield ident, begin, end, url_by_id.get(id, None) debiancontributors-0.6/debiancontributors/scanners/__init__.py0000644000175000017500000000202112264252652025665 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .git import * from .svn import * from .files import * from .postgres import * from .mailbox import * debiancontributors-0.6/debiancontributors/scanners/svn.py0000644000175000017500000000444012264105501024731 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source data mining tools # # Copyright (C) 2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .. import scanner from ..types import * __all__ = ["SvnDirs"] class SvnDirs(scanner.Scanner): """ Scan subversion directories using file attributes to detect contributions. Generates `login` types of identifiers, using the usernames of the system where it is run. Example:: contribution: committer method: svndirs dirs: /srv/svn.debian.org/svn/collab-maint url: https://alioth.debian.org/users/{user}/ """ dirs = scanner.GlobField(blank=False, help_text=""" subversion directories to scan. You can give one or more, and even use shell-style globbing. """) url = scanner.CharField(help_text=""" template used to build URLs to link to people's contributions. ``{user}`` will be replaced with the username """) def scan(self): from .utils.filesystem import Filesystem scan = Filesystem() for d in self.dirs: scan.scan_svn_repo(d) if self.url: tpl = self.url for ident, begin, end in scan.contributions(): yield ident, begin, end, tpl.format(user=ident.id) else: for ident, begin, end in scan.contributions(): yield ident, begin, end, None debiancontributors-0.6/debiancontributors/submission.py0000644000175000017500000002115012264332735024512 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source core data structure # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from collections import defaultdict from .types import * import sys import json __all__ = ["Submission"] DEFAULT_BASE_URL = "https://contributors.debian.org/" class Submission(object): """ A submission to contributors.debian.org """ def __init__(self, name, auth_token=None, baseurl=DEFAULT_BASE_URL): """ name: data source name """ # Data source name, as in the website self.name = name # Authentication token self.auth_token = None # Base URL self.baseurl = baseurl # List of contributions by identifier self.entries = defaultdict(dict) if auth_token is not None: self.set_auth_token(auth_token) def set_auth_token(self, auth_token): """ Set the auth token for this source. If auth_token starts with '@', the rest is treated as a pathname to a file that contains the token. """ if not auth_token: raise ValueError("auth_token is empty") if auth_token[0] == "@": with open(auth_token[1:], "r") as fd: self.auth_token = fd.read().strip() else: self.auth_token = auth_token def add_contribution(self, identifier, contrib): """ Add information about a contribution. identifier: Identifier for the user that made this contribution contrib: Contribution object """ entries = self.entries[identifier] old = entries.get(contrib.type, None) if old is None: entries[contrib.type] = contrib else: entries[contrib.type] = Contribution.merged(old, contrib) def add_contribution_data(self, identifier, type, begin=None, end=None, url=None): """ Add information about a contribution. identifier: Identifier for the user that made this contribution name: contribution name (chosen among the source contribution types) begin: start time of this contribution. None to reuse the last start time. end: end time of this contribution. None to mean 'now'. """ self.add_contribution(identifier, Contribution(type, begin, end, url)) def merge_with(self, submission): """ Merge another submission into this one """ if self.name != submission.name: raise ValueError("Merging submission for two different sources: {}!={}".format( self.name, submission.name)) for ident, contribs in submission.entries.iteritems(): old = self.entries.get(ident, None) if old is None: self.entries[ident] = dict(contribs) else: self.entries[ident] = merge_contrib_dicts(old, contribs) def _gen_records(self): """ Generate DC records for serialization """ for ident, contributions in self.entries.iteritems(): yield { "id": (ident.to_json(),), "contributions": [ c.to_json() for c in contributions.itervalues() ], } def to_json(self, file=None, indent=None): """ Convert to JSON. file: if set to a file-like object, send data there. Else, return the JSON data as a string indent: passed as-is to the indent parameter of the encoder """ if file is not None: return json.dump(list(self._gen_records()), file, indent=indent) else: return json.dumps(list(self._gen_records()), indent=indent) def print_compact(self, file=sys.stdout): """ Make a compact dump of this source to the given file """ for ident, contributions in self.entries.iteritems(): for ctype, c in sorted(contributions.iteritems()): if ident.desc: lead = "{}:{} <{}>".format(ident.type, ident.desc, ident.id) else: lead = "{}:{}".format(ident.type, ident.id) print("{}: {} from {} to {}".format(lead, c.type, c.begin, c.end), file=file) if c.url: print("{}: {} url: {}".format(lead, c.type, c.url), file=file) def post(self): """ POST this submission to the contributors server Returns a couple (success, info). success: a bool, true if everything was imported correctly, false if there has been some problem. info: a dict with detailed status and error information, plus import statistics """ # Yuck! Python's stdlib cannot do file uploads :'( # We need to introduce an external dependency for it import requests import urlparse from cStringIO import StringIO # Build the POST request to contributors.debian.org url = urlparse.urljoin(self.baseurl, '/contributors/post') #print("Posting to '{}'...".format(url)) # Prepare the file to post try: import lzma compress_type = "xz" compress = lzma.compress except ImportError: import gzip compress_type = "gzip" def compress(data): out = StringIO() with gzip.GzipFile(mode="wb", fileobj=out) as fd: fd.write(data) return out.getvalue() file_data = StringIO(compress(self.to_json())) files = { "data": file_data } # POST data data = { "source": self.name, "auth_token": self.auth_token, "data_compression": compress_type, } # POST everything to the server try: res = requests.post(url, data=data, files=files) except requests.ConnectionError, e: return False, { "code": None, "errors": [ "Connection error: " + unicode(e) ] } except requests.HTTPError, e: return False, { "code": None, "errors": [ "Server's HTTP response was malformed: " + unicode(e) ] } # Whether the POST was successful or not, the response body contains # information and statistics in JSON format. response = res.json() if res.status_code == requests.codes.ok: return True, response else: return False, response @classmethod def from_json(cls, name, data): """ Build a Submission from previously generated JSON name: the data source name data: the JSON data, either in a string, in a file, or as a parsed data structure """ if isinstance(data, basestring): data = json.loads(data) elif hasattr(data, "read"): data = json.load(data) res = cls(name) from .parser import Parser parser = Parser() for ids, contribs in parser.parse_submission(data): for i in ids: res.entries[i] = { c.type: c for c in contribs } return res def merge_contrib_dicts(d1, d2): """ Merge two dicts of contributions from the same identifier. Contribution types that happen in both lists will have their timespans merged """ res = {} # Add elements from d1, merging them with d2 if they also exist in d2 for ctype, c1 in d1.iteritems(): c2 = d2.get(ctype, None) if c2 is None: res[ctype] = c1 else: res[ctype] = Contribution.merged(c1, c2) # Add the elements that only exist in d2 for ctype, c2 in d2.iteritems(): res.setdefault(ctype, c2) return res debiancontributors-0.6/debiancontributors/scanner.py0000644000175000017500000002232012264262006023741 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source core data structure # # Copyright (C) 2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import sys class ValidationError(Exception): pass class ConfigField(object): def __init__(self, name=None, blank=True, default=None, help_text="", **kw): """ name: value name in the configuration blank: True if it can have an empty value, False if an empty or missing value should give an error default: value to use if the field was not found in the configuration. help_text: documentation for this field """ self.name = name self.blank = blank self.default = default self.help_text = help_text for k in kw.iterkeys(): raise ValueError("Unknown ConfigField argument: '{}'".format(k)) def to_python(self, val): """ Validate and convert the None or string value from the configuration file to the Python value. If val is None (missing in the configuration) and we have a default value, try to use the default value. If val is a string, strips it. If blank is False, makes sure that the string is not empty. Returns a string (possibly empty, if blank=True) if the value was found in the config file. A ValidationError (or None if blank=True) if it was not found. """ # Handle value not found in the configuration if val is None: if self.blank: return self.default else: raise ValidationError("missing value") val = val.strip() if not self.blank and not val: raise ValidationError("empty value") return val def print_documentation(self, file=sys.stdout): from .scanners.utils.doc import docstring_trim, print_indented print("``{name}`` : {type}, {blank}, default: {default}.".format( name=self.name, type=self.type_name(), blank="optional" if self.blank else "required", default=repr(self.default), ), file=file) if self.help_text: ht = docstring_trim(self.help_text) print_indented(ht, indent=2, file=file) else: print(" currently undocumented.", file=file) print(file=file) @classmethod def type_name(cls): res = cls.__name__ if res.endswith("Field"): return res[:-5] else: return res class CharField(ConfigField): """ A string value. Can be any UTF-8 string. """ pass class IdentifierTypeField(CharField): """ An identifier type. Can be one of: ``auto`` autodetect. "ident" or "Name " are accepted, and ident can be any email, login or OpenPGP fingerprint ``login`` debian.org or Alioth login name. ``email`` email address. ``fpr`` OpenPGP key fingerprint. """ def to_python(self, val): from .types import Identifier res = super(IdentifierTypeField, self).to_python(val) if res == "auto": return res if res not in Identifier.TYPE_VALIDATORS: raise ValidationError("invalid identifier type. Use one of auto, {}".format( ", ".join(sorted(Identifier.TYPE_VALIDATORS.iterkeys())))) return res class GlobField(CharField): """ A string with one or more filenames. Globbing is supported. Arguments can be quoted to deal with whitespace, but glob characters will always be expanded. """ def to_python(self, val): """ Splits with shlex, expands with glob, returns a list of pathnames """ import shlex import glob val = super(GlobField, self).to_python(val) res = [] if val is None: return res for fname in shlex.split(val): res.extend(glob.glob(fname)) if not self.blank and not res: raise ValidationError("no such file or directory") return res class EmailsField(CharField): """ A list of email addresses, like in email To: or Cc: headers. """ def to_python(self, val): """ Parse everything using email.utils.getaddresses """ from email.utils import getaddresses val = super(EmailsField, self).to_python(val) if val is None: return [] res = [ email for name, email in getaddresses((val,)) ] if not self.blank and not res: raise ValidationError("no email addresses found") return res class ScannerFields(type): """ Collects all class members that are instances of ConfigField, merges them to all the instances from the class parents, and set the results as the FIELD class member. """ def __new__(meta, name, parents, attrs): # Harvest config fields config_fields = {} # Collect fields from parents for p in parents: fields = getattr(p, "FIELDS", None) if fields is None: continue config_fields.update(fields.iteritems()) # Add fields from ourselves for name, member in attrs.iteritems(): if not isinstance(member, ConfigField): continue # Set the default for field names if member.name is None: member.name = name config_fields[name] = member # Add a FIELDS dict with all the fields attrs["FIELDS"] = config_fields return super(ScannerFields, meta).__new__(meta, name, parents, attrs) class Scanner(object): """ Base class for all data mining scanners Declarative definition of scanner configuration goes here. Any class members that are instances of ConfigField will be used to parse and validate the configuration. Their validated results will be set as object members. Example: # When instantiated, self.dirs will be a list of pathnames dirs = GlobField(blank=False, help_text="Directories to scan") All ConfigField instances found as class members, will be stored in the class FIELDS dict. For example, you can crudely document all the config options of a scanner like this: for name, field in MyScanner.FIELDS.iteritems(): print("Config key {}, accessible as self.{}: {}".format( field.name, name, field.__doc__)) """ __metaclass__ = ScannerFields # Scanner name, used to refer to the scanner in the mining configuration. # Defaults to the class name. NAME = None def __init__(self, cfg): """ Initialize the scanner with the given configuration dictionary """ # Parse configuration using our field definition for name, field in self.FIELDS.iteritems(): val = cfg.get(field.name, None) try: validated_val = field.to_python(val) except ValidationError as e: raise ValidationError("{} = {}: {}".format(name, val, str(e))) # Set the validated name=value pair as an object member setattr(self, name, validated_val) def scan(self): """ Perform scan and generate 4-tuples of (identifier, begin, end, url) Only identifier cannot be None, everything else can be. """ if False: yield None, None, None, None @classmethod def print_documentation(cls, file=sys.stdout): from .scanners.utils.doc import docstring_trim, print_indented print(cls.NAME, file=file) print("-" * len(cls.NAME), file=file) print(docstring_trim(cls.__doc__), file=file) print(file=file) if not cls.FIELDS: print("This scanning method has no specific configuration options", file=file) else: print("Configuration options", file=file) print("`````````````````````", file=file) print(file=file) types_used = {} for name, field in sorted(cls.FIELDS.iteritems()): field.print_documentation(file=file) types_used.setdefault(field.__class__, name) print("Option types", file=file) print("````````````", file=file) print(file=file) for cls, name in sorted(types_used.iteritems()): print("``{}``".format(cls.type_name()), file=file) ht = docstring_trim(cls.__doc__) print_indented(ht, indent=2, file=file) print(file=file) debiancontributors-0.6/debiancontributors/__init__.py0000644000175000017500000000174412263614235024062 0ustar enricoenrico00000000000000# coding: utf8 # Handle submissions to contributors.debian.org # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from .types import * from .submission import * from .datamine import * debiancontributors-0.6/test/0000755000175000017500000000000012264333700017016 5ustar enricoenrico00000000000000debiancontributors-0.6/test/test_submission.py0000644000175000017500000000563612264333334022637 0ustar enricoenrico00000000000000# coding: utf8 # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import debiancontributors as dc from datetime import date import unittest import json class TestSubmission(unittest.TestCase): def testIdentifier(self): i = dc.Identifier("login", "enrico") self.assertEquals(i.type, "login") self.assertEquals(i.id, "enrico") self.assertIsNone(i.desc) i = dc.Identifier("login", "enrico", "Enrico Zini") self.assertEquals(i.type, "login") self.assertEquals(i.id, "enrico") self.assertEquals(i.desc, "Enrico Zini") def testMinimalData(self): s = dc.Submission("test") s.add_contribution_data(dc.Identifier("login", "enrico"), "upload") js = s.to_json(indent=1) res = json.loads(js) self.assertEquals(res, [{ "id": [ { "type": "login", "id": "enrico" } ], "contributions": [ { "type": "upload" } ], }]) def testFullData(self): s = dc.Submission("test") s.add_contribution_data(dc.Identifier("login", "enrico"), "upload", begin=date(2013, 5, 1), end=date(2013, 11, 30), url="http://www.example.com") js = s.to_json(indent=1) res = json.loads(js) self.assertEquals(res, [{ "id": [ { "type": "login", "id": "enrico" } ], "contributions": [ { "type": "upload", "begin": "2013-05-01", "end": "2013-11-30", "url": "http://www.example.com" } ], }]) def test_auth_token(self): s = dc.Submission("test") self.assertIsNone(s.auth_token) s.set_auth_token("foo") self.assertEquals(s.auth_token, "foo") s.set_auth_token("@" + __file__) self.assertRegexpMatches(s.auth_token, "def test_auth_token\(") s = dc.Submission("test", auth_token="foo") self.assertEquals(s.auth_token, "foo") s = dc.Submission("test", auth_token="@" + __file__) self.assertRegexpMatches(s.auth_token, "def test_auth_token\(") if __name__ == '__main__': unittest.main() debiancontributors-0.6/test/test_mine_git.py0000644000175000017500000000237412264027615022235 0ustar enricoenrico00000000000000# coding: utf8 # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from debiancontributors.datamine import DataMine import unittest class TestMineGit(unittest.TestCase): def test_gitdirs(self): """ Test gitdirs scanner """ mine = DataMine(configstr= """ source: test contribution: committer method: gitdirs dirs: "." url: "http://www.example.com/{name}" """) mine.scan() if __name__ == '__main__': unittest.main() debiancontributors-0.6/test/test_parser.py0000644000175000017500000001016412263633330021726 0ustar enricoenrico00000000000000# coding: utf8 # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import debiancontributors.parser as parser import unittest class TestParser(unittest.TestCase): def testGetKey(self): self.assertEquals(parser.get_key({"foo": "bar"}, "foo"), "bar") self.assertRaises(parser.Fail, parser.get_key, {}, "foo") def testGetKeyInt(self): self.assertEquals(parser.get_key_int({"foo": "7"}, "foo"), 7) self.assertRaises(parser.Fail, parser.get_key_int, {}, "foo") self.assertRaises(parser.Fail, parser.get_key_int, {"foo": ""}, "foo") self.assertRaises(parser.Fail, parser.get_key_int, {"foo": "seven"}, "foo") def testGetKeyString(self): self.assertEquals(parser.get_key_string({"foo": "7"}, "foo"), "7") self.assertEquals(parser.get_key_string({"foo": ""}, "foo", True), "") self.assertEquals(parser.get_key_string({"foo": None}, "foo", True), "") self.assertEquals(parser.get_key_string({}, "foo", True), "") self.assertRaises(parser.Fail, parser.get_key_string, {}, "foo") self.assertRaises(parser.Fail, parser.get_key_string, {"foo": ""}, "foo") def testGetKeyUnicode(self): self.assertEquals(parser.get_key_unicode({"foo": "7"}, "foo"), "7") self.assertEquals(parser.get_key_unicode({"foo": b"\xe2\x99\xa5"}, "foo"), "♥") self.assertEquals(parser.get_key_unicode({"foo": ""}, "foo", True), "") self.assertEquals(parser.get_key_unicode({"foo": None}, "foo", True), "") self.assertEquals(parser.get_key_unicode({}, "foo", True), "") self.assertRaises(parser.Fail, parser.get_key_unicode, {}, "foo") self.assertRaises(parser.Fail, parser.get_key_unicode, {"foo": ""}, "foo") self.assertRaises(parser.Fail, parser.get_key_unicode, {"foo": b'\xe0'}, "foo") def testGetKeySequence(self): self.assertEquals(parser.get_key_sequence({"foo": []}, "foo"), []) self.assertEquals(parser.get_key_sequence({"foo": [1, 2, "three"]}, "foo"), [1, 2, "three"]) self.assertEquals(parser.get_key_sequence({"foo": ()}, "foo"), ()) self.assertRaises(parser.Fail, parser.get_key_sequence, {}, "foo") self.assertRaises(parser.Fail, parser.get_key_sequence, {"foo": "bar"}, "foo") self.assertRaises(parser.Fail, parser.get_key_sequence, {"foo": {}}, "foo") def testGetKeySequenceOrObject(self): self.assertEquals(parser.get_key_sequence_or_object({"foo": []}, "foo"), []) self.assertEquals(parser.get_key_sequence_or_object({"foo": {}}, "foo"), [{}]) self.assertEquals(parser.get_key_sequence_or_object({"foo": [{}]}, "foo"), [{}]) self.assertRaises(parser.Fail, parser.get_key_sequence_or_object, {}, "foo") self.assertRaises(parser.Fail, parser.get_key_sequence_or_object, {"foo": "bar"}, "foo") def testGetKeyDateOrNone(self): from datetime import date self.assertEquals(parser.get_key_date_or_none({"foo": "2013-11-16"}, "foo"), date(2013, 11, 16)) self.assertEquals(parser.get_key_date_or_none({"foo": ""}, "foo"), None) self.assertEquals(parser.get_key_date_or_none({"foo": None}, "foo"), None) self.assertEquals(parser.get_key_date_or_none({}, "foo"), None) self.assertRaises(parser.Fail, parser.get_key_date_or_none, {"foo": "2013"}, "foo") if __name__ == '__main__': unittest.main() debiancontributors-0.6/test/test_types.py0000644000175000017500000001246612264244702021607 0ustar enricoenrico00000000000000# coding: utf8 # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals from debiancontributors.types import * import unittest class TestTypes(unittest.TestCase): def test_identifier(self): """ Test Identifier operations """ iem = Identifier("email", "enrico@enricozini.org") iem.validate() self.assertEquals(iem.type, "email") self.assertEquals(iem.id, "enrico@enricozini.org") self.assertIsNone(iem.desc) ied = Identifier("email", "enrico@enricozini.org", "Enrico Zini") ied.validate() self.assertEquals(ied.type, "email") self.assertEquals(ied.id, "enrico@enricozini.org") self.assertEquals(ied.desc, "Enrico Zini") ilo = Identifier("login", "enrico") ilo.validate() self.assertEquals(ilo.type, "login") self.assertEquals(ilo.id, "enrico") self.assertIsNone(ilo.desc) ild = Identifier("login", "enrico", "Enrico Zini") ild.validate() self.assertEquals(ild.type, "login") self.assertEquals(ild.id, "enrico") self.assertEquals(ild.desc, "Enrico Zini") ifp = Identifier("fpr", "1793D6AB75663E6BF104953A634F4BD1E7AD5568") ifp.validate() self.assertEquals(ifp.type, "fpr") self.assertEquals(ifp.id, "1793D6AB75663E6BF104953A634F4BD1E7AD5568") self.assertIsNone(ifp.desc) ifd = Identifier("fpr", "1793D6AB75663E6BF104953A634F4BD1E7AD5568", "Enrico Zini") ifd.validate() self.assertEquals(ifp.type, "fpr") self.assertEquals(ifd.type, "fpr") self.assertEquals(ifd.id, "1793D6AB75663E6BF104953A634F4BD1E7AD5568") self.assertEquals(ifd.desc, "Enrico Zini") self.assertEquals(iem, ied) self.assertEquals(ilo, ild) self.assertEquals(ifp, ifd) self.assertNotEquals(iem, ilo) self.assertNotEquals(iem, ifp) self.assertNotEquals(iem, ild) self.assertNotEquals(iem, ifd) self.assertNotEquals(ied, ilo) self.assertNotEquals(ied, ifp) self.assertNotEquals(ied, ild) self.assertNotEquals(ied, ifd) def test_bad_identifier(self): from debiancontributors.parser import Fail self.assertRaises(Fail, Identifier("foo", "").validate) self.assertRaises(Fail, Identifier(3, "").validate) self.assertRaises(Fail, Identifier("login", None).validate) self.assertRaises(Fail, Identifier("login", "").validate) self.assertRaises(Fail, Identifier("login", "enrico", 3).validate) self.assertRaises(Fail, Identifier("email", "enrico").validate) self.assertRaises(Fail, Identifier("fpr", "zzz").validate) def test_bad_contribution(self): from debiancontributors.parser import Fail self.assertRaises(Fail, Contribution(None).validate) self.assertRaises(Fail, Contribution("foo", 3).validate) self.assertRaises(Fail, Contribution("foo", None, 3).validate) self.assertRaises(Fail, Contribution("foo", url=3).validate) def test_auto(self): i = Identifier.create_auto("enrico") self.assertEquals(i.type, "login") self.assertEquals(i.id, "enrico") self.assertIsNone(i.desc) i = Identifier.create_auto("Enrico Zini ") self.assertEquals(i.type, "login") self.assertEquals(i.id, "enrico") self.assertEquals(i.desc, "Enrico Zini") i = Identifier.create_auto("enrico@debian.org") self.assertEquals(i.type, "email") self.assertEquals(i.id, "enrico@debian.org") self.assertIsNone(i.desc) i = Identifier.create_auto("Enrico Zini ") self.assertEquals(i.type, "email") self.assertEquals(i.id, "enrico@debian.org") self.assertEquals(i.desc, "Enrico Zini") i = Identifier.create_auto("1793 D6AB 7566 3E6B F104 953A 634F 4BD1 E7AD 5568") self.assertEquals(i.type, "fpr") self.assertEquals(i.id, "1793D6AB75663E6BF104953A634F4BD1E7AD5568") self.assertIsNone(i.desc) i = Identifier.create_auto("Enrico Zini <1793 D6AB 7566 3E6B F104 953A 634F 4BD1 E7AD 5568>") self.assertEquals(i.type, "fpr") self.assertEquals(i.id, "1793D6AB75663E6BF104953A634F4BD1E7AD5568") self.assertEquals(i.desc, "Enrico Zini") i = Identifier.create_auto("Enrico Zini, the Mad ") self.assertEquals(i.type, "login") self.assertEquals(i.id, "enrico") self.assertEquals(i.desc, "Enrico Zini, the Mad") if __name__ == '__main__': unittest.main() debiancontributors-0.6/test/__init__.py0000644000175000017500000000164212257500656021142 0ustar enricoenrico00000000000000# coding: utf8 # Debian Contributors data source for git.debian.org # # Copyright (C) 2013 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals debiancontributors-0.6/README.md0000644000175000017500000000433112264251520017316 0ustar enricoenrico00000000000000debiancontributors python module ================================ See https://wiki.debian.org/DebianContributors for technical information about contributors.debian.org See https://wiki.debian.org/Teams/FrontDesk/DcSiteDevel for this project's page in the Debian wiki. ## Computing and posting data using dc-tool You can describe a data source and how to look for contribution data in a simple configuration file, then run this to perform data mining and submit data to the site, all in one go: dc-tool --mine myconfigfile --post Without --post, it prints the results of data mining on standard output: it is useful to test a data mining configuration. See DATAMINING.rst and the examples/ directory for documentation and examples for the data mining configuration. ## Posting data using dc-tool Assuming you [created a data source in the website][newds] called `myteam.debian.net` with authentication token 'foobar'. Assuming you have generated a file `submission.json`, with your submission. You can post it with `dc-tool` using: dc-tool --source myteam.debian.net --auth-token foobar --post submission.json dc-tool will validate the submission for you before posting it. You can check the contents of the submission by running `dc-tool` without any option: dc-tool submission.json ## Posting data using python code Assuming you [created a data source in the website][newds] called `myteam.debian.net` with authentication token 'foobar'. import debiancontributors as dc from datetime import Date # Create a Submission s = dc.Submission("myteam.debian.net") # Add contribution data to it s.add_contribution( dc.Identifier("email", "enrico@debian.org"), dc.Contribution("shave_yaks", Date(2013, 1, 1), Date(2013, 12, 23))) # Post it to the site success, info = s.post("foobar") if not success: import json print("submission failed:") print(json.dumps(info, indent=1)) ## Posting data the way you like Posting a submission is just a matter of building a JSON data structure and posting it to the site via HTTP POST. See [DebianContributors Implementation notes](https://wiki.debian.org/DebianContributors#Implementation_notes) for details. [newds]: https://wiki.debian.org/DebianContributors#Creating_a_new_data_source debiancontributors-0.6/DATAMINING.rst0000644000175000017500000002231512264333604020212 0ustar enricoenrico00000000000000 =================== dc-tool data mining =================== dc-tool has several methods of data mining that can be controlled via a configuration file. It works like this: 1. Read this documentation and create a configuration file to test. 2. Run ``dc-tool --mine=mysource.conf`` to perform data mining and print results to standard output. 3. When you are satisfied of the results, run ``dc-tool --mine=mysource.conf --post`` to post data to contributors.debian.org. Run that via cron and you have a full working data source. ------------------------- Configuration file syntax ------------------------- The configuration file follows the usual Debian RFC822/Yaml-like syntax. If the first group of options does not have a "contribution:" field, it is used for general configuration of the data source. All other sections define methods of mining the data you want. The data source configuration section ===================================== Example:: # You don't need this option if you call this file nm.debian.org.conf #source: nm.debian.org # Auhentication token used to post data. Use a leading '@' as in '@filename' # to use the contents of another file as auth token. Do not make this file # world-readable! auth_token: @secrets/auth_token.txt The general configuration section has three configurable keywords: ``name`` Data source name, as configured in contributors.debian.org. If omitted, dc-tool will use the configuration file name. If the file name ends in ``.ini``, ``.conf`` or ``.cfg``, the extension will be removed. ``auth_token`` The authentication token used for posting data to the site. Anyone with this authentication token can post data for this data source, so be careful not to give this file world-readable permissions. ``baseurl`` You never need this unless you want to test a local copy of the contributors.debian.org codebase: it defaults to ``https://contributors.debian.org/`` but you can change it to submit data to your local development version. Data mining sections ==================== Example:: contribution: committer # Data mining method method: gitdirs # Configuration specific to this method dirs: /srv/git.debian.org/git/collab-maint/*.git url: https://alioth.debian.org/users/{user}/ Each data mining section has at least two configurable keywords: ``contribution`` Contribution type for this data source, as configured in contributors.debian.org. You can have many sections with the same contribution types, and the results of their data mining will all be merged. ``method`` The mining method. There are several mining method available, each with its own configuration options, documented below. The rest of the options are specific to each data mining method. Below is a full documentation of them. Data mining methods =================== files ----- Recursively scan directories using file attributes to detect contributions. Generates `login` types of identifiers, using the usernames of the system where it is run. Example:: contribution: committer method: files dirs: /srv/cvs.debian.org/cvs/webwml url: https://alioth.debian.org/users/{user}/ Configuration options ````````````````````` ``dirs`` : Glob, required, default: None. directories to scan. You can give one or more, and even use shell-style globbing. ``url`` : Char, optional, default: None. template used to build URLs to link to people's contributions. ``{user}`` will be replaced with the username Option types ```````````` ``Char`` A string value. Can be any UTF-8 string. ``Glob`` A string with one or more filenames. Globbing is supported. Arguments can be quoted to deal with whitespace, but glob characters will always be expanded. gitdirs ------- Scan git directories using file attributes to detect contributions. Generates `login` types of identifiers, using the usernames of the system where it is run. Example:: contribution: committer method: gitdirs dirs: /srv/git.debian.org/git/collab-maint/*.git url: https://alioth.debian.org/users/{user}/ Configuration options ````````````````````` ``dirs`` : Glob, required, default: None. ``.git`` directories to scan. You can give one or more, and even use shell-style globbing. ``url`` : Char, optional, default: None. template used to build URLs to link to people's contributions. ``{user}`` will be replaced with the username Option types ```````````` ``Char`` A string value. Can be any UTF-8 string. ``Glob`` A string with one or more filenames. Globbing is supported. Arguments can be quoted to deal with whitespace, but glob characters will always be expanded. gitlogs ------- Scan git logs, taking note of committer and author activity Generates `email` types of identifiers, trusting whatever is in the git log. Example:: contribution: committer method: gitlogs dirs: /srv/git.debian.org/git/collab-maint/*.git Configuration options ````````````````````` ``dirs`` : Glob, required, default: None. ``.git`` directories to scan. You can give one or more, and even use shell-style globbing. Option types ```````````` ``Glob`` A string with one or more filenames. Globbing is supported. Arguments can be quoted to deal with whitespace, but glob characters will always be expanded. mailfrom -------- Scan email address from From: headers in mailboxes Example:: contribution: developer method: mailfrom folders: /home/debian/lists/debian-devel-announce/* url: http://www.example.com/{email} Configuration options ````````````````````` ``blacklist`` : Emails, optional, default: None. if present, emails from this list will not be considered as contributors. ``folders`` : Glob, required, default: None. mail folders to scan. You can give one or more, and even use shell-style globbing. Mailbox, mailbox.gz and Maildir folders are supported. ``url`` : Char, optional, default: None. template used to build URLs to link to people's contributions. ``{email}`` will be replaced with the email address ``whitelist`` : Emails, optional, default: None. if present, only emails from this list will be considered as contributors. Option types ```````````` ``Char`` A string value. Can be any UTF-8 string. ``Glob`` A string with one or more filenames. Globbing is supported. Arguments can be quoted to deal with whitespace, but glob characters will always be expanded. ``Emails`` A list of email addresses, like in email To: or Cc: headers. postgres -------- Perform data mining using a SQL query on a Postgres database. This requires python-psycopg2 to be installed. Example:: contribution: uploader method: postgres db: service=projectb identifier: login query: SELECT s.install_date as date, u.uid as id, u.name as desc FROM source s JOIN fingerprint f ON s.sig_fpr = f.id JOIN uid u ON f.uid = u.id url: http://qa.debian.org/developer.php?login={id}&comaint=yes Configuration options ````````````````````` ``db`` : Char, required, default: None. database connection string. See `psycopg2.connect `_ for details. ``identifier`` : IdentifierType, optional, default: u'auto'. type of identifier that is found by this SQL query. ``query`` : Char, required, default: None. SQL query used to list contributions. SELECT column field names are significant: ``id`` is the contributor name, email, or fingerprint, depending on how ``identifier`` is configured. ``date`` is the contribution date, as a date or datetime. ``desc`` (optional) is a human-readable description for this ``id``, like a person's name. All other SELECT columns are ignored, but can be useful to provide values for the ``url`` template. ``url`` : Char, optional, default: None. template used to build URLs to link to people's contributions. Words in curly braces (like ``{id}``) will be expanded with the SELECT column of the same name. Option types ```````````` ``Char`` A string value. Can be any UTF-8 string. ``IdentifierType`` An identifier type. Can be one of: ``auto`` autodetect. "ident" or "Name " are accepted, and ident can be any email, login or OpenPGP fingerprint ``login`` debian.org or Alioth login name. ``email`` email address. ``fpr`` OpenPGP key fingerprint. svndirs ------- Scan subversion directories using file attributes to detect contributions. Generates `login` types of identifiers, using the usernames of the system where it is run. Example:: contribution: committer method: svndirs dirs: /srv/svn.debian.org/svn/collab-maint url: https://alioth.debian.org/users/{user}/ Configuration options ````````````````````` ``dirs`` : Glob, required, default: None. subversion directories to scan. You can give one or more, and even use shell-style globbing. ``url`` : Char, optional, default: None. template used to build URLs to link to people's contributions. ``{user}`` will be replaced with the username Option types ```````````` ``Char`` A string value. Can be any UTF-8 string. ``Glob`` A string with one or more filenames. Globbing is supported. Arguments can be quoted to deal with whitespace, but glob characters will always be expanded. debiancontributors-0.6/dc-tool0000755000175000017500000001134512264333400017327 0ustar enricoenrico00000000000000#!/usr/bin/python # coding: utf8 # Script for mining and/or posting data to contributors.debian.org # # Copyright (C) 2013--2014 Enrico Zini # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals import sys import json import argparse import logging def read_json_sources(source_name, pathnames): import debiancontributors as dc # Read JSON data, parsing it to validate it submission = None if pathnames: for fname in pathnames: with open(fname, "r") as fd: try: s = dc.Submission.from_json(source_name, fd) except dc.parser.ClusterFail as e: for msg in e.messages: print("{}: {}".format(fname, msg), file=sys.stderr) return None except dc.parser.Fail as e: print("{}: {}".format(fname, e.msg), file=sys.stderr) return None if submission is None: submission = s else: submission.merge_with(s) else: submission = dc.Submission.from_json(source_name, sys.stdin) return submission if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--debug", help="enable debugging output", action="store_true") parser.add_argument("--verbose", help="enable verbose output", action="store_true") parser.add_argument("--baseurl", metavar="url", help="URL to post data to (default: %(default)s)", action="store", default='https://contributors.debian.org/') parser.add_argument("--source", help="Data source name") parser.add_argument("--auth-token", help="Authentication token. Use @file to use the file content as auth token.") parser.add_argument("--json", help="write the JSON submission to standard output", action="store_true") parser.add_argument("--post", help="POST contribution data to the site", action="store_true") parser.add_argument("--mine", metavar="conffile", action="store", default=None, help="Perform data mining using the given config file") parser.add_argument("--mine-documentation", action="store_true", default=False, help="Print data mining documentation in reStructuredText format") parser.add_argument("sources", metavar="source(s)", help="JSON file(s) to post if --mine is not provided", default=None, nargs="*") args = parser.parse_args() FORMAT = "%(asctime)-15s %(levelname)s %(message)s" if args.debug: logging.basicConfig(level=logging.DEBUG, stream=sys.stderr, format=FORMAT) elif args.verbose: logging.basicConfig(level=logging.INFO, stream=sys.stderr, format=FORMAT) else: logging.basicConfig(level=logging.WARN, stream=sys.stderr, format=FORMAT) if args.mine_documentation: from debiancontributors import DataMine DataMine.print_documentation() sys.exit(0) if args.mine: from debiancontributors import DataMine miner = DataMine(args.mine, source_name=args.source) miner.scan() submission = miner.submission else: submission = read_json_sources(args.source, args.sources) if submission is None: sys.exit(1) # Override auth_token and baseurl from commandline if requested if args.auth_token: submission.set_auth_token(args.auth_token) if args.baseurl: submission.baseurl = args.baseurl if args.post: success, details = submission.post() if success: json.dump(details, sys.stdout, indent=2) print() sys.exit(0) else: json.dump(details, sys.stderr, indent=2) print(file=sys.stderr) sys.exit(1) elif args.json: submission.to_json(sys.stdout, indent=1) else: import io with io.open(sys.stdout.fileno(), "wt", encoding="utf8", closefd=False) as out: submission.print_compact(out) sys.exit(0) debiancontributors-0.6/setup.py0000644000175000017500000000236712264333642017566 0ustar enricoenrico00000000000000#!/usr/bin/env/python """ Copyright (C) 2013--2014 Enrico Zini This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see . """ from distutils.core import setup setup( name = "debiancontributors", requires=['requests (>=2.0.0)'], version = "0.6", description = "Manage submissions to contributors.debian.org", author = ["Enrico Zini"], author_email = ["enrico@debian.org"], url = "http://anonscm.debian.org/gitweb/?p=nm/python-debiancontributors.git;a=summary", license = "http://www.gnu.org/licenses/lgpl-3.0.html", packages = ["debiancontributors", "debiancontributors.scanners", "debiancontributors.scanners.utils"], scripts=['dc-tool'], ) debiancontributors-0.6/MANIFEST.in0000644000175000017500000000015112264263015017573 0ustar enricoenrico00000000000000include MANIFEST.in include README.md include DATAMINING.rst include test/__init__.py include examples/* debiancontributors-0.6/examples/0000755000175000017500000000000012264333700017655 5ustar enricoenrico00000000000000debiancontributors-0.6/examples/www.debian.org0000644000175000017500000000016612264250055022437 0ustar enricoenrico00000000000000contribution: commit method: files dirs: /srv/cvs.debian.org/cvs/webwml url = https://alioth.debian.org/users/{user}/ debiancontributors-0.6/examples/collab-maint0000644000175000017500000000041112264250101022127 0ustar enricoenrico00000000000000contribution: committer method: gitdirs dirs: /srv/git.debian.org/git/collab-maint/*.git url = https://alioth.debian.org/users/{user}/ contribution: committer method: svndirs dirs: /srv/svn.debian.org/svn/collab-maint url = https://alioth.debian.org/users/{user}/ debiancontributors-0.6/examples/ftp.debian.org0000644000175000017500000000113712264243422022403 0ustar enricoenrico00000000000000contribution: upload method: postgres db: service=projectb query: SELECT s.install_date as date, u.uid as id, u.name as desc FROM source s JOIN fingerprint f ON s.sig_fpr = f.id JOIN uid u ON f.uid = u.id url: http://qa.debian.org/developer.php?login={id}&comaint=yes contribution: maint method: postgres db: service=projectb query: SELECT s.install_date as date, c.name as id FROM source s JOIN maintainer c ON s.changedby = c.id url: http://qa.debian.org/developer.php?login={id}&comaint=yes debiancontributors-0.6/examples/example.conf0000644000175000017500000000071012264333560022161 0ustar enricoenrico00000000000000# Example data source definition. # This would submit data to a data source named 'example' on contributors.debian.org source: example auth_token: @example.auth_token # Data mining for contribution type 'committer' contribution: committer # Use the 'gitdirs' scanner method: gitdirs # Configuration of the gitdirs scanner. See dc-tool --mine-document for details. dirs: .git url: http://example.com/{user} contribution: committer method: gitlogs dirs: . debiancontributors-0.6/PKG-INFO0000644000175000017500000000060012264333700017130 0ustar enricoenrico00000000000000Metadata-Version: 1.1 Name: debiancontributors Version: 0.6 Summary: Manage submissions to contributors.debian.org Home-page: http://anonscm.debian.org/gitweb/?p=nm/python-debiancontributors.git;a=summary Author: ['Enrico Zini'] Author-email: ['enrico@debian.org'] License: http://www.gnu.org/licenses/lgpl-3.0.html Description: UNKNOWN Platform: UNKNOWN Requires: requests (>=2.0.0)