csvkit-0.9.1/0000755000076600000240000000000012506400503014012 5ustar onyxfishstaff00000000000000csvkit-0.9.1/csvkit/0000755000076600000240000000000012506400503015315 5ustar onyxfishstaff00000000000000csvkit-0.9.1/csvkit/__init__.py0000644000076600000240000000204012477331225017435 0ustar onyxfishstaff00000000000000#!/usr/bin/env python """ This module contains csvkit's superpowered replacement for the builtin :mod:`csv` module. For Python 2 users, the greatest improvement over the standard library full unicode support. Python 3's :mod:`csv` module supports unicode internally, so this module is provided primarily for compatability purposes. * Python 2: :mod:`csvkit.py2`. * Python 3: :mod:`csvkit.py3`. """ import six if six.PY2: from csvkit import py2 CSVKitReader = py2.CSVKitReader CSVKitWriter = py2.CSVKitWriter CSVKitDictReader = py2.CSVKitDictReader CSVKitDictWriter = py2.CSVKitDictWriter reader = py2.reader writer = py2.writer DictReader = py2.CSVKitDictReader DictWriter = py2.CSVKitDictWriter else: from csvkit import py3 CSVKitReader = py3.CSVKitReader CSVKitWriter = py3.CSVKitWriter CSVKitDictReader = py3.CSVKitDictReader CSVKitDictWriter = py3.CSVKitDictWriter reader = py3.reader writer = py3.writer DictReader = py3.CSVKitDictReader DictWriter = py3.CSVKitDictWriter csvkit-0.9.1/csvkit/cleanup.py0000644000076600000240000000660512477331225017340 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from csvkit.exceptions import CSVTestException, LengthMismatchError def join_rows(rows, joiner=' '): """ Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg. """ rows = list(rows) fixed_row = rows[0][:] for row in rows[1:]: if len(row) == 0: row = [''] fixed_row[-1] += "%s%s" % (joiner, row[0]) fixed_row.extend(row[1:]) return fixed_row def fix_length_errors(errs, target_line_length, joiner=' '): """ If possible, transform the rows backed up in the list of errors into rows of the correct length. If the list of errors does not yet produce a row of target_line_length, return an empty array. """ if not errs: return [] fixed_rows = [] backlog = [] for err in errs: if type(err) is not LengthMismatchError: return [] # give up if any are not length errors backlog.append(err) fixed_row = join_rows([err.row for err in backlog]) if len(fixed_row) == target_line_length: fixed_rows.append(fixed_row) backlog = [] # reset return fixed_rows def extract_joinable_row_errors(errs): joinable = [] for err in reversed(errs): if type(err) is not LengthMismatchError: break if joinable and err.line_number != joinable[-1].line_number - 1: break joinable.append(err) joinable.reverse() return joinable class RowChecker(object): """ Iterate over rows of a CSV producing cleaned rows and storing error rows. """ def __init__(self, reader): self.reader = reader self.column_names = next(reader) self.errors = [] self.rows_joined = 0 self.joins = 0 def checked_rows(self): """ A generator which yields rows which are ready to write to output. 
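        A rough usage sketch (the file name 'dirty.csv' is illustrative only;
        within csvkit this class is normally driven by the csvclean utility)::

            from csvkit import CSVKitReader
            from csvkit.cleanup import RowChecker

            with open('dirty.csv') as f:
                checker = RowChecker(CSVKitReader(f))

                for row in checker.checked_rows():
                    print(row)  # rows with the expected number of columns

            print(checker.errors)  # uncorrectable rows, as CSVTestException objects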
""" line_number = self.reader.line_num for row in self.reader: try: if len(row) != len(self.column_names): raise LengthMismatchError(line_number, row, len(self.column_names)) yield row except LengthMismatchError as e: self.errors.append(e) joinable_row_errors = extract_joinable_row_errors(self.errors) while joinable_row_errors: fixed_row = join_rows([err.row for err in joinable_row_errors], joiner=' ') if len(fixed_row) < len(self.column_names): break if len(fixed_row) == len(self.column_names): self.rows_joined += len(joinable_row_errors) self.joins += 1 yield fixed_row for fixed in joinable_row_errors: self.errors.remove(fixed) break joinable_row_errors = joinable_row_errors[1:] # keep trying in case we're too long because of a straggler except CSVTestException as e: self.errors.append(e) line_number = self.reader.line_num csvkit-0.9.1/csvkit/cli.py0000644000076600000240000003545312477331225016463 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import argparse import bz2 import codecs import gzip import os.path import sys import six from csvkit import CSVKitReader from csvkit.exceptions import ColumnIdentifierError, RequiredHeaderError def lazy_opener(fn): def wrapped(self, *args, **kwargs): self._lazy_open() fn(*args, **kwargs) return wrapped class LazyFile(six.Iterator): """ A proxy for a File object that delays opening it until a read method is called. Currently this implements only the minimum methods to be useful, but it could easily be expanded. """ def __init__(self, init, *args, **kwargs): self.init = init self.f = None self._is_lazy_opened = False self._lazy_args = args self._lazy_kwargs = kwargs def __getattr__(self, name): if not self._is_lazy_opened: self.f = self.init(*self._lazy_args, **self._lazy_kwargs) self._is_lazy_opened = True return getattr(self.f, name) def __iter__(self): return self def close(self): self.f.close() self.f = None self._is_lazy_opened = False def __next__(self): if not self._is_lazy_opened: self.f = self.init(*self._lazy_args, **self._lazy_kwargs) self._is_lazy_opened = True return next(self.f) class CSVKitUtility(object): description = '' epilog = '' override_flags = '' def __init__(self, args=None, output_file=None): """ Perform argument processing and other setup for a CSVKitUtility. """ self._init_common_parser() self.add_arguments() self.args = self.argparser.parse_args(args) if 'f' not in self.override_flags: self.input_file = self._open_input_file(self.args.input_path) self.reader_kwargs = self._extract_csv_reader_kwargs() self.writer_kwargs = self._extract_csv_writer_kwargs() self._install_exception_handler() if output_file is None: self.output_file = sys.stdout else: self.output_file = output_file # Ensure SIGPIPE doesn't throw an exception # Prevents [Errno 32] Broken pipe errors, e.g. when piping to 'head' # To test from the shell: # python -c "for i in range(5000): print 'a,b,c'" | csvlook | head # Without this fix you will see at the end: # [Errno 32] Broken pipe # With this fix, there should be no error # For details on Python and SIGPIPE, see http://bugs.python.org/issue1652 try: import signal signal.signal(signal.SIGPIPE, signal.SIG_DFL) except (ImportError, AttributeError): #Do nothing on platforms that don't have signals or don't have SIGPIPE pass def add_arguments(self): """ Called upon initialization once the parser for common arguments has been constructed. Should be overriden by individual utilities. 
""" raise NotImplementedError('add_arguments must be provided by each subclass of CSVKitUtility.') def main(self): """ Main loop of the utility. Should be overriden by individual utilities and explicitly called by the executing script. """ raise NotImplementedError(' must be provided by each subclass of CSVKitUtility.') def _init_common_parser(self): """ Prepare a base argparse argument parser so that flags are consistent across different shell command tools. If you want to constrain which common args are present, you can pass a string for 'omitflags'. Any argument whose single-letter form is contained in 'omitflags' will be left out of the configured parser. Use 'f' for file. """ self.argparser = argparse.ArgumentParser(description=self.description, epilog=self.epilog) # Input if 'f' not in self.override_flags: self.argparser.add_argument(metavar="FILE", nargs='?', dest='input_path', help='The CSV file to operate on. If omitted, will accept input on STDIN.') if 'd' not in self.override_flags: self.argparser.add_argument('-d', '--delimiter', dest='delimiter', help='Delimiting character of the input CSV file.') if 't' not in self.override_flags: self.argparser.add_argument('-t', '--tabs', dest='tabs', action='store_true', help='Specifies that the input CSV file is delimited with tabs. Overrides "-d".') if 'q' not in self.override_flags: self.argparser.add_argument('-q', '--quotechar', dest='quotechar', help='Character used to quote strings in the input CSV file.') if 'u' not in self.override_flags: self.argparser.add_argument('-u', '--quoting', dest='quoting', type=int, choices=[0,1,2,3], help='Quoting style used in the input CSV file. 0 = Quote Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 = Quote None.') if 'b' not in self.override_flags: self.argparser.add_argument('-b', '--doublequote', dest='doublequote', action='store_true', help='Whether or not double quotes are doubled in the input CSV file.') if 'p' not in self.override_flags: self.argparser.add_argument('-p', '--escapechar', dest='escapechar', help='Character used to escape the delimiter if --quoting 3 ("Quote None") is specified and to escape the QUOTECHAR if --doublequote is not specified.') if 'z' not in self.override_flags: self.argparser.add_argument('-z', '--maxfieldsize', dest='maxfieldsize', type=int, help='Maximum length of a single field in the input CSV file.') if 'e' not in self.override_flags: self.argparser.add_argument('-e', '--encoding', dest='encoding', default='utf-8', help='Specify the encoding the input CSV file.') if 'S' not in self.override_flags: self.argparser.add_argument('-S', '--skipinitialspace', dest='skipinitialspace', default=False, action='store_true', help='Ignore whitespace immediately following the delimiter.') if 'H' not in self.override_flags: self.argparser.add_argument('-H', '--no-header-row', dest='no_header_row', action='store_true', help='Specifies that the input CSV file has no header row. Will create default headers.') if 'v' not in self.override_flags: self.argparser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help='Print detailed tracebacks when errors occur.') # Output if 'l' not in self.override_flags: self.argparser.add_argument('-l', '--linenumbers', dest='line_numbers', action='store_true', help='Insert a column of line numbers at the front of the output. 
Useful when piping to grep or as a simple primary key.') # Input/Output if 'zero' not in self.override_flags: self.argparser.add_argument('--zero', dest='zero_based', action='store_true', help='When interpreting or displaying column numbers, use zero-based numbering instead of the default 1-based numbering.') def _open_input_file(self, path): """ Open the input file specified on the command line. """ if six.PY2: mode = 'rb' kwargs = {} else: mode = 'rt' kwargs = { 'encoding': self.args.encoding } if not path or path == '-': f = sys.stdin else: (_, extension) = os.path.splitext(path) if extension == u'.gz': f = LazyFile(gzip.open, path, mode, **kwargs) elif extension == '.bz2': if six.PY2: f = LazyFile(bz2.BZ2File, path, mode, **kwargs) else: f = LazyFile(bz2.open, path, mode, **kwargs) else: f = LazyFile(open, path, mode, **kwargs) return f def _extract_csv_reader_kwargs(self): """ Extracts those from the command-line arguments those would should be passed through to the input CSV reader(s). """ kwargs = {} if self.args.tabs: kwargs['delimiter'] = '\t' elif self.args.delimiter: kwargs['delimiter'] = self.args.delimiter if self.args.quotechar: kwargs['quotechar'] = self.args.quotechar if self.args.quoting: kwargs['quoting'] = self.args.quoting if self.args.doublequote: kwargs['doublequote'] = self.args.doublequote if self.args.escapechar: kwargs['escapechar'] = self.args.escapechar if self.args.maxfieldsize: kwargs['maxfieldsize'] = self.args.maxfieldsize if self.args.skipinitialspace: kwargs['skipinitialspace'] = self.args.skipinitialspace if six.PY2 and self.args.encoding: kwargs['encoding'] = self.args.encoding return kwargs def _extract_csv_writer_kwargs(self): """ Extracts those from the command-line arguments those would should be passed through to the output CSV writer. """ kwargs = {} if 'l' not in self.override_flags and self.args.line_numbers: kwargs['line_numbers'] = True return kwargs def _install_exception_handler(self): """ Installs a replacement for sys.excepthook, which handles pretty-printing uncaught exceptions. """ if six.PY2: sys.stderr = codecs.getwriter('utf-8')(sys.stderr) def handler(t, value, traceback): if self.args.verbose: sys.__excepthook__(t, value, traceback) else: # Special case handling for Unicode errors, which behave very strangely # when cast with unicode() if t == UnicodeDecodeError: sys.stderr.write('Your file is not "%s" encoded. Please specify the correct encoding with the -e flag. Use the -v flag to see the complete error.\n' % self.args.encoding) else: sys.stderr.write('%s\n' % six.text_type(value)) sys.excepthook = handler def print_column_names(self): """ Pretty-prints the names and indices of all columns to a file-like object (usually sys.stdout). """ if self.args.no_header_row: raise RequiredHeaderError('You cannot use --no-header-row with the -n or --names options.') f = self.input_file output = self.output_file try: zero_based=self.args.zero_based except: zero_based=False rows = CSVKitReader(f, **self.reader_kwargs) column_names = next(rows) for i, c in enumerate(column_names): if not zero_based: i += 1 output.write('%3i: %s\n' % (i, c)) def match_column_identifier(column_names, c, zero_based=False): """ Determine what column a single column id (name or index) matches in a series of column names. Note that integer values are *always* treated as positional identifiers. If you happen to have column names which are also integers, you must specify them using a positional index. 
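    For example, given the header row ``['id', 'name', 'age']`` (returned
    indices are always zero-based)::

        >>> match_column_identifier(['id', 'name', 'age'], 'name')
        1
        >>> match_column_identifier(['id', 'name', 'age'], 3)
        2
        >>> match_column_identifier(['id', 'name', 'age'], 2, zero_based=True)
        2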
""" if isinstance(c, six.string_types) and not c.isdigit() and c in column_names: return column_names.index(c) else: try: c = int(c) if not zero_based: c -= 1 # Fail out if neither a column name nor an integer except: raise ColumnIdentifierError('Column identifier "%s" is neither an integer, nor a existing column\'s name.' % c) # Fail out if index is 0-based if c < 0: raise ColumnIdentifierError('Column 0 is not valid; columns are 1-based.') # Fail out if index is out of range if c >= len(column_names): raise ColumnIdentifierError('Index %i is beyond the last named column, "%s" at index %i.' % (c, column_names[-1], len(column_names) - 1)) return c def parse_column_identifiers(ids, column_names, zero_based=False, excluded_columns=None): """ Parse a comma-separated list of column indices AND/OR names into a list of integer indices. Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of non-integers (e.g. column names) are not supported. Note: Column indices are 1-based. """ columns = [] # If not specified, start with all columns if not ids: columns = range(len(column_names)) if columns and not excluded_columns: return columns if not columns: for c in ids.split(','): c = c.strip() try: columns.append(match_column_identifier(column_names, c, zero_based)) except ColumnIdentifierError: if ':' in c: a,b = c.split(':',1) elif '-' in c: a,b = c.split('-',1) else: raise try: if a: a = int(a) else: a = 1 if b: b = int(b) + 1 else: b = len(column_names) + 1 except ValueError: raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.") for x in range(a,b): columns.append(match_column_identifier(column_names, x, zero_based)) excludes = [] if excluded_columns: for c in excluded_columns.split(','): c = c.strip() try: excludes.append(match_column_identifier(column_names, c, zero_based)) except ColumnIdentifierError: if ':' in c: a,b = c.split(':',1) elif '-' in c: a,b = c.split('-',1) else: raise try: if a: a = int(a) else: a = 1 if b: b = int(b) + 1 else: b = len(column_names) except ValueError: raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character.") for x in range(a,b): excludes.append(match_column_identifier(column_names, x, zero_based)) return [c for c in columns if c not in excludes] csvkit-0.9.1/csvkit/convert/0000755000076600000240000000000012506400503016775 5ustar onyxfishstaff00000000000000csvkit-0.9.1/csvkit/convert/__init__.py0000644000076600000240000000433012477331225021121 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import six from csvkit.convert.csvitself import csv2csv from csvkit.convert.fixed import fixed2csv from csvkit.convert.geojs import geojson2csv from csvkit.convert.js import json2csv from csvkit.convert.ndjs import ndjson2csv from csvkit.convert.xls import xls2csv from csvkit.convert.xlsx import xlsx2csv SUPPORTED_FORMATS = ['fixed', 'xls', 'xlsx', 'csv', 'json', 'geojson', 'ndjson'] # DBF is supported for Python 2 only if six.PY2: from csvkit.convert.dbase import dbf2csv SUPPORTED_FORMATS.append('dbf') def convert(f, format, schema=None, key=None, **kwargs): """ Convert a file of a specified format to CSV. 
""" if not f: raise ValueError('f must not be None') if not format: raise ValueError('format must not be None') if format == 'fixed': if not schema: raise ValueError('schema must not be null when format is "fixed"') return fixed2csv(f, schema, **kwargs) elif format == 'xls': return xls2csv(f, **kwargs) elif format == 'xlsx': return xlsx2csv(f, **kwargs) elif format == 'json': return json2csv(f, key, **kwargs) elif format == 'ndjson': return ndjson2csv(f, **kwargs) elif format == 'geojson': return geojson2csv(f, **kwargs) elif format == 'csv': return csv2csv(f, **kwargs) elif format == 'dbf': if six.PY3: raise ValueError('format "dbf" is not supported forthis version of Python.') return dbf2csv(f, **kwargs) else: raise ValueError('format "%s" is not supported' % format) def guess_format(filename): """ Try to guess a file's format based on its extension (or lack thereof). """ last_period = filename.rfind('.') if last_period == -1: # No extension: assume fixed-width return 'fixed' extension = filename[last_period + 1:] if extension == 'xls': return extension elif extension == 'xlsx': return extension elif extension in ['json', 'js']: return 'json' elif extension == 'csv': return extension elif extension == 'fixed': return extension elif extension == 'dbf': return extension return None csvkit-0.9.1/csvkit/convert/csvitself.py0000644000076600000240000000053612477331225021370 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import six from csvkit import table def csv2csv(f, **kwargs): """ "Convert" a CSV into a new CSV by normalizing types and correcting for other anomalies. """ tab = table.Table.from_csv(f, **kwargs) o = six.StringIO() output = tab.to_csv(o) output = o.getvalue() o.close() return output csvkit-0.9.1/csvkit/convert/dbase.py0000644000076600000240000000171212477331225020441 0ustar onyxfishstaff00000000000000#!/usr/bin/env python """ Note: dbf is only supported/imported for Python 2. """ import dbf import six from csvkit import table def dbf2csv(f, **kwargs): """ Convert a dBASE .dbf file to csv. """ with dbf.Table(f.name) as db: headers = db.field_names column_ids = range(len(headers)) data_columns = [[] for c in headers] for row in db: for i, d in enumerate(row): try: data_columns[i].append(six.text_type(row[column_ids[i]]).strip()) except IndexError: # Non-rectangular data is truncated break columns = [] for i, c in enumerate(data_columns): columns.append(table.Column(column_ids[i], headers[i], c)) tab = table.Table(columns=columns) o = six.StringIO() output = tab.to_csv(o) output = o.getvalue() o.close() return output csvkit-0.9.1/csvkit/convert/fixed.py0000644000076600000240000001241212477331225020461 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from collections import namedtuple from codecs import iterdecode import six from csvkit import CSVKitReader, CSVKitWriter def fixed2csv(f, schema, output=None, **kwargs): """ Convert a fixed-width file to csv using a CSV-formatted schema description. A schema CSV must start with a header row with (at least) columns labeled "column","start", and "length". (Other columns will be ignored.) For each subsequent row, therefore, those columns will be used to identify a column name, the starting index of the column (an integer), and the length of the column (also an integer). Values in the 'start' column are assumed to be zero-based, unless the first value for 'start' is 1, in which case all values are assumed to be one-based. 
If output is specified, rows will be written to that object, otherwise the complete data will be returned. """ streaming = True if output else False if not streaming: output = six.StringIO() try: encoding = kwargs['encoding'] except KeyError: encoding = None writer = CSVKitWriter(output) reader = FixedWidthReader(f, schema, encoding=encoding) writer.writerows(reader) if not streaming: data = output.getvalue() return data # Return empty string when streaming return '' class FixedWidthReader(six.Iterator): """ Given a fixed-width file and a schema file, produce an analog to a csv reader that yields a row of strings for each line in the fixed-width file, preceded with a row of headers as provided in the schema. (This might be problematic if fixed-width-files ever have header rows also, but I haven't seen that.) The schema_file should be in CSV format with a header row which has columns 'column', 'start', and 'length'. (Other columns will be ignored.) Values in the 'start' column are assumed to be "zero-based" unless the first value is "1" in which case all values are assumed to be "one-based." """ def __init__(self, f, schema, encoding=None): if encoding is not None: f = iterdecode(f, encoding) self.file = f self.parser = FixedWidthRowParser(schema) self.header = True def __iter__(self): return self def __next__(self): if self.header: self.header = False return self.parser.headers return self.parser.parse(next(self.file)) FixedWidthField = namedtuple('FixedWidthField', ['name', 'start', 'length']) class FixedWidthRowParser(object): """ Instantiated with a schema, able to return a sequence of trimmed strings representing fields given a fixed-length line. Flexible about where the columns are, as long as they are headed with the literal names 'column', 'start', and 'length'. """ def __init__(self, schema): self.fields = [] # A list of FixedWidthFields schema_reader = CSVKitReader(schema) schema_decoder = SchemaDecoder(next(schema_reader)) for i,row in enumerate(schema_reader): try: self.fields.append(schema_decoder(row)) except Exception as e: raise ValueError("Error reading schema at line %i: %s" % (i + 2,e)) def parse(self, line): values = [] for field in self.fields: values.append(line[field.start:field.start + field.length].strip()) return values def parse_dict(self, line): """Convenience method returns a dict. Equivalent to dict(zip(self.headers,self.parse(line))).""" return dict(zip(self.headers,self.parse(line))) @property def headers(self): return [field.name for field in self.fields] class SchemaDecoder(object): """ Extracts column, start, and length columns from schema rows. Once instantiated, each time the instance is called with a row, a (column,start,length) tuple will be returned based on values in that row and the constructor kwargs. """ REQUIRED_COLUMNS = [('column', None), ('start', int), ('length', int)] start = None length = None column = None one_based = None def __init__(self, header, **kwargs): """ Constructs a schema row decoder. """ for p, val_type in self.REQUIRED_COLUMNS: try: if val_type: setattr(self, p, val_type(header.index(p))) else: setattr(self, p, header.index(p)) except ValueError: raise ValueError('A column named "%s" must exist in the schema file.' % (p)) def __call__(self, row): """ Return a tuple (column, start, length) based on this instance's parameters. If the first time this is called, the row's 'start' value is 1, then all 'start' values including the first will be one less than in the actual input data, to adjust for one-based specifications. 
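        For example, a decoder built from the header ``['column', 'start',
        'length']`` behaves roughly as follows (a sketch, not actual test
        output)::

            decoder = SchemaDecoder(['column', 'start', 'length'])
            decoder(['name', '1', '10'])  # FixedWidthField(name='name', start=0, length=10)
            decoder(['age', '11', '3'])   # FixedWidthField(name='age', start=10, length=3)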
Values for 'start' and 'length' will be cast to integers. """ if self.one_based is None: self.one_based = (int(row[self.start]) == 1) if self.one_based: adjusted_start = int(row[self.start]) - 1 else: adjusted_start = int(row[self.start]) return FixedWidthField(row[self.column], adjusted_start, int(row[self.length])) csvkit-0.9.1/csvkit/convert/geojs.py0000644000076600000240000000351512477331225020475 0ustar onyxfishstaff00000000000000#!/usr/bin/env python try: from collections import OrderedDict import json except ImportError: from ordereddict import OrderedDict import simplejson as json import six from csvkit import CSVKitWriter def geojson2csv(f, key=None, **kwargs): """ Convert a GeoJSON document into CSV format. """ js = json.load(f, object_pairs_hook=OrderedDict) if not isinstance(js, dict): raise TypeError('JSON document is not valid GeoJSON: Root element is not an object.') if 'type' not in js: raise TypeError('JSON document is not valid GeoJSON: No top-level "type" key.') if js['type'] != 'FeatureCollection': raise TypeError('Only GeoJSON with root FeatureCollection type is supported. Not %s' % js['type']) if 'features' not in js: raise TypeError('JSON document is not a valid FeatureCollection: No top-level "features" key.') features = js['features'] features_parsed = [] # tuples in the format (id, properties, geometry) property_fields = [] for feature in features: geoid = feature.get('id', None) properties = feature.get('properties') or {} for prop in properties.keys(): if prop not in property_fields: property_fields.append(prop) geometry = json.dumps(feature['geometry']) features_parsed.append((geoid, properties, geometry)) header = ['id'] header.extend(property_fields) header.append('geojson') o = six.StringIO() writer = CSVKitWriter(o) writer.writerow(header) for geoid, properties, geometry in features_parsed: row = [geoid] for field in property_fields: row.append(properties.get(field, None)) row.append(geometry) writer.writerow(row) output = o.getvalue() o.close() return output csvkit-0.9.1/csvkit/convert/js.py0000644000076600000240000000334312477331225020001 0ustar onyxfishstaff00000000000000#!/usr/bin/env python try: from collections import OrderedDict import json except ImportError: from ordereddict import OrderedDict import simplejson as json import itertools import six from csvkit import CSVKitWriter def parse_object(obj, path=''): """ Recursively parse JSON objects and a dictionary of paths/keys and values. Inspired by JSONPipe (https://github.com/dvxhouse/jsonpipe). """ if isinstance(obj, dict): iterator = obj.items() elif isinstance(obj, (list, tuple)): iterator = enumerate(obj) else: return { path.strip('/'): obj } d = {} for key, value in iterator: key = six.text_type(key) d.update(parse_object(value, path + key + '/')) return d def json2csv(f, key=None, **kwargs): """ Convert a JSON document into CSV format. The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list. 
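    For example, a document like ``{"results": [{"a": 1, "b": 2}]}`` could be
    converted with (the file name and key are illustrative)::

        from csvkit.convert.js import json2csv

        with open('example.json') as f:
            csv_data = json2csv(f, key='results')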
""" js = json.load(f, object_pairs_hook=OrderedDict) if isinstance(js, dict): if not key: raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.') js = js[key] fields = [] flat = [] for obj in js: flat.append(parse_object(obj)) for key in obj.keys(): if key not in fields: fields.append(key) o = six.StringIO() writer = CSVKitWriter(o) writer.writerow(fields) for i in flat: row = [] for field in fields: row.append(i.get(field, None)) writer.writerow(row) output = o.getvalue() o.close() return output csvkit-0.9.1/csvkit/convert/ndjs.py0000644000076600000240000000335112477331225020322 0ustar onyxfishstaff00000000000000#!/usr/bin/env python try: from collections import OrderedDict import json except ImportError: from ordereddict import OrderedDict import simplejson as json import itertools import six from csvkit import CSVKitWriter def parse_object(obj, path=''): """ Recursively parse JSON objects and a dictionary of paths/keys and values. Inspired by JSONPipe (https://github.com/dvxhouse/jsonpipe). """ if isinstance(obj, dict): iterator = obj.items() elif isinstance(obj, (list, tuple)): iterator = enumerate(obj) else: return { path.strip('/'): obj } d = {} for key, value in iterator: key = six.text_type(key) d.update(parse_object(value, path + key + '/')) return d def ndjson2csv(f, key=None, **kwargs): """ Convert a JSON document into CSV format. Supports both JSON and "Newline-delimited JSON". The top-level element of the input must be a list or a dictionary. If it is a dictionary, a key must be provided which is an item of the dictionary which contains a list. """ first_line = f.readline() first_row = json.loads(first_line, object_pairs_hook=OrderedDict) js = itertools.chain((first_row, ), (json.loads(l, object_pairs_hook=OrderedDict) for l in f)) fields = [] flat = [] for obj in js: flat.append(parse_object(obj)) for key in obj.keys(): if key not in fields: fields.append(key) o = six.StringIO() writer = CSVKitWriter(o) writer.writerow(fields) for i in flat: row = [] for field in fields: row.append(i.get(field, None)) writer.writerow(row) output = o.getvalue() o.close() return output csvkit-0.9.1/csvkit/convert/xls.py0000644000076600000240000001106512477331225020173 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import datetime import six import xlrd from csvkit import table from csvkit.exceptions import XLSDataError def normalize_empty(values, **kwargs): """ Normalize a column which contains only empty cells. """ return None, [None] * len(values) def normalize_text(values, **kwargs): """ Normalize a column of text cells. """ return six.text_type, [six.text_type(v) if v else None for v in values] def normalize_numbers(values, **kwargs): """ Normalize a column of numeric cells. """ # Test if all values are whole numbers, if so coerce floats it ints integral = True for v in values: if v and v % 1 != 0: integral = False break if integral: return int, [int(v) if v != '' else None for v in values] else: # Convert blanks to None return float, [v if v else None for v in values] def normalize_dates(values, datemode=0, **kwargs): """ Normalize a column of date cells. 
""" normal_values = [] normal_types_set = set() for v in values: # Convert blanks to None if v == '': normal_values.append(None) continue v_tuple = xlrd.xldate_as_tuple(v, datemode) if v_tuple == (0, 0, 0, 0, 0, 0): # Midnight normal_values.append(datetime.time(*v_tuple[3:])) normal_types_set.add(datetime.time) elif v_tuple[3:] == (0, 0, 0): # Date only normal_values.append(datetime.date(*v_tuple[:3])) normal_types_set.add(datetime.date) elif v_tuple[:3] == (0, 0, 0): # Time only normal_values.append(datetime.time(*v_tuple[3:])) normal_types_set.add(datetime.time) else: # Date and time normal_values.append(datetime.datetime(*v_tuple)) normal_types_set.add(datetime.datetime) if len(normal_types_set) == 1: # No special handling if column contains only one type pass elif normal_types_set == set([datetime.datetime, datetime.date]): # If a mix of dates and datetimes, up-convert dates to datetimes for i, v in enumerate(normal_values): if v.__class__ == datetime.date: normal_values[i] = datetime.datetime.combine(v, datetime.time()) normal_types_set.remove(datetime.date) elif normal_types_set == set([datetime.datetime, datetime.time]): # Datetimes and times don't mix raise XLSDataError('Column contains a mix of times and datetimes (this is not supported).') elif normal_types_set == set([datetime.date, datetime.time]): # Dates and times don't mix raise XLSDataError('Column contains a mix of dates and times (this is not supported).') # Natural serialization of dates and times by csv.writer is insufficent so they get converted back to strings at this point return normal_types_set.pop(), normal_values def normalize_booleans(values, **kwargs): """ Normalize a column of boolean cells. """ return bool, [bool(v) if v != '' else None for v in values] NORMALIZERS = { xlrd.biffh.XL_CELL_EMPTY: normalize_empty, xlrd.biffh.XL_CELL_TEXT: normalize_text, xlrd.biffh.XL_CELL_NUMBER: normalize_numbers, xlrd.biffh.XL_CELL_DATE: normalize_dates, xlrd.biffh.XL_CELL_BOOLEAN: normalize_booleans } def determine_column_type(types): """ Determine the correct type for a column from a list of cell types. """ types_set = set(types) types_set.discard(xlrd.biffh.XL_CELL_EMPTY) # Normalize mixed types to text if len(types_set) > 1: return xlrd.biffh.XL_CELL_TEXT try: return types_set.pop() except KeyError: return xlrd.biffh.XL_CELL_EMPTY def xls2csv(f, **kwargs): """ Convert an Excel .xls file to csv. 
""" book = xlrd.open_workbook(file_contents=f.read()) if 'sheet' in kwargs: sheet = book.sheet_by_name(kwargs['sheet']) else: sheet = book.sheet_by_index(0) tab = table.Table() for i in range(sheet.ncols): # Trim headers column_name = sheet.col_values(i)[0] values = sheet.col_values(i)[1:] types = sheet.col_types(i)[1:] column_type = determine_column_type(types) t, normal_values = NORMALIZERS[column_type](values, datemode=book.datemode) column = table.Column(i, column_name, normal_values, normal_type=t) tab.append(column) o = six.StringIO() output = tab.to_csv(o) output = o.getvalue() o.close() return output csvkit-0.9.1/csvkit/convert/xlsx.py0000644000076600000240000000463212477331225020365 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import datetime from openpyxl.reader.excel import load_workbook import six from csvkit import CSVKitWriter from csvkit.typeinference import NULL_TIME def normalize_datetime(dt): if dt.microsecond == 0: return dt ms = dt.microsecond if ms < 1000: return dt.replace(microsecond=0) elif ms > 999000: return dt.replace(microsecond=0) + datetime.timedelta(seconds=1) return dt def has_date_elements(cell): """ Try to use formatting to determine if a cell contains only time info. See: http://office.microsoft.com/en-us/excel-help/number-format-codes-HP005198679.aspx """ if 'd' in cell.number_format or \ 'y' in cell.number_format: return True return False def xlsx2csv(f, output=None, **kwargs): """ Convert an Excel .xlsx file to csv. Note: Unlike other convertor's, this one allows output columns to contain mixed data types. Blank headers are also possible. """ streaming = True if output else False if not streaming: output = six.StringIO() writer = CSVKitWriter(output) book = load_workbook(f, use_iterators=True, data_only=True) if 'sheet' in kwargs: sheet = book.get_sheet_by_name(kwargs['sheet']) else: sheet = book.get_active_sheet() for i, row in enumerate(sheet.iter_rows()): if i == 0: writer.writerow([c.value for c in row]) continue out_row = [] for c in row: value = c.value if value.__class__ is datetime.datetime: # Handle default XLSX date as 00:00 time if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c): value = value.time() value = normalize_datetime(value) elif value.time() == NULL_TIME: value = value.date() else: value = normalize_datetime(value) elif value.__class__ is float: if value % 1 == 0: value = int(value) if value.__class__ in (datetime.datetime, datetime.date, datetime.time): value = value.isoformat() out_row.append(value) writer.writerow(out_row) if not streaming: data = output.getvalue() return data # Return empty string when streaming return '' csvkit-0.9.1/csvkit/exceptions.py0000644000076600000240000000552612477331225020073 0ustar onyxfishstaff00000000000000#!/usr/bin/env python class CustomException(Exception): """ A base exception that handles pretty-printing errors for command-line utilities. """ def __init__(self, msg): self.msg = msg def __unicode__(self): return self.msg def __str__(self): return self.msg class FieldSizeLimitError(CustomException): """ Exception raised when a field in the CSV file exceeds the default max or one provided by the user. """ def __init__(self, limit): self.msg = 'CSV contains fields longer than maximum length of %i characters. Try raising the maximum with the --maxfieldsize flag.' % limit class ColumnIdentifierError(CustomException): """ Exception raised when the user supplies an invalid column identifier. 
""" pass class XLSDataError(CustomException): """ Exception raised when there is a problem converting XLS data. """ pass class CSVTestException(CustomException): """ Superclass for all row-test-failed exceptions. All must have a line number, the problematic row, and a text explanation. """ def __init__(self, line_number, row, msg): super(CSVTestException, self).__init__(msg) self.line_number = line_number self.row = row class LengthMismatchError(CSVTestException): """ Encapsulate information about a row which as the wrong length. """ def __init__(self, line_number, row, expected_length): msg = 'Expected %i columns, found %i columns' % (expected_length, len(row)) super(LengthMismatchError, self).__init__(line_number, row, msg) @property def length(self): return len(self.row) class CSVJSONException(CustomException): """ Exception raised when there is a problem converting data to CSV. """ pass class NonUniqueKeyColumnException(CSVJSONException): pass class InvalidValueForTypeException(CustomException): """ Exception raised when a value can not be normalized to a specified type. """ def __init__(self, index, value, normal_type): self.index = index self.value = value self.normal_type = normal_type msg = 'Unable to convert "%s" to type %s (at index %i)' % (value, normal_type, index) super(InvalidValueForTypeException, self).__init__(msg) class InvalidValueForTypeListException(CustomException): """ Exception raised when one or more InvalidValueForTypeException has been raised while accumulating errors. """ def __init__(self, errors): self.errors = errors msg = 'Encountered errors converting values in %i columns' % len(errors) super(InvalidValueForTypeListException, self).__init__(msg) class RequiredHeaderError(CustomException): """ Exception raised when an operation requires a CSV file to have a header row. """ pass csvkit-0.9.1/csvkit/grep.py0000644000076600000240000001050212477331225016635 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import six from csvkit.exceptions import ColumnIdentifierError class FilteringCSVReader(six.Iterator): """ Given any row iterator, only return rows which pass the filter. If 'header' is False, then all rows must pass the filter; by default, the first row will be passed through untested. The value of patterns may be either a sequence or a dictionary. Items in the sequence and values in the dictionary may be strings, regular expressions, or functions. For each row in the wrapped iterator, these values will be used as tests, and the row will only be yielded by the filter if all values pass their corresponding tests. This behavior can be toggled so that all rows which pass any of the tests will be yielded by specifying "any_match=True" in the constructor. Empty values (the blank string or None) not be tested; the value in that position will not affect whether or not the filtering reader yields a prospective row. To test for explicitly blank, use a regular expression such as "^$" or "^\s*$" If patterns is a dictionary, the keys can be integers identifying indices in the input rows, or, if 'header' is True (as it is by default), they can be strings matching column names in the first row of the reader. If patterns is a sequence, then it is assumed that they will be applied to the equivalently positioned values in the test rows. By specifying 'inverse=True', only rows which do not match the patterns will be passed by the filter. The header, if there is one, will always be returned regardless of the value for 'inverse'. 
""" returned_header = False column_names = None def __init__(self, reader, patterns, header=True, any_match=False, inverse=False): super(FilteringCSVReader, self).__init__() self.reader = reader self.header = header if self.header: self.column_names = next(reader) self.any_match = any_match self.inverse = inverse self.patterns = standardize_patterns(self.column_names,patterns) def __iter__(self): return self def __next__(self): if self.column_names and not self.returned_header: self.returned_header = True return self.column_names while True: row = next(self.reader) if self.test_row(row): return row raise StopIteration() def test_row(self, row): for idx, test in self.patterns.items(): if self.any_match and test(row[idx]): return not self.inverse # True if not self.any_match and not test(row[idx]): return self.inverse # False return not self.inverse # True def standardize_patterns(column_names, patterns): """ Given patterns in any of the permitted input forms, return a dict whose keys are column indices and whose values are functions which return a boolean value whether the value passes. If patterns is a dictionary and any of its keys are values in column_names, the returned dictionary will have those keys replaced with the integer position of that value in column_names """ try: # Dictionary of patterns patterns = dict((k, pattern_as_function(v)) for k, v in patterns.items() if v) if not column_names: return patterns p2 = {} for k in patterns: if k in column_names: idx = column_names.index(k) if idx in patterns: raise ColumnIdentifierError("Column %s has index %i which already has a pattern." % (k,idx)) p2[idx] = patterns[k] else: p2[k] = patterns[k] return p2 except AttributeError: # Sequence of patterns return dict((i, pattern_as_function(x)) for i, x in enumerate(patterns)) def pattern_as_function(obj): # obj is function if hasattr(obj, '__call__'): return obj # obj is regex object if hasattr(obj, 'match'): return regex_callable(obj) # obj is string return lambda x: obj in x class regex_callable(object): def __init__(self, pattern): self.pattern = pattern def __call__(self, arg): return self.pattern.match(arg) csvkit-0.9.1/csvkit/headers.py0000644000076600000240000000030312477331225017311 0ustar onyxfishstaff00000000000000#!/usr/bin/env python def make_default_headers(n): """ Make a set of simple, default headers for files that are missing them. """ return ['column%i' % (i + 1) for i in range(n)] csvkit-0.9.1/csvkit/join.py0000644000076600000240000001256212477331225016647 0ustar onyxfishstaff00000000000000#!/usr/bin/env python def _get_ordered_keys(rows, column_index): """ Get ordered keys from rows, given the key column index. """ return [r[column_index] for r in rows] def _get_mapped_keys(rows, column_index): mapped_keys = {} for r in rows: key = r[column_index] if key in mapped_keys: mapped_keys[key].append(r) else: mapped_keys[key] = [r] return mapped_keys def sequential_join(left_table, right_table): """ Join two tables by aligning them horizontally without performing any filtering. 
""" # Grab headers left_headers = left_table[0] right_headers = right_table[0] left_rows = left_table[1:] right_rows = iter(right_table[1:]) output = [left_headers + right_headers] for left_row in left_rows: try: right_row = next(right_rows) except StopIteration: output.append(left_row + ([u''] * len(right_headers))) output.append(left_row + right_row) for right_row in right_rows: output.append(([u''] * len(left_headers)) + right_row) return output def inner_join(left_table, left_column_id, right_table, right_column_id): """ Execute an inner join on two tables and return the combined table. """ # Grab headers left_headers = left_table[0] len_left_headers = len(left_headers) right_headers = right_table[0] left_rows = left_table[1:] right_rows = right_table[1:] # Map right rows to keys right_mapped_keys = _get_mapped_keys(right_rows, right_column_id) output = [left_headers + right_headers] for left_row in left_rows: len_left_row = len(left_row) if len_left_row < len_left_headers: left_row.extend([None] * (len_left_headers - len_left_row)) left_key = left_row[left_column_id] if left_key in right_mapped_keys: for right_row in right_mapped_keys[left_key]: output.append(left_row + right_row) return output def full_outer_join(left_table, left_column_id, right_table, right_column_id): """ Execute full outer join on two tables and return the combined table. """ # Grab headers left_headers = left_table[0] len_left_headers = len(left_headers) right_headers = right_table[0] left_rows = left_table[1:] right_rows = right_table[1:] # Get ordered keys left_ordered_keys = _get_ordered_keys(left_rows, left_column_id) # Get mapped keys right_mapped_keys = _get_mapped_keys(right_rows, right_column_id) output = [left_headers + right_headers] for left_row in left_rows: len_left_row = len(left_row) left_key = left_row[left_column_id] if len_left_row < len_left_headers: left_row.extend([None] * (len_left_headers - len_left_row)) if left_key in right_mapped_keys: for right_row in right_mapped_keys[left_key]: output.append(left_row + right_row) else: output.append(left_row + ([u''] * len(right_headers))) for right_row in right_rows: right_key = right_row[right_column_id] if right_key not in left_ordered_keys: output.append(([u''] * len(left_headers)) + right_row) return output def left_outer_join(left_table, left_column_id, right_table, right_column_id): """ Execute left outer join on two tables and return the combined table. """ # Grab headers left_headers = left_table[0] len_left_headers = len(left_headers) right_headers = right_table[0] left_rows = left_table[1:] right_rows = right_table[1:] # Get mapped keys right_mapped_keys = _get_mapped_keys(right_rows, right_column_id) output = [left_headers + right_headers] for left_row in left_rows: len_left_row = len(left_row) left_key = left_row[left_column_id] if len_left_row < len_left_headers: left_row.extend([None] * (len_left_headers - len_left_row)) if left_key in right_mapped_keys: for right_row in right_mapped_keys[left_key]: output.append(left_row + right_row) else: output.append(left_row + ([u''] * len(right_headers))) return output def right_outer_join(left_table, left_column_id, right_table, right_column_id): """ Execute right outer join on two tables and return the combined table. 
""" # Grab headers left_headers = left_table[0] len_left_headers = len(left_headers) right_headers = right_table[0] left_rows = left_table[1:] right_rows = right_table[1:] # Get ordered keys left_ordered_keys = _get_ordered_keys(left_rows, left_column_id) # Get mapped keys right_mapped_keys = _get_mapped_keys(right_rows, right_column_id) output = [left_headers + right_headers] for left_row in left_rows: len_left_row = len(left_row) left_key = left_row[left_column_id] if len_left_row < len_left_headers: left_row.extend([None] * (len_left_headers - len_left_row)) if left_key in right_mapped_keys: for right_row in right_mapped_keys[left_key]: output.append(left_row + right_row) for right_row in right_rows: right_key = right_row[right_column_id] if right_key not in left_ordered_keys: output.append(([u''] * len(left_headers)) + right_row) return output csvkit-0.9.1/csvkit/py2.py0000644000076600000240000000552612477331225016424 0ustar onyxfishstaff00000000000000#!/usr/bin/env python """ Python2-specific classes. """ import six from csvkit import unicsv class CSVKitReader(unicsv.UnicodeCSVReader): """ A unicode-aware CSV reader. """ pass class CSVKitWriter(unicsv.UnicodeCSVWriter): """ A unicode-aware CSV writer. """ def __init__(self, f, encoding='utf-8', line_numbers=False, **kwargs): self.row_count = 0 self.line_numbers = line_numbers if 'lineterminator' not in kwargs: kwargs['lineterminator'] = '\n' unicsv.UnicodeCSVWriter.__init__(self, f, encoding, **kwargs) def _append_line_number(self, row): if self.row_count == 0: row.insert(0, 'line_number') else: row.insert(0, self.row_count) self.row_count += 1 def writerow(self, row): if self.line_numbers: row = list(row) self._append_line_number(row) # Convert embedded Mac line endings to unix style line endings so they get quoted row = [i.replace('\r', '\n') if isinstance(i, six.string_types) else i for i in row] unicsv.UnicodeCSVWriter.writerow(self, row) def writerows(self, rows): for row in rows: self.writerow(row) class CSVKitDictReader(unicsv.UnicodeCSVDictReader): """ A unicode-aware CSV DictReader. """ pass class CSVKitDictWriter(unicsv.UnicodeCSVDictWriter): """ A unicode-aware CSV DictWriter. """ def __init__(self, f, fieldnames, encoding='utf-8', line_numbers=False, **kwargs): self.row_count = 0 self.line_numbers = line_numbers if 'lineterminator' not in kwargs: kwargs['lineterminator'] = '\n' unicsv.UnicodeCSVDictWriter.__init__(self, f, fieldnames, encoding=encoding, **kwargs) def _append_line_number(self, row): if self.row_count == 0: row['line_number'] = 0 else: row['line_number'] = self.row_count self.row_count += 1 def writerow(self, row): if self.line_numbers: row = list(row) self._append_line_number(row) # Convert embedded Mac line endings to unix style line endings so they get quoted row = dict([(k, v.replace('\r', '\n')) if isinstance(v, basestring) else (k, v) for k, v in row.items()]) unicsv.UnicodeCSVDictWriter.writerow(self, row) def writerows(self, rows): for row in rows: self.writerow(row) def reader(*args, **kwargs): """ A drop-in replacement for Python's :func:`csv.reader` that leverages :class:`csvkit.py2.CSVKitReader`. """ return CSVKitReader(*args, **kwargs) def writer(*args, **kwargs): """ A drop-in replacement for Python's :func:`csv.writer` that leverages :class:`csvkit.py2.CSVKitWriter`. """ return CSVKitWriter(*args, **kwargs) csvkit-0.9.1/csvkit/py3.py0000644000076600000240000000613112477331225016416 0ustar onyxfishstaff00000000000000#!/usr/bin/env python """ Python3-specific classes. 
""" import csv import six class CSVKitReader(six.Iterator): """ A wrapper around Python 3's builtin :func:`csv.reader`. """ def __init__(self, f, **kwargs): self.reader = csv.reader(f, **kwargs) def __iter__(self): return self def __next__(self): return next(self.reader) @property def dialect(self): return self.reader.dialect @property def line_num(self): return self.reader.line_num class CSVKitWriter(object): """ A wrapper around Python 3's builtin :func:`csv.writer`. """ def __init__(self, f, line_numbers=False, **kwargs): self.row_count = 0 self.line_numbers = line_numbers if 'lineterminator' not in kwargs: kwargs['lineterminator'] = '\n' self.writer = csv.writer(f, **kwargs) def _append_line_number(self, row): if self.row_count == 0: row.insert(0, 'line_number') else: row.insert(0, self.row_count) self.row_count += 1 def writerow(self, row): if self.line_numbers: row = list(row) self._append_line_number(row) # Convert embedded Mac line endings to unix style line endings so they get quoted row = [i.replace('\r', '\n') if isinstance(i, six.string_types) else i for i in row] self.writer.writerow(row) def writerows(self, rows): for row in rows: self.writer.writerow(row) class CSVKitDictReader(csv.DictReader): """ A wrapper around Python 3's builtin :class:`csv.DictReader`. """ pass class CSVKitDictWriter(csv.DictWriter): """ A wrapper around Python 3's builtin :class:`csv.DictWriter`. """ def __init__(self, f, fieldnames, line_numbers=False, **kwargs): self.row_count = 0 self.line_numbers = line_numbers if 'lineterminator' not in kwargs: kwargs['lineterminator'] = '\n' csv.DictWriter.__init__(self, f, fieldnames, **kwargs) def _append_line_number(self, row): if self.row_count == 0: row['line_number'] = 0 else: row['line_number'] = self.row_count self.row_count += 1 def writerow(self, row): if self.line_numbers: row = list(row) self._append_line_number(row) # Convert embedded Mac line endings to unix style line endings so they get quoted row = dict([(k, v.replace('\r', '\n')) if isinstance(v, six.string_types) else (k, v) for k, v in row.items()]) csv.DictWriter.writerow(self, row) def writerows(self, rows): for row in rows: self.writerow(row) def reader(*args, **kwargs): """ A drop-in replacement for Python's :func:`csv.reader` that leverages :class:`csvkit.py3.CSVKitReader`. """ return CSVKitReader(*args, **kwargs) def writer(*args, **kwargs): """ A drop-in replacement for Python's :func:`csv.writer` that leverages :class:`csvkit.py3.CSVKitWriter`. """ return CSVKitWriter(*args, **kwargs) csvkit-0.9.1/csvkit/sniffer.py0000644000076600000240000000062112477331225017335 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import csv POSSIBLE_DELIMITERS = [',', '\t', ';', ' ', ':', '|'] def sniff_dialect(sample): """ A functional version of ``csv.Sniffer().sniff``, that extends the list of possible delimiters to include some seen in the wild. 
""" try: dialect = csv.Sniffer().sniff(sample, POSSIBLE_DELIMITERS) except: dialect = None return dialect csvkit-0.9.1/csvkit/sql.py0000644000076600000240000000571012477331225016504 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import datetime import six from sqlalchemy import Column, MetaData, Table, create_engine from sqlalchemy import BigInteger, Boolean, Date, DateTime, Float, Integer, String, Time from sqlalchemy.schema import CreateTable NoneType = type(None) DIALECTS = { 'access': 'access.base', 'firebird': 'firebird.kinterbasdb', 'informix': 'informix.informixdb', 'maxdb': 'maxdb.sapdb', 'mssql': 'mssql.pyodbc', 'mysql': 'mysql.mysqlconnector', 'oracle': 'oracle.cx_oracle', 'postgresql': 'postgresql.psycopg2', 'sqlite': 'sqlite.pysqlite', 'sybase': 'sybase.pyodbc' } NULL_COLUMN_MAX_LENGTH = 32 SQL_INTEGER_MAX = 2147483647 SQL_INTEGER_MIN = -2147483647 def make_column(column, no_constraints=False): """ Creates a sqlalchemy column from a csvkit Column. """ sql_column_kwargs = {} sql_type_kwargs = {} column_types = { bool: Boolean, #int: Integer, see special case below float: Float, datetime.datetime: DateTime, datetime.date: Date, datetime.time: Time, NoneType: String, six.text_type: String } if column.type in column_types: sql_column_type = column_types[column.type] elif column.type is int: column_max = max([v for v in column if v is not None]) column_min = min([v for v in column if v is not None]) if column_max > SQL_INTEGER_MAX or column_min < SQL_INTEGER_MIN: sql_column_type = BigInteger else: sql_column_type = Integer else: raise ValueError('Unexpected normalized column type: %s' % column.type) if no_constraints is False: if column.type is NoneType: sql_type_kwargs['length'] = NULL_COLUMN_MAX_LENGTH elif column.type is six.text_type: sql_type_kwargs['length'] = column.max_length() sql_column_kwargs['nullable'] = column.has_nulls() return Column(column.name, sql_column_type(**sql_type_kwargs), **sql_column_kwargs) def get_connection(connection_string): engine = create_engine(connection_string) metadata = MetaData(engine) return engine, metadata def make_table(csv_table, name='table_name', no_constraints=False, db_schema=None, metadata=None): """ Creates a sqlalchemy table from a csvkit Table. """ if not metadata: metadata = MetaData() sql_table = Table(csv_table.name, metadata, schema=db_schema) for column in csv_table: sql_table.append_column(make_column(column, no_constraints)) return sql_table def make_create_table_statement(sql_table, dialect=None): """ Generates a CREATE TABLE statement for a sqlalchemy table. """ if dialect: module = __import__('sqlalchemy.dialects.%s' % DIALECTS[dialect], fromlist=['dialect']) sql_dialect = module.dialect() else: sql_dialect = None return six.text_type(CreateTable(sql_table).compile(dialect=sql_dialect)).strip() + ';' csvkit-0.9.1/csvkit/table.py0000644000076600000240000002123612477331225016775 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import datetime import itertools import six from csvkit import CSVKitReader, CSVKitWriter from csvkit import sniffer from csvkit import typeinference from csvkit.cli import parse_column_identifiers from csvkit.headers import make_default_headers class InvalidType(object): """ Dummy object type for Column initialization, since None is being used as a valid value. """ pass class Column(list): """ A normalized data column and inferred annotations (nullable, etc.). 
""" def __init__(self, order, name, l, normal_type=InvalidType, blanks_as_nulls=True, infer_types=True): """ Construct a column from a sequence of values. If normal_type is not InvalidType, inference will be skipped and values assumed to have already been normalized. If infer_types is False, type inference will be skipped and the type assumed to be unicode. """ if normal_type != InvalidType: t = normal_type data = l elif not infer_types: t = six.text_type data = l else: t, data = typeinference.normalize_column_type(l, blanks_as_nulls=blanks_as_nulls) list.__init__(self, data) self.order = order self.name = name or '_unnamed' # empty column names don't make sense self.type = t def __str__(self): return str(self.__unicode__()) def __unicode__(self): """ Stringify a description of this column. """ return '%3i: %s (%s)' % (self.order, self.name, self.type) def __getitem__(self, key): """ Return null for keys beyond the range of the column. This allows for columns to be of uneven length and still be merged into rows cleanly. """ l = len(self) if isinstance(key, slice): indices = six.moves.range(*key.indices(l)) return [(list.__getitem__(self, i) if i < l else None) for i in indices] if key >= l: return None return list.__getitem__(self, key) def has_nulls(self): """ Check if this column contains nulls. """ return True if None in self else False def max_length(self): """ Compute maximum length of data in this column. Returns 0 if the column does not of type ``unicode``. """ l = 0 if self.type == six.text_type: l = max([len(d) if d else 0 for d in self]) if self.has_nulls(): l = max(l, 4) # "None" return l class Table(list): """ A normalized data table and inferred annotations (nullable, etc.). """ def __init__(self, columns=[], name='new_table'): """ Generic constructor. You should normally use a from_* method to create a Table. """ list.__init__(self, columns) self.name = name def __str__(self): return str(self.__unicode__()) def __unicode__(self): """ Stringify a description of all columns in this table. """ return '\n'.join([six.text_type(c) for c in self]) def _reindex_columns(self): """ Update order properties of all columns in table. """ for i, c in enumerate(self): c.order = i def _deduplicate_column_name(self, column): while column.name in self.headers(): try: i = column.name.rindex('_') counter = int(column.name[i + 1:]) column.name = '%s_%i' % (column.name[:i], counter + 1) except: column.name += '_2' return column.name def append(self, column): """Implements list append.""" self._deduplicate_column_name(column) list.append(self, column) column.index = len(self) - 1 def insert(self, i, column): """Implements list insert.""" self._deduplicate_column_name(column) list.insert(self, i, column) self._reindex_columns() def extend(self, columns): """Implements list extend.""" for c in columns: self._deduplicate_column_name(c) list.extend(self, columns) self._reindex_columns() def remove(self, column): """Implements list remove.""" list.remove(self, column) self._reindex_columns() def sort(self): """Forbids list sort.""" raise NotImplementedError() def reverse(self): """Forbids list reverse.""" raise NotImplementedError() def headers(self): return [c.name for c in self] def count_rows(self): lengths = [len(c) for c in self] if lengths: return max(lengths) return 0 def row(self, i): """ Fetch a row of data from this table. 
""" if i < 0: raise IndexError('Negative row numbers are not valid.') if i >= self.count_rows(): raise IndexError('Row number exceeds the number of rows in the table.') row_data = [c[i] for c in self] return row_data @classmethod def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ Creates a new Table from a file-like object containing CSV data. Note: the column_ids argument will cause only those columns with a matching identifier to be parsed, type inferred, etc. However, their order/index property will reflect the original data (e.g. column 8 will still be "order" 7, even if it's the third column in the resulting Table. """ # This bit of nonsense is to deal with "files" from stdin, # which are not seekable and thus must be buffered contents = f.read() # snifflimit == 0 means do not sniff if snifflimit is None: kwargs['dialect'] = sniffer.sniff_dialect(contents) elif snifflimit > 0: kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = CSVKitReader(f, **kwargs) if no_header_row: # Peek at a row to infer column names from row = next(rows) headers = make_default_headers(len(row)) column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] data_columns = [[] for c in headers] # Put row back on top rows = itertools.chain([row], rows) else: headers = next(rows) if column_ids: column_ids = parse_column_identifiers(column_ids, headers, zero_based) headers = [headers[c] for c in column_ids] else: column_ids = range(len(headers)) data_columns = [[] for c in headers] width = len(data_columns) for i, row in enumerate(rows): for j, d in enumerate(row): try: data_columns[j].append(row[column_ids[j]].strip()) except IndexError: # Non-rectangular data is truncated break j += 1 # Populate remaining columns with None while j < width: data_columns[j].append(None) j += 1 columns = [] for i, c in enumerate(data_columns): columns.append(Column(column_ids[i], headers[i], c, blanks_as_nulls=blanks_as_nulls, infer_types=infer_types)) return Table(columns, name=name) def to_rows(self, serialize_dates=False): """ Generates rows from columns and performs. Optionally serialize date objects to isoformat strings. """ if serialize_dates: out_columns = [] for c in self: # Stringify datetimes, dates, and times if c.type in [datetime.datetime, datetime.date, datetime.time]: out_columns.append([six.text_type(v.isoformat()) if v != None else None for v in c]) else: out_columns.append(c) # Convert columns to rows return list(zip(*out_columns)) else: return list(zip(*self)) def to_csv(self, output, **kwargs): """ Serializes the table to CSV and writes it to any file-like object. 
""" rows = self.to_rows(serialize_dates=True) # Insert header row rows.insert(0, self.headers()) writer = CSVKitWriter(output, **kwargs) writer.writerows(rows) csvkit-0.9.1/csvkit/typeinference.py0000644000076600000240000002145512477331225020551 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import datetime from dateutil.parser import parse import six from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException NoneType = type(None) NULL_VALUES = ('na', 'n/a', 'none', 'null', '.') TRUE_VALUES = ('yes', 'y', 'true', 't') FALSE_VALUES = ('no', 'n', 'false', 'f') DEFAULT_DATETIME = datetime.datetime(9999, 12, 31, 0, 0, 0) NULL_DATE = datetime.date(9999, 12, 31) NULL_TIME = datetime.time(0, 0, 0) def normalize_column_type(l, normal_type=None, blanks_as_nulls=True): """ Attempts to normalize a list (column) of string values to booleans, integers, floats, dates, times, datetimes, or strings. NAs and missing values are converted to empty strings. Empty strings are converted to nulls in the case of non-string types. For string types (unicode), empty strings are converted to nulls unless blanks_as_nulls is false. Optional accepts a "normal_type" argument which specifies a type that the values must conform to (rather than inferring). Will raise InvalidValueForTypeException if a value is not coercable. Returns a tuple of (type, normal_values). """ # Optimizations lower = six.text_type.lower replace = six.text_type.replace # Convert "NA", "N/A", etc. to null types. for i, x in enumerate(l): if x is not None and lower(x) in NULL_VALUES: l[i] = '' # Are they null? if not normal_type or normal_type == NoneType: try: for i, x in enumerate(l): if x != '' and x is not None: raise ValueError('Not null') return NoneType, [None] * len(l) except ValueError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) # Are they boolean? if not normal_type or normal_type == bool: try: normal_values = [] append = normal_values.append for i, x in enumerate(l): if x == '' or x is None: append(None) elif x.lower() in TRUE_VALUES: append(True) elif x.lower() in FALSE_VALUES: append(False) else: raise ValueError('Not boolean') return bool, normal_values except ValueError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) # Are they integers? if not normal_type or normal_type == int: try: normal_values = [] append = normal_values.append for i, x in enumerate(l): if x == '' or x is None: append(None) continue int_x = int(replace(x, ',', '')) if x[0] == '0' and int(x) != 0: raise TypeError('Integer is padded with 0s, so treat it as a string instead.') append(int_x) return int, normal_values except TypeError: if normal_type == int: raise InvalidValueForTypeException(i, x, int) if blanks_as_nulls: return six.text_type, [x if x != '' else None for x in l] else: return six.text_type, l except ValueError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) # Are they floats? if not normal_type or normal_type == float: try: normal_values = [] append = normal_values.append for i, x in enumerate(l): if x == '' or x is None: append(None) continue float_x = float(replace(x, ',', '')) append(float_x) return float, normal_values except ValueError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) # Are they datetimes? 
if not normal_type or normal_type in [datetime.time, datetime.date, datetime.datetime]: try: normal_values = [] append = normal_values.append normal_types_set = set() add = normal_types_set.add for i, x in enumerate(l): if x == '' or x is None: append(None) add(NoneType) continue d = parse(x, default=DEFAULT_DATETIME) # Is it only a time? if d.date() == NULL_DATE: if normal_type and normal_type != datetime.time: raise InvalidValueForTypeException(i, x, normal_type) d = d.time() add(datetime.time) # Is it only a date? elif d.time() == NULL_TIME: if normal_type and normal_type not in [datetime.date, datetime.datetime]: raise InvalidValueForTypeException(i, x, normal_type) d = d.date() add(datetime.date) # It must be a date and time else: if normal_type and normal_type != datetime.datetime: raise InvalidValueForTypeException(i, x, normal_type) add(datetime.datetime) append(d) # This case can only happen if normal_type was specified and the column contained all nulls if normal_type and normal_types_set == set([NoneType]): return normal_type, normal_values normal_types_set.discard(NoneType) # If a mix of dates and datetimes, up-convert dates to datetimes if normal_types_set == set([datetime.datetime, datetime.date]) or (normal_types_set == set([datetime.date]) and normal_type is datetime.datetime): for i, v in enumerate(normal_values): if v.__class__ == datetime.date: normal_values[i] = datetime.datetime.combine(v, NULL_TIME) if datetime.datetime in normal_types_set: normal_types_set.discard(datetime.date) # Datetimes and times don't mix -- fallback to using strings elif normal_types_set == set([datetime.datetime, datetime.time]) or (normal_types_set == set([datetime.time]) and normal_type is datetime.datetime): raise ValueError('Cant\'t coherently mix datetimes and times in a single column.') # Dates and times don't mix -- fallback to using strings elif normal_types_set == set([datetime.date, datetime.time]) or (normal_types_set == set([datetime.time]) and normal_type is datetime.date) or (normal_types_set == set([datetime.date]) and normal_type is datetime.time): raise ValueError('Can\'t coherently mix dates and times in a single column.') return normal_types_set.pop(), normal_values except ValueError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) except OverflowError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) except TypeError: if normal_type: raise InvalidValueForTypeException(i, x, normal_type) # Don't know what they are, so they must just be strings if blanks_as_nulls: return six.text_type, [x if x != '' else None for x in l] else: return six.text_type, l def normalize_table(rows, normal_types=None, accumulate_errors=False, blanks_as_nulls=True): """ Given a sequence of sequences, normalize the lot. Optionally accepts a normal_types parameter which is a list of types that the columns must normalize to. 
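    A minimal sketch (not a strict doctest; reprs of the returned types differ
    between Python 2 and 3)::

        >>> types, columns = normalize_table([['1', 'a'], ['2', 'b']])
        >>> # types   -> [int, six.text_type]
        >>> # columns -> [[1, 2], ['a', 'b']]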
""" data_columns = [] column_count = 0 row_count = 0 for row in rows: while column_count < len(row): data_columns.append([None] * row_count) column_count += 1 for i, value in enumerate(row): data_columns[i].append(value) row_count += 1 new_normal_types = [] new_normal_columns= [] errors = {} for i, column in enumerate(data_columns): try: if normal_types: t, c = normalize_column_type(column, normal_types[i], blanks_as_nulls=blanks_as_nulls) else: t, c = normalize_column_type(column, blanks_as_nulls=blanks_as_nulls) new_normal_types.append(t) new_normal_columns.append(c) except InvalidValueForTypeException as e: if not accumulate_errors: raise errors[i] = e if errors: raise InvalidValueForTypeListException(errors) return new_normal_types, new_normal_columns csvkit-0.9.1/csvkit/unicsv.py0000644000076600000240000001126612477331225017217 0ustar onyxfishstaff00000000000000#!/usr/bin/env python """ This module contains unicode aware replacements for :func:`csv.reader` and :func:`csv.writer`. The implementations are largely copied from `examples in the csv module documentation `_. These classes are available for Python 2 only. The Python 3 version of `csv` supports unicode internally. .. note:: You probably don't want to use these classes directly. Try the :mod:`csvkit` module. """ import codecs import csv import sys import six from csvkit.exceptions import FieldSizeLimitError EIGHT_BIT_ENCODINGS = ['utf-8', 'u8', 'utf', 'utf8', 'latin-1', 'iso-8859-1', 'iso8859-1', '8859', 'cp819', 'latin', 'latin1', 'l1'] class UTF8Recoder(six.Iterator): """ Iterator that reads an encoded stream and reencodes the input to UTF-8. """ def __init__(self, f, encoding): self.reader = codecs.getreader(encoding)(f) def __iter__(self): return self def __next__(self): return next(self.reader).encode('utf-8') class UnicodeCSVReader(object): """ A CSV reader which will read rows from a file in a given encoding. """ def __init__(self, f, encoding='utf-8', maxfieldsize=None, **kwargs): f = UTF8Recoder(f, encoding) self.reader = csv.reader(f, **kwargs) if maxfieldsize: csv.field_size_limit(maxfieldsize) def next(self): try: row = next(self.reader) except csv.Error as e: # Terrible way to test for this exception, but there is no subclass if 'field larger than field limit' in str(e): raise FieldSizeLimitError(csv.field_size_limit()) else: raise e return [six.text_type(s, 'utf-8') for s in row] def __iter__(self): return self @property def line_num(self): return self.reader.line_num class UnicodeCSVWriter(object): """ A CSV writer which will write rows to a file in the specified encoding. NB: Optimized so that eight-bit encodings skip re-encoding. See: https://github.com/onyxfish/csvkit/issues/175 """ def __init__(self, f, encoding='utf-8', **kwargs): self.encoding = encoding self._eight_bit = (self.encoding.lower().replace('_', '-') in EIGHT_BIT_ENCODINGS) if self._eight_bit: self.writer = csv.writer(f, **kwargs) else: # Redirect output to a queue for reencoding self.queue = six.StringIO() self.writer = csv.writer(self.queue, **kwargs) self.stream = f self.encoder = codecs.getincrementalencoder(encoding)() def writerow(self, row): if self._eight_bit: self.writer.writerow([six.text_type(s if s != None else '').encode(self.encoding) for s in row]) else: self.writer.writerow([six.text_type(s if s != None else '').encode('utf-8') for s in row]) # Fetch UTF-8 output from the queue... 
data = self.queue.getvalue() data = data.decode('utf-8') # ...and reencode it into the target encoding data = self.encoder.encode(data) # write to the file self.stream.write(data) # empty the queue self.queue.truncate(0) def writerows(self, rows): for row in rows: self.writerow(row) class UnicodeCSVDictReader(csv.DictReader): """ Defer almost all implementation to :class:`csv.DictReader`, but wraps our unicode reader instead of :func:`csv.reader`. """ def __init__(self, f, fieldnames=None, restkey=None, restval=None, *args, **kwargs): reader = UnicodeCSVReader(f, *args, **kwargs) if 'encoding' in kwargs: kwargs.pop('encoding') csv.DictReader.__init__(self, f, fieldnames, restkey, restval, *args, **kwargs) self.reader = reader class UnicodeCSVDictWriter(csv.DictWriter): """ Defer almost all implementation to :class:`csv.DictWriter`, but wraps our unicode writer instead of :func:`csv.writer`. """ def __init__(self, f, fieldnames, restval="", extrasaction="raise", *args, **kwds): self.fieldnames = fieldnames self.restval = restval if extrasaction.lower() not in ("raise", "ignore"): raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'" % extrasaction) self.extrasaction = extrasaction self.writer = UnicodeCSVWriter(f, *args, **kwds) if sys.version_info < (2, 7): def writeheader(self): """ Python 2.6 is missing the writeheader function. """ self.writerow(dict(zip(self.fieldnames, self.fieldnames))) csvkit-0.9.1/csvkit/utilities/0000755000076600000240000000000012506400503017330 5ustar onyxfishstaff00000000000000csvkit-0.9.1/csvkit/utilities/__init__.py0000644000076600000240000000000012477331225021442 0ustar onyxfishstaff00000000000000csvkit-0.9.1/csvkit/utilities/csvclean.py0000644000076600000240000000547012477331225021521 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from os.path import splitext from csvkit import CSVKitReader, CSVKitWriter from csvkit.cli import CSVKitUtility from csvkit.cleanup import RowChecker class CSVClean(CSVKitUtility): description = 'Fix common errors in a CSV file.' override_flags = ['H'] def add_arguments(self): self.argparser.add_argument('-n', '--dry-run', dest='dryrun', action='store_true', help='Do not create output files. 
Information about what would have been done will be printed to STDERR.') def main(self): reader = CSVKitReader(self.input_file, **self.reader_kwargs) if self.args.dryrun: checker = RowChecker(reader) for row in checker.checked_rows(): pass if checker.errors: for e in checker.errors: self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg)) else: self.output_file.write('No errors.\n') if checker.joins: self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins)) else: base, ext = splitext(self.input_file.name) with open('%s_out.csv' % base,'w') as f: clean_writer = CSVKitWriter(f, **self.writer_kwargs) checker = RowChecker(reader) clean_writer.writerow(checker.column_names) for row in checker.checked_rows(): clean_writer.writerow(row) if checker.errors: error_filename = '%s_err.csv' % base with open(error_filename, 'w') as f: error_writer = CSVKitWriter(f, **self.writer_kwargs) error_header = ['line_number', 'msg'] error_header.extend(checker.column_names) error_writer.writerow(error_header) error_count = len(checker.errors) for e in checker.errors: error_writer.writerow(self._format_error_row(e)) self.output_file.write('%i error%s logged to %s\n' % (error_count,'' if error_count == 1 else 's', error_filename)) else: self.output_file.write('No errors.\n') if checker.joins: self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal line breaks.\n' % (checker.rows_joined, checker.joins)) def _format_error_row(self, error): row = [error.line_number, error.msg] row.extend(error.row) return row def launch_new_instance(): utility = CSVClean() utility.main() if __name__ == '__main__': launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvcut.py0000644000076600000240000000452612477331225021233 0ustar onyxfishstaff00000000000000#!/usr/bin/env python """ csvcut is originally the work of eminent hackers Joe Germuska and Aaron Bycoffe. This code is forked from: https://gist.github.com/561347/9846ebf8d0a69b06681da9255ffe3d3f59ec2c97 Used and modified with permission. """ import itertools from csvkit import CSVKitReader, CSVKitWriter from csvkit.cli import CSVKitUtility, parse_column_identifiers from csvkit.headers import make_default_headers class CSVCut(CSVKitUtility): description = 'Filter and truncate CSV files. Like unix "cut" command, but for tabular data.' def add_arguments(self): self.argparser.add_argument('-n', '--names', dest='names_only', action='store_true', help='Display column names and indices from the input CSV and exit.') self.argparser.add_argument('-c', '--columns', dest='columns', help='A comma separated list of column indices or names to be extracted. Defaults to all columns.') self.argparser.add_argument('-C', '--not-columns', dest='not_columns', help='A comma separated list of column indices or names to be excluded. 
Defaults to no columns.') self.argparser.add_argument('-x', '--delete-empty-rows', dest='delete_empty', action='store_true', help='After cutting, delete rows which are completely empty.') def main(self): if self.args.names_only: self.print_column_names() return rows = CSVKitReader(self.input_file, **self.reader_kwargs) if self.args.no_header_row: row = next(rows) column_names = make_default_headers(len(row)) # Put the row back on top rows = itertools.chain([row], rows) else: column_names = next(rows) column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based, self.args.not_columns) output = CSVKitWriter(self.output_file, **self.writer_kwargs) output.writerow([column_names[c] for c in column_ids]) for row in rows: out_row = [row[c] if c < len(row) else None for c in column_ids] if self.args.delete_empty: if ''.join(out_row) == '': continue output.writerow(out_row) def launch_new_instance(): utility = CSVCut() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvformat.py0000644000076600000240000000524612477331225021730 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from csvkit import CSVKitReader, CSVKitWriter from csvkit.cli import CSVKitUtility class CSVFormat(CSVKitUtility): description = 'Convert a CSV file to a custom output format.' override_flags = ['l', 'zero', 'H'] def add_arguments(self): self.argparser.add_argument('-D', '--out-delimiter', dest='out_delimiter', help='Delimiting character of the output CSV file.') self.argparser.add_argument('-T', '--out-tabs', dest='out_tabs', action='store_true', help='Specifies that the output CSV file is delimited with tabs. Overrides "-D".') self.argparser.add_argument('-Q', '--out-quotechar', dest='out_quotechar', help='Character used to quote strings in the output CSV file.') self.argparser.add_argument('-U', '--out-quoting', dest='out_quoting', type=int, choices=[0,1,2,3], help='Quoting style used in the output CSV file. 
0 = Quote Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 = Quote None.') self.argparser.add_argument('-B', '--out-doublequote', dest='out_doublequote', action='store_true', help='Whether or not double quotes are doubled in the output CSV file.') self.argparser.add_argument('-P', '--out-escapechar', dest='out_escapechar', help='Character used to escape the delimiter in the output CSV file if --quoting 3 ("Quote None") is specified and to escape the QUOTECHAR if --doublequote is not specified.') self.argparser.add_argument('-M', '--out-lineterminator', dest='out_lineterminator', help='Character used to terminate lines in the output CSV file.') def _extract_csv_writer_kwargs(self): kwargs = {} if self.args.out_tabs: kwargs['delimiter'] = '\t' elif self.args.out_delimiter: kwargs['delimiter'] = self.args.out_delimiter if self.args.out_quotechar: kwargs['quotechar'] = self.args.out_quotechar if self.args.out_quoting: kwargs['quoting'] = self.args.out_quoting if self.args.out_doublequote: kwargs['doublequote'] = self.args.out_doublequote if self.args.out_escapechar: kwargs['escapechar'] = self.args.out_escapechar if self.args.out_lineterminator: kwargs['lineterminator'] = self.args.out_lineterminator return kwargs def main(self): reader = CSVKitReader(self.input_file, **self.reader_kwargs) writer = CSVKitWriter(self.output_file, **self.writer_kwargs) writer.writerows(reader) def launch_new_instance(): utility = CSVFormat() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvgrep.py0000644000076600000240000000562312477331225021374 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import re import sys from argparse import FileType from csvkit import CSVKitReader, CSVKitWriter from csvkit.cli import CSVKitUtility, parse_column_identifiers from csvkit.grep import FilteringCSVReader class CSVGrep(CSVKitUtility): description = 'Search CSV files. Like the unix "grep" command, but for tabular data.' override_flags = ['H'] def add_arguments(self): self.argparser.add_argument('-n', '--names', dest='names_only', action='store_true', help='Display column names and indices from the input CSV and exit.') self.argparser.add_argument('-c', '--columns', dest='columns', help='A comma separated list of column indices or names to be searched.') self.argparser.add_argument('-m','--match', dest="pattern", action='store', help='The string to search for.') self.argparser.add_argument('-r', '--regex', dest='regex', action='store', help='If specified, must be followed by a regular expression which will be tested against the specified columns.') self.argparser.add_argument('-f', '--file', dest='matchfile', type=FileType('r'), action='store', help='If specified, must be the path to a file. 
For each tested row, if any line in the file (stripped of line separators) is an exact match for the cell value, the row will pass.') self.argparser.add_argument('-i', '--invert-match', dest='inverse', action='store_true', help='If specified, select non-matching instead of matching rows.') def main(self): if self.args.names_only: self.print_column_names() return if not self.args.columns: self.argparser.error('You must specify at least one column to search using the -c option.') if self.args.regex is None and self.args.pattern is None and self.args.matchfile is None: self.argparser.error('One of -r, -m or -f must be specified, unless using the -n option.') rows = CSVKitReader(self.input_file, **self.reader_kwargs) column_names = next(rows) column_ids = parse_column_identifiers(self.args.columns, column_names, self.args.zero_based) if self.args.regex: pattern = re.compile(self.args.regex) elif self.args.matchfile: lines = set(line.rstrip() for line in self.args.matchfile) pattern = lambda x: x in lines else: pattern = self.args.pattern patterns = dict((c, pattern) for c in column_ids) output = CSVKitWriter(self.output_file, **self.writer_kwargs) output.writerow(column_names) filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse) for row in filter_reader: output.writerow(row) def launch_new_instance(): utility = CSVGrep() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvjoin.py0000644000076600000240000001164612477331225021400 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from csvkit import CSVKitReader, CSVKitWriter from csvkit import join from csvkit.cli import CSVKitUtility, match_column_identifier class CSVJoin(CSVKitUtility): description = 'Execute a SQL-like join to merge CSV files on a specified column or columns.' epilog = 'Note that the join operation requires reading all files into memory. Don\'t try this on very large files.' override_flags = ['f', 'H'] def add_arguments(self): self.argparser.add_argument(metavar="FILE", nargs='*', dest='input_paths', default=['-'], help='The CSV files to operate on. If only one is specified, it will be copied to STDOUT.') self.argparser.add_argument('-c', '--columns', dest='columns', help='The column name(s) on which to join. Should be either one name (or index) or a comma-separated list with one name (or index) for each file, in the same order that the files were specified. May also be left unspecified, in which case the two files will be joined sequentially without performing any matching.') self.argparser.add_argument('--outer', dest='outer_join', action='store_true', help='Perform a full outer join, rather than the default inner join.') self.argparser.add_argument('--left', dest='left_join', action='store_true', help='Perform a left outer join, rather than the default inner join. If more than two files are provided this will be executed as a sequence of left outer joins, starting at the left.') self.argparser.add_argument('--right', dest='right_join', action='store_true', help='Perform a right outer join, rather than the default inner join. 
If more than two files are provided this will be executed as a sequence of right outer joins, starting at the right.') def main(self): self.input_files = [] for path in self.args.input_paths: self.input_files.append(self._open_input_file(path)) if len(self.input_files) < 2: self.argparser.error('You must specify at least two files to join.') if self.args.columns: join_column_names = self._parse_join_column_names(self.args.columns) if len(join_column_names) == 1: join_column_names = join_column_names * len(self.input_files) if len(join_column_names) != len(self.input_files): self.argparser.error('The number of join column names must match the number of files, or be a single column name that exists in all files.') if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns: self.argparser.error('You must provide join column names when performing an outer join.') if self.args.left_join and self.args.right_join: self.argparser.error('It is not valid to specify both a left and a right join.') tables = [] for f in self.input_files: tables.append(list(CSVKitReader(f, **self.reader_kwargs))) f.close() join_column_ids = [] if self.args.columns: for i, t in enumerate(tables): join_column_ids.append(match_column_identifier(t[0], join_column_names[i])) jointab = [] if self.args.left_join: # Left outer join jointab = tables[0] for i, t in enumerate(tables[1:]): jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1]) elif self.args.right_join: # Right outer join jointab = tables[-1] remaining_tables = tables[:-1] remaining_tables.reverse() for i, t in enumerate(remaining_tables): jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1]) elif self.args.outer_join: # Full outer join jointab = tables[0] for i, t in enumerate(tables[1:]): jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1]) else: if self.args.columns: # Inner join jointab = tables[0] for i, t in enumerate(tables[1:]): jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1]) else: jointab = tables[0] # Sequential join for t in tables[1:]: jointab = join.sequential_join(jointab, t) output = CSVKitWriter(self.output_file, **self.writer_kwargs) for row in jointab: output.writerow(row) def _parse_join_column_names(self, join_string): """ Parse a list of join columns. """ return list(map(str.strip, join_string.split(','))) def launch_new_instance(): utility = CSVJoin() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvjson.py0000644000076600000240000001560112477331225021405 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import codecs try: from collections import OrderedDict import json except ImportError: from ordereddict import OrderedDict import simplejson as json import six from csvkit import CSVKitReader from csvkit.cli import CSVKitUtility, match_column_identifier from csvkit.exceptions import NonUniqueKeyColumnException class CSVJSON(CSVKitUtility): description = 'Convert a CSV file into JSON (or GeoJSON).' override_flags = ['H'] def add_arguments(self): self.argparser.add_argument('-i', '--indent', dest='indent', type=int, default=None, help='Indent the output JSON this many spaces. Disabled by default.') self.argparser.add_argument('-k', '--key', dest='key', type=str, default=None, help='Output JSON as an array of objects keyed by a given column, KEY, rather than as a list. All values in the column must be unique. 
If --lat and --lon are also specified, this column will be used as GeoJSON Feature ID.') self.argparser.add_argument('--lat', dest='lat', type=str, default=None, help='A column index or name containing a latitude. Output will be GeoJSON instead of JSON. Only valid if --lon is also specified.') self.argparser.add_argument('--lon', dest='lon', type=str, default=None, help='A column index or name containing a longitude. Output will be GeoJSON instead of JSON. Only valid if --lat is also specified.') self.argparser.add_argument('--crs', dest='crs', type=str, default=None, help='A coordinate reference system string to be included with GeoJSON output. Only valid if --lat and --lon are also specified.') self.argparser.add_argument('--stream', dest='streamOutput', action='store_true', help='Output JSON as a stream of newline-separated objects, rather than an as an array.') def main(self): if six.PY2: stream = codecs.getwriter('utf-8')(self.output_file) else: stream = self.output_file json_kwargs = { 'ensure_ascii': False, 'indent': self.args.indent, } if six.PY2: json_kwargs['encoding'] = 'utf-8' def dump_json (data,newline=False): json.dump(data, stream, **json_kwargs) if newline: stream.write("\n") """ Convert CSV to JSON. """ if self.args.lat and not self.args.lon: self.argparser.error('--lon is required whenever --lat is specified.') if self.args.lon and not self.args.lat: self.argparser.error('--lat is required whenever --lon is specified.') if self.args.crs and not self.args.lat: self.argparser.error('--crs is only allowed when --lat and --lon are also specified.') if self.args.streamOutput and (self.args.lat or self.args.lon or self.args.key): self.argparser.error('--stream is only allowed if --lat, --lon and --key are not specified.') rows = CSVKitReader(self.input_file, **self.reader_kwargs) column_names = next(rows) # GeoJSON if self.args.lat and self.args.lon: features = [] min_lon = None min_lat = None max_lon = None max_lat = None lat_column = match_column_identifier(column_names, self.args.lat, self.args.zero_based) lon_column = match_column_identifier(column_names, self.args.lon, self.args.zero_based) if self.args.key: id_column = match_column_identifier(column_names, self.args.key, self.args.zero_based) else: id_column = None for row in rows: feature = OrderedDict() feature['type'] = 'Feature' properties = OrderedDict() geoid = None lat = None lon = None for i, c in enumerate(row): if i == lat_column: try: lat = float(c) except ValueError: lat = None if min_lat is None or lat < min_lat: min_lat = lat if max_lat is None or lat > max_lat: max_lat = lat elif i == lon_column: try: lon = float(c) except ValueError: lon = None if min_lon is None or lon < min_lon: min_lon = lon if max_lon is None or lon > max_lon: max_lon = lon elif id_column is not None and i == id_column: geoid = c else: properties[column_names[i]] = c if id_column is not None: feature['id'] = geoid feature['geometry'] = OrderedDict([ ('type', 'Point'), ('coordinates', [lon, lat]) ]) feature['properties'] = properties features.append(feature) output = OrderedDict([ ('type', 'FeatureCollection'), ('bbox', [min_lon, min_lat, max_lon, max_lat]), ('features', features) ]) if self.args.crs: output['crs'] = OrderedDict([ ('type', 'name'), ('properties', { 'name': self.args.crs }) ]) dump_json(output) # Keyed JSON elif self.args.key: output = OrderedDict() for row in rows: data = OrderedDict() for i, column in enumerate(column_names): data[column] = row[i] k = data[self.args.key] if k in output: raise 
NonUniqueKeyColumnException('Value %s is not unique in the key column.' % six.text_type(k)) output[k] = data dump_json(output) # Boring JSON else: output = [] for row in rows: data = OrderedDict() for i, column in enumerate(column_names): try: data[column] = row[i] except IndexError: data[column] = None if(self.args.streamOutput): dump_json(data,newline=True) else: output.append(data) if not self.args.streamOutput: dump_json(output) def launch_new_instance(): utility = CSVJSON() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvlook.py0000644000076600000240000000426112477331225021400 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import itertools import six from csvkit import CSVKitReader from csvkit.cli import CSVKitUtility from csvkit.headers import make_default_headers class CSVLook(CSVKitUtility): description = 'Render a CSV file in the console as a fixed-width table.' def add_arguments(self): pass def main(self): rows = CSVKitReader(self.input_file, **self.reader_kwargs) # Make a default header row if none exists if self.args.no_header_row: row = next(rows) column_names = make_default_headers(len(row)) # Put the row back on top rows = itertools.chain([row], rows) else: column_names = next(rows) column_names = list(column_names) # prepend 'line_number' column with line numbers if --linenumbers option if self.args.line_numbers: column_names.insert(0, 'line_number') rows = [list(itertools.chain([str(i + 1)], row)) for i, row in enumerate(rows)] # Convert to normal list of rows rows = list(rows) # Insert the column names at the top rows.insert(0, column_names) widths = [] for row in rows: for i, v in enumerate(row): try: if len(v) > widths[i]: widths[i] = len(v) except IndexError: widths.append(len(v)) # Dashes span each width with '+' character at intersection of # horizontal and vertical dividers. divider = '|--' + '-+-'.join('-'* w for w in widths) + '--|' self.output_file.write('%s\n' % divider) for i, row in enumerate(rows): output = [] for j, d in enumerate(row): if d is None: d = '' output.append(' %s ' % six.text_type(d).ljust(widths[j])) self.output_file.write('| %s |\n' % ('|'.join(output))) if (i == 0 or i == len(rows) - 1): self.output_file.write('%s\n' % divider) def launch_new_instance(): utility = CSVLook() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvpy.py0000644000076600000240000000256512477331225021071 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from csvkit import CSVKitReader, CSVKitDictReader from csvkit.cli import CSVKitUtility class CSVPy(CSVKitUtility): description = 'Load a CSV file into a CSVKitReader object and then drops into a Python shell.' override_flags = ['l', 'zero', 'H'] def add_arguments(self): self.argparser.add_argument('--dict', dest='as_dict', action='store_true', help='Use CSVKitDictReader instead of CSVKitReader.') def main(self): # Attempt reading filename, will cause lazy loader to access file and raise error if it does not exist filename = self.input_file.name if self.args.as_dict: reader_class = CSVKitDictReader else: reader_class = CSVKitReader reader = reader_class(self.input_file, **self.reader_kwargs) welcome_message = 'Welcome! "%s" has been loaded in a %s object named "reader".' 
% (filename, reader_class.__name__) try: from IPython.frontend.terminal.embed import InteractiveShellEmbed ipy = InteractiveShellEmbed(banner1=welcome_message) ipy() except ImportError: import code code.interact(welcome_message, local={ 'reader': reader }) def launch_new_instance(): utility = CSVPy() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvsort.py0000644000076600000240000000453512477331225021427 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import os from csvkit import CSVKitWriter from csvkit import table from csvkit.cli import CSVKitUtility, parse_column_identifiers class CSVSort(CSVKitUtility): description = 'Sort CSV files. Like unix "sort" command, but for tabular data.' def add_arguments(self): self.argparser.add_argument('-y', '--snifflimit', dest='snifflimit', type=int, help='Limit CSV dialect sniffing to the specified number of bytes. Specify "0" to disable sniffing entirely.') self.argparser.add_argument('-n', '--names', dest='names_only', action='store_true', help='Display column names and indices from the input CSV and exit.') self.argparser.add_argument('-c', '--columns', dest='columns', help='A comma separated list of column indices or names to sort by. Defaults to all columns.') self.argparser.add_argument('-r', '--reverse', dest='reverse', action='store_true', help='Sort in descending order.') self.argparser.add_argument('--no-inference', dest='no_inference', action='store_true', help='Disable type inference when parsing the input.') def main(self): if self.args.names_only: self.print_column_names() return if self.input_file.name != '': # Use filename as table name table_name = os.path.splitext(os.path.split(self.input_file.name)[1])[0] else: table_name = 'csvsql_table' tab = table.Table.from_csv( self.input_file, name=table_name, snifflimit=self.args.snifflimit, no_header_row=self.args.no_header_row, infer_types=(not self.args.no_inference), **self.reader_kwargs ) column_ids = parse_column_identifiers(self.args.columns, tab.headers(), self.args.zero_based) rows = tab.to_rows(serialize_dates=True) sorter = lambda r: [(r[c] is not None, r[c]) for c in column_ids] rows.sort(key=sorter, reverse=self.args.reverse) rows.insert(0, tab.headers()) output = CSVKitWriter(self.output_file, **self.writer_kwargs) for row in rows: output.writerow(row) def launch_new_instance(): utility = CSVSort() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvsql.py0000644000076600000240000001624512477331225021240 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import os import sys from csvkit import sql from csvkit import table from csvkit import CSVKitWriter from csvkit.cli import CSVKitUtility class CSVSQL(CSVKitUtility): description = 'Generate SQL statements for one or more CSV files, create execute those statements directly on a database, and execute one or more SQL queries.' override_flags = ['l', 'f'] def add_arguments(self): self.argparser.add_argument(metavar="FILE", nargs='*', dest='input_paths', default=['-'], help='The CSV file(s) to operate on. If omitted, will accept input on STDIN.') self.argparser.add_argument('-y', '--snifflimit', dest='snifflimit', type=int, help='Limit CSV dialect sniffing to the specified number of bytes. Specify "0" to disable sniffing entirely.') self.argparser.add_argument('-i', '--dialect', dest='dialect', choices=sql.DIALECTS, help='Dialect of SQL to generate. 
Only valid when --db is not specified.') self.argparser.add_argument('--db', dest='connection_string', help='If present, a sqlalchemy connection string to use to directly execute generated SQL on a database.') self.argparser.add_argument('--query', default=None, help='Execute one or more SQL queries delimited by ";" and output the result of the last query as CSV.') self.argparser.add_argument('--insert', dest='insert', action='store_true', help='In addition to creating the table, also insert the data into the table. Only valid when --db is specified.') self.argparser.add_argument('--tables', dest='table_names', help='Specify one or more names for the tables to be created. If omitted, the filename (minus extension) or "stdin" will be used.') self.argparser.add_argument('--no-constraints', dest='no_constraints', action='store_true', help='Generate a schema without length limits or null checks. Useful when sampling big tables.') self.argparser.add_argument('--no-create', dest='no_create', action='store_true', help='Skip creating a table. Only valid when --insert is specified.') self.argparser.add_argument('--blanks', dest='blanks', action='store_true', help='Do not coerce empty strings to NULL values.') self.argparser.add_argument('--no-inference', dest='no_inference', action='store_true', help='Disable type inference when parsing the input.') self.argparser.add_argument('--db-schema', dest='db_schema', help='Optional name of database schema to create table(s) in.') def main(self): connection_string = self.args.connection_string do_insert = self.args.insert query = self.args.query self.input_files = [] for path in self.args.input_paths: self.input_files.append(self._open_input_file(path)) if self.args.table_names: table_names = self.args.table_names.split(',') else: table_names = [] # If one or more filenames are specified, we need to add stdin ourselves (if available) if sys.stdin not in self.input_files: try: if not sys.stdin.isatty(): self.input_files.insert(0, sys.stdin) except: pass # Create an SQLite database in memory if no connection string is specified if query and not connection_string: connection_string = "sqlite:///:memory:" do_insert = True if self.args.dialect and connection_string: self.argparser.error('The --dialect option is only valid when --db is not specified.') if do_insert and not connection_string: self.argparser.error('The --insert option is only valid when --db is also specified.') if self.args.no_create and not do_insert: self.argparser.error('The --no-create option is only valid --insert is also specified.') # Establish database validity before reading CSV files if connection_string: try: engine, metadata = sql.get_connection(connection_string) except ImportError: raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use. 
Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n') conn = engine.connect() trans = conn.begin() for f in self.input_files: try: # Try to use name specified via --table table_name = table_names.pop(0) except IndexError: if f == sys.stdin: table_name = "stdin" else: # Use filename as table name table_name = os.path.splitext(os.path.split(f.name)[1])[0] csv_table = table.Table.from_csv( f, name=table_name, snifflimit=self.args.snifflimit, blanks_as_nulls=(not self.args.blanks), infer_types=(not self.args.no_inference), no_header_row=self.args.no_header_row, **self.reader_kwargs ) f.close() if connection_string: sql_table = sql.make_table( csv_table, table_name, self.args.no_constraints, self.args.db_schema, metadata ) # Create table if not self.args.no_create: sql_table.create() # Insert data if do_insert and csv_table.count_rows() > 0: insert = sql_table.insert() headers = csv_table.headers() conn.execute(insert, [dict(zip(headers, row)) for row in csv_table.to_rows()]) # Output SQL statements else: sql_table = sql.make_table(csv_table, table_name, self.args.no_constraints) self.output_file.write('%s\n' % sql.make_create_table_statement(sql_table, dialect=self.args.dialect)) if connection_string: if query: # Execute specified SQL queries queries = query.split(';') rows = None for q in queries: if q: rows = conn.execute(q) # Output result of last query as CSV try: output = CSVKitWriter(self.output_file, **self.writer_kwargs) if not self.args.no_header_row: output.writerow(rows._metadata.keys) for row in rows: output.writerow(row) except AttributeError: pass trans.commit() conn.close() def launch_new_instance(): utility = CSVSQL() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvstack.py0000644000076600000240000000626612477331225021550 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import os from csvkit import CSVKitReader, CSVKitWriter from csvkit.cli import CSVKitUtility from csvkit.headers import make_default_headers class CSVStack(CSVKitUtility): description = 'Stack up the rows from multiple CSV files, optionally adding a grouping value.' override_flags = ['f'] def add_arguments(self): self.argparser.add_argument(metavar="FILE", nargs='+', dest='input_paths', default=['-'], help='The CSV file(s) to operate on. If omitted, will accept input on STDIN.') self.argparser.add_argument('-g', '--groups', dest='groups', help='A comma-seperated list of values to add as "grouping factors", one for each CSV being stacked. These will be added to the stacked CSV as a new column. You may specify a name for the grouping column using the -n flag.') self.argparser.add_argument('-n', '--group-name', dest='group_name', help='A name for the grouping column, e.g. "year". Only used when also specifying -g.') self.argparser.add_argument('--filenames', dest='group_by_filenames', action='store_true', help='Use the filename of each input file as its grouping value. 
When specified, -g will be ignored.') def main(self): self.input_files = [] for path in self.args.input_paths: self.input_files.append(self._open_input_file(path)) if len(self.input_files) < 2: self.argparser.error('You must specify at least two files to stack.') if self.args.group_by_filenames: groups = [os.path.split(f.name)[1] for f in self.input_files] elif self.args.groups: groups = self.args.groups.split(',') if len(groups) != len(self.input_files): self.argparser.error('The number of grouping values must be equal to the number of CSV files being stacked.') else: groups = None group_name = self.args.group_name if self.args.group_name else 'group' output = CSVKitWriter(self.output_file, **self.writer_kwargs) for i, f in enumerate(self.input_files): rows = CSVKitReader(f, **self.reader_kwargs) # If we have header rows, use them if not self.args.no_header_row: headers = next(rows, []) if i == 0: if groups: headers.insert(0, group_name) output.writerow(headers) # If we don't generate simple column names based on first row else: row = next(rows, []) headers = make_default_headers(len(row)) if i == 0: if groups: headers.insert(0, group_name) output.writerow(headers) if groups: row.insert(0, groups[i]) output.writerow(row) for row in rows: if groups: row.insert(0, groups[i]) output.writerow(row) f.close() def launch_new_instance(): utility = CSVStack() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/csvstat.py0000644000076600000240000002074112477331225021410 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import datetime from heapq import nlargest from operator import itemgetter import math import six from csvkit import CSVKitReader, table from csvkit.cli import CSVKitUtility NoneType = type(None) MAX_UNIQUE = 5 MAX_FREQ = 5 OPERATIONS =('min', 'max', 'sum', 'mean', 'median', 'stdev', 'nulls', 'unique', 'freq', 'len') class CSVStat(CSVKitUtility): description = 'Print descriptive statistics for each column in a CSV file.' override_flags = ['l'] def add_arguments(self): self.argparser.add_argument('-y', '--snifflimit', dest='snifflimit', type=int, help='Limit CSV dialect sniffing to the specified number of bytes. Specify "0" to disable sniffing entirely.') self.argparser.add_argument('-c', '--columns', dest='columns', help='A comma separated list of column indices or names to be examined. 
Defaults to all columns.') self.argparser.add_argument('--max', dest='max_only', action='store_true', help='Only output max.') self.argparser.add_argument('--min', dest='min_only', action='store_true', help='Only output min.') self.argparser.add_argument('--sum', dest='sum_only', action='store_true', help='Only output sum.') self.argparser.add_argument('--mean', dest='mean_only', action='store_true', help='Only output mean.') self.argparser.add_argument('--median', dest='median_only', action='store_true', help='Only output median.') self.argparser.add_argument('--stdev', dest='stdev_only', action='store_true', help='Only output standard deviation.') self.argparser.add_argument('--nulls', dest='nulls_only', action='store_true', help='Only output whether column contains nulls.') self.argparser.add_argument('--unique', dest='unique_only', action='store_true', help='Only output unique values.') self.argparser.add_argument('--freq', dest='freq_only', action='store_true', help='Only output frequent values.') self.argparser.add_argument('--len', dest='len_only', action='store_true', help='Only output max value length.') self.argparser.add_argument('--count', dest='count_only', action='store_true', help='Only output row count') def main(self): operations = [op for op in OPERATIONS if getattr(self.args, op + '_only')] if len(operations) > 1: self.argparser.error('Only one statistic argument may be specified (mean, median, etc).') if operations and self.args.count_only: self.argparser.error('You may not specify --count and a statistical argument at the same time.') if self.args.count_only: count = len(list(CSVKitReader(self.input_file))) if not self.args.no_header_row: count -= 1 self.output_file.write('Row count: %i\n' % count) return tab = table.Table.from_csv( self.input_file, snifflimit=self.args.snifflimit, column_ids=self.args.columns, zero_based=self.args.zero_based, no_header_row=self.args.no_header_row, **self.reader_kwargs ) for c in tab: values = sorted(filter(lambda i: i is not None, c)) stats = {} # Output a single stat if len(operations) == 1: op = operations[0] stat = getattr(self, 'get_%s' % op)(c, values, {}) # Formatting if op == 'unique': stat = len(stat) elif op == 'freq': stat = ', '.join([('"%s": %s' % (six.text_type(k), count)) for k, count in stat]) stat = '{ %s }' % stat if len(tab) == 1: self.output_file.write(six.text_type(stat)) else: self.output_file.write('%3i. %s: %s\n' % (c.order + 1, c.name, stat)) # Output all stats else: for op in OPERATIONS: stats[op] = getattr(self, 'get_%s' % op)(c, values, stats) self.output_file.write(('%3i. 
%s\n' % (c.order + 1, c.name))) if c.type == None: self.output_file.write('\tEmpty column\n') continue self.output_file.write('\t%s\n' % c.type) self.output_file.write('\tNulls: %s\n' % stats['nulls']) if len(stats['unique']) <= MAX_UNIQUE and c.type is not bool: uniques = [six.text_type(u) for u in list(stats['unique'])] data = u'\tValues: %s\n' % ', '.join(uniques) self.output_file.write(data) else: if c.type not in [six.text_type, bool]: self.output_file.write('\tMin: %s\n' % stats['min']) self.output_file.write('\tMax: %s\n' % stats['max']) if c.type in [int, float]: self.output_file.write('\tSum: %s\n' % stats['sum']) self.output_file.write('\tMean: %s\n' % stats['mean']) self.output_file.write('\tMedian: %s\n' % stats['median']) self.output_file.write('\tStandard Deviation: %s\n' % stats['stdev']) self.output_file.write('\tUnique values: %i\n' % len(stats['unique'])) if len(stats['unique']) != len(values): self.output_file.write('\t%i most frequent values:\n' % MAX_FREQ) for value, count in stats['freq']: self.output_file.write(('\t\t%s:\t%s\n' % (six.text_type(value), count))) if c.type == six.text_type: self.output_file.write('\tMax length: %i\n' % stats['len']) if not operations: self.output_file.write('\n') self.output_file.write('Row count: %s\n' % tab.count_rows()) def get_min(self, c, values, stats): if c.type == NoneType: return None v = min(values) if v in [datetime.datetime, datetime.date, datetime.time]: return v.isoformat() return v def get_max(self, c, values, stats): if c.type == NoneType: return None v = max(values) if v in [datetime.datetime, datetime.date, datetime.time]: return v.isoformat() return v def get_sum(self, c, values, stats): if c.type not in [int, float]: return None return sum(values) def get_mean(self, c, values, stats): if c.type not in [int, float]: return None if 'sum' not in stats: stats['sum'] = self.get_sum(c, values, stats) return float(stats['sum']) / len(values) def get_median(self, c, values, stats): if c.type not in [int, float]: return None return median(values) def get_stdev(self, c, values, stats): if c.type not in [int, float]: return None if 'mean' not in stats: stats['mean'] = self.get_mean(c, values, stats) return math.sqrt(sum(math.pow(v - stats['mean'], 2) for v in values) / len(values)) def get_nulls(self, c, values, stats): return c.has_nulls() def get_unique(self, c, values, stats): return set(values) def get_freq(self, c, values, stats): return freq(values) def get_len(self, c, values, stats): if c.type != six.text_type: return None return c.max_length() def median(l): """ Compute the median of a list. """ length = len(l) if length % 2 == 1: return l[(length + 1) // 2 - 1] else: a = l[(length // 2) - 1] b = l[length // 2] return (float(a + b)) / 2 def freq(l, n=MAX_FREQ): """ Count the number of times each value occurs in a column. """ count = {} for x in l: s = six.text_type(x) if s in count: count[s] += 1 else: count[s] = 1 # This will iterate through dictionary, return N highest # values as (key, value) tuples. top = nlargest(n, six.iteritems(count), itemgetter(1)) return top def launch_new_instance(): utility = CSVStat() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/in2csv.py0000644000076600000240000000672112477331225021127 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from csvkit import convert from csvkit.cli import CSVKitUtility class In2CSV(CSVKitUtility): description = 'Convert common, but less awesome, tabular data formats to CSV.' 
epilog='Some command line flags only pertain to specific input formats.' override_flags = ['f'] def add_arguments(self): self.argparser.add_argument(metavar="FILE", nargs='?', dest='input_path', help='The CSV file to operate on. If omitted, will accept input on STDIN.') self.argparser.add_argument('-f', '--format', dest='filetype', help='The format of the input file. If not specified will be inferred from the file type. Supported formats: %s.' % ', '.join(sorted(convert.SUPPORTED_FORMATS))) self.argparser.add_argument('-s', '--schema', dest='schema', help='Specifies a CSV-formatted schema file for converting fixed-width files. See documentation for details.') self.argparser.add_argument('-k', '--key', dest='key', help='Specifies a top-level key to use look within for a list of objects to be converted when processing JSON.') self.argparser.add_argument('-y', '--snifflimit', dest='snifflimit', type=int, help='Limit CSV dialect sniffing to the specified number of bytes. Specify "0" to disable sniffing entirely.') self.argparser.add_argument('--sheet', dest='sheet', help='The name of the XLSX sheet to operate on.') self.argparser.add_argument('--no-inference', dest='no_inference', action='store_true', help='Disable type inference when parsing the input.') def main(self): if self.args.filetype: filetype = self.args.filetype if filetype not in convert.SUPPORTED_FORMATS: self.argparser.error('"%s" is not a supported format' % self.args.filetype) elif self.args.schema: filetype = 'fixed' elif self.args.key: filetype = 'json' else: if not self.args.input_path or self.args.input_path == '-': self.argparser.error('You must specify a format when providing data via STDIN (pipe).') filetype = convert.guess_format(self.args.input_path) if not filetype: self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.') if filetype in ('xls', 'xlsx'): self.input_file = open(self.args.input_path, 'rb') else: self.input_file = self._open_input_file(self.args.input_path) kwargs = self.reader_kwargs if self.args.schema: kwargs['schema'] = self._open_input_file(self.args.schema) if self.args.key: kwargs['key'] = self.args.key if self.args.snifflimit: kwargs['snifflimit'] = self.args.snifflimit if self.args.sheet: kwargs['sheet'] = self.args.sheet if self.args.no_inference: kwargs['type_inference'] = False if filetype == 'csv' and self.args.no_header_row: kwargs['no_header_row'] = True # Fixed width can be processed as a stream if filetype == 'fixed': kwargs['output'] = self.output_file data = convert.convert(self.input_file, filetype, **kwargs) self.output_file.write(data) def launch_new_instance(): utility = In2CSV() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit/utilities/sql2csv.py0000644000076600000240000000476112477331225021322 0ustar onyxfishstaff00000000000000#!/usr/bin/env python from argparse import FileType import sys from csvkit import CSVKitWriter from csvkit import sql from csvkit.cli import CSVKitUtility class SQL2CSV(CSVKitUtility): description = 'Execute an SQL query on a database and output the result to a CSV file.' override_flags = 'f,b,d,e,H,p,q,S,t,u,z,zero'.split(',') def add_arguments(self): self.argparser.add_argument('--db', dest='connection_string', default='sqlite://', help='An sqlalchemy connection string to connect to a database.',) self.argparser.add_argument('file', metavar="FILE", nargs='?', type=FileType('rt'), default=sys.stdin, help='The file to use as SQL query. 
If both FILE and QUERY are omitted, query will be read from STDIN.') self.argparser.add_argument('--query', default=None, help="The SQL query to execute. If specified, it overrides FILE and STDIN.") self.argparser.add_argument('-H', '--no-header-row', dest='no_header_row', action='store_true', help='Do not output column names.') self.argparser.set_defaults( delimiter=None, doublequote=None, escapechar=None, encoding='utf-8', maxfieldsize=None, quotechar=None, quoting=None, skipinitialspace=None, tabs=None, ) def main(self): try: engine, metadata = sql.get_connection(self.args.connection_string) except ImportError: raise ImportError('You don\'t appear to have the necessary database backend installed for connection string you\'re trying to use.. Available backends include:\n\nPostgresql:\tpip install psycopg2\nMySQL:\t\tpip install MySQL-python\n\nFor details on connection strings and other backends, please see the SQLAlchemy documentation on dialects at: \n\nhttp://www.sqlalchemy.org/docs/dialects/\n\n') conn = engine.connect() if self.args.query: query = self.args.query.strip() else: query = "" for line in self.args.file: query += line rows = conn.execute(query) output = CSVKitWriter(self.output_file, **self.writer_kwargs) if not self.args.no_header_row: output.writerow(rows._metadata.keys) for row in rows: output.writerow(row) conn.close() def launch_new_instance(): utility = SQL2CSV() utility.main() if __name__ == "__main__": launch_new_instance() csvkit-0.9.1/csvkit.egg-info/0000755000076600000240000000000012506400503017007 5ustar onyxfishstaff00000000000000csvkit-0.9.1/csvkit.egg-info/dependency_links.txt0000644000076600000240000000000112506400503023055 0ustar onyxfishstaff00000000000000 csvkit-0.9.1/csvkit.egg-info/entry_points.txt0000644000076600000240000000142312506400503022305 0ustar onyxfishstaff00000000000000[console_scripts] csvclean = csvkit.utilities.csvclean:launch_new_instance csvcut = csvkit.utilities.csvcut:launch_new_instance csvformat = csvkit.utilities.csvformat:launch_new_instance csvgrep = csvkit.utilities.csvgrep:launch_new_instance csvjoin = csvkit.utilities.csvjoin:launch_new_instance csvjson = csvkit.utilities.csvjson:launch_new_instance csvlook = csvkit.utilities.csvlook:launch_new_instance csvpy = csvkit.utilities.csvpy:launch_new_instance csvsort = csvkit.utilities.csvsort:launch_new_instance csvsql = csvkit.utilities.csvsql:launch_new_instance csvstack = csvkit.utilities.csvstack:launch_new_instance csvstat = csvkit.utilities.csvstat:launch_new_instance in2csv = csvkit.utilities.in2csv:launch_new_instance sql2csv = csvkit.utilities.sql2csv:launch_new_instance csvkit-0.9.1/csvkit.egg-info/PKG-INFO0000644000076600000240000000336512506400503020113 0ustar onyxfishstaff00000000000000Metadata-Version: 1.1 Name: csvkit Version: 0.9.1 Summary: A library of utilities for working with CSV, the king of tabular file formats. Home-page: http://csvkit.rtfd.org/ Author: Christopher Groskopf Author-email: staringmonkey@gmail.com License: MIT Description: csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats. It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe. 
Important links: * Repository: https://github.com/onyxfish/csvkit * Issues: https://github.com/onyxfish/csvkit/issues * Documentation: http://csvkit.rtfd.org/ * Schemas: https://github.com/onyxfish/ffs * Buildbot: https://travis-ci.org/onyxfish/csvkit Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: End Users/Desktop Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: MIT License Classifier: Natural Language :: English Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Utilities csvkit-0.9.1/csvkit.egg-info/requires.txt0000644000076600000240000000013712506400503021410 0ustar onyxfishstaff00000000000000xlrd>=0.7.1 sqlalchemy>=0.6.6 openpyxl==2.2.0-b1 six>=1.6.1 python-dateutil==2.2 dbf==0.94.003 csvkit-0.9.1/csvkit.egg-info/SOURCES.txt0000644000076600000240000000205612506400503020676 0ustar onyxfishstaff00000000000000README setup.py csvkit/__init__.py csvkit/cleanup.py csvkit/cli.py csvkit/exceptions.py csvkit/grep.py csvkit/headers.py csvkit/join.py csvkit/py2.py csvkit/py3.py csvkit/sniffer.py csvkit/sql.py csvkit/table.py csvkit/typeinference.py csvkit/unicsv.py csvkit.egg-info/PKG-INFO csvkit.egg-info/SOURCES.txt csvkit.egg-info/dependency_links.txt csvkit.egg-info/entry_points.txt csvkit.egg-info/requires.txt csvkit.egg-info/top_level.txt csvkit/convert/__init__.py csvkit/convert/csvitself.py csvkit/convert/dbase.py csvkit/convert/fixed.py csvkit/convert/geojs.py csvkit/convert/js.py csvkit/convert/ndjs.py csvkit/convert/xls.py csvkit/convert/xlsx.py csvkit/utilities/__init__.py csvkit/utilities/csvclean.py csvkit/utilities/csvcut.py csvkit/utilities/csvformat.py csvkit/utilities/csvgrep.py csvkit/utilities/csvjoin.py csvkit/utilities/csvjson.py csvkit/utilities/csvlook.py csvkit/utilities/csvpy.py csvkit/utilities/csvsort.py csvkit/utilities/csvsql.py csvkit/utilities/csvstack.py csvkit/utilities/csvstat.py csvkit/utilities/in2csv.py csvkit/utilities/sql2csv.pycsvkit-0.9.1/csvkit.egg-info/top_level.txt0000644000076600000240000000000712506400503021536 0ustar onyxfishstaff00000000000000csvkit csvkit-0.9.1/PKG-INFO0000644000076600000240000000336512506400503015116 0ustar onyxfishstaff00000000000000Metadata-Version: 1.1 Name: csvkit Version: 0.9.1 Summary: A library of utilities for working with CSV, the king of tabular file formats. Home-page: http://csvkit.rtfd.org/ Author: Christopher Groskopf Author-email: staringmonkey@gmail.com License: MIT Description: csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats. It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe. 
Important links: * Repository: https://github.com/onyxfish/csvkit * Issues: https://github.com/onyxfish/csvkit/issues * Documentation: http://csvkit.rtfd.org/ * Schemas: https://github.com/onyxfish/ffs * Buildbot: https://travis-ci.org/onyxfish/csvkit Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: Intended Audience :: End Users/Desktop Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: MIT License Classifier: Natural Language :: English Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Utilities csvkit-0.9.1/README0000644000076600000240000000073612477331225014713 0ustar onyxfishstaff00000000000000csvkit is a suite of utilities for converting to and working with CSV, the king of tabular file formats. It is inspired by pdftk, gdal and the original csvcut utility by Joe Germuska and Aaron Bycoffe. Important links: * Repository: https://github.com/onyxfish/csvkit * Issues: https://github.com/onyxfish/csvkit/issues * Documentation: http://csvkit.rtfd.org/ * Schemas: https://github.com/onyxfish/ffs * Buildbot: https://travis-ci.org/onyxfish/csvkit csvkit-0.9.1/setup.cfg0000644000076600000240000000007312506400503015633 0ustar onyxfishstaff00000000000000[egg_info] tag_build = tag_date = 0 tag_svn_revision = 0 csvkit-0.9.1/setup.py0000644000076600000240000000551112506371533015540 0ustar onyxfishstaff00000000000000#!/usr/bin/env python import sys from setuptools import setup install_requires = [ 'xlrd>=0.7.1', 'sqlalchemy>=0.6.6', 'openpyxl==2.2.0-b1', 'six>=1.6.1', 'python-dateutil==2.2' ] if sys.version_info < (2, 7): install_requires.append('argparse>=1.2.1') install_requires.append('ordereddict>=1.1') install_requires.append('simplejson>=3.6.3') if sys.version_info[0] == 2: install_requires.append('dbf==0.94.003') setup( name='csvkit', version='0.9.1', description='A library of utilities for working with CSV, the king of tabular file formats.', long_description=open('README').read(), author='Christopher Groskopf', author_email='staringmonkey@gmail.com', url='http://csvkit.rtfd.org/', license='MIT', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'Intended Audience :: Developers', 'Intended Audience :: End Users/Desktop', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Utilities' ], packages=[ 'csvkit', 'csvkit.convert', 
'csvkit.utilities' ], entry_points ={ 'console_scripts': [ 'csvclean = csvkit.utilities.csvclean:launch_new_instance', 'csvcut = csvkit.utilities.csvcut:launch_new_instance', 'csvformat = csvkit.utilities.csvformat:launch_new_instance', 'csvgrep = csvkit.utilities.csvgrep:launch_new_instance', 'csvjoin = csvkit.utilities.csvjoin:launch_new_instance', 'csvjson = csvkit.utilities.csvjson:launch_new_instance', 'csvlook = csvkit.utilities.csvlook:launch_new_instance', 'csvpy = csvkit.utilities.csvpy:launch_new_instance', 'csvsort = csvkit.utilities.csvsort:launch_new_instance', 'csvsql = csvkit.utilities.csvsql:launch_new_instance', 'csvstack = csvkit.utilities.csvstack:launch_new_instance', 'csvstat = csvkit.utilities.csvstat:launch_new_instance', 'in2csv = csvkit.utilities.in2csv:launch_new_instance', 'sql2csv = csvkit.utilities.sql2csv:launch_new_instance' ] }, install_requires = install_requires )
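
# Illustrative usage of the console scripts registered above (a sketch only;
# the database URL, query, and file names are hypothetical and not taken from
# the package's documentation):
#
#   in2csv survey.xlsx | csvstat                                    # convert a spreadsheet, then summarize each column
#   sql2csv --db sqlite:///example.db --query "SELECT * FROM data"  # dump a query result as CSV on stdout
#
# Most of these utilities accept a FILE argument or read from STDIN and write
# CSV to STDOUT, so they compose naturally with ordinary shell pipes.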