pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/0000755000000000000000000000000013035554276017164 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tox.ini0000644000000000000000000000062713035554276020504 0ustar rootroot# Tox (http://tox.testrun.org/) is a tool for running tests # in multiple virtualenvs. This configuration file will run the # test suite on all supported python versions. To use it, "pip install tox" # and then run "tox" from this directory. # Adding numpy for not great reasons. [tox] envlist = py27 [testenv] commands = nosetests -s --verbose --logging-config log_nose.cfg deps = requests nose pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pacbio-manifest.json0000644000000000000000000000045413035554276023123 0ustar rootroot[ { "id": "pbcommand", "name": "pbcommand", "_comment": "Need to add the first 6 chars of the GH SHA", "version": "0.4.5-FIXME", "description": "Python Common library for ToolContract Interface, Report, DataStore and SMRT Link Service Client", "dependencies": [ ] } ] pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/0000755000000000000000000000000013035554276021124 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/pb_io/0000755000000000000000000000000013035554276022214 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/pb_io/conditions.py0000644000000000000000000000236613035554276024746 0ustar rootrootimport json import os from pbcommand.models import ReseqConditions, ReseqCondition def _resolve_conditions(cs, path): """ :type cs: ReseqConditions :rtype: ReseqConditions """ def _resolve_if(p): if os.path.isabs(p): return p else: return os.path.join(path, p) rconditions = [] for c in cs.conditions: s = _resolve_if(c.subreadset) a = _resolve_if(c.alignmentset) r = _resolve_if(c.referenceset) rc = ReseqCondition(c.cond_id, s, a, r) rconditions.append(rc) return cs._replace(conditions=rconditions) def load_reseq_conditions_from(json_file_or_dict): """ Load resequencing conditions from JSON file or str :param json_file_or_dict: :rtype: ReseqConditions """ # refactor that common usage from TC io if isinstance(json_file_or_dict, dict): d = json_file_or_dict else: with open(json_file_or_dict, 'r') as f: d = json.loads(f.read()) cs = ReseqConditions.from_dict(d) # Resolve if isinstance(json_file_or_dict, basestring): dir_name = os.path.dirname(os.path.abspath(json_file_or_dict)) return _resolve_conditions(cs, dir_name) else: return cs pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/pb_io/__init__.py0000644000000000000000000000132613035554276024327 0ustar rootrootfrom .report import load_report_from_json, load_report_spec_from_json from .tool_contract_io import (load_tool_contract_from, load_resolved_tool_contract_from, load_pipeline_presets_from, write_resolved_tool_contract, write_tool_contract, write_resolved_tool_contract_avro, write_tool_contract_avro) from .common import (load_pipeline_chunks_from_json, write_pipeline_chunks, load_pipeline_datastore_view_rules_from_json, pacbio_option_from_dict) from .conditions import load_reseq_conditions_from pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/pb_io/common.py0000644000000000000000000001136313035554276024062 0ustar rootrootimport logging import json import sys import warnings from pbcommand.models import (PipelineChunk, PipelineDataStoreViewRules, TaskOptionTypes, PacBioFloatChoiceOption, PacBioStringChoiceOption, PacBioIntChoiceOption, PacBioStringOption, PacBioFloatOption, PacBioBooleanOption, 
PacBioIntOption) from pbcommand.schemas import validate_datastore_view_rules log = logging.getLogger(__name__) def write_pipeline_chunks(chunks, output_json_file, comment): _d = dict(nchunks=len(chunks), _version="0.1.0", chunks=[c.to_dict() for c in chunks]) if comment is not None: _d['_comment'] = comment with open(output_json_file, 'w') as f: f.write(json.dumps(_d, indent=4, separators=(',', ': '))) log.debug("Write {n} chunks to {o}".format(n=len(chunks), o=output_json_file)) def load_pipeline_chunks_from_json(path): """Returns a list of Pipeline Chunks :rtype: list[PipelineChunk] """ try: with open(path, 'r') as f: d = json.loads(f.read()) chunks = [] for cs in d['chunks']: chunk_id = cs['chunk_id'] chunk_datum = cs['chunk'] c = PipelineChunk(chunk_id, **chunk_datum) chunks.append(c) return chunks except Exception: msg = "Unable to load pipeline chunks from {f}".format(f=path) sys.stderr.write(msg + "\n") raise def load_pipeline_datastore_view_rules_from_json(path): """Load pipeline presets from dict""" with open(path, 'r') as f: d = json.loads(f.read()) validate_datastore_view_rules(d) return PipelineDataStoreViewRules.from_dict(d) def _pacbio_choice_option_from_dict(d): """ Factory/dispatch method for returning a PacBio Choice Option Type :rtype: PacBioOption """ choices = d['choices'] default_value = d['default'] # this will immediately raise option_type_id = TaskOptionTypes.from_choice_str(d['optionTypeId']) opt_id = d['id'] name = d['name'] desc = d['description'] klass_map = {TaskOptionTypes.CHOICE_STR: PacBioStringChoiceOption, TaskOptionTypes.CHOICE_FLOAT: PacBioFloatChoiceOption, TaskOptionTypes.CHOICE_INT: PacBioIntChoiceOption} k = klass_map[option_type_id] # Sanitize Unicode hack if k is PacBioStringChoiceOption: default_value = default_value.encode('ascii', 'ignore') choices = [i.encode('ascii', 'ignore') for i in choices] opt = k(opt_id, name, default_value, desc, choices) return opt def __simple_option_by_type(option_id, name, default, description, option_type_id): option_type = TaskOptionTypes.from_simple_str(option_type_id) klass_map = {TaskOptionTypes.INT: PacBioIntOption, TaskOptionTypes.FLOAT: PacBioFloatOption, TaskOptionTypes.STR: PacBioStringOption, TaskOptionTypes.BOOL: PacBioBooleanOption} k = klass_map[option_type] # This requires a hack for the unicode to ascii for string option type. if k is PacBioStringOption: # sanitize unicode default = default.encode('ascii', 'ignore') opt = k(option_id, name, default, description) return opt def _pacbio_legacy_option_from_dict(d): """ Load the legacy (jsonschema-ish format) Note, choice types are not supported here. 
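    A minimal example of the legacy shape consumed here (key names follow the
    accessors below; the values are illustrative only)::

        {"pb_option": {"option_id": "ns.task_options.alpha",
                       "name": "Alpha",
                       "default": 25,
                       "description": "Alpha description",
                       "type": "integer"}}

    The legacy "number" type string is coerced to "float" below.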
:rtype: PacBioOption """ warnings.warn("This is obsolete and will disappear soon", DeprecationWarning) opt_id = d['pb_option']['option_id'] name = d['pb_option']['name'] default = d['pb_option']['default'] desc = d['pb_option']['description'] option_type_id = d['pb_option']['type'].encode('ascii') # Hack to support "number" if option_type_id == "number": option_type_id = "float" return __simple_option_by_type(opt_id, name, default, desc, option_type_id) def _pacbio_option_from_dict(d): if "pb_option" in d: return _pacbio_legacy_option_from_dict(d) else: return __simple_option_by_type(d['id'], d['name'], d['default'], d['description'], d['optionTypeId']) def pacbio_option_from_dict(d): """Fundamental API for loading any PacBioOption type from a dict """ # This should probably be pushed into pbcommand/pb_io/* for consistency # Extensions are supported by adding a dispatch method by looking for required # key(s) in the dict. if "choices" in d and d.get('choices') is not None: # the None check is for the TCs that are non-choice based models, but # were written with "choices" key return _pacbio_choice_option_from_dict(d) else: return _pacbio_option_from_dict(d) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/pb_io/report.py0000644000000000000000000000727713035554276024116 0ustar rootroot"""Loading a report from JSON This manual marshalling/de-marshalling is not awesome. """ import json import logging import uuid as U from pbcommand.models.report import (Report, Plot, PlotGroup, Attribute, Table, Column, ReportSpec) from pbcommand.schemas import validate_report, validate_report_spec log = logging.getLogger(__name__) __all__ = ["load_report_from_json"] def _to_id(s): if '.' in s: return s.split('.')[-1] else: return s def _to_plot(d): id_ = _to_id(d['id']) caption = d.get('caption', None) image = d['image'] thumbnail = d.get('thumbnail', None) title = d.get('title', None) p = Plot(id_, image, caption=caption, thumbnail=thumbnail, title=title) return p def _to_plot_group(d): id_ = _to_id(d['id']) legend = d.get('legend', None) thumbnail = d.get('thumbnail', None) # is this optional? title = d.get('title', None) if 'plots' in d: plots = [_to_plot(pd) for pd in d['plots']] else: plots = [] return PlotGroup(id_, title=title, legend=legend, plots=plots, thumbnail=thumbnail) def _to_attribute(d): id_ = _to_id(d['id']) name = d.get('name', None) # this can't be none value = d['value'] return Attribute(id_, value, name=name) def _to_column(d): id_ = _to_id(d['id']) header = d.get('header', None) values = d.get('values', []) return Column(id_, header=header, values=values) def _to_table(d): id_ = _to_id(d['id']) title = d.get('title', None) columns = [] for column_d in d.get('columns', []): c = _to_column(column_d) columns.append(c) # assert that all the columns have the same number of values nvalues = {len(c.values) for c in columns} assert len(nvalues) == 1 return Table(id_, title=title, columns=columns) def dict_to_report(dct): # FIXME. Add support for different version schemas in a cleaner, more # concrete manner. 
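    # A minimal sketch of the dict handled below (only "id" is required; the
    # other keys are optional and the values shown are illustrative only):
    #   {"id": "mapping_stats", "uuid": "...", "title": "...",
    #    "attributes": [...], "plotGroups": [...], "tables": [...],
    #    "dataset_uuids": [...]}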
report_id = dct['id'] # Make this optional for now report_uuid = dct.get('uuid', str(U.uuid4())) # Make sure the UUID is well formed _ = U.UUID(report_uuid) # Legacy Reports > 0.3.9 will not have the title key title = dct.get('title', "Report {i}".format(i=report_id)) plot_groups = [] if 'plotGroups' in dct: pg = dct['plotGroups'] if pg: plot_groups = [_to_plot_group(d) for d in pg] attributes = [] for r_attr in dct.get('attributes', []): attr = _to_attribute(r_attr) attributes.append(attr) tables = [] for table_d in dct.get('tables', []): t = _to_table(table_d) tables.append(t) report = Report(report_id, title=title, plotgroups=plot_groups, tables=tables, attributes=attributes, dataset_uuids=dct.get('dataset_uuids', ()), uuid=report_uuid) return report def load_report_from_json(json_file): """Convert a report json file to Report instance.""" with open(json_file, 'r') as f: d = json.loads(f.read()) r = dict_to_report(d) return r def _to_report(nfiles, attribute_id, report_id): # this should have version of the bax/bas files, chemistry attributes = [Attribute(attribute_id, nfiles)] return Report(report_id, attributes=attributes) def fofn_to_report(nfofns): return _to_report(nfofns, "nfofns", "fofn_report") def load_report_spec_from_json(json_file, validate=True): with open(json_file, 'r') as f: d = json.loads(f.read()) if validate: validate_report_spec(d) return ReportSpec.from_dict(d) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/pb_io/tool_contract_io.py0000644000000000000000000003274013035554276026135 0ustar rootroot"""IO Layer for creating models from files""" import json import logging from avro.datafile import DataFileWriter from avro.io import DatumWriter import pbcommand from pbcommand.schemas import RTC_SCHEMA, TC_SCHEMA, validate_presets from pbcommand.models import (TaskTypes, GatherToolContractTask, ScatterToolContractTask, MalformedToolContractError, MalformedResolvedToolContractError, validate_tool_contract) from pbcommand.pb_io.common import pacbio_option_from_dict from pbcommand.models.tool_contract import (ToolDriver, ToolContractTask, ToolContract, ResolvedToolContractTask, ResolvedToolContract, InputFileType, OutputFileType, ResolvedScatteredToolContractTask, ResolvedGatherToolContractTask, ToolContractResolvedResource, PipelinePreset) log = logging.getLogger(__name__) __all__ = ['load_resolved_tool_contract_from', 'load_tool_contract_from', 'load_pipeline_presets_from', 'write_tool_contract', 'write_resolved_tool_contract'] class Constants(object): TOOL_ID = "tool_contract_id" TOOL = "tool_contract" TOOL_TYPE = "task_type" IS_DIST = 'is_distributed' # Serialization Format SERIALIZATION = 'serialization' # Scatter TC, mirrors the nproc key in the JSON NCHUNKS = "nchunks" RTOOL = "resolved_tool_contract" # Used in Scattering/Chunking tasks to # produce chunks with specific $chunk_keys CHUNK_KEYS = "chunk_keys" MAX_NCHUNKS = 'max_nchunks' # Used in Gather Tasks GATHER_CHUNK_KEY = 'chunk_key' def load_or_raise(ex_type): def loader_wrap(func): def _wrapper(path): msg = "Failed to load {p}".format(p=path) try: return func(path) except Exception as e: msg = msg + " {e} {m}".format(m=e.message, e=e) log.error(msg, exc_info=True) raise ex_type(msg) return _wrapper return loader_wrap def __driver_from_d(d): driver_exe = d['driver']['exe'] driver_env = d['driver'].get('env', {}) serialization = d['driver'].get(Constants.SERIALIZATION, 'json') return ToolDriver(driver_exe, env=driver_env, serialization=serialization) def __core_resolved_tool_contract_task_from_d(d): 
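    # Extracts the task fields shared by all resolved tool contract flavours
    # from the raw dict and returns them as a tuple:
    #   (tool_contract_id, is_distributed, input_files, output_files,
    #    tool_options, nproc, resource_types, log_level)
    # The standard/scatter/gather loaders below unpack this tuple and add
    # their type-specific fields (chunk_keys, max_nchunks, chunk_key).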
def _to_a(x): return x.encode('ascii', 'ignore') def _get(attr_name): return d[Constants.RTOOL][attr_name] def _get_or(attr_name, default_value): return d[Constants.RTOOL].get(attr_name, default_value) def _get_ascii(x_): return _to_a(_get(x_)) tool_contract_id = _get_ascii(Constants.TOOL_ID) tool_type = _get_ascii(Constants.TOOL_TYPE) is_distributed = _get(Constants.IS_DIST) # list of strings input_files = [_to_a(x) for x in _get("input_files")] # list of strings output_files = [_to_a(x) for x in _get("output_files")] tool_options = _get("options") # int nproc = _get("nproc") # allow for backward compatibility log_level = _get_or("log_level", "INFO") resource_types = [ToolContractResolvedResource.from_d(dx) for dx in _get("resources")] return tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, log_level def __to_rtc_from_d(d): def _wrapper(task): driver = __driver_from_d(d) rtc = ResolvedToolContract(task, driver) return rtc return _wrapper def _standard_resolved_tool_contract_from_d(d): """Load a 'Standard' CLI task type""" tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, log_level = __core_resolved_tool_contract_task_from_d(d) task = ResolvedToolContractTask(tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, log_level) return __to_rtc_from_d(d)(task) def _scatter_resolved_tool_contract_from_d(d): """Load a Gathered Tool Contract """ tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, log_level = __core_resolved_tool_contract_task_from_d(d) max_nchunks = d[Constants.RTOOL][Constants.MAX_NCHUNKS] chunk_keys = d[Constants.RTOOL][Constants.CHUNK_KEYS] task = ResolvedScatteredToolContractTask(tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, max_nchunks, chunk_keys, log_level=log_level) return __to_rtc_from_d(d)(task) def _gather_resolved_tool_contract_from_d(d): tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, log_level = __core_resolved_tool_contract_task_from_d(d) chunk_key = d[Constants.RTOOL][Constants.GATHER_CHUNK_KEY] task = ResolvedGatherToolContractTask(tool_contract_id, is_distributed, input_files, output_files, tool_options, nproc, resource_types, chunk_key, log_level=log_level) return __to_rtc_from_d(d)(task) def resolved_tool_contract_from_d(d): """Convert a dict to Resolved Tool Contract""" def _to_a(x): return x.encode('ascii', 'ignore') def _get(attr_name): return d[Constants.RTOOL][attr_name] def _get_ascii(x_): return _to_a(_get(x_)) tool_type = _get_ascii(Constants.TOOL_TYPE) dispatch_funcs = {TaskTypes.STANDARD: _standard_resolved_tool_contract_from_d, TaskTypes.GATHERED: _gather_resolved_tool_contract_from_d, TaskTypes.SCATTERED: _scatter_resolved_tool_contract_from_d} if tool_type in dispatch_funcs: return dispatch_funcs[tool_type](d) else: raise ValueError("Unsupported task type '{x}' Supported task types {t}".format(x=tool_type, t=dispatch_funcs.keys())) def json_path_or_d(value): if isinstance(value, dict): return value elif isinstance(value, basestring): with open(value, 'r') as f: d = json.loads(f.read()) return d else: raise ValueError("Unsupported value. 
Expected dict, or string") def _json_path_or_d(func): def _wrapper(value): return func(json_path_or_d(value)) return _wrapper @load_or_raise(MalformedResolvedToolContractError) @_json_path_or_d def load_resolved_tool_contract_from(path_or_d): return resolved_tool_contract_from_d(path_or_d) @_json_path_or_d def __core_tool_contract_task_from(d): if Constants.TOOL not in d: raise MalformedResolvedToolContractError("Unable to find root key {k}. Keys {a}".format(k=Constants.TOOL, a=d.keys())) def _to_a(x_): return x_.encode('ascii', 'ignore') def _get(x_): # Get a Subkey within if x_ not in d[Constants.TOOL]: raise MalformedToolContractError("Unable to find subkey '{x}' within key '{i}'".format(x=x_, i=Constants.TOOL)) return d[Constants.TOOL][x_] def _get_or(x_, default): return d[Constants.TOOL].get(x_, default) def _get_ascii(x_): return _to_a(_get(x_)) def _get_ascii_or(x_, default): return _to_a(_get_or(x_, default)) def _to_in_ft(fd): fx = lambda s: _to_a(fd[s]) return InputFileType(fx("file_type_id"), fx("id"), fx("title"), fx("description")) def _to_out_ft(fd): fx = lambda s: _to_a(fd[s]) return OutputFileType(fx("file_type_id"), fx("id"), fx("title"), fx("description"), fx("default_name")) task_id = _to_a(d[Constants.TOOL_ID]) display_name = _get_ascii("name") version = _to_a(d["version"]) default_desc = "PacBio Tool {n}".format(n=display_name) description = _get_ascii_or("description", default_desc) is_distributed = _get(Constants.IS_DIST) input_types = [_to_in_ft(x) for x in _get("input_types")] output_types = [_to_out_ft(x) for x in _get("output_types")] tool_options = [pacbio_option_from_dict(opt_d) for opt_d in _get("schema_options")] nproc = _get("nproc") resource_types = _get("resource_types") return task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types def __to_tc_from_d(d): def _wrapper(task): driver = __driver_from_d(d) schema_version = d.get("schema_version", "UNKNOWN") tc = ToolContract(task, driver, schema_version) return tc return _wrapper @_json_path_or_d def _standard_tool_contract_from(path_or_d): task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types = __core_tool_contract_task_from(path_or_d) task = ToolContractTask(task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types) return __to_tc_from_d(path_or_d)(task) @_json_path_or_d def _scattered_tool_contract_from(path_or_d): task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types = __core_tool_contract_task_from(path_or_d) chunk_keys = path_or_d[Constants.TOOL][Constants.CHUNK_KEYS] # int, or SymbolTypes.MAX_NCHUNKS nchunks = path_or_d[Constants.TOOL][Constants.NCHUNKS] task = ScatterToolContractTask(task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types, chunk_keys, nchunks) return __to_tc_from_d(path_or_d)(task) @_json_path_or_d def _gather_tool_contract_from(path_or_d): task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types = __core_tool_contract_task_from(path_or_d) task = GatherToolContractTask(task_id, display_name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resource_types) return __to_tc_from_d(path_or_d)(task) @_json_path_or_d def tool_contract_from_d(d): """Load 
tool contract from dict""" if Constants.TOOL not in d: raise KeyError("Tool Contract must have {k}".format(k=Constants.TOOL)) if Constants.TOOL_ID not in d[Constants.TOOL]: raise KeyError("Tool Contract must have {k}.{v}".format(k=Constants.TOOL, v=Constants.TOOL_ID)) task_type = d[Constants.TOOL][Constants.TOOL_TYPE] dispatch_funcs = {TaskTypes.SCATTERED: _scattered_tool_contract_from, TaskTypes.GATHERED: _gather_tool_contract_from, TaskTypes.STANDARD: _standard_tool_contract_from} if task_type in dispatch_funcs: tc = dispatch_funcs[task_type](d) return validate_tool_contract(tc) else: raise ValueError("Unsupported task type {x}".format(x=task_type)) @load_or_raise(MalformedToolContractError) @_json_path_or_d def load_tool_contract_from(path_or_d): return tool_contract_from_d(path_or_d) # XXX this could probably be more robust @_json_path_or_d def load_pipeline_presets_from(d): """Load pipeline presets from dict""" validate_presets(d) presets = PipelinePreset( options=d['options'], task_options=d['taskOptions'], pipeline_id=d['pipelineId'], preset_id=d['presetId'], name=d['name'], description=d['description']) return presets def _write_json(s, output_file): with open(output_file, 'w') as f: f.write(json.dumps(s, indent=4, sort_keys=True, separators=(',', ': '))) return s def write_tool_contract(tool_contract, output_json_file): """ Write a Tool Contract :type tool_contract: ToolContract :param output_json_file: :return: """ return _write_json(tool_contract.to_dict(), output_json_file) def write_resolved_tool_contract(rtc, output_json_file): """ :param rtc: :type rtc: ResolvedToolContract :param output_json_file: :return: """ d = rtc.to_dict() return _write_json(d, output_json_file) def _write_records_to_avro(schema, _d_or_ds, output_file): # FIXME. There's only one record being written here, # why does this not support a single item if isinstance(_d_or_ds, dict): _d_or_ds = [_d_or_ds] with open(output_file, 'w') as outs: with DataFileWriter(outs, DatumWriter(), schema) as writer: for record in _d_or_ds: writer.append(record) log.debug("Write avro file to {p}".format(p=output_file)) return _d_or_ds def write_tool_contract_avro(tc, avro_output): return _write_records_to_avro(TC_SCHEMA, tc.to_dict(), avro_output) def write_resolved_tool_contract_avro(rtc, avro_output): return _write_records_to_avro(RTC_SCHEMA, rtc.to_dict(), avro_output) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/0000755000000000000000000000000013035554276022547 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/pipeline_presets.avsc0000644000000000000000000000243013035554276026776 0ustar rootroot{ "namespace": "com.pacbio.common.models.pipeline_presets", "type": "record", "name": "PipelinePreset", "doc": "Pipeline Preset with custom task options", "fields": [ { "name": "pipelineId", "type": "string", "doc": "Fully qualified pipeline ID, must only have [A-Z][0-9]_." }, { "name": "presetId", "type": "string", "doc": "Fully qualified ID of the pipeline preset, must only have [A-Z][0-9]_." }, { "name": "name", "type": "string", "doc": "Plain-English name of the task option as it will appear in UI" }, { "name": "description", "type": "string", "doc": "More detailed description of the task option as it will appear in UI" }, { "doc": "Workflow level options. See the pbsmrtpipe docs for details", "name": "options", "type": { "type": "map", "values": ["long", "boolean", "string", "int", "double"] } }, { "doc": "Task level options. 
Please see the pipeline of interest to get a list of available task options using `pbsmrtpipe show-template-details `", "name": "taskOptions", "type": { "type": "map", "values": ["long", "boolean", "string", "int", "double"] } } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/datastore_view_rules.avsc0000644000000000000000000000356113035554276027664 0ustar rootroot{ "namespace": "com.pacbio.common.models.datastore", "type": "record", "name": "PipelineDataStoreViewRules", "doc": "Custom view of the DataStoreFile(s) emitted from a specific pipeline (by id)", "fields": [ { "name": "pipelineId", "type": "string", "doc": "Fully qualified pipeline id to apply rules to. e.g., pbsmrtpipe.pipelines.dev_01" }, { "name": "smrtlinkVersion", "type": "string", "doc": "Version of SMRTLink to which these rules apply. e.g., '3.2'" }, { "name": "rules", "type": { "type": "array", "items": { "type": "record", "name": "DataStoreViewRule", "doc": "Custom View of specific DataStoreFile by source id in the datastore.json", "fields": [ { "name": "sourceId", "type": "string", "doc": "Source ID as it appears in the pbsmrtpipe datastore, Should have the form {task-id}-{in|out}-{positional-index}" }, { "name": "fileTypeId", "type": "string", "doc": "File type identifier, e.g. PacBio.FileTypes.JsonReport" }, { "name": "isHidden", "type": "boolean", "doc": "Specifies that a file should not appear in the UI" }, { "name": "name", "type": ["string", "null"], "doc": "Override the display name (optional). If this is null, the default name of the datastore file will be used in UI" }, { "name": "description", "type": ["string", "null"], "doc": "Override the display description (optional). If this is null, the default description of the datastore file will be used in UI" } ] } } } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/report_spec.avsc0000644000000000000000000001625613035554276025764 0ustar rootroot{ "namespace": "com.pacbio.common.models.reports", "type": "record", "name": "ReportSpec", "doc": "Specification and view rules for a single PacBio report.", "fields": [ { "name": "id", "type": "string", "doc": "Pbreports style id, must only have [A-z][0-9]_" }, { "name": "version", "type": "string", "doc": "Version of the report corresponding to this spec" }, { "name": "title", "type": "string", "doc": "Report display name" }, { "name": "description", "type": ["string", "null"], "default": null, "doc": "Plain-English description of this report, suitable for documentation" }, { "name": "isHidden", "type": ["boolean", "null"], "default": null, "doc": "Flag to hide the entire report" }, { "name": "attributes", "type": { "type": "array", "items": { "type": "record", "name": "AttributeSpec", "fields": [ { "name": "id", "doc": "Report attribute ID", "type": "string" }, { "name": "name", "type": "string", "doc": "Report attribute display name" }, { "name": "description", "type": ["string", "null"], "default": null, "doc": "Plain-English description of the attribute's meaning" }, { "name": "type", "type": "string", "doc": "Expected type of the attribute value" }, { "name": "format", "type": ["string", "null"], "default": null, "doc": "Format string to apply to the value in UI" }, { "name": "isHidden", "type": ["boolean", "null"], "default": null, "doc": "Flag to hide this attribute" } ] } } }, { "name": "tables", "type": { "type": "array", "items": { "type": "record", "name": "TableSpec", "fields": [ { "doc": "Report table Id", "name": "id", "type": "string" }, { "name": "title", 
"type": "string", "doc": "Display name of the Table" }, { "name": "description", "type": ["string", "null"], "default": null, "doc": "Plain-English description of the table" }, { "name": "isHidden", "type": ["boolean", "null"], "default": null, "doc": "Flag to hide this table" }, { "name": "columns", "doc": "List of Columns", "type": { "type": "array", "items": { "type": "record", "name": "TableColumnSpec", "fields": [ { "doc": "Unique id of column (must be report id format style)", "name": "id", "type": "string" }, { "name": "header", "type": "string", "doc": "Display name of Column" }, { "name": "description", "type": ["string", "null"], "default": null, "doc": "Plain-English description of column" }, { "name": "type", "type": "string", "doc": "Expected type of column values" }, { "name": "format", "type": ["string", "null"], "default": null, "doc": "Format string to apply to values in the UI" }, { "name": "isHidden", "type": ["boolean", "null"], "default": null, "doc": "Flag to hide this column" } ] } } } ] } } }, { "name": "plotGroups", "type": { "type": "array", "items": { "type": "record", "name": "PlotGroupSpec", "fields": [ { "name": "id", "type": "string", "doc": "Plot group ID" }, { "name": "title", "type": "string", "doc": "Plot group title" }, { "name": "legend", "type": ["string", "null"], "doc": "Not clear what the use case of this is", "default": null }, { "name": "description", "type": ["string", "null"], "default": null, "doc": "Plain-English description" }, { "doc": "List of Plots", "name": "plots", "type": { "type": "array", "items": { "doc": "PacBio Report Plot", "type": "record", "name": "PlotSpec", "fields": [ { "name": "id", "type": "string", "doc": "Plot Id" }, { "name": "title", "type": ["string", "null"], "doc": "Display Name of Plot" }, { "name": "caption", "doc": "Caption of the Plot", "type": ["string", "null"], "default": null }, { "name": "description", "type": ["string", "null"], "doc": "Plain-English description", "default": null }, { "name": "xlabel", "type": ["string", "null"], "default": null, "doc": "X-axis label (optional)" }, { "name": "ylabel", "type": ["string", "null"], "default": null, "doc": "Y-axis label (optional)" }, { "name": "isHidden", "type": ["boolean", "null"], "default": null, "doc": "Flag to hide this plot" } ] } } } ] } } } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/datastore.avsc0000644000000000000000000000563413035554276025423 0ustar rootroot{ "namespace": "com.pacbio.common.models", "type": "record", "name": "PacBioDataStore", "doc": "Container for datastore files emitted from a 'Job'type (e.g., analysis, import-dataset, merge-dataset)", "fields": [ { "name": "createdAt", "type": "string", "doc": "ISO8601 Datetime to specific when the datastore file was created. Example 2016-08-18T07:40:43" }, { "name": "updatedAt", "type": "string", "doc": "ISO8601 Datetime to specific when the datastore file was last updated at. 
Example 2016-08-18T07:40:43" }, { "name": "version", "type": "string", "doc": "Datastore schema version " }, { "doc": "List of DataStore files in the datastore", "name": "files", "type": { "type": "array", "items": { "type": "record", "name": "DataStoreFile", "doc": "DataStore file that contains metadata of a single output file", "fields": [ { "name": "sourceId", "type": "string", "doc": "Source ID unique identifer, must have the form {task-id}-{in|out}-{positional-index} Example `pbsmrtpipe.tasks.dev_hello_worlder-out-0`" }, { "name": "fileTypeId", "type": "string", "doc": "File type identifier. Example `PacBio.FileTypes.JsonReport`" }, { "name": "createdAt", "type": "string", "doc": "ISO8601 Datetime to specific when the file was created Example 2016-08-18T07:40:43" }, { "name": "modifiedAt", "type": "string", "doc": "ISO8601 Datetime to specific when the file was last modified at Example 2016-08-18T07:40:43" }, { "name": "path", "type": "string", "doc": "Absolute path to the file. Example /path/to/my-file.gff" }, { "name": "fileSize", "type": "string", "doc": "File size in kB" }, { "default": false, "name": "isChunked", "type": "boolean", "doc": "Is the file an intermediate file used in a chunked pipeline" }, { "name": "uniqueId", "type": "string", "doc": "Globally unique UUID of the datastore file. Example feddd711-9b37-4cc4-ac5a-4dd4134ad0ca" }, { "name": "name", "type": "string", "doc": "The default name of the datastore file will be used in the UI" }, { "name": "description", "type": "string", "doc": "the default description of the datastore file will be used in the UI" } ] } } } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/tool_contract.avsc0000644000000000000000000001704713035554276026310 0ustar rootroot{ "namespace": "com.pacbio.common.models.contracts", "type": "record", "name": "ToolContract", "fields": [ { "doc": "Version of the ToolContract", "name": "version", "type": "string" }, { "doc": "Schema Version of the ToolContract", "name": "schema_version", "type": ["string", "null"] }, { "_comment": "(this is duplicated in the Task?)", "doc": "Fully qualified id of the tool contract (in the legacy model this is also the task type id)", "name": "tool_contract_id", "type": "string" }, { "name": "tool_contract", "type": { "type": "record", "name": "ToolContractTask", "doc": "Task for defining metadata of the task interface such as Input and Output file types, task options and other metadata", "fields": [ { "name": "input_types", "type": { "type": "array", "items": { "type": "record", "name": "ToolInputFile", "fields": [ { "doc": "Id of input file", "name": "id", "type": "string" }, { "doc": "PacBio File Type identifier, PacBio.DataSet.SubreadSet", "name": "file_type_id", "type": "string" }, { "doc": "Display Name of input file type", "name": "title", "type": "string" }, { "doc": "Description of input file type", "name": "description", "type": "string" } ] } } }, { "name": "output_types", "doc": "Output file types of Task", "type": { "type": "array", "items": { "type": "record", "name": "ToolOutputFile", "doc": "", "fields": [ { "_comment": "FIXME(mpkocher) This needs to be clearly defined", "doc": "Unique id for referencing the output file", "name": "id", "type": "string" }, { "doc": "PacBio FileType identifier, e.g., PacBio.DataSets.SubreadSet", "name": "file_type_id", "type": "string" }, { "doc": "Display Name of the output file name", "name": "title", "type": "string" }, { "doc": "Default base name of the file name. 
This must be provided without the extension. The extension is determined by the `file_type_id`", "name": "default_name", "type": "string" }, { "doc": "Description of Output file", "name": "description", "type": "string" } ] } } }, { "name": "schema_options", "type": { "type": "array", "items": { "type": "record", "name": "PacBioOptions", "fields": [ { "doc": "Default value for the task option. Every task *must* have default value", "name": "default", "type": [ "int", "string", "boolean", "float" ] }, { "doc": "Globally unique id of the form {namespace}.task_options.{key}. Example (pbtacos.task_options.max_records)", "name": "id", "type": "string" }, { "doc": "Display name of task option", "name": "name", "type": "string" }, { "doc": "Description of Task Option", "name": "description", "type": "string" }, { "doc": "PacBio task option type", "name": "optionTypeId", "type": { "doc": "This needs to be considerably improved and clarified. The option type must be consistent with the value defined. The naming is using camelcase because the same data model is used in the pipeline template.", "type": "enum", "name": "PacBioOptionType", "aliases": ["com.pacbio.common.models.contracts.PacBioOptionType"], "symbols": [ "integer", "boolean", "string", "float", "choice_float", "choice_string", "choice_integer" ] } } ] } } }, { "doc": "Description of Tool/Task", "name": "description", "type": "string" }, { "doc": "Display Name of Tool/Task", "name": "name", "type": "string" }, { "_comment": "FIXME(mpkocher) This can be given as a Symbol `$max_nproc`", "doc": "Number of processors to use. This can be given as a Symbol `$max_nproc` See pbsmrtpipe docs for more details", "name": "nproc", "type": "int" }, { "doc": "Globally unique Tool Contract identifier", "name": "tool_contract_id", "type": "string" }, { "doc": "Task class type, Standard, Scatter, Gather", "name": "task_type", "type": "string", "default": "pbsmrtpipe.task_types.standard" }, { "doc": "Determine if the task will be submitted to the cluster resources", "name": "is_distributed", "type": "boolean" }, { "doc": "This needs to be converted to an ENUM. Allowed values $tmpfile $tmpdir", "name": "resource_types", "type": { "type": "array", "items": { "type": "string" } } } ] } }, { "name": "driver", "type": { "type": "record", "name": "ToolDriver", "fields": [ { "doc": "path to exe. The first arg will the the resolved tool contract JSON", "name": "exe", "type": "string" }, { "doc": "Serialization type. Either 'json' or 'avro' binary format ", "name": "serialization", "type": "string" } ] } } ] }././@LongLink0000644000000000000000000000014700000000000011605 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/pipeline_template_view_rules.avscpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/pipeline_template_view_rules.av0000644000000000000000000000261013035554276031042 0ustar rootroot{ "namespace": "com.pacbio.common.models.pipeline", "type": "record", "name": "PipelineTemplateView", "doc": "Custom views of a Resolved Pipeline Template and task options", "fields": [ { "name": "id", "type": "string", "doc": "Fully qualified pipeline ID, must only have [A-Z][0-9]_." 
}, { "name": "name", "type": "string", "doc": "Plain-English name of the pipeline as it will appear in UI" }, { "name": "description", "type": "string", "doc": "More detailed description of the pipeline as it will appear in UI" }, { "name": "taskOptions", "type": { "type": "array", "items": { "type": "record", "name": "TaskOptionViewRule", "fields": [ { "name": "id", "type": "string", "doc": "Source ID as it appears in the pbsmrtpipe datastore, Should have the from {task-id}-{in|out}-{positional-index}" }, { "name": "hidden", "type": "boolean", "doc": "Specifies that a field should not appear in the UI" }, { "name": "advanced", "type": "boolean", "doc": "Specifies that a field should only appear in the advanced settings window" } ] } } } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/resolved_tool_contract.avsc0000644000000000000000000000606213035554276030206 0ustar rootroot{ "namespace": "com.pacbio.common.models.contracts", "type": "record", "name": "ResolvedToolContract", "doc": "Resolved `ToolContract` used to run tasks in pipelines", "fields": [ { "name": "resolved_tool_contract", "doc": "Container for Resolved Tool Contract metadata", "type": { "doc": "Resolved Tool Contract Task metadata, such as nproc to use, resolved paths to input and output files and resolved Task Option values", "type": "record", "name": "ResolvedToolContractTask", "fields": [ { "doc": "Resolved paths to input files (Paths must be absolute)", "name": "input_files", "type": { "type": "array", "items": { "type": "string" } } }, { "doc": "Resolved paths to output files (Paths must be absolute)", "name": "output_files", "type": { "type": "array", "items": { "type": "string" } } }, { "doc": "Resolved Task Options", "name": "options", "type": { "type": "map", "values": ["long", "boolean", "string", "int", "float"] } }, { "doc": "Number of Processors to use", "name": "nproc", "type": "int" }, { "doc": "Determine if the task should be submitted to the distributed computing env (e.g, SGE). Only applies if the system is configured to support distributed computing ", "name": "is_distributed", "type": "boolean" }, { "doc": "Task type", "name": "task_type", "type": "string" }, { "doc": "Globally unique id to reference a Tool Contract. This is sometimes referred to as the `task` id (for historical purposes)", "name": "tool_contract_id", "type": "string" }, { "_comment": "FIXME(mkocher) This needs to be well defined", "doc": "Log level to emit to. Supports the standard INFO, DEBUG, ERROR values", "name": "log_level", "type": "string" }, { "name": "resources", "type": { "type": "array", "items": { "type": "string" } } } ] } }, { "doc": "Driver executable to be called to execute the task", "name": "driver", "type": { "type": "record", "name": "Driver", "fields": [ { "doc": "Executable to be called. The first positional argument will be the path to the `ResolvedToolContract` Example `python -m mytool.module run-tool-contract ` or `my-exe `. 
The exe must be in $PATH before running the task.", "name": "exe", "type": "string" } ] } } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/__init__.py0000644000000000000000000000426413035554276024666 0ustar rootrootimport os import functools import avro.schema from avro.io import validate SCHEMA_REGISTRY = {} __all__ = ['validate_pbreport', 'validate_tc', 'validate_rtc', 'validate_datastore_view_rules', 'SCHEMA_REGISTRY'] def _load_schema(idx, name): d = os.path.dirname(__file__) schema_path = os.path.join(d, name) with open(schema_path, 'r') as f: schema = avro.schema.parse(f.read()) SCHEMA_REGISTRY[idx] = schema return schema RTC_SCHEMA = _load_schema("resolved_tool_contract", "resolved_tool_contract.avsc") PBREPORT_SCHEMA = _load_schema("pbreport", "pbreport.avsc") TC_SCHEMA = _load_schema("tool_contract", "tool_contract.avsc") PRESET_SCHEMA = _load_schema("pipeline_presets", "pipeline_presets.avsc") DS_VIEW_SCHEMA = _load_schema("datastore_view_rules", "datastore_view_rules.avsc") REPORT_SPEC_SCHEMA = _load_schema("report_spec", "report_spec.avsc") def _validate(schema, msg, d): """Validate a python dict against a avro schema""" # FIXME(mkocher)(2016-7-16) Add a better error message than "Invalid" if not validate(schema, d): raise IOError("Invalid {m} ".format(m=msg)) return True def _is_valid(schema, d): return validate(schema, d) validate_rtc = functools.partial(_validate, RTC_SCHEMA, "Resolved Tool Contract Model") validate_pbreport = functools.partial(_validate, PBREPORT_SCHEMA, "Report Model") validate_report = validate_pbreport validate_tc = functools.partial(_validate, TC_SCHEMA, "Tool Contract Model") validate_presets = functools.partial(_validate, PRESET_SCHEMA, "Pipeline Presets Model") validate_datastore_view_rules = functools.partial(_validate, DS_VIEW_SCHEMA, "Pipeline DataStore View Rules") validate_report_spec = functools.partial(_validate, REPORT_SPEC_SCHEMA, "Report Specification Model") is_valid_rtc = functools.partial(_is_valid, RTC_SCHEMA) is_valid_report = functools.partial(_is_valid, PBREPORT_SCHEMA) is_valid_tc = functools.partial(_is_valid, TC_SCHEMA) is_valid_presets = functools.partial(_is_valid, PRESET_SCHEMA) is_valid_datastore_view_rules = functools.partial(_is_valid, DS_VIEW_SCHEMA) is_valid_report_spec = functools.partial(_is_valid, REPORT_SPEC_SCHEMA) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/schemas/pbreport.avsc0000644000000000000000000001300413035554276025260 0ustar rootroot{ "namespace": "com.pacbio.common.models.reports", "type": "record", "name": "Report", "doc": "PacBio Report data model. 
Except where specified all `id` values must be of the form [A-z][0-9]_ (e.g., `mapping_stats`)", "fields": [ { "name": "id", "type": "string", "doc": "Pbreports style id, must only have [A-z][0-9]_" }, { "name": "version", "type": "string", "doc": "Version of the Report Schema Spec" }, { "name": "uuid", "type": "string", "doc": "Report UUID" }, { "name": "title", "type": ["string", "null"], "default": null, "doc": "Report Display name" }, { "name": "attributes", "type": { "type": "array", "items": { "type": "record", "name": "ReportAttribute", "fields": [ { "doc": "Report Attribute id", "name": "id", "type": "string" }, { "doc": "Report Attribute display name", "name": "name", "type": "string" }, { "doc": "Report Attribute value", "name": "value", "type": [ "string", "int", "float" ] } ] } } }, { "name": "plotGroups", "type": { "type": "array", "items": { "type": "record", "name": "PlotGroup", "fields": [ { "doc": "Plot group Id", "name": "id", "type": "string" }, { "doc": "The display name of plot group", "name": "title", "type": "string" }, { "name": "legend", "type": [ "string", "null" ], "doc": "Not clear what the usecase is of this" }, { "doc": "Thumbnail image path for the entire PlotGroup", "name": "thumbnail", "type": [ "string", "null" ] }, { "doc": "List of Plots", "name": "plots", "type": { "type": "array", "items": { "doc": "PacBio Report Plot", "type": "record", "name": "ReportPlot", "fields": [ { "name": "id", "type": "string", "doc": "Plot Id" }, { "name": "image", "type": "string", "doc": "Png Path to Image (must be relative to the path of report.json file)" }, { "name": "title", "type": ["string", "null"], "doc": "Display Name of Plot" }, { "name": "caption", "doc": "Caption of the Plot", "type": [ "string", "null" ] }, { "name": "thumbnail", "doc": "Relative path to thumbnail of the Plot (must be relative to the path of report.json file)", "type": [ "string", "null" ] } ] } } } ] } } }, { "name": "tables", "type": { "type": "array", "items": { "type": "record", "name": "ReportTable", "fields": [ { "doc": "Report Table Id", "name": "id", "type": "string" }, { "name": "title", "type": "string", "doc": "Display name of the Table" }, { "name": "columns", "doc": "List of Columns", "type": { "type": "array", "items": { "type": "record", "name": "ReportTableColumn", "fields": [ { "doc": "Unique id of column (must be report id format style)", "name": "id", "type": "string" }, { "doc": "Display name of Column", "name": "header", "type": "string" }, { "name": "value", "_comment": "This is a quite unclear interface", "doc": "Column values. Attention to mixed-types attempting to represent 'NA'", "type": { "type": "array", "items": [ "int", "float", "string", "null" ] } } ] } } } ] } } } ] }pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/0000755000000000000000000000000013035554276021673 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/core.py0000644000000000000000000002275013035554276023203 0ustar rootroot""" New Commandline interface that supports ResolvedToolContracts and emitting ToolContracts There's three use cases. - running from an argparse instance - running from a Resolved Tool Contract (RTC) - emitting a ToolContract (TC) Going to do this in a new steps. - de-serializing of RTC (I believe this should be done via avro, not a new random JSON file. With avro, the java, c++, classes can be generated. 
Python can load the RTC via a structure dict that has a well defined schema) - get loading and running of RTC from commandline to call main func in a report. - generate/emit TC from a a common commandline parser interface that builds the TC and the standard argparse instance """ import argparse import json import logging import time import traceback import shutil import os import sys import pbcommand from pbcommand.models import PbParser, ResourceTypes from pbcommand.common_options import (RESOLVED_TOOL_CONTRACT_OPTION, EMIT_TOOL_CONTRACT_OPTION, add_resolved_tool_contract_option, add_base_options) from pbcommand.utils import get_parsed_args_log_level from pbcommand.pb_io.tool_contract_io import load_resolved_tool_contract_from def _add_version(p, version): p.version = version p.add_argument('--version', action="version", help="show program's version number and exit") return p def get_default_argparser(version, description): """ Everyone should use this to create an instance on a argparser python parser. *This should be replaced updated to have the required base options* :param version: Version of your tool :param description: Description of your tool :return: :rtype: ArgumentParser """ p = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) # Explicitly adding here to have only --version (not -v) return _add_version(p, version) def get_default_argparser_with_base_opts(version, description, default_level="INFO"): """Return a parser with the default log related options If you don't want the default log behavior to go to stdout, then set the default log level to be "ERROR". This will essentially suppress all output to stdout. Default behavior will only emit to stderr. This is essentially a '--quiet' default mode. my-tool --my-opt=1234 file_in.txt To override the default behavior and add a chatty-er stdout my-tool --my-opt=1234 --log-level=INFO file_in.txt Or write the console output to write the log file to an explict file and leave the stdout untouched. my-tool --my-opt=1234 --log-level=DEBUG --log-file=file.log file_in.txt """ return add_base_options(get_default_argparser(version, description), default_level=default_level) def _pacbio_main_runner(alog, setup_log_func, exe_main_func, *args, **kwargs): """ Runs a general func and logs results. The return type is expected to be an (int) return code. :param alog: a log instance :param func: a cli exe func, must return an int exit code. func(args) => Int, where args is parsed from p.parse_args() :param args: parsed args from parser :param setup_log_func: F(alog, level=value, file_name=value, formatter=value) or None :return: Exit code of callable func :rtype: int """ started_at = time.time() pargs = args[0] # default logging level level = logging.INFO if 'level' in kwargs: level = kwargs.pop('level') else: level = get_parsed_args_log_level(pargs) # None will default to stdout log_file = getattr(pargs, 'log_file', None) # Currently, only support to stdout. More customization would require # more required commandline options in base parser (e.g., --log-file, --log-formatter) log_options = dict(level=level, file_name=log_file) # The Setup log func must adhere to the pbcommand.utils.setup_log func # signature # FIXME. 
This should use the more concrete F(file_name_or_name, level, formatter) # signature of setup_logger if setup_log_func is not None and alog is not None: setup_log_func(alog, **log_options) alog.info("Using pbcommand v{v}".format(v=pbcommand.get_version())) alog.info("completed setting up logger with {f}".format(f=setup_log_func)) alog.info("log opts {d}".format(d=log_options)) try: # the code in func should catch any exceptions. The try/catch # here is a fail safe to make sure the program doesn't fail # and the makes sure the exit code is logged. return_code = exe_main_func(*args, **kwargs) run_time = time.time() - started_at except Exception as e: run_time = time.time() - started_at if alog is not None: alog.error(e, exc_info=True) else: traceback.print_exc(sys.stderr) # We should have a standard map of exit codes to Int if isinstance(e, IOError): return_code = 1 else: return_code = 2 _d = dict(r=return_code, s=run_time) if alog is not None: alog.info("exiting with return code {r} in {s:.2f} sec.".format(**_d)) return return_code def pacbio_args_runner(argv, parser, args_runner_func, alog, setup_log_func): # For tools that haven't yet implemented the ToolContract API args = parser.parse_args(argv) return _pacbio_main_runner(alog, setup_log_func, args_runner_func, args) class TemporaryResourcesManager(object): """Context manager for creating and destroying temporary resources""" def __init__(self, rtc): self.resolved_tool_contract = rtc def __enter__(self): for resource in self.resolved_tool_contract.task.resources: if resource.type_id == ResourceTypes.TMP_DIR: os.makedirs(resource.path) def __exit__(self, type, value, traceback): for resource in self.resolved_tool_contract.task.resources: if resource.type_id == ResourceTypes.TMP_DIR: if os.path.exists(resource.path): shutil.rmtree(resource.path) def pacbio_args_or_contract_runner(argv, parser, args_runner_func, contract_tool_runner_func, alog, setup_log_func): """ For tools that understand resolved_tool_contracts, but can't emit tool contracts (they may have been written by hand) :param parser: argparse Parser :type parser: ArgumentParser :param args_runner_func: func(args) => int signature :param contract_tool_runner_func: func(tool_contract_instance) should be the signature :param alog: a python log instance :param setup_log_func: func(log_instance) => void signature :return: int return code :rtype: int """ def _log_not_none(msg): if alog is not None: alog.info(msg) # circumvent the argparse parsing by inspecting the raw argv, then create # a temporary parser with limited arguments to process the special case of # --resolved-cool-contract (while still respecting verbosity flags). if any(a.startswith(RESOLVED_TOOL_CONTRACT_OPTION) for a in argv): p_tmp = get_default_argparser(version=parser.version, description=parser.description) add_resolved_tool_contract_option(add_base_options(p_tmp, default_level="NOTSET")) args_tmp = p_tmp.parse_args(argv) resolved_tool_contract = load_resolved_tool_contract_from( args_tmp.resolved_tool_contract) _log_not_none("Successfully loaded resolved tool contract from {a}".format(a=argv)) # XXX if one of the logging flags was specified, that takes precedence, # otherwise use the log level in the resolved tool contract. note that # this takes advantage of the fact that argparse allows us to use # NOTSET as the default level even though it's not one of the choices. 
log_level = get_parsed_args_log_level(args_tmp, default_level=logging.NOTSET) if log_level == logging.NOTSET: log_level = resolved_tool_contract.task.log_level with TemporaryResourcesManager(resolved_tool_contract) as tmp_mgr: r = _pacbio_main_runner(alog, setup_log_func, contract_tool_runner_func, resolved_tool_contract, level=log_level) _log_not_none("Completed running resolved contract. {c}".format(c=resolved_tool_contract)) return r else: # tool was called with the standard commandline invocation return pacbio_args_runner(argv, parser, args_runner_func, alog, setup_log_func) def pbparser_runner(argv, parser, args_runner_func, contract_runner_func, alog, setup_log_func): """Run a Contract or emit a contract to stdout.""" if not isinstance(parser, PbParser): raise TypeError("Only supports PbParser.") arg_parser = parser.arg_parser.parser # extract the contract tool_contract = parser.to_contract() if EMIT_TOOL_CONTRACT_OPTION in argv: # print tool_contract x = json.dumps(tool_contract.to_dict(), indent=4, separators=(',', ': ')) print x else: return pacbio_args_or_contract_runner(argv, arg_parser, args_runner_func, contract_runner_func, alog, setup_log_func) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/0000755000000000000000000000000013035554276023511 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_app.py0000644000000000000000000000711613035554276025506 0ustar rootroot"""Simple CLI dev app for testing Emitting Tool Contracts and Running from Resolved Tool Contracts""" import logging import sys from pbcommand.utils import setup_log from pbcommand.cli import pbparser_runner from pbcommand.models import FileTypes, get_pbparser, ResourceTypes # This has the same functionality as the dev_simple_app from .dev_simple_app import run_main log = logging.getLogger(__name__) __version__ = '0.2.1' # Used for the tool contract id. Must have the form {namespace}.tasks.{name} # to prevent namespace collisions. For python tools, the namespace should be # the python package name. TOOL_ID = "pbcommand.tasks.dev_app" def add_args_and_options(p): """ Add input, output files and options to parser. :type p: PbParser :return: PbParser """ # FileType, label, name, description p.add_input_file_type(FileTypes.FASTA, "fasta_in", "Fasta File", "PacBio Spec'ed fasta file") # File Type, label, name, description, default file name p.add_output_file_type(FileTypes.FASTA, "fasta_out", "Filtered Fasta file", "Filtered Fasta file", "filter") # Option id, label, default value, name, description # for the argparse, the read-length will be translated to --read-length and (accessible via args.read_length) p.add_int("pbcommand.task_options.dev_read_length", "read-length", 25, "Length filter", "Min Sequence Length filter") return p def get_contract_parser(): """ Central point of programmatically defining a Parser. 
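    Typical invocations of the resulting tool (paths are illustrative; the
    emit flag name is assumed from pbcommand's standard options)::

        # print the tool contract JSON to stdout
        python -m pbcommand.cli.examples.dev_app --emit-tool-contract
        # run from a resolved tool contract
        python -m pbcommand.cli.examples.dev_app --resolved-tool-contract rtc.json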
:rtype: PbParser :return: PbParser """ # Commandline exe to call "{exe}" /path/to/resolved-tool-contract.json driver_exe = "python -m pbcommand.cli.example.dev_app --resolved-tool-contract " desc = "Dev app for Testing that supports emitting tool contracts" subcomponents = [("my_subcomponent", "1.2.3")] resource_types = (ResourceTypes.TMP_FILE, ResourceTypes.TMP_FILE, ResourceTypes.TMP_DIR) p = get_pbparser(TOOL_ID, __version__, "Example Dev App", desc, driver_exe, is_distributed=False, resource_types=resource_types, subcomponents=subcomponents) add_args_and_options(p) return p def args_runner(args): """Entry point from argparse""" log.debug("raw args {a}".format(a=args)) return run_main(args.fasta_in, args.fasta_out, args.read_length) def resolved_tool_contract_runner(resolved_tool_contract): """Run from the resolved contract :param resolved_tool_contract: :type resolved_tool_contract: ResolvedToolContract """ in_file = resolved_tool_contract.task.input_files[0] out_file = resolved_tool_contract.task.output_files[0] min_read_length = resolved_tool_contract.task.options["pbcommand.task_options.dev_read_length"] r = run_main(in_file, out_file, min_read_length) return r def main(argv=sys.argv): log.info("Starting {f} version {v} pbcommand example dev app".format(f=__file__, v=__version__)) # PbParser instance, this has both the argparse instance and the tool contract # instance. mp = get_contract_parser() # To Access the argparse instance # mp.arg_parser.parser # The Tool Contract parser # mp.tool_contract_parser.parser return pbparser_runner(argv[1:], mp, args_runner, resolved_tool_contract_runner, log, setup_log) if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_simple_app.py0000644000000000000000000000605413035554276027057 0ustar rootroot"""Simple CLI dev app for testing This app is a 'simple' app in that is can not emit tool-contracts, but it can run tool contracts via -- """ import logging import sys import warnings from pbcommand.utils import setup_log from pbcommand.validators import validate_file from pbcommand.models import ResolvedToolContract from pbcommand.common_options import add_resolved_tool_contract_option from pbcommand.cli import pacbio_args_or_contract_runner, get_default_argparser log = logging.getLogger(__name__) __version__ = '0.1.1' TOOL_ID = "pbcommand.tasks.dev_app_simple" try: from pbcore.io import FastaWriter, FastaReader except ImportError: warnings.warn("Example apps require pbcore. Install from https://github.com/PacificBiosciences/pbcore") def get_parser(): p = get_default_argparser(__version__, __doc__) p.add_argument("fasta_in", type=validate_file, help="Input Fasta") p.add_argument("fasta_out", type=str, help="Output Fasta") p.add_argument('--read-length', type=int, default=25, help="Min Sequence length to filter") add_resolved_tool_contract_option(p) # this parser cannot emit a tool contract, but can run from a resolved # contract via --resolved-tool-contract /path/to/resolved-tool-contract.json return p def run_main(input_file, output_file, min_sequence_length): """ Main function entry point to your application (this should be imported from your library code) :rtype int: """ _d = dict(i=input_file, a=min_sequence_length, o=output_file) msg = "Running dev_app task. 
with input:{i} output:{o} and min-length={a}".format(**_d) log.info(msg) with FastaWriter(output_file) as w: with FastaReader(input_file) as r: for record in r: if len(record.sequence) > min_sequence_length: w.writeRecord(record) log.debug("completed running main.") return 0 def args_runner(args): """Entry point from argparse""" log.debug("raw args {a}".format(a=args)) return run_main(args.fasta_in, args.fasta_out, args.read_length) def resolved_tool_contract_runner(resolved_tool_contract): """Run from the resolved contract :param resolved_tool_contract: :type resolved_tool_contract: ResolvedToolContract """ in_file = resolved_tool_contract.task.input_files[0] out_file = resolved_tool_contract.task.output_files[0] alpha = 9 r = run_main(in_file, out_file, alpha) log.info("Completed running resolved contract. {c}".format(c=resolved_tool_contract)) return r def main(argv=sys.argv): # New interface that supports running resolved tool contracts log.info("Starting {f} version {v} pbcommand example dev app".format(f=__file__, v=__version__)) p = get_parser() return pacbio_args_or_contract_runner(argv[1:], p, args_runner, resolved_tool_contract_runner, log, setup_log) if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_txt_app.py0000644000000000000000000000474713035554276026414 0ustar rootroot"""Demonstration Example app Primaryly used for end-to-end testing. emit tool contract -> Resolve -> resolved tool contract -> run. """ import logging import sys from pbcommand.utils import setup_log from pbcommand.cli import pbparser_runner from pbcommand.models import FileTypes, get_pbparser, ResourceTypes TOOL_ID = "pbcommand.tasks.dev_txt_app" VERSION = "0.1.0" log = logging.getLogger(__name__) def get_parser(): driver_exe = "python -m pbcommand.cli.examples.dev_app --resolved-tool-contract " desc = "Dev app for Testing that supports emitting tool contracts" # Can specify libs or other dependencies that subcomponents = [("pbcommand", VERSION), ("my_component", "0.1.0"), ("my_component_id", "1.2.3")] # ResourceTypes.* resource_types = (ResourceTypes.TMP_FILE, ResourceTypes.TMP_FILE, ResourceTypes.TMP_DIR) # Create an instance of a Pacbio Parser p = get_pbparser(TOOL_ID, VERSION, "Txt App", desc, driver_exe, is_distributed=False, resource_types=resource_types, subcomponents=subcomponents) # Add Input Files types p.add_input_file_type(FileTypes.TXT, "txt_in", "Txt file", "Generic Text File") # Add output files types p.add_output_file_type(FileTypes.TXT, "txt_out", "Txt outfile", "Generic Output Txt file", "output") p.add_int("pbcommand.task_options.dev_max_nlines", "max_nlines", 10, "Max Lines", "Max Number of lines to Copy") return p def run_main(input_txt, output_txt, max_nlines): n = 0 with open(input_txt, 'r') as r: with open(output_txt, 'w') as w: w.write("# Output Txt file") for line in r: if n >= max_nlines: break w.write(line + "\n") n += 1 log.info("Completed writing {n} lines".format(n=n)) return 0 def args_runner(args): return run_main(args.txt_in, args.txt_out, args.max_nlines) def rtc_runner(rtc): return run_main(rtc.task.input_files[0], rtc.task.output_files[0], rtc.task.options["pbcommand.task_options.dev_max_nlines"]) def main(argv=sys.argv): return pbparser_runner(argv[1:], get_parser(), args_runner, rtc_runner, log, setup_log) if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_mixed_app.py0000644000000000000000000001156013035554276026672 0ustar 
rootroot"""Example to show how to expose a subset of functionality to tool contract, while exposing all the options via argparse commandline interface In this example, the tool contract has ins = (csv,) outs = (report, ) options = alpha In the "full" argpase layer that has an optional hdf5 option file and beta. (Option[Int] is the scala-style notation) ins = (csv, ) outs (report, Option[h5]) options = alpha, beta """ import sys import logging from pbcommand.models import FileTypes, get_pbparser from pbcommand.cli import pbparser_runner from pbcommand.utils import setup_log log = logging.getLogger(__name__) TOOL_ID = "pbcommand.tasks.dev_mixed_app" __version__ = "0.2.0" def _get_contract_parser(): """ Central point of programmatically defining a Parser. :rtype: PbParser :return: PbParser """ # Number of processors to use nproc = 2 # Commandline exe to call "{exe}" /path/to/resolved-tool-contract.json driver_exe = "python -m pbcommand.cli.examples.dev_mixed_app --resolved-tool-contract " desc = "Dev app for Testing that supports emitting tool contracts" p = get_pbparser(TOOL_ID, __version__, "DevApp", desc, driver_exe, is_distributed=False, nproc=nproc) return p def add_rtc_options(p): """ Add all ins/outs and options that will be in both the tool contract and the argparse layer :param p: :type p: pbcommand.models.PbParser :return: """ p.add_input_file_type(FileTypes.CSV, "csv", "Input CSV", "Input csv description") p.add_output_file_type(FileTypes.REPORT, "rpt", "Output Report", "Output PacBio Report JSON", "example.report") p.add_int("pbcommand.task_options.alpha", "alpha", 25, "Alpha", "Alpha description") p.add_float("pbcommand.task_options.beta", "beta", 1.234, "Beta", "Beta description") p.add_boolean("pbcommand.task_options.gamma", "gamma", True, "Gamma", "Gamma description") p.add_choice_str("pbcommand.task_options.ploidy", "ploidy", ["haploid", "diploid"], "Ploidy", "Genome ploidy", "haploid") p.add_choice_int("pbcommand.task_options.delta", "delta", [1,2,3], "Delta", "An integer choice", default=1) p.add_choice_float("pbcommand.task_options.epsilon", "epsilon", [0.01, 0.1, 1.0], "Epsilon", "A float choice", default=0.1) p.add_str("pbcommand.task_options.comment", "comment", "asdf", "Comments", "A string parameter") return p def add_argparse_only(p): """ Standard argparse layer :param p: :type p: argparse.ArgumentParser :return: """ p.add_argument("--output-h5", type=str, help="Optional output H5 file.") p.add_argument("--zeta", type=int, default=1234, help="Example option") return p def get_contract_parser(): p = _get_contract_parser() # minimal ins/outs + options exposed at the contract level add_rtc_options(p) # add all options to the raw argparse instance add_argparse_only(p.arg_parser.parser) return p def _fake_main(csv, report_json, alpha=1, beta=1.234, gamma=True, delta=1, epsilon=1234, output_h5=None, ploidy=None, zeta=None): _d = dict(c=csv, r=report_json, a=alpha, b=beta, g=gamma, d=delta, e=epsilon, h=output_h5, p=ploidy) log.info("Running main with {c} {r} alpha={a} beta={b} gamma={g} delta={d} epsilon={e} h5={h} p={p}".format(**_d)) with open(report_json, "w") as f: f.write("{}") return 0 def args_runner(args): """Standard python args access point""" csv = args.csv report_json = args.rpt output_h5 = args.output_h5 return _fake_main(csv, report_json, alpha=args.alpha, beta=args.beta, gamma=args.gamma, epsilon=args.epsilon, output_h5=output_h5, ploidy=args.ploidy, zeta=args.zeta) def resolved_tool_contract_runner(rtc): """ :param rtc: :type rtc: 
pbcommand.models.tool_contract.ResolvedToolContract :return: """ csv = rtc.task.input_files[0] rpt = rtc.task.output_files[0] alpha = rtc.task.options["pbcommand.task_options.alpha"] beta = rtc.task.options["pbcommand.task_options.beta"] gamma = rtc.task.options["pbcommand.task_options.gamma"] ploidy = rtc.task.options["pbcommand.task_options.ploidy"] delta = rtc.task.options["pbcommand.task_options.delta"] epsilon = rtc.task.options["pbcommand.task_options.epsilon"] comments = rtc.task.options["pbcommand.task_options.comment"] return _fake_main(csv, rpt, alpha=alpha, beta=beta, gamma=gamma, ploidy=ploidy) def main(argv=sys.argv): # New interface that supports running resolved tool contracts log.info("Starting {f} version {v} pbcommand example dev app".format(f=__file__, v=__version__)) p = get_contract_parser() return pbparser_runner(argv[1:], p, args_runner, resolved_tool_contract_runner, log, setup_log) if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/__init__.py0000644000000000000000000000000013035554276025610 0ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/template_simple.py0000644000000000000000000000251613035554276027253 0ustar rootroot"""Simple Example Template for creating a CLI tool""" import os import sys import logging from pbcommand.validators import validate_file from pbcommand.utils import setup_log from pbcommand.cli import get_default_argparser_with_base_opts, pacbio_args_runner log = logging.getLogger(__name__) __version__ = "0.1.0" # __author__ = "Add-your-name" def get_parser(): """Define Parser. Use the helper methods in validators to validate input""" p = get_default_argparser_with_base_opts(__version__, __doc__) p.add_argument('path_to_file', type=validate_file, help="Path to File") return p def run_main(path, value=8): """ Main function that should be called. Typically this is imported from your library code. This should NOT reference args.* """ log.info("Running path {p} with value {v}".format(p=path, v=value)) log.info("Found path? 
{t} {p}".format(p=path, t=os.path.exists(path))) return 0 def args_runner(args): log.info("Raw args {a}".format(a=args)) return run_main(args.path_to_file, value=100) def main(argv): return pacbio_args_runner(argv[1:], get_parser(), args_runner, log, setup_log_func=setup_log) if __name__ == '__main__': sys.exit(main(sys.argv)) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_quick_hello_world.py0000644000000000000000000000625013035554276030432 0ustar rootrootimport sys import pprint import logging from pbcommand.models import FileTypes, OutputFileType from pbcommand.cli import registry_builder, registry_runner, QuickOpt log = logging.getLogger(__name__) # Version of the Commandline Tool __version__ = "0.1.2" registry = registry_builder("pbcommand", "python -m pbcommand.cli.examples.dev_quick_hello_world ") def _example_main(input_files, output_files, **kwargs): log.info("Running example main with {i} {o} kw:{k}".format(i=input_files, o=output_files, k=kwargs)) # write mock output files, otherwise the End-to-End test will fail xs = output_files if isinstance(output_files, (list, tuple)) else [output_files] for x in xs: with open(x, 'w') as writer: if len(kwargs) > 0: pprint.pprint(dict(kwargs), writer) else: writer.write("Mock data\n") return 0 @registry("dev_qhello_world", "0.2.1", FileTypes.FASTA, FileTypes.FASTA, nproc=1, options=dict(alpha=1234)) def run_rtc(rtc): log.debug("Dev Quick Hello World Example. Fasta -> Fasta with option alpha=1234") return _example_main(rtc.task.input_files[0], rtc.task.output_files[0], nproc=rtc.task.nproc) @registry("dev_fastq2fasta", "0.1.0", FileTypes.FASTQ, FileTypes.FASTA, options=dict(beta=QuickOpt(1234.0, "Beta Name", "Beta Description"), gamma=True), name="Fastq to Fasta", description="Dev Task Fastq to Fasta Example") def run_rtc(rtc): return _example_main(rtc.task.input_files[0], rtc.task.output_files[0]) @registry("dev_txt_hello", "0.1.0", FileTypes.TXT, (FileTypes.TXT, FileTypes.TXT), nproc=3, is_distributed=False) def run_rtc(rtc): return _example_main(rtc.task.input_files, rtc.task.output_files) @registry("dev_test_options", "0.1.0", FileTypes.TXT, FileTypes.TXT, nproc=1, options=dict(alpha=1234, beta=5.4321, gamma=True, ploidy=("haploid", "diploid"), delta=(1,2,3), epsilon=(0.01,0.1,1.0))) def run_rtc(rtc): log.debug("Dev Quick Hello World Example with various option types") return _example_main(rtc.task.input_files[0], rtc.task.output_files[0], options=rtc.task.options) def _to_output(i, file_type): default_name = "_".join([file_type.file_type_id, file_type.base_name + "_" + str(i)]) label = "label_" + file_type.file_type_id desc = "File {f}".format(f=file_type) return OutputFileType(file_type.file_type_id, label, repr(file_type), desc, default_name) def _to_outputs(file_types): return [_to_output(i, ft) for i, ft in enumerate(file_types)] @registry("dev_txt_custom_outs", "0.1.0", FileTypes.TXT, _to_outputs((FileTypes.TXT, FileTypes.TXT)), name="Custom Txt Task") def run_rtc(rtc): """Test for using OutputFileTypes as outputs Output types can be specified as FileType, or OutputFileType instances """ return _example_main(rtc.task.input_files, rtc.task.output_files) if __name__ == '__main__': default_log_level = logging.DEBUG sys.exit(registry_runner(registry, sys.argv[1:], default_log_level=default_log_level, version=__version__)) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_scatter_fasta_app.py0000644000000000000000000001256713035554276030417 0ustar rootroot"""Example of Generating 
a Chunk.json file that 'scatters' a pair of fasta files""" import os import logging import sys import warnings import math import datetime from pbcommand.cli import pbparser_runner from pbcommand.models import get_scatter_pbparser, FileTypes, PipelineChunk from pbcommand.pb_io import write_pipeline_chunks from pbcommand.utils import setup_log log = logging.getLogger(__name__) TOOL_ID = "pbcommand.tasks.dev_scatter_fasta" __version__ = '0.1.0' try: from pbcore.io import FastaWriter, FastaReader except ImportError: warnings.warn("Example apps require pbcore. Install from https://github.com/PacificBiosciences/pbcore") class Constants(object): NCHUNKS_OPT = "pbcommand.task_options.dev_scatter_fa_nchunks" FA_CHUNK_KEY = "$chunk.fasta_id" def __get_nrecords_from_reader(reader): n = 0 for _ in reader: n += 1 return n def write_fasta_records(fastax_writer_klass, records, file_name): n = 0 with fastax_writer_klass(file_name) as w: for record in records: w.writeRecord(record) n += 1 log.debug("Completed writing {n} fasta records".format(n=n)) def __to_chunked_fastx_files(fastx_reader_klass, fastax_writer_klass, chunk_key, fastx_path, max_total_nchunks, dir_name, base_name, ext): """Convert a Fasta/Fasta file to a chunked list of files""" # grab the number of records so we can chunk it with fastx_reader_klass(fastx_path) as f: nrecords = __get_nrecords_from_reader(f) max_total_nchunks = min(nrecords, max_total_nchunks) n = int(math.ceil(float(nrecords)) / max_total_nchunks) nchunks = 0 with fastx_reader_klass(fastx_path) as r: it = iter(r) for i in xrange(max_total_nchunks): records = [] chunk_id = "_".join([base_name, str(nchunks)]) chunk_name = ".".join([chunk_id, ext]) nchunks += 1 fasta_chunk_path = os.path.join(dir_name, chunk_name) if i != max_total_nchunks: for _ in xrange(n): records.append(it.next()) else: for x in it: records.append(x) write_fasta_records(fastax_writer_klass, records, fasta_chunk_path) total_bases = sum(len(r.sequence) for r in records) d = dict(total_bases=total_bases, nrecords=len(records)) d[chunk_key] = os.path.abspath(fasta_chunk_path) c = PipelineChunk(chunk_id, **d) yield c def to_chunked_fasta_files(fasta_path, max_total_nchunks, dir_name, chunk_key, base_name, ext): return __to_chunked_fastx_files(FastaReader, FastaWriter, chunk_key, fasta_path, max_total_nchunks, dir_name, base_name, ext) def write_chunks_to_json(chunks, chunk_file): log.debug("Wrote {n} chunks to {f}.".format(n=len(chunks), f=chunk_file)) write_pipeline_chunks(chunks, chunk_file, "Chunks written at {d}".format(d=datetime.datetime.now())) return 0 def _write_fasta_chunks_to_file(to_chunk_fastx_file_func, chunk_file, fastx_path, max_total_chunks, dir_name, chunk_key, chunk_base_name, chunk_ext): chunks = list(to_chunk_fastx_file_func(fastx_path, max_total_chunks, dir_name, chunk_key, chunk_base_name, chunk_ext)) write_chunks_to_json(chunks, chunk_file) return 0 def write_fasta_chunks_to_file(chunk_file, fasta_path, max_total_chunks, dir_name, chunk_key, chunk_base_name, chunk_ext): return _write_fasta_chunks_to_file(to_chunked_fasta_files, chunk_file, fasta_path, max_total_chunks, dir_name, chunk_key, chunk_base_name, chunk_ext) def run_main(fasta_file, chunk_output_json, chunk_key, max_nchunks, nchunks=None, chunk_base_name="fasta"): """Create a Chunk.json file with nchunks <= max_nchunks Not clear on the nchunks vs max_nchunks. 
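Illustrative sketch of the per-chunk datum written to the chunk.json file
(the path and counts below are hypothetical):

    from pbcommand.models import PipelineChunk

    d = {Constants.FA_CHUNK_KEY: "/path/to/fasta_0.fasta",  # hypothetical chunked fasta
         "nrecords": 10, "total_bases": 12345}
    chunk = PipelineChunk("fasta_0", **d)
    # write_chunks_to_json([chunk], "fasta.chunks.chunk.json") would then
    # serialize it via write_pipeline_chunks
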
""" output_dir = os.path.dirname(chunk_output_json) return write_fasta_chunks_to_file(chunk_output_json, fasta_file, max_nchunks, output_dir, chunk_key, chunk_base_name, "fasta") def get_parser(): driver = "python -m pbcommand.cli.examples.dev_scatter_fasta_app --resolved-tool-contract " desc = "Scatter a single fasta file to create chunk.json file" # chunk keys that **will** be written to the file chunk_keys = (Constants.FA_CHUNK_KEY, ) p = get_scatter_pbparser(TOOL_ID, __version__, "Fasta Scatter", desc, driver, chunk_keys, is_distributed=False) p.add_input_file_type(FileTypes.FASTA, "fasta_in", "Fasta In", "Fasta file to scatter") p.add_output_file_type(FileTypes.CHUNK, "cjson", "Chunk JSON", "Scattered/Chunked Fasta Chunk.json", "fasta.chunks") p.add_int("pbcommand.task_options.dev_scatter_fa_nchunks", "nchunks", 10, "Number of chunks", "Suggested number of chunks. May be overridden by $max_nchunks") return p def args_runner(args): return run_main(args.fasta_in, args.cjson, Constants.FA_CHUNK_KEY, args.nchunks) def rtc_runner(rtc): return run_main(rtc.task.input_files[0], rtc.task.output_files[0], Constants.FA_CHUNK_KEY, rtc.task.options[Constants.NCHUNKS_OPT]) def main(argv=sys.argv): return pbparser_runner(argv[1:], get_parser(), args_runner, rtc_runner, log, setup_log) if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/examples/dev_gather_fasta_app.py0000644000000000000000000000657213035554276030223 0ustar rootroot"""Example of Gather TC to gather several $chunk.fasta_id in chunk.json file. """ import logging import sys import warnings import functools from pbcommand.cli import pbparser_runner from pbcommand.models import get_gather_pbparser, FileTypes from pbcommand.pb_io import load_pipeline_chunks_from_json from pbcommand.utils import setup_log from .dev_scatter_fasta_app import Constants log = logging.getLogger(__name__) TOOL_ID = "pbcommand.tasks.dev_gather_fasta" __version__ = '0.1.0' try: from pbcore.io import FastaWriter, FastaReader except ImportError: warnings.warn("Example apps require pbcore. Install from https://github.com/PacificBiosciences/pbcore") def __gather_fastx(fastx_reader, fastx_writer, fastx_files, output_file): # this will work for any Pbcore Reader, Writer classes n = 0 with fastx_writer(output_file) as writer: for fastx_file in fastx_files: with fastx_reader(fastx_file) as reader: for record in reader: n += 1 writer.writeRecord(record) log.info("Completed gathering {n} files (with {x} records) to {f}".format(n=len(fastx_files), f=output_file, x=n)) return 0 gather_fasta = functools.partial(__gather_fastx, FastaReader, FastaWriter) def _get_datum_from_chunks_by_chunk_key(chunks, chunk_key): datum = [] for chunk in chunks: if chunk_key in chunk.chunk_keys: value = chunk.chunk_d[chunk_key] datum.append(value) else: raise KeyError("Unable to find chunk key '{i}' in {p}".format(i=chunk_key, p=chunk)) return datum def __args_gather_runner(func, chunk_json, output_file, chunk_key): chunks = load_pipeline_chunks_from_json(chunk_json) # Allow looseness if not chunk_key.startswith('$chunk.'): chunk_key = '$chunk.' + chunk_key log.warn("Prepending chunk key with '$chunk.' to '{c}'".format(c=chunk_key)) else: chunk_key = chunk_key fastx_files = _get_datum_from_chunks_by_chunk_key(chunks, chunk_key) _ = func(fastx_files, output_file) return 0 def run_main(chunked_json, output_fasta, chunk_key): """Create a Chunk.json file with nchunks <= max_nchunks Not clear on the nchunks vs max_nchunks. 
""" return __args_gather_runner(gather_fasta, chunked_json, output_fasta, chunk_key) def get_parser(): driver = "python -m pbcommand.cli.examples.dev_gather_fasta_app --resolved-tool-contract " desc = "Gather a fasta resources in a Chunk.json file" p = get_gather_pbparser(TOOL_ID, __version__, "Fasta Chunk Gather", desc, driver, is_distributed=False) p.add_input_file_type(FileTypes.CHUNK, "chunk_json", "Chunk JSON", "Chunked Fasta JSON Out") p.add_output_file_type(FileTypes.FASTA, "output", "Chunk JSON", "Output Fasta", "gathered") return p def args_runner(args): return run_main(args.chunk_json, args.output, Constants.FA_CHUNK_KEY) def rtc_runner(rtc): return run_main(rtc.task.input_files[0], rtc.task.output_files[0], Constants.FA_CHUNK_KEY) def main(argv=sys.argv): return pbparser_runner(argv[1:], get_parser(), args_runner, rtc_runner, log, setup_log) if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/__init__.py0000644000000000000000000000042013035554276024000 0ustar rootrootfrom .core import (pacbio_args_runner, pacbio_args_or_contract_runner, pbparser_runner, get_default_argparser, get_default_argparser_with_base_opts) from .quick import (registry_runner, registry_builder, QuickOpt) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/quick.py0000644000000000000000000003157213035554276023371 0ustar rootrootimport argparse import json import logging import os import sys from collections import namedtuple import time import pbcommand from .core import get_default_argparser_with_base_opts from pbcommand.common_options import add_base_options, add_common_options from pbcommand.models import (ToolContractTask, ToolContract, InputFileType, OutputFileType, FileType, PacBioIntChoiceOption, PacBioStringOption, PacBioFloatOption, PacBioBooleanOption, PacBioIntOption, PacBioStringChoiceOption, PacBioFloatChoiceOption) from pbcommand.models.tool_contract import ToolDriver from pbcommand.pb_io import (load_resolved_tool_contract_from, write_tool_contract) from pbcommand.utils import setup_log, setup_logger, get_parsed_args_log_level log = logging.getLogger(__name__) __all__ = ['registry_builder', 'registry_runner', 'Registry'] class Constants(object): RTC_DRIVER = 'run-rtc' QuickOpt = namedtuple("QuickOpt", "value name description") def _example_main(*args, **kwargs): log.info("Running example main with {a} kw:{k}".format(a=args, k=kwargs)) return 0 def _file_type_to_input_file_type(file_type, index): fid = "_".join([file_type.file_type_id, str(index)]) return InputFileType(file_type.file_type_id, "Label " + fid, repr(file_type), "description for {f}".format(f=fid)) def _file_type_to_output_file_type(file_type, index): fid = "_".join([file_type.file_type_id, str(index)]) return OutputFileType(file_type.file_type_id, "Label " + fid, repr(file_type), "description for {f}".format(f=file_type), file_type.default_name) def __convert_to_choice_option(option_id, default_value_or_choices, name, description, choices=None): """Enable some looseness in the inputs if the default_value is provided by a list or tuple, assume the default value is the first value. 
Else, assume the choices and default value was provided """ # FIXME, this method is somewhat duplicated with the from dict serialization IO layer def _is_list(x): return isinstance(x, (tuple, list)) if _is_list(default_value_or_choices): value = default_value_or_choices[0] r_choices = default_value_or_choices else: value = default_value_or_choices r_choices = choices if isinstance(value, basestring): opt = PacBioStringChoiceOption(option_id, name, value, description, r_choices) elif isinstance(value, int): opt = PacBioIntChoiceOption(option_id, name, value, description, r_choices) elif isinstance(value, float): opt = PacBioFloatChoiceOption(option_id, name, value, description, r_choices) else: raise TypeError("Invalid choice type {t} of default:{d} and choices: {c}") return opt def _convert_to_option(namespace, key, value, name=None, description=None, choices=None): opt_id = ".".join([namespace, 'task_options', key]) r_name = "Option {n}".format(n=key) if name is None else name r_desc = "Option {n} description".format(n=key) if description is None else description if isinstance(value, (tuple, list)) or isinstance(choices, (tuple, list)): opt = __convert_to_choice_option(opt_id, value, r_name, r_desc, choices=choices) elif isinstance(value, basestring): opt = PacBioStringOption(opt_id, r_name, value, r_desc) elif isinstance(value, bool): opt = PacBioBooleanOption(opt_id, r_name, value, r_desc) elif isinstance(value, int): opt = PacBioIntOption(opt_id, r_name, value, r_desc) elif isinstance(value, float): opt = PacBioFloatOption(opt_id, r_name, value, r_desc) else: raise TypeError("Unsupported option {k} type. {t} ".format(k=key, t=type(value))) return opt def _convert_quick_option(namespace, key, quick_opt): """:type quick_opt: QuickOpt""" return _convert_to_option(namespace, key, quick_opt.value, name=quick_opt.name, description=quick_opt.description) def _to_list(x): if isinstance(x, (list, tuple)): return x else: return [x] def _transform_output_ftype(x, i): if isinstance(x, FileType): return _file_type_to_output_file_type(x, i) elif isinstance(x, OutputFileType): return x else: raise TypeError("Unsupported type {t} value {x}".format(x=x, t=type(x))) def _convert_to_raw_option(namespace, key, value_or_quick_opt): if isinstance(value_or_quick_opt, QuickOpt): return _convert_quick_option(namespace, key, value_or_quick_opt) else: # 'raw' opt was provide with a primitive type return _convert_to_option(namespace, key, value_or_quick_opt) class Registry(object): def __init__(self, tool_namespace, driver_base): self.namespace = tool_namespace self.driver_base = driver_base # id -> func(rtc) self.rtc_runners = {} def __repr__(self): _d = dict(k=self.__class__.__name__, n=self.namespace, d=self.driver_base, t=len(self.rtc_runners)) return "<{k} {n} {d} tool-contracts:{t} >".format(**_d) def __call__(self, tool_id, version, input_types, output_types, options=None, nproc=1, is_distributed=True, name=None, description=None): def _w(func): """ Task Options are provided as 'naked' non-namespaced values and are automatically type detected and converted to a PacBioOption """ # support list or a single value itypes = _to_list(input_types) otypes = _to_list(output_types) global_id = ".".join([self.namespace, 'tasks', tool_id]) def _or_default(value_, default_value): return default_value if value_ is None else value_ display_name = _or_default(name, "Tool {n}".format(n=tool_id)) desc = _or_default(description, "Quick tool {n} {g}".format(n=tool_id, g=global_id)) input_file_types = 
[_file_type_to_input_file_type(ft, i) for i, ft in enumerate(itypes)] output_file_types = [_transform_output_ftype(ft, i) for i, ft in enumerate(otypes)] if options is None: tool_options = [] else: tool_options = [_convert_to_raw_option(self.namespace, key, value) for key, value in options.iteritems()] resource_types = [] task = ToolContractTask(global_id, display_name, desc, version, is_distributed, input_file_types, output_file_types, tool_options, nproc, resource_types) # trailing space if for 'my-tool --resolved-tool-contract ' /path/to/rtc.json driver_exe = " ".join([self.driver_base, Constants.RTC_DRIVER, " "]) driver = ToolDriver(driver_exe, ) tc = ToolContract(task, driver) self.rtc_runners[tc] = func return _w def to_summary(self): xs = [] x = xs.append x("Registry namespace:{n} driverbase:{d}".format(n=self.namespace, d=self.driver_base)) for tc, func in self.rtc_runners.iteritems(): x(str(tc)) return "\n".join(xs) def registry_builder(tool_namespace, driver_base): r = Registry(tool_namespace, driver_base) return r def _subparser_builder(subparser, name, description, options_func, exe_func): p = subparser.add_parser(name, help=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) options_func(p) # I strongly dislike this. p.set_defaults(func=exe_func) return p def _add_run_rtc_options(default_log_level=logging.INFO): def _wrapper(p): add_common_options(p, default_level=default_log_level) p.add_argument('rtc_path', type=str, help="Path to resolved tool contract") return p return _wrapper def _add_emit_all_tcs_options(p): p.add_argument('-o', '--output_dir', type=str, default=os.getcwd(), help='Emit all Tool Contracts to output directory') return p def _add_emit_tc_options(p): p.add_argument('tc_id', type=str, help="Tool Contract Id") return p def __args_summary_runner(registry): def _w(args): log.info("Registry {r}".format(r=registry)) log.info("\n" + registry.to_summary()) print registry.to_summary() return 0 return _w def __args_rtc_runner(registry, default_log_level): def _w(args): started_at = time.time() def run_time(): return time.time() - started_at def exit_msg(rcode_): return "Completed running {r} exitcode {e} in {t:.2f} sec.".format(r=rtc, e=rcode_, t=run_time()) level = get_parsed_args_log_level(args) setup_logger(None, level=level) log.info("Loading pbcommand {v}".format(v=pbcommand.get_version())) log.info("Registry {r}".format(r=registry)) log.info("Setting log-level to {d}".format(d=level)) log.debug("args {a}".format(a=args)) log.info("loading RTC from {i}".format(i=args.rtc_path)) rtc = load_resolved_tool_contract_from(args.rtc_path) id_funcs = {t.task.task_id: func for t, func in registry.rtc_runners.iteritems()} func = id_funcs.get(rtc.task.task_id, None) if func is None: rcode = 1 log.error("Unknown tool contract id '{x}' Registered TC ids {i}".format(x=rtc.task.task_id, i=id_funcs.keys())) log.error(exit_msg(rcode)) return rcode else: log.info("Running id:{i} Resolved Tool Contract {r}".format(r=rtc, i=rtc.task.task_id)) log.info("Runner func {f}".format(f=func)) exit_code = func(rtc) if exit_code == 0: log.info(exit_msg(exit_code)) else: log.error(exit_msg(exit_code)) return exit_code return _w def __args_emit_tc_runner(registry): def _w(args): log.info("Registry {r}".format(r=registry)) tc_id = args.tc_id log.info("Emitting TC from {i}".format(i=tc_id)) id_tc = {t.task.task_id: t for t in registry.rtc_runners.keys()} log.info(id_tc) tc = id_tc.get(tc_id, None) if tc is None: sys.stderr.write("ERROR. 
Unable to find tool-contract id {i}".format(i=tc_id)) return -1 else: print json.dumps(tc.to_dict(), sort_keys=True, indent=4, separators=(',', ': ')) return 0 return _w def __args_emit_all_tcs_runner(registry): def _w(args): log.info("Registry {r}".format(r=registry)) log.info(registry.to_summary()) log.info("Emitting TCs to {i}".format(i=args.output_dir)) tcs = registry.rtc_runners.keys() for tc in tcs: output_file = os.path.join(args.output_dir, tc.task.task_id + "_tool_contract.json") write_tool_contract(tc, output_file) return 0 return _w def _to_registry_parser(version, description, default_log_level): def _f(registry): p = get_default_argparser_with_base_opts(version, description) sp = p.add_subparsers(help='Commands') args_summary_runner = __args_summary_runner(registry) args_rtc_runner = __args_rtc_runner(registry, default_log_level) args_tc_emit = __args_emit_tc_runner(registry) args_tcs_emit = __args_emit_all_tcs_runner(registry) _subparser_builder(sp, Constants.RTC_DRIVER, "Run Resolved Tool contract", _add_run_rtc_options(default_log_level), args_rtc_runner) _subparser_builder(sp, 'emit-tool-contracts', "Emit all Tool contracts to output-dir", _add_emit_all_tcs_options, args_tcs_emit) _subparser_builder(sp, 'emit-tool-contract', "Emit a single tool contract by id", _add_emit_tc_options, args_tc_emit) _subparser_builder(sp, 'summary', "Summary of Tool Contracts", lambda x: x, args_summary_runner) return p return _f def registry_runner(registry, argv, default_log_level=logging.INFO, version="0.1.0"): """Runs a registry :arg version: Is the version of the Commandline tool, not the TCs or tasks. 1. Manually build an argparser that has For running: my_tool run-rtc /path/to/resolved-tool-contract.json For emitting: my-tool emit-tool-contracts /path/to/output-dir my-tool emit-tool-contract global_id :type registry: Registry """ f = _to_registry_parser(version, "Multi-quick-tool-runner for {r}".format(r=registry.namespace), default_log_level) p = f(registry) args = p.parse_args(argv) # The logger needs to be setup only in specific subparsers. Some commands # are using the stdout as a non logging model return_code = args.func(args) return return_code pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/cli/utils.py0000644000000000000000000001012313035554276023402 0ustar rootroot """ Additional utilities for running command-line apps - most of these do not apply to tool-contract-driven programs. 
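A minimal quick-tool sketch (illustrative; the "acme" namespace and tool name
are hypothetical) showing the registry_builder/registry_runner API defined
above:

    import sys
    from pbcommand.cli import registry_builder, registry_runner
    from pbcommand.models import FileTypes

    registry = registry_builder("acme", "python -m acme.tools ")

    @registry("hello", "0.1.0", FileTypes.TXT, FileTypes.TXT, nproc=1)
    def run_rtc(rtc):
        return 0

    if __name__ == '__main__':
        sys.exit(registry_runner(registry, sys.argv[1:]))
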
(Ported from pbsmrtpipe) """ import traceback import argparse import platform import logging import time import os from pbcommand.validators import validate_file, validate_fofn from pbcommand.utils import setup_log log = logging.getLogger(__name__) def subparser_builder(subparser, subparser_id, description, options_func, exe_func): """ Util to add subparser options :param subparser: :param subparser_id: :param description: :param options_func: Function that will add args and options to Parser instance F(subparser) -> None :param exe_func: Function to run F(args) -> Int :return: """ p = subparser.add_parser(subparser_id, help=description) options_func(p) p.set_defaults(func=exe_func) return p def add_debug_option(p): p.add_argument('--debug', action='store_true', help="Send logging info to stdout.") return p def _validate_output_dir_or_get_default(value): if value is None: return os.getcwd() else: if os.path.exists(value): return os.path.abspath(value) else: os.mkdir(value) return os.path.abspath(value) def add_output_dir_option(p): p.add_argument('-o', '--output-dir', type=_validate_output_dir_or_get_default, default=os.getcwd(), help="Output directory.") return p def _add_input_file(args_label, type_, help_): def _wrapper(p): p.add_argument(args_label, type=type_, help=help_) return p return _wrapper add_fasta_output = _add_input_file("fasta_out", str, "Path to output Fasta File") add_fasta_input = _add_input_file("fasta_in", validate_file, "Path to Input FASTA File") add_fastq_output = _add_input_file("fastq_out", str, "Path to output Fastq File") add_fastq_input = _add_input_file("fastq_in", validate_file, "Path to Input FASTQ File") add_fofn_input = _add_input_file("fofn_in", validate_fofn, "Path to Input FOFN (File of file names) File") add_fofn_output = _add_input_file("fofn_out", str, "Path to output FOFN.") add_report_output = _add_input_file("json_report", str, "Path to PacBio JSON Report") add_subread_input = _add_input_file("subread_ds", validate_file, "Path to PacBio Subread DataSet XML") add_ds_reference_input = _add_input_file("reference_ds", validate_file, "Path to PacBio Subread DataSet XML") def args_executer(args): """ :rtype int """ try: return_code = args.func(args) except Exception as e: log.error(e, exc_info=True) import sys traceback.print_exc(sys.stderr) if isinstance(e, IOError): return_code = 1 else: return_code = 2 return return_code def main_runner(argv, parser, exe_runner_func, setup_log_func, alog): """ Fundamental interface to commandline applications """ started_at = time.time() args = parser.parse_args(argv) # log.debug(args) # setup log _have_log_setup = False if hasattr(args, 'quiet') and args.quiet: setup_log_func(alog, level=logging.ERROR) elif hasattr(args, 'verbosity') and args.verbosity > 0: if args.verbosity >= 2: setup_log_func(alog, level=logging.DEBUG) else: setup_log_func(alog, level=logging.INFO) elif hasattr(args, 'debug') and args.debug: setup_log_func(alog, level=logging.DEBUG) else: alog.addHandler(logging.NullHandler()) log.debug(args) alog.info("Starting tool version {v}".format(v=parser.version)) rcode = exe_runner_func(args) run_time = time.time() - started_at _d = dict(r=rcode, s=run_time) alog.info("exiting with return code {r} in {s:.2f} sec.".format(**_d)) return rcode def main_runner_default(argv, parser, alog): # FIXME. 
This still has the old set_defaults(func=func) and # has the assumption that --debug has been assigned as an option # This is used for all the subparsers setup_log_func = setup_log return main_runner(argv, parser, args_executer, setup_log_func, alog) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/resolver.py0000644000000000000000000002157213035554276023346 0ustar rootroot"""Driver for creating a Resolved Tool Contract from a Tool Contract""" from collections import defaultdict import logging import os import uuid from pbcommand.models.common import (SymbolTypes, REGISTERED_FILE_TYPES, ResourceTypes) from pbcommand.models.tool_contract import (ResolvedToolContract, ToolContract, ResolvedToolContractTask, ResolvedScatteredToolContractTask, ResolvedGatherToolContractTask, ToolContractResolvedResource) log = logging.getLogger(__name__) class ToolContractError(BaseException): pass def __resolve_int_or_symbol(symbol_type, symbol_or_int, max_value): if isinstance(symbol_or_int, int): return min(symbol_or_int, max_value) elif symbol_or_int == symbol_type: return max_value else: raise TypeError("unsupported type for {s} '{t}".format(t=symbol_or_int, s=symbol_type)) def _resolve_nproc(nproc_int_or_symbol, max_nproc): return __resolve_int_or_symbol(SymbolTypes.MAX_NPROC, nproc_int_or_symbol, max_nproc) def _resolve_max_nchunks(nchunks_or_symbol, max_nchunks): return __resolve_int_or_symbol(SymbolTypes.MAX_NCHUNKS, nchunks_or_symbol, max_nchunks) def _resolve_options(tool_contract, tool_options): """ Resolve Task Options from :type tool_contract: ToolContract :type tool_options: dict :rtype: dict """ resolved_options = {} # Get and Validate resolved value. # TODO. None support should be removed. for option in tool_contract.task.options: # This hides whatever underlying JSON grossness remains value = tool_options.get(option.option_id, option.default) # Wrap in a try to communicate error messages with reasonable context try: # This expects the PacBioOption subclass to implement the # necessary validating function validated_option = option.validate_option(value) resolved_options[option.option_id] = validated_option except (KeyError, ValueError, IndexError, TypeError) as e: raise ToolContractError("Incompatible option types for {o}. " "Supplied {i}. Expected pacbio opt type '{t}' {e}".format( o=option.option_id, i=type(value), t=option.OPTION_TYPE_ID, e=str(e))) return resolved_options def _resolve_output_file(registry_d, file_type, output_file_type, root_output_dir): """ Resolved the Output File Type :type file_type: pbcommand.models.FileType :type output_file_type: pbcommand.models.OutputFileType :return: Resolved output file name """ def _get_fname(base, ext): idx = base, ext count = registry_d[idx] xs = "" if count == 0 else "-" + str(count) registry_d[idx] += 1 name = "".join([base, xs, ".", ext]) return os.path.join(root_output_dir, name) # FIXME. THIS NEED TO BE FUNDAMENTALLY FIXED and updated to use the spec # in the avro schema. 
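    # Illustrative example (values are hypothetical): with a default_name of
    # "filter" and a file extension of "fasta", the first resolved output is
    # <root_output_dir>/filter.fasta; a second output reusing the same
    # (base, ext) pair resolves to filter-1.fasta, because the per-task
    # registry counter appends "-<count>" for repeats.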
return _get_fname(output_file_type.default_name, file_type.ext) def _resolve_resource_types(resources, output_dir, root_tmp_dir): resolved_resources = [] def _add(rt_id, p): r = ToolContractResolvedResource(rt_id, p) resolved_resources.append(r) return r def _to_p(x): return os.path.join(root_tmp_dir, x) def _to_r(prefix, suffix=None): u = uuid.uuid4() name = "{x}-{u}".format(u=u, x=prefix) if suffix is not None: name += suffix return _to_p(name) # The names are not optimal, this would require more config for resource in resources: if resource == ResourceTypes.TMP_DIR: path = _to_r("pb-tmp") _add(resource, path) elif resource == ResourceTypes.TMP_FILE: _add(resource, _to_r("pb-tmp", "-file")) elif resource == ResourceTypes.LOG_FILE: u = uuid.uuid4() name = "{x}-{u}-log".format(u=u, x="pb-tmp") path = os.path.join(output_dir, name) _add(resource, path) else: raise ValueError("Unsupported Resource Type {x}".format(x=resource)) return resolved_resources def _resolve_output_files(output_file_types, root_output_dir): # store the files as {(base, ext): count} _outs_registry = defaultdict(lambda: 0) return [_resolve_output_file(_outs_registry, REGISTERED_FILE_TYPES[f.file_type_id], f, root_output_dir) for f in output_file_types] def _resolve_core(tool_contract, input_files, root_output_dir, max_nproc, tool_options, tmp_dir=None): """ tool_options are dict{id:value} of values to override defaults """ if len(input_files) != len(tool_contract.task.input_file_types): _d = dict(i=input_files, t=tool_contract.task.input_file_types) raise ToolContractError("Incompatible input types. Supplied {i}. Expected file types {t}".format(**_d)) output_files = _resolve_output_files(tool_contract.task.output_file_types, root_output_dir) resolved_options = _resolve_options(tool_contract, tool_options) nproc = _resolve_nproc(tool_contract.task.nproc, max_nproc) resolved_resources = _resolve_resource_types(tool_contract.task.resources, root_output_dir, tmp_dir) return output_files, resolved_options, nproc, resolved_resources def resolve_tool_contract(tool_contract, input_files, root_output_dir, root_tmp_dir, max_nproc, tool_options, is_distributable, log_level="INFO"): """ Convert a ToolContract into a Resolved Tool Contract. 
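Illustrative sketch (paths and values below are hypothetical; assumes tc is a
ToolContract instance loaded elsewhere):

    rtc = resolve_tool_contract(tc, ["/path/to/input.fasta"],
                                "/path/to/task-dir", "/tmp",
                                max_nproc=8, tool_options={},
                                is_distributable=False)
    # rtc.task.input_files, rtc.task.output_files, rtc.task.options and
    # rtc.task.nproc now hold concrete, resolved values
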
:param tool_contract: Tool Contract interface :param input_files: List of input files (must be consistent with the tool contract input file list (types are not enforced) :param max_nproc: Max number of processors :param tool_options: dict of overridden options :type input_files: list[String] :type max_nproc: int :type tool_contract: ToolContract :type tool_options: dict :rtype: ResolvedToolContract :return: A Resolved tool contract """ output_files, resolved_options, nproc, resources = _resolve_core(tool_contract, input_files, root_output_dir, max_nproc, tool_options, root_tmp_dir) is_distributed = False if is_distributable and tool_contract.task.is_distributed: is_distributed = True task = ResolvedToolContractTask(tool_contract.task.task_id, is_distributed, input_files, output_files, resolved_options, nproc, resources, log_level=log_level) return ResolvedToolContract(task, tool_contract.driver) def resolve_scatter_tool_contract(tool_contract, input_files, root_output_dir, root_tmp_dir, max_nproc, tool_options, max_nchunks, chunk_keys, is_distributable, log_level="INFO"): output_files, resolved_options, nproc, resources = _resolve_core(tool_contract, input_files, root_output_dir, max_nproc, tool_options, tmp_dir=root_tmp_dir) resolved_max_chunks = _resolve_max_nchunks(tool_contract.task.max_nchunks, max_nchunks) task = ResolvedScatteredToolContractTask(tool_contract.task.task_id, tool_contract.task.is_distributed and is_distributable, input_files, output_files, resolved_options, nproc, resources, resolved_max_chunks, chunk_keys, log_level=log_level) return ResolvedToolContract(task, tool_contract.driver) def resolve_gather_tool_contract(tool_contract, input_files, root_output_dir, root_tmp_dir, max_nproc, tool_options, chunk_key, is_distributable, log_level="INFO"): output_files, resolved_options, nproc, resources = _resolve_core(tool_contract, input_files, root_output_dir, max_nproc, tool_options, tmp_dir=root_tmp_dir) task = ResolvedGatherToolContractTask(tool_contract.task.task_id, tool_contract.task.is_distributed and is_distributable, input_files, output_files, resolved_options, nproc, resources, chunk_key, log_level=log_level) return ResolvedToolContract(task, tool_contract.driver) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/validators.py0000644000000000000000000001201713035554276023647 0ustar rootrootimport os import logging import functools from pbcommand.utils import nfs_exists_check from pbcommand.pb_io import load_report_from_json log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) # squash annoying no Handler msg def trigger_nfs_refresh(ff): # keeping this for backward compatibility return nfs_exists_check(ff) def _validate_resource(func, resource): """Validate the existence of a file/dir""" _ = nfs_exists_check(resource) if func(resource): return os.path.abspath(resource) else: raise IOError("Unable to find '{f}'".format(f=resource)) validate_file = functools.partial(_validate_resource, os.path.isfile) validate_dir = functools.partial(_validate_resource, os.path.isdir) validate_output_dir = functools.partial(_validate_resource, os.path.isdir) def validate_or(f1, f2, error_msg): """ Apply Valid functions f1, then f2 (if failure occurs) :param error_msg: Default message to print """ def wrapper(path): try: return f1(path) except Exception: try: return f2(path) except Exception as e: log.error("{m} {p} \n. 
{e}".format(m=error_msg, p=path, e=repr(e))) raise return wrapper def validate_report(report_file_name): """ Raise ValueError if report contains path seps """ if not os.path.basename(report_file_name) == report_file_name: raise ValueError("Path separators are not allowed: {r}".format(r=report_file_name)) return report_file_name def validate_fofn(fofn): """Validate existence of FOFN and files within the FOFN. :param fofn: (str) Path to File of file names. :raises: IOError if any file is not found. :return: (str) abspath of the input fofn """ _ = nfs_exists_check(fofn) if os.path.isfile(fofn): file_names = fofn_to_files(os.path.abspath(fofn)) log.debug("Found {n} files in FOFN {f}.".format(n=len(file_names), f=fofn)) return os.path.abspath(fofn) else: raise IOError("Unable to find {f}".format(f=fofn)) def fofn_to_files(fofn): """Util func to convert a bas/bax fofn file to a list of bas/bax files.""" _ = nfs_exists_check(fofn) if os.path.exists(fofn): with open(fofn, 'r') as f: bas_files = {line.strip() for line in f.readlines()} for bas_file in bas_files: if not os.path.isfile(bas_file): # try one more time to find the file by # performing an NFS refresh found = nfs_exists_check(bas_file) if not found: raise IOError("Unable to find bas/bax file '{f}'".format(f=bas_file)) return list(bas_files) else: raise IOError("Unable to find FOFN {f}".format(f=fofn)) def validate_report(file_name): e = [] base_path = os.path.dirname(file_name) r = load_report_from_json(file_name) if r.title is None: e.append("Report {i} is missing a title".format(i=r.id)) for t in r.tables: if t.title is None: e.append("Table {r.t} is missing a title".format(r=r.id, t=t.id)) for col in t.columns: if col.header is None: e.append("Column {r.t.c} is missing a header".format( r=r.id, t=t.id, c=col.id)) lengths = set([len(col.values) for col in t.columns]) if len(lengths) != 1: e.append("Inconsistent column sizes in table {r.t}: {s}".format( r=r.id, t=t.id, s=",".join( [str(x) for x in sorted(list(lengths))]))) for pg in r.plotGroups: if pg.title is None: e.append("Plot group {r.g} is missing a title".format( r=r.id, g=pg.id)) for plot in pg.plots: #if plot.caption is None: # raise ValueError("Plot {r.g.p} is missing a caption".format( # r=r.id, g=pg.id, p=plot.id)) if plot.image is None: e.append("Plot {r.g.p} does not have an image".format( r=r.id, g=pg.id, p=plot.id)) img_path = os.path.join(base_path, plot.image) if not os.path.exists(img_path): e.append("The plot image {f} does not exist".format(f=img_path)) if plot.thumbnail is None: pass #raise ValueError("Plot {r.g.p} does not have an thumbnail image".format( # r=r.id, g=pg.id, p=plot.id)) else: thumbnail = os.path.join(base_path, plot.thumbnail) if not os.path.exists(thumbnail): e.append("The thumbnail image {f} does not exist".format(f=img_path)) if pg.thumbnail is not None: thumbnail = os.path.join(base_path, pg.thumbnail) if not os.path.exists(thumbnail): e.append("The thumbnail image {f} does not exist".format(f=img_path)) if len(e) > 0: raise ValueError("\n".join(e)) return r pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/0000755000000000000000000000000013035554276022407 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/conditions.py0000644000000000000000000000203213035554276025127 0ustar rootroot"""Working doc for Condition data models https://gist.github.com/mpkocher/347f9ae9092c24888e1c702a916276c2 """ from collections import namedtuple class ReseqCondition(namedtuple("ReseqCondition", "cond_id subreadset 
alignmentset referenceset")): def to_dict(self): return {"condId": self.cond_id, "subreadset": self.subreadset, "alignmentset": self.alignmentset, "referenceset": self.referenceset} @staticmethod def from_dict(d): def _f(k): # sloppy return d[k].encode('ascii', 'ignore') return ReseqCondition(_f('condId'), _f('subreadset'), _f('alignmentset'), _f('referenceset')) class ReseqConditions(namedtuple("ReseqConditions", "conditions")): # leave out the pipeline id. Not sure if this is necessary def to_dict(self): return {"conditions": [c.to_dict() for c in self.conditions]} @staticmethod def from_dict(d): return ReseqConditions([ReseqCondition.from_dict(x) for x in d['conditions']]) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/tool_contract.py0000644000000000000000000003325313035554276025641 0ustar rootroot"""Common models for Tool Contract and Resolved Tool Contract Author: Michael Kocher """ import abc from collections import OrderedDict import types import datetime import pbcommand from .common import TaskTypes, ResourceTypes, REGISTERED_FILE_TYPES __version__ = pbcommand.get_version() class MalformedToolContractError(ValueError): pass class MalformedResolvedToolContractError(ValueError): pass def _validate_type(value, type_or_types): return isinstance(value, type_or_types) def _validate_or_raise(value, type_or_types): if not _validate_type(value, type_or_types): _d = dict(x=value, t=type(value), s=type_or_types) raise TypeError("Unsupported type for {x} {t}. Expected types {s}".format(**_d)) return value def _validate_list_of_or_raise(a_list, t): """Validates a List of items of a specific type""" if not isinstance(a_list, (list, tuple)): raise TypeError("Expected list, got {t}".format(t=type(a_list))) for item in a_list: if not isinstance(item, t): raise TypeError("Expected type {t}, Got {x}".format(t=t, x=type(item))) return a_list def _is_empty_list(alist): return len(alist) == 0 def __validate_ioputs(msg, alist): if _is_empty_list(alist): raise MalformedToolContractError(msg) return True def validate_tool_contract(tc): """:type tc: ToolContract Expand this out. """ __validate_ioputs("Inputs must have at least 1 input.", tc.task.input_file_types) __validate_ioputs("Outputs must have at least 1 output", tc.task.output_file_types) for oft in tc.task.output_file_types: file_type = REGISTERED_FILE_TYPES[oft.file_type_id] if oft.default_name.endswith(file_type.ext): raise ValueError( "File {i} default name already has extension: {n}".format( i=oft.label, n=oft.default_name)) return tc class _IOFileType(object): __metaclass__ = abc.ABCMeta def __init__(self, file_type_id, label, display_name, description): self.file_type_id = file_type_id self.label = label self.display_name = display_name # short description self.description = description def __repr__(self): _d = dict(i=self.label, n=self.display_name, f=self.file_type_id, k=self.__class__.__name__) return "<{k} {f} {i} >".format(**_d) @abc.abstractmethod def to_dict(self): raise NotImplementedError class InputFileType(_IOFileType): def to_dict(self): return dict(file_type_id=self.file_type_id, id=self.label, title=self.display_name, description=self.description) class OutputFileType(_IOFileType): def __init__(self, file_type_id, label, display_name, description, default_name): super(OutputFileType, self).__init__(file_type_id, label, display_name, description) # Default name of the output file. Should be specified as (base, ext) # but "base.ext" is also supported. 
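    # Illustrative note (hypothetical values): a default_name of "filter"
    # paired with FileTypes.FASTA resolves to "filter.fasta" at resolve time;
    # passing "filter.fasta" here would make validate_tool_contract() raise,
    # since the default name must not already carry the file type's extension.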
This should go away self.default_name = default_name def to_dict(self): return dict(file_type_id=self.file_type_id, id=self.label, title=self.display_name, description=self.description, default_name=self.default_name) class ToolContractResolvedResource(object): def __init__(self, resource_type_id, path): assert resource_type_id in ResourceTypes.ALL() self.type_id = resource_type_id self.path = path def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.type_id, p=self.path) return "<{k} {i} path:{p} >".format(**_d) @staticmethod def from_d(d): return ToolContractResolvedResource(d['resource_type'], d['path']) def to_dict(self): return dict(resource_type=self.type_id, path=self.path) class ToolDriver(object): def __init__(self, driver_exe, env=None, serialization='json'): """ :param driver_exe: Path to the driver :param env: path to env to be sourced before it's run? :return: """ self.driver_exe = driver_exe self.env = {} if env is None else env # 'avro' or 'json' self.serialization = serialization def __repr__(self): _d = dict(k=self.__class__.__name__, e=self.driver_exe) return "<{k} driver:{e} >".format(**_d) def to_dict(self): return dict(exe=self.driver_exe, env=self.env, serialization=self.serialization) class ToolContractTask(object): TASK_TYPE_ID = TaskTypes.STANDARD def __init__(self, task_id, name, description, version, is_distributed, input_types, output_types, options, nproc, resources): """ Core metadata for a commandline task :param task_id: Global id to reference your tool in a pipeline :type task_id: str :param name: Display name of your :param description: Short description of your tool :param version: semantic style version string :param is_distributed: If the task will be run locally or not :param is_distributed: bool :param input_types: list[FileType] :param output_types: :param options: list of PacBioOption instances :param nproc: :param resources: :type tool_options: list[PacBioOption] """ self.task_id = task_id self.name = name self.description = description self.version = version self.is_distributed = is_distributed self.input_file_types = input_types self.output_file_types = output_types # This needs to be list self.options = _validate_or_raise(options, types.ListType) self.nproc = nproc # List of ResourceTypes self.resources = resources def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.task_id, t=self.is_distributed, n=self.name) return "<{k} id:{i} {n} >".format(**_d) def to_dict(self): opts = [x.to_dict() for x in self.options] # for debugging, but creates too much chatter for production # now = " " + str(datetime.datetime.now()) now = "" comment = "Created by pbcommand {v}".format(v=__version__, n=now) + str(now) _t = dict(tool_contract_id=self.task_id, input_types=[i.to_dict() for i in self.input_file_types], output_types=[i.to_dict() for i in self.output_file_types], task_type=self.TASK_TYPE_ID, is_distributed=self.is_distributed, name=self.name, description=self.description, schema_options=opts, nproc=self.nproc, resource_types=self.resources, _comment=comment) return _t class ScatterToolContractTask(ToolContractTask): TASK_TYPE_ID = TaskTypes.SCATTERED def __init__(self, task_id, name, description, version, is_distributed, input_types, output_types, tool_options, nproc, resources, chunk_keys, max_nchunks): """Scatter tasks have a special output signature of [FileTypes.CHUNK] The chunk keys are the expected to be written to the chunk.json file """ super(ScatterToolContractTask, self).__init__(task_id, name, description, version, 
is_distributed, input_types, output_types, tool_options, nproc, resources) self.chunk_keys = chunk_keys # int or $max_chunks symbol self.max_nchunks = max_nchunks def to_dict(self): s = super(ScatterToolContractTask, self).to_dict() s['chunk_keys'] = self.chunk_keys s['nchunks'] = self.max_nchunks return s class GatherToolContractTask(ToolContractTask): """Gather tasks have special input type [FileTypes.CHUNK]""" TASK_TYPE_ID = TaskTypes.GATHERED # not completely sure how to handle chunk-keys. class ToolContract(object): # Calling to_dict will always generate a compliant version with this # spec WRITER_SCHEMA_VERSION = "2.0.0" def __init__(self, task, driver, schema_version=WRITER_SCHEMA_VERSION): """ :type task: ToolContractTask | ScatterToolContractTask | GatherToolContractTask :type driver: ToolDriver :param task: :param driver: :return: """ self.task = task self.driver = driver self.schema_version = schema_version def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.task.task_id, t=self.task.is_distributed) return "<{k} id:{i} >".format(**_d) def to_dict(self): validate_tool_contract(self) _t = self.task.to_dict() _d = dict(version=self.task.version, tool_contract_id=self.task.task_id, driver=self.driver.to_dict(), tool_contract=_t, schema_version=self.WRITER_SCHEMA_VERSION) return _d def _get_resource_by_type(rt, resources): xs = [] for r in resources: if r.type_id == rt: xs.append(r) return xs class ResolvedToolContractTask(object): # The interface is the same, but the types are "resolved" and have a # different # structure TASK_TYPE_ID = TaskTypes.STANDARD def __init__(self, task_id, is_distributed, input_files, output_files, options, nproc, resources, log_level="INFO"): self.task_id = task_id self.is_distributed = is_distributed self.input_files = input_files self.output_files = output_files self.options = options self.nproc = nproc self.resources = resources self.log_level = log_level @property def tmpdir_resources(self): return _get_resource_by_type(ResourceTypes.TMP_DIR, self.resources) @property def tmpfile_resources(self): return _get_resource_by_type(ResourceTypes.TMP_FILE, self.resources) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.task_id, t=self.is_distributed) return "<{k} id:{i} >".format(**_d) def to_dict(self): comment = "Created by pbcommand v{v}".format(v=pbcommand.get_version()) tc = dict(input_files=self.input_files, output_files=self.output_files, task_type=self.TASK_TYPE_ID, is_distributed=self.is_distributed, tool_contract_id=self.task_id, nproc=self.nproc, resources=[r.to_dict() for r in self.resources], options=self.options, _comment=comment, log_level=self.log_level) return tc class ResolvedScatteredToolContractTask(ResolvedToolContractTask): TASK_TYPE_ID = TaskTypes.SCATTERED def __init__(self, task_id, is_distributed, input_files, output_files, options, nproc, resources, max_nchunks, chunk_keys, log_level="INFO"): super(ResolvedScatteredToolContractTask, self).__init__(task_id, is_distributed, input_files, output_files, options, nproc, resources, log_level) self.max_nchunks = max_nchunks # these can be used to verified the output chunk.json # after the task has been run self.chunk_keys = chunk_keys def to_dict(self): d = super(ResolvedScatteredToolContractTask, self).to_dict() d['max_nchunks'] = self.max_nchunks d['chunk_keys'] = self.chunk_keys return d class ResolvedGatherToolContractTask(ResolvedToolContractTask): TASK_TYPE_ID = TaskTypes.GATHERED def __init__(self, task_id, is_distributed, input_files, output_files, 
options, nproc, resources, chunk_key, log_level="INFO"): """ The chunk key is used in the pluck specific chunk values from PipelineChunks. This makes gather tasks (i.e., GffGather) generalized. """ super(ResolvedGatherToolContractTask, self).__init__(task_id, is_distributed, input_files, output_files, options, nproc, resources, log_level) self.chunk_key = chunk_key def to_dict(self): d = super(ResolvedGatherToolContractTask, self).to_dict() d['chunk_key'] = self.chunk_key return d class ResolvedToolContract(object): def __init__(self, task, driver): """ :type task: ResolvedToolContractTask | ResolvedScatteredToolContractTask | ResolvedGatherToolContractTask :type driver: ToolDriver :param task: :param driver: :return: """ self.task = task self.driver = driver def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.task.task_id, t=self.task.is_distributed) return "<{k} id:{i} >".format(**_d) def to_dict(self): return dict(resolved_tool_contract=self.task.to_dict(), driver=self.driver.to_dict()) class PipelinePreset(object): def __init__(self, options, task_options, pipeline_id, preset_id, name, description): self.options = options self.task_options = task_options self.pipeline_id = pipeline_id self.preset_id = preset_id self.name = name self.description = description def __repr__(self): _d = dict(k=self.__class__.__name__) #self.to_dict() return "<{k} >".format(**_d) def to_dict(self): return OrderedDict([ ("pipelineId", self.pipeline_id), ("presetId", self.preset_id), ("name", self.name), ("description", self.description), ("options", dict(self.options)), ("taskOptions", dict(self.task_options))]) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/__init__.py0000644000000000000000000000143413035554276024522 0ustar rootrootfrom .common import (FileType, FileTypes, TaskOptionTypes, DataSetFileType, DataSetMetaData, TaskTypes, ResourceTypes, SymbolTypes, PipelineChunk, DataStoreFile, DataStore, DataStoreViewRule, PipelineDataStoreViewRules, BasePacBioOption, PacBioIntOption, PacBioBooleanOption, PacBioStringOption, PacBioFloatOption, PacBioIntChoiceOption, PacBioFloatChoiceOption, PacBioStringChoiceOption) from .tool_contract import * from .parser import (get_pbparser, get_gather_pbparser, get_scatter_pbparser, PbParser) from .conditions import (ReseqCondition, ReseqConditions) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/common.py0000644000000000000000000010405713035554276024260 0ustar rootroot"""Core models used in the ToolContract and Resolved ToolContract Large parts of this are pulled from pbsmrtpipe. Author: Michael Kocher """ import json import logging import os import re import types import warnings import functools import datetime from collections import namedtuple log = logging.getLogger(__name__) REGISTERED_FILE_TYPES = {} # Light weight Dataset metatadata. 
Use pbcore for full dataset functionality DataSetMetaData = namedtuple("DataSetMetaData", 'uuid metatype') class PacBioNamespaces(object): # File Types # PBSMRTPIPE_FILE_PREFIX = 'pbsmrtpipe.files' # NEW File Type Identifier style Prefix NEW_PBSMRTPIPE_FILE_PREFIX = "PacBio.FileTypes" # New DataSet Identifier Prefix DATASET_FILE_PREFIX = "PacBio.DataSet" PB_INDEX = "PacBio.Index" # Task Ids PBSMRTPIPE_TASK_PREFIX = 'pbsmrtpipe.tasks' PB_TASK_TYPES = 'pbsmrtpipe.task_types' # Task Options PBSMRTPIPE_TASK_OPTS_PREFIX = 'pbsmrtpipe.task_options' # Workflow Level Options PBSMRTPIPE_OPTS_PREFIX = 'pbsmrtpipe.options' # Constants PBSMRTPIPE_CONSTANTS_PREFIX = 'pbsmrtpipe.constants' # Pipelines PBSMRTPIPE_PIPELINES = "pbsmrtpipe.pipelines" # Option Types PBSMRTPIPE_OPTS_TYPE = "pbsmrtpipe.option_types" def __to_type(prefix, name): return ".".join([prefix, name]) to_constant_ns = functools.partial(__to_type, PacBioNamespaces.PBSMRTPIPE_CONSTANTS_PREFIX) to_file_ns = functools.partial(__to_type, PacBioNamespaces.NEW_PBSMRTPIPE_FILE_PREFIX) to_ds_ns = functools.partial(__to_type, PacBioNamespaces.DATASET_FILE_PREFIX) to_task_option_ns = functools.partial(__to_type, PacBioNamespaces.PBSMRTPIPE_TASK_OPTS_PREFIX) to_task_ns = functools.partial(__to_type, PacBioNamespaces.PBSMRTPIPE_TASK_PREFIX) to_task_types_ns = functools.partial(__to_type, PacBioNamespaces.PB_TASK_TYPES) to_workflow_option_ns = functools.partial(__to_type, PacBioNamespaces.PBSMRTPIPE_OPTS_PREFIX) to_pipeline_ns = functools.partial(__to_type, PacBioNamespaces.PBSMRTPIPE_PIPELINES) to_index_ns = functools.partial(__to_type, PacBioNamespaces.PB_INDEX) to_opt_type_ns = functools.partial(__to_type, PacBioNamespaces.PBSMRTPIPE_OPTS_TYPE) class TaskTypes(object): # This is really TC types STANDARD = to_task_types_ns("standard") SCATTERED = to_task_types_ns("scattered") GATHERED = to_task_types_ns("gathered") class TaskOptionTypes(object): """Core Task Option type id type""" # FIXME(mpkocher)(2016-7-16) This should be more well defined, e.g., int32 and use the same id format of # For example, pacbio.option_types.int32 # Because of the Avro schema restrictions and to keep the keys short # in name, we'll use a dot let format. The legacy format used # pbsmrtpipe.option_types.* as the root namespace INT = "integer" BOOL = "boolean" STR = "string" FLOAT = "float" # Choice type Options CHOICE_STR = "choice_string" CHOICE_INT = "choice_integer" CHOICE_FLOAT = "choice_float" @classmethod def ALL(cls): """Return a set of all Task Option Types""" return {cls.INT, cls.BOOL, cls.STR, cls.FLOAT, cls.CHOICE_STR, cls.CHOICE_INT, cls.CHOICE_FLOAT} @classmethod def _raise_value_error(cls, value, allowed, option_type_name): raise ValueError("Incompatible task {o} option type id '{s}'. 
" "Allowed values {v}".format(o=option_type_name, s=value, v=",".join(allowed))) @classmethod def ALL_SIMPLE(cls): """Returns a set of 'simple' task option types (e.g., boolean, string, int, float)""" return {cls.STR, cls.BOOL, cls.INT, cls.FLOAT} @classmethod def from_simple_str(cls, sx): """Validates a string is a validate task option type id or raise ValueError :raises ValueError """ if sx in cls.ALL_SIMPLE(): return sx else: cls._raise_value_error(sx, cls.ALL_SIMPLE(), "simple") @classmethod def ALL_CHOICES(cls): """Returns a set of choice task option types""" return {cls.CHOICE_INT, cls.CHOICE_FLOAT, cls.CHOICE_STR} @classmethod def is_choice(cls, sx): return sx in cls.ALL_CHOICES() @classmethod def from_choice_str(cls, sx): """Validates and returns a task choice option type or raises ValueError""" if sx in cls.ALL_CHOICES(): return sx else: cls._raise_value_error(sx, cls.ALL_CHOICES(), "choice") @classmethod def from_str(cls, sx): """Validates and returns a valid type option type id or raises ValueError, :note: For legacy reasons, "number" will be mapped to "float" """ # FIXME, Legacy fix, "number" appears to mean "float"? if sx == "number": sx = TaskOptionTypes.FLOAT if sx in TaskOptionTypes.ALL(): return sx else: cls._raise_value_error(sx, cls.ALL(), "") class SymbolTypes(object): """ *Symbols* that are understood during resolving, such as max number of processors, Max Chunks. Used when defining a Tool Contract """ MAX_NPROC = '$max_nproc' MAX_NCHUNKS = '$max_nchunks' TASK_TYPE = '$task_type' RESOLVED_OPTS = '$ropts' SCHEMA_OPTS = '$opts_schema' OPTS = '$opts' NCHUNKS = '$nchunks' NPROC = '$nproc' class ResourceTypes(object): """ Resources such as tmp dirs and files, log files. Used when defining a Tool Contract """ TMP_DIR = '$tmpdir' TMP_FILE = '$tmpfile' LOG_FILE = '$logfile' # tasks can write output to this directory OUTPUT_DIR = '$outputdir' # Not sure this is a good idea # TASK_DIR = '$taskdir' @classmethod def ALL(cls): return cls.TMP_DIR, cls.TMP_FILE, cls.LOG_FILE, cls.OUTPUT_DIR @classmethod def is_tmp_resource(cls, name): return name in (cls.TMP_FILE, cls.TMP_DIR) @classmethod def is_valid(cls, attr_name): return attr_name in cls.ALL() class _RegisteredFileType(type): def __init__(cls, name, bases, dct): super(_RegisteredFileType, cls).__init__(name, bases, dct) def __call__(cls, *args, **kwargs): if len(args) != 4: log.error(args) raise ValueError("Incorrect initialization for {c}".format(c=cls.__name__)) file_type_id, base_name, file_ext, mime_type = args file_type = REGISTERED_FILE_TYPES.get(file_type_id, None) if file_type is None: file_type = super(_RegisteredFileType, cls).__call__(*args) #log.debug("Registering file type '{i}'".format(i=file_type_id)) REGISTERED_FILE_TYPES[file_type_id] = file_type else: # print warning if base name, ext, mime type aren't the same attrs_names = [('base_name', base_name), ('ext', file_ext), ('mime_type', mime_type)] for attrs_name, value in attrs_names: v = getattr(file_type, attrs_name) if v != value: _msg = "Attempting to register a file with a different '{x}' -> {v} (expected {y})".format(x=attrs_name, v=v, y=value) log.warn(_msg) warnings.warn(_msg) return file_type class FileType(object): __metaclass__ = _RegisteredFileType def __init__(self, file_type_id, base_name, ext, mime_type): """ Core File Type data model :param file_type_id: unique file string :param base_name: default base name of the file (without extension) :param ext: file extension :param mime_type: file mimetype :return: """ self.file_type_id = file_type_id 
self.base_name = base_name self.ext = ext self.mime_type = mime_type if file_type_id not in REGISTERED_FILE_TYPES: REGISTERED_FILE_TYPES[file_type_id] = self @property def default_name(self): """ Default name of file alias for base_name""" return self.base_name # ".".join([self.base_name, self.ext]) def __eq__(self, other): if isinstance(other, self.__class__): if self.file_type_id == other.file_type_id: if self.base_name == other.base_name: if self.ext == other.ext: return True return False def __ne__(self, other): return not self.__eq__(other) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.file_type_id, n=self.default_name) return "<{k} id={i} name={n} >".format(**_d) class DataSetFileType(FileType): """File types that are a DataSet Type""" pass class MimeTypes(object): """Supported Mime types""" JSON = 'application/json' TXT = 'text/plain' CSV = 'text/csv' XML = 'application/xml' BINARY = 'application/octet-stream' PICKLE = 'application/python-pickle' GZIP = 'application/x-gzip' ZIP = 'application/zip' class FileTypes(object): """Registry of all PacBio Files types This needs to be cleaned up and solidified. The old pre-SA3 file types need to be deleted. """ # generic Txt file TXT = FileType(to_file_ns('txt'), 'file', 'txt', MimeTypes.TXT) # Generic Log file LOG = FileType(to_file_ns('log'), 'file', 'log', MimeTypes.TXT) # Config file CFG = FileType(to_file_ns('cfg'), 'config', 'cfg', MimeTypes.TXT) # THIS NEEDS TO BE CONSISTENT with scala code. When the datastore # is written to disk the file type id's might be translated to # the DataSet style file type ids. REPORT = FileType(to_file_ns('JsonReport'), "report", "json", MimeTypes.JSON) # this will go away soon in favor of using a more type based model to # distinguish between scatter and gather file types CHUNK = FileType(to_file_ns("CHUNK"), "chunk", "json", MimeTypes.JSON) GCHUNK = FileType(to_file_ns("GCHUNK"), 'gather_chunk', "json", MimeTypes.JSON) SCHUNK = FileType(to_file_ns("SCHUNK"), "scatter_chunk", "json", MimeTypes.JSON) FASTA = FileType(to_file_ns('Fasta'), "file", "fasta", MimeTypes.TXT) FASTQ = FileType(to_file_ns('Fastq'), "file", "fastq", MimeTypes.TXT) # Not sure this should be a special File Type? 
INPUT_XML = FileType(to_file_ns('input_xml'), "input", "xml", MimeTypes.XML) FOFN = FileType(to_file_ns("generic_fofn"), "generic", "fofn", MimeTypes.TXT) MOVIE_FOFN = FileType(to_file_ns('movie_fofn'), "movie", "fofn", MimeTypes.TXT) RGN_FOFN = FileType(to_file_ns('rgn_fofn'), "region", "fofn", MimeTypes.TXT) RS_MOVIE_XML = FileType(to_file_ns("rs_movie_metadata"), "file", "rs_movie.metadata.xml", MimeTypes.XML) REF_ENTRY_XML = FileType(to_file_ns('reference_info_xml'), "reference.info.xml", "xml", MimeTypes.XML) ALIGNMENT_CMP_H5 = FileType(to_file_ns('alignment_cmp_h5'), "alignments", "cmp.h5", MimeTypes.BINARY) # I am not sure this should be a first class file BLASR_M4 = FileType(to_file_ns('blasr_file'), 'blasr', 'm4', MimeTypes.TXT) BAM = FileType(to_file_ns('bam'), "alignments", "bam", MimeTypes.BINARY) BAMBAI = FileType(to_file_ns('bam_bai'), "alignments", "bam.bai", MimeTypes.BINARY) BED = FileType(to_file_ns('bed'), "file", "bed", MimeTypes.TXT) SAM = FileType(to_file_ns('sam'), "alignments", "sam", MimeTypes.BINARY) VCF = FileType(to_file_ns('vcf'), "file", "vcf", MimeTypes.TXT) GFF = FileType(to_file_ns('gff'), "file", "gff", MimeTypes.TXT) BIGWIG = FileType(to_file_ns('bigwig'), "annotations", "bw", MimeTypes.BINARY) CSV = FileType(to_file_ns('csv'), "file", "csv", MimeTypes.CSV) XML = FileType(to_file_ns('xml'), "file", "xml", 'application/xml') # Generic Json File JSON = FileType(to_file_ns("json"), "file", "json", MimeTypes.JSON) # Generic H5 File H5 = FileType(to_file_ns("h5"), "file", "h5", MimeTypes.BINARY) # Generic Python pickle XXX EVIL PICKLE = FileType(to_file_ns("pickle"), "file", "pickle", MimeTypes.PICKLE) # GZIPped archive GZIP = FileType(to_file_ns("gzip"), "file", "gz", MimeTypes.GZIP) ZIP = FileType(to_file_ns("zip"), "file", "zip", MimeTypes.ZIP) # ******************* NEW SA3 File Types ******************** # DataSet Types. The default file names should have well-defined agreed # upon format. See what Dave did for the bam files. # https://github.com/PacificBiosciences/PacBioFileFormats DS_SUBREADS_H5 = DataSetFileType(to_ds_ns("HdfSubreadSet"), "file", "hdfsubreadset.xml", MimeTypes.XML) DS_SUBREADS = DataSetFileType(to_ds_ns("SubreadSet"), "file", "subreadset.xml", MimeTypes.XML) DS_CCS = DataSetFileType(to_ds_ns("ConsensusReadSet"), "file", "consensusreadset.xml", MimeTypes.XML) DS_REF = DataSetFileType(to_ds_ns("ReferenceSet"), "file", "referenceset.xml", MimeTypes.XML) DS_ALIGN = DataSetFileType(to_ds_ns("AlignmentSet"), "file", "alignmentset.xml", MimeTypes.XML) DS_CONTIG = DataSetFileType(to_ds_ns("ContigSet"), "file", "contigset.xml", MimeTypes.XML) DS_BARCODE = DataSetFileType(to_ds_ns("BarcodeSet"), "file", "barcodeset.xml", MimeTypes.XML) DS_ALIGN_CCS = DataSetFileType(to_ds_ns("ConsensusAlignmentSet"), "file", "consensusalignmentset.xml", MimeTypes.XML) DS_GMAP_REF = DataSetFileType(to_ds_ns("GmapReferenceSet"), "file", "gmapreferenceset.xml", MimeTypes.XML) # PacBio Defined Formats # **** Index Files # ReferenceSet specific I_SAM = FileType(to_index_ns("SamIndex"), "file", "sam.index", MimeTypes.BINARY) I_SAW = FileType(to_index_ns("SaWriterIndex"), "file", "sa", MimeTypes.BINARY) # SMRT VIew specific files I_INDEXER = FileType(to_index_ns("Indexer"), "file", "fasta.index", MimeTypes.TXT) I_FCI = FileType(to_index_ns("FastaContigIndex"), "file", "fasta.contig.index", MimeTypes.TXT) # PacBio BAM pbi I_PBI = FileType(to_index_ns("PacBioIndex"), "file", "pbi", MimeTypes.BINARY) # This is duplicated from the old pre-DS era models. 
see BAMBAI I_BAI = FileType(to_index_ns("BamIndex"), "file", "bam.bai", MimeTypes.BINARY) # Fasta type files FASTA_BC = FileType("PacBio.BarcodeFile.BarcodeFastaFile", "file", "barcode.fasta", MimeTypes.TXT) # No ':' or '"' in the id FASTA_REF = FileType("PacBio.ReferenceFile.ReferenceFastaFile", "file", "pbreference.fasta", MimeTypes.TXT) CONTIG_FA = FileType("PacBio.ContigFile.ContigFastaFile", "file", "contig.fasta", MimeTypes.TXT) # Adapter Fasta File From PPA FASTA_ADAPTER = FileType("PacBio.SubreadFile.AdapterFastaFile", "file", "adapters.fasta", MimeTypes.TXT) FASTA_CONTROL = FileType("PacBio.SubreadFile.ControlFastaFile", "file", "control.fasta", MimeTypes.TXT) # BAM dialects BAM_ALN = FileType("PacBio.AlignmentFile.AlignmentBamFile", "file", "alignment.bam", MimeTypes.BINARY) BAM_SUB = FileType("PacBio.SubreadFile.SubreadBamFile", "file", "subread.bam", MimeTypes.BINARY) BAM_CCS = FileType("PacBio.ConsensusReadFile.ConsensusReadBamFile", "file", "ccs.bam", MimeTypes.BINARY) BAM_CCS_ALN = FileType("PacBio.AlignmentFile.ConsensusAlignmentBamFile", "file", "ccs_align.bam", MimeTypes.BINARY) # MK TODO. Add remaining SubreadSet files types, Scraps, HqRegion, etc.. BAZ = FileType("PacBio.ReadFile.BazFile", "file", "baz", MimeTypes.BINARY) TRC = FileType("PacBio.ReadFile.TraceFile", "file", "trc", MimeTypes.BINARY) PLS = FileType("PacBio.ReadFile.PulseFile", "file", "pls", MimeTypes.BINARY) # RS era BAX = FileType("PacBio.SubreadFile.BaxFile", "file", "bax.h5", MimeTypes.BINARY) # sts.xml STS_XML = FileType("PacBio.SubreadFile.ChipStatsFile", "file", "sts.xml", MimeTypes.XML) STS_H5 = FileType("PacBio.SubreadFile.ChipStatsH5File", "file", "sts.h5", MimeTypes.BINARY) # Resequencing Conditions File Format COND_RESEQ = FileType(to_file_ns("COND_RESEQ"), "file", "conditions-reseq.json", MimeTypes.JSON) @staticmethod def is_valid_id(file_type_id): return file_type_id in REGISTERED_FILE_TYPES @staticmethod def ALL_DATASET_TYPES(): return {i: f for i, f in REGISTERED_FILE_TYPES.iteritems() if isinstance(f, DataSetFileType)} @staticmethod def ALL(): return REGISTERED_FILE_TYPES def _get_timestamp_or_now(path, func): if os.path.exists(path): return func(path) else: return datetime.datetime.now() class DataStoreFile(object): def __init__(self, uuid, source_id, type_id, path, is_chunked=False, name="", description=""): """ :param uuid: UUID of the datstore file :param source_id: source id of the DataStore file :param type_id: File Type id of :param path: Absolute path to the datastore file :param is_chunked: is the datastore file a "chunked" file from a scatter/chunking task :param name: Display name of datastore file :param description: Description of the datastore file """ # adding this for consistency. In the scala code, the unique id must be # a uuid format self.uuid = uuid # this must globally unique. 
This is used to provide context to where # the file originated from (i.e., the tool author self.file_id = source_id # Consistent with a value in FileTypes self.file_type_id = type_id self.path = path # FIXME(mkocher)(2016-2-23): This is probably not the best model self.file_size = os.path.getsize(path) if os.path.exists(path) else 0 self.created_at = _get_timestamp_or_now(path, lambda px: datetime.datetime.fromtimestamp(os.path.getctime(px))) self.modified_at = _get_timestamp_or_now(path, lambda px: datetime.datetime.fromtimestamp(os.path.getmtime(px))) # Was the file produced by Chunked task self.is_chunked = is_chunked self.name = name self.description = description def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.file_id, t=self.file_type_id, p=os.path.basename(self.path)) return "<{k} {i} type:{t} filename:{p} >".format(**_d) def to_dict(self): return dict(sourceId=self.file_id, uniqueId=str(self.uuid), fileTypeId=self.file_type_id, path=self.path, fileSize=self.file_size, createdAt=_datetime_to_string(self.created_at), modifiedAt=_datetime_to_string(self.modified_at), isChunked=self.is_chunked, name=self.name, description=self.description) @staticmethod def from_dict(d): # FIXME. This isn't quite right. to_a = lambda x: x.encode('ascii', 'ignore') to_k = lambda x: to_a(d[x]) is_chunked = d.get('isChunked', False) return DataStoreFile(to_k('uniqueId'), to_k('sourceId'), to_k('fileTypeId'), to_k('path'), is_chunked=is_chunked, name=to_a(d.get("name", "")), description=to_a(d.get("description", ""))) def _datetime_to_string(dt): return dt.strftime('%Y-%m-%dT%H:%M:%S') class DataStore(object): version = "0.2.2" def __init__(self, ds_files, created_at=None): """ :param ds_files: list of datastore file instances :param created_at: Date the datastore was created. if None, will use the current datetime :type ds_files: list[DataStoreFile] """ self.files = {f.uuid: f for f in ds_files} self.created_at = datetime.datetime.now() if created_at is None else created_at self.updated_at = datetime.datetime.now() def __repr__(self): _d = dict(n=len(self.files), k=self.__class__.__name__) return "<{k} nfiles={n} >".format(**_d) def add(self, ds_file): if isinstance(ds_file, DataStoreFile): self.files[ds_file.uuid] = ds_file self.updated_at = datetime.datetime.now() else: raise TypeError("DataStoreFile expected. Got type {t} for {d}".format(t=type(ds_file), d=ds_file)) def to_dict(self): fs = [f.to_dict() for i, f in self.files.iteritems()] _d = dict(version=self.version, createdAt=_datetime_to_string(self.created_at), updatedAt=_datetime_to_string(self.updated_at), files=fs) return _d def write_json(self, file_name): write_dict_to_json(self.to_dict(), file_name, "w") def write_update_json(self, file_name): """Overwrite Datastore with current state""" write_dict_to_json(self.to_dict(), file_name, "w+") @staticmethod def load_from_d(d): """Load DataStore from a dict""" ds_files = [DataStoreFile.from_dict(x) for x in d['files']] return DataStore(ds_files) @staticmethod def load_from_json(path): """Load DataStore from a JSON file""" with open(path, 'r') as reader: d = json.loads(reader.read()) return DataStore.load_from_d(d) def _is_chunk_key(k): return k.startswith(PipelineChunk.CHUNK_KEY_PREFIX) class MalformedChunkKeyError(ValueError): """Chunk Key does NOT adhere to the spec""" pass class PipelineChunk(object): CHUNK_KEY_PREFIX = "$chunk." RX_CHUNK_KEY = re.compile(r'^\$chunk\.([A-z0-9_]*)') def __init__(self, chunk_id, **kwargs): """ kwargs is a key-value store. keys that begin "$chunk." 
are considered to be semantically understood by workflow and can be "routed" to chunked task inputs. Values that don't begin with "$chunk." are considered metadata. :param chunk_id: Chunk id :type chunk_id: str """ if self.RX_CHUNK_KEY.match(chunk_id) is not None: raise MalformedChunkKeyError("'{c}' expected {p}".format(c=chunk_id, p=self.RX_CHUNK_KEY.pattern)) self.chunk_id = chunk_id # loose key-value pair self._datum = kwargs def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.chunk_id, c=",".join(self.chunk_keys)) return "<{k} id='{i}' chunk keys={c} >".format(**_d) def set_chunk_key(self, chunk_key, value): """Overwrite or add a chunk_key => value to the Chunk datum the chunk-key can be provided with or without the '$chunk:' prefix """ if not chunk_key.startswith(PipelineChunk.CHUNK_KEY_PREFIX): chunk_key = PipelineChunk.CHUNK_KEY_PREFIX + chunk_key self._datum[chunk_key] = value def set_metadata_key(self, metadata_key, value): """Set chunk metadata key => value metadata key must NOT begin with $chunk. format """ if metadata_key.startswith(PipelineChunk.CHUNK_KEY_PREFIX): raise ValueError("Cannot set chunk-key values. {i}".format(i=metadata_key)) self._datum[metadata_key] = value @property def chunk_d(self): return {k: v for k, v in self._datum.iteritems() if _is_chunk_key(k)} @property def chunk_keys(self): return self.chunk_d.keys() @property def chunk_metadata(self): return {k: v for k, v in self._datum.iteritems() if not _is_chunk_key(k)} def to_dict(self): return {'chunk_id': self.chunk_id, 'chunk': self._datum} class DataStoreViewRule(object): """ Rule specifying if and how the UI should display a datastore file. """ def __init__(self, source_id, file_type_id, is_hidden, name="", description=""): """ :param source_id: Unique source id of the datastore file :param file_type_id: File Type id of the datastore file :param is_hidden: Mark the file has hidden :param name: Display name of the file :param description: Description of the file """ # for generating rules compositionally in Python, it's easier to just # pass the FileType object directly if isinstance(file_type_id, FileType): file_type_id = file_type_id.file_type_id assert FileTypes.is_valid_id(file_type_id), file_type_id self.source_id = source_id self.file_type_id = file_type_id self.is_hidden = is_hidden self.name = name self.description = description def to_dict(self): return {"sourceId": self.source_id, "fileTypeId": self.file_type_id, "isHidden": self.is_hidden, "name": self.name, "description": self.description} @staticmethod def from_dict(d): return DataStoreViewRule(d['sourceId'], d['fileTypeId'], d['isHidden'], d.get('name', ''), d.get('description', '')) class PipelineDataStoreViewRules(object): """ A collection of DataStoreViewRule objects associated with a pipeline. 
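    Illustrative construction (the pipeline id, source id and display name below
    are hypothetical, not taken from a real pipeline):

        rules = PipelineDataStoreViewRules(
            "pbsmrtpipe.pipelines.my_pipeline", "4.0.0",
            rules=[DataStoreViewRule("my_task-out-0", FileTypes.LOG,
                                     is_hidden=True, name="Task Log")])
        rules.write_json("view_rules.json")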
""" def __init__(self, pipeline_id, smrtlink_version, rules=()): self.pipeline_id = pipeline_id self.smrtlink_version = smrtlink_version self.rules = list(rules) def to_dict(self): return {"pipelineId": self.pipeline_id, "smrtlinkVersion": self.smrtlink_version, "rules": [r.to_dict() for r in self.rules]} @staticmethod def from_dict(d): return PipelineDataStoreViewRules( pipeline_id=d['pipelineId'], smrtlink_version=d['smrtlinkVersion'], rules=[DataStoreViewRule.from_dict(r) for r in d['rules']]) @staticmethod def load_from_json(path): with open(path, 'r') as reader: d = json.loads(reader.read()) return PipelineDataStoreViewRules.from_dict(d) def write_json(self, file_name): write_dict_to_json(self.to_dict(), file_name) def write_dict_to_json(d, file_name, permission="w"): with open(file_name, permission) as f: s = json.dumps(d, indent=4, sort_keys=True, separators=(',', ': ')) f.write(s) RX_TASK_ID = re.compile(r'^([A-z0-9_]*)\.tasks\.([A-z0-9_]*)$') RX_TASK_OPTION_ID = re.compile(r'^([A-z0-9_]*)\.task_options\.([A-z0-9_\.]*)') def _validate_id(prog, idtype, tid): if prog.match(tid): return tid else: raise ValueError("Invalid format {t}: '{i}' {p}".format(t=idtype, i=tid, p=repr(prog.pattern))) validate_task_id = functools.partial(_validate_id, RX_TASK_ID, 'task id') validate_task_option_id = functools.partial(_validate_id, RX_TASK_OPTION_ID, 'task option id') class BasePacBioOption(object): # This is an abstract class. This really blurring the abstract with # implementation which makes the interface unclear. # This MUST be a validate TaskOptionTypes.* value. OPTION_TYPE_ID = "UNKNOWN" @classmethod def validate_core_type(cls, value): """ Every Option has a "core" type that needs to validated in the constructor. The function should return the value Subclasses should implement :param value: Option value :return: validated value """ raise NotImplementedError def validate_option(self, value): """Core method used externally (e.g., resolvers) to validate option The default implementation will only validate that the "core" type is consistent with definition. Subclasses should override this to leverage internal state (e.g, self.choices) """ return self.validate_core_type(value) def __init__(self, option_id, name, default, description): """ Core constructor for the PacBio Task Option. :param option_id: PacBio Task Option type id. Must adhere to the A-z0-9_ :param name: Display name of the Task Option :param default: Default value :param description: Description of the Task Option :type option_id: str :type name: str :type description: str """ self.option_id = validate_task_option_id(option_id) self.name = name self._default = self.validate_core_type(default) self.description = description # make sure subclasses have overwritten the OPTION_TYPE_ID. 
# this will raise if self.OPTION_TYPE_ID not in TaskOptionTypes.ALL(): msg = "InValid Task Option type id {t} Subclasses of {c} must " \ "override OPTION_TYPE_ID to have a consistent value with " \ "TaskOptionTypes.*".format(t=self.OPTION_TYPE_ID, c=self.__class__.__name__) raise ValueError(msg) @property def default(self): """Returns the default value for the option""" return self._default def __repr__(self): _d = dict(i=self.option_id, n=self.name, v=self.default, k=self.__class__.__name__, t=self.OPTION_TYPE_ID) return "<{k} {i} name: {n} default: {v} type:{t} >".format(**_d) def to_dict(self): option_type = TaskOptionTypes.from_str(self.OPTION_TYPE_ID) # the same model is used in the pipeline template, so we break the # snake case in favor of camelcase for the option type id. return dict(id=self.option_id, name=self.name, default=self.default, description=self.description, optionTypeId=option_type) def _type_error_msg(value, expected_type): return "{v} Expected {t}, got {x}".format(v=value, t=expected_type, x=type(value)) def _strict_validate_int_or_raise(value): def _to_msg(type_): return _type_error_msg(value, type_) if isinstance(value, types.BooleanType): raise TypeError(_to_msg(types.BooleanType)) elif isinstance(value, types.FloatType): raise TypeError(_to_msg(types.FloatType)) elif isinstance(value, types.StringType): raise TypeError(_to_msg(types.StringType)) else: return int(value) def _strict_validate_bool_or_raise(value): if isinstance(value, types.BooleanType): return value raise TypeError(_type_error_msg(value, types.BooleanType)) def _strict_validate_float_or_raise(value): def _to_msg(type_): return _type_error_msg(value, type_) if isinstance(value, types.BooleanType): raise TypeError(_to_msg(types.BooleanType)) elif isinstance(value, types.StringType): raise TypeError(_to_msg(types.StringType)) else: return float(value) def _strict_validate_string_or_raise(value): # Not supporting unicode in any way if isinstance(value, str): return value raise TypeError(_type_error_msg(value, str)) class PacBioIntOption(BasePacBioOption): OPTION_TYPE_ID = TaskOptionTypes.INT @classmethod def validate_core_type(cls, value): return _strict_validate_int_or_raise(value) class PacBioFloatOption(BasePacBioOption): OPTION_TYPE_ID = TaskOptionTypes.FLOAT @classmethod def validate_core_type(cls, value): return _strict_validate_float_or_raise(value) class PacBioBooleanOption(BasePacBioOption): OPTION_TYPE_ID = TaskOptionTypes.BOOL @classmethod def validate_core_type(cls, value): return _strict_validate_bool_or_raise(value) class PacBioStringOption(BasePacBioOption): OPTION_TYPE_ID = TaskOptionTypes.STR @classmethod def validate_core_type(cls, value): return _strict_validate_string_or_raise(value) def _strict_validate_default_and_choices(core_type_validator_func): """ :param core_type_validator_func: Function (value) => value or raises TypeError Returns a func of (value, choices) => value, choices or raises TypeError or Value Error. 
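    For example (illustrative values), _strict_validate_int_choices(2, (1, 2, 3))
    returns (2, (1, 2, 3)), _strict_validate_int_choices(5, (1, 2, 3)) raises
    ValueError, and _strict_validate_int_choices("2", (1, 2, 3)) raises TypeError.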
""" def wrap(value, choices): for choice in choices: core_type_validator_func(choice) v = core_type_validator_func(value) if v not in choices: raise ValueError("Default value {v} is not in allowed choices {c}".format(v=value, c=choices)) return v, choices return wrap _strict_validate_int_choices = _strict_validate_default_and_choices(_strict_validate_int_or_raise) _strict_validate_str_choices = _strict_validate_default_and_choices(_strict_validate_string_or_raise) _strict_validate_bool_choices = _strict_validate_default_and_choices(_strict_validate_bool_or_raise) _strict_validate_float_choices = _strict_validate_default_and_choices(_strict_validate_float_or_raise) class BaseChoiceType(BasePacBioOption): # This really should be Abstract def __init__(self, option_id, name, default, description, choices): super(BaseChoiceType, self).__init__(option_id, name, default, description) _, validated_choices = self.validate_core_type_with_choices(default, choices) self.choices = validated_choices @classmethod def validate_core_type_with_choices(cls, value, choices): raise NotImplementedError def validate_option(self, value): v, _ = self.validate_core_type_with_choices(value, self.choices) return v def to_dict(self): d = super(BaseChoiceType, self).to_dict() d['choices'] = self.choices return d class PacBioIntChoiceOption(BaseChoiceType): OPTION_TYPE_ID = TaskOptionTypes.CHOICE_INT @classmethod def validate_core_type(cls, value): return _strict_validate_int_or_raise(value) @classmethod def validate_core_type_with_choices(cls, value, choices): return _strict_validate_int_choices(value, choices) class PacBioStringChoiceOption(BaseChoiceType): OPTION_TYPE_ID = TaskOptionTypes.CHOICE_STR @classmethod def validate_core_type(cls, value): return _strict_validate_string_or_raise(value) @classmethod def validate_core_type_with_choices(cls, value, choices): return _strict_validate_str_choices(value, choices) class PacBioFloatChoiceOption(BaseChoiceType): OPTION_TYPE_ID = TaskOptionTypes.CHOICE_FLOAT @classmethod def validate_core_type(cls, value): return _strict_validate_float_or_raise(value) @classmethod def validate_core_type_with_choices(cls, value, choices): return _strict_validate_float_choices(value, choices)pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/report.py0000644000000000000000000011014213035554276024273 0ustar rootroot"""Common PacBio Report model Author: Johann Miller and Michael Kocher """ from collections import defaultdict, OrderedDict import warnings import abc import logging import json import os import re import uuid as U # to allow use of uuid as local var from pprint import pformat import datetime import pbcommand log = logging.getLogger(__name__) __all__ = ['PbReportError', 'Attribute', 'Report', 'Plot', 'PlotGroup', 'Column', 'Table'] # If/when the Report datamodel change, this needs to be changed using # the semver model PB_REPORT_SCHEMA_VERSION = "1.0.0" _HAS_NUMPY = False try: import numpy as np _HAS_NUMPY = True except ImportError: pass def _get_decoder(): """ There's a bit of nonsense here to support the exiting pbreports python package. numpy is only used for Report that have Table columns that are numpy arrays. This really should have strictly defined in the original API to only support native python lists. Similarly with numpy scalars in Report Attributes. 
:return: None | numpy decoder """ if _HAS_NUMPY: class NumpyJsonEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, np.core.numerictypes.floating): return float(obj) if isinstance(obj, np.core.numerictypes.integer): return int(obj) if isinstance(obj, np.ndarray) and obj.ndim == 1: return [float(x) for x in obj] # Let the base class default method raise the TypeError return json.JSONEncoder.default(self, obj) return NumpyJsonEncoder else: return None def _to_json_with_decoder(d): decoder_or_none = _get_decoder() if decoder_or_none is None: return json.dumps(d, sort_keys=True, indent=4, separators=(',', ': ')) else: return json.dumps(d, cls=decoder_or_none, sort_keys=True, indent=4, separators=(',', ': ')) class PbReportError(Exception): pass class BaseReportElement(object): __metaclass__ = abc.ABCMeta def __init__(self, id_): if not isinstance(id_, basestring): raise PbReportError( "Type error. id '{i}' cannot be {t}.".format(i=id_, t=type(id_))) if not re.match('^[a-z0-9_]+$', id_): msg = "id '{i}' for {x} must contain only lower-case alphanumeric or underscore characters".format( x=self.__class__.__name__, i=id_) log.error(msg) raise PbReportError(msg) self._id = id_ self._ids = set([]) def is_unique(self, id_): """ Raise an error if a BaseReportElement with this id has already been added. :param id_: (int) id of child BaseReportElement """ if id_ in self._ids: msg = "a plot with id '{i}' has already been added to {t}.".format( i=id_, t=str(type(self))) log.error(msg) raise PbReportError(msg) self._ids.add(id_) @property def id(self): return self._id @abc.abstractmethod def _get_attrs_simple(self): """ Return a list of attributes names where each attribute returns a simple type like a string, int, or float. The 'id' attribute should NOT be included. Example [ 'title' ] """ raise NotImplementedError @abc.abstractmethod def _get_attrs_complex_list(self): """ Return a list of attributes names where each attribute returns a list of BaseReportElement objects which implement to_dict() """ raise NotImplementedError def to_dict(self, id_parts=None): """ Return a dict-view of this object. Recursively descend in to collections of BaseReportElement instances, calling to_dict on each. Additionally, prepends the id with a '.'-delimited string of parent id's :param id_parts: (list of string) Parent id's, as a function of depth within the object graph """ if id_parts is None: # start the part list id_parts = [self.id] else: id_parts.append(self.id) d = {a: getattr(self, a) for a in self._get_attrs_simple()} d['id'] = '.'.join([str(v) for v in id_parts]) complex_attrs = self._get_attrs_complex_list() for ca in complex_attrs: d[ca] = [] for i in getattr(self, ca): copy = [] copy.extend(id_parts) d[ca].append(i.to_dict(copy)) # yank the last id so it doesn't prepend the next item of same type. # slicing doesn't work on original list. need copy! bug 23799 id_parts = copy[:-1] if len(id_parts) > 1: # yank the last id part, so it doesn't prepend the next # category of attributes id_parts = id_parts[:-1] return d class Attribute(BaseReportElement): """ An attribute always has an id and a value. A name is optional. """ def __init__(self, id_, value, name=None): """ :param id_: (str) Unique id for attribute (Not None, or Empty) :param value: (str, float) Numeric values should be float values. Formatting is performed durning the report rendering :param name: (str, None) optional display name. 
Can be changed in portal display rules """ BaseReportElement.__init__(self, id_) self._value = value self._name = name @property def value(self): return self._value @property def name(self): return self._name def _get_attrs_simple(self): return ['value', 'name'] def _get_attrs_complex_list(self): return [] def __eq__(self, other): if isinstance(other, Attribute): if self.name == other.name and self.value == other.value and self.id == other.id: return True return False def __ne__(self, other): return not self.__eq__(other) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.id, v=self.value, n=self.name) return "<{k} id:{i} value:{v} name:{n} >".format(**_d) class PlotGroup(BaseReportElement): """ A plotGroup is a container of plots. """ def __init__(self, id_, title=None, legend=None, thumbnail=None, plots=()): """ :param id_: (str) id of plotgroup. Not None or Empty :param title: (str, None) Title of the plotGroup, displayed to user. :param legend: (str, None) Path to legend image, if applicable :param thumbnail: (str, None)Path to thumbnail image, if applicable :param plots: (list of Plot instances) """ BaseReportElement.__init__(self, id_) self._title = title self._legend = legend self._thumbnail = thumbnail self._plots = [] if plots: for plot in plots: self.add_plot(plot) @property def title(self): return self._title @property def legend(self): return self._legend @property def thumbnail(self): return self._thumbnail @property def plots(self): return self._plots @property def nplots(self): return len(self.plots) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.id, t=self.title, n=self.nplots) return "<{k} id:{i} title:{t} nplots:{n} >".format(**_d) def _get_attrs_simple(self): return ['title', 'legend', 'thumbnail'] def _get_attrs_complex_list(self): return ['plots'] def get_plot_by_id(self, id_): for plot in self.plots: if plot.id == id_: return plot return None def add_plot(self, plot): """ Add a plot to the plotGroup """ if not isinstance(plot, Plot): raise TypeError( "Unable to add plot. Got type {x} expect Plot".format(x=type(plot))) BaseReportElement.is_unique(self, plot.id) self._plots.append(plot) def to_dict(self, id_parts=None): return BaseReportElement.to_dict(self, id_parts=id_parts) def _validate_not_abs_path(path): if os.path.isabs(path): raise ValueError("paths must be relative. Got {i}".format(i=path)) class Plot(BaseReportElement): """ A plot contains a path to image file. """ def __init__(self, id_, image, caption=None, thumbnail=None, title=None): """ :param id_: (str, not None, or empty) Unique id for plot. :param image: (str) Required - not None - path to image :param caption: (str, None) Plot caption displayed to user under plot. 
:param thumbnail: (str, None) thumbnail path :param title: str Display Name of the Plot Paths must be given as relative """ BaseReportElement.__init__(self, id_) if image is None: raise PbReportError('image cannot be None') _validate_not_abs_path(image) self._image = image self._caption = caption self.title = title if thumbnail is not None: _validate_not_abs_path(thumbnail) self._thumbnail = thumbnail def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.id, p=self.image) return "<{k} {i} {p} >".format(**_d) @property def image(self): return self._image @property def thumbnail(self): return self._thumbnail @property def caption(self): return self._caption def _get_attrs_simple(self): return ['image', 'caption', 'title'] def _get_attrs_complex_list(self): return [] class Table(BaseReportElement): """ A table consists of an id, title, and list of columns. """ def __init__(self, id_, title=None, columns=()): """ :param id_: (str), Unique id for table in report. :param title: (str, None) :param columns: (list of column instances) """ BaseReportElement.__init__(self, id_) self._title = title self._columns = [] if columns: for column in columns: self.add_column(column) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.id, t=self.title, n=self.ncolumns) return "<{k} {i} title:{t} ncolumns:{n} >".format(**_d) def __str__(self): pad = 2 max_values = max(len(column.values) for column in self.columns) # max length for each column value max_lengths = {} headers = [] for c in self.columns: this_header = "" if c.header is not None: this_header = c.header if c.values: n = max(max(len(str(v)) for v in c.values), len(this_header)) else: n=len(this_header) max_lengths[c] = n headers.append(this_header) header="".join([h.ljust(max_lengths[c] + pad) for h in headers]) outs = list() outs.append("") outs.append("Table id:{i}".format(i=self.id)) outs.append("-" * len(header)) outs.append(header) outs.append("-" * len(header)) for i in xrange(max_values): out = [] for column in self.columns: try: l = max_lengths[column] + pad out.append(str(column.values[i]).ljust(l)) except IndexError as e: log.warn(e) out.append("No Value ") outs.append(" ".join(out)) return "\n".join(outs) @property def id(self): return self._id @property def title(self): return self._title @property def ncolumns(self): return len(self.columns) @property def columns(self): return self._columns def _get_attrs_simple(self): return ['title'] def _get_attrs_complex_list(self): return ['columns'] def get_column_by_id(self, id_): for col in self.columns: if col.id == id_: return col return None def add_column(self, column): """ Add a column to the table :param column: (Column instance) """ if not isinstance(column, Column): raise TypeError( "Got type {x}. Expected Column type.".format(x=type(column))) BaseReportElement.is_unique(self, column.id) self._columns.append(column) def append_data(self, column_index, item): """ This should be deprecated in favor of `add_data_by_column_id`. Append datum to a column by column index :param column_index: (int) Index into internal column list :param item: (float, str) data item. """ if column_index < len(self._columns): self._columns[column_index].values.append(item) else: raise IndexError( "Unable to find index {i} in columns.".format(i=column_index)) def add_data_by_column_id(self, column_id, value): """Add a value to column. 
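        Raises KeyError if the table contains no column with the given id.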
:param column_id: (str) Column id :param value: (float, str, int) """ if column_id in [c.id for c in self.columns]: # _columns should really be a dict # self._columns[column_id].values.append(value) for column in self.columns: if column_id == column.id: column.values.append(value) else: raise KeyError("Unable to Column with id '{i}' to assign value {v}".format( i=column_id, v=value)) @staticmethod def merge(tables): table_id = tables[0].id table_title = tables[0].title column_ids = sorted([col.id for col in tables[0].columns]) col_collisions = {col_id: [] for col_id in column_ids} for table in tables: assert table.id == table_id assert table.title == table_title assert sorted([col.id for col in table.columns]) == column_ids for col in table.columns: col_collisions[col.id].append(col) columns = {} for col_id, cols in col_collisions.iteritems(): assert len(cols) == len(tables) columns[col_id] = Column.merge(cols) # order by table[0]'s column order: columns = [columns[col.id] for col in tables[0].columns] return Table(table_id, table_title, columns=columns) class Column(BaseReportElement): """ A column consists of an id, header, and list of values. """ def __init__(self, id_, header=None, values=()): """ :param id_: (str) :param header: (str, None) Header of Column. """ BaseReportElement.__init__(self, id_) self._id = id_ self._header = header self._values = list(values) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.id, h=self.header, n=self.nvalues) return "<{k} id:{i} header:{h} nvalues:{n} >".format(**_d) @property def id(self): return self._id @property def header(self): return self._header @property def nvalues(self): return len(self.values) @property def values(self): return self._values def _get_attrs_simple(self): return ['header', 'values'] def _get_attrs_complex_list(self): return [] @staticmethod def merge(columns): column_id = columns[0].id column_header = columns[0].header values = [] for col in columns: assert col.id == column_id assert col.header == column_header values.extend(col.values) return Column(column_id, column_header, values=values) class Report(BaseReportElement): """ A report is a container for attributes, plotGroups, and tables. It can be serialized to json. """ def __init__(self, id_, title=None, tables=(), attributes=(), plotgroups=(), dataset_uuids=(), uuid=None): """ :param id_: (str) Should be a string that identifies the report, like 'adapter'. 
:param title: Display name of report Defaults to the Report+id if None (added in 0.3.9) :param tables: (list of table instances) :param attributes: (list of attribute instances) :param plotgroups: (list of plot group instances) :param dataset_uuids: list[string] DataSet uuids of files used to generate the report :param uuid: the unique identifier for the Report """ BaseReportElement.__init__(self, id_) self._attributes = [] self._plotgroups = [] self._tables = [] self.title = "Report {i}".format(i=self.id) if title is None else title # FIXME(mkocher)(2016-3-30) Add validation to make sure it's a well formed value # this needs to be required self.uuid = uuid if uuid is not None else str(U.uuid4()) if tables: for table in tables: self.add_table(table) if attributes: for attr in attributes: self.add_attribute(attr) if plotgroups: for plotgroup in plotgroups: self.add_plotgroup(plotgroup) # Datasets that self._dataset_uuids = dataset_uuids @property def dataset_uuids(self): return self._dataset_uuids def add_attribute(self, attribute): """Add an attribute to the report :param attribute: (Attribute instance) """ if not isinstance(attribute, Attribute): TypeError("Got type {x}. Expected Attribute type.".format( x=type(attribute))) BaseReportElement.is_unique(self, attribute.id) self._attributes.append(attribute) def add_plotgroup(self, plotgroup): """ Add a plotgroup to the report """ if not isinstance(plotgroup, PlotGroup): TypeError("Got type {x}. Expected Attribute type.".format( x=type(plotgroup))) BaseReportElement.is_unique(self, plotgroup.id) self._plotgroups.append(plotgroup) def add_table(self, table): """ Add a table to the report """ BaseReportElement.is_unique(self, table.id) self._tables.append(table) def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.id, n=self.title, a=len(self.attributes), p=len(self.plotGroups), t=len(self.tables), u=self.uuid) return "<{k} id:{i} title:{n} uuid:{u} nattributes:{a} nplot_groups:{p} ntables:{t} >".format(**_d) @property def attributes(self): return self._attributes @property def plotGroups(self): return self._plotgroups @property def tables(self): return self._tables def _get_attrs_simple(self): return [] def _get_attrs_complex_list(self): return ['attributes', 'plotGroups', 'tables'] def get_attribute_by_id(self, id_): """Get an attribute by id. 
The id should NOT contain the root report id :returns: (None, Attribute) Example: report.get_attribute_by_id('nmovies') *NOT* report.get_attribute_by_id('overview.nmovies') """ for attr in self.attributes: if attr.id == id_: return attr return None def get_table_by_id(self, id_): for table in self.tables: if table.id == id_: return table return None def get_plotgroup_by_id(self, id_): for pg in self.plotGroups: if pg.id == id_: return pg return None def to_dict(self, id_parts=None): _d = dict(v=pbcommand.get_version(), t=datetime.datetime.now().isoformat()) d = BaseReportElement.to_dict(self, id_parts=id_parts) d['_comment'] = "Generated with pbcommand version {v} at {t}".format(**_d) # Required in 1.0.0 of the spec d['uuid'] = self.uuid d['title'] = self.title d['version'] = PB_REPORT_SCHEMA_VERSION d['dataset_uuids'] = list(set(self.dataset_uuids)) return d def to_json(self): """Return a json string of the report""" from pbcommand.schemas import validate_pbreport try: s = _to_json_with_decoder(self.to_dict()) # FIXME(mkocher)(2016-6-20) Enable schema validation # this needs to be processed by the decoder, then validate the # dict # _ = validate_pbreport(json.loads(s)) return s except TypeError as e: msg = "Unable to serialize report due to {e} \n".format(e=e) log.error(msg) log.error("Object: " + pformat(self.to_dict())) raise def write_json(self, file_name): """ Serialized the report to a json file. :param file_name: (str) Path to write output json file to. """ with open(file_name, 'w') as f: f.write(self.to_json()) # log.info("Wrote report {r}".format(r=file_name)) @staticmethod def from_simple_dict(report_id, raw_d, namespace): """ Generate a Report with populated attributes, starting from a flat dictionary (without namespace). """ attributes = [] for k, v in raw_d.items(): ns = "_".join([namespace, k.lower()]) # These can't be none for some reason if v is not None: a = Attribute(ns, v, name=k) attributes.append(a) else: warnings.warn("skipping null entry {k}->{v}".format(k=k, v=v)) return Report(report_id, attributes=attributes) @staticmethod def merge(reports): report_id = reports[0].id def _merge_attributes_d(attributes_list): attrs = OrderedDict() for ax in attributes_list: for a in ax: if a.id in attrs: attrs[a.id].append(a.value) else: attrs[a.id] = [a.value] return attrs def _merge_attributes_names(attributes_list): names = {} for ax in attributes_list: for a in ax: if a.id in names: assert names[a.id] == a.name else: names[a.id] = a.name return names def _attributes_to_table(attributes_list, table_id, title): attrs = _merge_attributes_d(attributes_list) labels = _merge_attributes_names(attributes_list) columns = [Column(k.lower(), header=labels[k], values=values) for k, values in attrs.iteritems()] table = Table(table_id, title=title, columns=columns) return table def _sum_attributes(attributes_list): d = _merge_attributes_d(attributes_list) labels = _merge_attributes_names(attributes_list) return [Attribute(k, sum(values), name=labels[k]) for k, values in d.iteritems()] def _merge_tables(tables): """Pass through singletons, Table.merge dupes""" id_collisions = defaultdict(list) merged = [] for tab in tables: id_collisions[tab.id].append(tab) for tabs in id_collisions.values(): if len(tabs) == 1: merged.append(tabs[0]) else: merged.append(Table.merge(tabs)) return merged attr_list = [] table_list = [] dataset_uuids = set() for report in reports: assert report.id == report_id attr_list.append(report.attributes) table_list.extend(report.tables) 
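# keep the union of dataset UUIDs referenced by the chunk reports so the merged report preserves provenance for every input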
dataset_uuids.update(set(report.dataset_uuids)) table = _attributes_to_table(attr_list, 'chunk_metrics', "Chunk Metrics") tables = _merge_tables(table_list) tables.append(table) merged_attributes = _sum_attributes(attr_list) return Report(report_id, attributes=merged_attributes, tables=tables, dataset_uuids=sorted(list(dataset_uuids))) ######################################################################## # SPECIFICATION MODELS FS_RE = "{([GMkp]{0,1})(:)([,]{0,1})([\.]{0,1})([0-9]*)([dfg]{1})}(.*)$" def validate_format(format_str): m = re.match(FS_RE, format_str) if m is None: raise ValueError("Format string '{s}' is uninterpretable".format( s=format_str)) return m def format_metric(format_str, value): """ Format a report metric (attribute or table column value) according to our in-house rules. These resemble Python format strings (plus optional suffix), but with the addition of optional scaling flags. """ if value is None: return "NA" elif format_str is None: return str(value) else: m = validate_format(format_str) if m.groups()[0] == 'p': value *= 100.0 elif m.groups()[0] == 'G': value /= 1000000000.0 elif m.groups()[0] == 'M': value /= 1000000.0 elif m.groups()[0] == 'k': value /= 1000.0 if isinstance(value, float) and m.groups()[5] == 'd': value = int(value) fs_python = "{{:{:s}{:s}{:s}{:s}}}".format(*(m.groups()[2:6])) formatted = fs_python.format(value) # the percent symbol can be implicit if m.groups()[0] == 'p' and m.groups()[-1] == '': return formatted + "%" else: return formatted + m.groups()[-1] # FIXME this needs to be standardized DATA_TYPES = { "int": int, "long": int, "float": float, "string": basestring, # this is hacky too "boolean": bool } class AttributeSpec(object): def __init__(self, id_, name, description, type_, format_=None, is_hidden=False): self.id = id_ self.name = name self.description = description self._type = type_ self.format_str = format_ self.is_hidden = is_hidden @property def type(self): return DATA_TYPES[self._type] @staticmethod def from_dict(d): format_str = d.get("format", None) if format_str is not None: validate_format(format_str) assert d["type"] in DATA_TYPES, d["type"] return AttributeSpec(d['id'].split(".")[-1], d['name'], d['description'], d["type"], format_str, d.get("isHidden", False)) def validate_attribute(self, attr): assert attr.id == self.id if attr.value is not None and not isinstance(attr.value, self.type): msg = "Attribute {i} has value of type {v} (expected {t})".format(i=self.id, v=type(attr.value).__name__, t=self.type) raise TypeError(msg) class ColumnSpec(object): def __init__(self, id_, header, description, type_, format_=None, is_hidden=False): self.id = id_ self.header = header self.description = description self._type = type_ self.format_str = format self.is_hidden = is_hidden @property def type(self): return DATA_TYPES[self._type] @staticmethod def from_dict(d): format_str = d.get("format", None) if format_str is not None: validate_format(format_str) assert d["type"] in DATA_TYPES, d["type"] return ColumnSpec(d['id'].split(".")[-1], d['header'], d['description'], d["type"], format_str, d.get("isHidden", False)) def validate_column(self, col): assert col.id == self.id for value in col.values: if value is not None and not isinstance(value, self.type): msg = "Column {i} contains value of type {v} (expected {t})".format(i=self.id, v=type(value).__name__, t=self.type) if isinstance(value, int) and self._type == "float": warnings.warn(msg) else: raise TypeError(msg) class TableSpec(object): def __init__(self, id_, title, 
description, columns): self.id = id_ self.title = title self.description = description self.columns = columns self._col_dict = {c.id: c for c in columns} @staticmethod def from_dict(d): return TableSpec(d['id'].split(".")[-1], d['title'], d['description'], [ColumnSpec.from_dict(c) for c in d['columns']]) def get_column_spec(self, id_): return self._col_dict.get(id_, None) class PlotSpec(object): def __init__(self, id_, description, caption, title, xlabel, ylabel): self.id = id_ self.description = description self.caption = caption self.title = title self.xlabel = xlabel self.ylabel = ylabel @staticmethod def from_dict(d): return PlotSpec(d['id'].split(".")[-1], d['description'], d['caption'], d['title'], d.get('xlabel', None), d.get('ylabel', None)) class PlotGroupSpec(object): def __init__(self, id_, title, description, legend, plots=()): self.id = id_ self.title = title self.description = description self.legend = legend self.plots = plots self._plot_dict = {p.id: p for p in plots} @staticmethod def from_dict(d): return PlotGroupSpec(d['id'].split(".")[-1], d['title'], d["description"], d['legend'], [PlotSpec.from_dict(p) for p in d['plots']]) def get_plot_spec(self, id_): return self._plot_dict.get(id_, None) class ReportSpec(object): """ Model for a specification of the expected content of a uniquely identified report. For obvious reasons this mirrors the Report model, minus values and with added view metadata. These specs should usually be written out explicitly in JSON rather than built programatically. """ def __init__(self, id_, version, title, description, attributes=(), plotgroups=(), tables=()): self.id = id_ self.version = version self.title = title self.description = description self.attributes = attributes self.plotgroups = plotgroups self.tables = tables self._attr_dict = {a.id: a for a in attributes} self._plotgrp_dict = {p.id: p for p in plotgroups} self._table_dict = {t.id: t for t in tables} @staticmethod def from_dict(d): return ReportSpec(d['id'], d['version'], d['title'], d['description'], [AttributeSpec.from_dict(a) for a in d['attributes']], [PlotGroupSpec.from_dict(p) for p in d['plotGroups']], [TableSpec.from_dict(t) for t in d['tables']]) def get_attribute_spec(self, id_): return self._attr_dict.get(id_, None) def get_plotgroup_spec(self, id_): return self._plotgrp_dict.get(id_, None) def get_table_spec(self, id_): return self._table_dict.get(id_, None) def validate_report(self, rpt): """ Check that a generated report corresponding to this spec is compliant with the expected types and object IDs. (Missing objects will not result in an error, but unexpected object IDs will.) """ assert rpt.id == self.id # TODO check version? 
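# Accumulate all validation problems and raise a single ValueError at the end instead of failing on the first mismatch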
errors = [] for attr in rpt.attributes: attr_spec = self.get_attribute_spec(attr.id) if attr_spec is None: errors.append("Attribute {i} not found in spec".format( i=attr.id)) else: try: attr_spec.validate_attribute(attr) except TypeError as e: errors.append(str(e)) try: format_metric(attr_spec.format_str, attr.value) except (ValueError, TypeError) as e: log.error(e) errors.append("Couldn't format {i}: {e}".format( i=attr.id, e=str(e))) for table in rpt.tables: table_spec = self.get_table_spec(table.id) if table_spec is None: errors.append("Table {i} not found in spec".format(i=table.id)) else: for column in table.columns: column_spec = table_spec.get_column_spec(column.id) if column_spec is None: errors.append("Column {i} not found in spec".format( i=column.id)) else: try: column_spec.validate_column(column) except TypeError as e: errors.append(str(e)) for pg in rpt.plotGroups: pg_spec = self.get_plotgroup_spec(pg.id) if pg_spec is None: errors.append("Plot group {i} not found in spec".format( i=pg.id)) else: for plot in pg.plots: plot_spec = pg_spec.get_plot_spec(plot.id) # FIXME how should we handle plots with variable IDs? # maybe let the title/caption vary and keep the ID # constant? if plot_spec is None: warnings.warn("Plot {i} not found in spec".format( i=plot.id)) # errors.append("Plot {i} not found in spec".format( # i=plot.id)) if len(errors) > 0: raise ValueError( "Report {i} failed validation against spec:\n{e}".format( i=self.id, e="\n".join(errors))) return rpt def is_valid_report(self, rpt): """ Returns True if report passes spec validation. """ try: rpt = self.validate_report(rpt) return True except ValueError: return False def apply_view(self, rpt, force=False): """ Propagate view metadata (i.e. labels) to a Report object corresponding to this spec. """ assert rpt.id == self.id for attr in rpt.attributes: attr_spec = self.get_attribute_spec(attr.id) if force or attr.name in [None, ""]: attr._name = attr_spec.name for table in rpt.tables: table_spec = self.get_table_spec(table.id) if force or table.title in [None, ""]: table._title = table_spec.title for col in table.columns: col_spec = table_spec.get_column_spec(col.id) if force or col.header in [None, ""]: col._header = col_spec.header for pg in rpt.plotGroups: pg_spec = self.get_plotgroup_spec(pg.id) if force or pg.title in [None, ""]: pg._title = pg_spec.title for plot in pg.plots: plot_spec = pg_spec.get_plot_spec(plot.id) # FIXME see comment above - maybe we just need to repeat IDs? if plot_spec is not None: if force or plot.title in [None, ""]: plot.title = plot_spec.title if force or plot.caption in [None, ""]: plot._caption = plot_spec.caption else: pass #warnings.warn("Can't find spec for {i}".format(i=plot.id)) return rpt pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/models/parser.py0000644000000000000000000005756113035554276024273 0ustar rootroot""" Commandline Parser for Tools. 
Supports Tool Contracts # Author: Michael Kocher """ import abc import os import logging import argparse import functools from .common import (SymbolTypes, validate_task_id, PacBioFloatOption, PacBioStringOption, PacBioBooleanOption, PacBioIntOption, PacBioStringChoiceOption, PacBioFloatChoiceOption, PacBioIntChoiceOption) from pbcommand.common_options import (add_base_options_with_emit_tool_contract, add_subcomponent_versions_option) from .tool_contract import (ToolDriver, InputFileType, OutputFileType, ToolContract, ToolContractTask, ScatterToolContractTask, GatherToolContractTask) log = logging.getLogger(__name__) __version__ = "0.1.1" __all__ = ["PbParser", "PyParser", "ToolContractParser", "get_pbparser", "get_scatter_pbparser", "get_gather_pbparser"] def _to_file_type(format_): return "pacbio.file_types.{x}".format(x=format_) def _validate_file(label, path): if os.path.exists(path): return os.path.abspath(path) else: raise IOError("Unable to find '{x}' file '{p}'".format(x=label, p=path)) def _validate_option_or_cast(dtype, dvalue): if isinstance(dvalue, dtype): return dvalue else: # XXX this is almost always going to be the case... if isinstance(dvalue, basestring): try: return dtype(dvalue) except ValueError as e: pass raise TypeError("Invalid option type: '{a}' provided, '{e}' " "expected".format(a=dvalue, e=dtype)) def to_opt_id(namespace, s): return ".".join([namespace, "options", s]) def _validate_option(dtype, dvalue): if isinstance(dvalue, dtype): return dvalue else: raise TypeError("Invalid option type: '{a}' provided, '{e}' " "expected".format(a=dvalue, e=dtype)) class PbParserBase(object): __metaclass__ = abc.ABCMeta def __init__(self, tool_id, version, name, description): self.tool_id = validate_task_id(tool_id) self.version = version self.description = description self.name = name def __repr__(self): _d = dict(k=self.__class__.__name__, i=self.tool_id, v=self.version) return "<{k} id:{i} {v} >".format(**_d) @abc.abstractmethod def add_input_file_type(self, file_type, file_id, name, description): """ Add a mandatory input file parameter. On the Python argparse side, this will be a positional argument. :param file_type: file type ID from pbcommand.models.common, e.g. FileTypes.DS_REF :param file_id: parameter name, mainly used on argparse side :param name: plain-English name :param description: help string """ raise NotImplementedError @abc.abstractmethod def add_output_file_type(self, file_type, file_id, name, description, default_name): """ Add a mandatory output file parameter. On the Python argparse side, this will be a positional argument. :param file_type: file type ID from pbcommand.models.common, e.g. FileTypes.DS_REF :param file_id: parameter name, mainly used on argparse side :param name: plain-English name :param description: help string :param default_name: tuple of form (base_name, extension) specifying the default output file name """ raise NotImplementedError @abc.abstractmethod def add_int(self, option_id, option_str, default, name, description): """ Add an optional integer keyword argument (e.g. "--n=10" or "--n 10" on the command line). 
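        For example (the option id and flag below are illustrative, not part of
        pbcommand):

            p.add_int("pbcommand.task_options.min_length", "min-length",
                      default=50, name="Minimum Length",
                      description="Minimum subread length to keep")

        This exposes ``--min-length`` on the argparse layer and emits
        ``pbcommand.task_options.min_length`` in the tool contract.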
:param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param default: default value (must be an actual integer, not None) :param name: plain-English name :param description: help string """ raise NotImplementedError @abc.abstractmethod def add_float(self, option_id, option_str, default, name, description): """ Add an optional float keyword argument (e.g. "--n=10" or "--n 10" on the command line). :param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param default: default value (must be an actual number, not None) :param name: plain-English name :param description: help string """ raise NotImplementedError @abc.abstractmethod def add_str(self, option_id, option_str, default, name, description): """ Add a generic keyword argument whose type is a string. :param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param default: default value (can be blank, but not None) :param name: plain-English name :param description: help string """ raise NotImplementedError @abc.abstractmethod def add_boolean(self, option_id, option_str, default, name, description): """ Add a boolean option. :param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param default: specifies the boolean value of this option **if the argument was supplied**, i.e. on the argparse layer, default=True is equivalent to action="store_true" :param name: plain-English name :param description: help string """ raise NotImplementedError @abc.abstractmethod def add_choice_str(self, option_id, option_str, choices, name, description, default=None): """ Add a generic enumerated argument whose type is a string. :param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param choices: allowed values :param name: plain-English name :param description: help string :param default: default value (if None, will use first choice) """ raise NotImplementedError @abc.abstractmethod def add_choice_int(self, option_id, option_str, choices, name, description, default=None): """ Add a generic enumerated argument whose type is an integer. :param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param choices: allowed values :param name: plain-English name :param description: help string :param default: default value (if None, will use first choice) """ raise NotImplementedError @abc.abstractmethod def add_choice_float(self, option_id, option_str, choices, name, description, default=None): """ Add a generic enumerated argument whose type is a float. 
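        For example (option id and values below are illustrative):

            p.add_choice_float("pbcommand.task_options.min_accuracy", "min-accuracy",
                               choices=(0.75, 0.85, 0.95),
                               name="Minimum Accuracy",
                               description="Minimum predicted accuracy",
                               default=0.85)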
:param option_id: fully-qualified option name used in tool contract layer, of form "pbcommand.task_options.my_option" :param option_str: shorter parameter name, mainly used in Python argparse layer, but *without* leading dashes :param choices: allowed values :param name: plain-English name :param description: help string :param default: default value (if None, will use first choice) """ raise NotImplementedError _validate_argparse_int = functools.partial(_validate_option_or_cast, int) _validate_argparse_float = functools.partial(_validate_option_or_cast, float) _validate_argparse_bool = functools.partial(_validate_option_or_cast, bool) _validate_argparse_str = functools.partial(_validate_option_or_cast, str) class PyParser(PbParserBase): """PbParser backed that supports argparse""" def __init__(self, tool_id, version, name, description, subcomponents=()): super(PyParser, self).__init__(tool_id, version, name, description) self.parser = argparse.ArgumentParser(#version=version, description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=True) self.parser.version = version self.parser.add_argument('--version', action="version", help="show program's version number and exit") if subcomponents: add_subcomponent_versions_option(self.parser, subcomponents) def add_input_file_type(self, file_type, file_id, name, description): # this will propagate up the label to the exception vfunc = functools.partial(_validate_file, file_id) self.parser.add_argument(file_id, type=vfunc, help=description) def add_output_file_type(self, file_type, file_id, name, description, default_name): self.parser.add_argument(file_id, type=str, help=description) def add_int(self, option_id, option_str, default, name, description): # FIXME Need to better define and validate option_str opt = "--" + option_str self.parser.add_argument(opt, type=_validate_argparse_int, help=description, default=_validate_argparse_int(default)) def add_float(self, option_id, option_str, default, name, description): if isinstance(default, int): default = float(default) opt = "--" + option_str self.parser.add_argument(opt, type=_validate_argparse_float, help=description, default=_validate_argparse_float(default)) def add_str(self, option_id, option_str, default, name, description): # Fixme opt = "--" + option_str self.parser.add_argument(opt, type=_validate_argparse_str, help=description, default=_validate_argparse_str(default)) def add_boolean(self, option_id, option_str, default, name, description): """ Note, the default value is set by NOT setting the option. Example, if you have option_str of --my-option with a default value of True, if --my-option is NOT provided, the value is True, if the --my-option is provided, then the value is false. 
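        In other words (tool and file names are illustrative):

            my-tool input.txt               # my_option resolves to True (the default)
            my-tool --my-option input.txt   # my_option resolves to False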
""" d = {True: "store_true", False: "store_false"} opt = '--' + option_str self.parser.add_argument(opt, action=d[_validate_argparse_bool(not default)], help=description) def _add_choice_base(self, opt_type, option_id, option_str, choices, name, description, default=None): if default is None: default = choices[0] opt = '--' + option_str self.parser.add_argument(opt, action="store", choices=choices, type=opt_type, help=description, default=default) def add_choice_str(self, option_id, option_str, choices, name, description, default=None): return self._add_choice_base(str, option_id, option_str, choices, name, description, default) def add_choice_int(self, option_id, option_str, choices, name, description, default=None): return self._add_choice_base(int, option_id, option_str, choices, name, description, default) def add_choice_float(self, option_id, option_str, choices, name, description, default=None): return self._add_choice_base(float, option_id, option_str, choices, name, description, default) class ToolContractParser(PbParserBase): """Parser to support Emitting and running ToolContracts""" def __init__(self, tool_id, version, name, description, task_type, driver, nproc_symbol, resource_types): """Keeps the required elements for creating an instance of a ToolContract""" super(ToolContractParser, self).__init__(tool_id, version, name, description) self.input_types = [] self.output_types = [] # List of PacBioOption and subclasses of PacBioOption self.options = [] self.driver = driver self.name = name self.nproc_symbol = nproc_symbol self.resource_types = resource_types self.task_type = task_type def add_input_file_type(self, file_type, file_id, name, description): x = InputFileType(file_type.file_type_id, file_id, name, description) self.input_types.append(x) def add_output_file_type(self, file_type, file_id, name, description, default_name): x = OutputFileType(file_type.file_type_id, file_id, name, description, default_name) self.output_types.append(x) def add_int(self, option_id, option_str, default, name, description): self.options.append(PacBioIntOption(option_id, name, default, description)) def add_float(self, option_id, option_str, default, name, description): if isinstance(default, int): default = float(default) self.options.append(PacBioFloatOption(option_id, name, default, description)) def add_str(self, option_id, option_str, default, name, description): self.options.append(PacBioStringOption(option_id, name, default, description)) def add_boolean(self, option_id, option_str, default, name, description): self.options.append(PacBioBooleanOption(option_id, name, default, description)) def add_choice_str(self, option_id, option_str, choices, name, description, default=None): if default is None: default = choices[0] self.options.append(PacBioStringChoiceOption(option_id, name, default, description, choices)) def add_choice_int(self, option_id, option_str, choices, name, description, default=None): if default is None: default = choices[0] self.options.append(PacBioIntChoiceOption(option_id, name, default, description, choices)) def add_choice_float(self, option_id, option_str, choices, name, description, default=None): if default is None: default = choices[0] self.options.append(PacBioFloatChoiceOption(option_id, name, default, description, choices)) def to_tool_contract(self): # Not a well formed tool contract, must have at least one input and # one output if not self.input_types and not self.output_types: raise ValueError("Malformed tool contract inputs") task = 
ToolContractTask(self.tool_id, self.name, self.description, self.version, self.task_type, self.input_types, self.output_types, self.options, self.nproc_symbol, self.resource_types) tc = ToolContract(task, self.driver) # this should just return TC, not tc.to_dict() return tc class ScatterToolContractParser(ToolContractParser): def __init__(self, tool_id, version, name, description, task_type, driver, nproc_symbol, resource_types, chunk_keys, nchunks): super(ScatterToolContractParser, self).__init__(tool_id, version, name, description, task_type, driver, nproc_symbol, resource_types) self.chunk_keys = chunk_keys self.nchunks = nchunks def to_tool_contract(self): task = ScatterToolContractTask(self.tool_id, self.name, self.description, self.version, self.task_type, self.input_types, self.output_types, self.options, self.nproc_symbol, self.resource_types, self.chunk_keys, self.nchunks) tc = ToolContract(task, self.driver) return tc class GatherToolContractParser(ToolContractParser): def to_tool_contract(self): task = GatherToolContractTask(self.tool_id, self.name, self.description, self.version, self.task_type, self.input_types, self.output_types, self.options, self.nproc_symbol, self.resource_types) tc = ToolContract(task, self.driver) return tc class PbParser(PbParserBase): """ Wrapper class for managing separate tool contract and argument parsers (stored as tool_contract_parser and arg_parser attributes respectively). """ def __init__(self, tool_contract_parser, arg_parser, *parsers, **kwds): """ :param tool_contract_parser: :type tool_contract_parser: ToolContractParser :param arg_parser: :type arg_parser: PyParser :param parsers: :return: """ # Tool Contract Parser self.tool_contract_parser = tool_contract_parser # python wrapper parser. self.arg_parser = arg_parser # add options, so it will show up via --help add_base_options_with_emit_tool_contract(self.arg_parser.parser, default_level=kwds.get("default_level", "INFO")) # a list of other parsers that adhere to the PbParserBase interface # can be used. self.other_parsers = parsers # for now assume parsers have the same version, id, ... 
tool_id = tool_contract_parser.tool_id version = tool_contract_parser.version name = tool_contract_parser.name description = tool_contract_parser.description super(PbParser, self).__init__(tool_id, version, name, description) @property def parsers(self): return [self.tool_contract_parser, self.arg_parser] def _dispatch(self, f_name, args, kwds): for parser in self.parsers: f = getattr(parser, f_name) f(*args, **kwds) def add_input_file_type(self, file_type, file_id, name, description): args = file_type, file_id, name, description self._dispatch("add_input_file_type", args, {}) def add_output_file_type(self, file_type, file_id, name, description, default_name): args = file_type, file_id, name, description, default_name self._dispatch("add_output_file_type", args, {}) def add_int(self, option_id, option_str, default, name, description): args = option_id, option_str, default, name, description self._dispatch("add_int", args, {}) def add_float(self, option_id, option_str, default, name, description): args = option_id, option_str, default, name, description self._dispatch("add_float", args, {}) def add_str(self, option_id, option_str, default, name, description): args = option_id, option_str, default, name, description self._dispatch("add_str", args, {}) def add_boolean(self, option_id, option_str, default, name, description): args = option_id, option_str, default, name, description self._dispatch("add_boolean", args, {}) def add_choice_str(self, option_id, option_str, choices, name, description, default=None): args = option_id, option_str, choices, name, description, default self._dispatch("add_choice_str", args, {}) def add_choice_int(self, option_id, option_str, choices, name, description, default=None): args = option_id, option_str, choices, name, description, default self._dispatch("add_choice_int", args, {}) def add_choice_float(self, option_id, option_str, choices, name, description, default=None): args = option_id, option_str, choices, name, description, default self._dispatch("add_choice_float", args, {}) def to_contract(self): return self.tool_contract_parser.to_tool_contract() def _factory(tool_id, version, name, description, subcomponents, default_level): def _f(tc_parser): arg_parser = PyParser(tool_id, version, name, description, subcomponents=subcomponents) return PbParser(tc_parser, arg_parser, default_level=default_level) return _f def get_pbparser(tool_id, version, name, description, driver_exe, is_distributed=True, nproc=1, resource_types=(), subcomponents=(), serialization='json', default_level="INFO"): """ Central point of creating a Tool contract that can emit and run tool contracts. 
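    A minimal sketch (task id, driver and option are illustrative; assumes
    FileTypes is imported from pbcommand.models):

        p = get_pbparser("pbcommand.tasks.my_task", "0.1.0", "My Task",
                         "Example dev task", "python -m my_pkg.my_task",
                         is_distributed=False, nproc=1)
        p.add_input_file_type(FileTypes.FASTA, "fasta_in", "Fasta",
                              "Input Fasta file")
        p.add_int("pbcommand.task_options.min_length", "min-length", 25,
                  "Minimum Length", "Minimum sequence length")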
:returns: PbParser object """ driver = ToolDriver(driver_exe, serialization=serialization) tc_parser = ToolContractParser(tool_id, version, name, description, is_distributed, driver, nproc, resource_types) return _factory(tool_id, version, name, description, subcomponents, default_level)(tc_parser) def get_scatter_pbparser(tool_id, version, name, description, driver_exe, chunk_keys, is_distributed=True, nproc=1, nchunks=SymbolTypes.MAX_NCHUNKS, resource_types=(), subcomponents=(), serialization='json', default_level="INFO"): """Create a Scatter Tool""" driver = ToolDriver(driver_exe, serialization=serialization) tc_parser = ScatterToolContractParser(tool_id, version, name, description, is_distributed, driver, nproc, resource_types, chunk_keys, nchunks) return _factory(tool_id, version, name, description, subcomponents, default_level)(tc_parser) def get_gather_pbparser(tool_id, version, name, description, driver_exe, is_distributed=True, nproc=1, resource_types=(), subcomponents=(), serialization='json', default_level="INFO"): """Create a Gather tool""" driver = ToolDriver(driver_exe, serialization=serialization) tc_parser = GatherToolContractParser(tool_id, version, name, description, is_distributed, driver, nproc, resource_types) return _factory(tool_id, version, name, description, subcomponents, default_level)(tc_parser) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/common_options.py0000644000000000000000000001044513035554276024545 0ustar rootroot"""Common options and utils that can me used in commandline utils""" import logging import argparse import sys RESOLVED_TOOL_CONTRACT_OPTION = "--resolved-tool-contract" EMIT_TOOL_CONTRACT_OPTION = "--emit-tool-contract" def add_debug_option(p): p.add_argument("--pdb", action="store_true", default=False, help="Enable Python debugger") return p def add_log_debug_option(p): """This requires the log-level option""" p.add_argument('--debug', action="store_true", default=False, help="Alias for setting log level to DEBUG") return p def add_log_quiet_option(p): """This requires the log-level option""" p.add_argument('--quiet', action="store_true", default=False, help="Alias for setting log level to CRITICAL to suppress output.") return p def add_log_verbose_option(p): p.add_argument( "-v", "--verbose", dest="verbosity", action="count", help="Set the verbosity level.") return p def add_log_level_option(p, default_level='INFO'): """Add logging level with a default value""" if isinstance(default_level, int): default_level = logging.getLevelName(default_level) p.add_argument('--log-level', choices=('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'), default=default_level, help="Set log level") return p def add_log_file_option(p): p.add_argument('--log-file', default=None, type=str, help="Write the log to file. Default(None) will write to stdout.") return p def add_resolved_tool_contract_option(p): p.add_argument(RESOLVED_TOOL_CONTRACT_OPTION, type=str, help="Run Tool directly from a PacBio Resolved tool contract") return p def add_emit_tool_contract_option(p): p.add_argument(EMIT_TOOL_CONTRACT_OPTION, action="store_true", default=False, help="Emit Tool Contract to stdout") return p def add_base_options(p, default_level='INFO'): """Add the core logging options to the parser and set the default log level If you don't want the default log behavior to go to stdout, then set the default log level to be "ERROR". This will essentially suppress all output to stdout. Default behavior will only emit to stderr. 
This is essentially a '--quiet' default mode. my-tool --my-opt=1234 file_in.txt To override the default behavior: my-tool --my-opt=1234 --log-level=INFO file_in.txt Or write the file to an explict log file my-tool --my-opt=1234 --log-level=DEBUG --log-file=file.log file_in.txt """ # This should automatically/required be added to be added from get_default_argparser add_log_file_option(p) p_log = p.add_mutually_exclusive_group() add_log_verbose_option(add_log_quiet_option(add_log_debug_option( add_log_level_option(p_log, default_level=default_level)))) return p def add_common_options(p, default_level='INFO'): """ New model for 3.1 release. This should replace add_base_options """ return add_log_quiet_option(add_log_debug_option(add_log_level_option(add_log_file_option(p), default_level=default_level))) def add_base_options_with_emit_tool_contract(p, default_level='INFO'): # can't use compose here because of circular imports via parser return add_base_options(add_resolved_tool_contract_option(add_emit_tool_contract_option(p)), default_level=default_level) def _to_print_message_action(msg): class PrintMessageAction(argparse.Action): """Print message and exit""" def __call__(self, parser, namespace, values, option_string=None): sys.stdout.write(msg + "\n") sys.exit(0) return PrintMessageAction def add_subcomponent_versions_option(p, subcomponents): """Add subcomponents to a subparser to provide more information about the tools dependencies. Subcomponents must be provided as a list of tuples (component, version) """ max_length = max(len(x) for x, _ in subcomponents) pad = 2 msg = "\n" .join([" : ".join([x.rjust(max_length + pad), y]) for x, y in subcomponents]) action = _to_print_message_action(msg) p.add_argument("--versions", nargs=0, help="Show versions of individual components", action=action) return p pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/__init__.py0000644000000000000000000000042713035554276023240 0ustar rootrootVERSION = (0, 5, 3) def get_version(): """Return the version as a string. "O.7" This uses a major.minor.tiny to be compatible with semver spec. .. note:: This should be improved to be compliant with PEP 386. 
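    For example, with the VERSION defined above:

        >>> get_version()
        '0.5.3'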
""" return ".".join([str(i) for i in VERSION]) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/testkit/0000755000000000000000000000000013035554276022613 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/testkit/core.py0000644000000000000000000001564113035554276024124 0ustar rootroot import os import unittest import logging import tempfile import subprocess from .base_utils import (HAS_PBCORE, pbcore_skip_msg, get_temp_file, get_temp_dir) from pbcommand.resolver import (resolve_tool_contract, resolve_gather_tool_contract, resolve_scatter_tool_contract) from pbcommand.models import ResolvedToolContract, PipelineChunk from pbcommand.pb_io import (load_tool_contract_from, load_resolved_tool_contract_from) from pbcommand.pb_io.tool_contract_io import write_resolved_tool_contract log = logging.getLogger(__name__) class PbTestApp(unittest.TestCase): """Generic Harness for running tool contracts end-to-end""" # if the base command is defined, DRIVER_EMIT and DRIVER_RESOLVE can be # guessed automatically DRIVER_BASE = None # complete Emit a tool contract DRIVER_EMIT = "" # Run tool from a resolve tool contract JSON file DRIVER_RESOLVE = "" # Requires Pbcore REQUIRES_PBCORE = False # input files that will be passed to the resolver # To get example files use, get_data_file("example.txt")] INPUT_FILES = [] # Arguments passed to the Resolver MAX_NPROC = 1 TASK_OPTIONS = {} # These will be checked against the resolved tool contract values RESOLVED_TASK_OPTIONS = {} RESOLVED_NPROC = 1 IS_DISTRIBUTED = False RESOLVED_IS_DISTRIBUTED = False @classmethod def setUpClass(cls): if cls.DRIVER_BASE is not None: if cls.DRIVER_EMIT == "": cls.DRIVER_EMIT = cls.DRIVER_BASE + " --emit-tool-contract " if cls.DRIVER_RESOLVE == "": cls.DRIVER_RESOLVE = cls.DRIVER_BASE + " --resolved-tool-contract " def _test_outputs_exists(self, rtc): """:type rtc: pbcommand.models.ResolvedToolContract""" log.debug("validating output file existence from {r}".format(r=rtc)) log.debug("Resolved Output files from {t}".format(t=rtc.task.task_id)) log.debug(rtc.task.output_files) # the output files should all have unique paths, otherwise the resolver # has failed emsg = "Non-unique outputs. 
{o}".format(o=rtc.task.output_files) self.assertEquals(len(rtc.task.output_files), len(set(rtc.task.output_files)), emsg) for i, output_file in enumerate(rtc.task.output_files): msg = "Unable to find {i}-th output file {p}".format(i=i, p=output_file) self.assertTrue(os.path.exists(output_file), msg) def _to_rtc(self, tc, output_dir, tmp_dir): # handled the polymorphism in subclasses by overriding return resolve_tool_contract(tc, self.INPUT_FILES, output_dir, tmp_dir, self.MAX_NPROC, self.TASK_OPTIONS, self.IS_DISTRIBUTED) def test_run_e2e(self): # hack to skip running the base Test class (which is the nose default behavior) if self.__class__.__name__ in ('PbTestApp', 'PbTestScatterApp', 'PbTestGatherApp'): return if self.REQUIRES_PBCORE: if not HAS_PBCORE: self.assertTrue(True, pbcore_skip_msg("Skipping running e2e for {d}".format(d=self.DRIVER_EMIT))) return output_dir = get_temp_dir(suffix="rtc-test") tmp_dir = get_temp_dir(suffix="rtc-temp") log.debug("Driver {e}".format(e=self.DRIVER_EMIT)) log.debug("input files {i}".format(i=self.INPUT_FILES)) log.debug("running in {p}".format(p=output_dir)) output_tc = get_temp_file("-{n}-tool_contract.json".format(n=self.__class__.__name__), output_dir) emit_tc_exe = "{e} > {o}".format(e=self.DRIVER_EMIT, o=output_tc) rcode = subprocess.call([emit_tc_exe], shell=True) self.assertEquals(rcode, 0, "Emitting tool contract failed for '{e}'".format(e=emit_tc_exe)) # sanity marshall-unmashalling log.debug("Loading tool-contract from {p}".format(p=output_tc)) tc = load_tool_contract_from(output_tc) log.info("Resolving tool contract to RTC") rtc = self._to_rtc(tc, output_dir, tmp_dir) output_json_rtc = get_temp_file("resolved_tool_contract.json", output_dir) write_resolved_tool_contract(rtc, output_json_rtc) # sanity loaded_rtc = load_resolved_tool_contract_from(output_json_rtc) self.assertIsInstance(loaded_rtc, ResolvedToolContract) # Test Resolved options if specified. for opt, resolved_value in self.RESOLVED_TASK_OPTIONS.iteritems(): self.assertTrue(opt in rtc.task.options, "Resolved option {x} not in RTC options.".format(x=opt)) # this needs to support polymorphic equals (i.e., almostEquals if not isinstance(resolved_value, float): emsg = "Resolved option {o} are not equal. Expected '{a}', got '{b}'".format(o=opt, b=rtc.task.options[opt], a=resolved_value) self.assertEquals(rtc.task.options[opt], resolved_value, emsg) # Resolved NPROC self.assertEquals(rtc.task.nproc, self.RESOLVED_NPROC) self.assertEquals(rtc.task.is_distributed, self.RESOLVED_IS_DISTRIBUTED) log.info("running resolved contract {r}".format(r=output_json_rtc)) exe = "{d} {p}".format(p=output_json_rtc, d=self.DRIVER_RESOLVE) log.info("Running exe '{e}'".format(e=exe)) with tempfile.TemporaryFile() as stdout: rcode = subprocess.call([exe], shell=True, stdout=stdout) self.assertEquals(rcode, 0, "Running from resolved tool contract failed from {x}".format(x=exe)) log.info("Successfully completed running e2e for {d}".format(d=self.DRIVER_EMIT)) self._test_outputs_exists(rtc) self.run_after(rtc, output_dir) def run_after(self, rtc, output_dir): """ Optional additional test code, e.g. to verify that the job produced the expected outputs. This is run automatically by test_run_e2e, but does nothing unless overridden in a subclass. 
""" pass class PbTestScatterApp(PbTestApp): """Test harness for testing end-to-end scattering apps Override MAX_NCHUNKS, RESOLVED_MAX_NCHUNKS and CHUNK_KEYS """ MAX_NCHUNKS = 12 RESOLVED_MAX_NCHUNKS = 12 CHUNK_KEYS = () def _to_rtc(self, tc, output_dir, tmp_dir): return resolve_scatter_tool_contract(tc, self.INPUT_FILES, output_dir, tmp_dir, self.MAX_NPROC, self.TASK_OPTIONS, self.MAX_NCHUNKS, self.CHUNK_KEYS, self.IS_DISTRIBUTED) class PbTestGatherApp(PbTestApp): """Test harness for testing end-to-end gather apps Override the CHUNK_KEY to pass that into your resolver """ CHUNK_KEY = PipelineChunk.CHUNK_KEY_PREFIX + 'fasta_id' def _to_rtc(self, tc, output_dir, tmp_dir): return resolve_gather_tool_contract(tc, self.INPUT_FILES, output_dir, tmp_dir, self.MAX_NPROC, self.TASK_OPTIONS, self.CHUNK_KEY, self.IS_DISTRIBUTED) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/testkit/__init__.py0000644000000000000000000000003413035554276024721 0ustar rootrootfrom .core import PbTestApp pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/testkit/base_utils.py0000644000000000000000000000106113035554276025315 0ustar rootrootimport os import tempfile HAS_PBCORE = False try: import pbcore HAS_PBCORE = True except ImportError: HAS_PBCORE = False def pbcore_skip_msg(msg=None): msg = "" if msg is None else msg return "" if HAS_PBCORE else "pbcore is not installed. {m}".format(m=msg) def get_temp_file(suffix, dir_): t = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir=dir_) t.close() return t.name def get_temp_dir(suffix=""): """This will make subdir in the root tmp dir""" return tempfile.mkdtemp(dir=None, suffix=suffix) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/engine/0000755000000000000000000000000013035554276022371 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/engine/runner.py0000644000000000000000000000456513035554276024266 0ustar rootroot"""Utils for Running an external process""" import logging import tempfile import shlex import platform import subprocess import time from collections import namedtuple log = logging.getLogger(__name__) ExtCmdResult = namedtuple("ExtCmdResult", "exit_code cmd run_time") def run_cmd(cmd, stdout_fh, stderr_fh, shell=True, time_out=None, env=None, executable="/bin/bash"): """Run external command :param: cmd: External command :param time_out: Timeout in seconds. :type time_out: None | int :return: ExtCmdResult This could all be bundled into a context manager with RunCommand('/path/stdout', '/path/to/stderr') as r: r.exe("echo 'exe1') r.exe("echo 'exe2') result = r.get_result() # close the file handles """ # Clarify with Dave # add simple usecase with no file handles, get stderr back as str # stdout, stderr. In general, stdout can be large # add env={} # sleeptime scaling started_at = time.time() # Most of the current pacbio shell commands have aren't shlex-able if not shell: cmd = shlex.split(cmd) hostname = platform.node() log.debug("calling cmd '{c}' on {h}".format(c=cmd, h=hostname)) process = subprocess.Popen(cmd, stderr=stderr_fh, stdout=stdout_fh, shell=shell, executable=executable, env=env) # This needs a better dynamic model max_sleep_time = 5 sleep_time = 0.1 dt = 0.1 process.poll() while process.returncode is None: process.poll() time.sleep(sleep_time) run_time = time.time() - started_at if time_out is not None: if run_time > time_out: log.warn("Exceeded TIMEOUT of {t}. 
Killing cmd '{c}'".format(t=time_out, c=cmd)) try: # ask for forgiveness model process.kill() except OSError: # already been killed pass if sleep_time < max_sleep_time: sleep_time += dt run_time = time.time() - started_at run_time = run_time returncode = process.returncode log.debug("returncode is {r} in {s:.2f} sec.".format(r=process.returncode, s=run_time)) return ExtCmdResult(returncode, cmd, run_time) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/engine/__init__.py0000644000000000000000000000005213035554276024477 0ustar rootrootfrom .runner import run_cmd, ExtCmdResult pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/interactive_resolver.py0000644000000000000000000000517013035554276025737 0ustar rootroot#!/usr/bin/env python from __future__ import unicode_literals import os import sys import warnings from pbcommand.cli import get_default_argparser from pbcommand.models import SymbolTypes from pbcommand.pb_io import (load_tool_contract_from, write_resolved_tool_contract, write_resolved_tool_contract_avro) from pbcommand.resolver import resolve_tool_contract try: from prompt_toolkit.filters import Always from prompt_toolkit.shortcuts import get_input except ImportError: sys.stderr.write("interactive resolver requires 'prompt_toolkit' (pip install prompt_toolkit)\n") raise def run_main(tc): """:type tc: ToolContract""" print "Loaded tc {c}".format(c=tc) if tc.task.nproc == SymbolTypes.MAX_NPROC: nproc = get_input('Enter max nproc: ') else: # not quite right nproc = 1 output_dir = get_input('Output Directory: ', enable_system_bindings=Always()) output_dir = os.path.abspath(output_dir) input_files = [] for i, input_type in enumerate(tc.task.input_file_types): in_path = get_input(" {i} file {p} path :".format(i=i, p=input_type)) if not os.path.exists(in_path): warnings.warn("Unable to find {p}".format(p=in_path)) # Make sure all inputs are abspaths p = in_path if os.path.isabs(in_path) else os.path.abspath(in_path) input_files.append(p) tool_options = {} rtc = resolve_tool_contract(tc, input_files, output_dir, '/tmp', int(nproc), tool_options, is_distributable=False) print rtc def to_n(ext): return "resolved_tool_contract." 
+ ext def to_f(ext): return "_".join([tc.task.task_id, to_n(ext)]) def to_p(ext): return os.path.join(output_dir, to_f(ext)) rtc_path = to_p("json") print "writing RTC to {f}".format(f=rtc_path) # Always write the JSON RTC file write_resolved_tool_contract(rtc, rtc_path) if rtc.driver.serialization.lower() == "avro": avro_rtc_path = to_p("avro") print "writing AVRO RTC to {f}".format(f=avro_rtc_path) write_resolved_tool_contract_avro(rtc, avro_rtc_path) return rtc def _run_main(args): return run_main(load_tool_contract_from(args.tc_path)) def get_parser(): p = get_default_argparser("0.1.0", "Interactive tool for resolving Tool Contracts") p.add_argument("tc_path", type=str, help='Path to Tool Contract') p.set_defaults(func=_run_main) return p def main(argv=sys.argv): p = get_parser() args = p.parse_args(argv[1:]) args.func(args) return 0 if __name__ == '__main__': sys.exit(main()) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/utils.py0000644000000000000000000003430613035554276022644 0ustar rootroot"""Utils for common funcs, such as setting up a log, composing functions.""" import functools import os import logging import logging.config import argparse import pprint import traceback import time import types import subprocess from contextlib import contextmanager import xml.etree.ElementTree as ET from pbcommand.models import FileTypes, DataSetMetaData log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) # suppress the annoying no handlers msg class Constants(object): """Log Level format strings""" LOG_FMT_ONLY_MSG = '%(message)s' LOG_FMT_ERR = '%(message)s' LOG_FMT_LVL = '[%(levelname)s] %(message)s' LOG_FMT_MIN = '[%(asctime)-15sZ] %(message)s' LOG_FMT_SIMPLE = '[%(levelname)s] %(asctime)-15sZ %(message)s' LOG_FMT_STD = '[%(levelname)s] %(asctime)-15sZ [%(name)s] %(message)s' LOG_FMT_FULL = '[%(levelname)s] %(asctime)-15sZ [%(name)s %(funcName)s %(lineno)d] %(message)s' class ExternalCommandNotFoundError(Exception): """External command is not found in Path""" pass def _handler_stream_d(stream, level_str, formatter_id): d = {'level': level_str, 'class': "logging.StreamHandler", 'formatter': formatter_id, 'stream': stream} return d _handler_stdout_stream_d = functools.partial(_handler_stream_d, "ext://sys.stdout") _handler_stderr_stream_d = functools.partial(_handler_stream_d, "ext://sys.stderr") def _handler_file(level_str, path, formatter_id): d = {'class': 'logging.FileHandler', 'level': level_str, 'formatter': formatter_id, 'filename': path} return d def _get_default_logging_config_dict(level, file_name_or_none, formatter): """ Setup a logger to either a file or console. If file name is none, then a logger will be setup to stdout. :note: adds console Returns a dict configuration of the logger. 
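    The returned dict follows the logging.config.dictConfig schema and is
    applied by _setup_logging_config_d below, e.g. (sketch):

        d = _get_default_logging_config_dict(logging.INFO, None, Constants.LOG_FMT_STD)
        logging.config.dictConfig(d)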
""" level_str = logging.getLevelName(level) formatter_id = 'custom_logger_fmt' console_handler_id = "console_handler" error_fmt_id = "error_fmt_id" error_handler_id = "error_handler" error_handler_d = _handler_stderr_stream_d(logging.ERROR, error_fmt_id) if file_name_or_none is None: handler_d = _handler_stdout_stream_d(level_str, formatter_id) else: handler_d = _handler_file(level_str, file_name_or_none, formatter_id) formatters_d = {fid: {'format': fx} for fid, fx in [(formatter_id, formatter), (error_fmt_id, Constants.LOG_FMT_ERR)]} handlers_d = {console_handler_id: handler_d, error_handler_id: error_handler_d} loggers_d = {"custom": {'handlers': [console_handler_id], 'stderr': {'handlers': [error_handler_id]}}} d = { 'version': 1, 'disable_existing_loggers': False, # this fixes the problem 'formatters': formatters_d, 'handlers': handlers_d, 'loggers': loggers_d, 'root': {'handlers': [error_handler_id, console_handler_id], 'level': logging.NOTSET} } #print pprint.pformat(d) return d def _get_console_and_file_logging_config_dict(console_level, console_formatter, path, path_level, path_formatter): """ Get logging configuration that is both for console and a file. :note: A stderr logger handler is also added. """ def _to_handler_d(handlers_, level): return {"handlers": handlers_, "level": level, "propagate": True} console_handler_id = "console_handler" console_fmt_id = "console_fmt" console_handler_d = _handler_stdout_stream_d(console_level, console_fmt_id) stderr_handler_id = "stderr_handler" error_fmt_id = "error_fmt" stderr_handler_d = _handler_stderr_stream_d(logging.ERROR, console_fmt_id) file_handler_id = "file_handler" file_fmt_id = "file_fmt" file_handler_d = _handler_file(path_level, path, file_fmt_id) formatters = {console_fmt_id: {"format": console_formatter}, file_fmt_id: {"format": path_formatter}, error_fmt_id: {"format": Constants.LOG_FMT_ERR} } handlers = {console_handler_id: console_handler_d, file_handler_id: file_handler_d, stderr_handler_id: stderr_handler_d} loggers = {"console": _to_handler_d([console_handler_id], console_level), "custom_file": _to_handler_d([file_handler_id], path_level), "stderr_err": _to_handler_d([stderr_handler_id], logging.ERROR) } d = {'version': 1, 'disable_existing_loggers': False, # this fixes the problem 'formatters': formatters, 'handlers': handlers, 'loggers': loggers, 'root': {'handlers': handlers.keys(), 'level': logging.DEBUG} } # print pprint.pformat(d) return d def _setup_logging_config_d(d): logging.config.dictConfig(d) logging.Formatter.converter = time.gmtime return d def setup_logger(file_name_or_none, level, formatter=Constants.LOG_FMT_FULL): """ :param file_name_or_none: Path to log file, None will default to stdout :param level: logging.LEVEL of :param formatter: Log Formatting string """ d = _get_default_logging_config_dict(level, file_name_or_none, formatter) return _setup_logging_config_d(d) def setup_console_and_file_logger(stdout_level, stdout_formatter, path, path_level, path_formatter): d = _get_console_and_file_logging_config_dict(stdout_level, stdout_formatter, path, path_level, path_formatter) return _setup_logging_config_d(d) def setup_log(alog, level=logging.INFO, file_name=None, log_filter=None, str_formatter=Constants.LOG_FMT_FULL): """Core Util to setup log handler :param alog: a log instance :param level: (int) Level of logging debug :param file_name: (str, None) if None, stdout is used, str write to file :param log_filter: (LogFilter, None) :param str_formatter: (str) log formatting str .. 
warning:: THIS NEEDS TO BE DEPRECATED """ setup_logger(file_name, level, formatter=str_formatter) # FIXME. Keeping the interface, but the specific log instance isn't used, # the python logging setup mutates global state if log_filter is not None: alog.warn("log_filter kw is no longer supported") return alog def get_parsed_args_log_level(pargs, default_level=logging.INFO): """ Utility for handling logging setup flexibly in a variety of use cases, assuming standard command-line arguments. :param pargs: argparse namespace or equivalent :param default_level: logging level to use if the parsed arguments do not specify one """ level = default_level if isinstance(level, basestring): level = logging.getLevelName(level) if hasattr(pargs, 'verbosity') and pargs.verbosity > 0: if pargs.verbosity >= 2: level = logging.DEBUG else: level = logging.INFO elif hasattr(pargs, 'debug') and pargs.debug: level = logging.DEBUG elif hasattr(pargs, 'quiet') and pargs.quiet: level = logging.ERROR elif hasattr(pargs, 'log_level'): level = logging.getLevelName(pargs.log_level) return level def log_traceback(alog, ex, ex_traceback): """ Log a python traceback in the log file :param ex: python Exception instance :param ex_traceback: exception traceback Example Usage (assuming you have a log instance in your scope) :Example: >>> try: >>> 1 / 0 >>> except Exception as e: >>> msg = "{i} failed validation. {e}".format(i=item, e=e) >>> log.error(msg) >>> _, _, ex_traceback = sys.exc_info() >>> log_traceback(log, e, ex_traceback) """ tb_lines = traceback.format_exception(ex.__class__, ex, ex_traceback) tb_text = ''.join(tb_lines) alog.error(tb_text) def validate_type_or_raise(instance, type_or_types, error_prefix=None): _d = dict(t=instance, x=type(instance), v=instance) e = error_prefix if error_prefix is not None else "" msg = e + "Expected type {t}. Got type {x} for {v}".format(**_d) if not isinstance(instance, type_or_types): raise TypeError(msg) else: return instance def _simple_validate_type(atype, instance): return validate_type_or_raise(instance, atype) _is_argparser_instance = functools.partial(_simple_validate_type, argparse.ArgumentParser) def is_argparser_instance(func): @functools.wraps def wrapper(*args, **kwargs): _is_argparser_instance(args[0]) return func(*args, **kwargs) return wrapper def compose(*funcs): """ Functional composition of a non-empty list [f, g, h] will be f(g(h(x))) :Example: >>> f = lambda x: x * x >>> g = lambda x: x + 1 >>> h = lambda x: x * 2 >>> funcs = [f, g, h] >>> fgh = compose(*funcs) >>> fgh(3) # 49 >>> compose(f, g, h)(3) """ if not funcs: raise ValueError("Compose only supports non-empty lists") for func in funcs: if not isinstance(func, (types.BuiltinMethodType, functools.partial, types.MethodType, types.BuiltinFunctionType, types.FunctionType)): raise TypeError("Only Function types are supported") def compose_two(f, g): def c(x): return f(g(x)) return c return functools.reduce(compose_two, funcs) def which(exe_str): """walk the current PATH for exe_str to get the absolute path of the exe :param exe_str: Executable name :rtype: str | None :returns Absolute path to the executable or None if the exe is not found """ paths = os.environ.get('PATH', None) resolved_exe = None if paths is None: # log warning msg = "PATH env var is not defined." 
log.error(msg) return resolved_exe for path in paths.split(":"): exe_path = os.path.join(path, exe_str) # print exe_path if os.path.exists(exe_path): resolved_exe = exe_path break # log.debug("Resolved cmd {e} to {x}".format(e=exe_str, x=resolved_exe)) return resolved_exe def which_or_raise(cmd): """Find exe in path or raise ExternalCommandNotFoundError""" resolved_cmd = which(cmd) if resolved_cmd is None: raise ExternalCommandNotFoundError("Unable to find required cmd '{c}'".format(c=cmd)) else: return resolved_cmd class Singleton(type): """ General Purpose singleton class Usage: >>> class MyClass(object): >>> __metaclass__ = Singleton >>> def __init__(self): >>> self.name = 'name' """ def __init__(cls, name, bases, dct): super(Singleton, cls).__init__(name, bases, dct) cls.instance = None def __call__(cls, *args, **kw): if cls.instance is None: cls.instance = super(Singleton, cls).__call__(*args) return cls.instance def nfs_exists_check(ff): """ Central place for all NFS hackery Return whether a file or a dir ff exists or not. Call listdir() instead of os.path.exists() to eliminate NFS errors. Added try/catch black hole exception cases to help trigger an NFS refresh :rtype bool: """ try: # All we really need is opendir(), but listdir() is usually fast. os.listdir(os.path.dirname(os.path.realpath(ff))) # But is it a file or a directory? We do not know until it actually exists. if os.path.exists(ff): return True # Might be a directory, so refresh itself too. # Not sure this is necessary, since we already ran this on parent, # but it cannot hurt. os.listdir(os.path.realpath(ff)) if os.path.exists(ff): return True except OSError: pass # The rest is probably unnecessary, but it cannot hurt. # try to trigger refresh for File case try: f = open(ff, 'r') f.close() except Exception: pass # try to trigger refresh for Directory case try: _ = os.stat(ff) _ = os.listdir(ff) except Exception: pass # Call externally # this is taken from Yuan cmd = "ls %s" % ff rcode = 1 try: p = subprocess.Popen([cmd], shell=True) rcode = p.wait() except Exception: pass return rcode == 0 def nfs_refresh(path, ntimes=3, sleep_time=1.0): while True: if nfs_exists_check(path): return True ntimes -= 1 if ntimes <= 0: break time.sleep(sleep_time) log.warn("NFS refresh failed. unable to resolve {p}".format(p=path)) return False @contextmanager def ignored(*exceptions): try: yield except exceptions: pass def get_dataset_metadata(path): """ Returns DataSetMeta data or raises ValueError if dataset XML is missing the required UniqueId and MetaType values. :param path: Path to DataSet XML :raises: ValueError :return: DataSetMetaData """ uuid = mt = None for event, element in ET.iterparse(path, events=("start",)): uuid = element.get("UniqueId") mt = element.get("MetaType") break if mt in FileTypes.ALL_DATASET_TYPES().keys(): return DataSetMetaData(uuid, mt) else: raise ValueError("Unsupported dataset type '{t}'".format(t=mt)) def get_dataset_metadata_or_none(path): """ Returns DataSetMeta data, else None if the file doesn't exist or a processing of the XML raises. :param path: Path to DataSet XML :return: DataSetMetaData or None """ try: return get_dataset_metadata(path) except Exception: return None def is_dataset(path): """peek into the XML to get the MetaType and verify that it's a valid dataset :param path: Path to DataSet XML """ return get_dataset_metadata_or_none(path) is not None def walker(root_dir, file_filter_func): """ Walk the file sytem and filter by the supplied filter function. 
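    For example (path and suffix are illustrative),
    walker("/path/to/jobs", lambda f: f.endswith(".json")) is a generator over
    every JSON file under the root directory.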
Filter function F(path) -> bool """ for root, dnames, fnames in os.walk(root_dir): for fname in fnames: path = os.path.join(root, fname) if file_filter_func(path): yield path pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/services/0000755000000000000000000000000013035554276022747 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/services/service_access_layer.py0000644000000000000000000006300213035554276027477 0ustar rootroot"""Utils for Updating state/progress and results to WebServices """ import json import logging import pprint import time import requests from requests import RequestException from pbcommand.models import (FileTypes, DataSetFileType, DataStore, DataStoreFile) from pbcommand.utils import get_dataset_metadata from .models import (SMRTServiceBaseError, JobResult, JobStates, JobExeError, JobTypes, LogLevels, ServiceEntryPoint, ServiceResourceTypes, ServiceJob, JobEntryPoint) from .utils import to_ascii, to_sal_summary log = logging.getLogger(__name__) #log.addHandler(logging.NullHandler()) # to prevent the annoying 'No handlers .. ' msg class Constants(object): HEADERS = {'Content-type': 'application/json'} def _post_requests(headers): def wrapper(url, d_): data = json.dumps(d_) return requests.post(url, data=data, headers=headers) return wrapper def _get_requests(headers): def wrapper(url): return requests.get(url, headers=headers) return wrapper # These are exposed publicly as a utility, but shouldn't be used in any API # call. The _process_* are the entry points for API calls to make sure an # errors are handled correctly. rqpost = _post_requests(Constants.HEADERS) rqget = _get_requests(Constants.HEADERS) def _parse_base_service_error(response): """:type response: requests.Response Don't trust the services. Try to parse the response to SMRT Server Error datastructure (even if a 200 is returned) """ if response.ok: try: d = response.json() emsg = SMRTServiceBaseError.from_d(d) raise emsg except (KeyError, TypeError): # couldn't parse response -> error, # so everything is fine return response else: return response def _process_rget(total_url, ignore_errors=False): """Process get request and return JSON response. Raise if not successful""" r = rqget(total_url) _parse_base_service_error(r) if not r.ok and not ignore_errors: log.error("Failed ({s}) GET to {x}".format(x=total_url, s=r.status_code)) r.raise_for_status() j = r.json() return j def _process_rget_with_transform(func, ignore_errors=False): """Post process the JSON result (if successful) with F(json_d) -> T""" def wrapper(total_url): j = _process_rget(total_url, ignore_errors=ignore_errors) return func(j) return wrapper def _process_rget_with_jobs_transform(total_url, ignore_errors=False): # defining an internal method, because this used in several places jobs_d = _process_rget(total_url, ignore_errors=ignore_errors) return [ServiceJob.from_d(job_d) for job_d in jobs_d] def _process_rget_or_none(func, ignore_errors=False): """ apply the transform func to the output of GET request if it was successful, else returns None This is intended to be used for looking up Results by Id where the a 404 is found. 
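    For example, _process_rget_with_job_transform_or_none (below) wraps
    ServiceJob.from_d so that looking up a job id that does not exist returns
    None instead of raising.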
""" def wrapper(total_url): try: return _process_rget_with_transform(func, ignore_errors)(total_url) except (RequestException, SMRTServiceBaseError): # FIXME # this should be a tighter exception case # only look for 404 return None return wrapper def _process_rget_with_job_transform_or_none(total_url): return _process_rget_or_none(ServiceJob.from_d)(total_url) def _process_rpost(total_url, payload_d): r = rqpost(total_url, payload_d) _parse_base_service_error(r) # FIXME This should be strict to only return a 201 if r.status_code not in (200, 201): log.error("Failed ({s} to call {u}".format(u=total_url, s=r.status_code)) log.error("payload") log.error("\n" + pprint.pformat(payload_d)) r.raise_for_status() j = r.json() return j def _process_rpost_with_transform(func): def wrapper(total_url, payload_d): j = _process_rpost(total_url, payload_d) return func(j) return wrapper def _to_url(base, ext): return "".join([base, ext]) def _null_func(x): # Pass thorough func return x def _import_dataset_by_type(dataset_type_or_id): if isinstance(dataset_type_or_id, DataSetFileType): ds_type_id = dataset_type_or_id.file_type_id else: ds_type_id = dataset_type_or_id def wrapper(total_url, path): _d = dict(datasetType=ds_type_id, path=path) return _process_rpost_with_transform(ServiceJob.from_d)(total_url, _d) return wrapper def _get_job_by_id_or_raise(sal, job_id, error_klass, error_messge_extras=None): job = sal.get_job_by_id(job_id) if job is None: details = "" if error_messge_extras is None else error_messge_extras base_msg = "Failed to find job {i}".format(i=job_id) emsg = " ".join([base_msg, details]) raise error_klass(emsg) return job def _block_for_job_to_complete(sal, job_id, time_out=600, sleep_time=2): """ Waits for job to complete :param sal: ServiceAccessLayer :param job_id: Job Id :param time_out: Total runtime before aborting :param sleep_time: polling interval (in sec) :rtype: JobResult :raises: KeyError if job is not initially found, or JobExeError if the job fails during the polling process or times out """ time.sleep(sleep_time) job = _get_job_by_id_or_raise(sal, job_id, KeyError) log.debug("time_out = {t}".format(t=time_out)) error_msg = "" job_result = JobResult(job, 0, error_msg) started_at = time.time() # number of polling steps i = 0 while True: run_time = time.time() - started_at if job.state in JobStates.ALL_COMPLETED: break i += 1 time.sleep(sleep_time) msg = "Running pipeline {n} state: {s} runtime:{r:.2f} sec {i} iteration".format(n=job.name, s=job.state, r=run_time, i=i) log.debug(msg) # making the exceptions different to distinguish between an initial # error and a "polling" error. Adding some msg details job = _get_job_by_id_or_raise(sal, job_id, JobExeError, error_messge_extras=msg) # FIXME, there's currently not a good way to get errors for jobs job_result = JobResult(job, run_time, "") if time_out is not None: if run_time > time_out: raise JobExeError("Exceeded runtime {r} of {t}. {m}".format(r=run_time, t=time_out, m=msg)) return job_result # Make this consistent somehow. Maybe defined 'shortname' in the core model? 
# Martin is doing this for the XML file names DATASET_METATYPES_TO_ENDPOINTS = { FileTypes.DS_SUBREADS_H5: "hdfsubreads", FileTypes.DS_SUBREADS: "subreads", FileTypes.DS_ALIGN: "alignments", FileTypes.DS_REF: "references", FileTypes.DS_BARCODE: "barcodes", FileTypes.DS_CCS: "ccsreads", FileTypes.DS_CONTIG: "contigs", FileTypes.DS_ALIGN_CCS: "cssalignments", FileTypes.DS_GMAP_REF: "gmapreferences"} def _get_endpoint_or_raise(ds_type): if ds_type in DATASET_METATYPES_TO_ENDPOINTS: return DATASET_METATYPES_TO_ENDPOINTS[ds_type] raise KeyError("Unsupported datasettype {t}. Supported values {v}".format(t=ds_type, v=DATASET_METATYPES_TO_ENDPOINTS.keys())) def _job_id_or_error(job_or_error, custom_err_msg=None): """ Extract job id from job creation service (by type) or Raise exception from an EngineJob response :raises: JobExeError """ if isinstance(job_or_error, ServiceJob): return job_or_error.id else: emsg = job_or_error.get('message', "Unknown") if custom_err_msg is not None: emsg += " {f}".format(f=custom_err_msg) raise JobExeError("Failed to create job. {e}. Raw Response {x}".format(e=emsg, x=job_or_error)) def _to_host(h): prefix = "http://" return h if h.startswith(prefix) else prefix + h def _to_ds_file(d): # is_chunk this isn't exposed at the service level return DataStoreFile(d['uuid'], d['sourceId'], d['fileTypeId'], d['path'], is_chunked=False, name=d.get("name", ""), description=d.get("description", "")) def _to_datastore(dx): # Friction to get around service endpoint not returning a list of files ds_files = [_to_ds_file(d) for d in dx] return DataStore(ds_files) def _to_job_report_files(dx): return [{u"reportTypeId": d["reportTypeId"], u"dataStoreFile": _to_ds_file(d["dataStoreFile"])} for d in dx] def _to_entry_points(d): return [JobEntryPoint.from_d(i) for i in d] def _get_all_report_attributes(sal_get_reports_func, sal_get_reports_details_func, job_id): """Util func for getting report Attributes Note, this assumes that only one report type has been created. This is probably not a great idea. Should re-evaluate this. 
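    Returns a flat dict of report attribute id -> value, e.g. (ids and values
    are illustrative): {'mapped_reads_n': 12345, 'mean_readlength': 4200.0}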
""" report_datafiles = sal_get_reports_func(job_id) report_uuids = [r.values()[0].uuid for r in report_datafiles] reports = [sal_get_reports_details_func(job_id, r_uuid) for r_uuid in report_uuids] all_report_attributes = {} for r in reports: for x in r['attributes']: all_report_attributes[x['id']] = x['value'] return all_report_attributes class ServiceAccessLayer(object): """General Access Layer for interfacing with the job types on Secondary SMRT Server""" ROOT_JM = "/secondary-analysis/job-manager" ROOT_JOBS = ROOT_JM + "/jobs" ROOT_DS = "/secondary-analysis/datasets" ROOT_PT = '/secondary-analysis/resolved-pipeline-templates' # in sec when blocking to run a job JOB_DEFAULT_TIMEOUT = 60 * 30 def __init__(self, base_url, port, debug=False, sleep_time=2): self.base_url = _to_host(base_url) self.port = port # This will display verbose details with respect to the failed request self.debug = debug self._sleep_time = sleep_time @property def uri(self): return "{b}:{u}".format(b=self.base_url, u=self.port) def _to_url(self, rest): return _to_url(self.uri, rest) def __repr__(self): return "<{k} {u} >".format(k=self.__class__.__name__, u=self.uri) def to_summary(self): return to_sal_summary(self) def get_status(self): """Get status of the server""" return _process_rget(_to_url(self.uri, "/status")) def get_job_by_type_and_id(self, job_type, job_id): return _process_rget_with_job_transform_or_none(_to_url(self.uri, "{p}/{t}/{i}".format(i=job_id, t=job_type, p=ServiceAccessLayer.ROOT_JOBS))) def get_job_by_id(self, job_id): """Get a Job by int id""" # FIXME. Make this an internal method It's ambiguous which job type type you're asking for return _process_rget_with_job_transform_or_none(_to_url(self.uri, "{r}/{i}".format(i=job_id, r=ServiceAccessLayer.ROOT_JOBS))) def _get_job_resource_type(self, job_type, job_id, resource_type_id): # grab the datastore or the reports _d = dict(t=job_type, i=job_id, r=resource_type_id, p=ServiceAccessLayer.ROOT_JOBS) return _process_rget_with_job_transform_or_none(_to_url(self.uri, "{p}/{t}/{i}/{r}".format(**_d))) def _get_job_resource_type_with_transform(self, job_type, job_id, resource_type_id, transform_func): _d = dict(t=job_type, i=job_id, r=resource_type_id, p=ServiceAccessLayer.ROOT_JOBS) return _process_rget_or_none(transform_func)(_to_url(self.uri, "{p}/{t}/{i}/{r}".format(**_d))) def _get_jobs_by_job_type(self, job_type): return _process_rget_with_jobs_transform(_to_url(self.uri, "{p}/{t}".format(t=job_type, p=ServiceAccessLayer.ROOT_JOBS))) def get_analysis_jobs(self): return self._get_jobs_by_job_type(JobTypes.PB_PIPE) def get_import_dataset_jobs(self): return self._get_jobs_by_job_type(JobTypes.IMPORT_DS) def get_merge_dataset_jobs(self): return self._get_jobs_by_job_type(JobTypes.MERGE_DS) def get_fasta_convert_jobs(self): self._get_jobs_by_job_type(JobTypes.CONVERT_FASTA) def get_analysis_job_by_id(self, job_id): """Get an Analysis job by id or UUID or return None :rtype: ServiceJob """ return self.get_job_by_type_and_id(JobTypes.PB_PIPE, job_id) def get_import_job_by_id(self, job_id): return self.get_job_by_type_and_id(JobTypes.IMPORT_DS, job_id) def get_analysis_job_datastore(self, job_id): """Get DataStore output from (pbsmrtpipe) analysis job""" # this doesn't work the list is sli return self._get_job_resource_type_with_transform(JobTypes.PB_PIPE, job_id, ServiceResourceTypes.DATASTORE, _to_datastore) def get_analysis_job_reports(self, job_id): """Get list of DataStore ReportFile types output from (pbsmrtpipe) analysis job""" return 
self._get_job_resource_type_with_transform(JobTypes.PB_PIPE, job_id, ServiceResourceTypes.REPORTS, _to_job_report_files) def get_analysis_job_report_details(self, job_id, report_uuid): _d = dict(t=JobTypes.PB_PIPE, i=job_id, r=ServiceResourceTypes.REPORTS, p=ServiceAccessLayer.ROOT_JOBS, u=report_uuid) return _process_rget_or_none(lambda x: x)(_to_url(self.uri, "{p}/{t}/{i}/{r}/{u}".format(**_d))) def get_analysis_job_report_attrs(self, job_id): """Return a dict of all the Report Attributes""" return _get_all_report_attributes(self.get_analysis_job_reports, self.get_analysis_job_report_details, job_id) def get_import_job_reports(self, job_id): return self._get_job_resource_type_with_transform(JobTypes.IMPORT_DS, job_id, ServiceResourceTypes.REPORTS, _to_job_report_files) def get_import_job_report_details(self, job_id, report_uuid): # It would have been better to return a Report instance, not raw json _d = dict(t=JobTypes.IMPORT_DS, i=job_id, r=ServiceResourceTypes.REPORTS, p=ServiceAccessLayer.ROOT_JOBS, u=report_uuid) return _process_rget_or_none(lambda x: x)(_to_url(self.uri, "{p}/{t}/{i}/{r}/{u}".format(**_d))) def get_import_job_report_attrs(self, job_id): """Return a dict of all the Report Attributes""" return _get_all_report_attributes(self.get_import_job_reports, self.get_import_job_report_details, job_id) def get_analysis_job_entry_points(self, job_id): return self._get_job_resource_type_with_transform(JobTypes.PB_PIPE, job_id, ServiceResourceTypes.ENTRY_POINTS, _to_entry_points) def get_import_dataset_job_datastore(self, job_id): """Get a List of Service DataStore files from an import DataSet job""" return self._get_job_resource_type(JobTypes.IMPORT_DS, job_id, ServiceResourceTypes.DATASTORE) def get_merge_dataset_job_datastore(self, job_id): return self._get_job_resource_type(JobTypes.MERGE_DS, job_id, ServiceResourceTypes.DATASTORE) def _import_dataset(self, dataset_type, path): # This returns a job resource url = self._to_url("{p}/{x}".format(x=JobTypes.IMPORT_DS, p=ServiceAccessLayer.ROOT_JOBS)) return _import_dataset_by_type(dataset_type)(url, path) def run_import_dataset_by_type(self, dataset_type, path_to_xml): job_or_error = self._import_dataset(dataset_type, path_to_xml) custom_err_msg = "Import {d} {p}".format(p=path_to_xml, d=dataset_type) job_id = _job_id_or_error(job_or_error, custom_err_msg=custom_err_msg) return _block_for_job_to_complete(self, job_id, sleep_time=self._sleep_time) def _run_import_and_block(self, func, path, time_out=None): # func while be self.import_dataset_X job_or_error = func(path) custom_err_msg = "Import {p}".format(p=path) job_id = _job_id_or_error(job_or_error, custom_err_msg=custom_err_msg) return _block_for_job_to_complete(self, job_id, time_out=time_out, sleep_time=self._sleep_time) def import_dataset_subread(self, path): return self._import_dataset(FileTypes.DS_SUBREADS, path) def run_import_dataset_subread(self, path, time_out=10): return self._run_import_and_block(self.import_dataset_subread, path, time_out=time_out) def import_dataset_hdfsubread(self, path): return self._import_dataset(FileTypes.DS_SUBREADS_H5, path) def run_import_dataset_hdfsubread(self, path, time_out=10): return self._run_import_and_block(self.import_dataset_hdfsubread, path, time_out=time_out) def import_dataset_reference(self, path): return self._import_dataset(FileTypes.DS_REF, path) def run_import_dataset_reference(self, path, time_out=10): return self._run_import_and_block(self.import_dataset_reference, path, time_out=time_out) def 
import_dataset_barcode(self, path): return self._import_dataset(FileTypes.DS_BARCODE, path) def run_import_dataset_barcode(self, path, time_out=10): return self._run_import_and_block(self.import_dataset_barcode, path, time_out=time_out) def run_import_local_dataset(self, path): """Import a file from FS that is local to where the services are running Returns a JobResult instance :rtype: JobResult """ dataset_meta_type = get_dataset_metadata(path) def _verify_dataset_in_list(): file_type = FileTypes.ALL()[dataset_meta_type.metatype] ds_endpoint = _get_endpoint_or_raise(file_type) # all datasets for a specific type datasets = self._get_datasets_by_type(ds_endpoint) uuids = {ds['uuid'] for ds in datasets} if dataset_meta_type.uuid not in uuids: raise JobExeError(("Dataset {u} was imported but does not "+ "appear in the dataset list; this may "+ "indicate XML schema errors.").format( u=dataset_meta_type.uuid)) result = self.get_dataset_by_uuid(dataset_meta_type.uuid, ignore_errors=True) if result is None: log.info("Importing dataset {p}".format(p=path)) job_result = self.run_import_dataset_by_type(dataset_meta_type.metatype, path) log.info("Confirming database update") # validation 1: attempt to retrieve dataset info result_new = self.get_dataset_by_uuid(dataset_meta_type.uuid) if result_new is None: raise JobExeError(("Dataset {u} was imported but could "+ "not be retrieved; this may indicate "+ "XML schema errors.").format( u=dataset_meta_type.uuid)) # validation 2: make sure it shows up in the listing _verify_dataset_in_list() return job_result else: log.info("{f} already imported. Skipping importing. {r}".format(r=result, f=dataset_meta_type.metatype)) _verify_dataset_in_list() # need to clean this up return JobResult(self.get_job_by_id(result['jobId']), 0, "") def get_dataset_by_uuid(self, int_or_uuid, ignore_errors=False): """The recommend model is to look up DataSet type by explicit MetaType Returns None if the dataset was not found """ return _process_rget_or_none(_null_func, ignore_errors=ignore_errors)( _to_url(self.uri, "{p}/{i}".format(i=int_or_uuid, p=ServiceAccessLayer.ROOT_DS))) def get_dataset_by_id(self, dataset_type, int_or_uuid): """Get a Dataset using the DataSetMetaType and (int|uuid) of the dataset""" ds_endpoint = _get_endpoint_or_raise(dataset_type) return _process_rget(_to_url(self.uri, "{p}/{t}/{i}".format(t=ds_endpoint, i=int_or_uuid, p=ServiceAccessLayer.ROOT_DS))) def _get_datasets_by_type(self, dstype): return _process_rget(_to_url(self.uri, "{p}/{i}".format(i=dstype, p=ServiceAccessLayer.ROOT_DS))) def get_subreadset_by_id(self, int_or_uuid): return self.get_dataset_by_id(FileTypes.DS_SUBREADS, int_or_uuid) def get_subreadsets(self): return self._get_datasets_by_type("subreads") def get_hdfsubreadset_by_id(self, int_or_uuid): return self.get_dataset_by_id(FileTypes.DS_SUBREADS_H5, int_or_uuid) def get_hdfsubreadsets(self): return self._get_datasets_by_type("hdfsubreads") def get_referenceset_by_id(self, int_or_uuid): return self.get_dataset_by_id(FileTypes.DS_REF, int_or_uuid) def get_referencesets(self): return self._get_datasets_by_type("references") def get_barcodeset_by_id(self, int_or_uuid): return self.get_dataset_by_id(FileTypes.DS_BARCODE, int_or_uuid) def get_barcodesets(self): return self._get_datasets_by_type("barcodes") def get_alignmentset_by_id(self, int_or_uuid): return self.get_dataset_by_id(FileTypes.DS_ALIGN, int_or_uuid) def get_ccsreadset_by_id(self, int_or_uuid): return self.get_dataset_by_id(FileTypes.DS_CCS, int_or_uuid) def 
get_ccsreadsets(self): return self._get_datasets_by_type("ccsreads") def get_alignmentsets(self): return self._get_datasets_by_type("alignments") def import_fasta(self, fasta_path, name, organism, ploidy): """Convert fasta file to a ReferenceSet and Import. Returns a Job """ d = dict(path=fasta_path, name=name, organism=organism, ploidy=ploidy) return _process_rpost_with_transform(ServiceJob.from_d)(self._to_url("{p}/{t}".format(p=ServiceAccessLayer.ROOT_JOBS, t=JobTypes.CONVERT_FASTA)), d) def run_import_fasta(self, fasta_path, name, organism, ploidy, time_out=JOB_DEFAULT_TIMEOUT): """Import a Reference into a Block""""" job_or_error = self.import_fasta(fasta_path, name, organism, ploidy) _d = dict(f=fasta_path, n=name, o=organism, p=ploidy) custom_err_msg = "Fasta-convert path:{f} name:{n} organism:{o} ploidy:{p}".format(**_d) job_id = _job_id_or_error(job_or_error, custom_err_msg=custom_err_msg) return _block_for_job_to_complete(self, job_id, time_out=time_out, sleep_time=self._sleep_time) def create_logger_resource(self, idx, name, description): _d = dict(id=idx, name=name, description=description) return _process_rpost(_to_url(self.uri, "/smrt-base/loggers"), _d) def log_progress_update(self, job_type_id, job_id, message, level, source_id): """This is the generic job logging mechanism""" _d = dict(message=message, level=level, sourceId=source_id) return _process_rpost(_to_url(self.uri, "{p}/{t}/{i}/log".format(t=job_type_id, i=job_id, p=ServiceAccessLayer.ROOT_JOBS)), _d) def get_pipeline_template_by_id(self, pipeline_template_id): return _process_rget(_to_url(self.uri, "{p}/{i}".format(i=pipeline_template_id, p=ServiceAccessLayer.ROOT_PT))) def create_by_pipeline_template_id(self, name, pipeline_template_id, epoints, task_options=()): """Creates and runs a pbsmrtpipe pipeline by pipeline template id""" # sanity checking to see if pipeline is valid _ = self.get_pipeline_template_by_id(pipeline_template_id) seps = [dict(entryId=e.entry_id, fileTypeId=e.dataset_type, datasetId=e.resource) for e in epoints] def _to_o(opt_id, opt_value, option_type_id): return dict(optionId=opt_id, value=opt_value, optionTypeId=option_type_id) task_options = list(task_options) # FIXME. Need to define this in the scenario IO layer. # workflow_options = [_to_o("woption_01", "value_01")] workflow_options = [] d = dict(name=name, pipelineId=pipeline_template_id, entryPoints=seps, taskOptions=task_options, workflowOptions=workflow_options) raw_d = _process_rpost(_to_url(self.uri, "{r}/{p}".format(p=JobTypes.PB_PIPE, r=ServiceAccessLayer.ROOT_JOBS)), d) return ServiceJob.from_d(raw_d) def run_by_pipeline_template_id(self, name, pipeline_template_id, epoints, task_options=(), time_out=JOB_DEFAULT_TIMEOUT): """Blocks and runs a job with a timeout""" job_or_error = self.create_by_pipeline_template_id(name, pipeline_template_id, epoints, task_options=task_options) _d = dict(name=name, p=pipeline_template_id, eps=epoints) custom_err_msg = "Job {n} args: {a}".format(n=name, a=_d) job_id = _job_id_or_error(job_or_error, custom_err_msg=custom_err_msg) return _block_for_job_to_complete(self, job_id, time_out=time_out, sleep_time=self._sleep_time) def log_pbsmrtpipe_progress(total_url, message, level, source_id, ignore_errors=True): """Log the status of a pbsmrtpipe to SMRT Server""" # Need to clarify the model here. Trying to pass the most minimal # data necessary to pbsmrtpipe. 
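# Note: total_url is expected to be the job-scoped log endpoint built the same
# way as in ServiceAccessLayer.log_progress_update above, e.g.
# "<host>:<port>/secondary-analysis/job-manager/jobs/<job-type>/<job-id>/log",
# and the POSTed JSON body carries "message", "level" (one of LogLevels.ALL)
# and "sourceId" (conventionally SERVICE_LOGGER_RESOURCE_ID, i.e. "pbsmrtpipe").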
_d = dict(message=message, level=level, sourceId=source_id) if ignore_errors: try: return _process_rpost(total_url, _d) except Exception as e: log.warn("Failed Request to {u} data: {d}. {e}".format(u=total_url, d=_d, e=e)) else: return _process_rpost(total_url, _d) def add_datastore_file(total_url, datastore_file, ignore_errors=True): """Add datastore to SMRT Server :type datastore_file: DataStoreFile """ _d = datastore_file.to_dict() if ignore_errors: try: return _process_rpost(total_url, _d) except Exception as e: log.warn("Failed Request to {u} data: {d}. {e}".format(u=total_url, d=_d, e=e)) else: return _process_rpost(total_url, _d) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/services/models.py0000644000000000000000000001110013035554276024575 0ustar rootroot"""Services Specific Data Models""" from collections import namedtuple import json import uuid import iso8601 from requests.exceptions import RequestException def to_ascii(s): return s.encode('ascii', 'ignore') # This are mirrored from the BaseSMRTServer class LogLevels(object): TRACE = "TRACE" DEBUG = "DEBUG" INFO = "INFO" NOTICE = "NOTICE" WARN = "WARN" ERROR = "ERROR" CRITICAL = "CRITICAL" FATAL = "FATAL" ALL = (TRACE, DEBUG, INFO, NOTICE, WARN, ERROR, CRITICAL, FATAL) @classmethod def is_valid(cls, level): return level in cls.ALL SERVICE_LOGGER_RESOURCE_ID = "pbsmrtpipe" LogResource = namedtuple("LogResource", "id name description") LogMessage = namedtuple("LogMessage", "sourceId level message") PbsmrtpipeLogResource = LogResource(SERVICE_LOGGER_RESOURCE_ID, "Pbsmrtpipe", "Secondary Analysis Pbsmrtpipe Job logger") class ServiceJob(namedtuple("ServiceJob", 'id uuid name state path job_type created_at settings')): @staticmethod def from_d(d): def sx(x): return d[x] def se(x): return sx(x).encode('ascii', 'ignore') def to_t(x): return iso8601.parse_date(se(x)) def to_d(x): # the "jsonSettings" are a string for some stupid reason return json.loads(sx(x)) return ServiceJob(sx('id'), sx('uuid'), se('name'), se('state'), se('path'), se('jobTypeId'), to_t('createdAt'), to_d('jsonSettings')) def was_successful(self): return self.state == JobStates.SUCCESSFUL class JobExeError(ValueError): """Service Job failed to complete successfully""" pass class SmrtServerConnectionError(RequestException): """This is blunt to catch all status related errors""" pass class SMRTServiceBaseError(Exception): """Fundamental Error datastructure in SMRT Server""" def __init__(self, http_code, error_type, message, **kwargs): self.http_code = http_code self.error_type = error_type self.msg = message message = "Http code={h} msg={m} type={t}".format(h=http_code, m=message, t=error_type) super(Exception, self).__init__(message) @staticmethod def from_d(d): return SMRTServiceBaseError(d['httpCode'], d['errorType'], d['message']) # "Job" is the raw output from the jobs/1234 JobResult = namedtuple("JobResult", "job run_time errors") def _to_resource_id(x): if isinstance(x, int): return x try: _ = uuid.UUID(x) return x except ValueError as e: raise ValueError("Resource id '{x}' must be given as int or uuid".format(x=x)) class ServiceEntryPoint(object): """Entry Points to initialize Pipelines""" def __init__(self, entry_id, dataset_type, path_or_uri): self.entry_id = entry_id self.dataset_type = dataset_type # int (only supported), UUID or path to XML dataset will be added self._resource = path_or_uri @property def resource(self): return self._resource def __repr__(self): return "<{k} {e} {d} {r} >".format(k=self.__class__.__name__, e=self.entry_id, 
r=self._resource, d=self.dataset_type) @staticmethod def from_d(d): i = _to_resource_id(d['datasetId']) return ServiceEntryPoint(to_ascii(d['entryId']), to_ascii(d['fileTypeId']), i) def to_d(self): return dict(entryId=self.entry_id, fileTypeId=self.dataset_type, datasetId=self.resource) class JobEntryPoint(namedtuple("JobEntryPoint", "job_id dataset_uuid dataset_metatype")): """ Returned from the Services /job/1234/entry-points """ @staticmethod def from_d(d): return JobEntryPoint(d['jobId'], d['datasetUUID'], d['datasetType']) class JobStates(object): CREATED = "CREATED" SUBMITTED = "SUBMITTED" RUNNING = "RUNNING" FAILED = "FAILED" SUCCESSFUL = "SUCCESSFUL" ALL = (RUNNING, CREATED, FAILED, SUCCESSFUL, SUBMITTED) # End points ALL_COMPLETED = (FAILED, SUCCESSFUL) class JobTypes(object): IMPORT_DS = "import-dataset" IMPORT_DSTORE = "import-datastore" MERGE_DS = "merge-datasets" PB_PIPE = "pbsmrtpipe" MOCK_PB_PIPE = "mock-pbsmrtpipe" CONVERT_FASTA = 'convert-fasta-reference' @classmethod def ALL(cls): return (cls.IMPORT_DS, cls.IMPORT_DSTORE, cls.MERGE_DS, cls.PB_PIPE, cls.MOCK_PB_PIPE, cls.CONVERT_FASTA) class ServiceResourceTypes(object): REPORTS = "reports" DATASTORE = "datastore" ENTRY_POINTS = "entry-points" pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/services/__init__.py0000644000000000000000000000034113035554276025056 0ustar rootrootfrom .service_access_layer import ServiceAccessLayer from .models import (JobExeError, JobResult, LogLevels, ServiceResourceTypes, JobTypes, JobStates, ServiceJob, ServiceEntryPoint) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/services/cli.py0000644000000000000000000005212213035554276024072 0ustar rootroot""" CLI (deprecated) for interacting with the PacBio Services 0.1.0 Version, Import/Convert datasets pbservice import-dataset # dir or XML file pbservice import-rs-movie # dir or XML file (Requires 'movie-to-dataset' exe) pbservice import-ref-info # dir or XML file (Requires 'reference-to-dataset' exe) pbservice import-fasta /path/to/file.fasta --name my-name --organism my-org --ploidy haploid 0.2.0 Version, Jobs Support, leveraging pbservice run-analysis path/to/file.json pbservice run-merge-dataset path/to/file.json This program is largely replaced by the Scala version in 'smrtflow'. 
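Example invocations (the host below is a placeholder; host and port default to
http://localhost:8070 and may also be set via the PB_SERVICE_HOST and
PB_SERVICE_PORT environment variables):

pbservice status --host http://my-smrtlink-host --port 8070
pbservice import-dataset /path/to/datasets-dir --host http://my-smrtlink-host
pbservice run-analysis /path/to/analysis.json --block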
""" import argparse import json import os import pprint import sys import logging import functools import time import traceback import uuid import warnings from requests import RequestException import iso8601 from pbcommand.cli import get_default_argparser_with_base_opts from pbcommand.models import FileTypes from pbcommand.services import (ServiceAccessLayer, ServiceEntryPoint, JobExeError) from pbcommand.services.service_access_layer import (DATASET_METATYPES_TO_ENDPOINTS, ) from pbcommand.validators import validate_file, validate_or from pbcommand.common_options import add_common_options from pbcommand.utils import (is_dataset, walker, setup_log, compose, setup_logger, get_parsed_args_log_level) from .utils import to_ascii __version__ = "0.2.1" log = logging.getLogger(__name__) log.addHandler(logging.NullHandler()) # suppress warning message _LOG_FORMAT = '[%(levelname)s] %(asctime)-15s %(message)s' def _list_dict_printer(list_d): for i in list_d: print i try: # keep this to keep backward compatible from tabulate import tabulate def printer(list_d): print tabulate(list_d) list_dict_printer = printer except ImportError: list_dict_printer = _list_dict_printer class Constants(object): # When running from the commandline, the host and port will default to these # values if provided ENV_PB_SERVICE_HOST = "PB_SERVICE_HOST" ENV_PB_SERVICE_PORT = "PB_SERVICE_PORT" DEFAULT_HOST = "http://localhost" DEFAULT_PORT = 8070 FASTA_TO_REFERENCE = "fasta-to-reference" RS_MOVIE_TO_DS = "movie-metadata-to-dataset" # Currently only small-ish files are supported, users should # use fasta-to-reference offline and import the reference set MAX_FASTA_FILE_MB = 100 def _is_xml(path): return path.endswith(".xml") def add_max_items_option(default, desc="Max items to return"): def f(p): p.add_argument('-m', '--max-items', type=int, default=default, help=desc) return p return f def validate_xml_file_or_dir(path): px = os.path.abspath(os.path.expanduser(path)) if os.path.isdir(px): return px elif os.path.isfile(px) and _is_xml(px): return px else: raise argparse.ArgumentTypeError("Expected dir or file '{p}'".format(p=path)) validate_int_or_uuid = validate_or(int, uuid.UUID, "Expected Int or UUID") def _get_size_mb(path): return os.stat(path).st_size / 1024.0 / 1024.0 def validate_file_and_size(max_size_mb): def _wrapper(path): p = validate_file(path) sx = _get_size_mb(path) if sx > max_size_mb: raise argparse.ArgumentTypeError("Fasta file is too large {s:.2f} MB > {m:.2f} MB. Create a ReferenceSet using {e}, then import using `pbservice import-dataset /path/to/referenceset.xml` ".format(e=Constants.FASTA_TO_REFERENCE, s=sx, m=Constants.MAX_FASTA_FILE_MB)) else: return p return _wrapper validate_max_fasta_file_size = validate_file_and_size(Constants.MAX_FASTA_FILE_MB) def add_block_option(p): p.add_argument('--block', action='store_true', default=False, help="Block during importing process") return p def add_sal_options(p): default_port = os.environ.get(Constants.ENV_PB_SERVICE_PORT, Constants.DEFAULT_PORT) default_host = os.environ.get(Constants.ENV_PB_SERVICE_HOST, Constants.DEFAULT_HOST) p.add_argument('--host', type=str, default=default_host, help="Server host. Override the default with env {v}".format(v=Constants.ENV_PB_SERVICE_HOST)) p.add_argument('--port', type=int, default=default_port, help="Server Port. 
Override default with env {v}".format(v=Constants.ENV_PB_SERVICE_PORT)) return p def add_base_and_sal_options(p): fx = [add_common_options, add_sal_options] f = compose(*fx) return f(p) def add_xml_or_dir_option(p): p.add_argument('xml_or_dir', type=validate_xml_file_or_dir, help="Directory or XML file.") return p def add_sal_and_xml_dir_options(p): fx = [add_common_options, add_sal_options, add_xml_or_dir_option] f = compose(*fx) return f(p) def get_sal_and_status(host, port): """Get Sal or Raise if status isn't successful""" try: sal = ServiceAccessLayer(host, port) sal.get_status() return sal except RequestException as e: log.error("Failed to connect to {h}:{p}".format(h=host, p=port)) raise def run_file_or_dir(file_func, dir_func, xml_or_dir): if os.path.isdir(xml_or_dir): return dir_func(xml_or_dir) elif os.path.isfile(xml_or_dir): return file_func(xml_or_dir) else: raise ValueError("Unsupported value {x}".format(x=xml_or_dir)) def is_xml_dataset(path): if _is_xml(path): if is_dataset(path): return True return False def dataset_walker(root_dir): filter_func = is_xml_dataset return walker(root_dir, filter_func) def import_local_dataset(sal, path): """:type sal: ServiceAccessLayer""" # XXX basic validation of external resources try: from pbcore.io import openDataSet, ReadSet, HdfSubreadSet except ImportError: log.warn("Can't import pbcore, skipping dataset sanity check") else: ds = openDataSet(path, strict=True) if isinstance(ds, ReadSet) and not isinstance(ds, HdfSubreadSet): if len(ds) > 0: log.info("checking BAM file integrity") for rr in ds.resourceReaders(): try: _ = rr[-1] except Exception as e: log.exception("Import failed because the underlying "+ "data appear to be corrupted. Run "+ "'pbvalidate' on the dataset for more "+ "thorough checking.") return 1 else: log.warn("Empty dataset - will import anyway") # this will raise if the import wasn't successful _ = sal.run_import_local_dataset(path) log.info("Successfully import dataset from {f}".format(f=path)) return 0 def import_datasets(sal, root_dir): # FIXME. 
Need to add a flag to keep importing even if an import fails rcodes = [] for path in dataset_walker(root_dir): try: import_local_dataset(sal, path) rcodes.append(0) except Exception as e: log.error("Failed to import dataset {e}".format(e=e)) rcodes.append(1) state = all(v == 0 for v in rcodes) return 0 if state else 1 def run_import_local_datasets(host, port, xml_or_dir): sal = ServiceAccessLayer(host, port) file_func = functools.partial(import_local_dataset, sal) dir_func = functools.partial(import_datasets, sal) return run_file_or_dir(file_func, dir_func, xml_or_dir) def args_runner_import_datasets(args): return run_import_local_datasets(args.host, args.port, args.xml_or_dir) def add_import_fasta_opts(p): px = p.add_argument px('fasta_path', type=validate_max_fasta_file_size, help="Path to Fasta File") px('--name', required=True, type=str, help="Name of ReferenceSet") px('--organism', required=True, type=str, help="Organism") px('--ploidy', required=True, type=str, help="Ploidy") add_block_option(p) add_sal_options(p) add_common_options(p) return p def run_import_fasta(host, port, fasta_path, name, organism, ploidy, block=False): sal = ServiceAccessLayer(host, port) log.info("importing ({s:.2f} MB) {f} ".format(s=_get_size_mb(fasta_path), f=fasta_path)) if block is True: result = sal.run_import_fasta(fasta_path, name, organism, ploidy) log.info("Successfully imported {f}".format(f=fasta_path)) log.info("result {r}".format(r=result)) else: sal.import_fasta(fasta_path, name, organism, ploidy) return 0 def args_run_import_fasta(args): log.debug(args) return run_import_fasta(args.host, args.port, args.fasta_path, args.name, args.organism, args.ploidy, block=args.block) def load_analysis_job_json(d): """Translate a dict to args for scenario runner inputs""" job_name = to_ascii(d['name']) pipeline_template_id = to_ascii(d["pipelineId"]) service_epoints = [ServiceEntryPoint.from_d(x) for x in d['entryPoints']] return job_name, pipeline_template_id, service_epoints def _validate_analysis_job_json(path): px = validate_file(path) with open(px, 'r') as f: d = json.loads(f.read()) try: load_analysis_job_json(d) return px except (KeyError, TypeError, ValueError) as e: raise argparse.ArgumentTypeError("Invalid analysis.json format for '{p}' {e}".format(p=px, e=repr(e))) def add_run_analysis_job_opts(p): p.add_argument('json_path', type=_validate_analysis_job_json, help="Path to analysis.json file") add_sal_options(p) add_common_options(p) add_block_option(p) return def run_analysis_job(sal, job_name, pipeline_id, service_entry_points, block=False, time_out=None, task_options=()): """Run analysis (pbsmrtpipe) job :rtype ServiceJob: """ if time_out is None: time_out = sal.JOB_DEFAULT_TIMEOUT status = sal.get_status() log.info("System:{i} v:{v} Status:{x}".format(x=status['message'], i=status['id'], v=status['version'])) resolved_service_entry_points = [] for service_entry_point in service_entry_points: # Always lookup/resolve the dataset by looking up the id ds = sal.get_dataset_by_uuid(service_entry_point.resource) if ds is None: raise ValueError("Failed to find DataSet with id {r} {s}".format(s=service_entry_point, r=service_entry_point.resource)) dataset_id = ds['id'] ep = ServiceEntryPoint(service_entry_point.entry_id, service_entry_point.dataset_type, dataset_id) log.debug("Resolved dataset {e}".format(e=ep)) resolved_service_entry_points.append(ep) if block: job_result = sal.run_by_pipeline_template_id(job_name, pipeline_id, resolved_service_entry_points, time_out=time_out, 
task_options=task_options) job_id = job_result.job.id # service job result = sal.get_analysis_job_by_id(job_id) if not result.was_successful(): raise JobExeError("Job {i} failed".format(i=job_id)) else: # service job or error result = sal.create_by_pipeline_template_id(job_name, pipeline_id, resolved_service_entry_points) log.info("Result {r}".format(r=result)) return result def args_run_analysis_job(args): log.debug(args) with open(args.json_path, 'r') as f: d = json.loads(f.read()) log.debug("Loaded \n" + pprint.pformat(d)) job_name, pipeline_id, service_entry_points = load_analysis_job_json(d) sal = ServiceAccessLayer(args.host, args.port) # this should raise if there's a failure result = run_analysis_job(sal, job_name, pipeline_id, service_entry_points, block=args.block) return 0 def args_emit_analysis_template(args): ep1 = ServiceEntryPoint("eid_ref_dataset", FileTypes.DS_REF.file_type_id, 1) ep1_d = ep1.to_d() ep1_d['_comment'] = "datasetId can be provided as the DataSet UUID or Int. The entryId(s) can be obtained by running 'pbsmrtpipe show-pipeline-templates {PIPELINE-ID}'" d = dict(name="Job name", pipelineId="pbsmrtpipe.pipelines.dev_diagnostic", entryPoints=[ep1_d], taskOptions=[], workflowOptions=[]) sx = json.dumps(d, sort_keys=True, indent=4, separators=(',', ': ')) print sx return 0 def args_get_sal_summary(args): host = args.host port = args.port sal = ServiceAccessLayer(host, port) print sal.to_summary() return 0 def add_get_job_options(p): add_base_and_sal_options(p) p.add_argument("job_id", type=validate_int_or_uuid, help="Job id or UUID") return p def run_get_job_summary(host, port, job_id): sal = get_sal_and_status(host, port) job = sal.get_job_by_id(job_id) epoints = sal.get_analysis_job_entry_points(job_id) if job is None: log.error("Unable to find job {i} from {u}".format(i=job_id, u=sal.uri)) else: # this is not awesome, but the scala code should be the fundamental # tool print "Job {}".format(job_id) # The settings will often make this unreadable print job._replace(settings={}) print " Entry Points {}".format(len(epoints)) for epoint in epoints: print " {}".format(epoint) return 0 def add_get_job_list_options(p): fs = [add_base_and_sal_options, add_max_items_option(25, "Max Number of jobs")] f = compose(*fs) return f(p) def args_get_job_summary(args): return run_get_job_summary(args.host, args.port, args.job_id) def run_job_list_summary(host, port, max_items, sort_by=None): sal = get_sal_and_status(host, port) jobs = sal.get_analysis_jobs() jobs_list = jobs if sort_by is None else sorted(jobs, cmp=sort_by) printer(jobs_list[:max_items]) return 0 def args_get_job_list_summary(args): return run_job_list_summary(args.host, args.port, args.max_items, sort_by=_cmp_sort_by_id_desc) def add_get_dataset_options(p): add_base_and_sal_options(p) p.add_argument('id_or_uuid', type=validate_int_or_uuid, help="DataSet Id or UUID") return p def add_get_dataset_list_options(p): add_base_and_sal_options(p) fx = add_max_items_option(25, "Max number of Datasets to show") fx(p) default_dataset_type = DATASET_METATYPES_TO_ENDPOINTS[FileTypes.DS_SUBREADS] # this should be choice p.add_argument('-t', '--dataset-type', type=str, default=default_dataset_type, help="DataSet Meta type") return p def run_get_dataset_summary(host, port, dataset_id_or_uuid): sal = get_sal_and_status(host, port) log.debug("Getting dataset {d}".format(d=dataset_id_or_uuid)) ds = sal.get_dataset_by_uuid(dataset_id_or_uuid) if ds is None: log.error("Unable to find DataSet '{i}' on {u}".format(i=dataset_id_or_uuid, 
u=sal.uri)) else: print pprint.pformat(ds, indent=2) return 0 def _cmp_sort_by_id_key_desc(a, b): return b['id'] - a['id'] def _cmp_sort_by_id_desc(a, b): return b.id - a.id def run_get_dataset_list_summary(host, port, dataset_type, max_items, sort_by=None): """ Display a list of Dataset summaries :param host: :param port: :param dataset_type: :param max_items: :param sort_by: func to sort resources sort_by = lambda x.created_at :return: """ sal = get_sal_and_status(host, port) def to_ep(file_type): return DATASET_METATYPES_TO_ENDPOINTS[file_type] # FIXME(mkocher)(2016-3-26) need to centralize this on the dataset "shortname"? fs = {to_ep(FileTypes.DS_SUBREADS): sal.get_subreadsets, to_ep(FileTypes.DS_REF): sal.get_referencesets, to_ep(FileTypes.DS_ALIGN): sal.get_alignmentsets, to_ep(FileTypes.DS_BARCODE): sal.get_barcodesets } f = fs.get(dataset_type) if f is None: raise KeyError("Unsupported dataset type {t} Supported types {s}".format(t=dataset_type, s=fs.keys())) else: datasets = f() # this needs to be improved sorted_datasets = datasets if sort_by is None else sorted(datasets, cmp=sort_by) print "Number of {t} Datasets {n}".format(t=dataset_type, n=len(datasets)) list_dict_printer(sorted_datasets[:max_items]) return 0 def args_run_dataset_summary(args): return run_get_dataset_summary(args.host, args.port, args.id_or_uuid) def args_run_dataset_list_summary(args): return run_get_dataset_list_summary(args.host, args.port, args.dataset_type, args.max_items, sort_by=_cmp_sort_by_id_key_desc) def subparser_builder(subparser, subparser_id, description, options_func, exe_func): """ Util to add subparser options :param subparser: :param subparser_id: :param description: :param options_func: Function that will add args and options to Parser instance F(subparser) -> None :param exe_func: Function to run F(args) -> Int :return: """ p = subparser.add_parser(subparser_id, help=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) options_func(p) p.set_defaults(func=exe_func) return p def get_parser(): desc = "Tool to import datasets, convert/import fasta file and run analysis jobs" p = get_default_argparser_with_base_opts(__version__, desc) sp = p.add_subparsers(help='commands') def builder(subparser_id, description, options_func, exe_func): subparser_builder(sp, subparser_id, description, options_func, exe_func) status_desc = "Get System Status, DataSet and Job Summary" builder('status', status_desc, add_base_and_sal_options, args_get_sal_summary) local_desc = " The file location must be accessible from the host where the Services are running (often on a shared file system)" ds_desc = "Import Local DataSet XML." + local_desc builder('import-dataset', ds_desc, add_sal_and_xml_dir_options, args_runner_import_datasets) fasta_desc = "Import Fasta (and convert to ReferenceSet)." 
+ local_desc builder("import-fasta", fasta_desc, add_import_fasta_opts, args_run_import_fasta) run_analysis_desc = "Run Secondary Analysis Pipeline using an analysis.json" builder("run-analysis", run_analysis_desc, add_run_analysis_job_opts, args_run_analysis_job) emit_analysis_json_desc = "Emit an analysis.json Template to stdout that can be run using 'run-analysis'" builder("emit-analysis-template", emit_analysis_json_desc, add_common_options, args_emit_analysis_template) # Get Summary Job by Id job_summary_desc = "Get Job Summary by Job Id" builder('get-job', job_summary_desc, add_get_job_options, args_get_job_summary) job_list_summary_desc = "Get Job Summary by Job Id" builder('get-jobs', job_list_summary_desc, add_get_job_list_options, args_get_job_list_summary) ds_summary_desc = "Get DataSet Summary by DataSet Id or UUID" builder('get-dataset', ds_summary_desc, add_get_dataset_options, args_run_dataset_summary) ds_list_summary_desc = "Get DataSet List Summary by DataSet Type" builder('get-datasets', ds_list_summary_desc, add_get_dataset_list_options, args_run_dataset_list_summary) return p def args_executer(args): """ This is pulled from pbsmrtpipe that uses the goofy func=my_runner_func, which will be called using args.func(args) :rtype int """ try: return_code = args.func(args) except Exception as e: if isinstance(e, RequestException): # make this terse so there's not a useless stacktrace emsg = "Failed to connect to SmrtServer {e}".format(e=repr(e.__class__.__name__)) log.error(emsg) return_code = 3 elif isinstance(e, IOError): log.error(e, exc_info=True) traceback.print_exc(sys.stderr) return_code = 1 else: log.error(e, exc_info=True) traceback.print_exc(sys.stderr) return_code = 2 return return_code def main_runner(argv, parser, exe_runner_func, level=logging.DEBUG, str_formatter=_LOG_FORMAT): """ Fundamental interface to commandline applications """ dep_msg = "The `pbservice` commandline is deprecated and will be removed " \ "in a future version. Please using the scala implementation in smrtflow " \ "at https://github.com/PacificBiosciences/smrtflow" started_at = time.time() args = parser.parse_args(argv) level = get_parsed_args_log_level(args, default_level=level) console_or_file = args.log_file setup_logger(console_or_file, level, formatter=str_formatter) warnings.warn(dep_msg, DeprecationWarning) log.warn(dep_msg) log.debug(args) log.info("Starting tool version {v}".format(v=parser.version)) rcode = exe_runner_func(args) run_time = time.time() - started_at _d = dict(r=rcode, s=run_time) log.info("exiting with return code {r} in {s:.2f} sec.".format(**_d)) return rcode def main(argv=None): argv_ = sys.argv if argv is None else argv parser = get_parser() return main_runner(argv_[1:], parser, args_executer) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/pbcommand/services/utils.py0000644000000000000000000000565013035554276024467 0ustar rootroot# This is not public. 
Might want to move this into service_access_layer from collections import defaultdict from .models import ServiceJob, JobStates, JobTypes def to_ascii(s): # This is not awesome return s.encode('ascii', 'ignore') def _jobs_by_state_gen(sal, job_states): """:type sal: ServiceAccessLayer""" states = job_states if isinstance(job_states, (tuple, list)) else [job_states] jobs = sal.get_analysis_jobs() for job in jobs: sjob = ServiceJob.from_d(job) if sjob.state in states: yield sjob def get_failed_jobs(sal): return sorted(_jobs_by_state_gen(sal, JobStates.FAILED), key=lambda x: x.created_at, reverse=True) def jobs_summary(jobs): """dict(state) -> count (int) """ states_counts = defaultdict(lambda: 0) if jobs: for job in jobs: states_counts[job.state] += 1 return states_counts def to_jobs_summary(jobs, header=None): """Return string of jobs summary""" header = "Jobs" if header is None else header # Make easier to handle Option[Seq[Job]] xjobs = [] if jobs is None else jobs outs = [] x = outs.append states_counts = jobs_summary(xjobs) x("{h} {n}".format(n=len(xjobs), h=header)) for state, c in states_counts.iteritems(): x("State {s} {c}".format(c=c, s=state)) return "\n".join(outs) def to_all_job_types_summary(sal, sep="*****"): # only use a subset of the job types funcs = [(JobTypes.IMPORT_DS, sal.get_import_dataset_jobs), (JobTypes.MERGE_DS, sal.get_merge_dataset_jobs), (JobTypes.CONVERT_FASTA, sal.get_fasta_convert_jobs), (JobTypes.PB_PIPE, sal.get_analysis_jobs)] outs = [] x = outs.append x("All Job types Summary") x(sep) for name, func in funcs: out = to_jobs_summary(func(), header="{n} Jobs".format(n=name)) x(out) x(sep) return "\n".join(outs) def to_all_datasets_summary(sal, sep="****"): ds_types = [("SubreadSets", sal.get_subreadsets), ("HdfSubreadSets", sal.get_hdfsubreadsets), ("ReferenceSets", sal.get_referencesets), ("AlignmentSets", sal.get_alignmentsets), #("ConsensusSets", sal.get_ccsreadsets) ] outs = [] x = outs.append x("Dataset Summary") x(sep) for name, func in ds_types: d = func() ndatasets = len(d) x("{n} {d}".format(n=name, d=ndatasets)) return "\n".join(outs) def to_sal_summary(sal): """:type sal: ServiceAccessLayer""" status = sal.get_status() outs = [] x = outs.append sep = "-" * 30 x(repr(sal)) x("SystemId : {}".format(status['id'])) x("Version : {}".format(status['version'])) x("Status : {}".format(status['message'])) x(sep) x(to_all_datasets_summary(sal, sep=sep)) x(sep) x(to_all_job_types_summary(sal, sep=sep)) return "\n".join(outs) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/Makefile0000644000000000000000000000401613035554276020625 0ustar rootroot.PHONY: all clean install dev-install test doc SHELL = /bin/bash -e all: install install: @which pip > /dev/null @pip freeze|grep 'pbcommand=='>/dev/null \ && pip uninstall -y pbcommand \ || echo -n '' @pip install ./ clean: rm -rf build/;\ find . -name "*.egg-info" | xargs rm -rf;\ find . -name "*.pyc" | xargs rm -f;\ find . -name "*.err" | xargs rm -f;\ find . 
-name "*.log" | xargs rm -f;\ rm -rf dist;\ rm -rf docs/_build test: nosetests -s --verbose --with-xunit --logging-config log_nose.cfg tests/test_*.py doc: cd docs && make html build-tool-contracts: python -m pbcommand.cli.examples.dev_app --emit-tool-contract > ./tests/data/tool-contracts/pbcommand.tasks.dev_app_tool_contract.json python -m pbcommand.cli.examples.dev_app --emit-tool-contract > ./tests/data/tool-contracts/dev_example_tool_contract.json python -m pbcommand.cli.examples.dev_txt_app --emit-tool-contract > ./tests/data/tool-contracts/dev_example_dev_txt_app_tool_contract.json python -m pbcommand.cli.examples.dev_mixed_app --emit-tool-contract > ./tests/data/tool-contracts/dev_mixed_app_tool_contract.json python -m pbcommand.cli.examples.dev_gather_fasta_app --emit-tool-contract > ./tests/data/tool-contracts/dev_gather_fasta_app_tool_contract.json python -m pbcommand.cli.examples.dev_scatter_fasta_app --emit-tool-contract > ./tests/data/tool-contracts/dev_scatter_fasta_app_tool_contract.json python -m pbcommand.cli.examples.dev_quick_hello_world emit-tool-contracts -o ./tests/data/tool-contracts run-pep8: find pbcommand -name "*.py" -exec pep8 --ignore=E501,E265,E731,E402,W292 {} \; run-auto-pep8: find pbcommand -name "*.py" -exec autopep8 -i --ignore=E501,E265,E731,E402,W292 {} \; build-java-classes: avro-tools compile schema pbcommand/schemas java-classes/ extract-readme-snippets: rm -rf readme-snippet-*.py pandoc -t markdown README.md | pandoc --filter ./extract-readme-snippets.py build-avro-schema-docs: # this requires nodejs + https://github.com/ept/avrodoc avrodoc pbcommand/schemas/*.avsc > index.html pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/circle.yml0000644000000000000000000000035313035554276021151 0ustar rootrootmachine: python: version: 2.7.9 dependencies: pre: - pip install -r REQUIREMENTS.txt - pip install -r REQUIREMENTS_TEST.txt - pip install nose test: override: - mkdir -p $CIRCLE_TEST_REPORTS - make test pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/0000755000000000000000000000000013035554276020326 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_engine_runner.py0000644000000000000000000000174013035554276024577 0ustar rootrootimport logging import unittest from pbcommand.engine import run_cmd from .base_utils import get_temp_file, get_temp_dir log = logging.getLogger(__name__) class RunnerSmokeTest(unittest.TestCase): def test_simple_run_cmd(self): d = get_temp_dir("simple-cmd") txt_in = get_temp_file(".txt", d) txt_out = get_temp_file("*.txt", d) exe = "cat {i} > {o}".format(i=txt_in, o=txt_out) # this could all be bundled into a context manager # with RunCommand('/path/stdout', '/path/to/stderr') as r: # r.exe("echo 'exe1') # r.exe("echo 'exe2') # result = r.get_result() # close the file handles stdout = get_temp_file("-stdout", d) stderr = get_temp_file("-stderr", d) with open(stdout, 'w') as fo: with open(stderr, 'w') as fe: result = run_cmd(exe, fo, fe) emgs = "Command {e} failed".format(e=exe) self.assertEquals(result.exit_code, 0, emgs) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_pb_io_tool_contract_v1.py0000644000000000000000000000173713035554276026377 0ustar rootrootimport unittest import logging from base_utils import get_tool_contract_v1 from pbcommand.models import (ToolContract, MalformedToolContractError) from pbcommand.pb_io.tool_contract_io import (load_tool_contract_from, ) log = logging.getLogger(__name__) class TestLoadToolContract(unittest.TestCase): def test_01(self): 
file_name = "dev_example_tool_contract.json" path = get_tool_contract_v1(file_name) tc = load_tool_contract_from(path) self.assertIsInstance(tc, ToolContract) self.assertEqual(tc.schema_version, "UNKNOWN") class TestMalformedToolContract(unittest.TestCase): def test_tc_no_inputs(self): file_name = "dev_example_tool_contract.json" path = get_tool_contract_v1(file_name) tc = load_tool_contract_from(path) tc.task.input_file_types = [] def _run(): return tc.to_dict() self.assertRaises(MalformedToolContractError, _run) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_report_column.py0000644000000000000000000000421413035554276026173 0ustar rootrootimport logging import unittest from pbcommand.models.report import Column log = logging.getLogger(__name__) class TestColumn(unittest.TestCase): def test_column(self): """Test: Can't create a Column without an id.""" def none_col(): c = Column(None) self.assertRaises(none_col) def test_repr(self): c = Column('my_column', header="My Column", values=list(xrange(5))) self.assertIsNotNone(repr(c)) # def test_plotgroup_add_duplicate_plot(self): # ''' # Test: Can't add plots with duplicate ids # ''' # try: # log.info( TestPlotGroup.test_plotgroup_add_duplicate_plot.__doc__ ) # pg = PlotGroup('foo') # pg.add_plot(Plot('id', 'i1')) # # try: # pg.add_plot( Plot('id', 'i2') ) # self.fail( 'Cannot add plot with same id' ) # except PbReportError: # pass # except: # log.error(traceback.format_exc()) # raise # # # # def test_plotgroup_id_prepend(self): # ''' # Test: PlotGroup id gets prepended to plot.id when plot is added # ''' # try: # log.info( TestPlotGroup.test_plotgroup_id_prepend.__doc__ ) # pg = PlotGroup('foo') # pg.add_plot( Plot('id', 'i1') ) # self.assertEqual( 'foo.id', pg.plots[0].id ) # except: # log.error(traceback.format_exc()) # raise # # # def test_to_dict(self): # ''' # Test plotGroup to_dict function # ''' # try: # log.info( TestPlotGroup.test_to_dict.__doc__ ) # a = PlotGroup(123, 'foo title', 'foo legend', 'foo thumbnail' ) # a.add_plot( Plot('id', 'i1') ) # # d = a.to_dict() # self.assertEquals( 123, d['id'] ) # self.assertEquals( 'foo title', d['title'] ) # self.assertEquals( 'foo legend', d['legend'] ) # self.assertEquals( 'foo thumbnail', d['thumbnail'] ) # self.assertEquals( 1, len(d['plots']) ) # except: # log.error(traceback.format_exc()) # raise # # # pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/0000755000000000000000000000000013035554276021237 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-conditions/0000755000000000000000000000000013035554276025041 5ustar rootroot././@LongLink0000644000000000000000000000015200000000000011601 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-conditions/reseq-conditions-01.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-conditions/reseq-conditions-010000644000000000000000000000132013035554276030464 0ustar rootroot{ "_condition_doc": "Example of a 'Resequencing' Condition Type", "conditions": [ { "condId": "cond_alpha", "subreadset": "/path/to/subreadset-01.xml", "alignmentset": "/path/to/alignmentset-A.xml", "referenceset": "/path/to/reference.xml" }, { "condId": "cond_alpha", "subreadset": "/path/to/subreadset-02.xml", "alignmentset": "/path/to/alignmentset-B.xml", "referenceset": "/path/to/reference.xml" }, { "condId": "cond_beta", "subreadset": "/path/to/subreadset-03.xml", "alignmentset": "/path/to/alignmentset-C.xml", "referenceset": "/path/to/reference.xml" } ], 
"pipelineId": "pbsmrtpipe.pipelines.my_pipeline" }././@LongLink0000644000000000000000000000015200000000000011601 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-conditions/reseq-conditions-02.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-conditions/reseq-conditions-020000644000000000000000000000124313035554276030471 0ustar rootroot{ "_condition_doc": "Example of a 'Resequencing' Condition Type with Files that have relative paths", "conditions": [ { "condId": "cond_alpha", "subreadset": "subreadset-01.xml", "alignmentset": "alignmentset-A.xml", "referenceset": "reference.xml" }, { "condId": "cond_alpha", "subreadset": "subreadset-02.xml", "alignmentset": "alignmentset-B.xml", "referenceset": "reference.xml" }, { "condId": "cond_beta", "subreadset": "subreadset-03.xml", "alignmentset": "alignmentset-C.xml", "referenceset": "reference.xml" } ], "pipelineId": "pbsmrtpipe.pipelines.my_pipeline" }pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/report-specs/0000755000000000000000000000000013035554276023665 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/report-specs/report_spec.json0000644000000000000000000000665013035554276027114 0ustar rootroot{ "id": "test_report", "version": "0.1", "title": "Example report spec", "description": "This is a small test report which is used to ensure report_spec.py is working", "attributes": [ { "description": "An attribute of type int", "type": "int", "id": "attribute1", "name": "Attribute 1", "format": "{:,d}" }, { "description": "An attribute of type float", "type": "float", "id": "attribute2", "name": "Attribute 2", "format": "{p:5g} %" }, { "description": "An attribute of type bool", "type": "boolean", "id": "attribute3", "name": "Attribute 3" }, { "description": "An attribute of type string", "type": "string", "id": "attribute4", "name": "Attribute 4", "format": null } ], "tables": [ { "id": "table1", "title": "Table 1", "description": "The first table", "columns": [ { "header": "Column 1", "type": "int", "id": "column1", "description": "A column of type int", "format": "{:d}" } ] }, { "id": "table2", "title": "Table 2", "description": "The second table", "columns": [ { "header": "Column 1", "type": "float", "id": "column1", "description": "A column of type float", "format": "{:.2f}" }, { "header": "Column 2", "type": "string", "id": "column2", "description": "A column of type str", "format": null } ] } ], "plotGroups": [ { "plots": [ { "description": "The first plot of the first plotgroup", "title": "Plot 1", "caption": "Plot 1", "xlabel": "x variable", "ylabel": "y variable", "id": "plot1" } ], "description": "The first plotgroup", "legend": "legend1.png", "id": "plotgroup1", "title": "Plotgroup 1" }, { "plots": [ { "description": "The first plot of the second plotgroup", "title": "Plot 1", "caption": "Plot 1", "xlabel": "x variable", "ylabel": "y variable", "id": "plot1" }, { "description": "The second plot of the second plotgroup", "title": "Plot 2", "caption": "Plot 2", "xlabel": "x variable", "ylabel": "y variable", "id": "plot2" } ], "description": "The second plotgroup", "legend": "legend2.png", "id": "plotgroup2", "title": "Plotgroup 2" } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/0000755000000000000000000000000013035554276024212 5ustar rootroot././@LongLink0000644000000000000000000000017000000000000011601 Lustar 
rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_app0000644000000000000000000000307613035554276030644 0ustar rootroot{ "version": "0.2.1", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.example.dev_app --resolved-tool-contract ", "env": {} }, "schema_version": "2.0.0", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [ "$tmpfile", "$tmpfile", "$tmpdir" ], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "optionTypeId": "integer", "default": 25, "id": "pbcommand.task_options.dev_read_length", "name": "Length filter", "description": "Min Sequence Length filter" } ], "output_types": [ { "title": "Filtered Fasta file", "description": "Filtered Fasta file", "default_name": "filter", "id": "fasta_out", "file_type_id": "PacBio.FileTypes.Fasta" } ], "_comment": "Created by pbcommand 0.5.2", "name": "Example Dev App", "input_types": [ { "description": "PacBio Spec'ed fasta file", "title": "Fasta File", "id": "fasta_in", "file_type_id": "PacBio.FileTypes.Fasta" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_app" }, "tool_contract_id": "pbcommand.tasks.dev_app" } ././@LongLink0000644000000000000000000000020400000000000011577 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_txt_custom_outs_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_txt0000644000000000000000000000337713035554276030707 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "schema_version": "2.0.0", "tool_contract": { "_comment": "Created by pbcommand 0.5.2", "description": "Quick tool dev_txt_custom_outs pbcommand.tasks.dev_txt_custom_outs", "input_types": [ { "description": "description for PacBio.FileTypes.txt_0", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_0", "title": "" } ], "is_distributed": true, "name": "Custom Txt Task", "nproc": 1, "output_types": [ { "default_name": "PacBio.FileTypes.txt_file_0", "description": "File ", "file_type_id": "PacBio.FileTypes.txt", "id": "label_PacBio.FileTypes.txt", "title": "" }, { "default_name": "PacBio.FileTypes.txt_file_1", "description": "File ", "file_type_id": "PacBio.FileTypes.txt", "id": "label_PacBio.FileTypes.txt", "title": "" } ], "resource_types": [], "schema_options": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_txt_custom_outs" }, "tool_contract_id": "pbcommand.tasks.dev_txt_custom_outs", "version": "0.1.0" }././@LongLink0000644000000000000000000000016600000000000011606 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_scatter_fasta_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_scatter_fasta_app_t0000644000000000000000000000324013035554276031000 0ustar rootroot{ "version": "0.1.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_scatter_fasta_app --resolved-tool-contract ", "env": {} }, "schema_version": "2.0.0", "tool_contract": { "task_type": "pbsmrtpipe.task_types.scattered", "resource_types": [], "description": "Scatter a single fasta file 
to create chunk.json file", "schema_options": [ { "optionTypeId": "integer", "default": 10, "id": "pbcommand.task_options.dev_scatter_fa_nchunks", "name": "Number of chunks", "description": "Suggested number of chunks. May be overridden by $max_nchunks" } ], "output_types": [ { "title": "Chunk JSON", "description": "Scattered/Chunked Fasta Chunk.json", "default_name": "fasta.chunks", "id": "cjson", "file_type_id": "PacBio.FileTypes.CHUNK" } ], "_comment": "Created by pbcommand 0.5.2", "nchunks": "$max_nchunks", "name": "Fasta Scatter", "input_types": [ { "description": "Fasta file to scatter", "title": "Fasta In", "id": "fasta_in", "file_type_id": "PacBio.FileTypes.Fasta" } ], "chunk_keys": [ "$chunk.fasta_id" ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_scatter_fasta" }, "tool_contract_id": "pbcommand.tasks.dev_scatter_fasta" } ././@LongLink0000644000000000000000000000016500000000000011605 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_gather_fasta_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_gather_fasta_app_to0000644000000000000000000000234013035554276030764 0ustar rootroot{ "version": "0.1.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_gather_fasta_app --resolved-tool-contract ", "env": {} }, "schema_version": "2.0.0", "tool_contract": { "task_type": "pbsmrtpipe.task_types.gathered", "resource_types": [], "description": "Gather a fasta resources in a Chunk.json file", "schema_options": [], "output_types": [ { "title": "Chunk JSON", "description": "Output Fasta", "default_name": "gathered", "id": "output", "file_type_id": "PacBio.FileTypes.Fasta" } ], "_comment": "Created by pbcommand 0.5.2", "name": "Fasta Chunk Gather", "input_types": [ { "description": "Chunked Fasta JSON Out", "title": "Chunk JSON", "id": "chunk_json", "file_type_id": "PacBio.FileTypes.CHUNK" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_gather_fasta" }, "tool_contract_id": "pbcommand.tasks.dev_gather_fasta" } ././@LongLink0000644000000000000000000000020000000000000011573 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_fastq2fasta_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_fas0000644000000000000000000000356113035554276030634 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "schema_version": "2.0.0", "tool_contract": { "_comment": "Created by pbcommand 0.5.2", "description": "Dev Task Fastq to Fasta Example", "input_types": [ { "description": "description for PacBio.FileTypes.Fastq_0", "file_type_id": "PacBio.FileTypes.Fastq", "id": "Label PacBio.FileTypes.Fastq_0", "title": "" } ], "is_distributed": true, "name": "Fastq to Fasta", "nproc": 1, "output_types": [ { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.Fasta", "id": "Label PacBio.FileTypes.Fasta_0", "title": "" } ], "resource_types": [], "schema_options": [ { "default": 1234.0, "description": "Beta Description", "id": "pbcommand.task_options.beta", "name": "Beta Name", "optionTypeId": "float" }, { "default": true, "description": "Option gamma description", "id": "pbcommand.task_options.gamma", "name": "Option gamma", "optionTypeId": "boolean" } ], "task_type": 
"pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_fastq2fasta" }, "tool_contract_id": "pbcommand.tasks.dev_fastq2fasta", "version": "0.1.0" }pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/makefile0000644000000000000000000000052213035554276025711 0ustar rootrootemit-tool-contracts: python -m pbcommand.cli.examples.dev_scatter_fasta_app --emit-tool-contract > dev_scatter_fasta_app_tool_contract.json python -m pbcommand.cli.examples.dev_scatter_fasta_app --emit-tool-contract > dev_scatter_fasta_app_tool_contract.json python -m pbcommand.cli.examples.dev_quick_hello_world emit-tool-contracts ././@LongLink0000644000000000000000000000017600000000000011607 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_txt_hello_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_txt0000644000000000000000000000332713035554276030702 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "schema_version": "2.0.0", "tool_contract": { "_comment": "Created by pbcommand 0.5.2", "description": "Quick tool dev_txt_hello pbcommand.tasks.dev_txt_hello", "input_types": [ { "description": "description for PacBio.FileTypes.txt_0", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_0", "title": "" } ], "is_distributed": false, "name": "Tool dev_txt_hello", "nproc": 3, "output_types": [ { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_0", "title": "" }, { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_1", "title": "" } ], "resource_types": [], "schema_options": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_txt_hello" }, "tool_contract_id": "pbcommand.tasks.dev_txt_hello", "version": "0.1.0" }././@LongLink0000644000000000000000000000017000000000000011601 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_example_dev_txt_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_example_dev_txt_app0000644000000000000000000000304513035554276031025 0ustar rootroot{ "version": "0.1.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_app --resolved-tool-contract ", "env": {} }, "schema_version": "2.0.0", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [ "$tmpfile", "$tmpfile", "$tmpdir" ], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "optionTypeId": "integer", "default": 10, "id": "pbcommand.task_options.dev_max_nlines", "name": "Max Lines", "description": "Max Number of lines to Copy" } ], "output_types": [ { "title": "Txt outfile", "description": "Generic Output Txt file", "default_name": "output", "id": "txt_out", "file_type_id": "PacBio.FileTypes.txt" } ], "_comment": "Created by pbcommand 0.5.2", "name": "Txt App", "input_types": [ { "description": "Generic Text File", "title": "Txt file", "id": "txt_in", "file_type_id": "PacBio.FileTypes.txt" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_txt_app" }, "tool_contract_id": "pbcommand.tasks.dev_txt_app" } 
././@LongLink0000644000000000000000000000015400000000000011603 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_example_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_example_tool_contra0000644000000000000000000000307613035554276031037 0ustar rootroot{ "version": "0.2.1", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.example.dev_app --resolved-tool-contract ", "env": {} }, "schema_version": "2.0.0", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [ "$tmpfile", "$tmpfile", "$tmpdir" ], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "optionTypeId": "integer", "default": 25, "id": "pbcommand.task_options.dev_read_length", "name": "Length filter", "description": "Min Sequence Length filter" } ], "output_types": [ { "title": "Filtered Fasta file", "description": "Filtered Fasta file", "default_name": "filter", "id": "fasta_out", "file_type_id": "PacBio.FileTypes.Fasta" } ], "_comment": "Created by pbcommand 0.5.2", "name": "Example Dev App", "input_types": [ { "description": "PacBio Spec'ed fasta file", "title": "Fasta File", "id": "fasta_in", "file_type_id": "PacBio.FileTypes.Fasta" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_app" }, "tool_contract_id": "pbcommand.tasks.dev_app" } ././@LongLink0000644000000000000000000000020100000000000011574 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_qhello_world_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/pbcommand.tasks.dev_qhe0000644000000000000000000000324213035554276030634 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "schema_version": "2.0.0", "tool_contract": { "_comment": "Created by pbcommand 0.5.2", "description": "Quick tool dev_qhello_world pbcommand.tasks.dev_qhello_world", "input_types": [ { "description": "description for PacBio.FileTypes.Fasta_0", "file_type_id": "PacBio.FileTypes.Fasta", "id": "Label PacBio.FileTypes.Fasta_0", "title": "" } ], "is_distributed": true, "name": "Tool dev_qhello_world", "nproc": 1, "output_types": [ { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.Fasta", "id": "Label PacBio.FileTypes.Fasta_0", "title": "" } ], "resource_types": [], "schema_options": [ { "default": 1234, "description": "Option alpha description", "id": "pbcommand.task_options.alpha", "name": "Option alpha", "optionTypeId": "integer" } ], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_qhello_world" }, "tool_contract_id": "pbcommand.tasks.dev_qhello_world", "version": "0.2.1" }././@LongLink0000644000000000000000000000015600000000000011605 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_mixed_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts/dev_mixed_app_tool_cont0000644000000000000000000000637413035554276031033 0ustar rootroot{ "version": "0.2.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_mixed_app --resolved-tool-contract ", "env": {} }, "schema_version": "2.0.0", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [], "description": "Dev 
app for Testing that supports emitting tool contracts", "schema_options": [ { "optionTypeId": "integer", "default": 25, "id": "pbcommand.task_options.alpha", "name": "Alpha", "description": "Alpha description" }, { "optionTypeId": "float", "default": 1.234, "id": "pbcommand.task_options.beta", "name": "Beta", "description": "Beta description" }, { "optionTypeId": "boolean", "default": true, "id": "pbcommand.task_options.gamma", "name": "Gamma", "description": "Gamma description" }, { "name": "Ploidy", "default": "haploid", "choices": [ "haploid", "diploid" ], "optionTypeId": "choice_string", "id": "pbcommand.task_options.ploidy", "description": "Genome ploidy" }, { "name": "Delta", "default": 1, "choices": [ 1, 2, 3 ], "optionTypeId": "choice_integer", "id": "pbcommand.task_options.delta", "description": "An integer choice" }, { "name": "Epsilon", "default": 0.1, "choices": [ 0.01, 0.1, 1.0 ], "optionTypeId": "choice_float", "id": "pbcommand.task_options.epsilon", "description": "A float choice" }, { "optionTypeId": "string", "default": "asdf", "id": "pbcommand.task_options.comment", "name": "Comments", "description": "A string parameter" } ], "output_types": [ { "title": "Output Report", "description": "Output PacBio Report JSON", "default_name": "example.report", "id": "rpt", "file_type_id": "PacBio.FileTypes.JsonReport" } ], "_comment": "Created by pbcommand 0.5.2", "name": "DevApp", "input_types": [ { "description": "Input csv description", "title": "Input CSV", "id": "csv", "file_type_id": "PacBio.FileTypes.csv" } ], "nproc": 2, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_mixed_app" }, "tool_contract_id": "pbcommand.tasks.dev_mixed_app" } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/0000755000000000000000000000000013035554276026033 5ustar rootroot././@LongLink0000644000000000000000000000020000000000000011573 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/dev_mixed_app_resolved_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/dev_mixed_app_0000644000000000000000000000153413035554276030724 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_mixed_app --resolved-tool-contract ", "serialization": "json" }, "resolved_tool_contract": { "_comment": "Created by pbcommand v0.5.0", "input_files": [ "tests/data/example.txt" ], "is_distributed": false, "log_level": "INFO", "nproc": 1, "options": { "pbcommand.task_options.alpha": 50, "pbcommand.task_options.beta": 9.876, "pbcommand.task_options.gamma": false, "pbcommand.task_options.ploidy": "diploid" }, "output_files": [ "example.report.json" ], "resources": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_mixed_app" } } ././@LongLink0000644000000000000000000000016000000000000011600 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/resolved_contract_01.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/resolved_contr0000644000000000000000000000141013035554276031002 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_app --resolved-tool-contract " }, "resolved_tool_contract": { "_comment": "Created by pbcommand v0.2.3", "input_files": [ "/Users/mkocher/gh_projects/pbcommand/tests/data/example.txt" ], "is_distributed": false, "nproc": 1, "options": { 
"pbcommand.task_options.dev_max_nlines": 27 }, "output_files": [ "/var/folders/xk/_785bh115wj4m6_sy8g5wsx00000gn/T/tmp3fWNGvrtc-test/output.txt" ], "resources": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_txt_app", "log_level": "INFO" } } ././@LongLink0000644000000000000000000000017600000000000011607 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/dev_example_resolved_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/dev_example_re0000644000000000000000000000101713035554276030734 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.example.dev_app --resolved-tool-contract " }, "resolved_tool_contract": { "input_files": [ "/tmp/tmpVgzvudfasta" ], "nproc": 1, "options": { "pbcommand.task_options.dev_read_length": 27 }, "output_files": [ "/tmp/file.fasta" ], "resources": [], "is_distributed": false, "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tools.dev_app", "log_level": "INFO" } } ././@LongLink0000644000000000000000000000017200000000000011603 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/resolved_tool_contract_dev_app.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/resolved-tool-contracts/resolved_tool_0000644000000000000000000000141013035554276030771 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_app --resolved-tool-contract " }, "resolved_tool_contract": { "_comment": "Created by pbcommand v0.2.3", "input_files": [ "/Users/mkocher/gh_projects/pbcommand/tests/data/example.txt" ], "is_distributed": false, "nproc": 1, "options": { "pbcommand.task_options.dev_max_nlines": 27 }, "output_files": [ "/var/folders/xk/_785bh115wj4m6_sy8g5wsx00000gn/T/tmp3fWNGvrtc-test/output.txt" ], "resources": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_txt_app", "log_level": "INFO" } } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/pipeline-presets/0000755000000000000000000000000013035554276024527 5ustar rootroot././@LongLink0000644000000000000000000000015500000000000011604 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/pipeline-presets/example-pipeline-presets.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/pipeline-presets/example-pipeline-pres0000644000000000000000000000111513035554276030655 0ustar rootroot{ "_comment": "Resolved Pipeline Template Preset JSON format", "pipelineId": "pbsmrtpipe.pipelines.dev_a", "presetId": "pbsmrtpipe.pipeline_preset.settings_01", "name": "Pipeline Template Preset Name", "description": "Description of preset is required", "options": { "pbsmrtpipe.options.max_nchunks": 10, "pbsmrtpipe.options.chunk_mode": true }, "taskOptions": { "pbcommand.task_options.num_records": 51, "pbcommand.task_options.alpha": 1.234, "pbcommand.task_options.beta": false, "pbcommand.task_options.gamma": "this is a string parameter" } } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/0000755000000000000000000000000013035554276024366 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/overview.json0000644000000000000000000000071313035554276027130 0ustar rootroot{ "tables": [], "_comment": "Manually updated by MK for 0.3.9 and Added UUID for 0.3.24", "uuid": "196136c8-f6fd-11e5-b481-3c15c2cc8f88", "_version": 
"0.3.9", "_changelist": 127707, "attributes": [ { "name": "SMRT Cells", "value": 1, "id": "overview.ncells" }, { "name": "Movies", "value": 1, "id": "overview.nmovies" } ], "id": "overview", "title": "Overview Report", "plotGroups": [] }pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/test_report.json0000644000000000000000000000131313035554276027631 0ustar rootroot{ "id": "test_report", "version": "0.1", "title": "Example report for comparing to specification", "attributes": [ { "id": "attribute1", "name": "Attribute 1", "value": 123456789 }, { "id": "attribute2", "name": "Attribute 2", "value": 0.987654321 }, { "id": "attribute3", "name": "Attribute 3", "value": true }, { "id": "attribute4", "name": "Attribute 4", "value": "qwerty" } ], "tables": [ { "id": "table1", "title": "Table 1", "columns": [ { "id": "column1", "header": "Column 1", "values": [1,2,3,4,5,6] } ] } ] } ././@LongLink0000644000000000000000000000015100000000000011600 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/example_version_1_0_0.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/example_version_1_0_0.0000644000000000000000000000142513035554276030447 0ustar rootroot{ "_comment": "Example of v 1.0.0 Report schema.", "tables": [], "uuid": "196136c8-f6fd-11e5-b481-3c15c2cc8f88", "version": "1.0.0", "attributes": [ { "name": "SMRT Cells", "value": 1, "id": "overview.ncells" } ], "id": "my_example", "title": "Example Report", "plotGroups": [ { "id": "adapter.observed_insert_length_distribution", "thumbnail": "adapter_observed_insert_length_distribution_thumb.png", "plots": [ { "title": "My Plot", "caption": null, "image": "adapter_observed_insert_length_distribution.png", "id": "adapter.observed_insert_length_distribution.plot1" } ], "legend": null, "title": "Observed Insert Length Distribution" } ] }././@LongLink0000644000000000000000000000015300000000000011602 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/filter_reports_adapters.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/filter_reports_adapter0000644000000000000000000000240513035554276031055 0ustar rootroot{ "tables": [], "_version": "2.1", "_changelist": 127707, "attributes": [ { "name": "Adapter Dimers", "value": 0.0014104560030870359, "id": "adapter.adapter_dimers" }, { "name": "Short Inserts", "value": 0.000252817585458997, "id": "adapter.short_inserts" }, { "name": "Medium Inserts", "value": 0.0010911074740861974, "id": "adapter.medium_inserts" }, { "name": "Adapter Dimers", "value": 0.0033818058843422386, "id": "adapter.hq_adapter_dimers" }, { "name": "Short Inserts", "value": 0.00013527223537368956, "id": "adapter.hq_short_inserts" }, { "name": "Medium Inserts", "value": 0.002198173824822455, "id": "adapter.hq_medium_inserts" } ], "id": "adapter", "plotGroups": [ { "id": "adapter.observed_insert_length_distribution", "thumbnail": "adapter_observed_insert_length_distribution_thumb.png", "plots": [ { "caption": null, "image": "adapter_observed_insert_length_distribution.png", "id": "adapter.observed_insert_length_distribution.plot1" } ], "legend": null, "title": "Observed Insert Length Distribution" } ] }pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/laa_report2.json0000644000000000000000000000340013035554276027470 0ustar rootroot{ "_changelist": "UNKNOWN", "_version": "0.2.14", "attributes": [], "dataset_uuids": [], "id": "pblaa_tasks_laa", 
"plotGroups": [], "tables": [ { "columns": [ { "header": "BarcodeName", "id": "pblaa_tasks_laa.pblaa_result_table.barcodename", "values": ["Barcode4", "Barcode3"] }, { "header": "FastaName", "id": "pblaa_tasks_laa.pblaa_result_table.fastaname", "values": ["BarcodeFasta4", "BarcodeFasta3"] }, { "header": "CoarseCluster", "id": "pblaa_tasks_laa.pblaa_result_table.coarsecluster", "values": [4, 3] }, { "header": "Phase", "id": "pblaa_tasks_laa.pblaa_result_table.phase", "values": [4, 3] }, { "header": "TotalCoverage", "id": "pblaa_tasks_laa.pblaa_result_table.totalcoverage", "values": [4, 3] }, { "header": "SequenceLength", "id": "pblaa_tasks_laa.pblaa_result_table.sequencelength", "values": [4, 3] }, { "header": "PredictedAccuracy", "id": "pblaa_tasks_laa.pblaa_result_table.predictedaccuracy", "values": [4, 3] } ], "id": "pblaa_tasks_laa.pblaa_result_table", "title": "Pblaa Results By Barcode" } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/test_report2.json0000644000000000000000000000254613035554276027724 0ustar rootroot{ "id": "test_report", "version": "1.0.0", "title": null, "attributes": [ { "id": "test_report.attribute1", "name": null, "value": 123456789 }, { "id": "test_report.attribute2", "name": null, "value": 0.987654321 }, { "id": "test_report.attribute3", "name": null, "value": true }, { "id": "test_report.attribute4", "name": null, "value": "qwerty" } ], "tables": [ { "id": "test_report.table1", "title": null, "columns": [ { "id": "test_report.table1.column1", "header": null, "values": [1,2,3,4,5,6] } ] } ], "plotGroups": [ { "id": "test_report.plotgroup1", "title": null, "plots": [ { "id": "test_report.plotgroup1.plot1", "image": "unknown.png", "title": null, "caption": null } ] }, { "id": "test_report.plotgroup2", "title": null, "plots": [ { "id": "test_report.plotgroup2.plot1", "image": "unknown.png", "title": null, "caption": null }, { "id": "test_report.plotgroup2.plot2", "image": "unknown.png", "title": null, "caption": null } ] } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/example_with_plot.json0000644000000000000000000000117613035554276031012 0ustar rootroot{ "uuid": "2fcb60de-3b20-11e6-b559-3c15c2cc8f88", "dataset_uuids": [], "tables": [], "version": "1.0.0", "attributes": [], "id": "adapter", "plotGroups": [ { "id": "adapter.observed_insert_length_distribution", "thumbnail": "adapter_observed_insert_length_distribution_thumb.png", "plots": [ { "title": "My Plot", "caption": null, "image": "adapter_observed_insert_length_distribution.png", "id": "adapter.observed_insert_length_distribution.plot1" } ], "legend": null, "title": "Observed Insert Length Distribution" } ] }pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example-reports/laa_report1.json0000644000000000000000000000340013035554276027467 0ustar rootroot{ "_changelist": "UNKNOWN", "_version": "0.2.14", "attributes": [], "dataset_uuids": [], "id": "pblaa_tasks_laa", "plotGroups": [], "tables": [ { "columns": [ { "header": "BarcodeName", "id": "pblaa_tasks_laa.pblaa_result_table.barcodename", "values": ["Barcode1", "Barcode2"] }, { "header": "FastaName", "id": "pblaa_tasks_laa.pblaa_result_table.fastaname", "values": ["BarcodeFasta1", "BarcodeFasta2"] }, { "header": "CoarseCluster", "id": "pblaa_tasks_laa.pblaa_result_table.coarsecluster", "values": [1, 2] }, { "header": "Phase", "id": "pblaa_tasks_laa.pblaa_result_table.phase", "values": [1, 2] }, { "header": "TotalCoverage", "id": 
"pblaa_tasks_laa.pblaa_result_table.totalcoverage", "values": [1, 2] }, { "header": "SequenceLength", "id": "pblaa_tasks_laa.pblaa_result_table.sequencelength", "values": [1, 2] }, { "header": "PredictedAccuracy", "id": "pblaa_tasks_laa.pblaa_result_table.predictedaccuracy", "values": [1, 2] } ], "id": "pblaa_tasks_laa.pblaa_result_table", "title": "Pblaa Results By Barcode" } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/0000755000000000000000000000000013035554276024536 5ustar rootroot././@LongLink0000644000000000000000000000017300000000000011604 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_0000644000000000000000000000432413035554276030464 0ustar rootroot{ "version": "0.2.1", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.example.dev_app --resolved-tool-contract ", "env": {} }, "tool_contract_id": "pbcommand.tasks.dev_app", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [ "$tmpfile", "$tmpfile", "$tmpdir" ], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "pb_option": { "default": 25, "type": "integer", "option_id": "pbcommand.task_options.dev_read_length", "name": "Length filter", "description": "Min Sequence Length filter" }, "title": "JSON Schema for pbcommand.task_options.dev_read_length", "required": [ "pbcommand.task_options.dev_read_length" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.dev_read_length": { "default": 25, "type": "integer", "description": "Min Sequence Length filter", "title": "Length filter" } } } ], "output_types": [ { "title": "Filtered Fasta file", "description": "Filtered Fasta file", "default_name": "filter", "id": "fasta_out", "file_type_id": "PacBio.FileTypes.Fasta" } ], "_comment": "Created by v0.4.9", "name": "Example Dev App", "input_types": [ { "description": "PacBio Spec'ed fasta file", "title": "Fasta File", "id": "fasta_in", "file_type_id": "PacBio.FileTypes.Fasta" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_app" } } ././@LongLink0000644000000000000000000000020700000000000011602 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_txt_custom_outs_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_0000644000000000000000000000332713035554276030466 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "tool_contract": { "_comment": "Created by v0.4.9", "description": "Quick tool dev_txt_custom_outs pbcommand.tasks.dev_txt_custom_outs", "input_types": [ { "description": "description for PacBio.FileTypes.txt_0", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_0", "title": "" } ], "is_distributed": true, "name": "Custom Txt Task", "nproc": 1, "output_types": [ { "default_name": "PacBio.FileTypes.txt_file_0", "description": "File ", "file_type_id": "PacBio.FileTypes.txt", "id": "label_PacBio.FileTypes.txt", "title": "" }, { "default_name": "PacBio.FileTypes.txt_file_1", "description": "File ", "file_type_id": "PacBio.FileTypes.txt", "id": "label_PacBio.FileTypes.txt", "title": "" } 
], "resource_types": [], "schema_options": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_txt_custom_outs" }, "tool_contract_id": "pbcommand.tasks.dev_txt_custom_outs", "version": "0.1.0" }././@LongLink0000644000000000000000000000017100000000000011602 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_scatter_fasta_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_scatter_fasta_ap0000644000000000000000000000456113035554276030630 0ustar rootroot{ "version": "0.1.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_scatter_fasta_app --resolved-tool-contract ", "env": {} }, "tool_contract_id": "pbcommand.tasks.dev_scatter_fasta", "tool_contract": { "task_type": "pbsmrtpipe.task_types.scattered", "resource_types": [], "description": "Scatter a single fasta file to create chunk.json file", "schema_options": [ { "pb_option": { "default": 10, "type": "integer", "option_id": "pbcommand.task_options.dev_scatter_fa_nchunks", "name": "Number of chunks", "description": "Suggested number of chunks. May be overridden by $max_nchunks" }, "title": "JSON Schema for pbcommand.task_options.dev_scatter_fa_nchunks", "required": [ "pbcommand.task_options.dev_scatter_fa_nchunks" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.dev_scatter_fa_nchunks": { "default": 10, "type": "integer", "description": "Suggested number of chunks. May be overridden by $max_nchunks", "title": "Number of chunks" } } } ], "output_types": [ { "title": "Chunk JSON", "description": "Scattered/Chunked Fasta Chunk.json", "default_name": "fasta.chunks", "id": "cjson", "file_type_id": "PacBio.FileTypes.CHUNK" } ], "_comment": "Created by v0.4.9", "nchunks": "$max_nchunks", "name": "Fasta Scatter", "input_types": [ { "description": "Fasta file to scatter", "title": "Fasta In", "id": "fasta_in", "file_type_id": "PacBio.FileTypes.Fasta" } ], "chunk_keys": [ "$chunk.fasta_id" ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_scatter_fasta" } } ././@LongLink0000644000000000000000000000017000000000000011601 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_gather_fasta_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_gather_fasta_app0000644000000000000000000000227113035554276030611 0ustar rootroot{ "version": "0.1.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_scatter_fasta_app --resolved-tool-contract ", "env": {} }, "tool_contract_id": "pbcommand.tasks.dev_gather_fasta", "tool_contract": { "task_type": "pbsmrtpipe.task_types.gathered", "resource_types": [], "description": "Gather a fasta resources in a Chunk.json file", "schema_options": [], "output_types": [ { "title": "Chunk JSON", "description": "Output Fasta", "default_name": "gathered", "id": "output", "file_type_id": "PacBio.FileTypes.Fasta" } ], "_comment": "Created by v0.4.9", "name": "Fasta Chunk Gather", "input_types": [ { "description": "Chunked Fasta JSON Out", "title": "Chunk JSON", "id": "chunk_json", "file_type_id": "PacBio.FileTypes.CHUNK" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_gather_fasta" } } ././@LongLink0000644000000000000000000000020300000000000011576 Lustar 
rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_fastq2fasta_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_0000644000000000000000000000617313035554276030470 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "tool_contract": { "_comment": "Created by v0.4.9", "description": "Dev Task Fastq to Fasta Example", "input_types": [ { "description": "description for PacBio.FileTypes.Fastq_0", "file_type_id": "PacBio.FileTypes.Fastq", "id": "Label PacBio.FileTypes.Fastq_0", "title": "" } ], "is_distributed": true, "name": "Fastq to Fasta", "nproc": 1, "output_types": [ { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.Fasta", "id": "Label PacBio.FileTypes.Fasta_0", "title": "" } ], "resource_types": [], "schema_options": [ { "$schema": "http://json-schema.org/draft-04/schema#", "pb_option": { "default": 1234.0, "description": "Beta Description", "name": "Beta Name", "option_id": "pbcommand.task_options.beta", "type": "number" }, "properties": { "pbcommand.task_options.beta": { "default": 1234.0, "description": "Beta Description", "title": "Beta Name", "type": "number" } }, "required": [ "pbcommand.task_options.beta" ], "title": "JSON Schema for pbcommand.task_options.beta", "type": "object" }, { "$schema": "http://json-schema.org/draft-04/schema#", "pb_option": { "default": true, "description": "Option gamma description", "name": "Option gamma", "option_id": "pbcommand.task_options.gamma", "type": "boolean" }, "properties": { "pbcommand.task_options.gamma": { "default": true, "description": "Option gamma description", "title": "Option gamma", "type": "boolean" } }, "required": [ "pbcommand.task_options.gamma" ], "title": "JSON Schema for pbcommand.task_options.gamma", "type": "object" } ], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_fastq2fasta" }, "tool_contract_id": "pbcommand.tasks.dev_fastq2fasta", "version": "0.1.0" }././@LongLink0000644000000000000000000000020100000000000011574 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_txt_hello_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_0000644000000000000000000000325713035554276030470 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "tool_contract": { "_comment": "Created by v0.4.9", "description": "Quick tool dev_txt_hello pbcommand.tasks.dev_txt_hello", "input_types": [ { "description": "description for PacBio.FileTypes.txt_0", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_0", "title": "" } ], "is_distributed": false, "name": "Tool dev_txt_hello", "nproc": 3, "output_types": [ { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_0", "title": "" }, { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.txt", "id": "Label PacBio.FileTypes.txt_1", "title": "" } ], "resource_types": [], "schema_options": [], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_txt_hello" }, "tool_contract_id": "pbcommand.tasks.dev_txt_hello", "version": 
"0.1.0" }././@LongLink0000644000000000000000000000017300000000000011604 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_example_dev_txt_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_example_dev_txt_0000644000000000000000000000426513035554276030655 0ustar rootroot{ "version": "0.1.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_app --resolved-tool-contract ", "env": {} }, "tool_contract_id": "pbcommand.tasks.dev_txt_app", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [ "$tmpfile", "$tmpfile", "$tmpdir" ], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "pb_option": { "default": 10, "type": "integer", "option_id": "pbcommand.task_options.dev_max_nlines", "name": "Max Lines", "description": "Max Number of lines to Copy" }, "title": "JSON Schema for pbcommand.task_options.dev_max_nlines", "required": [ "pbcommand.task_options.dev_max_nlines" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.dev_max_nlines": { "default": 10, "type": "integer", "description": "Max Number of lines to Copy", "title": "Max Lines" } } } ], "output_types": [ { "title": "Txt outfile", "description": "Generic Output Txt file", "default_name": "output", "id": "txt_out", "file_type_id": "PacBio.FileTypes.txt" } ], "_comment": "Created by v0.4.9", "name": "Txt App", "input_types": [ { "description": "Generic Text File", "title": "Txt file", "id": "txt_in", "file_type_id": "PacBio.FileTypes.txt" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_txt_app" } } ././@LongLink0000644000000000000000000000015700000000000011606 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_example_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_example_tool_con0000644000000000000000000000432413035554276030651 0ustar rootroot{ "version": "0.2.1", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.example.dev_app --resolved-tool-contract ", "env": {} }, "tool_contract_id": "pbcommand.tasks.dev_app", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [ "$tmpfile", "$tmpfile", "$tmpdir" ], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "pb_option": { "default": 25, "type": "integer", "option_id": "pbcommand.task_options.dev_read_length", "name": "Length filter", "description": "Min Sequence Length filter" }, "title": "JSON Schema for pbcommand.task_options.dev_read_length", "required": [ "pbcommand.task_options.dev_read_length" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.dev_read_length": { "default": 25, "type": "integer", "description": "Min Sequence Length filter", "title": "Length filter" } } } ], "output_types": [ { "title": "Filtered Fasta file", "description": "Filtered Fasta file", "default_name": "filter", "id": "fasta_out", "file_type_id": "PacBio.FileTypes.Fasta" } ], "_comment": "Created by v0.4.4", "name": "Example Dev App", "input_types": [ { "description": "PacBio Spec'ed fasta file", "title": "Fasta File", "id": "fasta_in", "file_type_id": "PacBio.FileTypes.Fasta" } ], "nproc": 1, "is_distributed": false, "tool_contract_id": 
"pbcommand.tasks.dev_app" } } ././@LongLink0000644000000000000000000000020400000000000011577 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_qhello_world_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/pbcommand.tasks.dev_0000644000000000000000000000443113035554276030463 0ustar rootroot{ "driver": { "env": {}, "exe": "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc ", "serialization": "json" }, "tool_contract": { "_comment": "Created by v0.4.9", "description": "Quick tool dev_qhello_world pbcommand.tasks.dev_qhello_world", "input_types": [ { "description": "description for PacBio.FileTypes.Fasta_0", "file_type_id": "PacBio.FileTypes.Fasta", "id": "Label PacBio.FileTypes.Fasta_0", "title": "" } ], "is_distributed": true, "name": "Tool dev_qhello_world", "nproc": 1, "output_types": [ { "default_name": "file", "description": "description for ", "file_type_id": "PacBio.FileTypes.Fasta", "id": "Label PacBio.FileTypes.Fasta_0", "title": "" } ], "resource_types": [], "schema_options": [ { "$schema": "http://json-schema.org/draft-04/schema#", "pb_option": { "default": 1234, "description": "Option alpha description", "name": "Option alpha", "option_id": "pbcommand.task_options.alpha", "type": "integer" }, "properties": { "pbcommand.task_options.alpha": { "default": 1234, "description": "Option alpha description", "title": "Option alpha", "type": "integer" } }, "required": [ "pbcommand.task_options.alpha" ], "title": "JSON Schema for pbcommand.task_options.alpha", "type": "object" } ], "task_type": "pbsmrtpipe.task_types.standard", "tool_contract_id": "pbcommand.tasks.dev_qhello_world" }, "tool_contract_id": "pbcommand.tasks.dev_qhello_world", "version": "0.2.1" }././@LongLink0000644000000000000000000000016100000000000011601 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_mixed_app_tool_contract.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/tool-contracts-v1/dev_mixed_app_tool_c0000644000000000000000000002060113035554276030623 0ustar rootroot{ "version": "0.2.0", "driver": { "serialization": "json", "exe": "python -m pbcommand.cli.examples.dev_mixed_app --resolved-tool-contract ", "env": {} }, "tool_contract_id": "pbcommand.tasks.dev_mixed_app", "tool_contract": { "task_type": "pbsmrtpipe.task_types.standard", "resource_types": [], "description": "Dev app for Testing that supports emitting tool contracts", "schema_options": [ { "pb_option": { "name": "Alpha", "default": 25, "option_id": "pbcommand.task_options.alpha", "choices": null, "optionTypeId": "pbsmrtpipe.option_types.integer", "type": "integer", "description": "Alpha description" }, "title": "JSON Schema for pbcommand.task_options.alpha", "required": [ "pbcommand.task_options.alpha" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.alpha": { "default": 25, "type": "integer", "description": "Alpha description", "title": "Alpha" } } }, { "pb_option": { "name": "Beta", "default": 1.234, "option_id": "pbcommand.task_options.beta", "choices": null, "optionTypeId": "pbsmrtpipe.option_types.float", "type": "number", "description": "Beta description" }, "title": "JSON Schema for pbcommand.task_options.beta", "required": [ "pbcommand.task_options.beta" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.beta": { "default": 
1.234, "type": "number", "description": "Beta description", "title": "Beta" } } }, { "pb_option": { "name": "Gamma", "default": true, "option_id": "pbcommand.task_options.gamma", "choices": null, "optionTypeId": "pbsmrtpipe.option_types.boolean", "type": "boolean", "description": "Gamma description" }, "title": "JSON Schema for pbcommand.task_options.gamma", "required": [ "pbcommand.task_options.gamma" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.gamma": { "default": true, "type": "boolean", "description": "Gamma description", "title": "Gamma" } } }, { "pb_option": { "name": "Ploidy", "default": "haploid", "option_id": "pbcommand.task_options.ploidy", "choices": [ "haploid", "diploid" ], "optionTypeId": "pbsmrtpipe.option_types.string", "type": "string", "description": "Genome ploidy" }, "title": "JSON Schema for pbcommand.task_options.ploidy", "required": [ "pbcommand.task_options.ploidy" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.ploidy": { "default": "haploid", "type": "string", "description": "Genome ploidy", "title": "Ploidy" } } }, { "pb_option": { "name": "Delta", "default": 1, "option_id": "pbcommand.task_options.delta", "choices": [ 1, 2, 3 ], "optionTypeId": "pbsmrtpipe.option_types.choice_int", "type": "integer", "description": "An integer choice" }, "title": "JSON Schema for pbcommand.task_options.delta", "required": [ "pbcommand.task_options.delta" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.delta": { "default": 1, "type": "integer", "description": "An integer choice", "title": "Delta" } } }, { "pb_option": { "name": "Epsilon", "default": 0.1, "option_id": "pbcommand.task_options.epsilon", "choices": [ 0.01, 0.1, 1.0 ], "optionTypeId": "pbsmrtpipe.option_types.choice_float", "type": "number", "description": "A float choice" }, "title": "JSON Schema for pbcommand.task_options.epsilon", "required": [ "pbcommand.task_options.epsilon" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.epsilon": { "default": 0.1, "type": "number", "description": "A float choice", "title": "Epsilon" } } }, { "pb_option": { "name": "Comments", "default": "asdf", "option_id": "pbcommand.task_options.comment", "choices": null, "optionTypeId": "pbsmrtpipe.option_types.string", "type": "string", "description": "A string parameter" }, "title": "JSON Schema for pbcommand.task_options.comment", "required": [ "pbcommand.task_options.comment" ], "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { "pbcommand.task_options.comment": { "default": "asdf", "type": "string", "description": "A string parameter", "title": "Comments" } } } ], "output_types": [ { "title": "Output Report", "description": "Output PacBio Report JSON", "default_name": "example.report", "id": "rpt", "file_type_id": "PacBio.FileTypes.JsonReport" } ], "_comment": "Created by v0.5.0", "name": "DevApp", "input_types": [ { "description": "Input csv description", "title": "Input CSV", "id": "csv", "file_type_id": "PacBio.FileTypes.csv" } ], "nproc": 2, "is_distributed": false, "tool_contract_id": "pbcommand.tasks.dev_mixed_app" } } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example.fasta0000644000000000000000000000007613035554276023715 0ustar rootroot>record_48 
AACTTTCGGACCCGTGGTAGGATTGTGGGAGAATACTGTTGATGTTTTCACpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/example.txt0000644000000000000000000000023413035554276023432 0ustar rootrootThis is a line This is a line This is a line This is a line This is a line This is a line This is a line This is a line This is a line This is the last linepbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/pipeline-datastore-view-rules/0000755000000000000000000000000013035554276027130 5ustar rootroot././@LongLink0000644000000000000000000000015200000000000011601 Lustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/pipeline-datastore-view-rules/rules_01.jsonpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/data/pipeline-datastore-view-rules/rules_010000644000000000000000000000102513035554276030503 0ustar rootroot{ "pipelineId": "pbsmrtpipe.pipelines.sa3_sat", "smrtlinkVersion": "3.2", "rules": [ { "sourceId": "pbreports.tasks.sat_report-out-0", "fileTypeId": "PacBio.FileTypes.JsonReport", "isHidden": true, "name": "Site Acceptance Test", "description": "JSON report for PacBio site acceptance test" }, { "sourceId": "pbreports.tasks.top_variants-out-0", "fileTypeId": "PacBio.FileTypes.JsonReport", "isHidden": true, "name": null, "description": null } ] } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_pb_io_tool_contract.py0000644000000000000000000000323113035554276025760 0ustar rootrootimport unittest import logging from base_utils import get_temp_file, get_temp_dir from base_utils import get_tool_contract, get_resolved_tool_contract from pbcommand.models import (ToolContract, ResolvedToolContract, MalformedToolContractError) from pbcommand.pb_io.tool_contract_io import (load_tool_contract_from, load_resolved_tool_contract_from, write_resolved_tool_contract_avro) import pbcommand.cli.examples.dev_app log = logging.getLogger(__name__) class TestLoadToolContract(unittest.TestCase): def test_01(self): file_name = "dev_example_tool_contract.json" path = get_tool_contract(file_name) tc = load_tool_contract_from(path) self.assertIsInstance(tc, ToolContract) class TestMalformedToolContract(unittest.TestCase): def test_tc_no_inputs(self): file_name = "dev_example_tool_contract.json" path = get_tool_contract(file_name) tc = load_tool_contract_from(path) tc.task.input_file_types = [] def _run(): return tc.to_dict() self.assertRaises(MalformedToolContractError, _run) class TestWriteResolvedToolContractAvro(unittest.TestCase): def test_01(self): file_name = "resolved_tool_contract_dev_app.json" rtc = load_resolved_tool_contract_from(get_resolved_tool_contract(file_name)) self.assertIsInstance(rtc, ResolvedToolContract) d = get_temp_dir("rtc-app") f = get_temp_file("-resolved-tool-contract.avro", d) write_resolved_tool_contract_avro(rtc, f) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_utils.py0000644000000000000000000000704013035554276023100 0ustar rootrootimport functools import tempfile import unittest import argparse import logging from pbcommand.utils import (Singleton, compose, get_parsed_args_log_level, get_dataset_metadata) class TestSingleton(unittest.TestCase): def test_basic(self): class Lithium(object): __metaclass__ = Singleton def __init__(self): self.name = 'Lithium' self.number = 3 a = Lithium() b = Lithium() self.assertEqual(id(a), id(b)) class TestCompose(unittest.TestCase): def test_simple(self): f = lambda x: x * 2 g = lambda y: y + 2 h = compose(f, g) value = h(7) self.assertEquals(value, 18) def test_no_args_list(self): def 
_f(): return compose() self.assertRaises(ValueError, _f) def test_empty_list(self): def _f(): return compose([]) self.assertRaises(TypeError, _f) def test_partial(self): def add(a, b): return a + b add_five = functools.partial(add, 5) add_two = functools.partial(add, 2) f = compose(add_five, add_two) value = f(5) self.assertEquals(value, 12) class TestLogging(unittest.TestCase): def test_get_parsed_args_log_level(self): # XXX more of an integration test, sorry - we need to ensure that # these functions work in combination with get_parsed_args_log_level from pbcommand.common_options import ( add_log_debug_option, add_log_quiet_option, add_log_verbose_option, add_log_level_option) def _get_argparser(level="INFO"): p = argparse.ArgumentParser() p.add_argument("--version", action="store_true") add_log_level_option(add_log_debug_option(add_log_quiet_option( add_log_verbose_option(p))), default_level=level) return p p = _get_argparser().parse_args([]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.INFO) p = _get_argparser().parse_args(["--quiet"]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.ERROR) p = _get_argparser().parse_args(["--debug"]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.DEBUG) p = _get_argparser("ERROR").parse_args(["--verbose"]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.INFO) p = _get_argparser("DEBUG").parse_args(["--log-level=WARNING"]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.WARNING) p = _get_argparser("NOTSET").parse_args([]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.NOTSET) p = _get_argparser(logging.NOTSET).parse_args([]) l = get_parsed_args_log_level(p) self.assertEqual(l, logging.NOTSET) class TestUtils(unittest.TestCase): def test_get_dataset_metadata(self): try: import pbtestdata except ImportError: raise unittest.SkipTest("pbtestdata not available, skipping") else: md = get_dataset_metadata(pbtestdata.get_file("subreads-xml")) self.assertEqual(md.metatype, "PacBio.DataSet.SubreadSet") try: from pbcore.io import SubreadSet except ImportError: raise unittest.SkipTest("pbcore not available, skipping") else: ds = SubreadSet(pbtestdata.get_file("subreads-xml")) self.assertEqual(md.uuid, ds.uuid) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_pb_io_report.py0000644000000000000000000000301613035554276024422 0ustar rootrootimport os import logging import unittest import json from pprint import pformat from pbcommand.pb_io import load_report_from_json _SERIALIZED_JSON_DIR = 'example-reports' from base_utils import get_data_file_from_subdir log = logging.getLogger(__name__) def _to_report(name): file_name = get_data_file_from_subdir(_SERIALIZED_JSON_DIR, name) log.info("loading json report from {f}".format(f=file_name)) r = load_report_from_json(file_name) return r class TestSerializationOverviewReport(unittest.TestCase): @classmethod def setUpClass(cls): name = 'overview.json' cls.report = _to_report(name) def test_id(self): self.assertEqual(self.report.id, "overview") def test_uuid(self): self.assertEqual(self.report.uuid, "196136c8-f6fd-11e5-b481-3c15c2cc8f88") def test_title(self): self.assertEqual(self.report.title, "Overview Report") def test_attributes(self): self.assertTrue(len(self.report.attributes), 2) class TestSerializationAdapterReport(unittest.TestCase): @classmethod def setUpClass(cls): file_name = 'filter_reports_adapters.json' cls.report = _to_report(file_name) def test_id(self): self.assertEqual(self.report.id, 'adapter') def 
test_attributes(self): self.assertEqual(len(self.report.attributes), 6) def test_plotgroups(self): self.assertEqual(len(self.report.plotGroups), 1) def test_plots(self): self.assertEqual(len(self.report.plotGroups[0].plots), 1) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_schema_validation.py0000644000000000000000000000631513035554276025416 0ustar rootrootimport json import os import logging import unittest from pbcommand.models import (ToolContract, ResolvedToolContract, PipelinePreset, PipelineDataStoreViewRules) from pbcommand.models.report import Report, ReportSpec from pbcommand.pb_io import (load_tool_contract_from, load_resolved_tool_contract_from, load_pipeline_presets_from, load_pipeline_datastore_view_rules_from_json, load_report_spec_from_json) from pbcommand.schemas import (validate_rtc, validate_tc, validate_presets, validate_datastore_view_rules, validate_report_spec) from pbcommand.utils import walker from base_utils import DATA_DIR_RTC, DATA_DIR_TC, DATA_DIR_PRESETS, DATA_DIR_DSVIEW, DATA_DIR_REPORT_SPECS log = logging.getLogger(__name__) def _to_json(path): with open(path, 'r') as f: d = json.loads(f.read()) return d def json_filter(path): return path.endswith(".json") def _to_assertion(path, schema_validate_func): def test_is_validate(self): d = _to_json(path) # log.debug(d) log.info("Attempting to validate '{}'".format(path)) is_valid = schema_validate_func(d) log.info(" is-valid? {i} {p}".format(i=is_valid, p=path)) self.assertTrue(is_valid, "{p} is not valid with the avro schema".format(p=path)) return test_is_validate class ValidateResolvedToolContracts(unittest.TestCase): def test_validate_resolved_tool_contracts(self): for path in walker(DATA_DIR_RTC, json_filter): f = _to_assertion(path, validate_rtc) f(self) self.assertIsInstance(load_resolved_tool_contract_from(path), ResolvedToolContract) class ValidateToolContracts(unittest.TestCase): def test_validate_tool_contracts(self): for path in walker(DATA_DIR_TC, json_filter): f = _to_assertion(path, validate_tc) f(self) self.assertIsInstance(load_tool_contract_from(path), ToolContract) class ValidatePipelinePreset(unittest.TestCase): def test_validate_pipeline_presets(self): for path in walker(DATA_DIR_PRESETS, json_filter): f = _to_assertion(path, validate_presets) f(self) self.assertIsInstance(load_pipeline_presets_from(path), PipelinePreset) class ValidateDataStoreViewRules(unittest.TestCase): def test_validate_pipeline_datastore_view_rules(self): for path in walker(DATA_DIR_DSVIEW, json_filter): f = _to_assertion(path, validate_datastore_view_rules) f(self) self.assertIsInstance( load_pipeline_datastore_view_rules_from_json(path), PipelineDataStoreViewRules) class ValidateReportSpec(unittest.TestCase): def test_validate_report_spec(self): for path in walker(DATA_DIR_REPORT_SPECS, json_filter): if os.path.basename(path).startswith("report-specs"): f = _to_assertion(path, validate_report_spec) f(self) self.assertIsInstance(load_report_spec_from_json(path), ReportSpec) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_report_plot.py0000644000000000000000000000246313035554276025660 0ustar rootrootimport logging import unittest from pprint import pformat from pbcommand.models.report import Plot, PbReportError log = logging.getLogger(__name__) class TestPlot(unittest.TestCase): def test_plot_null_id(self): """Can't create an plot without an id.""" with self.assertRaises(PbReportError): p = Plot(None, 'foo') def test_plot_null_image(self): """Can't create an plot without an image.""" 
def _test(): p = Plot('123', None) self.assertRaises(PbReportError, _test) def test_to_dict(self): """Test plot to dictionary method""" a = Plot('123', 'foo', caption='foo is the caption') d = a.to_dict() self.assertEquals('123', d['id']) self.assertEquals('foo', d['image']) self.assertEquals('foo is the caption', d['caption']) log.info(pformat(d, indent=4)) log.info(repr(a)) self.assertIsNotNone(repr(a)) def test_init_with_thumbnail(self): """Initial with thumbnail""" image = "my_image.png" thumbnail = "my_image_thumb.png" p = Plot('plot_1', image, thumbnail=thumbnail, caption="Awesome image") self.assertEqual(p.thumbnail, thumbnail) log.info(pformat(p.to_dict())) self.assertTrue(isinstance(p.to_dict(), dict)) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_resolver.py0000644000000000000000000001232113035554276023577 0ustar rootrootimport logging import unittest from base_utils import (get_temp_dir, get_tool_contract, get_resolved_tool_contract) from pbcommand.models import ResolvedToolContract, ResolvedToolContractTask, ResolvedScatteredToolContractTask, ResolvedGatherToolContractTask from pbcommand.pb_io import load_tool_contract_from from pbcommand.resolver import resolve_tool_contract, resolve_scatter_tool_contract, resolve_gather_tool_contract, ToolContractError log = logging.getLogger(__name__) class TestScatterResolver(unittest.TestCase): FILE_NAME = "dev_scatter_fasta_app_tool_contract.json" MAX_NCHUNKS = 7 MAX_NPROC = 9 INPUT_FILES = ['/tmp/file.fasta'] CHUNK_KEYS = ('$chunk.fasta_id') TOOL_OPTIONS = {} def test_sanity(self): d = get_temp_dir("resolved-tool-contract") tc = load_tool_contract_from(get_tool_contract(self.FILE_NAME)) rtc = resolve_scatter_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, self.TOOL_OPTIONS, self.MAX_NCHUNKS, self.CHUNK_KEYS, False) self.assertIsInstance(rtc, ResolvedToolContract) self.assertIsInstance(rtc.task, ResolvedScatteredToolContractTask) self.assertEqual(rtc.task.max_nchunks, 7) self.assertEqual(rtc.task.is_distributed, False) class TestGatherResolver(unittest.TestCase): FILE_NAME = "dev_gather_fasta_app_tool_contract.json" MAX_NCHUNKS = 7 MAX_NPROC = 9 INPUT_FILES = ['/tmp/file.fasta.chunk.json'] CHUNK_KEY = '$chunk.filter_fasta_id' TOOL_OPTIONS = {} def test_sanity(self): d = get_temp_dir("resolved-tool-contract") tc = load_tool_contract_from(get_tool_contract(self.FILE_NAME)) rtc = resolve_gather_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, self.TOOL_OPTIONS, self.CHUNK_KEY, False) self.assertIsInstance(rtc, ResolvedToolContract) self.assertIsInstance(rtc.task, ResolvedGatherToolContractTask) self.assertEqual(rtc.task.chunk_key, self.CHUNK_KEY) self.assertEqual(rtc.task.is_distributed, False) def _to_id(i): return "pbcommand.task_options.{i}".format(i=i) class TestResolver(unittest.TestCase): FILE_NAME = "dev_mixed_app_tool_contract.json" MAX_NPROC = 1 INPUT_FILES = ['/tmp/file.csv'] PLOIDY = _to_id("ploidy") ALPHA = _to_id("alpha") BETA = _to_id("beta") GAMMA = _to_id("gamma") DELTA = _to_id("delta") EPS = _to_id("epsilon") COMMENTS = _to_id("comment") def test_sanity(self): d = get_temp_dir("resolved-tool-contract") tc = load_tool_contract_from(get_tool_contract(self.FILE_NAME)) tool_options = {} rtc = resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False) self.assertIsInstance(rtc, ResolvedToolContract) self.assertIsInstance(rtc.task, ResolvedToolContractTask) self.assertEqual(rtc.task.is_distributed, False) self.assertEqual(rtc.task.options[self.ALPHA], 25) 
self.assertEqual(rtc.task.options[self.BETA], 1.234) self.assertEqual(rtc.task.options[self.GAMMA], True) self.assertEqual(rtc.task.options[self.PLOIDY], "haploid") self.assertEqual(rtc.task.options[self.DELTA], 1) self.assertEqual(rtc.task.options[self.EPS], 0.1) self.assertEqual(rtc.task.options[self.COMMENTS], "asdf") # non-defaults tool_options = {self.ALPHA: 15, self.BETA: 2.5, self.GAMMA: False, self.PLOIDY: "diploid", self.DELTA: 2, self.EPS: 1.0, self.COMMENTS: "Hello, world!"} rtc = resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False) self.assertEqual(rtc.task.options[self.ALPHA], 15) self.assertEqual(rtc.task.options[self.BETA], 2.5) self.assertEqual(rtc.task.options[self.GAMMA], False) self.assertEqual(rtc.task.options[self.PLOIDY], "diploid") self.assertEqual(rtc.task.options[self.DELTA], 2) self.assertEqual(rtc.task.options[self.EPS], 1.0) self.assertEqual(rtc.task.options[self.COMMENTS], "Hello, world!") def test_failure_modes(self): d = get_temp_dir("resolved-tool-contract") tc = load_tool_contract_from(get_tool_contract(self.FILE_NAME)) tool_options = {self.PLOIDY: "other"} self.assertRaises(ToolContractError, lambda: resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False)) tool_options = {self.ALPHA:2.5} self.assertRaises(ToolContractError, lambda: resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False)) tool_options = {self.ALPHA:"abcdef"} self.assertRaises(ToolContractError, lambda: resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False)) tool_options = {self.BETA:"asdf"} self.assertRaises(ToolContractError, lambda: resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False)) tool_options = {self.GAMMA:1.0} self.assertRaises(ToolContractError, lambda: resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False)) tool_options = {self.GAMMA:""} self.assertRaises(ToolContractError, lambda: resolve_tool_contract(tc, self.INPUT_FILES, d, d, self.MAX_NPROC, tool_options, False)) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_report_table.py0000644000000000000000000000712113035554276025765 0ustar rootrootimport logging import unittest from pbcommand.models.report import Table, Column, PbReportError log = logging.getLogger(__name__) class TestEmptyTable(unittest.TestCase): """Basic Smoke tests""" def setUp(self): self.columns = [Column('one', header="One"), Column('two', header="Two"), Column('three', header="Three")] self.table = Table('my_table', columns=self.columns) def test_str(self): """Smoke test for conversion to str""" log.info(str(self.table)) self.assertIsNotNone(str(self.table)) def test_columns(self): """Test Columns""" self.assertEqual(len(self.table.columns), 3) def test_column_values(self): """Basic check for column values""" for column in self.table.columns: self.assertEqual(len(column.values), 0) def test_to_dict(self): """Conversion to dictionary""" self.assertTrue(isinstance(self.table.to_dict(), dict)) log.info(self.table.to_dict()) class TestBasicTable(unittest.TestCase): """Basic Smoke tests""" def setUp(self): self.columns = [Column('one', header="One"), Column('two', header="Two"), Column('three', header="Three")] self.table = Table('my_table_with_values', columns=self.columns) datum = {'one': list(xrange(3)), 'two': list('abc'), 'three': 'file1 file2 file3'.split()} for k, values in datum.iteritems(): for value in values: 
self.table.add_data_by_column_id(k, value) def test_str(self): """Smoke test for conversion to str""" log.info(str(self.table)) self.assertIsNotNone(str(self.table)) def test_columns(self): """Test Columns""" self.assertEqual(len(self.table.columns), 3) def test_column_values(self): """Basic check for column values""" for column in self.table.columns: self.assertEqual(len(column.values), 3) def test_to_dict(self): """Conversion to dictionary""" self.assertTrue(isinstance(self.table.to_dict(), dict)) log.info(self.table.to_dict()) class TestTable(unittest.TestCase): def test_table(self): """Can't create an Table without an id.""" def none_table(): t = Table(None) self.assertRaises(none_table) def test_add_column(self): """Cannot add column with duplicate id.""" cs = [Column('1'), Column('2')] t = Table('foo', columns=cs) def add_dupe(): t.add_column(Column('2')) self.assertSequenceEqual(cs, t.columns) self.assertRaises(PbReportError, add_dupe) def test_append_data(self): """Append data to columns by index.""" cs = [Column('1'), Column('2')] t = Table('foo', columns=cs) t.append_data(0, 'whatev') t.append_data(0, 'huh') t.append_data(1, 'ernie') t.append_data(1, 'bert') self.assertSequenceEqual(['whatev', 'huh'], t.columns[0].values) self.assertSequenceEqual(['ernie', 'bert'], t.columns[1].values) def test_add_data_by_column_id(self): """Added data values by column identifier.""" columns = [Column('one'), Column('two')] table = Table('mytable', columns=columns) datum = {'one': 12.0, 'two': 1234.0} for k, v in datum.iteritems(): table.add_data_by_column_id(k, v) self.assertTrue(12.0 in table.columns[0].values) self.assertTrue(1234.0 in table.columns[1].values) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_common.py0000644000000000000000000000064313035554276024575 0ustar rootrootimport unittest import logging from pbcommand.models import FileTypes log = logging.getLogger(__name__) class TestLoadFileTypes(unittest.TestCase): def test_file_types(self): # smoke test for loading file types ft = FileTypes.DS_ALIGN self.assertIsNotNone(ft) def test_is_valid(self): ft = FileTypes.DS_ALIGN self.assertTrue(FileTypes.is_valid_id(ft.file_type_id)) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_report_attribute.py0000644000000000000000000000242013035554276026676 0ustar rootroot import unittest import logging from pbcommand.models.report import Attribute, PbReportError log = logging.getLogger(__name__) class TestAttribute(unittest.TestCase): def test_attribute_null_id(self): """Can't create an attribute without an id.""" def _test(): a = Attribute(None, 1) self.assertRaises(PbReportError, _test) def test_attribute_int_id(self): """Test exception of handling Attribute with int ids""" def _test(): a = Attribute(1, 12345) self.assertRaises(PbReportError, _test) def test_to_dict(self): """ Test attribute to_dict function """ a = Attribute('bob', 123, "Bob is the name") d = a.to_dict() self.assertEquals('bob', d['id']) self.assertEquals(123, d['value']) self.assertEquals('Bob is the name', d['name']) def test_eq(self): a = Attribute('a', 1234, "My Attribute") b = Attribute('b', 1234, "My B Attribute") c = Attribute('a', 1234, "My Attribute") self.assertTrue(a == c) self.assertTrue(a != b) self.assertTrue(b != c) def test_repr(self): a = Attribute('a', 1234, "My Attribute") log.info(repr(a)) self.assertIsNotNone(repr(a)) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_report.py0000644000000000000000000003304113035554276024616 0ustar rootrootimport 
json import logging from pprint import pformat import os.path import re import unittest from pbcommand.pb_io import load_report_from_json, load_report_spec_from_json from pbcommand.models.report import (Report, Attribute, PlotGroup, Plot, Table, Column, PbReportError, format_metric) from pbcommand.schemas import validate_report _SERIALIZED_JSON_DIR = 'example-reports' from base_utils import get_data_file_from_subdir, DATA_DIR log = logging.getLogger(__name__) def _to_report(name): file_name = get_data_file_from_subdir(_SERIALIZED_JSON_DIR, name) log.info("loading json report from {f}".format(f=file_name)) r = load_report_from_json(file_name) return r class TestReportModel(unittest.TestCase): def test_from_simple_dict(self): r = Report.from_simple_dict("pbcommand_test", {"n_reads": 50}, "pbcommand") json_dict = json.loads(r.to_json()) self.assertEqual(json_dict['attributes'], [ { "id": "pbcommand_test.pbcommand_n_reads", "name": "n_reads", "value": 50 }, ]) def test_report_null_ns(self): """Can't create a report without a namespace.""" with self.assertRaises(PbReportError): r = Report(None) def test_report_empty_ns(self): """Can't create a report with an empty namespace.""" with self.assertRaises(PbReportError): r = Report("") def test_duplicate_ids(self): """Can't add elements with the same id.""" with self.assertRaises(PbReportError): r = Report('redfang') r.add_attribute(Attribute('a', 'b')) r.add_attribute(Attribute('a', 'c')) def test_illegal_id(self): """Ids must be alphanumberic with underscores""" with self.assertRaises(PbReportError): r = Report('redfang') r.add_attribute(Attribute('a b', 'b')) r.add_attribute(Attribute('a', 'c')) def test_empty_id(self): with self.assertRaises(PbReportError): r = Report('') def test_uppercase_id(self): with self.assertRaises(PbReportError): r = Report('A') def test_to_dict(self): """ The id of report sub elements is prepended with the id of the parent element when to_dict is called. """ r = Report('redfang') a = Attribute('a', 'b') a2 = Attribute('a2', 'b2') r.add_attribute(a) r.add_attribute(a2) pg = PlotGroup('pgid') pg.add_plot(Plot('pid', 'anImg')) pg.add_plot(Plot('pid2', 'anImg2')) r.add_plotgroup(pg) t = Table('tabid') t.add_column(Column('c1')) r.add_table(t) d = r.to_dict() log.debug("\n" + pformat(d)) self.assertEqual('redfang', d['id']) self.assertEqual('redfang.a', d['attributes'][0]['id']) self.assertEqual('redfang.a2', d['attributes'][1]['id']) self.assertEqual('redfang.pgid', d['plotGroups'][0]['id']) self.assertEqual('redfang.pgid.pid', d[ 'plotGroups'][0]['plots'][0]['id']) self.assertEqual('redfang.pgid.pid2', d[ 'plotGroups'][0]['plots'][1]['id']) self.assertEqual('redfang.tabid', d['tables'][0]['id']) self.assertEqual('redfang.tabid.c1', d['tables'][ 0]['columns'][0]['id']) def test_version_and_changelist(self): r = Report('example') d = r.to_dict() log.info("\n" + pformat(d)) fields = ('version', 'uuid', 'plotGroups', 'tables', 'dataset_uuids') for field in fields: self.assertTrue(field in d) def test_to_dict_multi(self): """ Multiple complex elements. The id of report sub elements is prepended with the id of the parent element when to_dict is called. 
""" r = Report('redfang') a = Attribute('a', 'b') a2 = Attribute('a2', 'b2') r.add_attribute(a) r.add_attribute(a2) pg = PlotGroup('pgid') pg.add_plot(Plot('pid', 'anImg')) pg.add_plot(Plot('pid2', 'anImg2')) r.add_plotgroup(pg) pg = PlotGroup('pgid2') pg.add_plot(Plot('pid2', 'anImg2')) pg.add_plot(Plot('pid22', 'anImg22')) r.add_plotgroup(pg) t = Table('tabid') t.add_column(Column('c1')) r.add_table(t) t = Table('tabid2') t.add_column(Column('c2')) r.add_table(t) d = r.to_dict() log.debug(str(d)) self.assertEqual('redfang', d['id']) self.assertEqual('redfang.a', d['attributes'][0]['id']) self.assertEqual('redfang.a2', d['attributes'][1]['id']) self.assertEqual('redfang.pgid', d['plotGroups'][0]['id']) self.assertEqual('redfang.pgid.pid', d[ 'plotGroups'][0]['plots'][0]['id']) self.assertEqual('redfang.pgid.pid2', d[ 'plotGroups'][0]['plots'][1]['id']) self.assertEqual('redfang.pgid2', d['plotGroups'][1]['id']) self.assertEqual('redfang.pgid2.pid2', d[ 'plotGroups'][1]['plots'][0]['id']) self.assertEqual('redfang.pgid2.pid22', d[ 'plotGroups'][1]['plots'][1]['id']) self.assertEqual('redfang.tabid', d['tables'][0]['id']) self.assertEqual('redfang.tabid.c1', d['tables'][ 0]['columns'][0]['id']) self.assertEqual('redfang.tabid2', d['tables'][1]['id']) self.assertEqual('redfang.tabid2.c2', d[ 'tables'][1]['columns'][0]['id']) log.info(repr(r)) self.assertIsNotNone(repr(r)) def test_get_attribute_by_id(self): a = Attribute('a', 'b') a2 = Attribute('b', 'b2') attributes = [a, a2] r = Report('redfang', attributes=attributes) a1 = r.get_attribute_by_id('a') self.assertEqual(a, a1) def test_get_attribute_by_id_with_bad_id(self): a1 = Attribute('a', 'b') a2 = Attribute('b', 'b2') attributes = [a1, a2] report = Report('redfang', attributes=attributes) a = report.get_attribute_by_id('a') self.assertEqual(a.value, 'b') bad_a = report.get_attribute_by_id('id_that_does_not_exist') self.assertIsNone(bad_a) def test_get_table_by_id(self): r = Report('redfang') t1 = Table('tabid1') t1.add_column(Column('c1')) r.add_table(t1) t = r.get_table_by_id('tabid1') self.assertEqual(t, t1) def test_get_table_by_id_with_bad_id(self): r = Report('redfang') t1 = Table('tabid1') t1.add_column(Column('c1')) r.add_table(t1) bad_t = r.get_table_by_id('id_that_does_not_exist') self.assertIsNone(bad_t) def test_get_column_by_id(self): r = Report('redfang') t1 = Table('tabid1') c1 = Column('c1') t1.add_column(c1) r.add_table(t1) c = r.get_table_by_id('tabid1').get_column_by_id('c1') self.assertEqual(c, c1) def test_get_column_by_id_with_bad_id(self): r = Report('redfang') t1 = Table('tabid1') c1 = Column('c1') t1.add_column(c1) r.add_table(t1) bad_c = r.get_table_by_id('tabid1').get_column_by_id( 'id_that_does_not_exist') self.assertIsNone(bad_c) def test_get_plotgroup_by_id(self): r = Report('redfang') pg1 = PlotGroup('pgid1') pg1.add_plot(Plot('pid1', 'anImg')) r.add_plotgroup(pg1) pg = r.get_plotgroup_by_id('pgid1') self.assertEqual(pg, pg1) def test_get_plotgroup_by_id_with_bad_id(self): r = Report('redfang') pg1 = PlotGroup('pgid1') pg1.add_plot(Plot('pid1', 'anImg')) r.add_plotgroup(pg1) bad_pg = r.get_plotgroup_by_id('id_that_does_not_exist') self.assertIsNone(bad_pg) def test_get_plot_by_id(self): r = Report('redfang') pg1 = PlotGroup('pgid1') p1 = Plot('pid1', 'anImg') pg1.add_plot(p1) r.add_plotgroup(pg1) p = r.get_plotgroup_by_id('pgid1').get_plot_by_id('pid1') self.assertEqual(p, p1) def test_get_plot_by_id_with_bad_id(self): r = Report('redfang') pg1 = PlotGroup('pgid1') p1 = Plot('pid1', 'anImg') 
pg1.add_plot(p1) r.add_plotgroup(pg1) bad_p = r.get_plotgroup_by_id( 'pgid1').get_plot_by_id('id_that_does_not_exist') self.assertIsNone(bad_p) def test_merge(self): EXPECTED_VALUES = { "n_reads": 300, "n_zmws": 60, } NAMES = { "n_reads": "Number of reads", "n_zmws": "Number of ZMWs" } chunks = [ Report("pbcommand_test", attributes=[ Attribute(id_="n_reads", value=50, name="Number of reads"), Attribute(id_="n_zmws", value=10, name="Number of ZMWs")], dataset_uuids=["12345"]), Report("pbcommand_test", attributes=[ Attribute(id_="n_reads", value=250, name="Number of reads"), Attribute(id_="n_zmws", value=50, name="Number of ZMWs")]), ] r = Report.merge(chunks) self.assertEqual([a.id for a in r.attributes], ["n_reads", "n_zmws"]) self.assertEqual(r._dataset_uuids, ["12345"]) for attr in r.attributes: self.assertEqual(attr.value, EXPECTED_VALUES[attr.id]) self.assertEqual(attr.name, NAMES[attr.id]) for table in r.tables: for column in table.columns: self.assertEqual(column.header, NAMES[column.id]) def test_merge_tables(self): names = ['laa_report1.json', 'laa_report2.json'] r = Report.merge([_to_report(names[0]), _to_report(names[1])]) table = r.tables[0] self.assertEqual(len(table.columns), 7) self.assertEqual( [col.header for col in table.columns], ['BarcodeName', 'FastaName', 'CoarseCluster', 'Phase', 'TotalCoverage', 'SequenceLength', 'PredictedAccuracy']) for col in table.columns: self.assertEqual(len(col.values), 4) if col.header == 'BarcodeName': self.assertEqual( col.values, ['Barcode1', 'Barcode2', 'Barcode4', 'Barcode3']) elif col.header == 'FastaName': self.assertEqual( col.values, ['BarcodeFasta1', 'BarcodeFasta2', 'BarcodeFasta4', 'BarcodeFasta3']) else: self.assertEqual(col.values, [1, 2, 4, 3]) class TestMalformedReport(unittest.TestCase): def test_bad_01(self): r = Report("stuff", uuid=1234) d = r.to_dict() def fx(): # when the Report validation is enabled, use to_json # r.to_json() return validate_report(d) self.assertRaises(IOError, fx) class TestReportSchemaVersion100(unittest.TestCase): name = "example_version_1_0_0.json" def test_sanity(self): r = _to_report(self.name) self.assertIsInstance(r, Report) class TestRepotSchemaVersion100WithPlots(TestReportSchemaVersion100): name = "example_with_plot.json" class TestReportSpec(unittest.TestCase): def setUp(self): self.spec = load_report_spec_from_json( os.path.join(DATA_DIR, "report-specs", "report_spec.json")) def test_report_validation(self): rpt = _to_report("test_report.json") r = self.spec.validate_report(rpt) self.assertTrue(isinstance(r, Report)) rpt.attributes.append(Attribute("attribute5", value=12345)) error_len = lambda e: len(e.message.split("\n")) try: self.spec.validate_report(rpt) except ValueError as e: self.assertEqual(error_len(e), 2) else: self.fail("Expected exception") self.assertFalse(self.spec.is_valid_report(rpt)) rpt.attributes[0] = Attribute("attribute1", value=1.2345) try: self.spec.validate_report(rpt) except ValueError as e: print e self.assertEqual(error_len(e), 3) else: self.fail("Expected exception") self.assertFalse(self.spec.is_valid_report(rpt)) def test_format_metric(self): s = format_metric("{:,d}", 123456789) self.assertEqual(s, "123,456,789") s = format_metric("{:.4g}", 1.2345678) self.assertEqual(s, "1.235") s = format_metric("{M:.2f} Mb", 123456789) self.assertEqual(s, "123.46 Mb") s = format_metric("{p:.5g}%", 0.987654321) self.assertEqual(s, "98.765%") s = format_metric("{p:g}", 0.000001) self.assertEqual(s, "0.0001%") s = format_metric("{:,.3f}", 1000000.2345678) self.assertEqual(s, 
"1,000,000.235") def test_apply_view(self): rpt = _to_report("test_report2.json") rpt = self.spec.apply_view(rpt) self.assertTrue(all([a.name is not None for a in rpt.attributes])) self.assertTrue(all([t.title is not None for t in rpt.tables])) self.assertTrue(all([c.header is not None for c in rpt.tables[0].columns])) self.assertTrue(all([pg.title is not None for pg in rpt.plotGroups])) self.assertTrue(all([p.title is not None for p in rpt.plotGroups[0].plots])) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_common_cmdline_core.py0000644000000000000000000000171113035554276025732 0ustar rootrootimport unittest import logging import shlex import pbcommand.common_options as CU from pbcommand.cli.core import pacbio_args_runner from pbcommand.cli import get_default_argparser from pbcommand.utils import setup_log log = logging.getLogger(__name__) def args_runner(*args, **kwargs): log.info("Running args: {a}".format(a=args)) return 0 def _example_parser(): p = get_default_argparser("1.0.0", "Example Mock Parser") p = CU.add_log_debug_option(p) p.add_argument('example_file', type=str, help="No testing of existence") return p def _example_main(cmdline_args): """Example func for testing.""" p = _example_parser() argv = shlex.split(cmdline_args) rcode = pacbio_args_runner(argv, p, args_runner, log, setup_log) return rcode class SimpleTest(unittest.TestCase): def test_01(self): args = "--debug /path/to/my_fake_file.txt" rcode = _example_main(args) self.assertEqual(rcode, 0) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_common_pacbio_options.py0000644000000000000000000001165113035554276027666 0ustar rootrootimport unittest import logging from pbcommand.models.common import (PacBioFloatChoiceOption, PacBioIntOption, PacBioStringOption, PacBioStringChoiceOption, PacBioIntChoiceOption, PacBioBooleanOption, PacBioFloatOption) log = logging.getLogger(__name__) def _to_i(s): return "test.task_options.{}".format(s) def get_or(i, value): return value if i is None else i class TestPacBioBasicOptionTest(unittest.TestCase): OPT_KLASS = PacBioIntOption OPT_ID = "alpha" OPT_NAME = "Alpha" OPT_DESC = "Alpha description" OPT_DEFAULT = 2 def _to_opt(self, i=None, n=None, v=None, d=None): ix = get_or(i, _to_i(self.OPT_ID)) name = get_or(n, self.OPT_NAME) value = get_or(v, self.OPT_DEFAULT) description = get_or(d, self.OPT_DESC) return self.OPT_KLASS(ix, name, value, description) def test_sanity_option(self): o = self._to_opt() log.debug("Created option {o}".format(o=o)) self.assertEqual(o.option_id, "test.task_options.{}".format(self.OPT_ID)) self.assertEqual(o.name, self.OPT_NAME) self.assertEqual(o.default, self.OPT_DEFAULT) self.assertEqual(o.description, self.OPT_DESC) class TestPacBioIntOptionTest(TestPacBioBasicOptionTest): def test_bad_value_string(self): with self.assertRaises(TypeError): _ = self._to_opt(v="bad-string") def test_bad_value_float(self): with self.assertRaises(TypeError): _ = self._to_opt(v=3.124) def test_bad_value_boolean(self): with self.assertRaises(TypeError): _ = self._to_opt(v=True) class TestPacBioBooleanOptionTest(TestPacBioBasicOptionTest): OPT_KLASS = PacBioBooleanOption OPT_DEFAULT = True def test_bad_value_int(self): with self.assertRaises(TypeError): _ = self._to_opt(v=1) def test_bad_value_float(self): with self.assertRaises(TypeError): _ = self._to_opt(v=1.10) def test_bad_value_string(self): with self.assertRaises(TypeError): _ = self._to_opt(v="bad-string") class TestPacBioFloatOptionTest(TestPacBioBasicOptionTest): OPT_KLASS = 
PacBioFloatOption OPT_DEFAULT = 3.1415 def test_coerced_value_int(self): o = self._to_opt(v=1) self.assertEqual(o.default, 1.0) def test_bad_value_boolean(self): with self.assertRaises(TypeError): _ = self._to_opt(v=True) def test_bad_value_string(self): with self.assertRaises(TypeError): _ = self._to_opt(v="bad-string") def test_bad_value_float_tuple(self): with self.assertRaises(TypeError): _ = self._to_opt(v=(1.0, 2.0)) class TestPacBioStringOptionTest(TestPacBioBasicOptionTest): OPT_KLASS = PacBioStringOption OPT_DEFAULT = "gamma" def test_bad_value_int(self): with self.assertRaises(TypeError): _ = self._to_opt(v=1) def test_bad_value_float(self): with self.assertRaises(TypeError): _ = self._to_opt(v=1.10) def test_bad_not_supported_unicode(self): """Test that unicode values are not Supported""" with self.assertRaises(TypeError): _ = self._to_opt(v=unicode('abcdef')) class TestPacBioBasicChoiceTest(TestPacBioBasicOptionTest): OPT_KLASS = PacBioStringChoiceOption OPT_CHOICES = ("alpha", "beta", "gamma") OPT_DEFAULT = "beta" OPT_BAD_OPTION = "delta" def _to_opt(self, i=None, n=None, v=None, d=None, c=None): ix = get_or(i, _to_i(self.OPT_ID)) name = get_or(n, self.OPT_NAME) value = get_or(v, self.OPT_DEFAULT) description = get_or(d, self.OPT_DESC) choices = get_or(c, self.OPT_CHOICES) return self.OPT_KLASS(ix, name, value, description, choices) def test_sanity_choice_option(self): o = self._to_opt() self.assertEqual(o.choices, self.OPT_CHOICES) def test_bad_invalid_choice(self): with self.assertRaises(ValueError): _ = self._to_opt(v=self.OPT_BAD_OPTION) class TestPacBioChoiceStringOptionTest(TestPacBioBasicChoiceTest): OPT_KLASS = PacBioStringChoiceOption OPT_DEFAULT = "gamma" OPT_BAD_OPTION = "Bad-value" class TestPacBioIntChoiceOptionTest(TestPacBioBasicChoiceTest): OPT_KLASS = PacBioIntChoiceOption OPT_CHOICES = (1, 2, 7) OPT_DEFAULT = 2 OPT_BAD_OPTION = 3 class TestPacBioFloatChoiceOptionTest(TestPacBioBasicChoiceTest): OPT_KLASS = PacBioFloatChoiceOption OPT_CHOICES = (1.0, 2.0, 7.0) OPT_DEFAULT = 2.0 OPT_BAD_OPTION = -1.0 def test_coerce_float_choices(self): choices = (10, 12123, 12) o = self._to_opt(c=choices, v=12) def test_bad_choices(self): choices = (1, 2.0, "bad-value") with self.assertRaises(TypeError): _ = self._to_opt(c=choices) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/__init__.py0000644000000000000000000000043013035554276022434 0ustar rootroot#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:fileencoding=utf_8 """Add doc string """ __author__ = 'M. Kocher' __copyright__ = "" __credits__ = ['M. Kocher'] __license__ = 'MIT License' __maintainer__ = 'M. 
Kocher' __email__ = 'Michael.Kocher@me.com' __version__ = '0.1' pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_pb_io_common.py0000644000000000000000000000216113035554276024377 0ustar rootrootimport unittest import logging from pbcommand.testkit.base_utils import get_temp_dir log = logging.getLogger(__name__) from pbcommand.models import PipelineChunk from pbcommand.pb_io import load_pipeline_chunks_from_json, write_pipeline_chunks from base_utils import get_temp_file class TestWriteChunk(unittest.TestCase): def test_write_chunks(self): def f(i): return {"{c}movie_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX): "/path/to_movie-{i}.fofn".format(i=i), "{c}region_fofn_id".format(c=PipelineChunk.CHUNK_KEY_PREFIX): "/path/rgn_{i}.fofn".format(i=i)} to_i = lambda i: "chunk-id-{i}".format(i=i) to_p = lambda i: PipelineChunk(to_i(i), **f(i)) nchunks = 5 pipeline_chunks = [to_p(i) for i in xrange(nchunks)] log.debug(pipeline_chunks) tmp_dir = get_temp_dir("pipeline-chunks") tmp_name = get_temp_file("_chunk.json", tmp_dir) write_pipeline_chunks(pipeline_chunks, tmp_name, "Example chunk file") pchunks = load_pipeline_chunks_from_json(tmp_name) self.assertEquals(len(pchunks), nchunks)pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_load_resolved_tool_contract.py0000644000000000000000000000375613035554276027526 0ustar rootrootimport pprint import tempfile import unittest import logging import os.path from base_utils import get_data_file, get_tool_contract, get_resolved_tool_contract from pbcommand.resolver import resolve_tool_contract from pbcommand.pb_io.tool_contract_io import (load_resolved_tool_contract_from, load_tool_contract_from) log = logging.getLogger(__name__) class _TestUtil(unittest.TestCase): FILE_NAME = "resolved_contract_01" def _to_object(self, path): log.debug("Loading from {p}".format(p=path)) return load_tool_contract_from(path) def test_sanity(self): path = get_data_file(self.FILE_NAME) tool_contract = self._to_object(path) self.assertIsNotNone(tool_contract) class TestLoadResolvedContract(unittest.TestCase): def test_01(self): path = get_resolved_tool_contract("dev_example_resolved_tool_contract.json") rtc = load_resolved_tool_contract_from(path) log.info(rtc) self.assertIsNotNone(rtc) class TestResolveContract(unittest.TestCase): def test_01(self): name = "dev_example_dev_txt_app_tool_contract.json" p = get_tool_contract(name) tc = load_tool_contract_from(p) input_files = ["/tmp/file.txt"] root_output_dir = "/tmp" root_tmp_dir = root_output_dir tmp_file = tempfile.NamedTemporaryFile().name max_nproc = 2 tool_options = {} rtc = resolve_tool_contract(tc, input_files, root_output_dir, root_tmp_dir, max_nproc, tool_options, False) log.info(pprint.pformat(rtc)) self.assertIsNotNone(rtc) self.assertEqual(os.path.basename(rtc.task.output_files[0]), "output.txt") # Validate Resolved Resource Types log.debug("Resources {t}".format(t=rtc.task.resources)) self.assertEqual(len(rtc.task.tmpdir_resources), 1) self.assertEqual(len(rtc.task.tmpfile_resources), 2) #self.assertEqual(rtc.task.tmp_file, tmp_file) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_models_report_plotgroup.py0000644000000000000000000000340613035554276026733 0ustar rootroot import unittest import logging from pprint import pformat from pbcommand.models.report import PlotGroup, Plot, PbReportError log = logging.getLogger(__name__) class TestPlotGroup(unittest.TestCase): def test_init(self): """Test constructor with kwargs""" plot = Plot('a_plot', 'path/to/image.png', caption="My 
Image") p = PlotGroup('my_pg', plots=[plot]) self.assertIsNotNone(p) def test_plotgroup_null_id(self): """Can't create an plotGroup without an id.""" def _test(): p = PlotGroup(None) self.assertRaises(PbReportError, _test) def test_plotgroup_add_duplicate_plot(self): """Can't add plots with duplicate ids.""" def _test(): pg = PlotGroup('foo') pg.add_plot(Plot('id', 'i1')) pg.add_plot(Plot('id', 'i2')) self.assertRaises(PbReportError, _test) def test_to_dict(self): """Test plotGroup to_dict function.""" a = PlotGroup('123', title='foo title', legend='foo legend', thumbnail='foo thumbnail') a.add_plot(Plot('id', 'i1', caption='a caption')) d = a.to_dict() log.debug(pformat(d)) self.assertEquals('123', d['id']) self.assertEquals('foo title', d['title']) self.assertEquals('foo legend', d['legend']) self.assertEquals('foo thumbnail', d['thumbnail']) self.assertEquals(1, len(d['plots'])) log.info(a) self.assertIsNotNone(repr(a)) def test_adding_incorrect_type(self): """Validate type when adding Plots.""" def _test(): plots = ['Not a plot instance', 'Another bad plot.'] p = PlotGroup('my_plotgroup', plots=plots) self.assertRaises(TypeError, _test) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/base_utils.py0000755000000000000000000000162113035554276023035 0ustar rootroot import os from pbcommand.testkit.base_utils import * DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) DATA_DIR_TC = os.path.join(DATA_DIR, 'tool-contracts') DATA_DIR_TC_V1 = os.path.join(DATA_DIR, 'tool-contracts-v1') DATA_DIR_RTC = os.path.join(DATA_DIR, 'resolved-tool-contracts') DATA_DIR_PRESETS = os.path.join(DATA_DIR, "pipeline-presets") DATA_DIR_DSVIEW = os.path.join(DATA_DIR, "pipeline-datastore-view-rules") DATA_DIR_REPORT_SPECS = os.path.join(DATA_DIR, "report-specs") def get_data_file(path): return os.path.join(DATA_DIR, path) def get_data_file_from_subdir(subdir, path): return os.path.join(DATA_DIR, subdir, path) def get_tool_contract(name): return os.path.join(DATA_DIR_TC, name) def get_tool_contract_v1(name): return os.path.join(DATA_DIR_TC_V1, name) def get_resolved_tool_contract(name): return os.path.join(DATA_DIR_RTC, name) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_parsers.py0000644000000000000000000001112113035554276023412 0ustar rootrootimport unittest from pbcommand.models import TaskTypes, FileTypes, get_pbparser class TestParsers(unittest.TestCase): def test_input_output_files(self): p = get_pbparser( "pbcommand.tasks.test_parsers", "0.1", "Display Name", "Tool Description ", "pbcommand-driver-cmd", is_distributed=False, nproc=1, resource_types=()) p.add_input_file_type( file_type=FileTypes.FASTA, file_id="fasta", name="Fasta file", description="Fasta file description") p.add_input_file_type(FileTypes.JSON, "json", "JSON file", "JSON file description") p.add_output_file_type( file_type=FileTypes.GFF, file_id="gff", name="GFF file", description="GFF file description", default_name="annotations") tc_contract = p.to_contract() d = tc_contract.to_dict() inputs = d['tool_contract']['input_types'] self.assertEqual(inputs, [ { 'description': 'Fasta file description', 'title': 'Fasta file', 'id': 'fasta', 'file_type_id': 'PacBio.FileTypes.Fasta' }, { 'description': 'JSON file description', 'title': 'JSON file', 'id': 'json', 'file_type_id': 'PacBio.FileTypes.json' } ]) outputs = d['tool_contract']['output_types'] self.assertEqual(outputs, [ { 'title': 'GFF file', 'description': 'GFF file description', 'default_name': 'annotations', 'id': 'gff', 'file_type_id': 
'PacBio.FileTypes.gff' } ]) def test_misc_parser_types(self): p = get_pbparser( "pbcommand.tasks.test_parsers", "0.1.0", "Tool Name", "Tool Descripion", "pbcommand-driver-exe ") p.add_int("pbcommand.task_options.n", "n", default=0, name="N", description="Integer option") p.add_float("pbcommand.task_options.f", "f", default=0, name="F", description="Float option") # XXX note that the 'default' value is not actually what the option is # set to by default - it simply signals that action=store_true p.add_boolean("pbcommand.task_options.loud", "loud", default=False, name="Verbose", description="Boolean option") p.add_choice_str("pbcommand.task_options.ploidy", "ploidy", choices=["haploid","diploid"], name="Ploidy", description="Choice Option", default="haploid") p.add_choice_int("pbcommand.task_options.delta", "delta", choices=[1,2,3], name="Delta", description="Int Choice Option", default=1) p.add_choice_float("pbcommand.task_options.epsilon", "epsilon", choices=[0.01,0.1,1.0], name="Epsilon", description="Float Choice Option", default=0.1) pa = p.arg_parser.parser.parse_args opts = pa(["--n", "250", "--f", "1.2345", "--loud"]) self.assertEqual(opts.n, 250) self.assertEqual(opts.f, 1.2345) self.assertTrue(opts.loud) self.assertEqual(opts.ploidy, "haploid") self.assertEqual(opts.delta, 1) self.assertEqual(opts.epsilon, 0.1) opts2 = pa([]) self.assertFalse(opts2.loud) p.add_input_file_type(FileTypes.JSON, "json", "JSON file", "JSON file description") p.add_output_file_type( file_type=FileTypes.GFF, file_id="gff", name="GFF file", description="GFF file description", default_name="annotations") tc = p.to_contract() def test_catch_output_file_extension(self): p = get_pbparser( "pbcommand.tasks.test_parsers", "0.1.0", "Tool Name", "Tool Descripion", "pbcommand-driver-exe ") p.add_output_file_type( file_type=FileTypes.GFF, file_id="gff", name="GFF file", description="GFF file description", default_name="annotations.gff") tc = p.to_contract() self.assertRaises(ValueError, tc.to_dict) # TODO we should add a lot more tests for parser behavior if __name__ == "__main__": unittest.main() pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_e2e_example_apps.py0000644000000000000000000000763613035554276025164 0ustar rootrootimport logging from base_utils import get_data_file from pbcommand.testkit import PbTestApp from pbcommand.resolver import ToolContractError log = logging.getLogger(__name__) class TestRunDevApp(PbTestApp): DRIVER_BASE = "python -m pbcommand.cli.examples.dev_app " REQUIRES_PBCORE = True INPUT_FILES = [get_data_file("example.fasta")] TASK_OPTIONS = {"pbcommand.task_options.dev_read_length": 27} class TestTxtDevApp(PbTestApp): DRIVER_BASE = "python -m pbcommand.cli.examples.dev_txt_app " # XXX using default args, so the emit/resolve drivers are automatic REQUIRES_PBCORE = False INPUT_FILES = [get_data_file("example.txt")] TASK_OPTIONS = {"pbcommand.task_options.dev_max_nlines": 27} RESOLVED_TASK_OPTIONS = {"pbcommand.task_options.dev_max_nlines": 27} class TestQuickDevHelloWorld(PbTestApp): """Runs dev_qhello_world """ DRIVER_EMIT = "python -m pbcommand.cli.examples.dev_quick_hello_world emit-tool-contract pbcommand.tasks.dev_qhello_world " DRIVER_RESOLVE = "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc " REQUIRES_PBCORE = False INPUT_FILES = [get_data_file("example.txt")] IS_DISTRIBUTED = True RESOLVED_IS_DISTRIBUTED = True class TestQuickTxt(PbTestApp): """Runs dev_qhello_world """ DRIVER_EMIT = "python -m pbcommand.cli.examples.dev_quick_hello_world 
emit-tool-contract pbcommand.tasks.dev_txt_hello " DRIVER_RESOLVE = "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc " REQUIRES_PBCORE = False INPUT_FILES = [get_data_file("example.txt")] IS_DISTRIBUTED = True RESOLVED_IS_DISTRIBUTED = False # XXX is_distributed=False in task TC! class TestQuickCustomTxtCustomOuts(PbTestApp): """Runs dev_qhello_world """ DRIVER_EMIT = "python -m pbcommand.cli.examples.dev_quick_hello_world emit-tool-contract pbcommand.tasks.dev_txt_custom_outs " DRIVER_RESOLVE = "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc " REQUIRES_PBCORE = False INPUT_FILES = [get_data_file("example.txt")] class TestOptionTypes(PbTestApp): DRIVER_BASE = "python -m pbcommand.cli.examples.dev_mixed_app" REQUIRES_PBCORE = False INPUT_FILES = [get_data_file("example.txt")] TASK_OPTIONS = { "pbcommand.task_options.alpha": 50, "pbcommand.task_options.beta": 9.876, "pbcommand.task_options.gamma": False, "pbcommand.task_options.ploidy": "diploid" } RESOLVED_TASK_OPTIONS = { "pbcommand.task_options.alpha": 50, "pbcommand.task_options.beta": 9.876, "pbcommand.task_options.gamma": False, "pbcommand.task_options.ploidy": "diploid", "pbcommand.task_options.delta": 1, "pbcommand.task_options.epsilon": 0.1 } class TestBadChoiceValue(TestOptionTypes): TASK_OPTIONS = { "pbcommand.task_options.alpha": 50, "pbcommand.task_options.beta": 9.876, "pbcommand.task_options.gamma": False, "pbcommand.task_options.ploidy": "other" } def test_run_e2e(self): self.assertRaises(ToolContractError, super(TestBadChoiceValue, self).test_run_e2e) class TestQuickOptionTypes(PbTestApp): DRIVER_EMIT = "python -m pbcommand.cli.examples.dev_quick_hello_world emit-tool-contract pbcommand.tasks.dev_test_options" DRIVER_RESOLVE = "python -m pbcommand.cli.examples.dev_quick_hello_world run-rtc " INPUT_FILES = [get_data_file("example.txt")] TASK_OPTIONS = { "pbcommand.task_options.alpha": 50, "pbcommand.task_options.beta": 9.876, "pbcommand.task_options.gamma": False, "pbcommand.task_options.ploidy": "diploid" } RESOLVED_TASK_OPTIONS = { "pbcommand.task_options.alpha": 50, "pbcommand.task_options.beta": 9.876, "pbcommand.task_options.gamma": False, "pbcommand.task_options.ploidy": "diploid", "pbcommand.task_options.delta": 1, "pbcommand.task_options.epsilon": 0.01 } pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/tests/test_pb_io_conditions.py0000644000000000000000000000236013035554276025261 0ustar rootrootimport unittest import logging import os from base_utils import get_data_file_from_subdir from pbcommand.pb_io import load_reseq_conditions_from log = logging.getLogger(__name__) _SERIALIZED_JSON_DIR = 'example-conditions' def _loader(name): file_name = get_data_file_from_subdir(_SERIALIZED_JSON_DIR, name) log.info("loading json report from {f}".format(f=file_name)) r = load_reseq_conditions_from(file_name) return r class TestSerializationOfResequencingConditions(unittest.TestCase): FILE_NAME = 'reseq-conditions-01.json' @classmethod def setUpClass(cls): cls.cs = _loader(cls.FILE_NAME) def test_condition_n(self): self.assertEqual(len(self.cs.conditions), 3) def test_condition_a(self): log.info(self.cs) self.assertEqual(self.cs.conditions[0].cond_id, "cond_alpha") def test_condition_paths_abs(self): for c in self.cs.conditions: self.assertTrue(os.path.isabs(c.subreadset)) self.assertTrue(os.path.isabs(c.alignmentset)) self.assertTrue(os.path.isabs(c.referenceset)) class TestSerializationOfResequencingConditionsWithRelativePath(TestSerializationOfResequencingConditions): FILE_NAME = 
'reseq-conditions-02.json' pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/REQUIREMENTS_TEST.txt0000644000000000000000000000024613035554276022511 0ustar rootrootnose tox # Putting these here for RTD sphinx-argparse sphinx-bootstrap-theme avro requests iso8601 # For sphinx extension ipython # ipython requires this? matplotlib pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/MANIFEST.in0000644000000000000000000000007013035554276020717 0ustar rootrootinclude *.txt *.md recursive-include examples *.txt *.pypbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/.travis.yml0000644000000000000000000000035313035554276021276 0ustar rootrootlanguage: python python: - "2.7" # command to install dependencies install: - "pip install -r REQUIREMENTS.txt" - "pip install ." # command to run tests script: nosetests -s --verbose --logging-config log_nose.cfg tests/test_*.pypbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/README.md0000644000000000000000000003341213035554276020446 0ustar rootroot## pbcommand High Level Overview co-owners: [mpkocher](https://github.com/mpkocher) [natechols](https://github.com/natechols) PacBio Officially Supported Library. Note the APIs are still in flux and not yet 1.0.0. [Full Docs](http://pbcommand.readthedocs.org/en/latest/) [![Circle CI](https://circleci.com/gh/PacificBiosciences/pbcommand.svg?style=svg)](https://circleci.com/gh/PacificBiosciences/pbcommand) ### Components PacBio library for common utils, models, and tools to interface with pbsmrtpipe workflow engine. 1. Common Models and Schemas 2. Service client layer to the SMRTLink services 3. Tool Contract and Resolved Tool Contract interface for integrating with pbsmrtpipe and SMRT Link ## PacBio Core Models and Schemas - [ToolContract](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/tool_contract.avsc) : Used in define task interfaces and used in pipeline definitions (i.e., Resolved Pipeline Templates) - [ResolvedToolContract](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/resolved_tool_contract.avsc): Used in pipeline running - [PacBio Report](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/pbreport.avsc) Core data model for plots, tables and metrics (i.e., attributes) displayed in SMRTLink or available as output from SMRT Link web services) [Examples](https://github.com/PacificBiosciences/pbcommand/tree/master/tests/data/example-reports) - [PacBio DataStore](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/datastore.avsc): JSON file of datastore file(s) that are emitted from an analysis, import-dataset or other job type. 
- [DataStore View Rules](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/datastore_view_rules.avsc) Custom views of datastore files in SMRTLink) - [TODO] Pipeline Template View Rules: Custom hiding and renaming of pipeline task options for SMRTLink UI) [Examples](https://github.com/PacificBiosciences/smrtflow/tree/master/smrt-server-analysis/src/main/resources/pipeline-template-view-rules) - [TODO] Resolved Pipeline Template: Custom description of pipeline, bindings and task options for SMRTLink UI [Examples](https://github.com/PacificBiosciences/smrtflow/blob/master/smrt-server-analysis/src/main/resources/resolved-pipeline-templates/pbsmrtpipe.pipelines.dev_diagnostic_pipeline_template.json) - [TODO] Report View Rules [Examples](https://github.com/PacificBiosciences/smrtflow/tree/master/smrt-server-analysis/src/main/resources/report-view-rules) An HTML view of these models can be generated by using [AvroDoc](https://github.com/ept/avrodoc) The Avro schema format can be converted to JSON Schema using [Avro To Json Schema](https://json-schema-validator.herokuapp.com) ## Service Client Layer to SMRT Link pbcommand provides a high level interface to the SMRT Link services. See `pbcommand.services` for more details. Here's a terse example of getting a Job by id and fetching the report metrics from the results. ``` IPython 5.1.0 -- An enhanced Interactive Python. ? -> Introduction and overview of IPython's features. %quickref -> Quick reference. help -> Python's own help system. object? -> Details about 'object', use 'object??' for extra details. In [1]: from pbcommand.services import ServiceAccessLayer In [2]: s = ServiceAccessLayer("smrtlink-beta", 8081) In [3]: j = s.get_analysis_job_by_id(22270) In [4]: j.id, j.name, j.path Out[4]: (22270, 'm54088_160819_000654_resequencing', '/pbi/dept/secondary/siv/smrtlink/smrtlink-beta/smrtsuite_166987/userdata/jobs_root/022/022270') In [5]: j Out[5]: ServiceJob(id=22270, uuid=u'4443d0b6-899c-40b9-98f2-2e2b4f889b53', name='m54088_160819_000654_resequencing', state='SUCCESSFUL', path='/pbi/dept/secondary/siv/smrtlink/smrtlink-beta/smrtsuite_166987/userdata/jobs_root/022/022270', job_type='pbsmrtpipe', created_at=datetime.datetime(2016, 8, 20, 1, 25, 50, 874000, tzinfo=), settings={u'workflowOptions': [], u'entryPoints': [{u'fileTypeId': u'PacBio.DataSet.ReferenceSet', u'entryId': u'eid_ref_dataset', u'datasetId': 9898}, {u'fileTypeId': u'PacBio.DataSet.SubreadSet', u'entryId': u'eid_subread', u'datasetId': 35547}], u'pipelineId': u'pbsmrtpipe.pipelines.sa3_ds_resequencing_fat', u'taskOptions': [], u'name': u'm54088_160819_000654_resequencing'}) In [6]: report_metrics = s.get_analysis_job_report_attrs(22270) In [7]: report_metrics Out[7]: {u'coverage.depth_coverage_mean': 11251.832147121406, u'coverage.missing_bases_pct': 0.004559547692868867, u'mapping_stats.mapped_readlength_max': 15478, u'mapping_stats.mapped_readlength_mean': 3899, u'mapping_stats.mapped_readlength_n50': 5844, u'mapping_stats.mapped_readlength_q95': 9560, u'mapping_stats.mapped_reads_n': 207986, u'mapping_stats.mapped_subread_bases_n': 810462596, u'mapping_stats.mapped_subread_concordance_mean': 0.8354, u'mapping_stats.mapped_subread_readlength_max': 12846.0, u'mapping_stats.mapped_subread_readlength_mean': 3823, u'mapping_stats.mapped_subreadlength_n50': 5799, u'mapping_stats.mapped_subreadlength_q95': 9530, u'mapping_stats.mapped_subreads_n': 212005, u'variants.longest_contig_name': u'11k_pbell_H1-6_ScaI_circular_3x_l65796', 
u'variants.mean_contig_length': 65796.0, u'variants.weighted_mean_bases_called': 0.9999544045230713, u'variants.weighted_mean_concordance': 0.9999696016293528, u'variants.weighted_mean_coverage': 11251.832147121406} In [8]: ``` ## Tool Contract and Resolved Tool Contracts To integrate with the pbsmrtpipe workflow engine you must to be able to generate a **Tool Contract** and to be able to run from a **Resolved Tool Contract**. A **Tool Contract** contains the metadata of the exe, such as the file types of inputs, outputs and options. Example [Tool Contract Json](https://github.com/PacificBiosciences/pbcommand/blob/master/tests/data/dev_example_dev_txt_app_tool_contract.json) (and [Avro Schema](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/tool_contract.avsc)) Example [Resolved Tool Contract Json](https://github.com/PacificBiosciences/pbcommand/blob/master/tests/data/resolved_tool_contract_dev_app.json) (and [Avro Schema](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/schemas/resolved_tool_contract.avsc)) There are two principle use cases, first wrapping/calling python functions that have been defined in external python packages, or scripts. Second, creating a CLI tool that supports emitting tool contracts, running resolved tool contracts and complete argparse style CLI. Example from **pbcommand.cli.examples** ```python import sys import logging from pbcommand.models import FileTypes from pbcommand.cli import registry_builder, registry_runner log = logging.getLogger(__name__) registry = registry_builder("pbcommand", "python -m pbcommand.cli.examples.dev_quick_hello_world ") def _example_main(input_files, output_files, **kwargs): # Simple Function that should imported from your library code # write mock output files for testing purposes, otherwise the End-to-End test will fail xs = output_files if isinstance(output_files, (list, tuple)) else [output_files] for x in xs: with open(x, 'w') as writer: writer.write("Mock data\n") return 0 @registry("dev_qhello_world", "0.2.1", FileTypes.FASTA, FileTypes.FASTA, nproc=1, options=dict(alpha=1234)) def run_rtc(rtc): return _example_main(rtc.task.input_files[0], rtc.task.output_files[0], nproc=rtc.task.nproc) @registry("dev_fastq2fasta", "0.1.0", FileTypes.FASTQ, FileTypes.FASTA) def run_rtc(rtc): return _example_main(rtc.task.input_files[0], rtc.task.output_files[0]) if __name__ == '__main__': sys.exit(registry_runner(registry, sys.argv[1:])) ``` A driver is the commandline interface that the workflow engine will call. The driver will be called with "${exe} /path/to/resolved_tool_contract.json" The tool contracts can be emitted to a directory and used in [pbsmrtpipe](https://github.com/PacificBiosciences/pbsmrtpipe). ```bash $> python -m pbcommand.cli.examples.dev_quick_hello_world -o /path/to/my-tool-contracts ``` ## Creating a Full Commandline Tool with TC/RTC and argparse support Three Steps - define Parser using `get_pbparser` - add running from argparse and running from Resolved ToolContract funcs to call your main - add call to driver Import or define your main function from your library. ```python def run_my_main(fasta_in, fasta_out, min_length): # do work. Main should return an int exit code and be completely independent of argparse return 0 ``` Define a function that will add inputs, outputs and options to your parser. 
```python from pbcommand.models import FileTypes def add_args_and_options(p): # FileType, label, name, description p.add_input_file_type(FileTypes.FASTA, "fasta_in", "Fasta File", "PacBio Spec'ed fasta file") # File Type, label, name, description, default file name p.add_output_file_type(FileTypes.FASTA, "fasta_out", "Filtered Fasta file", "Filtered Fasta file", "filter.fasta") # Option id, label, default value, name, description # for the argparse, the read-length will be translated to --read-length and (accessible via args.read_length) p.add_int("pbcommand.task_options.dev_read_length", "read-length", 25, "Length filter", "Min Sequence Length filter") return p ``` Define Parser ```python from pbcommand.models import TaskTypes, SymbolTypes, get_pbparser def get_contract_parser(): tool_id = "example_namespace.tasks.my_id" version = "0.1.0" # or reuse __version__ display_name = "My Example Tool" # Number of processors to use, can also be SymbolTypes.MAX_NPROC nproc = 1 # Log file, tmp dir, tmp file. See ResourceTypes in models, ResourceTypes.TMP_DIR resource_types = () # Commandline exe to call "{exe}" /path/to/resolved-tool-contract.json driver_exe = "python -m pbcommand.cli.example.dev_app --resolved-tool-contract " desc = "Dev app for Testing that supports emitting tool contracts" is_distributed = False # set to True if you want your task to be submitted to the cluster manager (e.g., SGE) if # one is provided to the workflow engine. p = get_pbparser(tool_id, version, display_name, desc, driver_exe, is_distributed=is_distributed, nproc=nproc, resource_types=resource_types) add_args_and_options(p) return p ``` Define a Wrapping IO layer to call your main function from both the tool contract and raw argparse IO layer ```python def _args_runner(args): # this is the args from parser.parse_args() using the python stdlib argparse model # the properties of args are defined as "labels" in the add_args_and_options func. return run_my_main(args.fasta_in, args.fasta_out, args.read_length) def _resolved_tool_contract_runner(resolved_tool_contract): """ :type resolved_tool_contract: pbcommand.models.ResolvedToolContract""" rtc = resolved_tool_contract # all options are referenced by globally namespaced id. This allows tools to use other tools options # e.g., pbalign to use blasr defined options. return run_my_main(rtc.task.input_files[0], rtc.task.outputs[0], rtc.task.options["pbcommand.task_options.dev_read_length"]) ``` Add running layer ```python import sys import logging from pbcommand.utils import setup_log from pbcommand.cli import pbparser_runner log = logging.getLogger(__name__) def main(argv=sys.argv): # New interface that supports running resolved tool contracts log.info("Starting {f} version {v} pbcommand example dev app".format(f=__file__, v="0.1.0")) return pbparser_runner(argv[1:], get_contract_parser(), _args_runner, # argparse runner func _resolved_tool_contract_runner, # tool contract runner func log, # log instance setup_log # setup log func ) if __name__ == '__main__': sys.exit(main()) ``` Now you can run your tool via the argparse standard interface as well as emitting a **Tool Contract** to stdout from the commandline interface. 
```sh > python -m 'pbcommand.cli.examples.dev_app' --emit-tool-contract ``` And you can run the tool from a **Resolved Tool Contract** ```sh > python -m pbcommand.cli.examples.dev_app --resolved-tool-contract /path/to/resolved_contract.json ``` See the dev apps in ["pbcommand.cli.examples"](https://github.com/PacificBiosciences/pbcommand/blob/master/pbcommand/cli/examples/dev_app.py) for a complete application (They require pbcore to be installed). In addition to TC/RTC support, there's a complete argparse support for the task options. An example of **help** is shown below. ```sh $> python -m pbcommand.cli.examples.dev_app --help usage: dev_app.py [-h] [-v] [--versions] [--emit-tool-contract] [--resolved-tool-contract RESOLVED_TOOL_CONTRACT] [--log-level LOG_LEVEL] [--debug] [--read-length READ_LENGTH] fasta_in fasta_out Dev app for Testing that supports emitting tool contracts positional arguments: fasta_in PacBio Spec'ed fasta file fasta_out Filtered Fasta file optional arguments: -h, --help show this help message and exit -v, --version show program's version number and exit --versions Show versions of individual components (default: None) --emit-tool-contract Emit Tool Contract to stdout (default: False) --resolved-tool-contract RESOLVED_TOOL_CONTRACT Run Tool directly from a PacBio Resolved tool contract (default: None) --log-level LOG_LEVEL Set log level (default: 10) --debug Debug to stdout (default: False) --read-length READ_LENGTH Min Sequence Length filter (default: 25) ``` pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/Dockerfile0000644000000000000000000000036413035554276021161 0ustar rootroot# Install CramUnit and run a simple dev example FROM mpkocher/docker-pacbiobase MAINTAINER Michael Kocher # Copy the code to container COPY ./ /tmp/pbcommand # Install RUN pip install -r /tmp/C/REQUIREMENTS.txt && pip install /tmp/pbcommand pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/setup.py0000644000000000000000000000335313035554276020702 0ustar rootrootimport os try: from setuptools import setup, find_packages except ImportError: from distutils.core import setup version = __import__('pbcommand').get_version() _REQUIREMENTS_FILE = 'REQUIREMENTS.txt' _REQUIREMENTS_TEST_FILE = "REQUIREMENTS_TEST.txt" _README = 'README.md' def _get_description(): with open(_get_local_file(_README)) as f: _long_description = f.read() return _long_description def _get_local_file(file_name): return os.path.join(os.path.dirname(__file__), file_name) def _get_requirements(file_name): with open(file_name, 'r') as f: reqs = [line for line in f if not line.startswith("#")] return reqs def _get_local_requirements(file_name): return _get_requirements(_get_local_file(file_name)) setup( name='pbcommand', version=version, license='BSD', author='mpkocher natechols', author_email='mkocher@pacificbiosciences.com', url="https://github.com/PacificBiosciences/pbcommand", download_url='https://github.com/PacificBiosciences/pbcommand/tarball/{v}'.format(v=version), description='Library and Tools for interfacing to PacBio pbsmrtpipe workflow engine.', install_requires=_get_local_requirements(_REQUIREMENTS_FILE), tests_require=_get_local_requirements(_REQUIREMENTS_TEST_FILE), long_description=_get_description(), keywords='workflow pacbio'.split(), packages=find_packages(), package_data={"pbcommand": ["schemas/*.avsc"]}, zip_safe=False, extras_require={"pbcore": ["pbcore", "ipython", "autopep8"], "interactive": ['prompt_toolkit']}, classifiers=['Development Status :: 4 - Beta', 'Environment :: Console', 'Topic :: Software 
Development :: Bug Tracking'] ) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/setup.cfg0000644000000000000000000000004713035554276021006 0ustar rootroot[metadata] description-file = README.mdpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/log_nose.cfg0000644000000000000000000000070413035554276021453 0ustar rootroot[loggers] keys=root,log01 [logger_root] #level=DEBUG level=NOTSET handlers=hand01 [logger_log01] level=NOTSET handlers=hand01 propagate=1 qualname="" [handlers] keys=hand01 [filters] [formatters] keys=form01 [handler_hand01] class=FileHandler level=DEBUG formatter=form01 args=('reports_unittests.log', 'w') [formatter_form01] format=[%(levelname)s] %(asctime)-15s [%(name)s %(funcName)s %(lineno)d] %(message)s datefmt= class=logging.Formatter pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/.gitignore0000644000000000000000000000156213035554276021160 0ustar rootroot# Created by .ignore support plugin (hsz.mobi) ### Python template # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ # Custom logger for nose reports_unittests.log .DS_Store # ipython notebooks notebooks java-classespbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/extract-readme-snippets.py0000644000000000000000000000120213035554276024301 0ustar rootroot#!/usr/bin/env python """ Pandoc filter to exact python code blocks and write each snippet out. 
""" from pandocfilters import toJSONFilter, Str n = 0 def caps(key, value, format, meta): global n if key == "CodeBlock": py_types = value[0][1][0] if py_types.encode("ascii") == "python": code_block = value[-1] # eval(code_block) with open("readme-snippet-{n}.py".format(n=n), 'a') as f: f.write("# example {k}-{n}\n".format(k=key, n=n)) f.write("{v}\n".format(v=code_block)) n += 1 if __name__ == "__main__": toJSONFilter(caps) pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/REQUIREMENTS.txt0000644000000000000000000000002613035554276021646 0ustar rootrootavro requests iso8601 pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/0000755000000000000000000000000013035554276020114 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/example_01.py0000644000000000000000000000253213035554276022423 0ustar rootrootimport pprint from pbcommand.models.common import (FileTypes, ResourceTypes, SymbolTypes, TaskTypes) from pbcommand.models.parser import get_pbparser from pbcommand.models.tool_contract import ToolDriver def _example_options(p): p.add_input_file_type(FileTypes.BAM, "ubam", "Unaligned BAM", "A General description of BAM") p.add_input_file_type(FileTypes.DS_REF, "ref", "Reference", "Reference Dataset XML") p.add_int("mytool.task_options.myoption", "myopt", 7, "My Option", "My Option which does this and that") p.add_str("mytool.task_options.myopt2", "mylabel", "Stuff", "My Option name", "My Option2 description") p.add_output_file_type(FileTypes.REPORT, "rpt", "Json Report", "Mapping Stats Report Task", "mapping-stats.report.json") return p def example_01(): driver = ToolDriver("my-exe --config") resource_types = (ResourceTypes.TMP_DIR, ResourceTypes.LOG_FILE) p = get_pbparser("pbcommand.tools.example", "0.1.2", "My Description", driver, TaskTypes.DISTRIBUTED, SymbolTypes.MAX_NPROC, resource_types) return _example_options(p) def example_02(): p = example_01() print "Generated Manifest" pprint.pprint(p.parsers[1].to_tool_contract()) # ipython will dump out here. with non-zero exitcode. blah... print "Running Argparse --help" p.parsers[0].parser.parse_args(["--help"]) return p pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/Makefile0000644000000000000000000001640613035554276021563 0ustar rootroot# Makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. 
PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." qthelp: $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp @echo @echo "Build finished; now you can run "qcollectiongenerator" with the" \ ".qhcp project file in $(BUILDDIR)/qthelp, like this:" @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pbcommand.qhcp" @echo "To view the help file:" @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pbcommand.qhc" applehelp: $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp @echo @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." @echo "N.B. You won't be able to view it unless you put it in" \ "~/Library/Documentation/Help or install it in your application" \ "bundle." devhelp: $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp @echo @echo "Build finished." 
@echo "To view the help file:" @echo "# mkdir -p $$HOME/.local/share/devhelp/pbcommand" @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pbcommand" @echo "# devhelp" epub: $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." xml: $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/0000755000000000000000000000000013035554276021414 5ustar rootrootpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.utils.rst0000644000000000000000000000014713035554276025247 0ustar rootrootpbcommand Utils ############### Util functions .. automodule:: pbcommand.utils :members: pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.models.task_options.rst0000644000000000000000000000053513035554276030107 0ustar rootrootpbcommand Task Option Types ########################### Supported Task Option Data models .. 
automodule:: pbcommand.models :members: BasePacBioOption, PacBioIntOption, PacBioBooleanOption,PacBioStringOption,PacBioFloatOption,PacBioIntChoiceOption,PacBioFloatChoiceOption,PacBioStringChoiceOption :undoc-members: :show-inheritance: pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.models.tc.rst0000644000000000000000000000053713035554276026002 0ustar rootrootpbcommand Tool Contract ####################### Tool Contract Models .. automodule:: pbcommand.models :members: ToolContractTask, ToolDriver, InputFileType, OutputFileType, MalformedToolContractError, MalformedResolvedToolContractError, ToolContract, ScatterToolContractTask, GatherToolContractTask :undoc-members: :show-inheritance:pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/advanced_task_types.rst0000644000000000000000000001202513035554276026161 0ustar rootrootAdvanced Task/ToolContract Types ================================ To enable pipeline scaling, "Chunking" of files two new Tool Contract types extend the base Tool Contract data model. Scattering/Chunking Tool Contract ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tasks/ToolContract that take a any file type(s) and emit a **single** scatter.chunk.json file. At a high level, the Scatter Tool Contract data model extends the core Tool Contract model and adds two fields, `chunk_keys` and `nchunks`. - `chunk_keys` is the expected key(s) that will be written to the `PipelineChunk` data model (defined below) - `nchunks` mirrors the `nproc` model of using a symbol `$max_nchunks` or an int to define the absolute upper bound on the number of chunks that should be created. If this value is exceeded, the pipeline engine will immediately fail the execution. Example Tool Contract .. literalinclude:: ../../tests/data/tool-contracts/dev_scatter_fasta_app_tool_contract.json :language: javascript PipelineChunk Data Model ~~~~~~~~~~~~~~~~~~~~~~~~ The `PipelineChunk` data model is defined in `pbcommand.models` and the companion IO layers (`load_pipeline_chunks_from_json` and `write_pipeline_chunks` are in `pbcommand.pb_io`. Each input file **must** be mapped to a `chunk_key` that can then be mapped to the input of the original `unchunked` task. For example, if there's a single input file (e.g., FileTypes.FASTA), then the Scatter ToolContract should define a `chunk_key` of "fasta_id". `chunk_key`(s) that do NOT start with `$chunk.` will considered to be extra metadata that will be passed through. This is useful for adding chunk specific metadata, such as the number of contigs or average contig length. Minimal example of reading and writing `PipelineChunk(s)` data model. .. ipython:: In [1]: from pbcommand.models import PipelineChunk In [5]: c0 = PipelineChunk("scattered-fasta_0", **{"$chunk.fasta_id":"/path/to/chunk-0.fasta"}) In [6]: c1 = PipelineChunk("scattered-fasta_1", **{"$chunk.fasta_id":"/path/to/chunk-1.fasta"}) In [7]: chunks = [c0, c1] In [8]: from pbcommand.pb_io import write_pipeline_chunks In [10]: write_pipeline_chunks(chunks, "test-scatter.chunk.json", "Test comment") In [11]: from pbcommand.pb_io import load_pipeline_chunks_from_json In [12]: load_pipeline_chunks_from_json("test-scatter.chunk.json") Out[12]: [, ] Defining a Scatter Tool Contract ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Currently, python is the only language that is supported for writing CHUNK JSON files. The python Scatter tool contract API follows similar to base Tool Contract API, Simple example of Scattering/Chunking a single Fasta file. 
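The complete application is included below. As a minimal sketch of only the chunk-writing step (assuming the FASTA has already been split into per-chunk files; the helper name, paths and comment string are illustrative, not part of the pbcommand API), the scatter side reduces to building one `PipelineChunk` per piece and writing them out with the IO layer shown above:

.. code-block:: python

    from pbcommand.models import PipelineChunk
    from pbcommand.pb_io import write_pipeline_chunks

    def write_fasta_chunks(chunked_fasta_paths, chunk_json_path):
        """Illustrative sketch: emit one PipelineChunk per pre-split FASTA file."""
        chunks = []
        for i, fasta_path in enumerate(chunked_fasta_paths):
            # chunk keys must be prefixed with "$chunk." so the workflow engine
            # can map them back to the input of the original (unchunked) task
            d = {"$chunk.fasta_id": fasta_path}
            chunks.append(PipelineChunk("scattered-fasta_{i}".format(i=i), **d))
        write_pipeline_chunks(chunks, chunk_json_path, "Example scattered FASTA chunks")
        return chunks
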
The notable points are adding the required `chunk_keys` and `nchunks` to the scattering specific pbparser. .. literalinclude:: ../../pbcommand/cli/examples/dev_scatter_fasta_app.py :language: python Advanced Scattering/Chunking Patterns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For more advanced scattering/chunks usecases, such as chunking multiple input files (e.g., task input signature SubreadSet and ReferenceSet XML), this will require writing a chunk key for each input. Specifically, `$chunk.subreadset_id` and `$chunk.referenceset_id` to the `PipelineChunk`. This enables the chunking pattern of a specific task to be completely decoupled from the workflow level. The chunking pattern is communicated in the chunk(s) in `PipelineChunk` defined by the chunking task. In this specific chunking pattern, the SubreadSet is chunked into N files, while the ReferenceSet is passed unchunked. These chunk keys combined with the chunk operator (defined in pbsmrtpipe_) communicates to the workflow engine how to pass `$chunk.subreadset_id` to the first input of N-chunked instances of unchunked task. Similarly, the `$chunk.referenceset_id` to the second input of the N-chunked task instance. See the pbsmrtpipe_ docs and the testkit-data jobs in pbsmrtpipe for more details. Gather ToolContract ~~~~~~~~~~~~~~~~~~~ A Gather Tool Contract takes a **single** CHUNK Json file type as input and emits a **single** output file of any type. Example: .. literalinclude:: ../../tests/data/tool-contracts/dev_gather_fasta_app_tool_contract.json :language: javascript The Gather task doesn't extend the base ToolContract and add new properties. However, it will restrict the the input type to `FileTypes.CHUNK` and the output type signature **must only be one file type**. Example Gather Tool: .. literalinclude:: ../../pbcommand/cli/examples/dev_gather_fasta_app.py :language: python For Gather'ing a task that has a multiple N outputs, N gather tasks must be defined. See the pbsmrtpipe_ docs for details of constructing a chunked pipeline. More examples of scatter/chunking and gather tasks are in pbcoretools_. .. _pbsmrtpipe: http://pbsmrtpipe.readthedocs.io .. _pbcoretools: https://github.com/PacificBiosciences/pbcoretools/tree/master/pbcoretools/taskspbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.engine.rst0000644000000000000000000000023613035554276025353 0ustar rootrootpbcommand Engine ################ Util functions for calling external commands .. automodule:: pbcommand.engine :members: run_cmd, ExtCmdResult pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.services.rst0000644000000000000000000000027213035554276025731 0ustar rootrootpbcommand Client to SMRT Link Services ###################################### Python client to SMRT Link Services .. automodule:: pbcommand.services :members: ServiceAccessLayerpbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/index.rst0000644000000000000000000000067113035554276023261 0ustar rootroot.. pbcommand documentation master file, created by sphinx-quickstart on Mon Jul 6 12:53:15 2015. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. pbcommand ========= Contents: .. 
toctree:: :maxdepth: 2 commandline_interface advanced_task_types services report_model api Indices and tables: * :ref:`genindex` * :ref:`modindex` * :ref:`search` pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.pb_io.pipeline.rst0000644000000000000000000000054213035554276027002 0ustar rootrootpbcommand IO Pipeline ##################### IO utils for loading Tool Contract and Resolved Tool Contract .. automodule:: pbcommand.pb_io :members: load_tool_contract_from,load_resolved_tool_contract_from,load_pipeline_presets_from,write_resolved_tool_contract,write_tool_contract,write_resolved_tool_contract_avro,write_tool_contract_avro pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/commandline_interface.rst0000644000000000000000000001153713035554276026463 0ustar rootrootCommon Commandline Interface ============================ Motivation And High Level Example ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Provide a common interface for executables to expose options - Provide a common interface for executables to be called - Provide a common interface for exposing metadata of tool, such as memory usage, cpu usage, required temp files Benefits ~~~~~~~~ - A consistent concrete common interface for shelling out to an executable - task options have a consistent model for validation - task version is supported - A principled model for wrapping tools. For example, pbalign would "inherit" blasr options and extend, or wrap them. - Once a manifest has been defined and registered to pbsmrtpipe, the task/manifest can be referenced in pipelines with no additional work Terms ~~~~~ - 'Tool Contract' is a single file that exposing the exe interface. It contains metadata about the task, such as input and output file types, nproc. - 'Resolved Tool Contract' is a single file that contains the resolved values in the manifest - 'Driver' is the general interface for calling a commandline exe. This can be called from the commandline or directly as an API call (via any language which supports the manifest interface). Hello World Dev Example ~~~~~~~~~~~~~~~~~~~~~~~ Tool Contract example for an exe, 'python -m pbcommand.cli.example.dev_app` with tool contract id `pbcommand.tasks.dev_app`. .. literalinclude:: ../../tests/data/tool-contracts/pbcommand.tasks.dev_app_tool_contract.json :language: javascript Details of Tool Contract ~~~~~~~~~~~~~~~~~~~~~~~~ - Tool Contract id which can be referenced globally (e.g., within a pipeline template) - Input File types have file type id, id that can be referenced within the driver, and a brief description - Output File types have a file type id and a default output file name - number of processors is defined by $nproc. "\$" prefixed values are symbols that have well defined semantic meaning - Temp files and Log files are defined using "$" symbols are can have multiple items - the exe options are exposed via jsonschema standard. Each option has an id and maps to a single schema definition. Each option must have a default value. - the exe section of the "driver" is the commandline interface that will be called as a positional arguement (e.g., "my-exe resolved-manifest.json") - task type describes if the task should be submitted to the cluster resources Note. A single driver can reference many manifests. For example "pbreports" would have a single driver exe. 
From the "task_manifest_id", the driver would dispatch to the correct function call Programmatically defining a Parser to Emit a Tool Contract ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ pbcommand provides a API to create a tool contract and an argparse instance from a single interface. This facilitates a single point of defining options and keeps the standard commandline entry point and the tool contract to be in sync. This also allows your tool to emit the tool contract to stdout using "--emit-tool-contract" **and** the tool to be run from a **Resolved Tool Contract** using the "--resolved-tool-contract /path/to/resolved-tool-contract.json" commandline argument **while** also supporting the python standards commandline interface via argparse. Complete App shown below. .. literalinclude:: ../../pbcommand/cli/examples/dev_app.py :language: python .. note:: Options must be prefixed with {pbcommand}.task_options.{option_id} format. Details and Example of a Resolved Tool Contract ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Language agnostic JSON format to encode the resolved values - input, outputs file types are resolved to file paths - nproc and other resources are resolved - IO layers to convert between JSON and python using `load_resolved_tool_contract_from` in `pbcommand.pb_io` Example Resolved Tool Contract: .. literalinclude:: ../../tests/data/resolved-tool-contracts/dev_example_resolved_tool_contract.json :language: javascript Testing Tool Contracts ~~~~~~~~~~~~~~~~~~~~~~ There is a thin test framework in `pbcommand.testkit` to help test tool contracts from within nose. The `PbTestApp` base class will provide the core validation of the outputs as well as handled the creation of the resolved tool contract. Output Validation assertions - validates Output files exist - validates resolved task options - validates resolved value of is distributed - validates resolved value of nproc Example: .. literalinclude:: ../../tests/test_e2e_example_apps.py :language: python Tips ~~~~ A dev tool within pbcommand can help convert Tool Contract JSON files to Resolved Tool Contract for testing purposes. .. argparse:: :module: pbcommand.interactive_resolver :func: get_parser :prog: python -m pbcommand.interactive_resolver .. note:: This tool has dependency on `prompt_kit` and can be installed via pip.pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/report_model.rst0000644000000000000000000001571313035554276024650 0ustar rootrootReport Models ============= A report is composed of model objects whose classes are defined in pbreports.model. Typically, a report object is created and then attributes, tables, or plotGroups are added to the report. Lastly, the report is serialized as json to a file. The objects that comprise a report extend BaseReportElement. `All report elements have an id`. When the report is converted to a dictionary before serialization, each report element's id is prepended with its parent id, which has also been prepended. For example, given the nested elements of report --> plotGroup --> plot, with respective ids "r", "pg", and "p", the plot id would be "r.pg.p" in the dictionary. This allows elements of a report to be looked up id, such as "mapping_stats.n_mapped_bases" for a report Attribute (i.e., metric), or a specific plot group, such as "filter_report.filtered_subreads". .. note:: Once a report element id has been assigned, it should not change. Report ------ Report is the root class of the model hierarchy. 
It's instantiated with an id (should be a short string), which defines its namespace. This example shows how a report is with one attribute, plotGroup, and table is created and written. .. code-block:: python import os import logging from pbcommand.models.report import Report, Attribute, PlotGroup, Table log = logging.getLogger(__name__) def make_report(): """Write a simple report""" table = create_table() # See example below attribute = create_attribute() # See example below plotGroup = create_plotGroup() # See example below # Id must match ^[a-z0-9_]+$ r = Report('loading', title="Loading Report", attributes=[attribute], plotgroups=[plotGroup], tables=[table]) # Alternatively r.add_table(table) r.add_attribute(attribute) r.add_plotGroup(plotGroup) r.write_json('/my/file.json') Attribute --------- An attribute represents a key-value pair with an optional name. The id is the key. A report contains a list of attributes. .. code-block:: python import os import logging from pbcommand.models.report import Attribute log = logging.getLogger(__name__) def create_attribute(): """Return an attribute""" a = Attribute('alpha', 1234, name='Alpha') b = Attribute('beta', "value", name="Beta Display Name") return a Table ----- A table contains a list of column objects and has an optional title and id. A report contains a list of tables. In general, the paradigm for creating a table is to instantiate a table and a series of columns. Add the columns to the table in the desired order. Finally, iterate over your data set and append data to the columns by index. .. code-block:: python import os import logging import random from pbcommand.models.report import Attribute, Table, Column log = logging.getLogger(__name__) def create_table(): """Return a table with 2 columns""" columns = [Column( 'c1id', header='C1 header'), Column('c2id', header='C2 header')] t = Table('myid', title='My Table', columns=columns) #Now append data to the columns #Assume data is a list of tuples of len == 2 datum = [(c.id, random.random()) for c in columns] for column_id, value in datum: t.add_data_by_column_id(column_id, value) # Alternatively cx = Column("cx", header="X", values=[1,2,3,4]) cy = Column("cy", header="Y", values=[1,4,9,16]) t = Table("xy", title="X vs Y", columns=[cx, cy]) return t PlotGroup --------- A `Plot Group` represents a logical grouping or collection of plots that convey related information, such coverage across 5 contigs. A plotGroup has an id, an optional thumbnail (to represent the group in SMRT Link in a preview), an optional legend and a list of plots. .. code-block:: python import os import logging from pbcommand.model.report import PlotGroup, Plot log = logging.getLogger(__name__) def create_plotGroup(): """Return a PlotGroup with 1 plot""" # Image paths must be relative to the dir where the final Report is written plot = Plot('plot_id', image='image.png', caption='this is a plot') p = PlotGroup('myid', title='some title', thumbnail='image_thumb.png', plots=[plot]) return p .. note:: The image paths must be written relative to where the report JSON file will be written. .. note:: Currently, only PNG is supported Report Specs ============ A parallel family of models in the same module handles specifications for individual reports, i.e. enumerating the data items expected for each model type, along with view metadata. The overall structure and names of objects in the hierarchy is identical to the Report model. 
For any of the nodes in the hierarchy, the following view metadata may be specified: - a UI label, usually `title` (or `name` for Attributes, `header` for table columns) - a description suitable for formal documentation or mouseover text - a boolean `isHidden` attribute that controls visibility There is some redundancy between the report specifications and the actual reports - for example the Report `title` and Attribute `name` occur in both models. This was due to the lack of a clear model for view metadata in previous versions of SMRTAnalysis; the Report model may be slimmed down in the future as the view rules are deployed and utilized. The `pbcommand` module itself does not actually define any reports; currently most of these are part of the `pbreports` module. Format strings -------------- For formatting numerical attribute and column values, we are using a lightweight syntax based on Python's `str.format(...)` method. If the `format` attribute is set to `None` (`null` in JSON), the value should simply be directly converted to string without any formatting. (In the case of string and boolean values, the format should always be left unset.) More complex operations values must match this regular expression:: {([GMkp]{0,1})(:)([\.,]{0,1})([0-9]*)([dfg]{1})}(.*)$ The `[GMkp]` group specifies scaling - if one of these characters is present, the value should be divided by one billion (`G`), one million (`M`), or one thousand (`k`) before formatting, or multiplied by 100 (`p`). The period or comma after the colon modifies the display of floating-point and integer values respectively. The following characters before the closing brace correspond to conventional format string syntax. The format can optionally include a suffix to be appended to the formatted value. Examples of use:: format_value("{:,d}", 123456) # 123,456 format_value("{:.2f)", 1.23456) # 1.23 format_value("{G:.2f} Gb", 1234567890) # 1.23 Gb format_value("{p:5g}%", 0.987654321) # 98.765% format_value(None, 0.987654321) # 0.987654321 pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/api.rst0000644000000000000000000000214413035554276022720 0ustar rootrootpbcommand API docs ################## The `pbcommand` package provides Python modules for common data models, building commandline tools (e.g., ToolContract, CLI parser interface) and interacting with the SMRT Link webservices. Library API documentation ========================= .. automodule:: pbcommand.pb_io :members: :doc:`pbcommand.utils`: Util functions :doc:`pbcommand.models`: Core models :doc:`pbcommand.models.file_types`: Registered File Types :doc:`pbcommand.models.tc` Tool Contract data model :doc:`pbcommand.models.rtc` Resolved Tool Contract data model :doc:`pbcommand.engine`: Util functions for calling an external process :doc:`pbcommand.cli`: Commandline interface :doc:`pbcommand.pb_io.tc`: IO to load Tool Contract and Resolved Tool Contracts :doc:`pbcommand.pb_io.pipeline`: IO to load Pipeline and Pipeline Chunk :doc:`pbcommand.models.task_options`: PacBio Task Options models :doc:`pbcommand.services`: Client API to SMRT Link WebServices :doc:`pbcommand.models.view_rules`: View Rules used by SMRT Link WebServices pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.models.file_types.rst0000644000000000000000000000024713035554276027535 0ustar rootrootpbcommand Models ################ Pacbio Registered FileTypes .. 
automodule:: pbcommand.models :members: FileTypes :undoc-members: :show-inheritance:pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.models.rtc.rst0000644000000000000000000000046113035554276026160 0ustar rootrootpbcommand Resolved Tool Contract ################################ Resolved Tool Contract Models .. automodule:: pbcommand.models :members: ResolvedToolContractTask, ResolvedScatteredToolContractTask, ResolvedGatherToolContractTask, ResolvedToolContract :undoc-members: :show-inheritance:pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/conf.py0000644000000000000000000002274713035554276022727 0ustar rootroot# -*- coding: utf-8 -*- # # pbcommand documentation build configuration file, created by # sphinx-quickstart on Mon Jul 6 12:53:15 2015. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. import sys import os import shlex import sphinx_bootstrap_theme import pbcommand # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. #needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.coverage', 'sphinx.ext.viewcode', 'IPython.sphinxext.ipython_console_highlighting', 'IPython.sphinxext.ipython_directive' ] # For argparse extensions += ['sphinxarg.ext'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. master_doc = 'index' # General information about the project. project = u'pbcommand' copyright = u'2015-2017, Michael Kocher' author = u'Michael Kocher' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = pbcommand.get_version() # The full version, including alpha/beta/rc tags. release = pbcommand.get_version() # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: #today = '' # Else, today_fmt is used as the format for a strftime call. #today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. exclude_patterns = [] # The reST default role (used for this markup: `text`) to use for all # documents. 
#default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. #add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). #add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. #show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'alabaster' # Activate the my theme. html_theme = 'bootstrap' html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. #html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. #html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". #html_title = None # A shorter title for the navigation bar. Default is the same as html_title. #html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. #html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. #html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. #html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. #html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. #html_use_smartypants = True # Custom sidebar templates, maps document names to template names. #html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. #html_additional_pages = {} # If false, no module index is generated. #html_domain_indices = True # If false, no index is generated. #html_use_index = True # If true, the index is split into individual pages for each letter. #html_split_index = False # If true, links to the reST sources are added to the pages. #html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. #html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. #html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. 
The value of this option must be the # base URL from which the finished HTML is served. #html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). #html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' #html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value #html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. #html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. htmlhelp_basename = 'pbcommanddoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). #'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. #'preamble': '', # Latex figure (float) alignment #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'pbcommand.tex', u'pbcommand Documentation', u'Michael Kocher', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of # the title page. #latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. #latex_use_parts = False # If true, show page references after internal links. #latex_show_pagerefs = False # If true, show URL addresses after external links. #latex_show_urls = False # Documents to append as an appendix to all manuals. #latex_appendices = [] # If false, no module index is generated. #latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'pbcommand', u'pbcommand Documentation', [author], 1) ] # If true, show URL addresses after external links. #man_show_urls = False # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'pbcommand', u'pbcommand Documentation', author, 'pbcommand', 'One line description of project.', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] # If false, no module index is generated. #texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False autoclass_content = 'both' pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.pb_io.tc.rst0000644000000000000000000000064213035554276025604 0ustar rootrootpbcommand IO Tool Contract and Resolved Tool Contract ##################################################### IO utils for loading Tool Contract and Resolved Tool Contract .. 
automodule:: pbcommand.pb_io :members: load_tool_contract_from,load_resolved_tool_contract_from,load_pipeline_presets_from,write_resolved_tool_contract,write_tool_contract,write_resolved_tool_contract_avro,write_tool_contract_avro pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.cli.rst0000644000000000000000000000040513035554276024653 0ustar rootrootpbcommand Commandline Interface ############################### Pacbio Commandline .. automodule:: pbcommand.cli :members: pacbio_args_runner, pacbio_args_or_contract_runner, pbparser_runner, get_default_argparser, get_default_argparser_with_base_opts pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.models.rst0000644000000000000000000000035413035554276025372 0ustar rootrootpbcommand Models ################ Core Models .. automodule:: pbcommand.models :members: FileType, TaskOptionTypes, DataStore, DataStoreFile, PipelineChunk, SymbolTypes,ResourceTypes :undoc-members: :show-inheritance: pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/pbcommand.models.view_rules.rst0000644000000000000000000000031013035554276027545 0ustar rootrootpbcommand View Rules #################### View Rule Models .. automodule:: pbcommand.models :members: DataStoreViewRule, PipelineDataStoreViewRules :undoc-members: :show-inheritance:pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/docs/source/services.rst0000644000000000000000000000237713035554276024002 0ustar rootroot SMRT Service commandline interface ================================== A high level client to the SMRT Link Services is accessible from `ServiceAccessLayer` in `pbcommand.services`. Client Layer ~~~~~~~~~~~~ Example: .. code-block:: python In [1]: from pbcommand.services import ServiceAccessLayer In [2]: s = ServiceAccessLayer("smrtlink-alpha", 8081) In [3]: s.get_status() Out[3]: {u'id': u'smrtlink_analysis', u'message': u'Services have been up for 141 hours, 37 minutes and 13.138 seconds.', u'uptime': 509833138, u'user': u'secondarytest', u'uuid': u'12e1c62a-99a4-46c1-b616-a327dc38525f', u'version': u'0.1.8-3a66e4a'} In [4]: jobs = s.get_analysis_jobs() In [5]: j = s.get_analysis_job_by_id(3) In [6]: j.state, j.name Out[6]: ('SUCCESSFUL', 'sirv_isoseq') In [7]: import pbcommand; pbcommand.get_version() Out[7]: '0.4.9' Commandline Tool Interface to Services ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. warning:: This has been migrated to scala in smrtflow_. Support for the python Client layer API will remain, however the python commandline tool is no longer installed by default and will be removed in a future version. .. _smrtflow: https://github.com/PacificBiosciences/smrtflow pbcommand-fb2fea0228a57711cea7e26cbe798831a03e6861/LICENSES.txt0000644000000000000000000000312313035554276021131 0ustar rootrootjjCopyright (c) 2011-2015, Pacific Biosciences of California, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Pacific Biosciences nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.