# ===== streamz-0.6.4/streamz/utils_test.py =====
import asyncio
from contextlib import contextmanager
import logging
import os
import six
import shutil
import tempfile
from time import time, sleep

import pytest
from tornado import gen
from tornado.ioloop import IOLoop

from .core import _io_loops, Stream


@contextmanager
def tmpfile(extension=''):
    extension = '.' + extension.lstrip('.')
    handle, filename = tempfile.mkstemp(extension)
    os.close(handle)
    os.remove(filename)

    yield filename

    if os.path.exists(filename):
        if os.path.isdir(filename):
            shutil.rmtree(filename)
        else:
            try:
                os.remove(filename)
            except OSError:  # sometimes we can't remove a generated temp file
                pass


def inc(x):
    return x + 1


def double(x):
    return 2 * x


@contextmanager
def pristine_loop():
    IOLoop.clear_instance()
    IOLoop.clear_current()
    loop = IOLoop()
    loop.make_current()
    try:
        yield loop
    finally:
        loop.close(all_fds=True)
        IOLoop.clear_instance()
        IOLoop.clear_current()


def gen_test(timeout=10):
    """ Coroutine test

    @gen_test(timeout=5)
    def test_foo():
        yield ...  # use tornado coroutines
    """
    def _(func):
        def test_func():
            with pristine_loop() as loop:
                cor = gen.coroutine(func)
                try:
                    loop.run_sync(cor, timeout=timeout)
                finally:
                    loop.stop()
        return test_func
    return _


@contextmanager
def captured_logger(logger, level=logging.INFO, propagate=None):
    """Capture output from the given Logger.
    """
    if isinstance(logger, str):
        logger = logging.getLogger(logger)
    orig_level = logger.level
    orig_handlers = logger.handlers[:]
    if propagate is not None:
        orig_propagate = logger.propagate
        logger.propagate = propagate
    sio = six.StringIO()
    logger.handlers[:] = [logging.StreamHandler(sio)]
    logger.setLevel(level)
    try:
        yield sio
    finally:
        logger.handlers[:] = orig_handlers
        logger.setLevel(orig_level)
        if propagate is not None:
            logger.propagate = orig_propagate


@pytest.fixture
def clean():
    for loop in _io_loops:
        loop.add_callback(loop.stop)
    del _io_loops[:]


def wait_for(predicate, timeout, fail_func=None, period=0.001):
    """Wait for predicate to turn true, or fail this test"""
    # from distributed.utils_test
    deadline = time() + timeout
    while not predicate():
        sleep(period)
        if time() > deadline:  # pragma: no cover
            if fail_func is not None:
                fail_func()
            pytest.fail("condition not reached within %s seconds" % timeout)


async def await_for(predicate, timeout, fail_func=None, period=0.001):
    deadline = time() + timeout
    while not predicate():
        await asyncio.sleep(period)
        if time() > deadline:  # pragma: no cover
            if fail_func is not None:
                fail_func()
            pytest.fail("condition not reached until %s seconds" % (timeout,))


class metadata(Stream):
    def update(self, x, who=None, metadata=None):
        if metadata:
            return self._emit(metadata)


# ===== streamz-0.6.4/streamz/utils.py =====
_method_cache = {}


class methodcaller(object):
    """ Return a callable object that calls the given method on its operand.
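
    A minimal usage sketch (the method name and operand here are arbitrary
    illustrative values):

    >>> upper = methodcaller('upper')
    >>> upper('abc')    # same as calling 'abc'.upper()
    'ABC'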
Unlike the builtin `operator.methodcaller`, instances of this class are serializable """ __slots__ = ('method',) func = property(lambda self: self.method) # For `funcname` to work def __new__(cls, method): if method in _method_cache: return _method_cache[method] self = object.__new__(cls) self.method = method _method_cache[method] = self return self def __call__(self, obj, *args, **kwargs): return getattr(obj, self.method)(*args, **kwargs) def __reduce__(self): return (methodcaller, (self.method,)) def __str__(self): return "<%s: %s>" % (self.__class__.__name__, self.method) __repr__ = __str__ class MethodCache(object): """Attribute access on this object returns a methodcaller for that attribute. Examples -------- >>> a = [1, 3, 3] >>> M.count(a, 3) == a.count(3) True """ __getattr__ = staticmethod(methodcaller) __dir__ = lambda self: list(_method_cache) M = MethodCache() streamz-0.6.4/streamz/tests/0000755000175000017500000000000014270277270015373 5ustar nileshnileshstreamz-0.6.4/streamz/tests/test_sources.py0000644000175000017500000001102014270277270020461 0ustar nileshnileshimport asyncio import sys from flaky import flaky import pytest from streamz import Source from streamz.utils_test import wait_for, await_for, gen_test import socket def test_periodic(): s = Source.from_periodic(lambda: True) l = s.sink_to_list() assert s.stopped s.start() wait_for(lambda: l, 0.3, period=0.01) wait_for(lambda: len(l) > 1, 0.3, period=0.01) assert all(l) @flaky(max_runs=3, min_passes=1) def test_tcp(): port = 9876 s = Source.from_tcp(port) out = s.sink_to_list() s.start() wait_for(lambda: s.server is not None, 2, period=0.02) try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(("localhost", port)) sock.send(b'data\n') sock.close() sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(("localhost", port)) sock.send(b'data\n') sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock2.connect(("localhost", port)) sock2.send(b'data2\n') wait_for(lambda: out == [b'data\n', b'data\n', b'data2\n'], 2, period=0.01) finally: s.stop() sock.close() sock2.close() @flaky(max_runs=3, min_passes=1) @gen_test(timeout=60) def test_tcp_async(): port = 9876 s = Source.from_tcp(port) out = s.sink_to_list() s.start() yield await_for(lambda: s.server is not None, 2, period=0.02) try: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(("localhost", port)) sock.send(b'data\n') sock.close() sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock.connect(("localhost", port)) sock.send(b'data\n') sock2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock2.connect(("localhost", port)) sock2.send(b'data2\n') yield await_for(lambda: out == [b'data\n', b'data\n', b'data2\n'], 2, period=0.01) finally: s.stop() sock.close() sock2.close() def test_http(): requests = pytest.importorskip('requests') port = 9875 s = Source.from_http_server(port) out = s.sink_to_list() s.start() wait_for(lambda: s.server is not None, 2, period=0.02) r = requests.post('http://localhost:%i/' % port, data=b'data') wait_for(lambda: out == [b'data'], 2, period=0.01) assert r.ok r = requests.post('http://localhost:%i/other' % port, data=b'data2') wait_for(lambda: out == [b'data', b'data2'], 2, period=0.01) assert r.ok s.stop() with pytest.raises(requests.exceptions.RequestException): requests.post('http://localhost:%i/other' % port, data=b'data2') @gen_test(timeout=60) def test_process(): cmd = ["python", "-c", "for i in range(4): print(i, end='')"] s = Source.from_process(cmd, 
with_end=True) if sys.platform != "win32": # don't know why - something with pytest and new processes policy = asyncio.get_event_loop_policy() watcher = asyncio.SafeChildWatcher() policy.set_child_watcher(watcher) watcher.attach_loop(s.loop.asyncio_loop) out = s.sink_to_list() s.start() yield await_for(lambda: out == [b'0123'], timeout=5) s.stop() @gen_test(timeout=60) def test_process_str(): cmd = 'python -c "for i in range(4): print(i)"' s = Source.from_process(cmd) if sys.platform != "win32": # don't know why - something with pytest and new processes policy = asyncio.get_event_loop_policy() watcher = asyncio.SafeChildWatcher() policy.set_child_watcher(watcher) watcher.attach_loop(s.loop.asyncio_loop) out = s.sink_to_list() s.start() yield await_for(lambda: out == [b'0\n', b'1\n', b'2\n', b'3\n'], timeout=5) s.stop() def test_from_iterable(): source = Source.from_iterable(range(3)) L = source.sink_to_list() source.start() wait_for(lambda: L == [0, 1, 2], 0.1) def test_from_iterable_backpressure(): it = iter(range(5)) source = Source.from_iterable(it) L = source.rate_limit(0.1).sink_to_list() source.start() wait_for(lambda: L == [0], 1, period=0.01) assert next(it) == 2 # 1 is in blocked _emit def test_from_iterable_stop(): from _pytest.outcomes import Failed source = Source.from_iterable(range(5)) L = source.rate_limit(0.01).sink_to_list() source.start() wait_for(lambda: L == [0], 1) source.stop() assert source.stopped with pytest.raises(Failed): wait_for(lambda: L == [0, 1, 2], 0.1) streamz-0.6.4/streamz/tests/test_sinks.py0000644000175000017500000000456214270277270020142 0ustar nileshnileshimport weakref import pytest from streamz import Stream from streamz.sinks import _global_sinks, Sink from streamz.utils_test import tmpfile, wait_for def test_sink_with_args_and_kwargs(): L = dict() def mycustomsink(elem, key, prefix=""): key = prefix + key if key not in L: L[key] = list() L[key].append(elem) s = Stream() sink = s.sink(mycustomsink, "cat", "super", stream_name="test") s.emit(1) s.emit(2) assert L['supercat'] == [1, 2] assert sink.name == "test" def test_sink_to_textfile_fp(): source = Stream() with tmpfile() as filename, open(filename, "w") as fp: source.map(str).sink_to_textfile(fp) source.emit(0) source.emit(1) fp.flush() assert open(filename, "r").read() == "0\n1\n" def test_sink_to_textfile_named(): source = Stream() with tmpfile() as filename: _sink = source.map(str).sink_to_textfile(filename) source.emit(0) source.emit(1) _sink._fp.flush() assert open(filename, "r").read() == "0\n1\n" def test_sink_to_textfile_closes(): source = Stream() with tmpfile() as filename: sink = source.sink_to_textfile(filename) fp = sink._fp _global_sinks.remove(sink) del sink with pytest.raises(ValueError, match=r"I/O operation on closed file\."): fp.write(".") def test_sink_destroy(): source = Stream() sink = Sink(source) ref = weakref.ref(sink) sink.destroy() assert sink not in _global_sinks del sink assert ref() is None def test_ws_roundtrip(): pytest.importorskip("websockets") s0 = Stream.from_websocket("localhost", 8989, start=True) l = s0.sink_to_list() data = [b'0123'] * 4 s = Stream.from_iterable(data) s.to_websocket("ws://localhost:8989") s.start() wait_for(lambda: data == l, timeout=1) s.stop() s0.stop() def test_mqtt_roundtrip(): pytest.importorskip("paho.mqtt.client") s0 = Stream.from_mqtt("mqtt.eclipseprojects.io", 1883, "streamz/sensor/temperature") l = s0.map(lambda msg: msg.payload).sink_to_list() s0.start() data = [b'0123'] * 4 s = Stream.from_iterable(data) 
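    # The broker named above appears to be the public Eclipse IoT sandbox, so
    # this roundtrip assumes outbound network access to that host and can be
    # flaky if it is unreachable.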
s.to_mqtt("mqtt.eclipseprojects.io", 1883, "streamz/sensor/temperature") s.start() wait_for(lambda: data == l, timeout=1) s.stop() s0.stop() streamz-0.6.4/streamz/tests/test_plugins.py0000644000175000017500000000254314270277270020471 0ustar nileshnileshimport inspect import pytest from streamz import Source, Stream class MockEntryPoint: def __init__(self, name, cls, module_name=None): self.name = name self.cls = cls self.module_name = module_name def load(self): return self.cls def test_register_plugin_entry_point(): class test_stream(Stream): pass entry_point = MockEntryPoint("test_node", test_stream) Stream.register_plugin_entry_point(entry_point) assert Stream.test_node.__name__ == "stub" Stream().test_node() assert Stream.test_node.__name__ == "test_stream" def test_register_plugin_entry_point_modifier(): class test_source(Source): pass entry_point = MockEntryPoint("from_test", test_source) Stream.register_plugin_entry_point(entry_point, staticmethod) Stream.from_test() assert inspect.isfunction(Stream().from_test) def test_register_plugin_entry_point_raises_type(): class invalid_node: pass entry_point = MockEntryPoint("test", invalid_node, "test_module.test") Stream.register_plugin_entry_point(entry_point) with pytest.raises(TypeError): Stream.test() def test_register_plugin_entry_point_raises_duplicate_name(): entry_point = MockEntryPoint("map", None) with pytest.raises(ValueError): Stream.register_plugin_entry_point(entry_point) streamz-0.6.4/streamz/tests/test_kafka.py0000644000175000017500000006007214270277270020066 0ustar nileshnileshimport asyncio import atexit from contextlib import contextmanager from flaky import flaky import os import pytest import random import shlex import subprocess import time from tornado import gen from ..core import Stream from ..dask import DaskStream from streamz.utils_test import gen_test, wait_for, await_for pytest.importorskip('distributed') from distributed.utils_test import gen_cluster # flake8: noqa KAFKA_FILE = 'kafka_2.11-1.0.0' LAUNCH_KAFKA = os.environ.get('STREAMZ_LAUNCH_KAFKA', '') == 'true' ck = pytest.importorskip('confluent_kafka') def stop_docker(name='streamz-kafka', cid=None, let_fail=False): """Stop docker container with given name tag Parameters ---------- name: str name field which has been attached to the container we wish to remove cid: str container ID, if known let_fail: bool whether to raise an exception if the underlying commands return an error. """ try: if cid is None: print('Finding %s ...' % name) cmd = shlex.split('docker ps -q --filter "name=%s"' % name) cid = subprocess.check_output(cmd).strip().decode() if cid: print('Stopping %s ...' 
% cid) subprocess.call(['docker', 'rm', '-f', cid]) except subprocess.CalledProcessError as e: print(e) if not let_fail: raise def launch_kafka(): stop_docker(let_fail=True) subprocess.call(shlex.split("docker pull spotify/kafka")) cmd = ("docker run -d -p 2181:2181 -p 9092:9092 --env " "ADVERTISED_HOST=127.0.0.1 --env ADVERTISED_PORT=9092 " "--name streamz-kafka spotify/kafka") print(cmd) cid = subprocess.check_output(shlex.split(cmd)).decode()[:-1] def end(): if cid: stop_docker(cid=cid) atexit.register(end) def predicate(): try: out = subprocess.check_output(['docker', 'logs', cid], stderr=subprocess.STDOUT) return b'kafka entered RUNNING state' in out except subprocess.CalledProcessError: pass wait_for(predicate, 10, period=0.1) return cid _kafka = [None] @contextmanager def kafka_service(): TOPIC = "test-%i" % random.randint(0, 10000) if _kafka[0] is None: if LAUNCH_KAFKA: launch_kafka() else: raise pytest.skip.Exception( "Kafka not available. " "To launch kafka use `export STREAMZ_LAUNCH_KAFKA=true`") producer = ck.Producer({'bootstrap.servers': 'localhost:9092', 'topic.metadata.refresh.interval.ms': '5000'}) producer.produce('test-start-kafka', b'test') out = producer.flush(10) if out > 0: raise RuntimeError('Timeout waiting for kafka') _kafka[0] = producer yield _kafka[0], TOPIC def split(messages): parsed = [] for message in messages: message = message.decode("utf-8") parsed.append(int(message.split('-')[1])) return parsed @flaky(max_runs=3, min_passes=1) @gen_test(timeout=60) def test_from_kafka(): j = random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j} with kafka_service() as kafka: kafka, TOPIC = kafka stream = Stream.from_kafka([TOPIC], ARGS, asynchronous=True) out = stream.sink_to_list() stream.start() yield gen.sleep(1.1) # for loop to run for i in range(10): yield gen.sleep(0.1) # small pause ensures correct ordering kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() # it takes some time for messages to come back out of kafka wait_for(lambda: len(out) == 10, 10, period=0.1) assert out[-1] == b'value-9' kafka.produce(TOPIC, b'final message') kafka.flush() wait_for(lambda: out[-1] == b'final message', 10, period=0.1) stream._close_consumer() kafka.produce(TOPIC, b'lost message') kafka.flush() # absolute sleep here, since we expect output list *not* to change yield gen.sleep(1) assert out[-1] == b'final message' stream._close_consumer() @flaky(max_runs=3, min_passes=1) @gen_test(timeout=60) def test_to_kafka(): ARGS = {'bootstrap.servers': 'localhost:9092'} with kafka_service() as kafka: _, TOPIC = kafka source = Stream() kafka = source.to_kafka(TOPIC, ARGS) out = kafka.sink_to_list() for i in range(10): yield source.emit(b'value-%d' % i) source.emit('final message') kafka.flush() wait_for(lambda: len(out) == 11, 10, period=0.1) assert out[-1] == b'final message' @flaky(max_runs=3, min_passes=1) @gen_test(timeout=60) def test_from_kafka_thread(): j = random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j} with kafka_service() as kafka: kafka, TOPIC = kafka stream = Stream.from_kafka([TOPIC], ARGS) out = stream.sink_to_list() stream.start() yield gen.sleep(1.1) for i in range(10): yield gen.sleep(0.1) kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() # it takes some time for messages to come back out of kafka yield await_for(lambda: len(out) == 10, 10, period=0.1) assert out[-1] == b'value-9' kafka.produce(TOPIC, b'final message') kafka.flush() yield await_for(lambda: 
out[-1] == b'final message', 10, period=0.1) stream._close_consumer() kafka.produce(TOPIC, b'lost message') kafka.flush() # absolute sleep here, since we expect output list *not* to change yield gen.sleep(1) assert out[-1] == b'final message' stream._close_consumer() def test_kafka_batch(): j = random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j, 'auto.offset.reset': 'latest'} with kafka_service() as kafka: kafka, TOPIC = kafka # These messages aren't read since Stream starts reading from latest offsets for i in range(10): kafka.produce(TOPIC, b'value-%d' % i, b'%d' % i) kafka.flush() stream = Stream.from_kafka_batched(TOPIC, ARGS, max_batch_size=4, keys=True) out = stream.sink_to_list() stream.start() wait_for(lambda: stream.upstream.started, 10, 0.1) for i in range(10): kafka.produce(TOPIC, b'value-%d' % i, b'%d' % i) kafka.flush() # out may still be empty or first item of out may be [] wait_for(lambda: any(out) and out[-1][-1]['value'] == b'value-9', 10, period=0.2) assert out[-1][-1]['key'] == b'9' # max_batch_size checks assert len(out[0]) == len(out[1]) == 4 and len(out) == 3 stream.upstream.stopped = True @gen_cluster(client=True, timeout=60) async def test_kafka_dask_batch(c, s, w1, w2): j = random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j} with kafka_service() as kafka: kafka, TOPIC = kafka stream = Stream.from_kafka_batched(TOPIC, ARGS, keys=True, asynchronous=True, dask=True) out = stream.gather().sink_to_list() stream.start() await asyncio.sleep(5) # this frees the loop while dask workers report in assert isinstance(stream, DaskStream) for i in range(10): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() await await_for(lambda: any(out), 10, period=0.2) assert {'key': None, 'value': b'value-1'} in out[0] stream.stop() await asyncio.sleep(0) stream.upstream.upstream.consumer.close() def test_kafka_batch_npartitions(): j1 = random.randint(0, 10000) ARGS1 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j1, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} j2 = j1 + 1 ARGS2 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j2, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} with kafka_service() as kafka: kafka, TOPIC = kafka TOPIC = "test-partitions" subprocess.call(shlex.split("docker exec streamz-kafka " "/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh " "--create --zookeeper localhost:2181 " "--replication-factor 1 --partitions 2 " "--topic test-partitions")) for i in range(10): if i % 2 == 0: kafka.produce(TOPIC, b'value-%d' % i, partition=0) else: kafka.produce(TOPIC, b'value-%d' % i, partition=1) kafka.flush() with pytest.raises(ValueError): stream1 = Stream.from_kafka_batched(TOPIC, ARGS1, asynchronous=True, npartitions=0) stream1.gather().sink_to_list() stream1.start() stream2 = Stream.from_kafka_batched(TOPIC, ARGS1, asynchronous=True, npartitions=1) out2 = stream2.gather().sink_to_list() stream2.start() wait_for(lambda: stream2.upstream.started, 10, 0.1) wait_for(lambda: len(out2) == 1 and len(out2[0]) == 5, 10, 0.1) stream2.upstream.stopped = True stream3 = Stream.from_kafka_batched(TOPIC, ARGS2, asynchronous=True, npartitions=4) out3 = stream3.gather().sink_to_list() stream3.start() wait_for(lambda: stream3.upstream.started, 10, 0.1) wait_for(lambda: len(out3) == 2 and (len(out3[0]) + len(out3[1])) == 10, 10, 0.1) stream3.upstream.stopped = True def test_kafka_refresh_partitions(): j1 = 
random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j1, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} with kafka_service() as kafka: kafka, TOPIC = kafka TOPIC = "test-refresh-partitions" subprocess.call(shlex.split("docker exec streamz-kafka " "/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh " "--create --zookeeper localhost:2181 " "--replication-factor 1 --partitions 2 " "--topic test-refresh-partitions")) for i in range(10): if i % 2 == 0: kafka.produce(TOPIC, b'value-%d' % i, partition=0) else: kafka.produce(TOPIC, b'value-%d' % i, partition=1) kafka.flush() stream = Stream.from_kafka_batched(TOPIC, ARGS, asynchronous=True, refresh_partitions=True, poll_interval='2s') out = stream.gather().sink_to_list() stream.start() wait_for(lambda: stream.upstream.started, 10, 0.1) wait_for(lambda: len(out) == 2 and (len(out[0]) + len(out[1])) == 10, 10, 0.1) subprocess.call(shlex.split("docker exec streamz-kafka " "/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh " "--alter --zookeeper localhost:2181 " "--topic test-refresh-partitions --partitions 4")) time.sleep(5) for i in range(10,20): if i % 2 == 0: kafka.produce(TOPIC, b'value-%d' % i, partition=2) else: kafka.produce(TOPIC, b'value-%d' % i, partition=3) kafka.flush() wait_for(lambda: len(out) == 4 and (len(out[2]) + len(out[3])) == 10 and out[3][4] == b'value-19', 10, 0.1) stream.upstream.stopped = True def test_kafka_batch_checkpointing_sync_nodes(): ''' Streams 1 and 3 have different consumer groups, while Stream 2 has the same group as 1. Hence, Stream 2 does not re-read the data that had been finished processing by Stream 1, i.e. it picks up from where Stream 1 had left off. ''' j1 = random.randint(0, 10000) ARGS1 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j1, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} j2 = j1 + 1 ARGS2 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j2, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} with kafka_service() as kafka: kafka, TOPIC = kafka for i in range(10): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() stream1 = Stream.from_kafka_batched(TOPIC, ARGS1) out1 = stream1.map(split).filter(lambda x: x[-1] % 2 == 1).sink_to_list() stream1.start() wait_for(lambda: any(out1) and out1[-1][-1] == 9, 10, period=0.2) stream1.upstream.stopped = True stream2 = Stream.from_kafka_batched(TOPIC, ARGS1) out2 = stream2.map(split).filter(lambda x: x[-1] % 2 == 1).sink_to_list() stream2.start() time.sleep(5) assert len(out2) == 0 stream2.upstream.stopped = True stream3 = Stream.from_kafka_batched(TOPIC, ARGS2) out3 = stream3.map(split).filter(lambda x: x[-1] % 2 == 1).sink_to_list() stream3.start() wait_for(lambda: any(out3) and out3[-1][-1] == 9, 10, period=0.2) stream3.upstream.stopped = True @gen_cluster(client=True, timeout=60) async def test_kafka_dask_checkpointing_sync_nodes(c, s, w1, w2): ''' Testing whether Dask's scatter and gather works in conformity with the reference counting checkpointing implementation. 
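    As in the synchronous test above, a consumer reusing the same group.id is
    expected to resume from the committed offsets, while one with a fresh
    group.id re-reads the topic from the earliest offset.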
''' j1 = random.randint(0, 10000) ARGS1 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j1, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} j2 = j1 + 1 ARGS2 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j2, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest'} with kafka_service() as kafka: kafka, TOPIC = kafka for i in range(10): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() stream1 = Stream.from_kafka_batched(TOPIC, ARGS1, asynchronous=True, dask=True) out1 = stream1.map(split).gather().filter(lambda x: x[-1] % 2 == 1).sink_to_list() stream1.start() await await_for(lambda: any(out1) and out1[-1][-1] == 9, 10, period=0.2) stream1.upstream.stopped = True stream2 = Stream.from_kafka_batched(TOPIC, ARGS1, asynchronous=True, dask=True) out2 = stream2.map(split).gather().filter(lambda x: x[-1] % 2 == 1).sink_to_list() stream2.start() time.sleep(5) assert len(out2) == 0 stream2.upstream.stopped = True stream3 = Stream.from_kafka_batched(TOPIC, ARGS2, asynchronous=True, dask=True) out3 = stream3.map(split).gather().filter(lambda x: x[-1] % 2 == 1).sink_to_list() stream3.start() await await_for(lambda: any(out3) and out3[-1][-1] == 9, 10, period=0.2) stream3.upstream.stopped = True def test_kafka_batch_checkpointing_async_nodes_1(): ''' In async nodes like partition & sliding window, data is checkpointed only after the pipeline has finished processing it. ''' j = random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j, 'enable.auto.commit': False} with kafka_service() as kafka: kafka, TOPIC = kafka stream1 = Stream.from_kafka_batched(TOPIC, ARGS) out1 = stream1.partition(2).sliding_window(2, return_partial=False).sink_to_list() stream1.start() for i in range(0,2): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() time.sleep(2) assert len(out1) == 0 #Stream stops before data can finish processing, hence no checkpointing. stream1.upstream.stopped = True stream1.destroy() stream2 = Stream.from_kafka_batched(TOPIC, ARGS) out2 = stream2.partition(2).sliding_window(2, return_partial=False).sink_to_list() stream2.start() wait_for(lambda: stream2.upstream.started, 10, 0.1) for i in range(2,6): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() time.sleep(2) assert len(out2) == 1 assert out2 == [(([b'value-0', b'value-1'], [b'value-2']), ([b'value-3'], [b'value-4']))] #Some data gets processed and exits pipeline before the stream stops, hence checkpointing complete. stream2.upstream.stopped = True stream2.destroy() stream3 = Stream.from_kafka_batched(TOPIC, ARGS) out3 = stream3.sink_to_list() stream3.start() wait_for(lambda: stream3.upstream.started, 10, 0.1) #Stream picks up from where it left before, i.e., from the last committed offset. wait_for(lambda: len(out3) == 1 and out3[0] == [b'value-3', b'value-4', b'value-5'], 10, 0.1) stream3.upstream.stopped = True stream3.destroy() def test_kafka_batch_checkpointing_async_nodes_2(): ''' In async nodes like zip_latest, zip, combine_latest which involve multiple streams, checkpointing in each stream commits offsets after the datum in that specific stream is processed completely and exits the pipeline. 
''' CONSUMER_ARGS1 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'zip_latest', 'enable.auto.commit': False} CONSUMER_ARGS2 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'zip', 'enable.auto.commit': False} CONSUMER_ARGS3 = {'bootstrap.servers': 'localhost:9092', 'group.id': 'combine_latest', 'enable.auto.commit': False} TOPIC1 = 'test1' TOPIC2 = 'test2' with kafka_service() as kafka: kafka, TOPIC = kafka stream1 = Stream.from_kafka_batched(TOPIC1, CONSUMER_ARGS1, asynchronous=True) stream1.start() stream2 = Stream.from_kafka_batched(TOPIC2, CONSUMER_ARGS1, asynchronous=True) stream2.start() stream1.zip_latest(stream2).sink_to_list() stream3 = Stream.from_kafka_batched(TOPIC1, CONSUMER_ARGS2, asynchronous=True) stream3.start() stream4 = Stream.from_kafka_batched(TOPIC2, CONSUMER_ARGS2, asynchronous=True) stream4.start() stream3.zip(stream4).sink_to_list() stream5 = Stream.from_kafka_batched(TOPIC1, CONSUMER_ARGS3, asynchronous=True) stream5.start() stream6 = Stream.from_kafka_batched(TOPIC2, CONSUMER_ARGS3, asynchronous=True) stream6.start() stream5.combine_latest(stream6).sink_to_list() kafka.produce(TOPIC1, b'value-0') time.sleep(5) kafka.produce(TOPIC2, b'value-1') ''' 1. zip_latest emits a tuple, the lossless stream 1 commits an offset. 2. Since zip emits a tuple, streams 3 and 4 commit offsets in their topics. 3. combine_latest does not commit any offset since data is still to be used. ''' time.sleep(5) kafka.produce(TOPIC1, b'value-2') ''' 1. zip_latest emits a tuple, the lossless stream 1 commits an offset. 2. zip does not commit any offset. 3. combine_latest commits an offset in stream 5. ''' time.sleep(5) kafka.produce(TOPIC2, b'value-3') ''' 1. zip_latest emits a tuple, the non-lossless stream 2 commits an offset. 2. Since zip emits a tuple, streams 3 and 4 commit offsets in their topics. 3. combine_latest commits an offset in stream 6. ''' time.sleep(10) stream1.upstream.stopped = True stream2.upstream.stopped = True stream3.upstream.stopped = True stream4.upstream.stopped = True stream5.upstream.stopped = True stream6.upstream.stopped = True stream1.destroy() stream2.destroy() stream3.destroy() stream4.destroy() stream5.destroy() stream6.destroy() ''' Each stream/group.id picks up from their last committed offset. ''' consumer1 = ck.Consumer(CONSUMER_ARGS1) consumer2 = ck.Consumer(CONSUMER_ARGS2) consumer3 = ck.Consumer(CONSUMER_ARGS3) tps = [ck.TopicPartition(TOPIC1, 0), ck.TopicPartition(TOPIC2, 0)] committed1 = consumer1.committed(tps) committed2 = consumer2.committed(tps) committed3 = consumer3.committed(tps) assert committed1[0].offset == 2 assert committed1[1].offset == 1 assert committed2[0].offset == 2 assert committed2[1].offset == 2 assert committed3[0].offset == 1 assert committed3[1].offset == 1 @flaky(max_runs=3, min_passes=1) def test_kafka_checkpointing_auto_offset_reset_latest(): """ Testing whether checkpointing works as expected with multiple topic partitions and with auto.offset.reset configuration set to latest (also default). 
""" j = random.randint(0, 10000) ARGS = {'bootstrap.servers': 'localhost:9092', 'group.id': 'streamz-test%i' % j, 'auto.offset.reset': 'latest'} with kafka_service() as kafka: kafka, TOPIC = kafka TOPIC = "test-checkpointing-offset-reset-latest" subprocess.call(shlex.split("docker exec streamz-kafka " "/opt/kafka_2.11-0.10.1.0/bin/kafka-topics.sh " "--create --zookeeper localhost:2181 " "--replication-factor 1 --partitions 3 " "--topic test-checkpointing-offset-reset-latest")) ''' Since the stream has not started yet, these messages are not read because the stream has auto.offset.reset set to latest. ''' for i in range(30): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() stream1 = Stream.from_kafka_batched(TOPIC, ARGS, asynchronous=True) out1 = stream1.map(split).gather().sink_to_list() stream1.start() wait_for(lambda: stream1.upstream.started, 10, period=0.1) ''' Stream has started, so these are read. ''' for i in range(30): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() wait_for(lambda: len(out1) == 3 and (len(out1[0]) + len(out1[1]) + len(out1[2])) == 30, 10, period=0.1) ''' Stream stops but checkpoint has been created. ''' stream1.upstream.stopped = True ''' When the stream is restarted, these messages are read, because the checkpoint overrrides the auto.offset.reset:latest config this time around as expected. ''' for i in range(30): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() stream2 = Stream.from_kafka_batched(TOPIC, ARGS, asynchronous=True) out2 = stream2.map(split).gather().sink_to_list() ''' Stream restarts here. ''' stream2.start() wait_for(lambda: stream2.upstream.started, 10, 0.1) for i in range(30): kafka.produce(TOPIC, b'value-%d' % i) kafka.flush() wait_for(lambda: len(out2) == 6 and (len(out2[3]) + len(out2[4]) + len(out2[5])) == 30, 10, period=0.1) stream2.upstream.stopped = True streamz-0.6.4/streamz/tests/test_graph.py0000644000175000017500000000417214270277270020111 0ustar nileshnileshfrom operator import add, mul import os import pytest nx = pytest.importorskip('networkx') from streamz import Stream, create_graph, visualize from streamz.utils_test import tmpfile from ..graph import _clean_text def test_create_graph(): source1 = Stream(stream_name='source1') source2 = Stream(stream_name='source2') n1 = source1.zip(source2) n2 = n1.map(add) s = n2.sink(source1.emit) g = nx.DiGraph() create_graph(n2, g) for t in [hash(a) for a in [source1, source2, n1, n2, s]]: assert t in g for e in [(hash(a), hash(b)) for a, b in [ (source1, n1), (source2, n1), (n1, n2), (n2, s) ]]: assert e in g.edges() def test_create_cyclic_graph(): source1 = Stream(stream_name='source1') source2 = Stream(stream_name='source2') n1 = source1.zip(source2) n2 = n1.map(add) n2.connect(source1) g = nx.DiGraph() create_graph(n2, g) for t in [hash(a) for a in [source1, source2, n1, n2]]: assert t in g assert nx.find_cycle(g) for e in [(hash(a), hash(b)) for a, b in [ (source1, n1), (source2, n1), (n1, n2), (n2, source1) ]]: assert e in g.edges() def test_create_file(): source1 = Stream(stream_name='source1') source2 = Stream(stream_name='source2') n1 = source1.zip(source2) n2 = n1.map(add).scan(mul).map(lambda x : x + 1) n2.sink(source1.emit) with tmpfile(extension='png') as fn: visualize(n1, filename=fn) assert os.path.exists(fn) with tmpfile(extension='svg') as fn: n1.visualize(filename=fn, rankdir="LR") assert os.path.exists(fn) with tmpfile(extension='dot') as fn: n1.visualize(filename=fn, rankdir="LR") with open(fn) as f: text = f.read() for word in ['rankdir', 'source1', 'source2', 
'zip', 'map', 'add', 'shape=box', 'shape=ellipse']: assert word in text def test_cleantext(): text = "JFDSM*(@&$:FFDS:;;" expected_text = "JFDSM ;FFDS; " cleaned_text = _clean_text(text) assert cleaned_text == expected_text streamz-0.6.4/streamz/tests/test_dask.py0000644000175000017500000001625014270277270017732 0ustar nileshnileshimport asyncio from operator import add import random import time import pytest pytest.importorskip('dask.distributed') from tornado import gen from streamz.dask import scatter from streamz import RefCounter, Stream from distributed import Future, Client from distributed.utils import sync from distributed.utils_test import gen_cluster, inc, cluster, loop, slowinc # noqa: F401 @gen_cluster(client=True) async def test_map(c, s, a, b): source = Stream(asynchronous=True) futures = scatter(source).map(inc) futures_L = futures.sink_to_list() L = futures.gather().sink_to_list() for i in range(5): await source.emit(i) assert L == [1, 2, 3, 4, 5] assert all(isinstance(f, Future) for f in futures_L) @gen_cluster(client=True) async def test_map_on_dict(c, s, a, b): # dask treats dicts differently, so we have to make sure # the user sees no difference in the streamz api. # Regression test against #336 def add_to_dict(d): d["x"] = d["i"] return d source = Stream(asynchronous=True) futures = source.scatter().map(add_to_dict) L = futures.gather().sink_to_list() for i in range(5): await source.emit({"i": i}) assert len(L) == 5 for i, item in enumerate(sorted(L, key=lambda x: x["x"])): assert item["x"] == i assert item["i"] == i @gen_cluster(client=True) async def test_partition_then_scatter_async(c, s, a, b): # Ensure partition w/ timeout before scatter works correctly for # asynchronous start = time.monotonic() source = Stream(asynchronous=True) L = source.partition(2, timeout=.1).scatter().map( lambda x: [xx+1 for xx in x]).buffer(2).gather().flatten().sink_to_list() rc = RefCounter(loop=source.loop) for i in range(3): await source.emit(i, metadata=[{'ref': rc}]) while rc.count != 0 and time.monotonic() - start < 1.: await gen.sleep(1e-2) assert L == [1, 2, 3] def test_partition_then_scatter_sync(loop): # Ensure partition w/ timeout before scatter works correctly for synchronous with cluster() as (s, [a, b]): with Client(s['address'], loop=loop) as client: # noqa: F841 start = time.monotonic() source = Stream() L = source.partition(2, timeout=.1).scatter().map( lambda x: [xx+1 for xx in x]).gather().flatten().sink_to_list() assert source.loop is client.loop rc = RefCounter() for i in range(3): source.emit(i, metadata=[{'ref': rc}]) while rc.count != 0 and time.monotonic() - start < 2.: time.sleep(1e-2) assert L == [1, 2, 3] @gen_cluster(client=True) async def test_non_unique_emit(c, s, a, b): """Regression for https://github.com/python-streamz/streams/issues/397 Non-unique stream entries still need to each be processed. 
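    Each of the three identical inputs emitted below should therefore yield
    its own independently computed downstream result.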
""" source = Stream(asynchronous=True) futures = source.scatter().map(lambda x: random.random()) L = futures.gather().sink_to_list() for _ in range(3): # Emit non-unique values await source.emit(0) assert len(L) == 3 assert L[0] != L[1] or L[0] != L[2] @gen_cluster(client=True) async def test_scan(c, s, a, b): source = Stream(asynchronous=True) futures = scatter(source).map(inc).scan(add) futures_L = futures.sink_to_list() L = futures.gather().sink_to_list() for i in range(5): await source.emit(i) assert L == [1, 3, 6, 10, 15] assert all(isinstance(f, Future) for f in futures_L) @gen_cluster(client=True) async def test_scan_state(c, s, a, b): source = Stream(asynchronous=True) def f(acc, i): acc = acc + i return acc, acc L = scatter(source).scan(f, returns_state=True).gather().sink_to_list() for i in range(3): await source.emit(i) assert L == [0, 1, 3] @gen_cluster(client=True) async def test_zip(c, s, a, b): a = Stream(asynchronous=True) b = Stream(asynchronous=True) c = scatter(a).zip(scatter(b)) L = c.gather().sink_to_list() await a.emit(1) await b.emit('a') await a.emit(2) await b.emit('b') assert L == [(1, 'a'), (2, 'b')] @gen_cluster(client=True) async def test_accumulate(c, s, a, b): source = Stream(asynchronous=True) L = source.scatter().accumulate(lambda acc, x: acc + x, with_state=True).gather().sink_to_list() for i in range(3): await source.emit(i) assert L[-1][1] == 3 def test_sync(loop): # noqa: F811 with cluster() as (s, [a, b]): with Client(s['address'], loop=loop) as client: # noqa: F841 source = Stream() L = source.scatter().map(inc).gather().sink_to_list() async def f(): for i in range(10): await source.emit(i, asynchronous=True) sync(loop, f) assert L == list(map(inc, range(10))) def test_sync_2(loop): # noqa: F811 with cluster() as (s, [a, b]): with Client(s['address'], loop=loop): # noqa: F841 source = Stream() L = source.scatter().map(inc).gather().sink_to_list() for i in range(10): source.emit(i) assert len(L) == i + 1 assert L == list(map(inc, range(10))) @gen_cluster(client=True, nthreads=[('127.0.0.1', 1)] * 2) async def test_buffer(c, s, a, b): source = Stream(asynchronous=True) L = source.scatter().map(slowinc, delay=0.5).buffer(5).gather().sink_to_list() start = time.time() for i in range(5): await source.emit(i) end = time.time() assert end - start < 0.5 for i in range(5, 10): await source.emit(i) end2 = time.time() assert end2 - start > (0.5 / 3) while len(L) < 10: await gen.sleep(0.01) assert time.time() - start < 5 assert L == list(map(inc, range(10))) assert source.loop == c.loop def test_buffer_sync(loop): # noqa: F811 with cluster() as (s, [a, b]): with Client(s['address'], loop=loop) as c: # noqa: F841 source = Stream() buff = source.scatter().map(slowinc, delay=0.5).buffer(5) L = buff.gather().sink_to_list() start = time.time() for i in range(5): source.emit(i) end = time.time() assert end - start < 0.5 for i in range(5, 10): source.emit(i) while len(L) < 10: time.sleep(0.01) assert time.time() - start < 5 assert L == list(map(inc, range(10))) @pytest.mark.xfail(reason='') async def test_stream_shares_client_loop(loop): # noqa: F811 with cluster() as (s, [a, b]): with Client(s['address'], loop=loop) as client: # noqa: F841 source = Stream() d = source.timed_window('20ms').scatter() # noqa: F841 assert source.loop is client.loop @gen_cluster(client=True) async def test_starmap(c, s, a, b): def add(x, y, z=0): return x + y + z source = Stream(asynchronous=True) L = source.scatter().starmap(add, z=10).gather().sink_to_list() for i in range(5): await 
source.emit((i, i)) assert L == [10, 12, 14, 16, 18] streamz-0.6.4/streamz/tests/test_core.py0000644000175000017500000011302414270277270017735 0ustar nileshnileshfrom datetime import timedelta from functools import partial import itertools import json import operator from operator import add import os from time import sleep import sys import pytest from tornado.queues import Queue from tornado.ioloop import IOLoop import streamz as sz from streamz import RefCounter from streamz.sources import sink_to_file from streamz.utils_test import (inc, double, gen_test, tmpfile, captured_logger, # noqa: F401 clean, await_for, metadata, wait_for) # noqa: F401 from distributed.utils_test import loop # noqa: F401 def test_basic(): source = Stream() b1 = source.map(inc) b2 = source.map(double) c = b1.scan(add) Lc = c.sink_to_list() Lb = b2.sink_to_list() for i in range(4): source.emit(i) assert Lc == [1, 3, 6, 10] assert Lb == [0, 2, 4, 6] def test_no_output(): source = Stream() assert source.emit(1) is None def test_scan(): source = Stream() def f(acc, i): acc = acc + i return acc, acc L = source.scan(f, returns_state=True).sink_to_list() for i in range(3): source.emit(i) assert L == [0, 1, 3] def test_kwargs(): source = Stream() def f(acc, x, y=None): acc = acc + x + y return acc L = source.scan(f, y=10).sink_to_list() for i in range(3): source.emit(i) assert L == [0, 11, 23] def test_filter(): source = Stream() L = source.filter(lambda x: x % 2 == 0).sink_to_list() for i in range(10): source.emit(i) assert L == [0, 2, 4, 6, 8] def test_filter_args(): source = Stream() L = source.filter(lambda x, n: x % n == 0, 2).sink_to_list() for i in range(10): source.emit(i) assert L == [0, 2, 4, 6, 8] def test_filter_kwargs(): source = Stream() L = source.filter(lambda x, n=1: x % n == 0, n=2).sink_to_list() for i in range(10): source.emit(i) assert L == [0, 2, 4, 6, 8] def test_filter_none(): source = Stream() L = source.filter(None).sink_to_list() for i in range(10): source.emit(i % 3) assert L == [1, 2, 1, 2, 1, 2] def test_map(): def add(x=0, y=0): return x + y source = Stream() L = source.map(add, y=10).sink_to_list() source.emit(1) assert L[0] == 11 def test_map_args(): source = Stream() L = source.map(operator.add, 10).sink_to_list() source.emit(1) assert L == [11] def test_starmap(): def add(x=0, y=0): return x + y source = Stream() L = source.starmap(add).sink_to_list() source.emit((1, 10)) assert L[0] == 11 def test_remove(): source = Stream() L = source.remove(lambda x: x % 2 == 0).sink_to_list() for i in range(10): source.emit(i) assert L == [1, 3, 5, 7, 9] def test_partition(): source = Stream() L = source.partition(2).sink_to_list() for i in range(10): source.emit(i) assert L == [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9)] @pytest.mark.parametrize( "n,key,keep,elements,exp_result", [ (3, sz.identity, "first", [1, 2, 1, 3, 1, 3, 3, 2], [(1, 2, 3), (1, 3, 2)]), (3, sz.identity, "last", [1, 2, 1, 3, 1, 3, 3, 2], [(2, 1, 3), (1, 3, 2)]), ( 3, len, "last", ["f", "fo", "f", "foo", "f", "foo", "foo", "fo"], [("fo", "f", "foo"), ("f", "foo", "fo")], ), ( 2, "id", "first", [{"id": 0, "foo": "bar"}, {"id": 0, "foo": "baz"}, {"id": 1, "foo": "bat"}], [({"id": 0, "foo": "bar"}, {"id": 1, "foo": "bat"})], ), ( 2, "id", "last", [{"id": 0, "foo": "bar"}, {"id": 0, "foo": "baz"}, {"id": 1, "foo": "bat"}], [({"id": 0, "foo": "baz"}, {"id": 1, "foo": "bat"})], ), ] ) def test_partition_unique(n, key, keep, elements, exp_result): source = Stream() L = source.partition_unique(n, key, keep).sink_to_list() for ele in 
elements: source.emit(ele) assert L == exp_result def test_partition_timeout(): source = Stream() L = source.partition(10, timeout=0.01).sink_to_list() for i in range(5): source.emit(i) sleep(0.1) assert L == [(0, 1, 2, 3, 4)] def test_partition_timeout_cancel(): source = Stream() L = source.partition(3, timeout=0.1).sink_to_list() for i in range(3): source.emit(i) sleep(0.09) source.emit(3) sleep(0.02) assert L == [(0, 1, 2)] sleep(0.09) assert L == [(0, 1, 2), (3,)] def test_partition_key(): source = Stream() L = source.partition(2, key=0).sink_to_list() for i in range(4): source.emit((i % 2, i)) assert L == [((0, 0), (0, 2)), ((1, 1), (1, 3))] def test_partition_key_callable(): source = Stream() L = source.partition(2, key=lambda x: x % 2).sink_to_list() for i in range(10): source.emit(i) assert L == [(0, 2), (1, 3), (4, 6), (5, 7)] def test_partition_size_one(): source = Stream() source.partition(1, timeout=.01).sink(lambda x: None) for i in range(10): source.emit(i) def test_sliding_window(): source = Stream() L = source.sliding_window(2).sink_to_list() for i in range(10): source.emit(i) assert L == [(0, ), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9)] L = source.sliding_window(2, return_partial=False).sink_to_list() for i in range(10): source.emit(i) assert L == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9)] def test_sliding_window_ref_counts(): source = Stream() _ = source.sliding_window(2) r_prev = RefCounter() source.emit(-2) source.emit(-1, metadata=[{'ref': r_prev}]) for i in range(10): r = RefCounter() assert r_prev.count == 1 source.emit(i, metadata=[{'ref': r}]) assert r_prev.count == 0 assert r.count == 1 r_prev = r def test_sliding_window_metadata(): source = Stream() L = metadata(source.sliding_window(2)).sink_to_list() source.emit(0) source.emit(1, metadata=[{'v': 1}]) source.emit(2, metadata=[{'v': 2}]) source.emit(3, metadata=[{'v': 3}]) assert L == [ [{'v': 1}], # First emit, because 0 has no metadata [{'v': 1}, {'v': 2}], # Second emit [{'v': 2}, {'v': 3}] # Third emit ] @gen_test() def test_backpressure(): q = Queue(maxsize=2) source = Stream(asynchronous=True) source.map(inc).scan(add, start=0).sink(q.put) @gen.coroutine def read_from_q(): while True: yield q.get() yield gen.sleep(0.1) IOLoop.current().add_callback(read_from_q) start = time() for i in range(5): yield source.emit(i) end = time() assert end - start >= 0.2 @gen_test() def test_timed_window_unique(): tests = [ (0.05, sz.identity, "first", [1, 2, 1, 3, 1, 3, 3, 2], [(1, 2, 3)]), (0.05, sz.identity, "last", [1, 2, 1, 3, 1, 3, 3, 2], [(1, 3, 2)]), ( 0.05, len, "last", ["f", "fo", "f", "foo", "f", "foo", "foo", "fo"], [("f", "foo", "fo")], ), ( 0.05, "id", "first", [{"id": 0, "foo": "bar"}, {"id": 1, "foo": "bat"}, {"id": 0, "foo": "baz"}], [({"id": 0, "foo": "bar"}, {"id": 1, "foo": "bat"})], ), ( 0.05, "id", "last", [{"id": 0, "foo": "bar"}, {"id": 1, "foo": "bat"}, {"id": 0, "foo": "baz"}], [({"id": 1, "foo": "bat"}, {"id": 0, "foo": "baz"})], ), ] for interval, key, keep, elements, exp_result in tests: source = Stream(asynchronous=True) a = source.timed_window_unique(interval, key, keep) assert a.loop is IOLoop.current() L = a.sink_to_list() for ele in elements: yield source.emit(ele) yield gen.sleep(a.interval) assert L assert all(wi in elements for window in L for wi in window) assert sum(1 for window in L for _ in window) <= len(elements) assert L == exp_result yield gen.sleep(a.interval) assert not L[-1] @gen_test() def test_timed_window(): source 
= Stream(asynchronous=True) a = source.timed_window(0.01) assert a.loop is IOLoop.current() L = a.sink_to_list() for i in range(10): yield source.emit(i) yield gen.sleep(0.004) yield gen.sleep(a.interval) assert L assert sum(L, []) == list(range(10)) assert all(len(x) <= 3 for x in L) assert any(len(x) >= 2 for x in L) yield gen.sleep(0.1) assert not L[-1] @gen_test() def test_timed_window_ref_counts(): source = Stream(asynchronous=True) _ = source.timed_window(0.01) ref1 = RefCounter() assert str(ref1) == "" source.emit(1, metadata=[{'ref': ref1}]) assert ref1.count == 1 yield gen.sleep(0.05) ref2 = RefCounter() source.emit(2, metadata=[{'ref': ref2}]) assert ref1.count == 0 assert ref2.count == 1 def test_mixed_async(): s1 = Stream(asynchronous=False) with pytest.raises(ValueError): Stream(asynchronous=True, upstream=s1) @gen_test() def test_timed_window_metadata(): source = Stream() L = metadata(source.timed_window(0.06)).sink_to_list() source.emit(0) source.emit(1, metadata=[{'v': 1}]) yield gen.sleep(0.1) source.emit(2, metadata=[{'v': 2}]) source.emit(3, metadata=[{'v': 3}]) yield gen.sleep(0.1) assert L == [ [{'v': 1}], # first emit because 0 has no metadata [{'v': 2}, {'v': 3}] # second emit ] def test_timed_window_timedelta(clean): # noqa: F811 pytest.importorskip('pandas') source = Stream(asynchronous=True) a = source.timed_window('10ms') assert a.interval == 0.010 @gen_test() def test_timed_window_backpressure(): q = Queue(maxsize=1) source = Stream(asynchronous=True) source.timed_window(0.01).sink(q.put) @gen.coroutine def read_from_q(): while True: yield q.get() yield gen.sleep(0.1) IOLoop.current().add_callback(read_from_q) start = time() for i in range(5): yield source.emit(i) yield gen.sleep(0.01) stop = time() assert stop - start > 0.2 def test_sink_to_file(): with tmpfile() as fn: source = Stream() with sink_to_file(fn, source) as f: source.emit('a') source.emit('b') with open(fn) as f: data = f.read() assert data == 'a\nb\n' @gen_test() def test_counter(): counter = itertools.count() source = Stream.from_periodic(lambda: next(counter), 0.001, asynchronous=True, start=True) L = source.sink_to_list() yield gen.sleep(0.05) assert L @gen_test() def test_rate_limit(): source = Stream(asynchronous=True) L = source.rate_limit(0.05).sink_to_list() start = time() for i in range(5): yield source.emit(i) stop = time() assert stop - start > 0.2 assert len(L) == 5 @gen_test() def test_delay(): source = Stream(asynchronous=True) L = source.delay(0.02).sink_to_list() for i in range(5): yield source.emit(i) assert not L yield gen.sleep(0.04) assert len(L) < 5 yield gen.sleep(0.1) assert len(L) == 5 @gen_test() def test_delay_ref_counts(): source = Stream(asynchronous=True) _ = source.delay(0.01) refs = [] for i in range(5): r = RefCounter() refs.append(r) source.emit(i, metadata=[{'ref': r}]) assert all(r.count == 1 for r in refs) yield gen.sleep(0.05) assert all(r.count == 0 for r in refs) @gen_test() def test_buffer(): source = Stream(asynchronous=True) L = source.map(inc).buffer(10).map(inc).rate_limit(0.05).sink_to_list() start = time() for i in range(10): yield source.emit(i) stop = time() assert stop - start < 0.01 assert not L start = time() for i in range(5): yield source.emit(i) stop = time() assert L assert stop - start > 0.04 @gen_test() def test_buffer_ref_counts(): source = Stream(asynchronous=True) _ = source.buffer(5) refs = [] for i in range(5): r = RefCounter() refs.append(r) source.emit(i, metadata=[{'ref': r}]) assert all(r.count == 1 for r in refs) yield 
gen.sleep(0.05) assert all(r.count == 0 for r in refs) def test_zip(): a = Stream() b = Stream() c = sz.zip(a, b) L = c.sink_to_list() a.emit(1) b.emit('a') a.emit(2) b.emit('b') assert L == [(1, 'a'), (2, 'b')] d = Stream() # test zip from the object itself # zip 3 streams together e = a.zip(b, d) L2 = e.sink_to_list() a.emit(1) b.emit(2) d.emit(3) assert L2 == [(1, 2, 3)] def test_zip_literals(): a = Stream() b = Stream() c = sz.zip(a, 123, b) L = c.sink_to_list() a.emit(1) b.emit(2) assert L == [(1, 123, 2)] a.emit(4) b.emit(5) assert L == [(1, 123, 2), (4, 123, 5)] def test_zip_same(): a = Stream() b = a.zip(a) L = b.sink_to_list() a.emit(1) a.emit(2) assert L == [(1, 1), (2, 2)] def test_combine_latest(): a = Stream() b = Stream() c = a.combine_latest(b) d = a.combine_latest(b, emit_on=[a, b]) L = c.sink_to_list() L2 = d.sink_to_list() a.emit(1) a.emit(2) b.emit('a') a.emit(3) b.emit('b') assert L == [(2, 'a'), (3, 'a'), (3, 'b')] assert L2 == [(2, 'a'), (3, 'a'), (3, 'b')] def test_combine_latest_emit_on(): a = Stream() b = Stream() c = a.combine_latest(b, emit_on=a) L = c.sink_to_list() a.emit(1) b.emit('a') a.emit(2) a.emit(3) b.emit('b') a.emit(4) assert L == [(2, 'a'), (3, 'a'), (4, 'b')] def test_combine_latest_emit_on_stream(): a = Stream() b = Stream() c = a.combine_latest(b, emit_on=0) L = c.sink_to_list() a.emit(1) b.emit('a') a.emit(2) a.emit(3) b.emit('b') a.emit(4) assert L == [(2, 'a'), (3, 'a'), (4, 'b')] def test_combine_latest_ref_counts(): a = Stream() b = Stream() _ = a.combine_latest(b) ref1 = RefCounter() a.emit(1, metadata=[{'ref': ref1}]) assert ref1.count == 1 # The new value kicks out the old value ref2 = RefCounter() a.emit(2, metadata=[{'ref': ref2}]) assert ref1.count == 0 assert ref2.count == 1 # The value on stream a is still retained and the value on stream b is new ref3 = RefCounter() b.emit(3, metadata=[{'ref': ref3}]) assert ref2.count == 1 assert ref3.count == 1 def test_combine_latest_metadata(): a = Stream() b = Stream() L = metadata(a.combine_latest(b)).sink_to_list() a.emit(1, metadata=[{'v': 1}]) b.emit(2, metadata=[{'v': 2}]) b.emit(3) b.emit(4, metadata=[{'v': 4}]) assert L == [ [{'v': 1}, {'v': 2}], # first emit when 2 is introduced [{'v': 1}], # 3 has no metadata but it replaces the value on 'b' [{'v': 1}, {'v': 4}] # 4 replaces the value without metadata on 'b' ] @gen_test() def test_zip_timeout(): a = Stream(asynchronous=True) b = Stream(asynchronous=True) c = sz.zip(a, b, maxsize=2) L = c.sink_to_list() a.emit(1) a.emit(2) future = a.emit(3) with pytest.raises(gen.TimeoutError): yield gen.with_timeout(timedelta(seconds=0.01), future) b.emit('a') yield future assert L == [(1, 'a')] def test_zip_ref_counts(): a = Stream() b = Stream() _ = a.zip(b) # The first value in a becomes buffered ref1 = RefCounter() a.emit(1, metadata=[{'ref': ref1}]) assert ref1.count == 1 # The second value in a also becomes buffered ref2 = RefCounter() a.emit(2, metadata=[{'ref': ref2}]) assert ref1.count == 1 assert ref2.count == 1 # All emitted values are removed from the buffer ref3 = RefCounter() b.emit(3, metadata=[{'ref': ref3}]) assert ref1.count == 0 assert ref2.count == 1 # still in the buffer assert ref3.count == 0 def test_zip_metadata(): a = Stream() b = Stream() L = metadata(a.zip(b)).sink_to_list() a.emit(1, metadata=[{'v': 1}]) b.emit(2, metadata=[{'v': 2}]) a.emit(3) b.emit(4, metadata=[{'v': 4}]) assert L == [ [{'v': 1}, {'v': 2}], # first emit when 2 is introduced [{'v': 4}] # second emit when 4 is introduced, and 3 has no metadata ] def 
test_frequencies(): source = Stream() L = source.frequencies().sink_to_list() source.emit('a') source.emit('b') source.emit('a') assert L[-1] == {'a': 2, 'b': 1} def test_flatten(): source = Stream() L = source.flatten().sink_to_list() source.emit([1, 2, 3]) source.emit([4, 5]) source.emit([6, 7, 8]) assert L == [1, 2, 3, 4, 5, 6, 7, 8] def test_unique(): source = Stream() L = source.unique().sink_to_list() source.emit(1) source.emit(2) source.emit(1) assert L == [1, 2] def test_unique_key(): source = Stream() L = source.unique(key=lambda x: x % 2, maxsize=1).sink_to_list() source.emit(1) source.emit(2) source.emit(4) source.emit(6) source.emit(3) assert L == [1, 2, 3] def test_unique_metadata(): source = Stream() L = metadata(source.unique()).flatten().sink_to_list() for i in range(5): source.emit(i, metadata=[{'v': i}]) assert L == [{'v': i} for i in range(5)] def test_unique_history(): source = Stream() s = source.unique(maxsize=2) s2 = source.unique(maxsize=2, hashable=False) L = s.sink_to_list() L2 = s2.sink_to_list() source.emit(1) source.emit(2) source.emit(1) source.emit(2) source.emit(1) source.emit(2) assert L == [1, 2] assert L == L2 source.emit(3) source.emit(2) assert L == [1, 2, 3] assert L == L2 source.emit(1) assert L == [1, 2, 3, 1] assert L == L2 # update 2 position source.emit(2) # knock out 1 source.emit(3) # update 2 position source.emit(2) assert L == [1, 2, 3, 1, 3] assert L == L2 def test_unique_history_dict(): source = Stream() s = source.unique(maxsize=2, hashable=False) L = s.sink_to_list() a = {'hi': 'world'} b = {'hi': 'bar'} c = {'foo': 'bar'} source.emit(a) source.emit(b) source.emit(a) source.emit(b) source.emit(a) source.emit(b) assert L == [a, b] source.emit(c) source.emit(b) assert L == [a, b, c] source.emit(a) assert L == [a, b, c, a] def test_union(): a = Stream() b = Stream() c = Stream() L = a.union(b, c).sink_to_list() a.emit(1) assert L == [1] b.emit(2) assert L == [1, 2] a.emit(3) assert L == [1, 2, 3] c.emit(4) assert L == [1, 2, 3, 4] def test_pluck(): a = Stream() L = a.pluck(1).sink_to_list() a.emit([1, 2, 3]) assert L == [2] a.emit([4, 5, 6, 7, 8, 9]) assert L == [2, 5] with pytest.raises(IndexError): a.emit([1]) def test_pluck_list(): a = Stream() L = a.pluck([0, 2]).sink_to_list() a.emit([1, 2, 3]) assert L == [(1, 3)] a.emit([4, 5, 6, 7, 8, 9]) assert L == [(1, 3), (4, 6)] with pytest.raises(IndexError): a.emit([1]) def test_collect(): source1 = Stream() source2 = Stream() collector = source1.collect() L = collector.sink_to_list() source2.sink(collector.flush) source1.emit(1) source1.emit(2) assert L == [] source2.emit('anything') # flushes collector assert L == [(1, 2)] source2.emit('anything') assert L == [(1, 2), ()] source1.emit(3) assert L == [(1, 2), ()] source2.emit('anything') assert L == [(1, 2), (), (3,)] def test_collect_ref_counts(): source = Stream() collector = source.collect() refs = [] for i in range(10): r = RefCounter() refs.append(r) source.emit(i, metadata=[{'ref': r}]) assert all(r.count == 1 for r in refs) collector.flush() assert all(r.count == 0 for r in refs) def test_collect_metadata(): source = Stream() collector = source.collect() L = metadata(collector).sink_to_list() source.emit(0) source.emit(1, metadata=[{'v': 1}]) source.emit(2, metadata=[{'v': 2}]) collector.flush() source.emit(3, metadata=[{'v': 3}]) source.emit(4, metadata=[{'v': 4}]) collector.flush() assert L == [ [{'v': 1}, {'v': 2}], # Flush 0-2, but 0 has no metadata [{'v': 3}, {'v': 4}] # Flush the rest ] def test_map_str(): def add(x=0, y=0): 
return x + y source = Stream() s = source.map(add, y=10) assert str(s) == '' def test_no_ipywidget_repr(monkeypatch, capsys): pytest.importorskip("ipywidgets") import ipywidgets source = Stream() # works by side-affect of display() source._ipython_display_() assert "Output()" in capsys.readouterr().out def get(*_, **__): raise ImportError monkeypatch.setattr(ipywidgets.Output, "__init__", get) out = source._ipython_display_() assert "Stream" in capsys.readouterr().out def test_filter_str(): def iseven(x): return x % 2 == 0 source = Stream() s = source.filter(iseven) assert str(s) == '' def test_timed_window_str(clean): # noqa: F811 source = Stream() s = source.timed_window(.05) assert str(s) == '' def test_partition_str(): source = Stream() s = source.partition(2) assert str(s) == '' def test_partition_ref_counts(): source = Stream() _ = source.partition(2) for i in range(10): r = RefCounter() source.emit(i, metadata=[{'ref': r}]) if i % 2 == 0: assert r.count == 1 else: assert r.count == 0 def test_partition_metadata(): source = Stream() L = metadata(source.partition(2)).sink_to_list() source.emit(0) source.emit(1, metadata=[{'v': 1}]) source.emit(2, metadata=[{'v': 2}]) source.emit(3, metadata=[{'v': 3}]) assert L == [ [{'v': 1}], # first emit when 1 is introduced. 0 has no metadata [{'v': 2}, {'v': 3}] # second emit ] def test_stream_name_str(): source = Stream(stream_name='this is not a stream') assert str(source) == '' def test_zip_latest(): a = Stream() b = Stream() c = a.zip_latest(b) d = a.combine_latest(b, emit_on=a) L = c.sink_to_list() L2 = d.sink_to_list() a.emit(1) a.emit(2) b.emit('a') b.emit('b') a.emit(3) assert L == [(1, 'a'), (2, 'a'), (3, 'b')] assert L2 == [(3, 'b')] def test_zip_latest_reverse(): a = Stream() b = Stream() c = a.zip_latest(b) L = c.sink_to_list() b.emit('a') a.emit(1) a.emit(2) a.emit(3) b.emit('b') a.emit(4) assert L == [(1, 'a'), (2, 'a'), (3, 'a'), (4, 'b')] def test_triple_zip_latest(): from streamz.core import Stream s1 = Stream() s2 = Stream() s3 = Stream() s_simple = s1.zip_latest(s2, s3) L_simple = s_simple.sink_to_list() s1.emit(1) s2.emit('I') s2.emit("II") s1.emit(2) s2.emit("III") s3.emit('a') s3.emit('b') s1.emit(3) assert L_simple == [(1, 'III', 'a'), (2, 'III', 'a'), (3, 'III', 'b')] def test_zip_latest_ref_counts(): a = Stream() b = Stream() _ = a.zip_latest(b) ref1 = RefCounter() a.emit(1, metadata=[{'ref': ref1}]) assert ref1.count == 1 # Retained until stream b has a value # The lossless stream is never retained if all upstreams have a value ref2 = RefCounter() b.emit(2, metadata=[{'ref': ref2}]) assert ref1.count == 0 assert ref2.count == 1 # Kick out the stream b value and verify it has zero references ref3 = RefCounter() b.emit(3, metadata=[{'ref': ref3}]) assert ref2.count == 0 assert ref3.count == 1 # Verify the lossless value is not retained, but the lossy value is ref4 = RefCounter() a.emit(3, metadata=[{'ref': ref4}]) assert ref3.count == 1 assert ref4.count == 0 def test_zip_latest_metadata(): a = Stream() b = Stream() L = metadata(a.zip_latest(b)).sink_to_list() a.emit(1, metadata=[{'v': 1}]) b.emit(2, metadata=[{'v': 2}]) a.emit(3) b.emit(4, metadata=[{'v': 4}]) assert L == [ [{'v': 1}, {'v': 2}], # the first emit when 2 is introduced [{'v': 2}] # 3 has no metadata ] def test_connect(): source_downstream = Stream() # connect assumes this default behaviour # of stream initialization assert not source_downstream.downstreams assert source_downstream.upstreams == [] # initialize the second stream to connect to source_upstream = 
Stream() sout = source_downstream.map(lambda x : x + 1) L = list() sout = sout.map(L.append) source_upstream.connect(source_downstream) source_upstream.emit(2) source_upstream.emit(4) assert L == [3, 5] def test_multi_connect(): source0 = Stream() source1 = Stream() source_downstream = source0.union(source1) # connect assumes this default behaviour # of stream initialization assert not source_downstream.downstreams # initialize the second stream to connect to source_upstream = Stream() sout = source_downstream.map(lambda x : x + 1) L = list() sout = sout.map(L.append) source_upstream.connect(source_downstream) source_upstream.emit(2) source_upstream.emit(4) assert L == [3, 5] def test_disconnect(): source = Stream() upstream = Stream() L = upstream.sink_to_list() source.emit(1) assert L == [] source.connect(upstream) source.emit(2) source.emit(3) assert L == [2, 3] source.disconnect(upstream) source.emit(4) assert L == [2, 3] def test_gc(): source = Stream() L = [] a = source.map(L.append) source.emit(1) assert L == [1] del a import gc; gc.collect() start = time() while source.downstreams: sleep(0.01) assert time() < start + 1 source.emit(2) assert L == [1] @gen_test() def test_from_file(): with tmpfile() as fn: with open(fn, 'wt') as f: f.write('{"x": 1, "y": 2}\n') f.write('{"x": 2, "y": 2}\n') f.write('{"x": 3, "y": 2}\n') f.flush() source = Stream.from_textfile(fn, poll_interval=0.010, asynchronous=True, start=False) L = source.map(json.loads).pluck('x').sink_to_list() assert L == [] source.start() yield await_for(lambda: len(L) == 3, timeout=5) assert L == [1, 2, 3] f.write('{"x": 4, "y": 2}\n') f.write('{"x": 5, "y": 2}\n') f.flush() start = time() while L != [1, 2, 3, 4, 5]: yield gen.sleep(0.01) assert time() < start + 2 # reads within 2s @gen_test() def test_from_file_end(): with tmpfile() as fn: with open(fn, 'wt') as f: f.write('data1\n') f.flush() source = Stream.from_textfile(fn, poll_interval=0.010, start=False, from_end=True) out = source.sink_to_list() source.start() assert out == [] yield await_for(lambda: source.started, 2, period=0.02) f.write('data2\n') f.flush() yield await_for(lambda: out == ['data2\n'], timeout=5, period=0.1) @gen_test() def test_filenames(): with tmpfile() as fn: os.mkdir(fn) with open(os.path.join(fn, 'a'), 'w'): pass with open(os.path.join(fn, 'b'), 'w'): pass source = Stream.filenames(fn, asynchronous=True) L = source.sink_to_list() source.start() while len(L) < 2: yield gen.sleep(0.01) assert L == [os.path.join(fn, x) for x in ['a', 'b']] with open(os.path.join(fn, 'c'), 'w'): pass while len(L) < 3: yield gen.sleep(0.01) assert L == [os.path.join(fn, x) for x in ['a', 'b', 'c']] def test_docstrings(): for s in [Stream, Stream()]: assert 'every element' in s.map.__doc__ assert s.map.__name__ == 'map' assert 'predicate' in s.filter.__doc__ assert s.filter.__name__ == 'filter' def test_subclass(): class NewStream(Stream): pass @NewStream.register_api() class foo(NewStream): pass assert hasattr(NewStream, 'map') assert hasattr(NewStream(), 'map') assert hasattr(NewStream, 'foo') assert hasattr(NewStream(), 'foo') assert not hasattr(Stream, 'foo') assert not hasattr(Stream(), 'foo') @gen_test() def test_latest(): source = Stream(asynchronous=True) L = [] @gen.coroutine def slow_write(x): yield gen.sleep(0.050) L.append(x) s = source.map(inc).latest().map(slow_write) # noqa: F841 source.emit(1) yield gen.sleep(0.010) source.emit(2) source.emit(3) start = time() while len(L) < 2: yield gen.sleep(0.01) assert time() < start + 3 assert L == [2, 4] yield 
gen.sleep(0.060) assert L == [2, 4] def test_latest_ref_counts(): source = Stream() _ = source.latest() ref1 = RefCounter() source.emit(1, metadata=[{'ref': ref1}]) assert ref1.count == 1 ref2 = RefCounter() source.emit(2, metadata=[{'ref': ref2}]) assert ref1.count == 0 assert ref2.count == 1 def test_destroy(): source = Stream() s = source.map(inc) L = s.sink_to_list() source.emit(1) assert L == [2] s.destroy() assert not list(source.downstreams) assert not s.upstreams source.emit(2) assert L == [2] def dont_test_stream_kwargs(clean): # noqa: F811 ''' Test the good and bad kwargs for the stream Currently just stream_name ''' test_name = "some test name" sin = Stream(stream_name=test_name) sin2 = Stream() assert sin.name == test_name # when not defined, should be None assert sin2.name is None # add new core methods here, initialized # these should be functions, use partial to partially initialize them # (if they require more arguments) streams = [ # some filter kwargs, so we comment them out partial(sin.map, lambda x : x), partial(sin.accumulate, lambda x1, x2 : x1), partial(sin.filter, lambda x : True), partial(sin.partition, 2), partial(sin.sliding_window, 2), partial(sin.timed_window, .01), partial(sin.rate_limit, .01), partial(sin.delay, .02), partial(sin.buffer, 2), partial(sin.zip, sin2), partial(sin.combine_latest, sin2), sin.frequencies, sin.flatten, sin.unique, sin.union, partial(sin.pluck, 0), sin.collect, ] good_kwargs = dict(stream_name=test_name) bad_kwargs = dict(foo="bar") for s in streams: # try good kwargs sout = s(**good_kwargs) assert sout.name == test_name del sout with pytest.raises(TypeError): sout = s(**bad_kwargs) sin.emit(1) # need a second emit for accumulate sin.emit(1) del sout # verify that sout is properly deleted each time by emitting once into sin # and not getting TypeError # garbage collect and then try import gc gc.collect() sin.emit(1) @pytest.fixture def thread(loop): # noqa: F811 from threading import Thread, Event thread = Thread(target=loop.start) thread.daemon = True thread.start() event = Event() loop.add_callback(event.set) event.wait() return thread def test_percolate_loop_information(clean): # noqa: F811 source = Stream() assert not source.loop s = source.timed_window(0.5) assert source.loop is s.loop def test_separate_thread_without_time(loop, thread): # noqa: F811 assert thread.is_alive() source = Stream(loop=loop) L = source.map(inc).sink_to_list() for i in range(10): source.emit(i) assert L[-1] == i + 1 def test_separate_thread_with_time(clean): # noqa: F811 L = [] @gen.coroutine def slow_write(x): yield gen.sleep(0.1) L.append(x) source = Stream(asynchronous=False) source.map(inc).sink(slow_write) start = time() source.emit(1) stop = time() assert stop - start > 0.1 assert L == [2] def test_execution_order(): L = [] for i in range(5): s = Stream() b = s.pluck(1) a = s.pluck(0) li = a.combine_latest(b, emit_on=a).sink_to_list() z = [(1, 'red'), (2, 'blue'), (3, 'green')] for zz in z: s.emit(zz) L.append((li, )) for ll in L: assert ll == L[0] L2 = [] for i in range(5): s = Stream() a = s.pluck(0) b = s.pluck(1) li = a.combine_latest(b, emit_on=a).sink_to_list() z = [(1, 'red'), (2, 'blue'), (3, 'green')] for zz in z: s.emit(zz) L2.append((li,)) for ll, ll2 in zip(L, L2): assert ll2 == L2[0] assert ll != ll2 @gen_test() def test_map_errors_log(): a = Stream(asynchronous=True) b = a.delay(0.001).map(lambda x: 1 / x) # noqa: F841 with captured_logger('streamz') as logger: a._emit(0) yield gen.sleep(0.1) out = logger.getvalue() assert 
'ZeroDivisionError' in out def test_map_errors_raises(): a = Stream() b = a.map(lambda x: 1 / x) # noqa: F841 with pytest.raises(ZeroDivisionError): a.emit(0) @gen_test() def test_accumulate_errors_log(): a = Stream(asynchronous=True) b = a.delay(0.001).accumulate(lambda x, y: x / y, with_state=True) # noqa: F841 with captured_logger('streamz') as logger: a._emit(1) a._emit(0) yield gen.sleep(0.1) out = logger.getvalue() assert 'ZeroDivisionError' in out def test_accumulate_errors_raises(): a = Stream() b = a.accumulate(lambda x, y: x / y, with_state=True) # noqa: F841 with pytest.raises(ZeroDivisionError): a.emit(1) a.emit(0) @gen_test() def test_sync_in_event_loop(): a = Stream() assert not a.asynchronous L = a.timed_window(0.01).sink_to_list() sleep(0.05) assert L assert a.loop assert a.loop is not IOLoop.current() def test_share_common_ioloop(clean): # noqa: F811 a = Stream() b = Stream() aa = a.timed_window(0.01) bb = b.timed_window(0.01) assert aa.loop is bb.loop @pytest.mark.parametrize('data', [ [[], [0, 1, 2, 3, 4, 5]], [[None, None, None], [0, 1, 2, 3, 4, 5]], [[1, None, None], [1, 2, 3, 4, 5]], [[None, 4, None], [0, 1, 2, 3]], [[None, 4, 2], [0, 2]], [[3, 1, None], []] ]) def test_slice(data): pars, expected = data a = Stream() b = a.slice(*pars) out = b.sink_to_list() for i in range(6): a.emit(i) assert out == expected def test_slice_err(): a = Stream() with pytest.raises(ValueError): a.slice(end=-1) def test_start(): flag = [] class MySource(Stream): def start(self): flag.append(True) s = MySource().map(inc) s.start() assert flag == [True] def test_connect_zip(): a = Stream() b = Stream() c = Stream() x = a.zip(b) L = x.sink_to_list() c.connect(x) a.emit(1) b.emit(1) assert not L c.emit(1) assert L == [(1, 1, 1)] def test_disconnect_zip(): a = Stream() b = Stream() c = Stream() x = a.zip(b, c) L = x.sink_to_list() b.disconnect(x) a.emit(1) b.emit(1) assert not L c.emit(1) assert L == [(1, 1)] def test_connect_combine_latest(): a = Stream() b = Stream() c = Stream() x = a.combine_latest(b, emit_on=a) L = x.sink_to_list() c.connect(x) b.emit(1) c.emit(1) a.emit(1) assert L == [(1, 1, 1)] def test_connect_discombine_latest(): a = Stream() b = Stream() c = Stream() x = a.combine_latest(b, c, emit_on=a) L = x.sink_to_list() c.disconnect(x) b.emit(1) c.emit(1) a.emit(1) assert L == [(1, 1)] if sys.version_info >= (3, 5): from streamz.tests.py3_test_core import * # noqa def test_buffer_after_partition(): Stream().partition(1).buffer(1) def test_buffer_after_timed_window(): Stream().timed_window(1).buffer(1) def test_buffer_after_sliding_window(): Stream().sliding_window(1).buffer(1) def test_backpressure_connect_empty_stream(): @Stream.register_api() class from_list(Stream): def __init__(self, source, **kwargs): self.source = source super().__init__(ensure_io_loop=True, **kwargs) def start(self): self.stopped = False self.loop.add_callback(self.run) @gen.coroutine def run(self): while not self.stopped and len(self.source) > 0: yield self._emit(self.source.pop(0)) source_list = [0, 1, 2, 3, 4] source = Stream.from_list(source_list) sout = Stream() L = sout.rate_limit(1).sink_to_list() source.connect(sout) source.start() wait_for(lambda: L == [0], 0.01) assert len(source_list) > 0 streamz-0.6.4/streamz/tests/test_batch.py0000644000175000017500000000275714270277270020100 0ustar nileshnileshimport pytest import toolz from streamz.batch import Batch, Streaming from streamz.utils_test import inc def test_core(): a = Batch() b = a.pluck('x').map(inc) c = b.sum() L = c.stream.sink_to_list() 
a.emit([{'x': i, 'y': 0} for i in range(4)]) assert isinstance(b, Batch) assert isinstance(c, Streaming) assert L == [1 + 2 + 3 + 4] def test_dataframes(): pd = pytest.importorskip('pandas') from streamz.dataframe import DataFrame data = [{'x': i, 'y': 2 * i} for i in range(10)] s = Batch(example=[{'x': 0, 'y': 0}]) sdf = s.map(lambda d: toolz.assoc(d, 'z', d['x'] + d['y'])).to_dataframe() assert isinstance(sdf, DataFrame) L = sdf.stream.sink_to_list() for batch in toolz.partition_all(3, data): s.emit(batch) result = pd.concat(L) assert result.z.tolist() == [3 * i for i in range(10)] def test_periodic_dataframes(): pd = pytest.importorskip('pandas') from streamz.dataframe import PeriodicDataFrame from streamz.dataframe.core import random_datapoint df = random_datapoint(now=pd.Timestamp.now()) assert len(df) == 1 def callback(now, **kwargs): return pd.DataFrame(dict(x=50, index=[now])) df = PeriodicDataFrame(callback, interval='20ms') assert df.tail(0).x == 50 df.stop() def test_filter(): a = Batch() f = a.filter(lambda x: x % 2 == 0) s = f.to_stream() L = s.sink_to_list() a.emit([1, 2, 3, 4]) a.emit([5, 6]) assert L == [2, 4, 6] streamz-0.6.4/streamz/tests/py3_test_core.py0000644000175000017500000000110014270277270020517 0ustar nileshnilesh# flake8: noqa from time import time from distributed.utils_test import loop, inc # noqa from tornado import gen from streamz import Stream def test_await_syntax(loop): # noqa L = [] async def write(x): await gen.sleep(0.1) L.append(x) async def f(): source = Stream(asynchronous=True) source.map(inc).buffer(3).sink(write) start = time() for x in range(6): await source.emit(x) stop = time() assert 0.2 < stop - start < 0.4 assert 2 <= len(L) <= 4 loop.run_sync(f) streamz-0.6.4/streamz/tests/__init__.py0000644000175000017500000000000014270277270017472 0ustar nileshnileshstreamz-0.6.4/streamz/sources.py0000644000175000017500000007560414270277270016302 0ustar nileshnileshimport asyncio from glob import glob import queue import os import time from tornado import gen import weakref from .core import Stream, convert_interval, RefCounter, sync def sink_to_file(filename, upstream, mode='w', prefix='', suffix='\n', flush=False): file = open(filename, mode=mode) def write(text): file.write(prefix + text + suffix) if flush: file.flush() upstream.sink(write) return file class Source(Stream): """Start node for a set of Streams Source nodes emit data into other nodes. They typically get this data by polling external sources, and are necessarily run by an event loop. Parameters ---------- start: bool Whether to call the run method immediately. If False, nothing will happen until ``source.start()`` is called. """ _graphviz_shape = 'doubleoctagon' def __init__(self, start=False, **kwargs): self.stopped = True super().__init__(ensure_io_loop=True, **kwargs) self.started = False if start: self.start() def stop(self): """set self.stopped, which will cause polling to stop after next run""" if not self.stopped: self.stopped = True def start(self): """start polling If already running, this has no effect. If the source was started and then stopped again, this will restart the ``self.run`` coroutine. """ if self.stopped: self.stopped = False self.started = True self.loop.add_callback(self.run) async def run(self): """This coroutine will be invoked by start() and emit all data You might either overrive ``_run()`` when all logic can be contained there, or override this method directly. 
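A minimal, illustrative sketch of the ``_run()`` route (the ``CounterSource``
name and its one-second interval are made up for this example and are not
part of streamz):

>>> class CounterSource(Source):  # doctest: +SKIP
...     def __init__(self, **kwargs):
...         self.i = 0
...         super().__init__(**kwargs)
...     async def _run(self):
...         # emit a counter value, then pause before the next cycle
...         self.i += 1
...         await asyncio.gather(*self._emit(self.i))
...         await asyncio.sleep(1)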
Note the use of ``.stopped`` to halt the coroutine, whether or not """ while not self.stopped: await self._run() async def _run(self): """This is the functionality to run on each cycle Typically this may be used for polling some external IO source or time-based data emission. You might choose to include an ``await asyncio.sleep()`` for the latter. """ raise NotImplementedError @Stream.register_api(staticmethod) class from_periodic(Source): """Generate data from a function on given period cf ``streamz.dataframe.PeriodicDataFrame`` Parameters ---------- callback: callable Function to call on each iteration. Takes no arguments. poll_interval: float Time to sleep between calls (s) """ def __init__(self, callback, poll_interval=0.1, **kwargs): self._cb = callback self._poll = poll_interval super().__init__(**kwargs) async def _run(self): await asyncio.gather(*self._emit(self._cb())) await asyncio.sleep(self._poll) def PeriodicCallback(callback, callback_time, asynchronous=False, **kwargs): # pragma: no cover """For backward compatibility - please use Stream.from_periodic""" if kwargs: callback = lambda: callback(**kwargs) return Stream.from_periodic(callback, callback_time, asynchronous=asynchronous) @Stream.register_api(staticmethod) class from_textfile(Source): """ Stream data from a text file Parameters ---------- f: file or string Source of the data. If string, will be opened. poll_interval: Number Interval to poll file for new data in seconds delimiter: str Character(s) to use to split the data into parts start: bool Whether to start running immediately; otherwise call stream.start() explicitly. from_end: bool Whether to begin streaming from the end of the file (i.e., only emit lines appended after the stream starts). Examples -------- >>> source = Stream.from_textfile('myfile.json') # doctest: +SKIP >>> source.map(json.loads).pluck('value').sum().sink(print) # doctest: +SKIP >>> source.start() # doctest: +SKIP Returns ------- Stream """ def __init__(self, f, poll_interval=0.100, delimiter='\n', from_end=False, **kwargs): if isinstance(f, str): f = open(f) self.buffer = '' self.file = f self.from_end = from_end if self.from_end: # this only happens when we are ready to read self.file.seek(0, 2) self.delimiter = delimiter self.poll_interval = poll_interval super().__init__(**kwargs) async def _run(self): line = self.file.read() if line: self.buffer = self.buffer + line if self.delimiter in self.buffer: parts = self.buffer.split(self.delimiter) self.buffer = parts.pop(-1) for part in parts: await asyncio.gather(*self._emit(part + self.delimiter)) else: await asyncio.sleep(self.poll_interval) @Stream.register_api(staticmethod) class filenames(Source): """ Stream over filenames in a directory Parameters ---------- path: string Directory path or globstring over which to search for files poll_interval: Number Seconds between checking path start: bool (False) Whether to start running immediately; otherwise call stream.start() explicitly. 
Examples -------- >>> source = Stream.filenames('path/to/dir') # doctest: +SKIP >>> source = Stream.filenames('path/to/*.csv', poll_interval=0.500) # doctest: +SKIP """ def __init__(self, path, poll_interval=0.100, **kwargs): if '*' not in path: if os.path.isdir(path): if not path.endswith(os.path.sep): path = path + '/' path = path + '*' self.path = path self.seen = set() self.poll_interval = poll_interval super().__init__(**kwargs) async def _run(self): filenames = set(glob(self.path)) new = filenames - self.seen for fn in sorted(new): self.seen.add(fn) await asyncio.gather(*self._emit(fn)) await asyncio.sleep(self.poll_interval) # TODO: remove poll if delayed @Stream.register_api(staticmethod) class from_tcp(Source): """ Creates events by reading from a socket using tornado TCPServer The stream of incoming bytes is split on a given delimiter, and the parts become the emitted events. Parameters ---------- port : int The port to open and listen on. It only gets opened when the source is started, and closed upon ``stop()`` delimiter : bytes The incoming data will be split on this value. The resulting events will still have the delimiter at the end. start : bool Whether to immediately initiate the source. You probably want to set up downstream nodes first. server_kwargs : dict or None If given, additional arguments to pass to TCPServer Examples -------- >>> source = Source.from_tcp(4567) # doctest: +SKIP """ def __init__(self, port, delimiter=b'\n', server_kwargs=None, **kwargs): self.server_kwargs = server_kwargs or {} self.port = port self.server = None self.delimiter = delimiter super().__init__(**kwargs) def run(self): from tornado.tcpserver import TCPServer from tornado.iostream import StreamClosedError class EmitServer(TCPServer): source = self async def handle_stream(self, stream, address): while not self.source.stopped: try: data = await stream.read_until(self.source.delimiter) await self.source._emit(data) except StreamClosedError: break self.server = EmitServer(**self.server_kwargs) self.server.listen(self.port) def stop(self): if not self.stopped: self.server.stop() self.server = None self.stopped = True @Stream.register_api(staticmethod) class from_http_server(Source): """Listen for HTTP POSTs on given port Each connection will emit one event, containing the body data of the request Parameters ---------- port : int The port to listen on path : str Specific path to listen on. Can be regex, but content is not used. start : bool Whether to immediately startup the server. Usually you want to connect downstream nodes first, and then call ``.start()``. 
server_kwargs : dict or None If given, set of further parameters to pass on to HTTPServer Examples -------- >>> source = Source.from_http_server(4567) # doctest: +SKIP """ def __init__(self, port, path='/.*', server_kwargs=None, **kwargs): self.port = port self.path = path self.server_kwargs = server_kwargs or {} self.server = None super().__init__(**kwargs) def run(self): from tornado.web import Application, RequestHandler from tornado.httpserver import HTTPServer class Handler(RequestHandler): source = self async def post(self): await asyncio.gather(*self.source._emit(self.request.body)) self.write('OK') application = Application([ (self.path, Handler), ]) server = HTTPServer(application, **self.server_kwargs) server.listen(self.port) self.server = server def stop(self): """Shutdown HTTP server""" if not self.stopped: self.server.stop() self.server = None self.stopped = True @Stream.register_api(staticmethod) class from_process(Source): """Messages from a running external process This doesn't work on Windows Parameters ---------- cmd : list of str or str Command to run: program name, followed by arguments open_kwargs : dict To pass on the the process open function, see ``subprocess.Popen``. with_stderr : bool Whether to include the process STDERR in the stream start : bool Whether to immediately startup the process. Usually you want to connect downstream nodes first, and then call ``.start()``. Example ------- >>> source = Source.from_process(['ping', 'localhost']) # doctest: +SKIP """ def __init__(self, cmd, open_kwargs=None, with_stderr=False, with_end=True, **kwargs): self.cmd = cmd self.open_kwargs = open_kwargs or {} self.with_stderr = with_stderr self.with_end = with_end self.process = None super().__init__(**kwargs) async def run(self): import shlex import subprocess stderr = subprocess.STDOUT if self.with_stderr else None if isinstance(self.cmd, (list, tuple)): cmd, *args = self.cmd else: cmd, *args = shlex.split(self.cmd) process = await asyncio.create_subprocess_exec( cmd, *args, stdout=subprocess.PIPE, stderr=stderr, **self.open_kwargs) while not self.stopped: try: out = await process.stdout.readuntil(b'\n') except asyncio.IncompleteReadError as err: if self.with_end and err.partial: out = err.partial else: break if process.returncode is not None: self.stopped = True await asyncio.gather(*self._emit(out)) if process.returncode is not None: process.terminate() await process.wait() @Stream.register_api(staticmethod) class from_kafka(Source): """ Accepts messages from Kafka Uses the confluent-kafka library, https://docs.confluent.io/current/clients/confluent-kafka-python/ Parameters ---------- topics: list of str Labels of Kafka topics to consume from consumer_params: dict Settings to set up the stream, see https://docs.confluent.io/current/clients/confluent-kafka-python/#configuration https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md Examples: bootstrap.servers, Connection string(s) (host:port) by which to reach Kafka; group.id, Identity of the consumer. If multiple sources share the same group, each message will be passed to only one of them. poll_interval: number Seconds that elapse between polling Kafka for new messages start: bool (False) Whether to start polling upon instantiation Examples -------- >>> source = Stream.from_kafka(['mytopic'], ... {'bootstrap.servers': 'localhost:9092', ... 
'group.id': 'streamz'}) # doctest: +SKIP """ def __init__(self, topics, consumer_params, poll_interval=0.1, **kwargs): self.cpars = consumer_params self.consumer = None self.topics = topics self.poll_interval = poll_interval super().__init__(**kwargs) def do_poll(self): if self.consumer is not None: msg = self.consumer.poll(0) if msg and msg.value() and msg.error() is None: return msg.value() @gen.coroutine def poll_kafka(self): while True: val = self.do_poll() if val: yield self._emit(val) else: yield gen.sleep(self.poll_interval) if self.stopped: break self._close_consumer() def start(self): import confluent_kafka as ck if self.stopped: self.stopped = False self.consumer = ck.Consumer(self.cpars) self.consumer.subscribe(self.topics) weakref.finalize( self, lambda consumer=self.consumer: _close_consumer(consumer) ) tp = ck.TopicPartition(self.topics[0], 0, 0) # blocks for consumer thread to come up and invoke poll to # establish connection with broker to fetch oauth token for kafka self.consumer.poll(timeout=1) self.consumer.get_watermark_offsets(tp) self.loop.add_callback(self.poll_kafka) def _close_consumer(self): if self.consumer is not None: consumer = self.consumer self.consumer = None consumer.unsubscribe() consumer.close() self.stopped = True def _close_consumer(consumer): try: consumer.close() except RuntimeError: pass class FromKafkaBatched(Source): """Base class for both local and cluster-based batched kafka processing""" def __init__(self, topic, consumer_params, poll_interval='1s', npartitions=None, refresh_partitions=False, max_batch_size=10000, keys=False, engine=None, **kwargs): self.consumer_params = consumer_params # Override the auto-commit config to enforce custom streamz # checkpointing self.consumer_params['enable.auto.commit'] = 'false' if 'auto.offset.reset' not in self.consumer_params.keys(): consumer_params['auto.offset.reset'] = 'latest' self.topic = topic self.npartitions = npartitions self.refresh_partitions = refresh_partitions if self.npartitions is not None and self.npartitions <= 0: raise ValueError("Number of Kafka topic partitions must be > 0.") self.poll_interval = convert_interval(poll_interval) self.max_batch_size = max_batch_size self.keys = keys self.engine = engine self.started = False super().__init__(**kwargs) @gen.coroutine def poll_kafka(self): import confluent_kafka as ck def commit(_part): topic, part_no, _, _, offset = _part[1:] _tp = ck.TopicPartition(topic, part_no, offset + 1) self.consumer.commit(offsets=[_tp], asynchronous=True) @gen.coroutine def checkpoint_emit(_part): ref = RefCounter(cb=lambda: commit(_part), loop=self.loop) yield self._emit(_part, metadata=[{'ref': ref}]) if self.npartitions is None: kafka_cluster_metadata = self.consumer.list_topics(self.topic) if self.engine == "cudf": # pragma: no cover self.npartitions = len(kafka_cluster_metadata[self.topic.encode('utf-8')]) else: self.npartitions = len(kafka_cluster_metadata.topics[self.topic].partitions) self.positions = [0] * self.npartitions tps = [] for partition in range(self.npartitions): tps.append(ck.TopicPartition(self.topic, partition)) while True: try: committed = self.consumer.committed(tps, timeout=1) except ck.KafkaException: pass else: for tp in committed: self.positions[tp.partition] = tp.offset break while not self.stopped: out = [] if self.refresh_partitions: kafka_cluster_metadata = self.consumer.list_topics(self.topic) if self.engine == "cudf": # pragma: no cover new_partitions = len(kafka_cluster_metadata[self.topic.encode('utf-8')]) else: new_partitions = 
len(kafka_cluster_metadata.topics[self.topic].partitions) if new_partitions > self.npartitions: self.positions.extend([-1001] * (new_partitions - self.npartitions)) self.npartitions = new_partitions for partition in range(self.npartitions): tp = ck.TopicPartition(self.topic, partition, 0) try: low, high = self.consumer.get_watermark_offsets( tp, timeout=0.1) except (RuntimeError, ck.KafkaException): continue self.started = True if 'auto.offset.reset' in self.consumer_params.keys(): if self.consumer_params['auto.offset.reset'] == 'latest' and \ self.positions[partition] == -1001: self.positions[partition] = high current_position = self.positions[partition] lowest = max(current_position, low) if high > lowest + self.max_batch_size: high = lowest + self.max_batch_size if high > lowest: out.append((self.consumer_params, self.topic, partition, self.keys, lowest, high - 1)) self.positions[partition] = high self.consumer_params['auto.offset.reset'] = 'earliest' for part in out: yield self.loop.add_callback(checkpoint_emit, part) else: yield gen.sleep(self.poll_interval) def start(self): import confluent_kafka as ck if self.engine == "cudf": # pragma: no cover from custreamz import kafka if self.stopped: if self.engine == "cudf": # pragma: no cover self.consumer = kafka.Consumer(self.consumer_params) else: self.consumer = ck.Consumer(self.consumer_params) weakref.finalize(self, lambda consumer=self.consumer: _close_consumer(consumer)) self.stopped = False tp = ck.TopicPartition(self.topic, 0, 0) # blocks for consumer thread to come up and invoke poll to establish # connection with broker to fetch oauth token for kafka self.consumer.poll(timeout=1) self.consumer.get_watermark_offsets(tp) self.loop.add_callback(self.poll_kafka) @Stream.register_api(staticmethod) def from_kafka_batched(topic, consumer_params, poll_interval='1s', npartitions=None, refresh_partitions=False, start=False, dask=False, max_batch_size=10000, keys=False, engine=None, **kwargs): """ Get messages and keys (optional) from Kafka in batches Uses the confluent-kafka library, https://docs.confluent.io/current/clients/confluent-kafka-python/ This source will emit lists of messages for each partition of a single given topic per time interval, if there is new data. If using dask, one future will be produced per partition per time-step, if there is data. Checkpointing is achieved through the use of reference counting. A reference counter is emitted downstream for each batch of data. A callback is triggered when the reference count reaches zero and the offsets are committed back to Kafka. Upon the start of this function, the previously committed offsets will be fetched from Kafka and begin reading form there. This will guarantee at-least-once semantics. Parameters ---------- topic: str Kafka topic to consume from consumer_params: dict | Settings to set up the stream, see | https://docs.confluent.io/current/clients/confluent-kafka-python/#configuration | https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md | Examples: | bootstrap.servers: Connection string(s) (host:port) by which to reach Kafka | group.id: Identity of the consumer. If multiple sources share the same | group, each message will be passed to only one of them. poll_interval: number Seconds that elapse between polling Kafka for new messages npartitions: int (None) | Number of partitions in the topic. | If None, streamz will poll Kafka to get the number of partitions. 
refresh_partitions: bool (False) | Useful if the user expects to increase the number of topic partitions on the | fly, maybe to handle spikes in load. Streamz polls Kafka in every batch to | determine the current number of partitions. If partitions have been added, | streamz will automatically start reading data from the new partitions as well. | If set to False, streamz will not accommodate adding partitions on the fly. | It is recommended to restart the stream after decreasing the number of partitions. start: bool (False) Whether to start polling upon instantiation max_batch_size: int The maximum number of messages per partition to be consumed per batch keys: bool (False) | Whether to extract keys along with the messages. | If True, this will yield each message as a dict: | {'key':msg.key(), 'value':msg.value()} engine: str (None) | If engine is set to "cudf", streamz reads data (messages must be JSON) | from Kafka in an accelerated manner directly into cuDF (GPU) dataframes. | This is done using the RAPIDS custreamz library. | Please refer to RAPIDS cudf API here: | https://docs.rapids.ai/api/cudf/stable/ | Folks interested in trying out custreamz would benefit from this | accelerated Kafka reader. If one does not want to use GPUs, they | can use streamz as is, with the default engine=None. | To use this option, one must install custreamz (use the | appropriate CUDA version recipe & Python version) | using a command like the one below, which will install all | GPU dependencies and streamz itself: | conda install -c rapidsai-nightly -c nvidia -c conda-forge \ | -c defaults custreamz=0.15 python=3.7 cudatoolkit=10.2 | More information at: https://rapids.ai/start.html Important Kafka Configurations By default, a stream will start reading from the latest offsets available. Please set 'auto.offset.reset': 'earliest' in the consumer configs, if the stream needs to start processing from the earliest offsets. Examples ---------- >>> source = Stream.from_kafka_batched('mytopic', ... {'bootstrap.servers': 'localhost:9092', ... 'group.id': 'streamz'}) # doctest: +SKIP """ if dask: from distributed.client import default_client kwargs['loop'] = default_client().loop source = FromKafkaBatched(topic, consumer_params, poll_interval=poll_interval, npartitions=npartitions, refresh_partitions=refresh_partitions, max_batch_size=max_batch_size, keys=keys, engine=engine, **kwargs) if dask: source = source.scatter() if start: source.start() if engine == "cudf": # pragma: no cover return source.starmap(get_message_batch_cudf) else: return source.starmap(get_message_batch) def get_message_batch(kafka_params, topic, partition, keys, low, high, timeout=None): """Fetch a batch of kafka messages (keys & values) in given topic/partition This will block until messages are available, or timeout is reached. 
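A hedged sketch of calling this helper directly; the broker address, group id,
topic name and offset range below are placeholders, not defaults:

>>> batch = get_message_batch({'bootstrap.servers': 'localhost:9092',
...                            'group.id': 'streamz'},
...                           'mytopic', 0, False, 0, 9)  # doctest: +SKIP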
""" import confluent_kafka as ck t0 = time.time() consumer = ck.Consumer(kafka_params) tp = ck.TopicPartition(topic, partition, low) consumer.assign([tp]) out = [] try: while True: msg = consumer.poll(0) if msg and msg.value() and msg.error() is None: if high >= msg.offset(): if keys: out.append({'key':msg.key(), 'value':msg.value()}) else: out.append(msg.value()) if high <= msg.offset(): break else: time.sleep(0.1) if timeout is not None and time.time() - t0 > timeout: break finally: consumer.close() return out def get_message_batch_cudf(kafka_params, topic, partition, keys, low, high, timeout=None): # pragma: no cover """ Fetch a batch of kafka messages (currently, messages must be in JSON format) in given topic/partition as a cudf dataframe """ from custreamz import kafka consumer = kafka.Consumer(kafka_params) gdf = None try: gdf = consumer.read_gdf(topic=topic, partition=partition, lines=True, start=low, end=high + 1) finally: consumer.close() return gdf @Stream.register_api(staticmethod) class from_iterable(Source): """ Emits items from an iterable. Parameters ---------- iterable: iterable An iterable to emit messages from. Examples -------- >>> source = Stream.from_iterable(range(3)) >>> L = source.sink_to_list() >>> source.start() >>> L [0, 1, 2] """ def __init__(self, iterable, **kwargs): self._iterable = iterable super().__init__(**kwargs) async def run(self): for x in self._iterable: if self.stopped: break await asyncio.gather(*self._emit(x)) self.stopped = True @Stream.register_api() class from_websocket(Source): """Read binary data from a websocket This source will accept connections on a given port and handle messages coming in. The websockets library must be installed. :param host: str Typically "localhost" :param port: int Which port to listen on (must be available) :param serve_kwargs: dict Passed to ``websockets.serve`` :param kwargs: Passed to superclass """ def __init__(self, host, port, serve_kwargs=None, **kwargs): self.host = host self.port = port self.s_kw = serve_kwargs self.server = None super().__init__(**kwargs) @gen.coroutine def _read(self, ws, path): while not self.stopped: data = yield ws.recv() yield self._emit(data) async def run(self): import websockets self.server = await websockets.serve( self._read, self.host, self.port, **(self.s_kw or {}) ) def stop(self): self.server.close() sync(self.loop, self.server.wait_closed) @Stream.register_api() class from_q(Source): """Source events from a threading.Queue, running another event framework The queue is polled, i.e., there is a latency/overhead tradeoff, since we cannot use ``await`` directly with a multithreaded queue. Allows mixing of another event loop, for example pyqt, on another thread. Note that, by default, a streamz.Source such as this one will start an event loop in a new thread, unless otherwise specified. """ def __init__(self, q, sleep_time=0.01, **kwargs): """ :param q: threading.Queue Any items pushed into here will become streamz events :param sleep_time: int Sets how long we wait before checking the input queue when empty (in s) :param kwargs: passed to streamz.Source """ self.q = q self.sleep = sleep_time super().__init__(**kwargs) async def _run(self): """Poll threading queue for events This uses check-and-wait, but overhead is low. Could maybe have a sleep-free version with an threading.Event. 
""" try: out = self.q.get_nowait() await self.emit(out, asynchronous=True) except queue.Empty: await asyncio.sleep(self.sleep) @Stream.register_api() class from_mqtt(from_q): """Read from MQTT source See https://en.wikipedia.org/wiki/MQTT for a description of the protocol and its uses. See also ``sinks.to_mqtt``. Requires ``paho.mqtt`` The outputs are ``paho.mqtt.client.MQTTMessage`` instances, which each have attributes timestamp, payload, topic, ... NB: paho.mqtt.python runs on its own thread in this implementation. We may wish to instead call client.loop() directly :param host: str :param port: int :param topic: str (May in the future support a list of topics) :param keepalive: int See mqtt docs - to keep the channel alive :param client_kwargs: Passed to the client's ``connect()`` method """ def __init__(self, host, port, topic, keepalive=60 , client_kwargs=None, **kwargs): self.host = host self.port = port self.keepalive = keepalive self.topic = topic self.client_kwargs = client_kwargs super().__init__(q=queue.Queue(), **kwargs) def _on_connect(self, client, userdata, flags, rc): client.subscribe(self.topic) def _on_message(self, client, userdata, msg): self.q.put(msg) async def run(self): import paho.mqtt.client as mqtt client = mqtt.Client() client.on_connect = self._on_connect client.on_message = self._on_message client.connect(self.host, self.port, self.keepalive, **(self.client_kwargs or {})) client.loop_start() await super().run() client.disconnect() streamz-0.6.4/streamz/sinks.py0000644000175000017500000001755114270277270015743 0ustar nileshnileshimport inspect import weakref from tornado import gen from streamz import Stream from streamz.core import sync # sinks add themselves here to avoid being garbage-collected _global_sinks = set() class Sink(Stream): _graphviz_shape = 'trapezium' def __init__(self, upstream, **kwargs): super().__init__(upstream, **kwargs) _global_sinks.add(self) def destroy(self): super().destroy() _global_sinks.remove(self) @Stream.register_api() class sink(Sink): """ Apply a function on every element Parameters ---------- func: callable A function that will be applied on every element. args: Positional arguments that will be passed to ``func`` after the incoming element. kwargs: Stream-specific arguments will be passed to ``Stream.__init__``, the rest of them will be passed to ``func``. Examples -------- >>> source = Stream() >>> L = list() >>> source.sink(L.append) >>> source.sink(print) >>> source.sink(print) >>> source.emit(123) 123 123 >>> L [123] See Also -------- map Stream.sink_to_list """ def __init__(self, upstream, func, *args, **kwargs): self.func = func # take the stream specific kwargs out sig = set(inspect.signature(Stream).parameters) stream_kwargs = {k: v for (k, v) in kwargs.items() if k in sig} self.kwargs = {k: v for (k, v) in kwargs.items() if k not in sig} self.args = args super().__init__(upstream, **stream_kwargs) def update(self, x, who=None, metadata=None): result = self.func(x, *self.args, **self.kwargs) if gen.isawaitable(result): return result else: return [] @Stream.register_api() class sink_to_textfile(Sink): """ Write elements to a plain text file, one element per line. Type of elements must be ``str``. Parameters ---------- file: str or file-like File to write the elements to. ``str`` is treated as a file name to open. If file-like, descriptor must be open in text mode. Note that the file descriptor will be closed when this sink is destroyed. end: str, optional This value will be written to the file after each element. 
Defaults to newline character. mode: str, optional If file is ``str``, file will be opened in this mode. Defaults to ``"a"`` (append mode). Examples -------- >>> source = Stream() >>> source.map(str).sink_to_textfile("test.txt") >>> source.emit(0) >>> source.emit(1) >>> print(open("test.txt", "r").read()) 0 1 """ def __init__(self, upstream, file, end="\n", mode="a", **kwargs): self._end = end self._fp = open(file, mode=mode) if isinstance(file, str) else file weakref.finalize(self, self._fp.close) super().__init__(upstream, **kwargs) def update(self, x, who=None, metadata=None): self._fp.write(x + self._end) @Stream.register_api() class to_kafka(Stream): """ Writes data in the stream to Kafka This stream accepts a string or bytes object. Call ``flush`` to ensure all messages are pushed. Responses from Kafka are pushed downstream. Parameters ---------- topic : string The topic which to write producer_config : dict Settings to set up the stream, see https://docs.confluent.io/current/clients/confluent-kafka-python/#configuration https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md Examples: bootstrap.servers: Connection string (host:port) to Kafka Examples -------- >>> from streamz import Stream >>> ARGS = {'bootstrap.servers': 'localhost:9092'} >>> source = Stream() >>> kafka = source.map(lambda x: str(x)).to_kafka('test', ARGS) >>> for i in range(10): ... source.emit(i) >>> kafka.flush() """ def __init__(self, upstream, topic, producer_config, **kwargs): import confluent_kafka as ck self.topic = topic self.producer = ck.Producer(producer_config) kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) self.stopped = False self.polltime = 0.2 self.loop.add_callback(self.poll) self.futures = [] @gen.coroutine def poll(self): while not self.stopped: # executes callbacks for any delivered data, in this thread # if no messages were sent, nothing happens self.producer.poll(0) yield gen.sleep(self.polltime) def update(self, x, who=None, metadata=None): future = gen.Future() self.futures.append(future) @gen.coroutine def _(): while True: try: # this runs asynchronously, in C-K's thread self.producer.produce(self.topic, x, callback=self.cb) return except BufferError: yield gen.sleep(self.polltime) except Exception as e: future.set_exception(e) return self.loop.add_callback(_) return future @gen.coroutine def cb(self, err, msg): future = self.futures.pop(0) if msg is not None and msg.value() is not None: future.set_result(None) yield self._emit(msg.value()) else: future.set_exception(err or msg.error()) def flush(self, timeout=-1): self.producer.flush(timeout) @Stream.register_api() class to_websocket(Sink): """Write bytes data to websocket The websocket will be opened on first call, and kept open. Should it close at some point, future writes will fail. Requires the ``websockets`` package. :param uri: str Something like "ws://host:port". Use "wss:" to allow TLS. :param ws_kwargs: dict Further kwargs to pass to ``websockets.connect``, please read its documentation. 
:param kwargs: Passed to superclass """ def __init__(self, upstream, uri, ws_kwargs=None, **kwargs): self.uri = uri self.ws_kw = ws_kwargs self.ws = None super().__init__(upstream, ensure_io_loop=True, **kwargs) async def update(self, x, who=None, metadata=None): import websockets if self.ws is None: self.ws = await websockets.connect(self.uri, **(self.ws_kw or {})) await self.ws.send(x) def destroy(self): super().destroy() if self.ws is not None: sync(self.loop, self.ws.protocol.close) self.ws = None @Stream.register_api() class to_mqtt(Sink): """ Send data to MQTT broker See also ``sources.from_mqtt``. Requires ``paho.mqtt`` :param host: str :param port: int :param topic: str :param keepalive: int See mqtt docs - to keep the channel alive :param client_kwargs: Passed to the client's ``connect()`` method """ def __init__(self, upstream, host, port, topic, keepalive=60, client_kwargs=None, **kwargs): self.host = host self.port = port self.c_kw = client_kwargs or {} self.client = None self.topic = topic self.keepalive = keepalive super().__init__(upstream, ensure_io_loop=True, **kwargs) def update(self, x, who=None, metadata=None): import paho.mqtt.client as mqtt if self.client is None: self.client = mqtt.Client() self.client.connect(self.host, self.port, self.keepalive, **self.c_kw) # TODO: wait on successful delivery self.client.publish(self.topic, x) def destroy(self): self.client.disconnect() self.client = None super().destroy() streamz-0.6.4/streamz/river.py0000644000175000017500000000345014270277270015734 0ustar nileshnileshfrom . import Stream # TODO: most river classes support batches, e.g., learn_many, more efficiently class RiverTransform(Stream): """Pass data through one or more River transforms""" def __init__(self, model, **kwargs): super().__init__(**kwargs) self.model = model def update(self, x, who=None, metadata=None): out = self.model.transform_one(*x) self.emit(out) class RiverTrain(Stream): def __init__(self, model, metric=None, pass_model=False, **kwargs): """ If metric and pass_model are both defaults, this is effectively a sink. :param model: river model or pipeline :param metric: river metric If given, it is emitted on every sample :param pass_model: bool If True, the (updated) model if emitted for each sample """ super().__init__(**kwargs) self.model = model if pass_model and metric is not None: raise TypeError self.pass_model = pass_model self.metric = metric def update(self, x, who=None, metadata=None): """ :param x: tuple (x, [y[, w]) floats for single sample. Include """ self.model.learn_one(*x) if self.metric: yp = self.model.predict_one(x[0]) weights = x[2] if len(x) > 1 else 1.0 return self._emit(self.metric.update(x[1], yp, weights).get(), metadata=metadata) if self.pass_model: return self._emit(self.model, metadata=metadata) class RiverPredict(Stream): def __init__(self, model, **kwargs): super().__init__(**kwargs) self.model = model def update(self, x, who=None, metadata=None): out = self.model.predict_one(x) return self._emit(out, metadata=metadata) streamz-0.6.4/streamz/plugins.py0000644000175000017500000000132614270277270016266 0ustar nileshnileshimport warnings import pkg_resources def try_register(cls, entry_point, *modifier): try: cls.register_plugin_entry_point(entry_point, *modifier) except ValueError: warnings.warn( f"Can't add {entry_point.name} from {entry_point.module_name}: " "name collision with existing stream node." 
) def load_plugins(cls): for entry_point in pkg_resources.iter_entry_points("streamz.sources"): try_register(cls, entry_point, staticmethod) for entry_point in pkg_resources.iter_entry_points("streamz.nodes"): try_register(cls, entry_point) for entry_point in pkg_resources.iter_entry_points("streamz.sinks"): try_register(cls, entry_point) streamz-0.6.4/streamz/orderedweakset.py0000644000175000017500000000172514270277270017620 0ustar nileshnilesh# -*- coding: utf8 -*- # This is a copy from Stack Overflow # https://stackoverflow.com/questions/7828444/indexable-weak-ordered-set-in-python # Asked by Neil G https://stackoverflow.com/users/99989/neil-g # Answered/edited by https://stackoverflow.com/users/1001643/raymond-hettinger import collections import weakref class OrderedSet(collections.abc.MutableSet): def __init__(self, values=()): self._od = collections.OrderedDict().fromkeys(values) def __len__(self): return len(self._od) def __iter__(self): return iter(self._od) def __contains__(self, value): return value in self._od def add(self, value): self._od[value] = None def discard(self, value): self._od.pop(value, None) class OrderedWeakrefSet(weakref.WeakSet): def __init__(self, values=()): super(OrderedWeakrefSet, self).__init__() self.data = OrderedSet() for elem in values: self.add(elem) streamz-0.6.4/streamz/graph.py0000644000175000017500000001615614270277270015715 0ustar nileshnilesh"""Graphing utilities for EventStreams""" from __future__ import absolute_import, division, print_function from functools import partial import os import re def _clean_text(text, match=None): ''' Clean text, remove forbidden characters. ''' # all non alpha numeric characters, except for _ and : # replace them with space # the + condenses a group of consecutive characters all into one space # (rather than assigning a space to each) if match is None: match = '[^a-zA-Z0-9_:]+' text = re.sub(match, ' ', text) # now replace the colon with semicolon text = re.sub(":", ";", text) return text def build_node_set(node, s=None): """Build a set of all the nodes in a streamz graph Parameters ---------- node : Stream The node to use as a starting point for building the set s : set or None The set to put the nodes into. 
If None return a new set full of nodes Returns ------- s : set The set of nodes in the graph """ if s is None: s = set() if node is None or ( node in s and all(n in s for n in node.upstreams) and all(n in s for n in node.downstreams) ): return new_nodes = {n for n in node.downstreams} new_nodes.update(node.upstreams) new_nodes.add(node) s.update(new_nodes) [build_node_set(n, s) for n in list(new_nodes)] return s def create_graph(node, graph): """Create networkx graph of the pipeline Parameters ---------- node : Stream The node to start from graph : networkx.DiGraph The graph to fill with nodes Returns ------- """ # Step 1 build a set of all the nodes node_set = build_node_set(node) # Step 2 for each node in the set add to the graph for n in node_set: t = hash(n) graph.add_node( t, label=_clean_text(str(n)), shape=n._graphviz_shape, orientation=str(n._graphviz_orientation), style=n._graphviz_style, fillcolor=n._graphviz_fillcolor, ) # Step 3 for each node establish its edges for n in node_set: t = hash(n) for nn in n.upstreams: tt = hash(nn) graph.add_edge(tt, t) downstreams = n.downstreams for i, nn in enumerate(downstreams): tt = hash(nn) if len(downstreams) > 1: graph.add_edge(t, tt, label=str(i)) else: graph.add_edge(t, tt) # Step 4 destroy set del node_set def readable_graph(graph): """Create human readable version of this object's task graph. Parameters ---------- graph: nx.DiGraph instance The networkx graph representing the pipeline """ import networkx as nx mapping = {k: "{}".format(graph.nodes[k]["label"]) for k in graph} idx_mapping = {} for k, v in mapping.items(): if v in idx_mapping.keys(): idx_mapping[v] += 1 mapping[k] += "-{}".format(idx_mapping[v]) else: idx_mapping[v] = 0 gg = {k: v for k, v in mapping.items()} rg = nx.relabel_nodes(graph, gg, copy=True) return rg def to_graphviz(graph, **graph_attr): import graphviz digraph_kwargs = {'name', 'comment', 'filename', 'format', 'engine', 'encoding', 'graph_attr', 'node_attr', 'edge_attr', 'body', 'strict', 'directory'} if not digraph_kwargs.intersection(graph_attr): graph_attr = dict(graph_attr=graph_attr) gvz = graphviz.Digraph(**graph_attr) for node, attrs in graph.nodes.items(): gvz.node(node, **attrs) for edge, attrs in graph.edges().items(): gvz.edge(edge[0], edge[1], **attrs) return gvz def visualize(node, filename="mystream.png", **kwargs): """ Render a task graph using dot. If `filename` is not None, write a file to disk with that name in the format specified by `format`. `filename` should not include an extension. Parameters ---------- node : Stream instance The stream to display. filename : str or None, optional The name (without an extension) of the file to write to disk. If `filename` is None, no file will be written, and we communicate with dot using only pipes. Default is 'mydask'. format : {'png', 'pdf', 'dot', 'svg', 'jpeg', 'jpg'}, optional Format in which to write output file. Default is 'png'. Returns ------- result : None or IPython.display.Image or IPython.display.SVG (See below.) Notes ----- If IPython is installed, we return an IPython.display object in the requested format. If IPython is not installed, we just return None. We always return None if format is 'pdf' or 'dot', because IPython can't display these formats natively. Passing these formats with filename=None will not produce any useful output. 
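A small usage sketch, assuming graphviz and networkx are installed (the
pipeline and file name are arbitrary):

>>> from streamz import Stream  # doctest: +SKIP
>>> source = Stream()
>>> source.map(lambda x: x + 1).sink(print)
>>> visualize(source, filename='mystream.png')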
See Also -------- streams.graph.readable_graph """ import networkx as nx nx_g = nx.DiGraph() create_graph(node, nx_g) rg = readable_graph(nx_g) g = to_graphviz(rg, **kwargs) fmts = [".png", ".pdf", ".dot", ".svg", ".jpeg", ".jpg"] if filename is None: format = "png" elif any(filename.lower().endswith(fmt) for fmt in fmts): filename, format = os.path.splitext(filename) format = format[1:].lower() else: format = "png" data = g.pipe(format=format) if not data: raise RuntimeError( "Graphviz failed to properly produce an image. " "This probably means your installation of graphviz " "is missing png support. See: " "https://github.com/ContinuumIO/anaconda-issues/" "issues/485 for more information." ) display_cls = _get_display_cls(format) if not filename: return display_cls(data=data) full_filename = ".".join([filename, format]) with open(full_filename, "wb") as f: f.write(data) return display_cls(filename=full_filename) IPYTHON_IMAGE_FORMATS = frozenset(["jpeg", "png"]) IPYTHON_NO_DISPLAY_FORMATS = frozenset(["dot", "pdf"]) def _get_display_cls(format): """ Get the appropriate IPython display class for `format`. Returns `IPython.display.SVG` if format=='svg', otherwise `IPython.display.Image`. If IPython is not importable, return dummy function that swallows its arguments and returns None. """ dummy = lambda *args, **kwargs: None try: import IPython.display as display except ImportError: # Can't return a display object if no IPython. return dummy if format in IPYTHON_NO_DISPLAY_FORMATS: # IPython can't display this format natively, so just return None. return dummy elif format in IPYTHON_IMAGE_FORMATS: # Partially apply `format` so that `Image` and `SVG` supply a uniform # interface to the caller. return partial(display.Image, format=format) elif format == "svg": return display.SVG else: raise ValueError("Unknown format '%s' passed to `dot_graph`" % format) streamz-0.6.4/streamz/dataframe/0000755000175000017500000000000014270277270016155 5ustar nileshnileshstreamz-0.6.4/streamz/dataframe/utils.py0000644000175000017500000000370414270277270017673 0ustar nileshnileshfrom __future__ import division, print_function import inspect import sys def is_dataframe_like(df): """ Looks like a Pandas DataFrame. ** Borrowed from dask.dataframe.utils ** """ typ = type(df) return ( all(hasattr(typ, name) for name in ("groupby", "head", "merge", "mean")) and all(hasattr(df, name) for name in ("dtypes", "columns")) and not any(hasattr(typ, name) for name in ("name", "dtype")) ) def is_series_like(s): """ Looks like a Pandas Series. ** Borrowed from dask.dataframe.utils ** """ typ = type(s) return ( all(hasattr(typ, name) for name in ("groupby", "head", "mean")) and all(hasattr(s, name) for name in ("dtype", "name")) and "index" not in typ.__name__.lower() ) def is_index_like(s): """ Looks like a Pandas Index. ** Borrowed from dask.dataframe.utils ** """ typ = type(s) return ( all(hasattr(s, name) for name in ("name", "dtype")) and "index" in typ.__name__.lower() ) def get_base_frame_type(frame_name, is_frame_like, example=None): """Handles type check for input example for DataFrame/Series/Index initialization. Returns the base type of streaming objects if type checks pass.""" if example is None: raise TypeError("Missing required argument:'example'") if is_frame_like is is_dataframe_like and not is_frame_like(example): import pandas as pd example = pd.DataFrame(example) elif not is_frame_like(example): msg = "Streaming {0} expects an example of {0} like objects. 
Got: {1}."\ .format(frame_name, example) raise TypeError(msg) return type(example) def get_dataframe_package(df): """ Utility function to return the top level package (pandas/cudf) of DataFrame/Series/Index objects """ module = inspect.getmodule(df) package, _, _ = module.__name__.partition('.') return sys.modules[package] streamz-0.6.4/streamz/dataframe/tests/0000755000175000017500000000000014270277270017317 5ustar nileshnileshstreamz-0.6.4/streamz/dataframe/tests/test_dataframes.py0000644000175000017500000007711314270277270023050 0ustar nileshnileshfrom __future__ import division, print_function import json import operator from time import sleep import pytest from dask.dataframe.utils import assert_eq import numpy as np import pandas as pd from tornado import gen from streamz import Stream from streamz.utils_test import gen_test, wait_for from streamz.dataframe import DataFrame, Series, DataFrames, Aggregation import streamz.dataframe as sd from streamz.dask import DaskStream from distributed import Client @pytest.fixture(scope="module") def client(): client = Client(processes=False, asynchronous=False) try: yield client finally: client.close() @pytest.fixture(params=['core', 'dask']) def stream(request, client): # flake8: noqa if request.param == 'core': return Stream() else: return DaskStream() def test_identity(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) L = sdf.stream.gather().sink_to_list() sdf.emit(df) assert L[0] is df assert list(sdf.example.columns) == ['x', 'y'] x = sdf.x assert isinstance(x, Series) L2 = x.stream.gather().sink_to_list() assert not L2 sdf.emit(df) assert isinstance(L2[0], pd.Series) assert assert_eq(L2[0], df.x) def test_dtype(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) assert str(sdf.dtypes) == str(df.dtypes) assert sdf.x.dtype == df.x.dtype assert sdf.index.dtype == df.index.dtype def test_attributes(): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df) assert getattr(sdf,'x',-1) != -1 assert getattr(sdf,'z',-1) == -1 sdf.x with pytest.raises(AttributeError): sdf.z def test_exceptions(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) with pytest.raises(TypeError): sdf.emit(1) with pytest.raises(IndexError): sdf.emit(pd.DataFrame()) @pytest.mark.parametrize('func', [ lambda x: x.sum(), lambda x: x.mean(), lambda x: x.count(), lambda x: x.size ]) def test_reductions(stream, func): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) for example in [df, df.iloc[:0]]: sdf = DataFrame(example=example, stream=stream) df_out = func(sdf).stream.gather().sink_to_list() x = sdf.x x_out = func(x).stream.gather().sink_to_list() sdf.emit(df) sdf.emit(df) assert_eq(df_out[-1], func(pd.concat([df, df]))) assert_eq(x_out[-1], func(pd.concat([df, df]).x)) @pytest.mark.parametrize('op', [ operator.add, operator.and_, operator.eq, operator.floordiv, operator.ge, operator.gt, operator.le, operator.lshift, operator.lt, operator.mod, operator.mul, operator.ne, operator.or_, operator.pow, operator.rshift, operator.sub, operator.truediv, operator.xor, ]) @pytest.mark.parametrize('getter', [lambda df: df, lambda df: df.x]) def test_binary_operators(op, getter, stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) try: left = op(getter(df), 2) right = op(2, getter(df)) except Exception: return a = DataFrame(example=df, stream=stream) li = op(getter(a), 
2).stream.gather().sink_to_list() r = op(2, getter(a)).stream.gather().sink_to_list() a.emit(df) assert_eq(li[0], left) assert_eq(r[0], right) @pytest.mark.parametrize('op', [ operator.abs, operator.inv, operator.invert, operator.neg, operator.not_, lambda x: x.map(lambda x: x + 1), lambda x: x.reset_index(), lambda x: x.astype(float), ]) @pytest.mark.parametrize('getter', [lambda df: df, lambda df: df.x]) def test_unary_operators(op, getter): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) try: expected = op(getter(df)) except Exception: return a = DataFrame(example=df) b = op(getter(a)).stream.sink_to_list() a.emit(df) assert_eq(b[0], expected) @pytest.mark.parametrize('func', [ lambda df: df.query('x > 1 and x < 4', engine='python'), lambda df: df.x.value_counts().nlargest(2) ]) def test_dataframe_simple(func): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) expected = func(df) a = DataFrame(example=df) L = func(a).stream.sink_to_list() a.emit(df) assert_eq(L[0], expected) def test_set_index(): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) a = DataFrame(example=df) b = a.set_index('x').stream.sink_to_list() a.emit(df) assert_eq(b[0], df.set_index('x')) b = a.set_index('x', drop=True).stream.sink_to_list() a.emit(df) assert_eq(b[0], df.set_index('x', drop=True)) b = a.set_index(a.y + 1, drop=True).stream.sink_to_list() a.emit(df) assert_eq(b[0], df.set_index(df.y + 1, drop=True)) def test_binary_stream_operators(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) expected = df.x + df.y a = DataFrame(example=df, stream=stream) b = (a.x + a.y).stream.gather().sink_to_list() a.emit(df) assert_eq(b[0], expected) def test_index(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) a = DataFrame(example=df, stream=stream) b = a.index + 5 L = b.stream.gather().sink_to_list() a.emit(df) a.emit(df) wait_for(lambda: len(L) > 1, timeout=2, period=0.05) assert_eq(L[0], df.index + 5) assert_eq(L[1], df.index + 5) def test_pair_arithmetic(stream): df = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) L = ((a.x + a.y) * 2).stream.gather().sink_to_list() a.emit(df.iloc[:5]) a.emit(df.iloc[5:]) assert len(L) == 2 assert_eq(pd.concat(L, axis=0), (df.x + df.y) * 2) def test_getitem(stream): df = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10}) a = DataFrame(example=df.iloc[:0], stream=stream) L = a[a.x > 4].stream.gather().sink_to_list() a.emit(df.iloc[:5]) a.emit(df.iloc[5:]) assert len(L) == 2 assert_eq(pd.concat(L, axis=0), df[df.x > 4]) @pytest.mark.parametrize('agg', [ lambda x: x.sum(), lambda x: x.mean(), lambda x: x.count(), lambda x: x.var(ddof=1), lambda x: x.std(), # pytest.mark.xfail(lambda x: x.var(ddof=0), reason="don't know") ]) @pytest.mark.parametrize('grouper', [ lambda a: a.x % 3, lambda a: 'x', lambda a: a.index % 2, lambda a: ['x'] ]) @pytest.mark.parametrize('indexer', [ lambda g: g.y, lambda g: g, lambda g: g[['y']] # lambda g: g[['x', 'y']] ]) def test_groupby_aggregate(agg, grouper, indexer, stream): df = pd.DataFrame({'x': (np.arange(10) // 2).astype(float), 'y': [1.0, 2.0] * 5}) a = DataFrame(example=df.iloc[:0], stream=stream) def f(x): return agg(indexer(x.groupby(grouper(x)))) L = f(a).stream.gather().sink_to_list() a.emit(df.iloc[:3]) a.emit(df.iloc[3:7]) a.emit(df.iloc[7:]) first = df.iloc[:3] assert assert_eq(L[0], f(first)) assert assert_eq(L[-1], f(df)) def test_value_counts(stream): s = pd.Series(['a', 'b', 'a']) a = Series(example=s, stream=stream) b = a.value_counts() assert 
b._stream_type == 'updating' result = b.stream.gather().sink_to_list() a.emit(s) a.emit(s) assert_eq(result[-1], pd.concat([s, s], axis=0).value_counts()) def test_repr(stream): df = pd.DataFrame({'x': (np.arange(10) // 2).astype(float), 'y': [1.0] * 10}) a = DataFrame(example=df, stream=stream) text = repr(a) assert type(a).__name__ in text assert 'x' in text assert 'y' in text text = repr(a.x) assert type(a.x).__name__ in text assert 'x' in text text = repr(a.x.sum()) assert type(a.x.sum()).__name__ in text def test_repr_html(stream): df = pd.DataFrame({'x': (np.arange(10) // 2).astype(float), 'y': [1.0] * 10}) a = DataFrame(example=df, stream=stream) for x in [a, a.y, a.y.mean()]: html = x._repr_html_() assert type(x).__name__ in html assert '1' in html def test_display(monkeypatch, capsys): pytest.importorskip("ipywidgets") import ipywidgets df = pd.DataFrame({'x': (np.arange(10) // 2).astype(float), 'y': [1.0] * 10}) a = DataFrame(example=df, stream=stream) # works by side-affect of display() a._ipython_display_() assert "Output()" in capsys.readouterr().out def get(*_, **__): raise ImportError monkeypatch.setattr(ipywidgets.Output, "__init__", get) out = source._ipython_display_() assert "DataFrame" in capsys.readouterr().out def test_setitem(stream): df = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10}) sdf = DataFrame(example=df.iloc[:0], stream=stream) stream = sdf.stream sdf['z'] = sdf['x'] * 2 sdf['a'] = 10 sdf[['c', 'd']] = sdf[['x', 'y']] L = sdf.mean().stream.gather().sink_to_list() stream.emit(df.iloc[:3]) stream.emit(df.iloc[3:7]) stream.emit(df.iloc[7:]) df['z'] = df['x'] * 2 df['a'] = 10 df[['c', 'd']] = df[['x', 'y']] assert_eq(L[-1], df.mean()) def test_setitem_overwrites(stream): df = pd.DataFrame({'x': list(range(10))}) sdf = DataFrame(example=df.iloc[:0], stream=stream) stream = sdf.stream sdf['x'] = sdf['x'] * 2 L = sdf.stream.gather().sink_to_list() stream.emit(df.iloc[:3]) stream.emit(df.iloc[3:7]) stream.emit(df.iloc[7:]) assert_eq(L[-1], df.iloc[7:] * 2) @pytest.mark.slow @pytest.mark.parametrize('kwargs,op', [ ({}, 'sum'), ({}, 'mean'), pytest.param({}, 'min', marks=pytest.mark.slow), ({}, 'median'), pytest.param({}, 'max', marks=pytest.mark.slow), pytest.param({}, 'var', marks=pytest.mark.slow), pytest.param({}, 'count', marks=pytest.mark.slow), ({'ddof': 0}, 'std'), pytest.param({'quantile': 0.5}, 'quantile', marks=pytest.mark.slow) # ({'arg': {'A':'sum', 'B':'min'}, 'aggregate') -- deprecated with Pandas1.0 ]) @pytest.mark.parametrize('window', [ pytest.param(2, marks=pytest.mark.slow), 7, pytest.param('3h', marks=pytest.mark.slow), pd.Timedelta('200 minutes') ]) @pytest.mark.parametrize('m', [ 2, pytest.param(5, marks=pytest.mark.slow) ]) @pytest.mark.parametrize('pre_get,post_get', [ (lambda df: df, lambda df: df), (lambda df: df.x, lambda x: x), (lambda df: df, lambda df: df.x) ]) def test_rolling_count_aggregations(op, window, m, pre_get, post_get, kwargs, stream): index = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h') df = pd.DataFrame({'x': np.arange(len(index))}, index=index) expected = getattr(post_get(pre_get(df).rolling(window)), op)(**kwargs) sdf = DataFrame(example=df.iloc[:0], stream=stream) roll = getattr(post_get(pre_get(sdf).rolling(window)), op)(**kwargs) L = roll.stream.gather().sink_to_list() assert len(L) == 0 for i in range(0, len(df), m): sdf.emit(df.iloc[i: i + m]) assert len(L) > 1 assert_eq(pd.concat(L), expected) def test_stream_to_dataframe(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) source = 
stream L = source.to_dataframe(example=df).x.sum().stream.gather().sink_to_list() source.emit(df) source.emit(df) source.emit(df) assert L == [6, 12, 18] def test_integration_from_stream(stream): source = stream sdf = source.partition(4).to_batch(example=['{"x": 0, "y": 0}']).map(json.loads).to_dataframe() result = sdf.groupby(sdf.x).y.sum().mean() L = result.stream.gather().sink_to_list() for i in range(12): source.emit(json.dumps({'x': i % 3, 'y': i})) assert L == [2, 28 / 3, 22.0] @gen_test() def test_random_source2(): source = sd.Random(freq='10ms', interval='100ms') L = source.stream.sink_to_list() yield gen.sleep(0.5) assert 2 < len(L) < 8 assert all(2 < len(df) < 25 for df in L) source.x source.rolling('10s') @pytest.mark.xfail(reason="Does not yet exist") @pytest.mark.parametrize('n', [2, 3, 4]) def test_repartition_n(n, stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) source = stream L = source.to_dataframe(example=df).repartition(n=n).stream.gather().sink_to_list() source.emit(df) source.emit(df) source.emit(df) source.emit(df) assert all(len(x) == n for x in L) assert_eq(pd.concat(L), pd.concat([df] * 4)) @pytest.mark.xfail(reason="Does not yet exist") @gen_test() def test_repartition_interval(n): source = sd.Random(freq='10ms', interval='100ms') L = source.stream.sink_to_list() L2 = source.repartition(interval='150ms').stream.sink_to_list() yield gen.sleep(0.400) assert L2 for df in L2: assert df.index.max() - df.index.min() <= pd.Timedelta('150ms') expected = pd.concat(L).iloc[:sum(map(len, L2))] assert_eq(pd.concat(L2), expected) def test_to_frame(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) assert sdf.to_frame() is sdf a = sdf.x.to_frame() assert isinstance(a, DataFrame) assert list(a.columns) == ['x'] def test_instantiate_with_dict(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) sdf2 = DataFrame({'a': sdf.x, 'b': sdf.x * 2, 'c': sdf.y % 2}) L = sdf2.stream.gather().sink_to_list() assert len(sdf2.columns) == 3 sdf.emit(df) sdf.emit(df) assert len(L) == 2 for x in L: assert_eq(x[['a', 'b', 'c']], pd.DataFrame({'a': df.x, 'b': df.x * 2, 'c': df.y % 2}, columns=['a', 'b', 'c'])) @pytest.mark.parametrize('op', ['cumsum', 'cummax', 'cumprod', 'cummin']) @pytest.mark.parametrize('getter', [lambda df: df, lambda df: df.x]) def test_cumulative_aggregations(op, getter, stream): df = pd.DataFrame({'x': list(range(10)), 'y': [1] * 10}) expected = getattr(getter(df), op)() sdf = DataFrame(example=df, stream=stream) L = getattr(getter(sdf), op)().stream.gather().sink_to_list() for i in range(0, 10, 3): sdf.emit(df.iloc[i: i + 3]) sdf.emit(df.iloc[:0]) assert len(L) > 1 assert_eq(pd.concat(L), expected) @gen_test() def test_gc(): sdf = sd.Random(freq='5ms', interval='100ms') a = DataFrame({'volatility': sdf.x.rolling('100ms').var(), 'sub': sdf.x - sdf.x.rolling('100ms').mean()}) n = len(sdf.stream.downstreams) a = DataFrame({'volatility': sdf.x.rolling('100ms').var(), 'sub': sdf.x - sdf.x.rolling('100ms').mean()}) yield gen.sleep(0.1) a = DataFrame({'volatility': sdf.x.rolling('100ms').var(), 'sub': sdf.x - sdf.x.rolling('100ms').mean()}) yield gen.sleep(0.1) a = DataFrame({'volatility': sdf.x.rolling('100ms').var(), 'sub': sdf.x - sdf.x.rolling('100ms').mean()}) yield gen.sleep(0.1) assert len(sdf.stream.downstreams) == n del a import gc; gc.collect() assert len(sdf.stream.downstreams) == 0 @gen_test() def test_gc_random(): from weakref import WeakValueDictionary 
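    # Hold each Random source only through a weak reference: once ``a`` is
    # rebound to a newer source, the older one should become collectable, so
    # only the most recently created source is expected to survive below.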
w = WeakValueDictionary() a = sd.Random(freq='5ms', interval='100ms') w[1] = a yield gen.sleep(0.1) a = sd.Random(freq='5ms', interval='100ms') w[2] = a yield gen.sleep(0.1) a = sd.Random(freq='5ms', interval='100ms') w[3] = a yield gen.sleep(0.1) assert len(w) == 1 def test_display(stream): pytest.importorskip('ipywidgets') pytest.importorskip('IPython') df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) s = sdf.x.sum() s._ipython_display_() def test_tail(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) L = sdf.tail(2).stream.gather().sink_to_list() sdf.emit(df) sdf.emit(df) assert_eq(L[0], df.tail(2)) assert_eq(L[1], df.tail(2)) def test_random_source(client): n = len(client.cluster.scheduler.tasks) source = sd.Random(freq='1ms', interval='10ms', dask=True) source.x.stream.gather().sink_to_list() sleep(0.20) assert len(client.cluster.scheduler.tasks) < n + 10 def test_example_type_error_message(): try: DataFrame(example=[123]) except Exception as e: assert 'DataFrame' in str(e) assert '[123]' in str(e) def test_dataframes(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrames(example=df, stream=stream) L = (sdf + 1).x.sum().stream.gather().sink_to_list() sdf.emit(df) sdf.emit(df) assert L == [9, 9] def test_groupby_aggregate_updating(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) assert sdf.groupby('x').y.mean()._stream_type == 'updating' assert sdf.x.sum()._stream_type == 'updating' assert (sdf.x.sum() + 1)._stream_type == 'updating' def test_window_sum(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) L = sdf.window(n=4).x.sum().stream.gather().sink_to_list() sdf.emit(df) assert L == [6] sdf.emit(df) assert L == [6, 9] sdf.emit(df) assert L == [6, 9, 9] def test_window_sum_dataframe(stream): df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) sdf = DataFrame(example=df, stream=stream) L = sdf.window(n=4).sum().stream.gather().sink_to_list() sdf.emit(df) assert_eq(L[0], pd.Series([6, 15], index=['x', 'y'])) sdf.emit(df) assert_eq(L[0], pd.Series([6, 15], index=['x', 'y'])) assert_eq(L[1], pd.Series([9, 21], index=['x', 'y'])) sdf.emit(df) assert_eq(L[0], pd.Series([6, 15], index=['x', 'y'])) assert_eq(L[1], pd.Series([9, 21], index=['x', 'y'])) assert_eq(L[2], pd.Series([9, 21], index=['x', 'y'])) @pytest.mark.parametrize('func', [ lambda x: x.sum(), lambda x: x.mean(), lambda x: x.count(), lambda x: x.size, lambda x: x.var(ddof=1), lambda x: x.std(ddof=1), lambda x: x.var(ddof=0), ]) @pytest.mark.parametrize('n', [1, 4]) @pytest.mark.parametrize('getter', [ lambda df: df, lambda df: df.x, ]) def test_windowing_n(func, n, getter): df = pd.DataFrame({'x': list(range(10)), 'y': [1, 2] * 5}) sdf = DataFrame(example=df) L = func(getter(sdf).window(n=n) + 10).stream.gather().sink_to_list() for i in range(0, 10, 3): sdf.emit(df.iloc[i: i + 3]) sdf.emit(df.iloc[:0]) assert len(L) == 5 assert_eq(L[0], func(getter(df).iloc[max(0, 3 - n): 3] + 10)) assert_eq(L[-1], func(getter(df).iloc[len(df) - n:] + 10)) @pytest.mark.parametrize('func', [ lambda x: x.sum(), lambda x: x.mean(), lambda x: x.count(), lambda x: x.var(ddof=1), lambda x: x.std(ddof=1), lambda x: x.var(ddof=0), ], ids=["sum", "mean", "count", "var_1", "std", "var_0"]) def test_expanding(func): df = pd.DataFrame({'x': [1.], 'y': [2.]}) sdf = DataFrame(example=df) L = 
func(sdf.expanding()).stream.gather().sink_to_list() for i in range(5): sdf.emit(df) result = pd.concat(L, axis=1).T.astype(float) expected = func(pd.concat([df] * 5, ignore_index=True).expanding()) assert_eq(result, expected) def test_ewm_mean(): sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y'])) L = sdf.ewm(1).mean().stream.gather().sink_to_list() sdf.emit(pd.DataFrame({'x': [1.], 'y': [2.]})) sdf.emit(pd.DataFrame({'x': [2.], 'y': [3.]})) sdf.emit(pd.DataFrame({'x': [3.], 'y': [4.]})) result = pd.concat(L, ignore_index=True) df = pd.DataFrame({'x': [1., 2., 3.], 'y': [2., 3., 4.]}) expected = df.ewm(1).mean() assert_eq(result, expected) def test_ewm_raise_multiple_arguments(): sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y'])) with pytest.raises(ValueError, match="Can only provide one of"): sdf.ewm(com=1, halflife=1) def test_ewm_raise_no_argument(): sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y'])) with pytest.raises(ValueError, match="Must pass one of"): sdf.ewm() @pytest.mark.parametrize("arg", ["com", "halflife", "alpha", "span"]) def test_raise_invalid_argument(arg): sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y'])) param = {arg: -1} with pytest.raises(ValueError): sdf.ewm(**param) @pytest.mark.parametrize('func', [ lambda x: x.sum(), lambda x: x.count(), lambda x: x.apply(lambda x: x), lambda x: x.full(), lambda x: x.var(), lambda x: x.std() ], ids=["sum", "count", "apply", "full", "var", "std"]) def test_ewm_notimplemented(func): sdf = DataFrame(example=pd.DataFrame(columns=['x', 'y'])) with pytest.raises(NotImplementedError): func(sdf.ewm(1)) @pytest.mark.parametrize('func', [ lambda x: x.sum(), lambda x: x.mean(), lambda x: x.count(), lambda x: x.var(ddof=1), lambda x: x.std(), pytest.param(lambda x: x.var(ddof=0), marks=pytest.mark.xfail), ]) @pytest.mark.parametrize('value', ['10h', '1d']) @pytest.mark.parametrize('getter', [ lambda df: df, lambda df: df.x, ]) @pytest.mark.parametrize('grouper', [ lambda a: a.x % 4, lambda a: 'y', lambda a: a.index, lambda a: ['y'] ]) @pytest.mark.parametrize('indexer', [ lambda g: g.x, lambda g: g, lambda g: g[['x']], #lambda g: g[['x', 'y']] ]) def test_groupby_windowing_value(func, value, getter, grouper, indexer): index = pd.date_range(start='2000-01-01', end='2000-01-03', freq='1h') df = pd.DataFrame({'x': np.arange(len(index), dtype=float), 'y': np.arange(len(index), dtype=float) % 2}, index=index) sdf = DataFrame(example=df) def f(x): return func(indexer(x.groupby(grouper(x)))) L = f(sdf.window(value)).stream.gather().sink_to_list() value = pd.Timedelta(value) diff = 13 for i in range(0, len(index), diff): sdf.emit(df.iloc[i: i + diff]) sdf.emit(df.iloc[:0]) assert len(L) == 5 first = df.iloc[:diff] first = first[first.index.max() - value + pd.Timedelta('1ns'):] assert_eq(L[0], f(first)) last = df.loc[index.max() - value + pd.Timedelta('1ns'):] assert_eq(L[-1], f(last)) @pytest.mark.parametrize('func', [ lambda x: x.sum(), lambda x: x.mean(), lambda x: x.count(), lambda x: x.size(), lambda x: x.var(ddof=1), lambda x: x.std(ddof=1), pytest.param(lambda x: x.var(ddof=0), marks=pytest.mark.xfail), ]) @pytest.mark.parametrize('n', [1, 4]) @pytest.mark.parametrize('getter', [ lambda df: df, lambda df: df.x, ]) @pytest.mark.parametrize('grouper', [ lambda a: a.x % 3, lambda a: 'y', lambda a: a.index % 2, lambda a: ['y'] ]) @pytest.mark.parametrize('indexer', [ lambda g: g.x, lambda g: g, lambda g: g[['x']], #lambda g: g[['x', 'y']] ]) def test_groupby_windowing_n(func, n, getter, grouper, indexer): df = 
pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) sdf = DataFrame(example=df) def f(x): return func(indexer(x.groupby(grouper(x)))) L = f(sdf.window(n=n)).stream.gather().sink_to_list() diff = 3 for i in range(0, 10, diff): sdf.emit(df.iloc[i: i + diff]) sdf.emit(df.iloc[:0]) assert len(L) == 5 first = df.iloc[max(0, diff - n): diff] assert_eq(L[0], f(first)) last = df.iloc[len(df) - n:] assert_eq(L[-1], f(last)) def test_windowing_value_empty_intermediate_index(stream): def preprocess(df): mask = df["amount"] == 5 df = df.loc[mask] return df source = stream.map(preprocess) example = pd.DataFrame({"amount":[]}) sdf = DataFrame(stream=source, example=example) output = sdf.window("2h").amount.sum().stream.gather().sink_to_list() stream.emit(pd.DataFrame({"amount": [1, 2, 3]}, index=[pd.Timestamp("2050-01-01 00:00:00"), pd.Timestamp("2050-01-01 01:00:00"), pd.Timestamp("2050-01-01 02:00:00")])) stream.emit(pd.DataFrame({"amount": [5, 5, 5]}, index=[pd.Timestamp("2050-01-01 03:00:00"), pd.Timestamp("2050-01-01 04:00:00"), pd.Timestamp("2050-01-01 05:00:00")])) stream.emit(pd.DataFrame({"amount": [4, 5, 6]}, index=[pd.Timestamp("2050-01-01 06:00:00"), pd.Timestamp("2050-01-01 07:00:00"), pd.Timestamp("2050-01-01 08:00:00")])) stream.emit(pd.DataFrame({"amount": [1, 2, 3]}, index=[pd.Timestamp("2050-01-01 09:00:00"), pd.Timestamp("2050-01-01 10:00:00"), pd.Timestamp("2050-01-01 11:00:00")])) stream.emit(pd.DataFrame({"amount": [5, 5, 5]}, index=[pd.Timestamp("2050-01-01 12:00:00"), pd.Timestamp("2050-01-01 13:00:00"), pd.Timestamp("2050-01-01 14:00:00")])) assert_eq(output, [0, 10, 5, 5, 10]) def test_window_full(): df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) sdf = DataFrame(example=df) L = sdf.window(n=4).apply(lambda x: x).stream.sink_to_list() sdf.emit(df.iloc[:3]) sdf.emit(df.iloc[3:8]) sdf.emit(df.iloc[8:]) assert_eq(L[0], df.iloc[:3]) assert_eq(L[1], df.iloc[4:8]) assert_eq(L[2], df.iloc[-4:]) def test_custom_aggregation(): df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) class Custom(Aggregation): def initial(self, new): return 0 def on_new(self, state, new): return state + 1, state def on_old(self, state, new): return state - 100, state sdf = DataFrame(example=df) L = sdf.aggregate(Custom()).stream.sink_to_list() sdf.emit(df) sdf.emit(df) sdf.emit(df) assert L == [0, 1, 2] sdf = DataFrame(example=df) L = sdf.window(n=5).aggregate(Custom()).stream.sink_to_list() sdf.emit(df) sdf.emit(df) sdf.emit(df) assert L == [1, -198, -397] def test_groupby_aggregate_with_start_state(stream): example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example).groupby(['name']) output0 = sdf.amount.sum(start=None).stream.gather().sink_to_list() output1 = sdf.amount.mean(with_state=True, start=None).stream.gather().sink_to_list() output2 = sdf.amount.count(start=None).stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Tom'], 'amount': [50, 100]}) stream.emit(df) out_df0 = pd.DataFrame({'name': ['Alice', 'Tom'], 'amount': [50.0, 100.0]}) out_df1 = pd.DataFrame({'name': ['Alice', 'Tom'], 'amount': [1, 1]}) assert assert_eq(output0[0].reset_index(), out_df0) assert assert_eq(output1[0][1].reset_index(), out_df0) assert assert_eq(output2[0].reset_index(), out_df1) example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example).groupby(['name']) output3 = sdf.amount.sum(start=output0[0]).stream.gather().sink_to_list() output4 = sdf.amount.mean(with_state=True, 
start=output1[0][0]).stream.gather().sink_to_list() output5 = sdf.amount.count(start=output2[0]).stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}) stream.emit(df) out_df2 = pd.DataFrame({'name': ['Alice', 'Linda', 'Tom'], 'amount': [100.0, 200.0, 200.0]}) out_df3 = pd.DataFrame({'name': ['Alice', 'Linda', 'Tom'], 'amount': [50.0, 200.0, 100.0]}) out_df4 = pd.DataFrame({'name': ['Alice', 'Linda', 'Tom'], 'amount': [2, 1, 2]}) assert assert_eq(output3[0].reset_index(), out_df2) assert assert_eq(output4[0][1].reset_index(), out_df3) assert assert_eq(output5[0].reset_index(), out_df4) def test_reductions_with_start_state(stream): example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output0 = sdf.amount.mean(start=(10, 2)).stream.gather().sink_to_list() output1 = sdf.amount.count(start=3).stream.gather().sink_to_list() output2 = sdf.amount.sum(start=10).stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}) stream.emit(df) assert output0[0] == 72.0 assert output1[0] == 6 assert output2[0] == 360 def test_rolling_aggs_with_start_state(stream): example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output0 = sdf.rolling(2, with_state=True, start=()).amount.sum().stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}) stream.emit(df) df = pd.DataFrame({'name': ['Bob'], 'amount': [250]}) stream.emit(df) assert assert_eq(output0[-1][0].reset_index(drop=True), pd.Series([200, 250], name="amount")) assert assert_eq(output0[-1][1].reset_index(drop=True), pd.Series([450.0], name="amount")) stream = Stream() example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output1 = sdf.rolling(2, with_state=True, start=output0[-1][0]).amount.sum().stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice'], 'amount': [50]}) stream.emit(df) assert assert_eq(output1[-1][0].reset_index(drop=True), pd.Series([250, 50], name="amount")) assert assert_eq(output1[-1][1].reset_index(drop=True), pd.Series([300.0], name="amount")) def test_window_aggs_with_start_state(stream): example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output0 = sdf.window(2, with_state=True, start=None).amount.sum().stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}) stream.emit(df) df = pd.DataFrame({'name': ['Bob'], 'amount': [250]}) stream.emit(df) assert output0[-1][1] == 450 stream = Stream() example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output1 = sdf.window(2, with_state=True, start=output0[-1][0]).amount.sum().stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice'], 'amount': [50]}) stream.emit(df) assert output1[-1][1] == 300 def test_windowed_groupby_aggs_with_start_state(stream): example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output0 = sdf.window(5, with_state=True, start=None).groupby(['name']).amount.sum().\ stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Tom', 'Linda'], 'amount': [50, 100, 200]}) stream.emit(df) df = pd.DataFrame({'name': ['Alice', 'Linda', 'Bob'], 'amount': [250, 300, 350]}) stream.emit(df) stream = Stream() example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) output1 = 
sdf.window(5, with_state=True, start=output0[-1][0]).groupby(['name']).amount.sum().\ stream.gather().sink_to_list() df = pd.DataFrame({'name': ['Alice', 'Linda', 'Tom', 'Bob'], 'amount': [50, 100, 150, 200]}) stream.emit(df) out_df1 = pd.DataFrame({'name':['Alice', 'Bob', 'Linda', 'Tom'], 'amount':[50.0, 550.0, 100.0, 150.0]}) assert_eq(output1[-1][1].reset_index(), out_df1) def test_dir(stream): example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) assert 'name' in dir(sdf) assert 'amount' in dir(sdf) streamz-0.6.4/streamz/dataframe/tests/test_dataframe_utils.py0000644000175000017500000000553414270277270024103 0ustar nileshnileshimport pytest from streamz.dataframe.utils import is_dataframe_like, is_series_like, \ is_index_like, get_base_frame_type, get_dataframe_package import pandas as pd import numpy as np def test_utils_get_base_frame_type_pandas(): with pytest.raises(TypeError): get_base_frame_type("DataFrame", is_dataframe_like, None) df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) assert pd.DataFrame == get_base_frame_type("DataFrame", is_dataframe_like, df) with pytest.raises(TypeError): get_base_frame_type("Series", is_series_like, df) with pytest.raises(TypeError): get_base_frame_type("Index", is_index_like, df) # casts Series to DataFrame, if that's what we ask for assert pd.DataFrame == get_base_frame_type("DataFrame", is_dataframe_like, df.x) assert pd.Series == get_base_frame_type("Series", is_series_like, df.x) with pytest.raises(TypeError): get_base_frame_type("Index", is_index_like, df.x) # casts Series to DataFrame, if that's what we ask for assert pd.DataFrame == get_base_frame_type("DataFrame", is_dataframe_like, df.index) with pytest.raises(TypeError): get_base_frame_type("Series", is_series_like, df.index) assert issubclass(get_base_frame_type("Index", is_index_like, df.index), pd.Index) def test_utils_get_base_frame_type_cudf(): cudf = pytest.importorskip("cudf") df = cudf.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) assert cudf.DataFrame == get_base_frame_type("DataFrame", is_dataframe_like, df) with pytest.raises(TypeError): get_base_frame_type("Series", is_series_like, df) with pytest.raises(TypeError): get_base_frame_type("Index", is_index_like, df) with pytest.raises(TypeError): get_base_frame_type("DataFrame", is_dataframe_like, df.x) assert cudf.Series == get_base_frame_type("Series", is_series_like, df.x) with pytest.raises(TypeError): get_base_frame_type("Index", is_index_like, df.x) with pytest.raises(TypeError): get_base_frame_type("DataFrame", is_dataframe_like, df.index) with pytest.raises(TypeError): get_base_frame_type("Series", is_series_like, df.index) assert issubclass(get_base_frame_type("Index", is_index_like, df.index), cudf.Index) def test_get_dataframe_package_pandas(): df = pd.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) assert pd == get_dataframe_package(df) assert pd == get_dataframe_package(df.x) assert pd == get_dataframe_package(df.index) def test_get_dataframe_package_cudf(): cudf = pytest.importorskip("cudf") df = cudf.DataFrame({'x': np.arange(10, dtype=float), 'y': [1.0, 2.0] * 5}) assert cudf == get_dataframe_package(df) assert cudf == get_dataframe_package(df.x) assert cudf == get_dataframe_package(df.index) streamz-0.6.4/streamz/dataframe/core.py0000644000175000017500000010625014270277270017463 0ustar nileshnileshimport asyncio import operator from collections import OrderedDict import numpy as np import pandas as pd import 
toolz from ..collection import Streaming, _stream_types, OperatorMixin from ..sources import Source from ..utils import M from . import aggregations from .utils import is_dataframe_like, is_series_like, is_index_like, \ get_base_frame_type, get_dataframe_package class BaseFrame(Streaming): def round(self, decimals=0): """ Round elements in frame """ return self.map_partitions(M.round, self, decimals=decimals) def reset_index(self): """ Reset Index """ return self.map_partitions(M.reset_index, self) def set_index(self, index, **kwargs): """ Set Index """ return self.map_partitions(M.set_index, self, index, **kwargs) def tail(self, n=5): """ Round elements in frame """ return self.map_partitions(M.tail, self, n=n) def astype(self, dt): return self.map_partitions(M.astype, self, dt) @property def index(self): return self.map_partitions(lambda x: x.index, self) def map(self, func, na_action=None): return self.map_partitions(self._subtype.map, self, func, na_action=na_action) class Frame(BaseFrame): _stream_type = 'streaming' def groupby(self, other): """ Groupby aggregations """ return GroupBy(self, other) def aggregate(self, aggregation, start=None): return self.accumulate_partitions(aggregations.accumulator, agg=aggregation, start=start, stream_type='updating', returns_state=True) def sum(self, start=None): """ Sum frame. Parameters ---------- start: None or resulting Python object type from the operation Accepts a valid start state. """ return self.aggregate(aggregations.Sum(), start) def count(self, start=None): """ Count of frame Parameters ---------- start: None or resulting Python object type from the operation Accepts a valid start state. """ return self.aggregate(aggregations.Count(), start) @property def size(self): """ size of frame """ return self.aggregate(aggregations.Size()) def mean(self, start=None): """ Average frame Parameters ---------- start: None or resulting Python object type from the operation Accepts a valid start state. """ return self.aggregate(aggregations.Mean(), start) def rolling(self, window, min_periods=1, with_state=False, start=()): """ Compute rolling aggregations When followed by an aggregation method like ``sum``, ``mean``, or ``std`` this produces a new Streaming dataframe whose values are aggregated over that window. The window parameter can be either a number of rows or a timedelta like ``"2 minutes"` in which case the index should be a datetime index. This operates by keeping enough of a backlog of records to maintain an accurate stream. It performs a copy at every added dataframe. Because of this it may be slow if the rolling window is much larger than the average stream element. Parameters ---------- window: int or timedelta Window over which to roll with_state: bool (False) Whether to return the state along with the result as a tuple (state, result). State may be needed downstream for a number of reasons like checkpointing. start: () or resulting Python object type from the operation Accepts a valid start state. 
Returns ------- Rolling object See Also -------- DataFrame.window: more generic window operations """ return Rolling(self, window, min_periods, with_state, start) def window(self, n=None, value=None, with_state=False, start=None): """ Sliding window operations Windowed operations are defined over a sliding window of data, either with a fixed number of elements:: >>> df.window(n=10).sum() # sum of the last ten elements or over an index value range (index must be monotonic):: >>> df.window(value='2h').mean() # average over the last two hours Windowed dataframes support all normal arithmetic, aggregations, and groupby-aggregations. Parameters ---------- n: int Window of number of elements over which to roll value: str Window of time over which to roll with_state: bool (False) Whether to return the state along with the result as a tuple (state, result). State may be needed downstream for a number of reasons like checkpointing. start: None or resulting Python object type from the operation Accepts a valid start state. Examples -------- >>> df.window(n=10).std() >>> df.window(value='2h').count() >>> w = df.window(n=100) >>> w.groupby(w.name).amount.sum() >>> w.groupby(w.x % 10).y.var() See Also -------- DataFrame.rolling: mimic's Pandas rolling aggregations """ return Window(self, n=n, value=value, with_state=with_state, start=start) def expanding(self, with_state=False, start=None): return Expanding(self, n=1, with_state=with_state, start=start) def ewm(self, com=None, span=None, halflife=None, alpha=None, with_state=False, start=None): return EWM(self, n=1, com=com, span=span, halflife=halflife, alpha=alpha, with_state=with_state, start=start) def _cumulative_aggregation(self, op): return self.accumulate_partitions(_cumulative_accumulator, returns_state=True, start=(), op=op) def cumsum(self): """ Cumulative sum """ return self._cumulative_aggregation(op='cumsum') def cumprod(self): """ Cumulative product """ return self._cumulative_aggregation(op='cumprod') def cummin(self): """ Cumulative minimum """ return self._cumulative_aggregation(op='cummin') def cummax(self): """ Cumulative maximum """ return self._cumulative_aggregation(op='cummax') class Frames(BaseFrame): _stream_type = 'updating' def sum(self, **kwargs): return self.map_partitions(M.sum, self, **kwargs) def mean(self, **kwargs): return self.map_partitions(M.mean, self, **kwargs) def std(self, **kwargs): return self.map_partitions(M.std, self, **kwargs) def var(self, **kwargs): return self.map_partitions(M.var, self, **kwargs) @property def size(self, **kwargs): return self.map_partitions(M.size, self, **kwargs) def count(self, **kwargs): return self.map_partitions(M.count, self, **kwargs) def nlargest(self, n, *args, **kwargs): return self.map_partitions(M.nlargest, self, n, *args, **kwargs) def tail(self, n=5): """ Round elements in frame """ return self.map_partitions(M.tail, self, n=n) class _DataFrameMixin(object): @property def columns(self): return self.example.columns @property def dtypes(self): return self.example.dtypes def __getitem__(self, index): return self.map_partitions(operator.getitem, self, index) def __getattr__(self, key): if key in self.columns or not len(self.columns): return self.map_partitions(getattr, self, key) else: raise AttributeError("DataFrame has no attribute %r" % key) def __dir__(self): o = set(dir(type(self))) o.update(self.__dict__) o.update(c for c in self.columns if (isinstance(c, str) and c.isidentifier())) return list(o) def assign(self, **kwargs): """ Assign new columns to this dataframe 
Alternatively use setitem syntax Examples -------- >>> sdf = sdf.assign(z=sdf.x + sdf.y) # doctest: +SKIP >>> sdf['z'] = sdf.x + sdf.y # doctest: +SKIP """ kvs = list(toolz.concat(kwargs.items())) def _assign(df, *kvs): keys = kvs[::2] values = kvs[1::2] kwargs = OrderedDict(zip(keys, values)) return df.assign(**kwargs) return self.map_partitions(_assign, self, *kvs) def to_frame(self): """ Convert to a streaming dataframe """ return self def __setitem__(self, key, value): if isinstance(value, Series): result = self.assign(**{key: value}) elif isinstance(value, DataFrame): result = self.assign(**{k: value[c] for k, c in zip(key, value.columns)}) else: example = self.example.copy() example[key] = value df_type = type(self.example) result = self.map_partitions(df_type.assign, self, **{key: value}) self.stream = result.stream self.example = result.example return self def query(self, expr, **kwargs): df_type = type(self.example) return self.map_partitions(df_type.query, self, expr, **kwargs) class DataFrame(Frame, _DataFrameMixin): """ A Streaming Dataframe This is a logical collection over a stream of Pandas dataframes. Operations on this object will translate to the appropriate operations on the underlying Pandas dataframes. See Also -------- Series """ def __init__(self, *args, **kwargs): # {'x': sdf.x + 1, 'y': sdf.y - 1} if len(args) == 1 and not kwargs and isinstance(args[0], dict): def concat(tup, module=None, columns=None): result = module.concat(tup, axis=1) result.columns = columns return result columns, values = zip(*args[0].items()) base_frame_type = values[0]._subtype df_package = get_dataframe_package(base_frame_type) stream = type(values[0].stream).zip(*[v.stream for v in values]) stream = stream.map(concat, module=df_package, columns=list(columns)) example = df_package.DataFrame({k: getattr(v, 'example', v) for k, v in args[0].items()}) DataFrame.__init__(self, stream, example) else: example = None if "example" in kwargs: example = kwargs.get('example') elif len(args) > 1: example = args[1] if callable(example): example = example() kwargs["example"] = example self._subtype = get_base_frame_type(self.__class__.__name__, is_dataframe_like, example) super(DataFrame, self).__init__(*args, **kwargs) def verify(self, x): """ Verify consistency of elements that pass through this stream """ super(DataFrame, self).verify(x) if list(x.columns) != list(self.example.columns): raise IndexError("Input expected to have columns %s, got %s" % (self.example.columns, x.columns)) @property def plot(self): try: # import has side-effect of attaching .hvplot attribute import hvplot.streamz # # noqa: F401 except ImportError as err: # pragma: no cover raise ImportError("Streamz dataframe plotting requires hvplot") from err return self.hvplot class _SeriesMixin(object): @property def dtype(self): return self.example.dtype def to_frame(self): """ Convert to a streaming dataframe """ return self.map_partitions(M.to_frame, self) class Series(Frame, _SeriesMixin): """ A Streaming Series This is a logical collection over a stream of Pandas series objects. Operations on this object will translate to the appropriate operations on the underlying Pandas series. 
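    Examples
    --------
    A minimal, hypothetical sketch (the column name ``x`` is illustrative):

    >>> import pandas as pd  # doctest: +SKIP
    >>> sdf = DataFrame(example=pd.DataFrame({'x': []}))  # doctest: +SKIP
    >>> sdf.x.sum().stream.sink(print)  # doctest: +SKIP
    >>> sdf.emit(pd.DataFrame({'x': [1, 2, 3]}))  # doctest: +SKIP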
See Also -------- DataFrame """ def __init__(self, *args, **kwargs): example = None if "example" in kwargs: example = kwargs.get('example') elif len(args) > 1: example = args[1] if isinstance(self, Index): self._subtype = get_base_frame_type(self.__class__.__name__, is_index_like, example) else: self._subtype = get_base_frame_type(self.__class__.__name__, is_series_like, example) super(Series, self).__init__(*args, **kwargs) def value_counts(self): return self.accumulate_partitions(aggregations.accumulator, agg=aggregations.ValueCounts(), start=None, stream_type='updating', returns_state=True) class Index(Series): pass class DataFrames(Frames, _DataFrameMixin): pass class Seriess(Frames, _SeriesMixin): pass def _cumulative_accumulator(state, new, op=None): if not len(new): return state, new if not len(state): df = new else: df_package = get_dataframe_package(new) df = df_package.concat([state, new]) # ouch, full copy result = getattr(df, op)() new_state = result.iloc[-1:] if len(state): result = result[1:] return new_state, result class Rolling(object): """ Rolling aggregations This intermediate class enables rolling aggregations across either a fixed number of rows or a time window. Examples -------- >>> sdf.rolling(10).x.mean() # doctest: +SKIP >>> sdf.rolling('100ms').x.mean() # doctest: +SKIP """ def __init__(self, sdf, window, min_periods, with_state, start): self.root = sdf if not isinstance(window, int): window = pd.Timedelta(window) min_periods = 1 self.window = window self.min_periods = min_periods self.with_state = with_state self.start = start def __getitem__(self, key): sdf = self.root[key] return Rolling(sdf, self.window, self.min_periods, self.with_state, self.start) def __getattr__(self, key): if key in self.root.columns or not len(self.root.columns): return self[key] else: raise AttributeError("Rolling has no attribute %r" % key) def _known_aggregation(self, op, *args, **kwargs): return self.root.accumulate_partitions(rolling_accumulator, window=self.window, op=op, args=args, kwargs=kwargs, start=self.start, returns_state=True, with_state=self.with_state) def sum(self): """ Rolling sum """ return self._known_aggregation('sum') def mean(self): """ Rolling mean """ return self._known_aggregation('mean') def min(self): """ Rolling minimum """ return self._known_aggregation('min') def max(self): """ Rolling maximum """ return self._known_aggregation('max') def median(self): """ Rolling median """ return self._known_aggregation('median') def std(self, *args, **kwargs): """ Rolling standard deviation """ return self._known_aggregation('std', *args, **kwargs) def var(self, *args, **kwargs): """ Rolling variance """ return self._known_aggregation('var', *args, **kwargs) def count(self, *args, **kwargs): """ Rolling count """ return self._known_aggregation('count', *args, **kwargs) def aggregate(self, *args, **kwargs): """ Rolling aggregation """ return self._known_aggregation('aggregate', *args, **kwargs) def quantile(self, *args, **kwargs): """ Rolling quantile """ return self._known_aggregation('quantile', *args, **kwargs) class Window(OperatorMixin): """ Windowed aggregations This provides a set of aggregations that can be applied over a sliding window of data. 
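    Examples
    --------
    Normally obtained via ``DataFrame.window`` rather than constructed
    directly; ``sdf`` below is assumed to be a streaming ``DataFrame``:

    >>> w = sdf.window(n=100)  # doctest: +SKIP
    >>> w.x.sum()  # doctest: +SKIP
    >>> w.groupby(w.y).x.mean()  # doctest: +SKIP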
See Also -------- DataFrame.window: contains full docstring """ def __init__(self, sdf, n=None, value=None, with_state=False, start=None): if value is None and isinstance(n, (str, pd.Timedelta)): value = n n = None self.n = n self.root = sdf if isinstance(value, str) and isinstance(self.root.example.index, pd.DatetimeIndex): value = pd.Timedelta(value) self.value = value self.with_state = with_state self.start = start def __getitem__(self, key): sdf = self.root[key] return type(self)( sdf, n=self.n, value=self.value, with_state=self.with_state, start=self.start ) def __getattr__(self, key): if key in self.root.columns or not len(self.root.columns): return self[key] else: raise AttributeError(f"{type(self)} has no attribute {key}") def map_partitions(self, func, *args, **kwargs): args2 = [a.root if isinstance(a, type(self)) else a for a in args] root = self.root.map_partitions(func, *args2, **kwargs) return type(self)( root, n=self.n, value=self.value, with_state=self.with_state, start=self.start ) @property def index(self): return self.map_partitions(lambda x: x.index, self) @property def columns(self): return self.root.columns @property def dtypes(self): return self.root.dtypes @property def example(self): return self.root.example def reset_index(self): return type(self)(self.root.reset_index(), n=self.n, value=self.value) def aggregate(self, agg): if self.n is not None: diff = aggregations.diff_iloc window = self.n elif self.value is not None: diff = aggregations.diff_loc window = self.value return self.root.accumulate_partitions(aggregations.window_accumulator, diff=diff, window=window, agg=agg, start=self.start, returns_state=True, stream_type='updating', with_state=self.with_state) def full(self): return self.aggregate(aggregations.Full()) def apply(self, func): """ Apply an arbitrary function over each window of data """ result = self.aggregate(aggregations.Full()) return result.map_partitions(func, result) def sum(self): """ Sum elements within window """ return self.aggregate(aggregations.Sum()) def count(self): """ Count elements within window """ return self.aggregate(aggregations.Count()) def mean(self): """ Average elements within window """ return self.aggregate(aggregations.Mean()) def var(self, ddof=1): """ Compute variance of elements within window """ return self.aggregate(aggregations.Var(ddof=ddof)) def std(self, ddof=1): """ Compute standard deviation of elements within window """ return self.var(ddof=ddof) ** 0.5 @property def size(self): """ Number of elements within window """ return self.aggregate(aggregations.Size()) def value_counts(self): """ Count groups of elements within window """ return self.aggregate(aggregations.ValueCounts()) def groupby(self, other): """ Groupby-aggregations within window """ return WindowedGroupBy(self.root, other, None, self.n, self.value, self.with_state, self.start) class Expanding(Window): def aggregate(self, agg): window = self.n diff = aggregations.diff_expanding return self.root.accumulate_partitions(aggregations.window_accumulator, diff=diff, window=window, agg=agg, start=self.start, returns_state=True, stream_type='updating', with_state=self.with_state) def groupby(self, other): raise NotImplementedError class EWM(Expanding): def __init__( self, sdf, n=1, value=None, with_state=False, start=None, com=None, span=None, halflife=None, alpha=None ): super().__init__(sdf, n=n, value=value, with_state=with_state, start=start) self._com = self._get_com(com, span, halflife, alpha) self.com = com self.span = span self.alpha = alpha 
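        # Also keep the raw smoothing parameters; ``__getitem__`` uses them to
        # rebuild an equivalent EWM when a single column is selected.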
self.halflife = halflife def __getitem__(self, key): sdf = self.root[key] return type(self)( sdf, n=self.n, value=self.value, with_state=self.with_state, start=self.start, com=self.com, span=self.span, halflife=self.halflife, alpha=self.alpha ) @staticmethod def _get_com(com, span, halflife, alpha): if sum(var is not None for var in (com, span, halflife, alpha)) > 1: raise ValueError("Can only provide one of `com`, `span`, `halflife`, `alpha`.") # Convert to center of mass; domain checks ensure 0 < alpha <= 1 if com is not None: if com < 0: raise ValueError("com must satisfy: comass >= 0") elif span is not None: if span < 1: raise ValueError("span must satisfy: span >= 1") com = (span - 1) / 2 elif halflife is not None: if halflife <= 0: raise ValueError("halflife must satisfy: halflife > 0") decay = 1 - np.exp(np.log(0.5) / halflife) com = 1 / decay - 1 elif alpha is not None: if alpha <= 0 or alpha > 1: raise ValueError("alpha must satisfy: 0 < alpha <= 1") com = (1 - alpha) / alpha else: raise ValueError("Must pass one of com, span, halflife, or alpha") return float(com) def full(self): raise NotImplementedError def apply(self, func): """ Apply an arbitrary function over each window of data """ raise NotImplementedError def sum(self): """ Sum elements within window """ raise NotImplementedError def count(self): """ Count elements within window """ raise NotImplementedError def mean(self): """ Average elements within window """ return self.aggregate(aggregations.EWMean(self._com)) def var(self, ddof=1): """ Compute variance of elements within window """ raise NotImplementedError def std(self, ddof=1): """ Compute standard deviation of elements within window """ raise NotImplementedError @property def size(self): """ Number of elements within window """ raise NotImplementedError def value_counts(self): """ Count groups of elements within window """ raise NotImplementedError def rolling_accumulator(acc, new, window=None, op=None, with_state=False, args=(), kwargs={}): if len(acc): df_package = get_dataframe_package(new) df = df_package.concat([acc, new]) else: df = new result = getattr(df.rolling(window), op)(*args, **kwargs) if isinstance(window, int): new_acc = df.iloc[-window:] else: new_acc = df.loc[result.index.max() - window:] result = result.iloc[len(acc):] return new_acc, result def _accumulate_mean(accumulator, new): accumulator = accumulator.copy() accumulator['sums'] += new.sum() accumulator['counts'] += new.count() result = accumulator['sums'] / accumulator['counts'] return accumulator, result def _accumulate_sum(accumulator, new): return accumulator + new.sum() def _accumulate_size(accumulator, new): return accumulator + new.size() class GroupBy(object): """ Groupby aggregations on streaming dataframes """ def __init__(self, root, grouper, index=None): self.root = root self.grouper = grouper self.index = index def __getitem__(self, index): return GroupBy(self.root, self.grouper, index) def __getattr__(self, key): if key in self.root.columns or not len(self.root.columns): return self[key] else: raise AttributeError("GroupBy has no attribute %r" % key) def _accumulate(self, Agg, with_state=False, start=None, **kwargs): stream_type = 'updating' if isinstance(self.grouper, Streaming): stream = self.root.stream.zip(self.grouper.stream) grouper_example = self.grouper.example agg = Agg(self.index, grouper=None, **kwargs) else: stream = self.root.stream grouper_example = self.grouper agg = Agg(self.index, grouper=self.grouper, **kwargs) # Compute example state = 
agg.initial(self.root.example, grouper=grouper_example) if hasattr(grouper_example, 'iloc'): grouper_example = grouper_example.iloc[:0] elif isinstance(grouper_example, np.ndarray) or is_index_like(grouper_example): grouper_example = grouper_example[:0] _, example = agg.on_new(state, self.root.example.iloc[:0], grouper=grouper_example) outstream = stream.accumulate(aggregations.groupby_accumulator, agg=agg, start=start, returns_state=True, with_state=with_state) for fn, s_type in _stream_types[stream_type]: """Function checks if example is of a specific frame type""" if fn(example): return s_type(outstream, example) return Streaming(outstream, example, stream_type=stream_type) def count(self, start=None): """ Groupby-count Parameters ---------- start: None or resulting Python object type from the operation Accepts a valid start state. """ return self._accumulate(aggregations.GroupbyCount, start=start) def mean(self, with_state=False, start=None): """ Groupby-mean Parameters ---------- start: None or resulting Python object type from the operation Accepts a valid start state. """ return self._accumulate(aggregations.GroupbyMean, with_state=with_state, start=start) def size(self): """ Groupby-size """ return self._accumulate(aggregations.GroupbySize) def std(self, ddof=1): """ Groupby-std """ return self.var(ddof=ddof) ** 0.5 def sum(self, start=None): """ Groupby-sum Parameters ---------- start: None or resulting Python object type from the operation Accepts a valid start state. """ return self._accumulate(aggregations.GroupbySum, start=start) def var(self, ddof=1): """ Groupby-variance """ return self._accumulate(aggregations.GroupbyVar, ddof=ddof) class WindowedGroupBy(GroupBy): """ Groupby aggregations over a window of data """ def __init__(self, root, grouper, index=None, n=None, value=None, with_state=False, start=None): self.root = root self.grouper = grouper self.index = index self.n = n if isinstance(value, str) and isinstance(self.root.example.index, pd.DatetimeIndex): value = pd.Timedelta(value) self.value = value self.with_state = with_state self.start = start def __getitem__(self, index): return WindowedGroupBy(self.root, self.grouper, index, self.n, self.value, self.with_state, self.start) def _accumulate(self, Agg, **kwargs): stream_type = 'updating' if isinstance(self.grouper, Streaming): stream = self.root.stream.zip(self.grouper.stream) grouper_example = self.grouper.example agg = Agg(self.index, grouper=None, **kwargs) elif isinstance(self.grouper, Window): stream = self.root.stream.zip(self.grouper.root.stream) grouper_example = self.grouper.root.example agg = Agg(self.index, grouper=None, **kwargs) else: stream = self.root.stream grouper_example = self.grouper agg = Agg(self.index, grouper=self.grouper, **kwargs) # Compute example state = agg.initial(self.root.example, grouper=grouper_example) if hasattr(grouper_example, 'iloc'): grouper_example = grouper_example.iloc[:0] elif isinstance(grouper_example, np.ndarray) or is_index_like(grouper_example): grouper_example = grouper_example[:0] _, example = agg.on_new(state, self.root.example.iloc[:0], grouper=grouper_example) if self.n is not None: diff = aggregations.diff_iloc window = self.n elif self.value is not None: diff = aggregations.diff_loc window = self.value outstream = stream.accumulate(aggregations.windowed_groupby_accumulator, agg=agg, start=self.start, returns_state=True, diff=diff, window=window, with_state=self.with_state) for fn, s_type in _stream_types[stream_type]: """Function checks if example is of a 
specific frame type""" if fn(example): return s_type(outstream, example) return Streaming(outstream, example, stream_type=stream_type) def random_datapoint(now=None, **kwargs): """Example of querying a single current value""" if now is None: now = pd.Timestamp.now() return pd.DataFrame( {'a': np.random.random(1)}, index=[now]) def random_datablock(last, now, **kwargs): """ Example of querying over a time range since last update Parameters ---------- last: pd.Timestamp Time of previous call to this function. now: pd.Timestamp Current time. freq: pd.Timedelta, optional The time interval between individual records to be returned. For good throughput, should be much smaller than the interval at which this function is called. Returns a pd.DataFrame with random values where: The x column is uniformly distributed. The y column is Poisson distributed. The z column is normally distributed. """ freq = kwargs.get("freq", pd.Timedelta("100ms")) index = pd.date_range(start=last + freq, end=now, freq=freq) df = pd.DataFrame({'x': np.random.random(len(index)), 'y': np.random.poisson(size=len(index)), 'z': np.random.normal(0, 1, size=len(index))}, index=index) return df @DataFrame.register_api(staticmethod, "from_periodic") class PeriodicDataFrame(DataFrame): """A streaming dataframe using the asyncio ioloop to poll a callback fn Parameters ---------- datafn: callable Callback function accepting **kwargs and returning a pd.DataFrame. kwargs will include at least 'last' (pd.Timestamp.now() when datafn was last invoked), and 'now' (current pd.Timestamp.now()). interval: timedelta The time interval between new dataframes. dask: boolean If true, uses a DaskStream instead of a regular Source. **kwargs: Optional keyword arguments to be passed into the callback function. By default, returns a three-column random pd.DataFrame generated by the 'random_datablock' function. Example ------- >>> df = PeriodicDataFrame(interval='1s', datafn=random_datapoint) # doctest: +SKIP """ def __init__(self, datafn=random_datablock, interval='500ms', dask=False, start=True, **kwargs): if dask: from streamz.dask import DaskStream source = DaskStream() else: source = Source() self.loop = source.loop self.interval = pd.Timedelta(interval).total_seconds() self.source = source self.continue_ = [False] # like the oppose of self.stopped self.kwargs = kwargs stream = self.source.map(lambda x: datafn(**x, **kwargs)) example = datafn(last=pd.Timestamp.now(), now=pd.Timestamp.now(), **kwargs) super(PeriodicDataFrame, self).__init__(stream, example) if start: self.start() def start(self): if not self.continue_[0]: self.continue_[0] = True self.loop.add_callback(self._cb, self.interval, self.source, self.continue_) def __del__(self): self.stop() def stop(self): self.continue_[0] = False @staticmethod async def _cb(interval, source, continue_): last = pd.Timestamp.now() while continue_[0]: await asyncio.sleep(interval) now = pd.Timestamp.now() await asyncio.gather(*source._emit(dict(last=last, now=now))) last = now @DataFrame.register_api(staticmethod, "random") class Random(PeriodicDataFrame): """PeriodicDataFrame providing random values by default Accepts same parameters as PeriodicDataFrame, plus `freq`, a string that will be converted to a pd.Timedelta and passed to the 'datafn'. Useful mainly for examples and docs. 
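    The default ``datafn`` (``random_datablock``) emits rows at the requested
    ``freq`` with a uniformly distributed ``x`` column, a Poisson-distributed
    ``y`` column and a normally distributed ``z`` column.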
Example ------- >>> source = Random(freq='100ms', interval='1s') # doctest: +SKIP """ def __init__(self, freq='100ms', interval='500ms', dask=False, start=True, datafn=random_datablock): super(Random, self).__init__(datafn, interval, dask, start, freq=pd.Timedelta(freq)) _stream_types['streaming'].append((is_dataframe_like, DataFrame)) _stream_types['streaming'].append((is_index_like, Index)) _stream_types['streaming'].append((is_series_like, Series)) _stream_types['updating'].append((is_dataframe_like, DataFrames)) _stream_types['updating'].append((is_series_like, Seriess)) streamz-0.6.4/streamz/dataframe/aggregations.py0000644000175000017500000004130414270277270021203 0ustar nileshnileshfrom __future__ import division, print_function from collections import deque from numbers import Number import numpy as np import pandas as pd from .utils import is_series_like, is_index_like, get_dataframe_package class Aggregation(object): pass class Sum(Aggregation): def on_new(self, acc, new): if len(new): result = acc + new.sum() else: result = acc return result, result def on_old(self, acc, old): result = acc - old.sum() return result, result def initial(self, new): result = new.sum() if isinstance(result, Number): result = 0 else: result[:] = 0 return result class Mean(Aggregation): def on_new(self, acc, new): totals, counts = acc if len(new): totals = totals + new.sum() counts = counts + new.count() if isinstance(counts, Number) and counts == 0: counts = 1 return (totals, counts), totals / counts def on_old(self, acc, old): totals, counts = acc if len(old): totals = totals - old.sum() counts = counts - old.count() if isinstance(counts, Number) and counts == 0: counts = 1 return (totals, counts), totals / counts def initial(self, new): s, c = new.sum(), new.count() if isinstance(s, Number): s = 0 c = 0 else: s[:] = 0 c[:] = 0 return (s, c) class Count(Aggregation): def on_new(self, acc, new): result = acc + new.count() return result, result def on_old(self, acc, old): result = acc - old.count() return result, result def initial(self, new): return new.iloc[:0].count() class Size(Aggregation): def on_new(self, acc, new): result = acc + new.size return result, result def on_old(self, acc, old): result = acc - old.size return result, result def initial(self, new): return 0 class Var(Aggregation): def __init__(self, ddof=1): self.ddof = ddof def _compute_result(self, x, x2, n): result = (x2 / n) - (x / n) ** 2 if self.ddof != 0: result = result * n / (n - self.ddof) return result def on_new(self, acc, new): x, x2, n = acc if len(new): x = x + new.sum() x2 = x2 + (new ** 2).sum() n = n + new.count() return (x, x2, n), self._compute_result(x, x2, n) def on_old(self, acc, new): x, x2, n = acc if len(new): x = x - new.sum() x2 = x2 - (new ** 2).sum() n = n - new.count() return (x, x2, n), self._compute_result(x, x2, n) def initial(self, new): s = new.sum() c = new.count() if isinstance(s, Number): s = 0 c = 0 else: s[:] = 0 c[:] = 0 return (s, s, c) class Full(Aggregation): """ Return the full window of data every time This is somewhat expensive, builtin aggregations should be preferred when possible """ def on_new(self, acc, new): df_package = get_dataframe_package(new) result = df_package.concat([acc, new]) return result, result def on_old(self, acc, old): result = acc.iloc[len(old):] return result, result def initial(self, new): return new.iloc[:0] class EWMean(Aggregation): def __init__(self, com): self.com = com alpha = 1. / (1. + self.com) self.old_wt_factor = 1. - alpha self.new_wt = 1. 
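    # Descriptive note (a sketch of the math, assuming the usual
    # exponentially weighted mean recurrence): with alpha = 1 / (1 + com),
    # each new observation x is folded in as
    #     mean <- (old_wt * mean + new_wt * x) / (old_wt + new_wt)
    # where old_wt is first decayed by (1 - alpha) and then incremented by
    # new_wt, which appears to match pandas' adjusted ewm(com=...) weighting.
    # For example, com=1 and inputs 1 then 2 give a running mean of 1,
    # then (0.5*1 + 2) / 1.5 ~= 1.67.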
def on_new(self, acc, new): result, old_wt, is_first = acc for i in range(int(is_first), len(new)): old_wt *= self.old_wt_factor result = ((old_wt * result) + (self.new_wt * new.iloc[i])) / (old_wt + self.new_wt) old_wt += self.new_wt return (result, old_wt, False), result def on_old(self, acc, old): pass def initial(self, new): return new.iloc[:1], 1, True def diff_iloc(dfs, new, window=None): """ Emit new list of dfs and decayed data Parameters ---------- dfs: list List of historical dataframes new: DataFrame, Series New data window: int Returns ------- dfs: list New list of historical data old: list List of dataframes to decay """ dfs = deque(dfs) if len(new) > 0: dfs.append(new) old = [] if len(dfs) > 0: n = sum(map(len, dfs)) - window while n > 0: if len(dfs[0]) <= n: df = dfs.popleft() old.append(df) n -= len(df) else: old.append(dfs[0].iloc[:n]) dfs[0] = dfs[0].iloc[n:] n = 0 return dfs, old def diff_loc(dfs, new, window=None): """ Emit new list of dfs and decayed data Parameters ---------- dfs: list List of historical dataframes new: DataFrame, Series New data window: value Returns ------- dfs: list New list of historical data old: list List of dataframes to decay """ dfs = deque(dfs) if len(new) > 0: dfs.append(new) old = [] if len(dfs) > 0: mx = max(df.index.max() for df in dfs) mn = mx - pd.Timedelta(window) + pd.Timedelta('1ns') while pd.Timestamp(dfs[0].index.min()) < mn: o = dfs[0].loc[:mn] if len(old) > 0: old.append(o) else: old = [o] dfs[0] = dfs[0].iloc[len(o):] if not len(dfs[0]): dfs.popleft() return dfs, old def diff_expanding(dfs, new, window=None): dfs = deque(dfs) if len(new) > 0: dfs.append(new) return dfs, [] def diff_align(dfs, groupers): """ Align groupers to newly-diffed dataframes For groupby aggregations we keep historical values of the grouper along with historical values of the dataframes. The dataframes are kept in historical sync with the ``diff_loc`` and ``diff_iloc`` functions above. This function copies that functionality over to the secondary list of groupers. """ old = [] while len(dfs) < len(groupers): old.append(groupers.popleft()) if dfs: n = len(groupers[0]) - len(dfs[0]) if n: old.append(groupers[0][:n]) groupers[0] = groupers[0][n:] assert len(dfs) == len(groupers) for df, g in zip(dfs, groupers): assert len(df) == len(g) return old, groupers def window_accumulator(acc, new, diff=None, window=None, agg=None, with_state=False): """ An accumulation binary operator for windowed aggregations This is the function that is actually given to the ``Stream.accumulate`` function. It performs all of the work given old state, new data, a diff function, window value, and aggregation object. Parameters ---------- acc: state new: DataFrame, Series The new data to add to the window. 
diff: callable One of ``diff_iloc`` or ``diff_loc`` window: int, value Either an integer for ``n=...`` for a value like ``value='2h'`` agg: Aggregation The aggregation object to apply, like ``Sum()`` Returns ------- acc: state result: newly emitted result See Also -------- accumulator windowed_groupby_accumulator """ if acc is None: acc = {'dfs': [], 'state': agg.initial(new)} dfs = acc['dfs'] state = acc['state'] dfs, old = diff(dfs, new, window=window) if new is not None: state, result = agg.on_new(state, new) for o in old: if len(o): state, result = agg.on_old(state, o) acc2 = {'dfs': dfs, 'state': state} return acc2, result def windowed_groupby_accumulator(acc, new, diff=None, window=None, agg=None, grouper=None, with_state=False): """ An accumulation binary operator for windowed groupb-aggregations This is the function that is actually given to the ``Stream.accumulate`` function. Parameters ---------- acc: state new: DataFrame, Series The new data to add to the window. diff: callable One of ``diff_iloc`` or ``diff_loc`` window: int, value Either an integer for ``n=...`` for a value like ``value='2h'`` agg: Aggregation The aggregation object to apply, like ``Sum()`` grouper: key or Frame Either a column like ``'x'`` or a Pandas Series if the groupby was given a streaming frame. Returns ------- acc: state result: newly emitted result See Also -------- accumulator windowed_accumulator """ if agg.grouper is None and isinstance(new, tuple): new, grouper = new else: grouper = None size = GroupbySize(agg.columns, agg.grouper) if acc is None: acc = {'dfs': [], 'state': agg.initial(new, grouper=grouper), 'size-state': size.initial(new, grouper=grouper)} if isinstance(grouper, np.ndarray) or is_series_like(grouper) or is_index_like(grouper): acc['groupers'] = deque([]) dfs = acc['dfs'] state = acc['state'] size_state = acc['size-state'] dfs, old = diff(dfs, new, window=window) if 'groupers' in acc: groupers = deque(acc['groupers']) if len(grouper) > 0: groupers.append(grouper) old_groupers, groupers = diff_align(dfs, groupers) else: old_groupers = [grouper] * len(old) if new is not None: state, result = agg.on_new(state, new, grouper=grouper) size_state, _ = size.on_new(size_state, new, grouper=grouper) for o, og in zip(old, old_groupers): if 'groupers' in acc: assert len(o) == len(og) if len(o): state, result = agg.on_old(state, o, grouper=og) size_state, _ = size.on_old(size_state, o, grouper=og) nonzero = size_state != 0 if not nonzero.all(): size_state = size_state[nonzero] result = result[nonzero] if isinstance(state, tuple): state = tuple(s[nonzero] for s in state) else: state = state[nonzero] acc2 = {'dfs': dfs, 'state': state, 'size-state': size_state} if 'groupers' in acc: acc2['groupers'] = groupers return acc2, result def accumulator(acc, new, agg=None): """ An accumulation binary operator This is the function that is actually given to the ``Stream.accumulate`` function. 
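    A minimal sketch of driving this by hand with the ``Sum`` aggregation
    (normally the streaming-dataframe machinery wires this up for you):

    >>> import pandas as pd  # doctest: +SKIP
    >>> state, result = accumulator(None, pd.Series([1, 2, 3]), agg=Sum())  # doctest: +SKIP
    >>> state, result = accumulator(state, pd.Series([4]), agg=Sum())  # doctest: +SKIP
    >>> result  # doctest: +SKIP
    10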
See Also -------- windowed_accumulator windowed_groupby_accumulator """ if acc is None: acc = agg.initial(new) return agg.on_new(acc, new) class GroupbyAggregation(Aggregation): def __init__(self, columns, grouper=None, **kwargs): self.grouper = grouper self.columns = columns for k, v in kwargs.items(): setattr(self, k, v) def grouped(self, df, grouper=None): if grouper is None: grouper = self.grouper g = df.groupby(grouper) if self.columns is not None: g = g[self.columns] return g class GroupbySum(GroupbyAggregation): def on_new(self, acc, new, grouper=None): g = self.grouped(new, grouper=grouper) result = acc.add(g.sum(), fill_value=0) result.index.name = acc.index.name return result, result def on_old(self, acc, old, grouper=None): g = self.grouped(old, grouper=grouper) result = acc.sub(g.sum(), fill_value=0) result.index.name = acc.index.name return result, result def initial(self, new, grouper=None): if hasattr(grouper, 'iloc'): grouper = grouper.iloc[:0] if isinstance(grouper, np.ndarray) or is_index_like(grouper): grouper = grouper[:0] return self.grouped(new.iloc[:0], grouper=grouper).sum() class GroupbyCount(GroupbyAggregation): def on_new(self, acc, new, grouper=None): g = self.grouped(new, grouper=grouper) result = acc.add(g.count(), fill_value=0) result = result.astype(int) result.index.name = acc.index.name return result, result def on_old(self, acc, old, grouper=None): g = self.grouped(old, grouper=grouper) result = acc.sub(g.count(), fill_value=0) result = result.astype(int) result.index.name = acc.index.name return result, result def initial(self, new, grouper=None): if hasattr(grouper, 'iloc'): grouper = grouper.iloc[:0] if isinstance(grouper, np.ndarray) or is_index_like(grouper): grouper = grouper[:0] return self.grouped(new.iloc[:0], grouper=grouper).count() class GroupbySize(GroupbyAggregation): def on_new(self, acc, new, grouper=None): g = self.grouped(new, grouper=grouper) result = acc.add(g.size(), fill_value=0) result = result.astype(int) result.index.name = acc.index.name return result, result def on_old(self, acc, old, grouper=None): g = self.grouped(old, grouper=grouper) result = acc.sub(g.size(), fill_value=0) result = result.astype(int) result.index.name = acc.index.name return result, result def initial(self, new, grouper=None): if hasattr(grouper, 'iloc'): grouper = grouper.iloc[:0] if isinstance(grouper, np.ndarray) or is_index_like(grouper): grouper = grouper[:0] return self.grouped(new.iloc[:0], grouper=grouper).size() class ValueCounts(Aggregation): def on_new(self, acc, new, grouper=None): result = acc.add(new.value_counts(), fill_value=0).astype(int) result.index.name = acc.index.name return result, result def on_old(self, acc, new, grouper=None): result = acc.sub(new.value_counts(), fill_value=0).astype(int) result.index.name = acc.index.name return result, result def initial(self, new, grouper=None): return new.iloc[:0].value_counts() class GroupbyMean(GroupbyAggregation): def on_new(self, acc, new, grouper=None): totals, counts = acc g = self.grouped(new, grouper=grouper) totals = totals.add(g.sum(), fill_value=0) counts = counts.add(g.count(), fill_value=0) totals.index.name = acc[0].index.name counts.index.name = acc[1].index.name return (totals, counts), totals / counts def on_old(self, acc, old, grouper=None): totals, counts = acc g = self.grouped(old, grouper=grouper) totals = totals.sub(g.sum(), fill_value=0) counts = counts.sub(g.count(), fill_value=0) totals.index.name = acc[0].index.name counts.index.name = acc[1].index.name return (totals, 
counts), totals / counts def initial(self, new, grouper=None): if hasattr(grouper, 'iloc'): grouper = grouper.iloc[:0] if isinstance(grouper, np.ndarray) or is_index_like(grouper): grouper = grouper[:0] g = self.grouped(new.iloc[:0], grouper=grouper) return (g.sum(), g.count()) class GroupbyVar(GroupbyAggregation): def _compute_result(self, x, x2, n): result = (x2 / n) - (x / n) ** 2 if self.ddof != 0: result = result * n / (n - self.ddof) return result def on_new(self, acc, new, grouper=None): x, x2, n = acc g = self.grouped(new, grouper=grouper) if len(new): x = x.add(g.sum(), fill_value=0) x2 = x2.add(g.agg(lambda x: (x**2).sum()), fill_value=0) n = n.add(g.count(), fill_value=0) return (x, x2, n), self._compute_result(x, x2, n) def on_old(self, acc, old, grouper=None): x, x2, n = acc g = self.grouped(old, grouper=grouper) if len(old): x = x.sub(g.sum(), fill_value=0) x2 = x2.sub(g.agg(lambda x: (x**2).sum()), fill_value=0) n = n.sub(g.count(), fill_value=0) return (x, x2, n), self._compute_result(x, x2, n) def initial(self, new, grouper=None): if hasattr(grouper, 'iloc'): grouper = grouper.iloc[:0] if isinstance(grouper, np.ndarray) or is_index_like(grouper): grouper = grouper[:0] new = new.iloc[:0] g = self.grouped(new, grouper=grouper) x = g.sum() x2 = g.agg(lambda x: (x**2).sum()) n = g.count() return (x, x2, n) def groupby_accumulator(acc, new, agg=None): if agg.grouper is None and isinstance(new, tuple): new, grouper = new else: grouper = None if acc is None: acc = agg.initial(new, grouper=grouper) result = agg.on_new(acc, new, grouper=grouper) return result streamz-0.6.4/streamz/dataframe/__init__.py0000644000175000017500000000027714270277270020274 0ustar nileshnileshfrom .core import (DataFrame, DataFrames, Frame, Frames, Series, Seriess, Index, Rolling, Window, PeriodicDataFrame, Random, GroupBy) from .aggregations import Aggregation streamz-0.6.4/streamz/dask.py0000644000175000017500000001413414270277270015530 0ustar nileshnileshfrom __future__ import absolute_import, division, print_function from operator import getitem from tornado import gen from dask.utils import apply from distributed.client import default_client from .core import Stream from . import core, sources class DaskStream(Stream): """ A Parallel stream using Dask This object is fully compliant with the ``streamz.core.Stream`` object but uses a Dask client for execution. Operations like ``map`` and ``accumulate`` submit functions to run on the Dask instance using ``dask.distributed.Client.submit`` and pass around Dask futures. Time-based operations like ``timed_window``, buffer, and so on operate as normal. Typically one transfers between normal Stream and DaskStream objects using the ``Stream.scatter()`` and ``DaskStream.gather()`` methods. Examples -------- >>> from dask.distributed import Client >>> client = Client() >>> from streamz import Stream >>> source = Stream() >>> source.scatter().map(func).accumulate(binop).gather().sink(...) 
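    A slightly fuller sketch, where ``inc`` and ``add`` stand in for the
    ``func`` and ``binop`` placeholders above:

    >>> def inc(x):  # doctest: +SKIP
    ...     return x + 1
    >>> def add(acc, x):  # doctest: +SKIP
    ...     return acc + x
    >>> L = source.scatter().map(inc).accumulate(add).buffer(8).gather().sink_to_list()  # doctest: +SKIP
    >>> for i in range(5):  # doctest: +SKIP
    ...     source.emit(i)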
See Also -------- dask.distributed.Client """ def __init__(self, *args, **kwargs): kwargs["ensure_io_loop"] = True super().__init__(*args, **kwargs) @DaskStream.register_api() class map(DaskStream): def __init__(self, upstream, func, *args, **kwargs): self.func = func self.kwargs = kwargs self.args = args DaskStream.__init__(self, upstream) def update(self, x, who=None, metadata=None): client = default_client() result = client.submit(self.func, x, *self.args, **self.kwargs) return self._emit(result, metadata=metadata) @DaskStream.register_api() class accumulate(DaskStream): def __init__(self, upstream, func, start=core.no_default, returns_state=False, **kwargs): self.func = func self.state = start self.returns_state = returns_state self.kwargs = kwargs self.with_state = kwargs.pop('with_state', False) DaskStream.__init__(self, upstream) def update(self, x, who=None, metadata=None): if self.state is core.no_default: self.state = x if self.with_state: return self._emit((self.state, x), metadata=metadata) else: return self._emit(x, metadata=metadata) else: client = default_client() result = client.submit(self.func, self.state, x, **self.kwargs) if self.returns_state: state = client.submit(getitem, result, 0) result = client.submit(getitem, result, 1) else: state = result self.state = state if self.with_state: return self._emit((self.state, result), metadata=metadata) else: return self._emit(result, metadata=metadata) @core.Stream.register_api() @DaskStream.register_api() class scatter(DaskStream): """ Convert local stream to Dask Stream All elements flowing through the input will be scattered out to the cluster """ @gen.coroutine def update(self, x, who=None, metadata=None): client = default_client() self._retain_refs(metadata) # We need to make sure that x is treated as it is by dask # However, client.scatter works internally different for # lists and dicts. So we always use a list here to be sure # we know the format exactly. We do not use a key to avoid # issues like https://github.com/python-streamz/streams/issues/397. future_as_list = yield client.scatter([x], asynchronous=True, hash=False) future = future_as_list[0] f = yield self._emit(future, metadata=metadata) self._release_refs(metadata) raise gen.Return(f) @DaskStream.register_api() class gather(core.Stream): """ Wait on and gather results from DaskStream to local Stream This waits on every result in the stream and then gathers that result back to the local stream. Warning, this can restrict parallelism. It is common to combine a ``gather()`` node with a ``buffer()`` to allow unfinished futures to pile up. 
Examples -------- >>> local_stream = dask_stream.buffer(20).gather() See Also -------- buffer scatter """ @gen.coroutine def update(self, x, who=None, metadata=None): client = default_client() self._retain_refs(metadata) result = yield client.gather(x, asynchronous=True) result2 = yield self._emit(result, metadata=metadata) self._release_refs(metadata) raise gen.Return(result2) @DaskStream.register_api() class starmap(DaskStream): def __init__(self, upstream, func, **kwargs): self.func = func stream_name = kwargs.pop('stream_name', None) self.kwargs = kwargs DaskStream.__init__(self, upstream, stream_name=stream_name) def update(self, x, who=None, metadata=None): client = default_client() result = client.submit(apply, self.func, x, self.kwargs) return self._emit(result, metadata=metadata) @DaskStream.register_api() class buffer(DaskStream, core.buffer): pass @DaskStream.register_api() class combine_latest(DaskStream, core.combine_latest): pass @DaskStream.register_api() class delay(DaskStream, core.delay): pass @DaskStream.register_api() class latest(DaskStream, core.latest): pass @DaskStream.register_api() class partition(DaskStream, core.partition): pass @DaskStream.register_api() class rate_limit(DaskStream, core.rate_limit): pass @DaskStream.register_api() class sliding_window(DaskStream, core.sliding_window): pass @DaskStream.register_api() class timed_window(DaskStream, core.timed_window): pass @DaskStream.register_api() class union(DaskStream, core.union): pass @DaskStream.register_api() class zip(DaskStream, core.zip): pass @DaskStream.register_api(staticmethod) class filenames(DaskStream, sources.filenames): pass @DaskStream.register_api(staticmethod) class from_textfile(DaskStream, sources.from_textfile): pass streamz-0.6.4/streamz/core.py0000644000175000017500000016770414270277270015552 0ustar nileshnileshimport asyncio from collections import deque, defaultdict from datetime import timedelta import functools import logging import six import sys import threading from time import time from typing import Any, Callable, Hashable, Union import weakref import toolz from tornado import gen from tornado.locks import Condition from tornado.ioloop import IOLoop from tornado.queues import Queue try: from distributed.client import default_client as _dask_default_client except ImportError: # pragma: no cover _dask_default_client = None from collections.abc import Iterable from threading import get_ident as get_thread_identity from .orderedweakset import OrderedWeakrefSet no_default = '--no-default--' _html_update_streams = set() thread_state = threading.local() logger = logging.getLogger(__name__) _io_loops = [] def get_io_loop(asynchronous=None): if asynchronous: return IOLoop.current() if _dask_default_client is not None: try: client = _dask_default_client() except ValueError: # No dask client found; continue pass else: return client.loop if not _io_loops: loop = IOLoop() thread = threading.Thread(target=loop.start) thread.daemon = True thread.start() _io_loops.append(loop) return _io_loops[-1] def identity(x): return x class RefCounter: """ A counter to track references to data This class is used to track how many nodes in the DAG are referencing a particular element in the pipeline. 
When the count reaches zero, then parties interested in knowing if data is done being processed are notified Parameters ---------- initial: int, optional The initial value of the reference counter cb: callable The function to use a callback when the reference count reaches zero loop: tornado.ioloop.IOLoop The loop on which to create a callback when the reference count reaches zero """ def __init__(self, initial=0, cb=None, loop=None): self.loop = loop if loop else get_io_loop() self.count = initial self.cb = cb def retain(self, n=1): """Retain the reference Parameters ---------- n: The number of times to retain the reference """ self.count += n def release(self, n=1): """Release the reference If the reference count is equal to or less than zero, the callback, if provided will added to the provided loop or default loop Parameters ---------- n: The number of references to release """ self.count -= n if self.count <= 0 and self.cb: self.loop.add_callback(self.cb) def __str__(self): return ''.format(self.count) __repr__ = __str__ class APIRegisterMixin(object): @classmethod def register_api(cls, modifier=identity, attribute_name=None): """ Add callable to Stream API This allows you to register a new method onto this class. You can use it as a decorator.:: >>> @Stream.register_api() ... class foo(Stream): ... ... >>> Stream().foo(...) # this works now It attaches the callable as a normal attribute to the class object. In doing so it respects inheritance (all subclasses of Stream will also get the foo attribute). By default callables are assumed to be instance methods. If you like you can include modifiers to apply before attaching to the class as in the following case where we construct a ``staticmethod``. >>> @Stream.register_api(staticmethod) ... class foo(Stream): ... ... >>> Stream.foo(...) # Foo operates as a static method You can also provide an optional ``attribute_name`` argument to control the name of the attribute your callable will be attached as. >>> @Stream.register_api(attribute_name="bar") ... class foo(Stream): ... ... >> Stream().bar(...) # foo was actually attached as bar """ def _(func): @functools.wraps(func) def wrapped(*args, **kwargs): return func(*args, **kwargs) name = attribute_name if attribute_name else func.__name__ setattr(cls, name, modifier(wrapped)) return func return _ @classmethod def register_plugin_entry_point(cls, entry_point, modifier=identity): if hasattr(cls, entry_point.name): raise ValueError( f"Can't add {entry_point.name} from {entry_point.module_name} " f"to {cls.__name__}: duplicate method name." ) def stub(*args, **kwargs): """ Entrypoints-based streamz plugin. Will be loaded on first call. """ node = entry_point.load() if not issubclass(node, Stream): raise TypeError( f"Error loading {entry_point.name} " f"from module {entry_point.module_name}: " f"{node.__class__.__name__} must be a subclass of Stream" ) if getattr(cls, entry_point.name).__name__ == "stub": cls.register_api( modifier=modifier, attribute_name=entry_point.name )(node) return node(*args, **kwargs) cls.register_api(modifier=modifier, attribute_name=entry_point.name)(stub) class Stream(APIRegisterMixin): """ A Stream is an infinite sequence of data. Streams subscribe to each other passing and transforming data between them. A Stream object listens for updates from upstream, reacts to these updates, and then emits more data to flow downstream to all Stream objects that subscribe to it. 
Downstream Stream objects may connect at any point of a Stream graph to get a full view of the data coming off of that point to do with as they will. Parameters ---------- stream_name: str or None This is the name of the stream. asynchronous: boolean or None Whether or not this stream will be used in asynchronous functions or normal Python functions. Leave as None if you don't know. True will cause operations like emit to return awaitable Futures False will use an Event loop in another thread (starts it if necessary) ensure_io_loop: boolean Ensure that some IOLoop will be created. If asynchronous is None or False then this will be in a separate thread, otherwise it will be IOLoop.current Examples -------- >>> def inc(x): ... return x + 1 >>> source = Stream() # Create a stream object >>> s = source.map(inc).map(str) # Subscribe to make new streams >>> s.sink(print) # take an action whenever an element reaches the end >>> L = list() >>> s.sink(L.append) # or take multiple actions (streams can branch) >>> for i in range(5): ... source.emit(i) # push data in at the source '1' '2' '3' '4' '5' >>> L # and the actions happen at the sinks ['1', '2', '3', '4', '5'] """ _graphviz_shape = 'ellipse' _graphviz_style = 'rounded,filled' _graphviz_fillcolor = 'white' _graphviz_orientation = 0 str_list = ['func', 'predicate', 'n', 'interval'] def __init__(self, upstream=None, upstreams=None, stream_name=None, loop=None, asynchronous=None, ensure_io_loop=False): self.name = stream_name self.downstreams = OrderedWeakrefSet() self.current_value = None self.current_metadata = None if upstreams is not None: self.upstreams = list(upstreams) elif upstream is not None: self.upstreams = [upstream] else: self.upstreams = [] self._set_asynchronous(asynchronous) self._set_loop(loop) if ensure_io_loop and not self.loop: self._set_asynchronous(False) if self.loop is None and self.asynchronous is not None: self._set_loop(get_io_loop(self.asynchronous)) for upstream in self.upstreams: if upstream: upstream.downstreams.add(self) def _set_loop(self, loop): self.loop = None if loop is not None: self._inform_loop(loop) else: for upstream in self.upstreams: if upstream and upstream.loop: self.loop = upstream.loop break def _inform_loop(self, loop): """ Percolate information about an event loop to the rest of the stream """ if self.loop is not None: if self.loop is not loop: raise ValueError("Two different event loops active") else: self.loop = loop for upstream in self.upstreams: if upstream: upstream._inform_loop(loop) for downstream in self.downstreams: if downstream: downstream._inform_loop(loop) def _set_asynchronous(self, asynchronous): self.asynchronous = None if asynchronous is not None: self._inform_asynchronous(asynchronous) else: for upstream in self.upstreams: if upstream and upstream.asynchronous: self.asynchronous = upstream.asynchronous break def _inform_asynchronous(self, asynchronous): """ Percolate information about an event loop to the rest of the stream """ if self.asynchronous is not None: if self.asynchronous is not asynchronous: raise ValueError("Stream has both asynchronous and synchronous elements") else: self.asynchronous = asynchronous for upstream in self.upstreams: if upstream: upstream._inform_asynchronous(asynchronous) for downstream in self.downstreams: if downstream: downstream._inform_asynchronous(asynchronous) def _add_upstream(self, upstream): """Add upstream to current upstreams, this method is overridden for classes which handle stream specific buffers/caches""" 
self.upstreams.append(upstream) def _add_downstream(self, downstream): """Add downstream to current downstreams""" self.downstreams.add(downstream) def _remove_downstream(self, downstream): """Remove downstream from current downstreams""" self.downstreams.remove(downstream) def _remove_upstream(self, upstream): """Remove upstream from current upstreams, this method is overridden for classes which handle stream specific buffers/caches""" self.upstreams.remove(upstream) def start(self): """ Start any upstream sources """ for upstream in self.upstreams: upstream.start() def stop(self): """ Stop upstream sources """ for upstream in self.upstreams: upstream.stop() def __str__(self): s_list = [] if self.name: s_list.append('{}; {}'.format(self.name, self.__class__.__name__)) else: s_list.append(self.__class__.__name__) for m in self.str_list: s = '' at = getattr(self, m, None) if at: if not callable(at): s = str(at) elif hasattr(at, '__name__'): s = getattr(self, m).__name__ else: s = None if s: s_list.append('{}={}'.format(m, s)) if len(s_list) <= 2: s_list = [term.split('=')[-1] for term in s_list] text = "<" text += s_list[0] if len(s_list) > 1: text += ': ' text += ', '.join(s_list[1:]) text += '>' return text __repr__ = __str__ def _ipython_display_(self, **kwargs): # pragma: no cover try: import ipywidgets from IPython.core.interactiveshell import InteractiveShell output = ipywidgets.Output(_view_count=0) except ImportError: # since this function is only called by jupyter, this import must succeed from IPython.display import display, HTML if hasattr(self, '_repr_html_'): return display(HTML(self._repr_html_())) else: return display(self.__repr__()) output_ref = weakref.ref(output) def update_cell(val): output = output_ref() if output is None: return with output: content, *_ = InteractiveShell.instance().display_formatter.format(val) output.outputs = ({'output_type': 'display_data', 'data': content, 'metadata': {}},) s = self.map(update_cell) _html_update_streams.add(s) self.output_ref = output_ref s_ref = weakref.ref(s) def remove_stream(change): output = output_ref() if output is None: return if output._view_count == 0: ss = s_ref() ss.destroy() _html_update_streams.remove(ss) # trigger gc output.observe(remove_stream, '_view_count') return output._ipython_display_(**kwargs) def _emit(self, x, metadata=None): """ Push data into the stream at this point Parameters ---------- x: any an element of data metadata: list[dict], optional Various types of metadata associated with the data element in `x`. ref: RefCounter A reference counter used to check when data is done """ self.current_value = x self.current_metadata = metadata if metadata: self._retain_refs(metadata, len(self.downstreams)) else: metadata = [] result = [] for downstream in list(self.downstreams): r = downstream.update(x, who=self, metadata=metadata) if type(r) is list: result.extend(r) else: result.append(r) self._release_refs(metadata) return [element for element in result if element is not None] def emit(self, x, asynchronous=False, metadata=None): """ Push data into the stream at this point This is typically done only at source Streams but can theoretically be done at any point Parameters ---------- x: any an element of data asynchronous: emit asynchronously metadata: list[dict], optional Various types of metadata associated with the data element in `x`. 
ref: RefCounter A reference counter used to check when data is done """ ts_async = getattr(thread_state, 'asynchronous', False) if self.loop is None or asynchronous or self.asynchronous or ts_async: if not ts_async: thread_state.asynchronous = True try: result = self._emit(x, metadata=metadata) if self.loop: return gen.convert_yielded(result) finally: thread_state.asynchronous = ts_async else: async def _(): thread_state.asynchronous = True try: result = await asyncio.gather(*self._emit(x, metadata=metadata)) finally: del thread_state.asynchronous return result sync(self.loop, _) def update(self, x, who=None, metadata=None): return self._emit(x, metadata=metadata) def gather(self): """ This is a no-op for core streamz This allows gather to be used in both dask and core streams """ return self def connect(self, downstream): """ Connect this stream to a downstream element. Parameters ---------- downstream: Stream The downstream stream to connect to """ self._add_downstream(downstream) downstream._add_upstream(self) def disconnect(self, downstream): """ Disconnect this stream to a downstream element. Parameters ---------- downstream: Stream The downstream stream to disconnect from """ self._remove_downstream(downstream) downstream._remove_upstream(self) @property def upstream(self): if len(self.upstreams) > 1: raise ValueError("Stream has multiple upstreams") elif len(self.upstreams) == 0: return None else: return self.upstreams[0] def destroy(self, streams=None): """ Disconnect this stream from any upstream sources """ if streams is None: streams = self.upstreams for upstream in list(streams): upstream._remove_downstream(self) self._remove_upstream(upstream) def scatter(self, **kwargs): from .dask import scatter return scatter(self, **kwargs) def remove(self, predicate): """ Only pass through elements for which the predicate returns False """ return self.filter(lambda x: not predicate(x)) @property def scan(self): return self.accumulate @property def concat(self): return self.flatten def sink_to_list(self): """ Append all elements of a stream to a list as they come in Examples -------- >>> source = Stream() >>> L = source.map(lambda x: 10 * x).sink_to_list() >>> for i in range(5): ... source.emit(i) >>> L [0, 10, 20, 30, 40] """ L = [] self.sink(L.append) return L def frequencies(self, **kwargs): """ Count occurrences of elements """ def update_frequencies(last, x): return toolz.assoc(last, x, last.get(x, 0) + 1) return self.scan(update_frequencies, start={}, **kwargs) def visualize(self, filename='mystream.png', **kwargs): """Render the computation of this object's task graph using graphviz. Requires ``graphviz`` and ``networkx`` to be installed. Parameters ---------- filename : str, optional The name of the file to write to disk. 
kwargs: Graph attributes to pass to graphviz like ``rankdir="LR"`` """ from .graph import visualize return visualize(self, filename, **kwargs) def to_dataframe(self, example): """ Convert a stream of Pandas dataframes to a DataFrame Examples -------- >>> source = Stream() >>> sdf = source.to_dataframe() >>> L = sdf.groupby(sdf.x).y.mean().stream.sink_to_list() >>> source.emit(pd.DataFrame(...)) # doctest: +SKIP >>> source.emit(pd.DataFrame(...)) # doctest: +SKIP >>> source.emit(pd.DataFrame(...)) # doctest: +SKIP """ from .dataframe import DataFrame return DataFrame(stream=self, example=example) def to_batch(self, **kwargs): """ Convert a stream of lists to a Batch All elements of the stream are assumed to be lists or tuples Examples -------- >>> source = Stream() >>> batches = source.to_batch() >>> L = batches.pluck('value').map(inc).sum().stream.sink_to_list() >>> source.emit([{'name': 'Alice', 'value': 1}, ... {'name': 'Bob', 'value': 2}, ... {'name': 'Charlie', 'value': 3}]) >>> source.emit([{'name': 'Alice', 'value': 4}, ... {'name': 'Bob', 'value': 5}, ... {'name': 'Charlie', 'value': 6}]) """ from .batch import Batch return Batch(stream=self, **kwargs) def _retain_refs(self, metadata, n=1): """ Retain all references in the provided metadata `n` number of times Parameters ---------- metadata: list[dict], optional Various types of metadata associated with the data element in `x`. ref: RefCounter A reference counter used to check when data is done n: The number of times to retain the provided references """ for m in metadata: if 'ref' in m: m['ref'].retain(n) def _release_refs(self, metadata, n=1): """ Release all references in the provided metadata `n` number of times Parameters ---------- metadata: list[dict], optional Various types of metadata associated with the data element in `x`. ref: RefCounter A reference counter used to check when data is done n: The number of times to retain the provided references """ for m in metadata: if 'ref' in m: m['ref'].release(n) @Stream.register_api() class map(Stream): """ Apply a function to every element in the stream Parameters ---------- func: callable *args : The arguments to pass to the function. **kwargs: Keyword arguments to pass to func Examples -------- >>> source = Stream() >>> source.map(lambda x: 2*x).sink(print) >>> for i in range(5): ... source.emit(i) 0 2 4 6 8 """ def __init__(self, upstream, func, *args, **kwargs): self.func = func # this is one of a few stream specific kwargs stream_name = kwargs.pop('stream_name', None) self.kwargs = kwargs self.args = args Stream.__init__(self, upstream, stream_name=stream_name) def update(self, x, who=None, metadata=None): try: result = self.func(x, *self.args, **self.kwargs) except Exception as e: logger.exception(e) raise else: return self._emit(result, metadata=metadata) @Stream.register_api() class starmap(Stream): """ Apply a function to every element in the stream, splayed out See ``itertools.starmap`` Parameters ---------- func: callable *args : The arguments to pass to the function. **kwargs: Keyword arguments to pass to func Examples -------- >>> source = Stream() >>> source.starmap(lambda a, b: a + b).sink(print) >>> for i in range(5): ... 
source.emit((i, i)) 0 2 4 6 8 """ def __init__(self, upstream, func, *args, **kwargs): self.func = func # this is one of a few stream specific kwargs stream_name = kwargs.pop('stream_name', None) self.kwargs = kwargs self.args = args Stream.__init__(self, upstream, stream_name=stream_name) def update(self, x, who=None, metadata=None): y = x + self.args try: result = self.func(*y, **self.kwargs) except Exception as e: logger.exception(e) raise else: return self._emit(result, metadata=metadata) def _truthy(x): return not not x @Stream.register_api() class filter(Stream): """ Only pass through elements that satisfy the predicate Parameters ---------- predicate : function The predicate. Should return True or False, where True means that the predicate is satisfied. *args : The arguments to pass to the predicate. **kwargs: Keyword arguments to pass to predicate Examples -------- >>> source = Stream() >>> source.filter(lambda x: x % 2 == 0).sink(print) >>> for i in range(5): ... source.emit(i) 0 2 4 """ def __init__(self, upstream, predicate, *args, **kwargs): if predicate is None: predicate = _truthy self.predicate = predicate stream_name = kwargs.pop("stream_name", None) self.kwargs = kwargs self.args = args Stream.__init__(self, upstream, stream_name=stream_name) def update(self, x, who=None, metadata=None): if self.predicate(x, *self.args, **self.kwargs): return self._emit(x, metadata=metadata) @Stream.register_api() class accumulate(Stream): """ Accumulate results with previous state This performs running or cumulative reductions, applying the function to the previous total and the new element. The function should take two arguments, the previous accumulated state and the next element and it should return a new accumulated state, - ``state = func(previous_state, new_value)`` (returns_state=False) - ``state, result = func(previous_state, new_value)`` (returns_state=True) where the new_state is passed to the next invocation. The state or result is emitted downstream for the two cases. Parameters ---------- func: callable start: object Initial value, passed as the value of ``previous_state`` on the first invocation. Defaults to the first submitted element returns_state: boolean If true then func should return both the state and the value to emit If false then both values are the same, and func returns one value **kwargs: Keyword arguments to pass to func Examples -------- A running total, producing triangular numbers >>> source = Stream() >>> source.accumulate(lambda acc, x: acc + x).sink(print) >>> for i in range(5): ... source.emit(i) 0 1 3 6 10 A count of number of events (including the current one) >>> source = Stream() >>> source.accumulate(lambda acc, x: acc + 1, start=0).sink(print) >>> for _ in range(5): ... source.emit(0) 1 2 3 4 5 Like the builtin "enumerate". >>> source = Stream() >>> source.accumulate(lambda acc, x: ((acc[0] + 1, x), (acc[0], x)), ... start=(0, 0), returns_state=True ... ).sink(print) >>> for i in range(3): ... 
source.emit(0) (0, 0) (1, 0) (2, 0) """ _graphviz_shape = 'box' def __init__(self, upstream, func, start=no_default, returns_state=False, **kwargs): self.func = func self.kwargs = kwargs self.state = start self.returns_state = returns_state # this is one of a few stream specific kwargs stream_name = kwargs.pop('stream_name', None) self.with_state = kwargs.pop('with_state', False) Stream.__init__(self, upstream, stream_name=stream_name) def update(self, x, who=None, metadata=None): if self.state is no_default: self.state = x if self.with_state: return self._emit((self.state, x), metadata=metadata) else: return self._emit(x, metadata=metadata) else: try: result = self.func(self.state, x, **self.kwargs) except Exception as e: logger.exception(e) raise if self.returns_state: state, result = result else: state = result self.state = state if self.with_state: return self._emit((self.state, result), metadata=metadata) else: return self._emit(result, metadata=metadata) @Stream.register_api() class slice(Stream): """ Get only some events in a stream by position. Works like list[] syntax. Parameters ---------- start : int First event to use. If None, start from the beginnning end : int Last event to use (non-inclusive). If None, continue without stopping. Does not support negative indexing. step : int Pass on every Nth event. If None, pass every one. Examples -------- >>> source = Stream() >>> source.slice(2, 6, 2).sink(print) >>> for i in range(5): ... source.emit(0) 2 4 """ def __init__(self, upstream, start=None, end=None, step=None, **kwargs): self.state = 0 self.star = start or 0 self.end = end self.step = step or 1 if any((_ or 0) < 0 for _ in [start, end, step]): raise ValueError("Negative indices not supported by slice") stream_name = kwargs.pop('stream_name', None) Stream.__init__(self, upstream, stream_name=stream_name) self._check_end() def update(self, x, who=None, metadata=None): if self.state >= self.star and self.state % self.step == 0: self.emit(x, metadata=metadata) self.state += 1 self._check_end() def _check_end(self): if self.end and self.state >= self.end: # we're done for upstream in self.upstreams: upstream._remove_downstream(self) @Stream.register_api() class partition(Stream): """ Partition stream into tuples of equal size Parameters ---------- n: int Maximum partition size timeout: int or float, optional Number of seconds after which a partition will be emitted, even if its size is less than ``n``. If ``None`` (default), a partition will be emitted only when its size reaches ``n``. key: hashable or callable, optional Emit items with the same key together as a separate partition. If ``key`` is callable, partition will be identified by ``key(x)``, otherwise by ``x[key]``. Defaults to ``None``. Examples -------- >>> source = Stream() >>> source.partition(3).sink(print) >>> for i in range(10): ... source.emit(i) (0, 1, 2) (3, 4, 5) (6, 7, 8) >>> source = Stream() >>> source.partition(2, key=lambda x: x % 2).sink(print) >>> for i in range(4): ... source.emit(i) (0, 2) (1, 3) >>> from time import sleep >>> source = Stream() >>> source.partition(5, timeout=1).sink(print) >>> for i in range(3): ... 
source.emit(i) >>> sleep(1) (0, 1, 2) """ _graphviz_shape = 'diamond' def __init__(self, upstream, n, timeout=None, key=None, **kwargs): self.n = n self._timeout = timeout self._key = key self._buffer = defaultdict(lambda: []) self._metadata_buffer = defaultdict(lambda: []) self._callbacks = {} kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) def _get_key(self, x): if self._key is None: return None if callable(self._key): return self._key(x) return x[self._key] @gen.coroutine def _flush(self, key): result, self._buffer[key] = self._buffer[key], [] metadata_result, self._metadata_buffer[key] = self._metadata_buffer[key], [] yield self._emit(tuple(result), list(metadata_result)) self._release_refs(metadata_result) @gen.coroutine def update(self, x, who=None, metadata=None): self._retain_refs(metadata) key = self._get_key(x) buffer = self._buffer[key] metadata_buffer = self._metadata_buffer[key] buffer.append(x) if isinstance(metadata, list): metadata_buffer.extend(metadata) else: metadata_buffer.append(metadata) if len(buffer) == self.n: if self._timeout is not None and self.n > 1: self._callbacks[key].cancel() yield self._flush(key) return if len(buffer) == 1 and self._timeout is not None: self._callbacks[key] = self.loop.call_later( self._timeout, self._flush, key ) @Stream.register_api() class partition_unique(Stream): """ Partition stream elements into groups of equal size with unique keys only. Parameters ---------- n: int Number of (unique) elements to pass through as a group. key: Union[Hashable, Callable[[Any], Hashable]] Callable that accepts a stream element and returns a unique, hashable representation of the incoming data (``key(x)``), or a hashable that gets the corresponding value of a stream element (``x[key]``). For example, ``key=lambda x: x["a"]`` would allow only elements with unique ``"a"`` values to pass through. .. note:: By default, we simply use the element object itself as the key, so that object must be hashable. If that's not the case, a non-default key must be provided. keep: str Which element to keep in the case that a unique key is already found in the group. If "first", keep element from the first occurrence of a given key; if "last", keep element from the most recent occurrence. Note that relative ordering of *elements* is preserved in the data passed through, and not ordering of *keys*. **kwargs Examples -------- >>> source = Stream() >>> stream = source.partition_unique(n=3, keep="first").sink(print) >>> eles = [1, 2, 1, 3, 1, 3, 3, 2] >>> for ele in eles: ... source.emit(ele) (1, 2, 3) (1, 3, 2) >>> source = Stream() >>> stream = source.partition_unique(n=3, keep="last").sink(print) >>> eles = [1, 2, 1, 3, 1, 3, 3, 2] >>> for ele in eles: ... source.emit(ele) (2, 1, 3) (1, 3, 2) >>> source = Stream() >>> stream = source.partition_unique(n=3, key=lambda x: len(x), keep="last").sink(print) >>> eles = ["f", "fo", "f", "foo", "f", "foo", "foo", "fo"] >>> for ele in eles: ... 
source.emit(ele) ('fo', 'f', 'foo') ('f', 'foo', 'fo') """ _graphviz_shape = "diamond" def __init__( self, upstream, n: int, key: Union[Hashable, Callable[[Any], Hashable]] = identity, keep: str = "first", # Literal["first", "last"] **kwargs ): self.n = n self.key = key self.keep = keep self._buffer = {} self._metadata_buffer = {} Stream.__init__(self, upstream, **kwargs) def _get_key(self, x): if callable(self.key): return self.key(x) else: return x[self.key] def update(self, x, who=None, metadata=None): self._retain_refs(metadata) y = self._get_key(x) if self.keep == "last": # remove key if already present so that emitted value # will reflect elements' actual relative ordering self._buffer.pop(y, None) self._metadata_buffer.pop(y, None) self._buffer[y] = x self._metadata_buffer[y] = metadata else: # self.keep == "first" if y not in self._buffer: self._buffer[y] = x self._metadata_buffer[y] = metadata if len(self._buffer) == self.n: result, self._buffer = tuple(self._buffer.values()), {} metadata_result, self._metadata_buffer = list(self._metadata_buffer.values()), {} ret = self._emit(result, metadata_result) self._release_refs(metadata_result) return ret else: return [] @Stream.register_api() class sliding_window(Stream): """ Produce overlapping tuples of size n Parameters ---------- return_partial : bool If True, yield tuples as soon as any events come in, each tuple being smaller or equal to the window size. If False, only start yielding tuples once a full window has accrued. Examples -------- >>> source = Stream() >>> source.sliding_window(3, return_partial=False).sink(print) >>> for i in range(8): ... source.emit(i) (0, 1, 2) (1, 2, 3) (2, 3, 4) (3, 4, 5) (4, 5, 6) (5, 6, 7) """ _graphviz_shape = 'diamond' def __init__(self, upstream, n, return_partial=True, **kwargs): self.n = n self._buffer = deque(maxlen=n) self.metadata_buffer = deque(maxlen=n) self.partial = return_partial Stream.__init__(self, upstream, **kwargs) def update(self, x, who=None, metadata=None): self._retain_refs(metadata) self._buffer.append(x) if not isinstance(metadata, list): metadata = [metadata] self.metadata_buffer.append(metadata) if self.partial or len(self._buffer) == self.n: flat_metadata = [m for ml in self.metadata_buffer for m in ml] ret = self._emit(tuple(self._buffer), flat_metadata) if len(self.metadata_buffer) == self.n: completed = self.metadata_buffer.popleft() self._release_refs(completed) return ret else: return [] def convert_interval(interval): if isinstance(interval, str): import pandas as pd interval = pd.Timedelta(interval).total_seconds() return interval @Stream.register_api() class timed_window(Stream): """ Emit a tuple of collected results every interval Every ``interval`` seconds this emits a tuple of all of the results seen so far. This can help to batch data coming off of a high-volume stream. 
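    Examples
    --------
    A minimal sketch (the exact grouping depends on timing, so the output
    is illustrative only):

    >>> source = Stream()  # doctest: +SKIP
    >>> source.timed_window('100ms').sink(print)  # doctest: +SKIP
    >>> for i in range(5):  # doctest: +SKIP
    ...     source.emit(i)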
""" _graphviz_shape = 'octagon' def __init__(self, upstream, interval, **kwargs): self.interval = convert_interval(interval) self._buffer = [] self.metadata_buffer = [] self.last = gen.moment kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) self.loop.add_callback(self.cb) def update(self, x, who=None, metadata=None): self._buffer.append(x) self._retain_refs(metadata) self.metadata_buffer.append(metadata) return self.last @gen.coroutine def cb(self): while True: L, self._buffer = self._buffer, [] metadata, self.metadata_buffer = self.metadata_buffer, [] m = [m for ml in metadata for m in ml] self.last = self._emit(L, m) self._release_refs(m) yield self.last yield gen.sleep(self.interval) @Stream.register_api() class timed_window_unique(Stream): """ Emit a group of elements with unique keys every ``interval`` seconds. Parameters ---------- interval: Union[int, str] Number of seconds over which to group elements, or a ``pandas``-style duration string that can be converted into seconds. key: Union[Hashable, Callable[[Any], Hashable]] Callable that accepts a stream element and returns a unique, hashable representation of the incoming data (``key(x)``), or a hashable that gets the corresponding value of a stream element (``x[key]``). For example, both ``key=lambda x: x["a"]`` and ``key="a"`` would allow only elements with unique ``"a"`` values to pass through. .. note:: By default, we simply use the element object itself as the key, so that object must be hashable. If that's not the case, a non-default key must be provided. keep: str Which element to keep in the case that a unique key is already found in the group. If "first", keep element from the first occurrence of a given key; if "last", keep element from the most recent occurrence. Note that relative ordering of *elements* is preserved in the data passed through, and not ordering of *keys*. Examples -------- >>> source = Stream() Get unique hashable elements in a window, keeping just the first occurrence: >>> stream = source.timed_window_unique(interval=1.0, keep="first").sink(print) >>> for ele in [1, 2, 3, 3, 2, 1]: ... source.emit(ele) () (1, 2, 3) () Get unique hashable elements in a window, keeping just the last occurrence: >>> stream = source.timed_window_unique(interval=1.0, keep="last").sink(print) >>> for ele in [1, 2, 3, 3, 2, 1]: ... source.emit(ele) () (3, 2, 1) () Get unique elements in a window by (string) length, keeping just the first occurrence: >>> stream = source.timed_window_unique(interval=1.0, key=len, keep="first") >>> for ele in ["f", "b", "fo", "ba", "foo", "bar"]: ... source.emit(ele) () ('f', 'fo', 'foo') () Get unique elements in a window by (string) length, keeping just the last occurrence: >>> stream = source.timed_window_unique(interval=1.0, key=len, keep="last") >>> for ele in ["f", "b", "fo", "ba", "foo", "bar"]: ... 
source.emit(ele) () ('b', 'ba', 'bar') () """ _graphviz_shape = "octagon" def __init__( self, upstream, interval: Union[int, str], key: Union[Hashable, Callable[[Any], Hashable]] = identity, keep: str = "first", # Literal["first", "last"] **kwargs ): self.interval = convert_interval(interval) self.key = key self.keep = keep self._buffer = {} self._metadata_buffer = {} self.last = gen.moment kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) self.loop.add_callback(self.cb) def _get_key(self, x): if callable(self.key): return self.key(x) else: return x[self.key] def update(self, x, who=None, metadata=None): self._retain_refs(metadata) y = self._get_key(x) if self.keep == "last": # remove key if already present so that emitted value # will reflect elements' actual relative ordering self._buffer.pop(y, None) self._metadata_buffer.pop(y, None) self._buffer[y] = x self._metadata_buffer[y] = metadata else: # self.keep == "first" if y not in self._buffer: self._buffer[y] = x self._metadata_buffer[y] = metadata return self.last @gen.coroutine def cb(self): while True: result, self._buffer = tuple(self._buffer.values()), {} metadata_result, self._metadata_buffer = list(self._metadata_buffer.values()), {} # TODO: figure out why metadata_result is handled differently here... m = [m for ml in metadata_result for m in ml] self.last = self._emit(result, m) self._release_refs(m) yield self.last yield gen.sleep(self.interval) @Stream.register_api() class delay(Stream): """ Add a time delay to results """ _graphviz_shape = 'octagon' def __init__(self, upstream, interval, **kwargs): self.interval = convert_interval(interval) self.queue = Queue() kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream,**kwargs) self.loop.add_callback(self.cb) @gen.coroutine def cb(self): while True: last = time() x, metadata = yield self.queue.get() yield self._emit(x, metadata=metadata) self._release_refs(metadata) duration = self.interval - (time() - last) if duration > 0: yield gen.sleep(duration) def update(self, x, who=None, metadata=None): self._retain_refs(metadata) return self.queue.put((x, metadata)) @Stream.register_api() class rate_limit(Stream): """ Limit the flow of data This stops two elements of streaming through in an interval shorter than the provided value. Parameters ---------- interval: float Time in seconds """ _graphviz_shape = 'octagon' def __init__(self, upstream, interval, **kwargs): self.interval = convert_interval(interval) self.next = 0 kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) @gen.coroutine def update(self, x, who=None, metadata=None): now = time() old_next = self.next self.next = max(now, self.next) + self.interval if now < old_next: yield gen.sleep(old_next - now) yield self._emit(x, metadata=metadata) @Stream.register_api() class buffer(Stream): """ Allow results to pile up at this point in the stream This allows results to buffer in place at various points in the stream. This can help to smooth flow through the system when backpressure is applied. 
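    Examples
    --------
    A minimal sketch, where ``slow`` is an illustrative placeholder for any
    expensive function; up to five elements can pile up in the buffer while
    the slow step downstream catches up:

    >>> source = Stream()  # doctest: +SKIP
    >>> source.buffer(5).map(slow).sink(print)  # doctest: +SKIP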
""" _graphviz_shape = 'diamond' def __init__(self, upstream, n, **kwargs): self.queue = Queue(maxsize=n) kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) self.loop.add_callback(self.cb) def update(self, x, who=None, metadata=None): self._retain_refs(metadata) return self.queue.put((x, metadata)) @gen.coroutine def cb(self): while True: x, metadata = yield self.queue.get() yield self._emit(x, metadata=metadata) self._release_refs(metadata) @Stream.register_api() class zip(Stream): """ Combine streams together into a stream of tuples We emit a new tuple once all streams have produce a new tuple. See also -------- combine_latest zip_latest """ _graphviz_orientation = 270 _graphviz_shape = 'triangle' def __init__(self, *upstreams, **kwargs): self.maxsize = kwargs.pop('maxsize', 10) self.condition = Condition() self.literals = [(i, val) for i, val in enumerate(upstreams) if not isinstance(val, Stream)] self.buffers = {upstream: deque() for upstream in upstreams if isinstance(upstream, Stream)} upstreams2 = [upstream for upstream in upstreams if isinstance(upstream, Stream)] Stream.__init__(self, upstreams=upstreams2, **kwargs) def _add_upstream(self, upstream): # Override method to handle setup of buffer for new stream self.buffers[upstream] = deque() super(zip, self)._add_upstream(upstream) def _remove_upstream(self, upstream): # Override method to handle removal of buffer for stream self.buffers.pop(upstream) super(zip, self)._remove_upstream(upstream) def pack_literals(self, tup): """ Fill buffers for literals whenever we empty them """ inp = list(tup)[::-1] out = [] for i, val in self.literals: while len(out) < i: out.append(inp.pop()) out.append(val) while inp: out.append(inp.pop()) return tuple(out) def update(self, x, who=None, metadata=None): self._retain_refs(metadata) L = self.buffers[who] # get buffer for stream L.append((x, metadata)) if len(L) == 1 and all(self.buffers.values()): vals = [self.buffers[up][0] for up in self.upstreams] tup, md = __builtins__['zip'](*vals) for buf in self.buffers.values(): buf.popleft() self.condition.notify_all() if self.literals: tup = self.pack_literals(tup) md = [m for ml in md for m in ml] ret = self._emit(tup, md) self._release_refs(md) return ret elif len(L) > self.maxsize: return self.condition.wait() @Stream.register_api() class combine_latest(Stream): """ Combine multiple streams together to a stream of tuples This will emit a new tuple of all of the most recent elements seen from any stream. Parameters ---------- emit_on : stream or list of streams or None only emit upon update of the streams listed. 
If None, emit on update from any stream See Also -------- zip """ _graphviz_orientation = 270 _graphviz_shape = 'triangle' def __init__(self, *upstreams, **kwargs): emit_on = kwargs.pop('emit_on', None) self._initial_emit_on = emit_on self.last = [None for _ in upstreams] self.metadata = [None for _ in upstreams] self.missing = set(upstreams) if emit_on is not None: if not isinstance(emit_on, Iterable): emit_on = (emit_on, ) emit_on = tuple( upstreams[x] if isinstance(x, int) else x for x in emit_on) self.emit_on = emit_on else: self.emit_on = upstreams Stream.__init__(self, upstreams=upstreams, **kwargs) def _add_upstream(self, upstream): # Override method to handle setup of last and missing for new stream self.last.append(None) self.metadata.append(None) self.missing.update([upstream]) super(combine_latest, self)._add_upstream(upstream) if self._initial_emit_on is None: self.emit_on = self.upstreams def _remove_upstream(self, upstream): # Override method to handle removal of last and missing for stream if self.emit_on == upstream: raise RuntimeError("Can't remove the ``emit_on`` stream since that" "would cause no data to be emitted. " "Consider adding an ``emit_on`` first by " "running ``node.emit_on=(upstream,)`` to add " "a new ``emit_on`` or running " "``node.emit_on=tuple(node.upstreams)`` to " "emit on all incoming data") self.last.pop(self.upstreams.index(upstream)) self.metadata.pop(self.upstreams.index(upstream)) self.missing.remove(upstream) super(combine_latest, self)._remove_upstream(upstream) if self._initial_emit_on is None: self.emit_on = self.upstreams def update(self, x, who=None, metadata=None): self._retain_refs(metadata) idx = self.upstreams.index(who) if self.metadata[idx]: self._release_refs(self.metadata[idx]) self.metadata[idx] = metadata if self.missing and who in self.missing: self.missing.remove(who) self.last[idx] = x if not self.missing and who in self.emit_on: tup = tuple(self.last) md = [m for ml in self.metadata for m in ml] return self._emit(tup, md) @Stream.register_api() class flatten(Stream): """ Flatten streams of lists or iterables into a stream of elements Examples -------- >>> source = Stream() >>> source.flatten().sink(print) >>> for x in [[1, 2, 3], [4, 5], [6, 7, 7]]: ... source.emit(x) 1 2 3 4 5 6 7 See Also -------- partition """ def update(self, x, who=None, metadata=None): L = [] for i, item in enumerate(x): if i == len(x) - 1: y = self._emit(item, metadata=metadata) else: y = self._emit(item) if type(y) is list: L.extend(y) else: L.append(y) return L @Stream.register_api() class unique(Stream): """ Avoid sending through repeated elements This deduplicates a stream so that only new elements pass through. You can control how much of a history is stored with the ``maxsize=`` parameter. For example setting ``maxsize=1`` avoids sending through elements when one is repeated right after the other. Parameters ---------- maxsize: int or None, optional number of stored unique values to check against key : function, optional Function which returns a representation of the incoming data. For example ``key=lambda x: x['a']`` could be used to allow only pieces of data with unique ``'a'`` values to pass through. hashable : bool, optional If True then data is assumed to be hashable, else it is not. This is used for determining how to cache the history, if hashable then either dicts or LRU caches are used, otherwise a deque is used. Defaults to True. 
Examples -------- >>> source = Stream() >>> source.unique(maxsize=1).sink(print) >>> for x in [1, 1, 2, 2, 2, 1, 3]: ... source.emit(x) 1 2 1 3 """ def __init__(self, upstream, maxsize=None, key=identity, hashable=True, **kwargs): self.key = key self.maxsize = maxsize if hashable: self.seen = dict() if self.maxsize: from zict import LRU self.seen = LRU(self.maxsize, self.seen) else: self.seen = [] Stream.__init__(self, upstream, **kwargs) def update(self, x, who=None, metadata=None): y = self.key(x) emit = True if isinstance(self.seen, list): if y in self.seen: self.seen.remove(y) emit = False self.seen.insert(0, y) if self.maxsize: del self.seen[self.maxsize:] if emit: return self._emit(x, metadata=metadata) else: if self.seen.get(y, '~~not_seen~~') == '~~not_seen~~': self.seen[y] = 1 return self._emit(x, metadata=metadata) @Stream.register_api() class union(Stream): """ Combine multiple streams into one Every element from any of the upstreams streams will immediately flow into the output stream. They will not be combined with elements from other streams. See also -------- Stream.zip Stream.combine_latest """ def __init__(self, *upstreams, **kwargs): super(union, self).__init__(upstreams=upstreams, **kwargs) def update(self, x, who=None, metadata=None): return self._emit(x, metadata=metadata) @Stream.register_api() class pluck(Stream): """ Select elements from elements in the stream. Parameters ---------- pluck : object, list The element(s) to pick from the incoming element in the stream If an instance of list, will pick multiple elements. Examples -------- >>> source = Stream() >>> source.pluck([0, 3]).sink(print) >>> for x in [[1, 2, 3, 4], [4, 5, 6, 7], [8, 9, 10, 11]]: ... source.emit(x) (1, 4) (4, 7) (8, 11) >>> source = Stream() >>> source.pluck('name').sink(print) >>> for x in [{'name': 'Alice', 'x': 123}, {'name': 'Bob', 'x': 456}]: ... source.emit(x) 'Alice' 'Bob' """ def __init__(self, upstream, pick, **kwargs): self.pick = pick super(pluck, self).__init__(upstream, **kwargs) def update(self, x, who=None, metadata=None): if isinstance(self.pick, list): return self._emit(tuple([x[ind] for ind in self.pick]), metadata=metadata) else: return self._emit(x[self.pick], metadata=metadata) @Stream.register_api() class collect(Stream): """ Hold elements in a cache and emit them as a collection when flushed. Examples -------- >>> source1 = Stream() >>> source2 = Stream() >>> collector = collect(source1) >>> collector.sink(print) >>> source2.sink(collector.flush) >>> source1.emit(1) >>> source1.emit(2) >>> source2.emit('anything') # flushes collector ... [1, 2] """ def __init__(self, upstream, cache=None, metadata_cache=None, **kwargs): if cache is None: cache = deque() self.cache = cache if metadata_cache is None: metadata_cache = deque() self.metadata_cache = metadata_cache Stream.__init__(self, upstream, **kwargs) def update(self, x, who=None, metadata=None): self._retain_refs(metadata) self.cache.append(x) if metadata: if isinstance(metadata, list): self.metadata_cache.extend(metadata) else: self.metadata_cache.append(metadata) def flush(self, _=None): out = tuple(self.cache) metadata = list(self.metadata_cache) self._emit(out, metadata) self._release_refs(metadata) self.cache.clear() self.metadata_cache.clear() @Stream.register_api() class zip_latest(Stream): """Combine multiple streams together to a stream of tuples The stream which this is called from is lossless. All elements from the lossless stream are emitted reguardless of when they came in. 
This will emit a new tuple consisting of an element from the lossless stream paired with the latest elements from the other streams. Elements are only emitted when an element on the lossless stream are received, similar to ``combine_latest`` with the ``emit_on`` flag. See Also -------- Stream.combine_latest Stream.zip """ def __init__(self, lossless, *upstreams, **kwargs): upstreams = (lossless,) + upstreams self.last = [None for _ in upstreams] self.metadata = [None for _ in upstreams] self.missing = set(upstreams) self.lossless = lossless self.lossless_buffer = deque() Stream.__init__(self, upstreams=upstreams, **kwargs) def update(self, x, who=None, metadata=None): self._retain_refs(metadata) idx = self.upstreams.index(who) if who is self.lossless: self.lossless_buffer.append((x, metadata)) elif self.metadata[idx]: self._release_refs(self.metadata[idx]) self.metadata[idx] = metadata self.last[idx] = x if self.missing and who in self.missing: self.missing.remove(who) if not self.missing: L = [] while self.lossless_buffer: self.last[0], self.metadata[0] = self.lossless_buffer.popleft() md = [m for ml in self.metadata for m in ml] L.append(self._emit(tuple(self.last), md)) self._release_refs(self.metadata[0]) return L @Stream.register_api() class latest(Stream): """ Drop held-up data and emit the latest result This allows you to skip intermediate elements in the stream if there is some back pressure causing a slowdown. Use this when you only care about the latest elements, and are willing to lose older data. This passes through values without modification otherwise. Examples -------- >>> source.map(f).latest().map(g) # doctest: +SKIP """ _graphviz_shape = 'octagon' def __init__(self, upstream, **kwargs): self.condition = Condition() self.next = [] self.next_metadata = None kwargs["ensure_io_loop"] = True Stream.__init__(self, upstream, **kwargs) self.loop.add_callback(self.cb) def update(self, x, who=None, metadata=None): if self.next_metadata: self._release_refs(self.next_metadata) self._retain_refs(metadata) self.next = [x] self.next_metadata = metadata self.loop.add_callback(self.condition.notify) @gen.coroutine def cb(self): while True: yield self.condition.wait() [x] = self.next yield self._emit(x, self.next_metadata) def sync(loop, func, *args, **kwargs): """ Run coroutine in loop running in separate thread. """ # This was taken from distrbuted/utils.py timeout = kwargs.pop('callback_timeout', None) e = threading.Event() main_tid = get_thread_identity() result = [None] error = [False] @gen.coroutine def f(): try: if main_tid == get_thread_identity(): raise RuntimeError("sync() called from thread of running loop") yield gen.moment thread_state.asynchronous = True future = func(*args, **kwargs) if timeout is not None: future = gen.with_timeout(timedelta(seconds=timeout), future) result[0] = yield future except Exception: error[0] = sys.exc_info() finally: thread_state.asynchronous = False e.set() loop.add_callback(f) if timeout is not None: if not e.wait(timeout): raise gen.TimeoutError("timed out after %s s." 
% (timeout,)) else: while not e.is_set(): e.wait(10) if error[0]: six.reraise(*error[0]) else: return result[0] streamz-0.6.4/streamz/collection.py0000644000175000017500000002177114270277270016746 0ustar nileshnileshimport operator import types from streamz import Stream, core _stream_types = {'streaming': [], 'updating': []} def map_partitions(func, *args, **kwargs): """ Map a function across all batch elements of this stream The output stream type will be determined by the action of that function on the example See Also -------- Streaming.accumulate_partitions """ example = kwargs.pop('example', None) if example is None: example = func(*[getattr(arg, 'example', arg) for arg in args], **kwargs) streams = [arg for arg in args if isinstance(arg, Streaming)] if 'stream_type' in kwargs: stream_type = kwargs['stream_type'] else: stream_type = ('streaming' if any(s._stream_type == 'streaming' for s in streams) else 'updating') if len(streams) > 1: stream = type(streams[0].stream).zip(*[getattr(arg, 'stream', arg) for arg in args]) stream = stream.map(apply_args, func, kwargs) else: s = streams[0] if isinstance(args[0], Streaming): stream = s.stream.map(func, *args[1:], **kwargs) else: other = [(i, arg) for i, arg in enumerate(args) if not isinstance(arg, Streaming)] stream = s.stream.map(partial_by_order, function=func, other=other, **kwargs) s_type = get_stream_type(example, stream_type) if s_type: return s_type(stream, example) return Streaming(stream, example, stream_type=stream_type) class OperatorMixin(object): def __abs__(self): return self.map_partitions(operator.abs, self) def __add__(self, other): return self.map_partitions(operator.add, self, other) def __radd__(self, other): return self.map_partitions(operator.add, other, self) def __and__(self, other): return self.map_partitions(operator.and_, self, other) def __rand__(self, other): return self.map_partitions(operator.and_, other, self) def __eq__(self, other): return self.map_partitions(operator.eq, self, other) def __floordiv__(self, other): return self.map_partitions(operator.floordiv, self, other) def __rfloordiv__(self, other): return self.map_partitions(operator.floordiv, other, self) def __ge__(self, other): return self.map_partitions(operator.ge, self, other) def __gt__(self, other): return self.map_partitions(operator.gt, self, other) def __inv__(self): return self.map_partitions(operator.inv, self) def __invert__(self): return self.map_partitions(operator.invert, self) def __le__(self, other): return self.map_partitions(operator.le, self, other) def __lshift__(self, other): return self.map_partitions(operator.lshift, self, other) def __rlshift__(self, other): return self.map_partitions(operator.lshift, other, self) def __lt__(self, other): return self.map_partitions(operator.lt, self, other) def __mod__(self, other): return self.map_partitions(operator.mod, self, other) def __rmod__(self, other): return self.map_partitions(operator.mod, other, self) def __mul__(self, other): return self.map_partitions(operator.mul, self, other) def __rmul__(self, other): return self.map_partitions(operator.mul, other, self) def __ne__(self, other): return self.map_partitions(operator.ne, self, other) def __neg__(self): return self.map_partitions(operator.neg, self) def __or__(self, other): return self.map_partitions(operator.or_, self, other) def __ror__(self, other): return self.map_partitions(operator.or_, other, self) def __pow__(self, other): return self.map_partitions(operator.pow, self, other) def __rpow__(self, other): return 
self.map_partitions(operator.pow, other, self) def __rshift__(self, other): return self.map_partitions(operator.rshift, self, other) def __rrshift__(self, other): return self.map_partitions(operator.rshift, other, self) def __sub__(self, other): return self.map_partitions(operator.sub, self, other) def __rsub__(self, other): return self.map_partitions(operator.sub, other, self) def __truediv__(self, other): return self.map_partitions(operator.truediv, self, other) def __rtruediv__(self, other): return self.map_partitions(operator.truediv, other, self) def __xor__(self, other): return self.map_partitions(operator.xor, self, other) def __rxor__(self, other): return self.map_partitions(operator.xor, other, self) class Streaming(OperatorMixin, core.APIRegisterMixin): """ Superclass for streaming collections Do not create this class directly, use one of the subclasses instead. Parameters ---------- stream: streamz.Stream example: object An object to represent an example element of this stream See also -------- streamz.dataframe.StreamingDataFrame streamz.dataframe.StreamingBatch """ _subtype = object _stream_type = 'streaming' map_partitions = staticmethod(map_partitions) def __init__(self, stream=None, example=None, stream_type=None): assert example is not None self.example = example if not isinstance(self.example, self._subtype): self.example = self._subtype(example) assert isinstance(self.example, self._subtype) self.stream = stream or Stream() if stream_type: if stream_type not in ['streaming', 'updating']: raise Exception() self._stream_type = stream_type def accumulate_partitions(self, func, *args, **kwargs): """ Accumulate a function with state across batch elements See Also -------- Streaming.map_partitions """ start = kwargs.pop('start', core.no_default) returns_state = kwargs.pop('returns_state', False) example = kwargs.pop('example', None) stream_type = kwargs.pop('stream_type', self._stream_type) if example is None: example = func(start, self.example, *args, **kwargs) if returns_state: _, example = example stream = self.stream.accumulate(func, *args, start=start, returns_state=returns_state, **kwargs) s_type = get_stream_type(example, stream_type) if s_type: return s_type(stream, example) return Streaming(stream, example, stream_type=stream_type) def __repr__(self): example = self.example if hasattr(example, 'head'): example = example.head(2) return "%s - elements like:\n%r" % (type(self).__name__, example) def _repr_html_(self): example = self.example if hasattr(example, 'head'): example = example.head(2) try: body = example._repr_html_() except AttributeError: body = repr(example) return "
%s - elements like
\n%s" % (type(self).__name__, body) @property def current_value(self): return self.stream.current_value def start(self): self.stream.start() def stop(self): self.stream.stop() def _ipython_display_(self, **kwargs): try: from ipywidgets import Output # noqa: F401 return self.stream.latest().rate_limit( 0.5).gather()._ipython_display_(**kwargs) except ImportError: # since this function is only called by jupyter, this import must succeed from IPython.display import display, HTML if hasattr(self, '_repr_html_'): return display(HTML(self._repr_html_())) else: return display(self.__repr__()) def emit(self, x): self.verify(x) self.stream.emit(x) def verify(self, x): """ Verify elements that pass through this stream """ if not isinstance(x, self._subtype): raise TypeError("Expected type %s, got type %s" % (self._subtype, type(x))) def get_stream_type(example, stream_type='streaming'): for typ, s_type in _stream_types[stream_type]: if isinstance(typ, types.FunctionType): """For Frame like objects we use utility functions to check type. i.e, DataFrame like objects are checked using is_dataframe_like.""" if typ(example): return s_type elif isinstance(example, typ): return s_type return None def partial_by_order(*args, **kwargs): """ >>> from operator import add >>> partial_by_order(5, function=add, other=[(1, 10)]) 15 """ function = kwargs.pop('function') other = kwargs.pop('other') args2 = list(args) for i, arg in other: args2.insert(i, arg) return function(*args2, **kwargs) def apply_args(args, func, kwargs): return func(*args, **kwargs) streamz-0.6.4/streamz/batch.py0000644000175000017500000000452014270277270015665 0ustar nileshnileshfrom .collection import Streaming, _stream_types import toolz import toolz.curried class Batch(Streaming): """ A Stream of tuples or lists This streaming collection manages batches of Python objects such as lists of text or dictionaries. By batching many elements together we reduce overhead from Python. 
This library is typically used at the early stages of data ingestion before handing off to streaming dataframes Examples -------- >>> text = Streaming.from_file(myfile) # doctest: +SKIP >>> b = text.partition(100).map(json.loads) # doctest: +SKIP """ def __init__(self, stream=None, example=None): if example is None: example = [] super(Batch, self).__init__(stream=stream, example=example) def sum(self): """ Sum elements """ return self.accumulate_partitions(_accumulate_sum, start=0) def filter(self, predicate): """ Filter elements by a predicate """ return self.map_partitions(_filter, self, predicate) def pluck(self, ind): """ Pick a field out of all elements Example ------- >>> s.pluck('name').sink(print) # doctest: +SKIP >>> s.emit({'name': 'Alice', 'x': 123}) # doctest: +SKIP 'Alice' """ return self.map_partitions(_pluck, self, ind) def map(self, func, **kwargs): """ Map a function across all elements """ return self.map_partitions(_map_map, self, func, **kwargs) def to_dataframe(self): """ Convert to a streaming dataframe This calls ``pd.DataFrame`` on all list-elements of this stream """ import pandas as pd import streamz.dataframe # noqa: F401 return self.map_partitions(pd.DataFrame, self) def to_stream(self): """ Concatenate batches and return base Stream Returned stream will be composed of single elements """ return self.stream.flatten() def _filter(seq, predicate): return list(filter(predicate, seq)) def _pluck(seq, ind): return list(toolz.pluck(ind, seq)) def _map_map(seq, func, **kwargs): return list(map(func, seq, **kwargs)) def _accumulate_sum(accumulator, new): return accumulator + sum(new) map_type = type(map(lambda x: x, [])) _stream_types['streaming'].append(((list, tuple, set), Batch)) streamz-0.6.4/streamz/__init__.py0000644000175000017500000000046314270277270016345 0ustar nileshnileshfrom __future__ import absolute_import, division, print_function from .core import * from .graph import * from .sources import * from .sinks import * from .plugins import load_plugins load_plugins(Stream) try: from .dask import DaskStream, scatter except ImportError: pass __version__ = '0.6.4' streamz-0.6.4/setup.py0000755000175000017500000000127514270277270014266 0ustar nileshnilesh#!/usr/bin/env python from os.path import exists from setuptools import setup packages = ['streamz', 'streamz.dataframe'] tests = [p + '.tests' for p in packages] setup(name='streamz', version='0.6.4', description='Streams', url='http://github.com/python-streamz/streamz/', maintainer='Matthew Rocklin', maintainer_email='mrocklin@gmail.com', license='BSD', keywords='streams', packages=packages + tests, python_requires='>=3.7', long_description=(open('README.rst').read() if exists('README.rst') else ''), install_requires=list(open('requirements.txt').read().strip().split('\n')), zip_safe=False) streamz-0.6.4/setup.cfg0000644000175000017500000000216714270277270014373 0ustar nileshnilesh[flake8] # References: # https://flake8.readthedocs.io/en/latest/user/configuration.html # https://flake8.readthedocs.io/en/latest/user/error-codes.html # Note: there cannot be spaces after comma's here exclude = __init__.py,tests ignore = # Extra space in brackets E20, # Multiple spaces around "," E231,E241, # Comments E26, # Import formatting E4, # Comparing types instead of isinstance E721, # Assigning lambda expression E731, # continuation line under-indented for hanging indent E121, # continuation line over-indented for hanging indent E126, # continuation line over-indented for visual indent E127, # E128 continuation line 
under-indented for visual indent E128, # multiple statements on one line (semicolon) E702, # line break before binary operator W503, # visually indented line with same indent as next logical line E129, # unexpected indentation E116 max-line-length = 120 [bdist_wheel] universal=1 [tool:pytest] markers: network: Test requires an internet connection slow: Skipped unless --runslow passed streamz-0.6.4/requirements.txt0000644000175000017500000000004114270277270016023 0ustar nileshnileshtornado toolz zict six setuptoolsstreamz-0.6.4/examples/0000755000175000017500000000000014270277270014362 5ustar nileshnileshstreamz-0.6.4/examples/scrape.py0000644000175000017500000000277414270277270016223 0ustar nileshnileshfrom __future__ import print_function from time import sleep import sys from BeautifulSoup import BeautifulSoup # Python 2 only, sorry. import requests from streamz import Stream import toolz import urlparse def links_of_page((content, page)): uri = urlparse.urlparse(page) domain = '%s://%s' % (uri.scheme, uri.netloc) try: soup = BeautifulSoup(content) except: return [] else: links = [link.get('href') for link in soup.findAll('a')] return [domain + link for link in links if link and link.startswith('/') and '?' not in link and link != '/'] def topk_dict(d, k=10): return dict(toolz.topk(k, d.items(), key=lambda x: x[1])) source = Stream() pages = source.unique() pages.sink(print) content = (pages.map(requests.get) .map(lambda x: x.content)) links = (content.zip(pages) .map(links_of_page) .concat()) links.sink(source.emit) """ from nltk.corpus import stopwords stopwords = set(stopwords.words('english')) word_counts = (content.map(str.split) .concat() .filter(str.isalpha) .remove(stopwords.__contains__) .frequencies()) top_words = (word_counts.map(topk_dict, k=10) .map(frozenset) .unique(history=10)) top_words.sink(print) """ if len(sys.argv) > 1: source.emit(sys.argv[1]) # streamz-0.6.4/examples/river_kmeans.py0000644000175000017500000000737614270277270017436 0ustar nileshnileshimport functools import random import time import pandas as pd from streamz import Stream import hvplot.streamz from streamz.river import RiverTrain from river import cluster import holoviews as hv from panel.pane.holoviews import HoloViews import panel as pn hv.extension('bokeh') model = cluster.KMeans(n_clusters=3, sigma=0.1, mu=0.5) centres = [[random.random(), random.random()] for _ in range(3)] count = [0] def gen(move_chance=0.05): centre = int(random.random() * 3) # 3x faster than random.randint(0, 2) if random.random() < move_chance: centres[centre][0] += random.random() / 5 - 0.1 centres[centre][1] += random.random() / 5 - 0.1 value = {'x': random.random() / 20 + centres[centre][0], 'y': random.random() / 20 + centres[centre][1]} count[0] += 1 return value def get_clusters(model): # return [{"x": xcen, "y": ycen}, ...] 
for each centre data = [{'x': v['x'], 'y': v['y']} for k, v in model.centers.items()] return pd.DataFrame(data, index=range(3)) def main(viz=True): cadance = 0.01 ex = pd.DataFrame({'x': [0.5], 'y': [0.5]}) pipe_in = hv.streams.Pipe(data=ex) pipe_out = hv.streams.Pipe(data=ex) # setup pipes s = Stream.from_periodic(gen, cadance) # Branch 0: Input/Observations obs = s.map(lambda x: pd.DataFrame([x])) # Branch 1: Output/River ML clusters km = RiverTrain(model, pass_model=True) s.map(lambda x: (x,)).connect(km) # learn takes a tuple of (x,[ y[, w]]) clusters = km.map(get_clusters) concat = functools.partial(pd.concat, ignore_index=True) def accumulate(previous, new, last_lines=50): return concat([previous, new]).iloc[-last_lines:, :] partition_obs = 10 particion_clusters = 10 backlog_obs = 100 # .partition is used to gather x number of points # before sending them to the plots # .accumulate allows to generate a backlog ( obs .partition(partition_obs) .map(concat) .accumulate(functools.partial(accumulate, last_lines=backlog_obs)) .sink(pipe_in.send) ) ( clusters .partition(particion_clusters) .map(pd.concat) .sink(pipe_out.send) ) # start things s.emit(gen()) # set initial model for i, (x, y) in enumerate(centres): model.centers[i]['x'] = x model.centers[i]['y'] = y print("starting") if viz: # plot button_start = pn.widgets.Button(name='Start') button_stop = pn.widgets.Button(name='Stop') t0 = 0 def start(event): s.start() global t0 t0 = time.time() def stop(event): print(count, "events") global t0 t_spent = time.time() - t0 print("frequency", count[0] / t_spent, "Hz") print("Current centres", centres) print("Output centres", [list(c.values()) for c in model.centers.values()]) s.stop() button_start.on_click(start) button_stop.on_click(stop) scatter_dmap_input = hv.DynamicMap(hv.Scatter, streams=[pipe_in]).opts(color="blue") scatter_dmap_output = hv.DynamicMap(hv.Scatter, streams=[pipe_out]).opts(color="red") pl = scatter_dmap_input * scatter_dmap_output pl.opts(xlim=(-0.2, 1.2), ylim=(-0.2, 1.2), height=600, width=600) pan = HoloViews(pl) app = pn.Row(pn.Column(button_start, button_stop), pan) app.show() else: s.start() time.sleep(5) print(count, "events") print("frequency", count[0] / 5, "Hz") print("Current centres", centres) print("Output centres", [list(c.values()) for c in model.centers.values()]) s.stop() if __name__ == "__main__": main(viz=True) streamz-0.6.4/examples/river_kmeans.ipynb0000644000175000017500000000661214270277270020117 0ustar nileshnilesh{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "accbccab", "metadata": {}, "outputs": [], "source": [ "import random\n", "\n", "import pandas as pd\n", "\n", "from streamz import Stream\n", "import hvplot.streamz\n", "from streamz.river import RiverTrain\n", "from river import cluster\n", "import holoviews as hv\n", "from panel.pane.holoviews import HoloViews\n", "hv.extension('bokeh')" ] }, { "cell_type": "code", "execution_count": null, "id": "8a2ef27a", "metadata": {}, "outputs": [], "source": [ "model = cluster.KMeans(n_clusters=3, sigma=0.1, mu=0.5)\n", "centres = [[random.random(), random.random()] for _ in range(3)]\n", "\n", "def gen(move_chance=0.05):\n", " centre = int(random.random() * 3) # 3x faster than random.randint(0, 2)\n", " if random.random() < move_chance:\n", " centres[centre][0] += random.random() / 5 - 0.1\n", " centres[centre][1] += random.random() / 5 - 0.1\n", " value = {'x': random.random() / 20 + centres[centre][0],\n", " 'y': random.random() / 20 + centres[centre][1]}\n", " return 
value\n", "\n", "\n", "def get_clusters(model):\n", " # return [{\"x\": xcen, \"y\": ycen}, ...] for each centre\n", " data = [{'x': v['x'], 'y': v['y']} for k, v in model.centers.items()]\n", " return pd.DataFrame(data, index=range(3))" ] }, { "cell_type": "code", "execution_count": null, "id": "e6451048", "metadata": {}, "outputs": [], "source": [ "s = Stream.from_periodic(gen, 0.03)\n", "km = RiverTrain(model, pass_model=True)\n", "s.map(lambda x: (x,)).connect(km) # learn takes a tuple of (x,[ y[, w]])\n", "ex = pd.DataFrame({'x': [0.5], 'y': [0.5]})\n", "ooo = s.map(lambda x: pd.DataFrame([x])).to_dataframe(example=ex)\n", "out = km.map(get_clusters)\n", "\n", "# start things\n", "s.emit(gen()) # set initial model\n", "for i, (x, y) in enumerate(centres):\n", " model.centers[i]['x'] = x\n", " model.centers[i]['y'] = y\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1b4de451", "metadata": {}, "outputs": [], "source": [ "pout = out.to_dataframe(example=ex)\n", "pl = (ooo.hvplot.scatter('x', 'y', color=\"blue\", backlog=50) *\n", " pout.hvplot.scatter('x', 'y', color=\"red\", backlog=3))\n", "pl.opts(xlim=(-0.2, 1.2), ylim=(-0.2, 1.2), height=600, width=600)\n", "pl" ] }, { "cell_type": "code", "execution_count": null, "id": "c24d2363", "metadata": {}, "outputs": [], "source": [ "s.start()" ] }, { "cell_type": "code", "execution_count": null, "id": "18cfd94e", "metadata": {}, "outputs": [], "source": [ "s.stop()" ] }, { "cell_type": "code", "execution_count": null, "id": "4537495c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 } streamz-0.6.4/examples/plot_graph.py0000644000175000017500000000043314270277270017073 0ustar nileshnileshfrom streamz import Stream from operator import add source1 = Stream(stream_name='source1') source2 = Stream(stream_name='source2') source3 = Stream(stream_name='awesome source') n1 = source1.zip(source2) n2 = n1.map(add) n3 = n2.zip(source3) L = n3.sink_to_list() n2.visualize() streamz-0.6.4/examples/network_wordcount.py0000644000175000017500000000070114270277270020527 0ustar nileshnilesh#! /usr/env python """ a recreation of spark-streaming's network_wordcount https://spark.apache.org/docs/2.2.0/streaming-programming-guide.html#a-quick-example """ import time from streamz import Stream # absolute port on localhost for now s = Stream.from_tcp(9999) s.map(bytes.split).flatten().frequencies().sink(print) print( """In another terminal execute > nc 127.0.0.1 9999 and then start typing content """ ) s.start() time.sleep(600) streamz-0.6.4/examples/mystream.png0000644000175000017500000010700314270277270016732 0ustar nileshnileshPNG  IHDRD"̊ bKGD IDATxwX7.KH)*ET,  $DŒ59ؒ&1&9)rQ1EM-{JSAD:Y=e ({k/pvvg;<#`1B!hB+ B"B!h= DBz" ]2 UUU\.\.GeeeyKKK%" 577P( ajj ===kl[i d(--Emm-4>Q[[`u>###C(ܜk;fffi Dt`UUUAAA=zDH$F_ըBUUd2/5bї%:u{{{I!PXX\5N귡>pmN:H@w>!33B^^󑛛4A4!abb]]]bӃ@ ht'Z!QT(++@*2 JAyy9jjj pujjjԖmaa{{{p?t邮]ԩSKI;T^^εL檵B}FOOP^7.򐗗dgg#//;]ajj]\\ OOOxxx "/"77)))HMMEFFZ)))!;;;X[[ؘǭiy* \{Y ÇypttTk7nnn;yުAr$''#99IIIHKKCrr2ݻL@Q%^M$R[[233(((: xxxgϞ@޽ѽ{t~eee͛\k;KKKtM899qɞ1<ܿk+<@ff&7MP@GG...,лwoA$q$$$pD@$k׮ٳ'ꊞ={o߾\;ь2ܻwHJJR Jӧ|}}兞={zzz|rssL||<bb1fWWWr9>|ȵsuTUUAGGڍ,--.i(IFdd$qMro߾^V{WSS۷o#11HHH[ `dd___! 
<(%?/{e7M$~ XL<cff߿G$~h4*MR~~>j5Z.Z8Z-JKK!߁(E"%) O333bIh4*vAG,R ɖ< ^Ij2 jrEEEBY~~?vwxuq"J DDDD"""Ս2e, """/ {i0IENDB`streamz-0.6.4/examples/iterators-and-streams.ipynb0000644000175000017500000002320214270277270021654 0ustar nileshnilesh{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Finite Sequences\n", "======" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import json" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = ['{\"name\": \"Alice\", \"value\": 1}',\n", " '{\"name\": \"Bob\", \"value\": 2}',\n", " '{\"name\": \"Alice\", \"value\": 3}',\n", " '{\"name\": \"Alice\", \"value\": 4}',\n", " '{\"name\": \"Charlie\", \"value\": 5}',\n", " '{\"name\": \"Bob\", \"value\": 6}',\n", " '{\"name\": \"Alice\", \"value\": 7}']" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'name': 'Alice', 'value': 1},\n", " {'name': 'Bob', 'value': 2},\n", " {'name': 'Alice', 'value': 3},\n", " {'name': 'Alice', 'value': 4},\n", " {'name': 'Charlie', 'value': 5},\n", " {'name': 'Bob', 'value': 6},\n", " {'name': 'Alice', 'value': 7}]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "seq = list(map(json.loads, data))\n", "seq" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3, 4, 5, 6, 7]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import toolz\n", "seq = list(toolz.pluck('value', seq))\n", "seq" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "28" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(seq)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Infinite Sequences\n", "==========" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def infinite_data():\n", " for x in data:\n", " yield x\n", " \n", " # Here we stop, but we could keep going forever...\n", " raise StopIteration" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from operator import add\n", "seq = infinite_data()\n", "seq = map(json.loads, seq)\n", "seq = toolz.pluck('value', seq)\n", "seq = toolz.accumulate(add, seq)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "3\n", "6\n", "10\n", "15\n", "21\n", "28\n" ] } ], "source": [ "for item in seq:\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Branching Sequences\n", "\n", "Sometimes we want to do multiple things with an infinite sequence. This is where the Python iterator abstraction starts to feel awkward." 
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import itertools\n", "import logging\n", "from collections import deque\n", "\n", "seq = infinite_data()\n", "seq = map(json.loads, data)\n", "\n", "seq1, seq2 = itertools.tee(seq, 2)\n", "\n", "seq1 = toolz.pluck('value', seq1) # what we did before\n", "seq1 = toolz.accumulate(add, seq1)\n", "\n", "last_three = deque(maxlen=3)\n", "seq2 = map(last_three.append, seq2)\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n", "deque([{'name': 'Alice', 'value': 1}], maxlen=3)\n", "3\n", "deque([{'name': 'Alice', 'value': 1}, {'name': 'Bob', 'value': 2}], maxlen=3)\n", "6\n", "deque([{'name': 'Alice', 'value': 1}, {'name': 'Bob', 'value': 2}, {'name': 'Alice', 'value': 3}], maxlen=3)\n", "10\n", "deque([{'name': 'Bob', 'value': 2}, {'name': 'Alice', 'value': 3}, {'name': 'Alice', 'value': 4}], maxlen=3)\n", "15\n", "deque([{'name': 'Alice', 'value': 3}, {'name': 'Alice', 'value': 4}, {'name': 'Charlie', 'value': 5}], maxlen=3)\n", "21\n", "deque([{'name': 'Alice', 'value': 4}, {'name': 'Charlie', 'value': 5}, {'name': 'Bob', 'value': 6}], maxlen=3)\n", "28\n", "deque([{'name': 'Charlie', 'value': 5}, {'name': 'Bob', 'value': 6}, {'name': 'Alice', 'value': 7}], maxlen=3)\n" ] } ], "source": [ "while True:\n", " try:\n", " item = next(seq1)\n", " print(item)\n", " \n", " next(seq2)\n", " print(last_three)\n", " \n", " except StopIteration:\n", " break" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Also want\n", "\n", "- Handle multiple incoming streams with joins\n", "- Perform time-window operations like \"group by 50 ms\" or \"slow down input stream, I'm swamped\"\n", "- ..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Streamz\n", "=====\n", "\n", "Same applications, just a different way of thinking about controlling data." 
] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from streamz import Stream" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "L = []" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Simple linear stream\n", "source = Stream()\n", "stream = (source.map(json.loads)\n", " .map(lambda x: x['value'])\n", " .scan(add))\n", "\n", "# Two actions whenever a value comes through\n", "stream.sink(print)\n", "stream.sink(L.append)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3\n", "6\n", "10\n", "15\n", "21\n", "28\n" ] } ], "source": [ "for line in data:\n", " source.emit(line)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[3, 6, 10, 15, 21, 28]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "L" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "128\n" ] } ], "source": [ "source.emit('{\"name\": \"Charlie\", \"value\": 100}');" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[3, 6, 10, 15, 21, 28, 128]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "L" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Easy to add on new components" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stream.sliding_window(2).sink(print)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "129\n", "131\n", "(129, 131)\n", "134\n", "(131, 134)\n", "138\n", "(134, 138)\n", "143\n", "(138, 143)\n", "149\n", "(143, 149)\n", "156\n", "(149, 156)\n" ] } ], "source": [ "for line in data:\n", " source.emit(line)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 } streamz-0.6.4/examples/fibonacci.ipynb0000644000175000017500000000236414270277270017347 0ustar nileshnilesh{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# A Stream of Fibonacci numbers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from streamz import Stream\n", "source = Stream()\n", "\n", "s = source.sliding_window(2).map(sum)\n", "s.rate_limit(0.5).sink(source.emit)\n", "\n", "L = s.sink_to_list()\n", "s.sink(print)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "source.emit(0)\n", "source.emit(1)" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "L" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 2 } streamz-0.6.4/examples/fib_tornado.py0000644000175000017500000000072014270277270017221 0ustar nileshnileshfrom streamz import Stream from tornado.ioloop import IOLoop source = Stream(asynchronous=True) s = source.sliding_window(2).map(sum) L = s.sink_to_list() # store result in a list s.rate_limit('500ms').sink(source.emit) # pipe output back to input s.rate_limit('1s').sink(lambda x: print(L)) # print state of L every second source.emit(0) # seed with initial values source.emit(1) IOLoop.current().start() streamz-0.6.4/examples/fib_thread.py0000644000175000017500000000064114270277270017024 0ustar nileshnileshfrom streamz import Stream from tornado.ioloop import IOLoop source = Stream() s = source.sliding_window(2).map(sum) L = s.sink_to_list() # store result in a list s.rate_limit('500ms').sink(source.emit) # pipe output back to input s.rate_limit('1s').sink(lambda x: print(L)) # print state of L every second source.emit(0) # seed with initial values source.emit(1) streamz-0.6.4/examples/fib_asyncio.py0000644000175000017500000000124214270277270017220 0ustar nileshnileshfrom streamz import Stream import asyncio from tornado.platform.asyncio import AsyncIOMainLoop AsyncIOMainLoop().install() source = Stream() s = source.sliding_window(2).map(sum) L = s.sink_to_list() # store result in a list s.rate_limit(0.5).sink(source.emit) # pipe output back to input s.rate_limit(1.0).sink(lambda x: print(L)) # print state of L every second source.emit(0) # seed with initial values source.emit(1) def run_asyncio_loop(): loop = asyncio.get_event_loop() try: loop.run_forever() except KeyboardInterrupt: pass finally: loop.close() run_asyncio_loop() streamz-0.6.4/examples/dataframes.ipynb0000644000175000017500000000277514270277270017547 0ustar nileshnilesh{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from streamz.dataframe import Random, DataFrame\n", "# this example requires hvplot\n", "import hvplot.streamz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "source = Random(freq='5ms', interval='100ms')\n", "source.x.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sdf = (source - 0.5).cumsum()\n", "sdf.tail()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "p = (DataFrame({'raw': sdf.x,\n", " 'smooth': sdf.x.rolling('100ms').mean(),\n", " 'very-smooth': sdf.x.rolling('500ms').mean()})\n", " .hvplot(width=700)\n", " )\n", "p" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 } 
streamz-0.6.4/docs/0000755000175000017500000000000014270277270013474 5ustar nileshnileshstreamz-0.6.4/docs/source/0000755000175000017500000000000014270277270014774 5ustar nileshnileshstreamz-0.6.4/docs/source/plugins.rst0000644000175000017500000000615314270277270017214 0ustar nileshnileshPlugins ======= In addition to using ``@Stream.register_api()`` decorator, custom stream nodes can be added to Streamz by installing 3rd-party Python packages. Known plugins ------------- Extras ++++++ These plugins are supported by the Streamz community and can be installed as extras, e.g. ``pip install streamz[kafka]``. There are no plugins here yet, but hopefully soon there will be. .. only:: comment ================= ====================================================== Extra name Description ================= ====================================================== ``files`` Advanced filesystem operations: listening for new files in a directory, writing to multiple files etc. ``kafka`` Reading from and writing to Kafka topics. ================= ====================================================== Entry points ------------ Plugins register themselves with Streamz by using ``entry_points`` argument in ``setup.py``: .. code-block:: Python # setup.py from setuptools import setup setup( name="streamz_example_plugin", version="0.0.1", entry_points={ "streamz.nodes": [ "repeat = streamz_example_plugin:RepeatNode" ] } ) In this example, ``RepeatNode`` class will be imported from ``streamz_example_plugin`` package and will be available as ``Stream.repeat``. In practice, class name and entry point name (the part before ``=`` in entry point definition) are usually the same, but they `can` be different. Different kinds of add-ons go into different entry point groups: =========== ======================= ===================== Node type Required parent class Entry point group =========== ======================= ===================== Source ``streamz.Source`` ``streamz.sources`` Node ``streamz.Stream`` ``streamz.nodes`` Sink ``streamz.Sink`` ``streamz.sinks`` =========== ======================= ===================== Lazy loading ++++++++++++ Streamz will attach methods from existing plugins to the ``Stream`` class when it's imported, but actual classes will be loaded only when the corresponding ``Stream`` method is first called. Streamz will also validate the loaded class before attaching it and will raise an appropriate exception if validation fails. Reference implementation ------------------------ Let's look at how stream nodes can be implemented. .. code-block:: Python # __init__.py from tornado import gen from streamz import Stream class RepeatNode(Stream): def __init__(self, upstream, n, **kwargs): super().__init__(upstream, ensure_io_loop=True, **kwargs) self._n = n @gen.coroutine def update(self, x, who=None, metadata=None): for _ in range(self._n): yield self._emit(x, metadata=metadata) As you can see, implementation is the same as usual, but there's no ``@Stream.register_api()`` — Streamz will take care of that when loading the plugin. It will still work if you add the decorator, but you don't have to. 
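Once a plugin package like the hypothetical ``streamz_example_plugin`` above is
installed, the registered node can be used like any built-in node. The snippet
below is only an illustration of that wiring, reusing the example names from
this page:

.. code-block:: Python

    from streamz import Stream

    source = Stream()
    # ``repeat`` is attached lazily from the ``streamz.nodes`` entry point
    source.repeat(3).sink(print)

    source.emit("hello")  # expected to print "hello" three times

Because loading is lazy, the plugin class itself is only imported the first
time ``source.repeat(...)`` is called.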
streamz-0.6.4/docs/source/plotting.rst0000644000175000017500000000376014270277270017374 0ustar nileshnileshVisualizing streamz =================== A variety of tools are available to help you understand, debug, visualize your streaming objects: - Most Streamz objects automatically display themselves in Jupyter notebooks, periodically updating their visual representation as text or tables by registering events with the Tornado IOLoop used by Jupyter - The network graph underlying a stream can be visualized using `dot` to render a PNG using `Stream.visualize(filename)` - Streaming data can be visualized using the optional separate packages hvPlot, HoloViews, and Panel (see below) hvplot.streamz -------------- hvPlot is a separate plotting library providing Bokeh-based plots for Pandas dataframes and a variety of other object types, including streamz DataFrame and Series objects. See `hvplot.holoviz.org `_ for instructions on how to install hvplot. Once it is installed, you can use the Pandas .plot() API to get a dynamically updating plot in Jupyter or in Bokeh/Panel Server: .. code-block:: python import hvplot.streamz from streamz.dataframe import Random df = Random() df.hvplot(backlog=100) See the `streaming section `_ of the hvPlot user guide for more details, and the `dataframes.ipynb` example that comes with streamz for a simple runnable example. HoloViews --------- hvPlot is built on HoloViews, and you can also use HoloViews directly if you want more control over events and how they are processed. See the `HoloViews user guide `_ for more details. Panel ----- Panel is a general purpose dashboard and app framework, supporting a wide variety of displayable objects as "Panes". Panel provides a `streamz Pane `_ for rendering arbitrary streamz objects, and streamz DataFrames are handled by the Panel `DataFrame Pane `_. streamz-0.6.4/docs/source/index.rst0000644000175000017500000000703414270277270016641 0ustar nileshnileshStreamz ======= Streamz helps you build pipelines to manage continuous streams of data. It is simple to use in simple cases, but also supports complex pipelines that involve branching, joining, flow control, feedback, back pressure, and so on. Optionally, Streamz can also work with both `Pandas `_ and `cuDF `_ dataframes, to provide sensible streaming operations on continuous tabular data. To learn more about how to use streams, visit :doc:`Core documentation `. Motivation ---------- Continuous data streams arise in many applications like the following: 1. Log processing from web servers 2. Scientific instrument data like telemetry or image processing pipelines 3. Financial time series 4. Machine learning pipelines for real-time and on-line learning 5. ... Sometimes these pipelines are very simple, with a linear sequence of processing steps: .. image:: images/simple.svg :alt: a simple streamz pipeline And sometimes these pipelines are more complex, involving branching, look-back periods, feedback into earlier stages, and more. .. image:: images/complex.svg :alt: a more complex streamz pipeline Streamz endeavors to be simple in simple cases, while also being powerful enough to let you define custom and powerful pipelines for your application. Why not Python generator expressions? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Python users often manage continuous sequences of data with iterators or generator expressions. .. 
code-block:: python def fib(): a, b = 0, 1 while True: yield a a, b = b, a + b sequence = (f(n) for n in fib()) However iterators become challenging when you want to fork them or control the flow of data. Typically people rely on tools like ``itertools.tee``, and ``zip``. .. code-block:: python x1, x2 = itertools.tee(x, 2) y1 = map(f, x1) y2 = map(g, x2) However this quickly become cumbersome, especially when building complex pipelines. Installation ------------ To install either use: - conda-forge: ``conda install streamz -c conda-forge`` - pip: ``pip install streamz`` - dev: ``git clone https://github.com/python-streamz/streamz`` followed by ``pip install -e streamz/`` Quickstart ---------- The streamz project offers a Docker image for the convenience of quickly trying out streamz and its features. The purpose of the Dockerfile at this time is not to be used in a production environment but rather for experimentation, learning, or new feature development. Its most common use would be to interact with the streamz example jupyter notebooks. Lets walk through the steps needed for this. - Build the Docker container .. code-block:: bash $ docker/build.sh - Run the Docker container .. code-block:: bash $ docker/run.sh - Interact with Jupyter Lab on the container in your browser at `http://localhost:8888/ `_. Related Work ------------ Streamz is similar to reactive programming systems like `RxPY `_ or big data streaming systems like `Apache Flink `_, `Apache Beam `_ or `Apache Spark Streaming `_. .. toctree:: :maxdepth: 2 :hidden: :caption: Contents core.rst dataframes.rst gpu-dataframes.rst dask.rst collections.rst api.rst collections-api.rst async.rst plotting.rst plugins.rst streamz-0.6.4/docs/source/images/0000755000175000017500000000000014270277270016241 5ustar nileshnileshstreamz-0.6.4/docs/source/images/simple.svg0000644000175000017500000001177514270277270020266 0ustar nileshnilesh %3 map; process map; process Sink; save Sink; save map; process->Sink; save filter; select filter; select filter; select->map; process map; parse map; parse map; parse->filter; select TextFile TextFile TextFile->map; parse streamz-0.6.4/docs/source/images/inc-dec-print.svg0000644000175000017500000000766014270277270021427 0ustar nileshnilesh %3 map; decrement map; decrement Sink; print Sink; print map; decrement->Sink; print Stream Stream Stream->map; decrement map; increment map; increment Stream->map; increment <Sink; print>-1 Sink; print map; increment-><Sink; print>-1 streamz-0.6.4/docs/source/images/inc-dec-add-print.svg0000644000175000017500000001114514270277270022146 0ustar nileshnilesh %3 map; sum map; sum Sink; print Sink; print map; sum->Sink; print zip zip zip->map; sum map; increment map; increment map; increment->zip map; decrement map; decrement map; decrement->zip Stream Stream Stream->map; increment Stream->map; decrement streamz-0.6.4/docs/source/images/cyclic.svg0000644000175000017500000001250414270277270020232 0ustar nileshnilesh %3 sink; print sink; print flatten flatten Stream Stream flatten -> Stream unique unique unique -> sink; print 0 map; get map; get unique -> map; get 1 map; lambda map; lambda map; get -> map; lambda map; get_list_of_links map; get_list_of_links map; get_list_of_links -> flatten Stream -> unique map; lambda -> map; get_list_of_links streamz-0.6.4/docs/source/images/complex.svg0000644000175000017500000003217514270277270020441 0ustar nileshnilesh %3 map; parse map; parse sliding_window; 100 sliding_window; 100 map; parse->sliding_window; 100 sliding_window; 10 
sliding_window; 10 map; parse->sliding_window; 10 accumulate; count accumulate; count map; parse->accumulate; count map; process map; process sliding_window; 100->map; process delay; 5 delay; 5 sliding_window; 10->delay; 5 Sink; report Sink; report accumulate; count->Sink; report map; train map; train map; process->map; train zip zip map; train->zip <Sink; save>-1 Sink; save map; train-><Sink; save>-1 map; predict map; predict zip->map; predict Sink; save Sink; save map; predict->Sink; save <map; process>-1 map; process <map; process>-1->zip delay; 5-><map; process>-1 TextFile TextFile TextFile->map; parse streamz-0.6.4/docs/source/gpu-dataframes.rst0000644000175000017500000000307314270277270020431 0ustar nileshnileshStreaming GPU DataFrames (cudf) ------------------------------- The ``streamz.dataframe`` module provides a DataFrame-like interface on streaming data as described in the ``dataframes`` documentation. It provides support for dataframe-like libraries such as pandas and cudf. This documentation is specific to streaming GPU dataframes using cudf. The example in the ``dataframes`` documentation is rewritten below using cudf dataframes just by replacing the ``pandas`` module with ``cudf``: .. code-block:: python import cudf from streamz.dataframe import DataFrame example = cudf.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) sdf[sdf.name == 'Alice'].amount.sum() Supported Operations -------------------- Streaming cudf dataframes support the following classes of operations: - Elementwise operations like ``df.x + 1`` - Filtering like ``df[df.name == 'Alice']`` - Column addition like ``df['z'] = df.x + df.y`` - Reductions like ``df.amount.mean()`` - Windowed aggregations (fixed length) like ``df.window(n=100).amount.sum()`` The following operations are not yet supported with cudf (as of version 0.8): - Groupby-aggregations like ``df.groupby(df.name).amount.mean()`` - Windowed aggregations (index valued) like ``df.window(value='2h').amount.sum()`` - Windowed groupby aggregations like ``df.window(value='2h').groupby('name').amount.sum()`` Window-based Aggregations with cudf are supported just as explained in the ``dataframes`` documentation. Support for groupby operations is expected to be added in the future. streamz-0.6.4/docs/source/dataframes.rst0000644000175000017500000002063214270277270017640 0ustar nileshnileshDataFrames ========== When handling large volumes of streaming tabular data it is often more efficient to pass around larger Pandas dataframes with many rows each rather than pass around individual Python tuples or dicts. Handling and computing on data with Pandas can be much faster than operating on individual Python objects. So one could imagine building streaming dataframe pipelines using the ``.map`` and ``.accumulate`` streaming operators with functions that consume and produce Pandas dataframes as in the following example: .. code-block:: python from streamz import Stream def query(df): return df[df.name == 'Alice'] def aggregate(acc, df): return acc + df.amount.sum() stream = Stream() stream.map(query).accumulate(aggregate, start=0) This is fine, and straightforward to do if you understand ``streamz.core`` , Pandas, and have some skill with developing algorithms. Streaming Dataframes -------------------- The ``streamz.dataframe`` module provides a streaming dataframe object that implements many of these algorithms for you. It provides a Pandas-like interface on streaming data. 
Our example above is rewritten below using streaming dataframes: .. code-block:: python import pandas as pd from streamz.dataframe import DataFrame example = pd.DataFrame({'name': [], 'amount': []}) sdf = DataFrame(stream, example=example) sdf[sdf.name == 'Alice'].amount.sum() The two examples are identical in terms of performance and execution. The resulting streaming dataframe contains a ``.stream`` attribute which is equivalent to the ``stream`` produced in the first example. Streaming dataframes are only syntactic sugar on core streams. Supported Operations -------------------- Streaming dataframes support the following classes of operations - Elementwise operations like ``df.x + 1`` - Filtering like ``df[df.name == 'Alice']`` - Column addition like ``df['z'] = df.x + df.y`` - Reductions like ``df.amount.mean()`` - Groupby-aggregations like ``df.groupby(df.name).amount.mean()`` - Windowed aggregations (fixed length) like ``df.window(n=100).amount.sum()`` - Windowed aggregations (index valued) like ``df.window(value='2h').amount.sum()`` - Windowed groupby aggregations like ``df.window(value='2h').groupby('name').amount.sum()`` DataFrame Aggregations ---------------------- Dataframe aggregations are composed of an aggregation (like sum, mean, ...) and a windowing scheme (fixed sized windows, index-valued, all time, ...) Aggregations ++++++++++++ Streaming Dataframe aggregations are built from three methods - ``initial``: Creates initial state given an empty example dataframe - ``on_new``: Updates state and produces new result to emit given new data - ``on_old``: Updates state and produces new result to emit given decayed data So a simple implementation of a streaming ``mean`` as an aggregation might look like the following: .. code-block:: python from streamz.dataframe import Aggregation class Mean(Aggregation): def initial(self, new): state = new.iloc[:0].sum(), new.iloc[:0].count() return state def on_new(self, state, new): total, count = state total = total + new.sum() count = count + new.count() new_state = (total, count) new_value = total / count return new_state, new_value def on_old(self, state, old): total, count = state total = total - old.sum() # switch + for - here count = count - old.count() # switch + for - here new_state = (total, count) new_value = total / count return new_state, new_value These aggregations can then be used in a variety of different windowing schemes with the ``aggregate`` method, whose job it is to deliver new and old data to your aggregation for processing: .. code-block:: python df.aggregate(Mean()) df.window(n=100).aggregate(Mean()) df.window(value='60s').aggregate(Mean()) Windowing Schemes +++++++++++++++++ Different windowing schemes like fixed sized windows (last 100 elements) or value-indexed windows (last two hours of data) will track newly arrived and decaying data and call these methods accordingly. The mechanism to track data arriving and leaving is kept orthogonal from the aggregations themselves. These windowing schemes include the following: 1. All previous data. Only ``initial`` and ``on_new`` are called, ``on_old`` is never called. .. code-block:: python >>> df.sum() 2. The previous ``n`` elements .. code-block:: python >>> df.window(n=100).sum() 3. An index range, like a time range for a datetime index .. code-block:: python >>> df.window(value='2h').sum() Although this can be done for any range on any type of index, time is just a common case. Windowing schemes generally maintain a deque of historical values within accumulated state.
As new data comes in they inspect that state and eject data that no longer falls within the window. Grouping ++++++++ Groupby aggregations also maintain historical data on the grouper and perform a parallel aggregation on the number of times any key has been seen, removing that key once it is no longer present. Dask ---- In all cases, dataframe operations are only implemented with the ``.map`` and ``.accumulate`` operators, and so are equally compatible with core ``Stream`` and ``DaskStream`` objects. Not Yet Supported ----------------- Streaming dataframe algorithms do not currently pay special attention to data arriving out-of-order. PeriodicDataFrame ----------------- As you have seen above, Streamz can handle arbitrarily complex pipelines, events, and topologies, but what if you simply want to run some Python function periodically and collect or plot the results? streamz provides a high-level convenience class for this purpose, called a PeriodicDataFrame. A PeriodicDataFrame uses Python's asyncio event loop (used as part of Tornado in Jupyter and other interactive frameworks) to call a user-provided function at a regular interval, collecting the results and making them available for later processing. In the simplest case, you can use a PeriodicDataFrame by first writing a callback function like: .. code-block:: python import numpy as np def random_datapoint(**kwargs): return pd.DataFrame({'a': np.random.random(1)}, index=[pd.Timestamp.now()]) You can then make a streaming dataframe to poll this function e.g. every 300 milliseconds: .. code-block:: python df = PeriodicDataFrame(random_datapoint, interval='300ms') ``df`` will now be a steady stream of whatever values are returned by the `datafn`, which can of course be any Python code as long as it returns a DataFrame. Here we returned only a single point, appropriate for streaming the results of system calls or other isolated actions, but any number of entries can be returned by the dataframe in a single batch. To facilitate collecting such batches, the callback is invoked with keyword arguments ``last`` (the time of the previous invocation) and ``now`` (the time of the current invocation) as Pandas Timestamp objects. The callback can then generate or query for just the values in that time range. Arbitrary keyword arguments can be provided to the PeriodicDataFrame constructor, which will be passed into the callback so that its behavior can be parameterized. For instance, you can write a callback to return a suitable number of datapoints to keep a regularly updating stream, generated randomly as a batch since the last call: .. code-block:: python def datablock(last, now, **kwargs): freq = kwargs.get("freq", pd.Timedelta("50ms")) index = pd.date_range(start=last + freq, end=now, freq=freq) return pd.DataFrame({'x': np.random.random(len(index))}, index=index) df = PeriodicDataFrame(datablock, interval='300ms') The callback will now be invoked every 300ms, each time generating datapoints at a rate of 1 every 50ms, returned as a batch. If you wished, you could override the 50ms value by passing `freq=pd.Timedelta("100ms")` to the PeriodicDataFrame constructor. Similar code could e.g. query an external database for the time range since the last update, returning all datapoints since then. Once you have a PeriodicDataFrame defined using such callbacks, you can then use all the rest of the functionality supported by streamz, including aggregations, rolling windows, etc., and streaming `visualization. 
`_ streamz-0.6.4/docs/source/dask.rst0000644000175000017500000000732014270277270016452 0ustar nileshnileshDask Integration ================ The ``streamz.dask`` module contains a Dask_-powered implementation of the core Stream object. This is a drop-in implementation, but uses Dask for execution and so can scale to a multicore machine or a distributed cluster. Quickstart ---------- .. currentmodule:: streamz Installation ++++++++++++ First install dask and dask.distributed:: conda install dask or pip install dask[complete] --upgrade You may also want to install Bokeh for web diagnostics:: conda install -c bokeh bokeh or pip install bokeh --upgrade Start Local Dask Client +++++++++++++++++++++++ Then start a local Dask cluster .. code-block:: python from dask.distributed import Client client = Client() This operates on local processes or threads. If you have Bokeh installed then this will also start a diagnostics web server at http://localhost:8787/status which you may want to open to get a real-time view of execution. Sequential Execution ++++++++++++++++++++ .. autosummary:: Stream.emit map sink Before we build a parallel stream, let's build a sequential stream that maps a simple function across data, and then prints those results. We use the core ``Stream`` object. .. code-block:: python from time import sleep def inc(x): sleep(1) # simulate actual work return x + 1 from streamz import Stream source = Stream() source.map(inc).sink(print) for i in range(10): source.emit(i) This should take ten seconds because we call the ``inc`` function ten times sequentially. Parallel Execution ++++++++++++++++++ .. currentmodule:: streamz .. autosummary:: scatter buffer .. currentmodule:: streamz.dask .. autosummary:: gather That example ran sequentially under normal execution; now we use ``.scatter()`` to convert our stream into a DaskStream and ``.gather()`` to convert back. .. code-block:: python source = Stream() source.scatter().map(inc).buffer(8).gather().sink(print) for i in range(10): source.emit(i) You may want to look at http://localhost:8787/status during execution to get a sense of the parallel execution. This should have run much more quickly depending on how many cores you have on your machine. We added a few extra nodes to our stream; let's look at what they did. - ``scatter``: Converted our Stream into a DaskStream. The elements that we emitted into our source were sent to the Dask client, and the subsequent ``map`` call used that client's cores to perform the computations. - ``gather``: Converted our DaskStream back into a Stream, pulling data on our Dask client back to our local stream. - ``buffer(8)``: Normally gather would exert back pressure so that the source would not accept new data until results finished and were pulled back to the local stream. This back-pressure would limit parallelism. To counteract this we add a buffer of size eight to allow eight unfinished futures to build up in the pipeline before we start to apply back-pressure to ``source.emit``. .. _Dask: https://dask.pydata.org/en/latest/ Gotchas +++++++ An important gotcha with ``DaskStream`` is that it is a subclass of ``Stream``, and so can be used as an input to any function expecting a ``Stream``. If there is no intervening ``.gather()``, then the downstream node will receive Dask futures instead of the data they represent:: source = Stream() source2 = Stream() a = source.scatter().map(inc) b = source2.combine_latest(a) In this case, the combine operation will get real values from ``source2``, and Dask futures.
Downstream nodes would be free to operate on the futures, but more likely, the line should be:: b = source2.combine_latest(a.gather()) streamz-0.6.4/docs/source/core.rst0000644000175000017500000003046614270277270016467 0ustar nileshnileshCore Streams ============ This document takes you through how to build basic streams and push data through them. We start with map and accumulate, talk about emitting data, then discuss flow control and finally back pressure. Examples are used throughout. Map, emit, and sink ------------------- .. currentmodule:: streamz .. autosummary:: Stream.emit map sink You can create a basic pipeline by instantiating the ``Streamz`` object and then using methods like ``map``, ``accumulate``, and ``sink``. .. code-block:: python from streamz import Stream def increment(x): return x + 1 source = Stream() source.map(increment).sink(print) The ``map`` and ``sink`` methods both take a function and apply that function to every element in the stream. The ``map`` method returns a new stream with the modified elements while ``sink`` is typically used at the end of a stream for final actions. To push data through our pipeline we call ``emit`` .. code-block:: python >>> source.emit(1) 2 >>> source.emit(2) 3 >>> source.emit(10) 11 As we can see, whenever we push data in at the source, our pipeline calls ``increment`` on that data, and then calls ``print`` on that data, resulting in incremented results being printed to the screen. Often we call ``emit`` from some other continuous process, like reading lines from a file .. code-block:: python import json data = [] source = Stream() source.map(json.loads).sink(data.append) for line in open('myfile.json'): source.emit(line) Accumulating State ------------------ .. autosummary:: accumulate Map and sink both pass data directly through a stream. One piece of data comes in, either one or zero pieces go out. Accumulate allows you to track some state within the pipeline. It takes an accumulation function that takes the previous state, the new element, and then returns a new state and a new element to emit. In the following example we make an accumulator that keeps a running total of the elements seen so far. .. code-block:: python def add(x, y): return x + y source = Stream() source.accumulate(add).sink(print) .. code-block:: python >>> source.emit(1) 1 >>> source.emit(2) 3 >>> source.emit(3) 6 >>> source.emit(4) 10 The accumulation function above is particularly simple, the state that we store and the value that we emit are the same. In more complex situations we might want to keep around different state than we emit. For example lets count the number of distinct elements that we have seen so far. .. code-block:: python def num_distinct(state, new): state.add(new) return state, len(state) source = Stream() source.accumulate(num_distinct, returns_state=True, start=set()).sink(print) >>> source.emit('cat') 1 >>> source.emit('dog') 2 >>> source.emit('cat') 2 >>> source.emit('mouse') 3 Accumulators allow us to build many interesting operations. Flow Control ------------ .. autosummary:: buffer flatten partition sliding_window union unique You can batch and slice streams into streams of batches in various ways with operations like ``partition``, ``buffer``, and ``sliding_window`` .. 
code-block:: python source = Stream() source.sliding_window(3, return_partial=False).sink(print) >>> source.emit(1) >>> source.emit(2) >>> source.emit(3) (1, 2, 3) >>> source.emit(4) (2, 3, 4) >>> source.emit(5) (3, 4, 5) Branching and Joining --------------------- .. autosummary:: combine_latest zip zip_latest You can branch multiple streams off of a single stream. Elements that go into the input will pass through to both output streams. Note: ``graphviz`` and ``networkx`` need to be installed to visualize the stream graph. .. code-block:: python def increment(x): return x + 1 def decrement(x): return x - 1 source = Stream() a = source.map(increment).sink(print) b = source.map(decrement).sink(print) b.visualize(rankdir='LR') .. image:: images/inc-dec-print.svg :alt: a branching stream .. code-block:: python >>> source.emit(1) 0 2 >>> source.emit(10) 9 11 Similarly you can also combine multiple streams together with operations like ``zip``, which emits once both streams have provided a new element, or ``combine_latest`` which emits when either stream has provided a new element. .. code-block:: python source = Stream() a = source.map(increment) b = source.map(decrement) c = a.zip(b).map(sum).sink(print) >>> source.emit(10) 20 # 9 + 11 .. image:: images/inc-dec-add-print.svg :alt: a branching and zipped stream This branching and combining is where Python iterators break down, and projects like ``streamz`` start becoming valuable. Processing Time and Back Pressure --------------------------------- .. autosummary:: delay rate_limit timed_window Time-based flow control depends on having an active `Tornado `_ event loop. Tornado is active by default within a Jupyter notebook, but otherwise you will need to learn at least a little about asynchronous programming in Python to use these features. Learning async programming is not mandatory, the rest of the project will work fine without Tornado. You can control the flow of data through your stream over time. For example you may want to batch all elements that have arrived in the last minute, or slow down the flow of data through sensitive parts of the pipeline, particularly when they may be writing to slow resources like databases. Streamz helps you do these operations both with operations like ``delay``, ``rate_limit``, and ``timed_window``, and also by passing `Tornado `_ futures back through the pipeline. As data moves forward through the pipeline, futures that signal work completed move backwards. In this way you can reliably avoid buildup of data in slower parts of your pipeline. Lets consider the following example that reads JSON data from a file and inserts it into a database using an async-aware insertion function. .. code-block:: python async def write_to_database(...): ... # build pipeline source = Source() source.map(json.loads).sink(write_to_database) async def process_file(fn): with open(fn) as f: for line in f: await source.emit(line) # wait for pipeline to clear As we call the ``write_to_database`` function on our parsed JSON data it produces a future for us to signal that the writing process has finished. Streamz will ensure that this future is passed all the way back to the ``source.emit`` call, so that user code at the start of our pipeline can await on it. This allows us to avoid buildup even in very large and complex streams. We always pass futures back to ensure responsiveness. 
But wait, maybe we don't mind having a few messages in memory at once, this will help steady the flow of data so that we can continue to work even if our sources or sinks become less productive for brief periods. We might add a ``buffer`` just before writing to the database. .. code-block:: python source.map(json.loads).buffer(100).sink(write_to_database) And if we are pulling from an API with known limits then we might want to introduce artificial rate limits at 10ms. .. code-block:: python source.rate_limit(0.010).map(json.loads).buffer(100).sink(write_to_database) Operations like these (and more) allow us to shape the flow of data through our pipelines. Modifying and Cleaning up Streams --------------------------------- When you call ``Stream`` you create a stream. When you call any method on a ``Stream``, like ``Stream.map``, you also create a stream. All operations can be chained together. Additionally, as discussed in the section on Branching, you can split multiple streams off of any point. Streams will pass their outputs on to all downstream streams so that anyone can hook in at any point, and get a full view of what that stream is producing. If you delete a part of a stream then it will stop getting data. Streamz follows normal Python garbage collection semantics so once all references to a stream have been lost those operations will no longer occur. The one counter example to this is ``sink``, which is intended to be used with side effects and will stick around even without a reference. .. note:: Sink streams store themselves in ``streamz.sinks._global_sinks``. You can remove them permanently by clearing that collection. .. code-block:: python >>> source.map(print) # this doesn't do anything >>> source.sink(print) # this stays active even without a reference >>> s = source.map(print) # this works too because we have a handle to s Recursion and Feedback ---------------------- By connecting sources to sinks you can create feedback loops. As an example, here is a tiny web crawler: .. code-block:: python from streamz import Stream source = Stream() pages = source.unique() pages.sink(print) content = pages.map(requests.get).map(lambda x: x.content) links = content.map(get_list_of_links).flatten() links.connect(source) # pipe new links back into pages >>> source.emit('http://github.com') http://github.com http://github.com/features http://github.com/business http://github.com/explore http://github.com/pricing ... .. image:: images/cyclic.svg :alt: the graph of the cyclic web crawler .. note:: Execution order is important here, as if the print was ordered after the ``map; get`` node then the print would never run. Performance ----------- Streamz adds microsecond overhead to normal Python operations. .. code-block:: python from streamz import Stream source = Stream() def inc(x): return x + 1 source.sink(inc) In [5]: %timeit source.emit(1) 100000 loops, best of 3: 3.19 µs per loop In [6]: %timeit inc(1) 10000000 loops, best of 3: 91.5 ns per loop You may want to avoid pushing millions of individual elements per second through a stream. However, you can avoid performance issues by collecting lots of data into single elements, for example by pushing through Pandas dataframes instead of individual integers and strings. This will be faster regardless, just because projects like NumPy and Pandas can be much faster than Python generally. In the following example we pass filenames through a stream, convert them to Pandas dataframes, and then map pandas-level functions on those dataframes. 
For operations like this Streamz adds virtually no overhead. .. code-block:: python source = Stream() s = source.map(pd.read_csv).map(lambda df: df.value.sum()).accumulate(add) for fn in glob('data/2017-*-*.csv'): source.emit(fn) Streams provides higher level APIs for situations just like this one. You may want to read further about :doc:`collections ` Metadata -------- Metadata can be emitted into the pipeline to accompany the data as a list of dictionaries. Most functions will pass the metadata to the downstream function without making any changes. However, functions that make the pipeline asynchronous require logic that dictates how and when the metadata will be passed downstream. Synchronous functions and asynchronous functions that have a 1:1 ratio of the number of values on the input to the number of values on the output will emit the metadata collection without any modification. However, functions that have multiple input streams or emit collections of data will emit the metadata associated with the emitted data as a collection. Reference Counting and Checkpointing ------------------------------------ Checkpointing is achieved in Streamz through the use of reference counting. With this method, a checkpoint can be saved when and only when data has progressed through all of the the pipeline without any issues. This prevents data loss and guarantees at-least-once semantics. Any node that caches or holds data after it returns increments the reference counter associated with the given data by one. When a node is no longer holding the data, it will release it by decrementing the counter by one. When the counter changes to zero, a callback associated with the data is triggered. References are passed in the metadata as a value of the `ref` keyword. Each metadata object contains only one reference counter object. streamz-0.6.4/docs/source/conf.py0000644000175000017500000001215014270277270016272 0ustar nileshnilesh#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # Streamz documentation build configuration file, created by # sphinx-quickstart on Wed Apr 12 18:41:31 2017. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', 'sphinx.ext.autosummary', 'sphinx.ext.extlinks', 'numpydoc'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = '.rst' # The master toctree document. master_doc = 'index' # General information about the project. 
project = 'Streamz' copyright = '2017-2020, Matthew Rocklin' author = 'Matthew Rocklin' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. import streamz version = streamz.__version__ # The full version, including alpha/beta/rc tags. release = streamz.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- # Taken from docs.readthedocs.io: # on_rtd is whether we are on readthedocs.io on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if not on_rtd: # only import and set the theme if we're building docs locally import sphinx_rtd_theme html_theme = 'sphinx_rtd_theme' html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # # html_theme = 'alabaster' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] # -- Options for HTMLHelp output ------------------------------------------ # Output file base name for HTML help builder. htmlhelp_basename = 'Streamzdoc' # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'Streamz.tex', 'Streamz Documentation', 'Matthew Rocklin', 'manual'), ] # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'streamz', 'Streamz Documentation', [author], 1) ] # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. 
List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'Streamz', 'Streamz Documentation', author, 'Streamz', 'Support for pipelines managing continuous streams of data.', 'Miscellaneous'), ] streamz-0.6.4/docs/source/collections.rst0000644000175000017500000000610414270277270020045 0ustar nileshnileshCollections =========== Streamz high-level collection APIs are built on top of ``streamz.core``, and bring special consideration to certain types of data: 1. ``streamz.batch``: supports streams of lists of Python objects like tuples or dictionaries 2. ``streamz.dataframe``: supports streams of Pandas/cudf dataframes or Pandas/cudf series. cudf support is in beta phase and has limited functionality as of cudf version ``0.8`` These high-level APIs help us handle common situations in data processing. They help us implement complex algorithms and also improve efficiency. These APIs are built on the streamz core operations (map, accumulate, buffer, timed_window, ...) which provide the building blocks to build complex pipelines but offer no help with what those functions should be. The higher-level APIs help to fill in this gap for common situations. Conversion ---------- .. currentmodule:: streamz.core .. autosummary:: Stream.to_batch Stream.to_dataframe You can convert from core Stream objects to Batch and DataFrame objects using the ``.to_batch`` and ``.to_dataframe`` methods. In each case we assume that the stream is a stream of batches (lists or tuples) or a stream of Pandas dataframes. .. code-block:: python >>> batch = stream.to_batch() >>> sdf = stream.to_dataframe() To convert back from a Batch or a DataFrame to a ``core.Stream`` you can access the ``.stream`` property. .. code-block:: python >>> stream = sdf.stream >>> stream = batch.stream Example ------- We create a stream and connect it to a file object .. code-block:: python file = ... # filename or file-like object from streamz import Stream source = Stream.from_textfile(file) Our file produces line-delimited JSON serialized data on which we want to call ``json.loads`` to parse into dictionaries. To reduce overhead we first batch our records up into 100-line batches and turn this into a Batch object. We provide our Batch object with an example element that it will use to help it determine metadata. .. code-block:: python example = [{'name': 'Alice', 'x': 1, 'y': 2}] lines = source.partition(100).to_batch(example=example) # batches of 100 elements records = lines.map(json.loads) # parse each line into a record (dict) We could have done the ``.map(json.loads)`` command on the original stream, but this way we reduce overhead by applying this function to lists of items, rather than one item at a time. Now we convert these batches of records into pandas dataframes and do some basic filtering and groupby-aggregations. .. code-block:: python sdf = records.to_dataframe() sdf = sdf[sdf.name == "Alice"] sdf = sdf.groupby(sdf.x).y.mean() The DataFrames satisfy a subset of the Pandas API, but now rather than operate on the data directly, they set up a pipeline to compute the data in an online fashion. Finally we convert this back to a stream and push the results into a fixed-size deque. .. code-block:: python from collections import deque d = deque(maxlen=10) sdf.stream.sink(d.append) See :doc:`Collections API ` for more information.
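For a quick, self-contained check of the batch machinery, the same pattern can also be driven directly from ``emit`` without a file. The following is only a sketch: the literal JSON strings and the partition size of 2 are illustrative, and the exact shape of the collected output may differ slightly in practice:

.. code-block:: python

    import json
    from streamz import Stream

    source = Stream()
    example = [{'name': 'Alice', 'x': 1, 'y': 2}]

    batches = source.partition(2).to_batch(example=example)   # batches of 2 lines
    xs = batches.map(json.loads).map(lambda d: d['x'])         # parse, then pick out 'x'
    collected = xs.stream.sink_to_list()                       # back to a core stream

    source.emit('{"name": "Alice", "x": 1, "y": 2}')
    source.emit('{"name": "Bob", "x": 5, "y": 10}')
    # collected should now hold one batch of plucked values, e.g. [[1, 5]]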
streamz-0.6.4/docs/source/collections-api.rst0000644000175000017500000000335714270277270020623 0ustar nileshnileshCollections API =============== Collections ----------- .. currentmodule:: streamz.collection .. autosummary:: Streaming Streaming.map_partitions Streaming.accumulate_partitions Streaming.verify Batch ----- .. currentmodule:: streamz.batch .. autosummary:: Batch Batch.filter Batch.map Batch.pluck Batch.to_dataframe Batch.to_stream Dataframes ---------- .. currentmodule:: streamz.dataframe .. autosummary:: DataFrame DataFrame.groupby DataFrame.rolling DataFrame.assign DataFrame.sum DataFrame.mean DataFrame.cumsum DataFrame.cumprod DataFrame.cummin DataFrame.cummax .. autosummary:: GroupBy GroupBy.count GroupBy.mean GroupBy.size GroupBy.std GroupBy.sum GroupBy.var .. autosummary:: Rolling Rolling.aggregate Rolling.count Rolling.max Rolling.mean Rolling.median Rolling.min Rolling.quantile Rolling.std Rolling.sum Rolling.var .. autosummary:: DataFrame.window Window.apply Window.count Window.groupby Window.sum Window.size Window.std Window.var .. autosummary:: Rolling.aggregate Rolling.count Rolling.max Rolling.mean Rolling.median Rolling.min Rolling.quantile Rolling.std Rolling.sum Rolling.var .. autosummary:: PeriodicDataFrame .. autosummary:: Random Details ------- .. currentmodule:: streamz.collection .. autoclass:: Streaming :members: .. currentmodule:: streamz.batch .. autoclass:: Batch :members: :inherited-members: .. currentmodule:: streamz.dataframe .. autoclass:: DataFrame :members: :inherited-members: .. autoclass:: Rolling :members: .. autoclass:: Window :members: .. autoclass:: GroupBy :members: .. autoclass:: Random streamz-0.6.4/docs/source/async.rst0000644000175000017500000001263514270277270016652 0ustar nileshnileshAsynchronous Computation ======================== *This section is only relevant if you want to use time-based functionality. If you are only using operations like map and accumulate then you can safely skip this section.* When using time-based flow control like ``rate_limit``, ``delay``, or ``timed_window`` Streamz relies on the Tornado_ framework for concurrency. This allows us to handle many concurrent operations cheaply and consistently within a single thread. However, this also adds complexity and requires some understanding of asynchronous programming. There are a few different ways to use Streamz with a Tornado event loop. We give a few examples below that all do the same thing, but with different styles. In each case we use the following toy functions: .. code-block:: python from tornado import gen import time def increment(x): """ A blocking increment function Simulates a computational function that was not designed to work asynchronously """ time.sleep(0.1) return x + 1 @gen.coroutine def write(x): """ A non-blocking write function Simulates writing to a database asynchronously """ yield gen.sleep(0.2) print(x) Within the Event Loop --------------------- You may have an application that runs strictly within an event loop. .. code-block:: python from streamz import Stream from tornado.ioloop import IOLoop @gen.coroutine def f(): source = Stream(asynchronous=True) # tell the stream we're working asynchronously source.map(increment).rate_limit(0.500).sink(write) for x in range(10): yield source.emit(x) IOLoop().run_sync(f) We call Stream with the ``asynchronous=True`` keyword, informing it that it should expect to operate within an event loop. This ensures that calls to ``emit`` return Tornado futures rather than block. 
We wait on results using ``yield``. .. code-block:: python yield source.emit(x) # waits until the pipeline is ready This would also work with async-await syntax in Python 3 .. code-block:: python from streamz import Stream from tornado.ioloop import IOLoop async def f(): source = Stream(asynchronous=True) # tell the stream we're working asynchronously source.map(increment).rate_limit(0.500).sink(write) for x in range(10): await source.emit(x) IOLoop().run_sync(f) Event Loop on a Separate Thread ------------------------------- Sometimes the event loop runs on a separate thread. This is common when you want to support interactive workloads (the user needs their own thread for interaction) or when using Dask (next section). .. code-block:: python from streamz import Stream source = Stream(asynchronous=False) # starts IOLoop in separate thread source.map(increment).rate_limit('500ms').sink(write) for x in range(10): source.emit(x) In this case we pass ``asynchronous=False`` to inform the stream that it is expected to perform time-based computation (our write function is a coroutine) but that it should not expect to run in an event loop, and so needs to start its own in a separate thread. Now when we call ``source.emit`` normally without using ``yield`` or ``await`` the emit call blocks, waiting on a coroutine to finish within the IOLoop. All functions here happen on the IOLoop. This is good for consistency, but can cause other concurrent applications to become unresponsive if your functions (like ``increment``) block for long periods of time. You might address this by using Dask (see below) which will offload these computations onto separate threads or processes. Using Dask ---------- Dask_ is a parallel computing library that uses Tornado for concurrency and threads for computation. The ``DaskStream`` object is a drop-in replacement for ``Stream`` (mostly). Typically we create a Dask client, and then ``scatter`` a local Stream to become a DaskStream. .. code-block:: python from dask.distributed import Client client = Client(processes=False) # starts thread pool, IOLoop in separate thread from streamz import Stream source = Stream() (source.scatter() # scatter local elements to cluster, creating a DaskStream .map(increment) # map a function remotely .buffer(5) # allow five futures to stay on the cluster at any time .gather() # bring results back to local process .sink(write)) # call write locally for x in range(10): source.emit(x) This operates very much like the synchronous case in terms of coding style (no ``@gen.coroutine`` or ``yield``) but does computations on separate threads. This also provides parallelism and access to a dashboard at http://localhost:8787/status . Asynchronous Dask ----------------- Dask can also operate within an event loop if preferred. Here you can get the non-blocking operation within an event loop while also offloading computations to separate threads. .. code-block:: python from dask.distributed import Client from tornado.ioloop import IOLoop async def f(): client = await Client(processes=False, asynchronous=True) source = Stream(asynchronous=True) source.scatter().map(increment).rate_limit('500ms').gather().sink(write) for x in range(10): await source.emit(x) IOLoop().run_sync(f) .. _Tornado: http://www.tornadoweb.org/en/stable/ .. _Dask: https://dask.pydata.org/en/latest/ streamz-0.6.4/docs/source/api.rst0000644000175000017500000000371614270277270016306 0ustar nileshnileshAPI === Stream ------ .. currentmodule:: streamz .. autosummary:: Stream .. 
autosummary:: Stream.connect Stream.destroy Stream.disconnect Stream.visualize accumulate buffer collect combine_latest delay filter flatten map partition rate_limit scatter sink sink_to_textfile slice sliding_window starmap timed_window union unique pluck zip zip_latest .. automethod:: Stream.connect .. automethod:: Stream.disconnect .. automethod:: Stream.destroy .. automethod:: Stream.emit .. automethod:: Stream.frequencies .. automethod:: Stream.register_api .. automethod:: Stream.sink .. automethod:: Stream.sink_to_list .. automethod:: Stream.sink_to_textfile .. automethod:: Stream.to_websocket .. automethod:: Stream.to_mqtt .. automethod:: Stream.update .. automethod:: Stream.visualize Sources ------- .. autosummary:: from_iterable filenames from_kafka from_kafka_batched from_mqtt from_process from_websocket from_textfile from_tcp from_http_server DaskStream ---------- .. currentmodule:: streamz.dask .. autosummary:: DaskStream gather Definitions ----------- .. currentmodule:: streamz .. autofunction:: accumulate .. autofunction:: buffer .. autofunction:: collect .. autofunction:: combine_latest .. autofunction:: delay .. autofunction:: filter .. autofunction:: flatten .. autofunction:: map .. autofunction:: partition .. autofunction:: rate_limit .. autofunction:: sink .. autofunction:: sink_to_textfile .. autofunction:: sliding_window .. autofunction:: Stream .. autofunction:: timed_window .. autofunction:: union .. autofunction:: unique .. autofunction:: pluck .. autofunction:: zip .. autofunction:: zip_latest .. autofunction:: from_iterable .. autofunction:: filenames .. autofunction:: from_kafka .. autofunction:: from_kafka_batched .. autofunction:: from_textfile .. currentmodule:: streamz.dask .. autofunction:: DaskStream .. autofunction:: gather streamz-0.6.4/docs/requirements-docs.txt0000644000175000017500000000011414270277270017702 0ustar nileshnileshnumpydoc sphinx sphinx_rtd_theme tornado toolz zict pandas dask distributed streamz-0.6.4/docs/make.bat0000644000175000017500000000145714270277270015110 0ustar nileshnilesh@ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=source set BUILDDIR=build set SPHINXPROJ=Streams if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% :end popd streamz-0.6.4/docs/Makefile0000644000175000017500000000114014270277270015130 0ustar nileshnilesh# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = Streams SOURCEDIR = source BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)streamz-0.6.4/docker/0000755000175000017500000000000014270277270014013 5ustar nileshnileshstreamz-0.6.4/docker/scripts/0000755000175000017500000000000014270277270015502 5ustar nileshnileshstreamz-0.6.4/docker/scripts/entry.sh0000755000175000017500000000133114270277270017200 0ustar nileshnilesh#!/bin/bash # Activate the streamz-dev anaconda environment by default source activate streamz-dev # Start Zookeeper $KAFKA_HOME/bin/zookeeper-server-start.sh -daemon $KAFKA_HOME/config/zookeeper.properties # Configure Kafka sed -i '/#listeners=PLAINTEXT:\/\/:9092/c\listeners=PLAINTEXT:\/\/localhost:9092' $KAFKA_HOME/config/server.properties sed -i '/#advertised.listeners=PLAINTEXT:\/\/your.host.name:9092/c\advertised.listeners=PLAINTEXT:\/\/localhost:9092' $KAFKA_HOME/config/server.properties # Start Kafka $KAFKA_HOME/bin/kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties # Start up a jupyter notebook cd /streamz/examples && jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' streamz-0.6.4/docker/run.sh0000755000175000017500000000006314270277270015155 0ustar nileshnilesh#!/bin/bash docker run -p 8888:8888 streamz:0.5.2 streamz-0.6.4/docker/build.sh0000755000175000017500000000005514270277270015451 0ustar nileshnilesh#!/bin/bash docker build -t streamz:0.5.2 . streamz-0.6.4/conftest.py0000644000175000017500000000057714270277270014754 0ustar nileshnileshimport pytest def pytest_addoption(parser): parser.addoption("--runslow", action="store_true", help="run slow tests") def pytest_runtest_setup(item): if 'slow' in item.keywords and not item.config.getoption("--runslow"): pytest.skip("need --runslow option to run") def pytest_ignore_collect(path, config): if 'run_test.py' in str(path): return True streamz-0.6.4/ci/0000755000175000017500000000000014270277270013137 5ustar nileshnileshstreamz-0.6.4/ci/environment-py38.yml0000644000175000017500000000077414270277270017037 0ustar nileshnileshname: test_env channels: - conda-forge - defaults dependencies: - python=3.8 - pytest - flake8 - black - isort - tornado - toolz - zict - six - librdkafka=1.5.3 - dask - distributed - pandas - python-confluent-kafka=1.5.0 - numpydoc - sphinx - sphinx_rtd_theme - codecov - coverage - networkx - graphviz - python-graphviz - pytest-asyncio - bokeh - ipython - ipykernel - ipywidgets - flaky - pytest-cov - coveralls - paho-mqtt - websockets streamz-0.6.4/ci/environment-py37.yml0000644000175000017500000000072114270277270017026 0ustar nileshnileshname: test_env channels: - conda-forge - defaults dependencies: - python=3.7 - pytest - flake8 - black - isort - tornado - toolz - zict - six - librdkafka=1.5.3 - dask - distributed - pandas - python-confluent-kafka=1.5.0 - numpydoc - sphinx - sphinx_rtd_theme - codecov - coverage - networkx - graphviz - pytest-asyncio - python-graphviz - bokeh - ipython - ipykernel - ipywidgets - flaky - pytest-cov streamz-0.6.4/README.rst0000644000175000017500000000252514270277270014237 0ustar nileshnileshStreamz ======= |Build Status| |Doc Status| |Version Status| |RAPIDS custreamz gpuCI| Streamz helps you build pipelines to manage continuous streams of data. It is simple to use in simple cases, but also supports complex pipelines that involve branching, joining, flow control, feedback, back pressure, and so on. Optionally, Streamz can also work with both `Pandas `_ and `cuDF `_ dataframes, to provide sensible streaming operations on continuous tabular data. 
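A minimal pipeline, following the same pattern as the core documentation, looks like this (assuming ``streamz`` is installed):

.. code-block:: python

    from streamz import Stream

    source = Stream()
    source.map(lambda x: x + 1).sink(print)

    for i in range(3):
        source.emit(i)   # prints 1, 2, 3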
To learn more about how to use Streamz see documentation at `streamz.readthedocs.org `_. LICENSE ------- BSD-3 Clause .. |Build Status| image:: https://github.com/python-streamz/streamz/workflows/CI/badge.svg :target: https://github.com/python-streamz/streamz/actions .. |Doc Status| image:: http://readthedocs.org/projects/streamz/badge/?version=latest :target: http://streamz.readthedocs.org/en/latest/ :alt: Documentation Status .. |Version Status| image:: https://img.shields.io/pypi/v/streamz.svg :target: https://pypi.python.org/pypi/streamz/ .. |RAPIDS custreamz gpuCI| image:: https://img.shields.io/badge/gpuCI-custreamz-green :target: https://github.com/jdye64/cudf/blob/kratos/python/custreamz/custreamz/kafka.py streamz-0.6.4/MANIFEST.in0000644000175000017500000000031114270277270014275 0ustar nileshnileshrecursive-include streamz *.py recursive-include docs/source * include docs/Makefile docs/make.bat include setup.py include README.rst include LICENSE.txt include MANIFEST.in include requirements.txt streamz-0.6.4/LICENSE.txt0000644000175000017500000000274114270277270014373 0ustar nileshnileshCopyright (c) 2017, Continuum Analytics, Inc. and contributors All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of Continuum Analytics nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. streamz-0.6.4/Dockerfile0000644000175000017500000000275214270277270014544 0ustar nileshnileshFROM python:3.7.5-slim USER root SHELL ["/bin/bash", "--login", "-c"] ENV DEBIAN_FRONTEND noninteractive ENV SCALA_VERSION 2.11 ENV KAFKA_VERSION 2.3.0 ENV KAFKA_HOME /opt/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION" # Install conda ADD https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh /miniconda.sh RUN sh /miniconda.sh -b -p /conda && /conda/bin/conda update -n base conda RUN echo "PATH=${PATH}:/conda/bin" >> ~/.bashrc # Add Streamz source code to the build context ADD . /streamz/. 
# Create the conda environment RUN conda env create --name streamz-dev -f /streamz/conda/environments/streamz_dev.yml RUN conda init bash # Ensures subsequent RUN commands do not need the "conda activate streamz-dev" command RUN echo "conda activate streamz-dev" >> ~/.bashrc # Build streamz from source RUN cd /streamz && \ python setup.py install # Install optional dependencies in the conda environment RUN conda install -c conda-forge jupyterlab \ numpy \ pandas \ wget \ vim # Install Kafka RUN wget -q http://www.gtlib.gatech.edu/pub/apache/kafka/2.3.0/kafka_2.11-2.3.0.tgz -O /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz && \ tar xfz /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -C /opt && \ rm /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz # Zookeeper & Kafka ports EXPOSE 2181 EXPOSE 9092 CMD ["/streamz/docker/scripts/entry.sh"] streamz-0.6.4/CONTRIBUTING.md0000644000175000017500000000107614270277270015001 0ustar nileshnilesh# Contributing to Streamz ## Streamz Conda Environment For the convenience of the community streamz offers an environment file for contributors to create conda environments. A few basic quick start commands for creating and using a streamz conda environment are found below. ### Creating Conda Development Environment Creating the streamz conda environment can be achieved by simply running ```conda env create -f ./conda/environments/streamz_dev.yml``` ### Using Conda Environment The streamz conda environment can be activated by running ```conda activate streamz_dev``` streamz-0.6.4/.gitignore0000644000175000017500000000021614270277270014533 0ustar nileshnilesh*.pyc *.egg-info docs/build build/ dist/ .idea/ log.* log .coverage .DS_Store *.swp *.swo .cache/ .ipynb_checkpoints/ .vscode .python-version streamz-0.6.4/.dockerignore0000644000175000017500000000047614270277270015227 0ustar nileshnilesh# Common IDE configuration directories and files that should be ignored *.vscode *.idea # Previous build output that should not be included build directories # Ignore the Dockerfile itself Dockerfile # Ignore CI related files as we don't need those in the container .codecov.yml .coveragerc .gitignore .travis.yml streamz-0.6.4/.coveragerc0000644000175000017500000000044314270277270014666 0ustar nileshnilesh[run] source = streamz relative_files = True [report] omit = */python?.?/* */site-packages/nose/* # ignore _version.py and versioneer.py .*version.* *_version.py */test_*.py exclude_lines = if __name__ == '__main__': pragma: no cover NotImplementedError streamz-0.6.4/.codecov.yml0000644000175000017500000000114414270277270014767 0ustar nileshnilesh# codecov can find this file anywhere in the repo, so we don't need to clutter # the root folder. #comment: false codecov: notify: require_ci_to_pass: no coverage: status: patch: default: target: '80' if_no_uploads: error if_not_found: success if_ci_failed: failure project: default: false library: target: auto if_no_uploads: error if_not_found: success if_ci_failed: failure paths: '!*/tests/.*' tests: target: 97.9% paths: '*/tests/.*' flags: tests: paths: - tests/