package/LICENSE

The MIT License (MIT)

Copyright (c) 2015 Will Welch

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

package/example.html

TDigest example
TDigest of 1M samples uniform in [0..1]:
(only the page's text content survives here; its HTML markup and inline script were lost in extraction)
package/specs/digest.spec.js

var Digest = require('../tdigest').Digest;
var assert = require('better-assert');
assert.deepEqual = require('chai').assert.deepEqual;

describe('Digests in discrete mode', function(){
    it('consumes same-valued points into a single point', function(){
        var digest = new Digest({mode:'disc'});
        var i, N = 1000;
        for (i = 0 ; i < N ; i = i + 1) {
            digest.push(42);
        }
        var points = digest.toArray();
        assert.deepEqual(points, [{mean: 42, n:N}]);
    });
    it('handles multiple duplicates', function(){
        var digest = new Digest({mode:'disc'});
        var i, N = 10;
        for (i = 0 ; i < N ; i++) {
            digest.push(0.0);
            digest.push(1.0);
            digest.push(0.5);
        }
        assert.deepEqual(
            digest.toArray(),
            [{mean:0.0, n:N},
             {mean:0.5, n:N},
             {mean:1.0, n:N}]
        );
    });
});
describe('Digests in continuous mode', function(){
    // these results should be the same as a plain old TDigest
    //
    it('compresses points and preserves bounds', function(){
        var digest = new Digest({mode:'cont'});
        var i, N = 100;
        for (i = 0 ; i < N ; i += 1) {
            digest.push(i*10);
        }
        assert(digest.size() === 100);
        digest.delta = 0.1; // encourage merging (don't do this!)
        digest.compress();
        var points = digest.toArray();
        assert(points.length < 100);
        assert(points[0].mean === 0);
        assert(points[points.length-1].mean === (N - 1) * 10);
    });
    it('automatically compresses during ingest', function(){
        var digest = new Digest({mode:'cont'});
        var i, N = 10000;
        for (i = 0 ; i < N ; i += 1) {
            digest.push(i*10);
        }
        var points = digest.toArray();
        assert(digest.nreset > 1);
        assert(points.length < 10000);
        assert(points[0].mean === 0);
        assert(points[points.length-1].mean === 99990);
    });
});
describe('Digests in auto mode', function(){
    it('preserves a discrete distribution', function(){
        var digest = new Digest();
        var i, j, ntimes = 1000, nvals = 100; // declare j (was an implicit global)
        for (i = 0 ; i < ntimes ; i++) {
            for (j = 0 ; j < nvals ; j++) {
                digest.push(j);
            }
        }
        var result = digest.toArray();
        for (i = 0 ; i < nvals ; i++) {
            assert.deepEqual(result[i], {mean:i, n:ntimes});
        }
    });
    it('compresses a continuous distribution', function(){
        var digest = new Digest();
        var i, j, N = 10, M = 1000;
        for (i = 0 ; i < N ; i++) {
            for (j = 0 ; j < M ; j++) { // inner loop gets its own index (was reusing i)
                digest.push(Math.random());
            }
        }
        var result = digest.toArray();
        assert(result.length < digest.n);
    });
});
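A minimal sketch of the three Digest configurations these specs construct (not part of the test suite; option names are those read by the Digest constructor in tdigest.js, values are illustrative):

```javascript
// sketch: the three Digest modes exercised by the specs above
var Digest = require('./tdigest').Digest;

var exact  = new Digest({mode:'disc'});              // never merge; exact counts
var approx = new Digest({mode:'cont', delta:0.01});  // plain t-digest behavior
var auto   = new Digest({                            // default: start exact,
    mode:'auto',                                     // switch when >90% of >=1000
    thresh:1000,                                     // stored values are unique
    ratio:0.9
});

exact.push([1, 1, 2]);
console.log(exact.toArray()); // [ {mean:1, n:2}, {mean:2, n:1} ]
```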
package/specs/discrete.spec.js

// test the discrete distribution behavior, in which TDigest merging is
// disabled and mass is accumulated at points and reported exactly.
//
var TDigest = require('../tdigest').TDigest;
var assert = require('better-assert');
assert.deepEqual = require('chai').assert.deepEqual;

describe('discrete digests', function(){
    it('consumes increasing-valued points', function(){
        var tdigest = new TDigest(false);
        var i, N = 100;
        for (i = 0 ; i < N ; i += 1) {
            tdigest.push(i*10);
        }
        var points = tdigest.toArray(true);
        for (i = 0 ; i < N ; i += 1) {
            assert(points[i].mean === i*10);
        }
    });
    it('consumes decreasing-valued points', function(){
        var tdigest = new TDigest(false);
        var i, N = 100;
        for (i = N - 1 ; i >= 0 ; i = i - 1) {
            tdigest.push(i*10);
        }
        var points = tdigest.toArray(true);
        for (i = 0 ; i < N ; i += 1) {
            assert(points[i].mean === i*10);
        }
    });
    it('consumes nonnumeric points', function(){
        var tdigest = new TDigest(false);
        tdigest.push("foo");
        tdigest.push("bar");
        tdigest.push("baz");
        tdigest.push("foo");
        tdigest.push("freen");
        tdigest.push("bork");
        tdigest.push("bork");
        tdigest.push("bork");
        tdigest.push("books");
        var points = tdigest.toArray();
        assert.deepEqual(points, [
            {mean:"bar", n:1},
            {mean:"baz", n:1},
            {mean:"books", n:1},
            {mean:"bork", n:3},
            {mean:"foo", n:2},
            {mean:"freen", n:1}
        ]);
    });
    it('consumes same-valued points into a single point', function(){
        var tdigest = new TDigest(false);
        var i, N = 100;
        for (i = 0 ; i < N ; i = i + 1) {
            tdigest.push(1000);
        }
        var points = tdigest.toArray();
        assert.deepEqual(points, [{mean: 1000, n:N}]);
    });
    it('selects a run of duplicates containing the percentile', function(){
        var tdigest = new TDigest(false);
        tdigest.push([5, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 3, 0, 0, 0, 0, 6, 1, 0, 6, 5,
                      3, 6, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0]);
        assert(tdigest.percentile(0.5) === 0);
    });
    it('handles multiple duplicates', function(){
        var tdigest = new TDigest(false);
        var i, N = 10;
        for (i = 0 ; i < N ; i++) {
            tdigest.push(0.0);
            tdigest.push(1.0);
            tdigest.push(0.5);
        }
        assert.deepEqual(
            tdigest.toArray(),
            [{mean:0.0, n:N},
             {mean:0.5, n:N},
             {mean:1.0, n:N}]
        );
    });
});
describe('discrete percentile ranks', function(){
    it('from a single point', function(){
        var tdigest = new TDigest(false);
        tdigest.push(0);
        var x = [-0.5, 0, 0.5, 1.0, 1.5];
        var q = [0, 1, 1, 1, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from two points', function(){
        var tdigest = new TDigest(false);
        tdigest.push([0, 1]);
        var x = [-0.5, 0, 0.5, 1.0, 1.5];
        var q = [0, 0.5, 0.5, 1.0, 1.0];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from three points', function(){
        var tdigest = new TDigest(false);
        tdigest.push([-1, 0, 1]);
        var x = [-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5];
        var q = [0, 1/3, 1/3, 2/3, 2/3, 1, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from three points is same as from multiples of those points', function(){
        var tdigest = new TDigest(false);
        tdigest.push([0, 1, -1]);
        var x = [-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5];
        var result1 = tdigest.p_rank(x);
        tdigest.push([0, 1, -1]);
        tdigest.push([0, 1, -1]);
        var result2 = tdigest.p_rank(x);
        assert.deepEqual(result1, result2);
    });
    it('from four points away from the origin', function(){
        var tdigest = new TDigest(false);
        tdigest.push([10, 11, 12, 13]);
        var x = [9, 10, 11, 12, 13, 14];
        var q = [0, 1/4, 2/4, 3/4, 1, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from four points is same as from multiples of those points', function(){
        var tdigest = new TDigest(false);
        tdigest.push([10, 11, 12, 13]);
        var x = [9, 10, 11, 12, 13, 14];
        var result1 = tdigest.p_rank(x);
        tdigest.push([10, 11, 12, 13]);
        tdigest.push([10, 11, 12, 13]);
        var result2 = tdigest.p_rank(x);
        assert.deepEqual(result1, result2);
    });
    it('from nonnumeric points', function(){
        var tdigest = new TDigest(false);
        tdigest.push("foo");
        tdigest.push("bar");
        tdigest.push("baz");
        tdigest.push("freen");
        var x = ["bar", "baz", "foo", "freen"];
        var q = [1/4, 2/4, 3/4, 4/4];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
});
describe('discrete percentiles', function(){
    it('from a single point', function(){
        var tdigest = new TDigest(false);
        tdigest.push(0);
        var p = [0, 0.5, 1.0];
        var x = [0, 0, 0];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from two points', function(){
        var tdigest = new TDigest(false);
        tdigest.push([0, 10]);
        var p = [0, 1/4, 1/2, 3/4, 1];
        var x = [0, 0, 0, 10, 10];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from three points', function(){
        var tdigest = new TDigest(false);
        tdigest.push([0, 5, 10]);
        var p = [0, 1/4, 1/2.9, 1/2, 2/3, 3/4, 1];
        var x = [0, 0, 5, 5, 5, 10, 10];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from four points away from the origin', function(){
        var tdigest = new TDigest(false);
        tdigest.push([10, 11, 12, 13]);
        var p = [0, 1/4, 1/2, 3/4, 1];
        var x = [10, 10, 11, 12, 13];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from nonnumeric points', function(){
        var tdigest = new TDigest(false);
        tdigest.push("foo");
        tdigest.push("bar");
        tdigest.push("baz");
        tdigest.push("freen");
        var p = [0, 1/4, 1/2, 3/4, 1];
        var x = ["bar", "bar", "baz", "foo", "freen"];
        assert.deepEqual(tdigest.percentile(p), x);
    });
});
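Discrete digests report exact, nearest-rank results even for nonnumeric samples, as the specs above exercise. A minimal sketch of that usage (not part of the package; assumes tdigest is installed from npm):

```javascript
// sketch: a discrete (delta === false) TDigest reports exact values
var TDigest = require('tdigest').TDigest;

var td = new TDigest(false);       // discrete mode: never merge points
td.push(["small", "medium", "large", "medium"]);
console.log(td.percentile(0.5));   // "medium" (nearest rank on sorted values)
console.log(td.p_rank("large"));   // 0.25 -- "large" sorts first among 4 samples
```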
package/distributions.js

//
// distributions:
//
// measure and report approximation error on classic distribution functions
//
var TDigest = require('./tdigest').TDigest;

function distributions(nruns, nsamples, npercentiles) {
    [uniform,
     gaussian,
     chisq,
     exponential,
     make_brownian()
    ].map(function(dist) {
        var fname = dist.toString().match(/function ([^\(]+)/)[1];
        console.log(fname + ": " + nruns + " runs digesting " +
                    nsamples + " points:");
        var errs = [0, 0];
        for (var i = 0 ; i < nruns ; i++) {
            var err = compare_edf(dist, nsamples, npercentiles);
            errs[0] += err[0];
            errs[1] += err[1];
        }
        errs[0] /= nruns;
        errs[1] /= nruns;
        console.log(fname + ": avg rmse (relative) == " + errs[0] +
                    ", avg max err (relative) == " + errs[1]);
    });
}

function compare(d1, d2, N) {
    // compare digests d1 and d2 over N evenly spaced percentiles.
    // return RMSE and maximum error, both relative to the
    // distributions' maximum observed magnitude.
    //
    var maxerr = 0;
    var rmse = 0;
    var scale = Math.max(Math.abs(d1.percentile(0)), Math.abs(d1.percentile(1)),
                         Math.abs(d2.percentile(0)), Math.abs(d2.percentile(1)));
    for (var i = 0 ; i <= N ; i += 1) {
        var q1 = d1.percentile(i/N);
        var q2 = d2.percentile(i/N);
        maxerr = Math.max(maxerr, Math.abs(q1 - q2));
        rmse += (q1 - q2) * (q1 - q2);
    }
    rmse = Math.sqrt(rmse/i);
    return [rmse/scale, maxerr/scale];
}

function compare_edf(f, nsamples, npercentiles) {
    // draw samples from f, digest them, and compare digest percentile
    // results to EDF of original samples.
    //
    var edf = new TDigest(false);
    var digest = new TDigest();
    for (var i = 0 ; i < nsamples ; i++) {
        var x = f();
        edf.push(x);
        digest.push(x);
    }
    digest.compress();
    return compare(edf, digest, npercentiles);
}

function uniform() {
    return Math.random();
}

var _extra = null;
function gaussian(mean, sigma) {
    // Marsaglia polar method: produces two normal deviates per round
    // trip, caching the second in _extra.
    mean = mean || 0;
    sigma = sigma || 1;
    if (_extra != null) {
        var result = mean + sigma * _extra;
        _extra = null;
        return result;
    } else {
        var u = 2 * Math.random() - 1;
        var v = 2 * Math.random() - 1;
        var r = u*u + v*v;
        if (r == 0 || r > 1) {
            // out of bounds, try again
            return gaussian(mean, sigma);
        }
        var c = Math.sqrt(-2*Math.log(r)/r);
        _extra = u * c;
        return mean + sigma * v * c;
    }
}

function chisq() {
    var k = 3;
    var total = 0;
    for (var i = 0 ; i < k ; i++) {
        var x = gaussian();
        total += x * x;
    }
    return total;
}

function exponential() {
    // unit-rate exponential via inverse CDF. (bugfix: the original
    // Math.exp(-Math.random()) yields values confined to [1/e, 1],
    // not an exponential distribution)
    return -Math.log(1 - Math.random());
}

function make_brownian() {
    var brownian_state = 0;
    return function brownian() {
        brownian_state += gaussian();
        return brownian_state;
    };
}

distributions(10, 100000, 100);

package/example.js

// Examples of Digest, which automatically chooses between
// a discrete and TDigest representation of a streaming sample.
//
var Digest = require('./tdigest').Digest;

//
// create a frequency digest for a small sample. automatically store
// these as discrete samples and report exact percentiles
//
var digest = new Digest(), N = 10; // declare digest (was an implicit global)
for (var i = 0 ; i < N ; i += 1) {
    digest.push(i/N * 10 - 5);
}
console.log(digest.summary());
for (var p = 0 ; p <= 1.0 ; p += 0.1) {
    console.log("p = "+p.toFixed(2)+", x == "+(digest.percentile(p)));
}
for (var x = -5 ; x <= 5 ; x += 1.0) {
    console.log("x = "+x+", p == "+(digest.p_rank(x)));
}

//
// the digest remains exact for a large number of samples having
// a small number of distinct values
//
N = 10000;
digest = new Digest();
for (i = 0 ; i < N ; i += 1) {
    digest.push(Math.floor(i/N * 10 - 5));
}
console.log(digest.summary());
for (p = 0 ; p <= 1.0 ; p += 0.1) {
    console.log("p = "+p.toFixed(2)+", x == "+(digest.percentile(p)));
}
for (x = -5 ; x <= 5 ; x += 1.0) {
    console.log("x = "+x+", p == "+(digest.p_rank(x)));
}

//
// the digest automatically shifts to a TDigest approximation for a
// large number of distinct sample values
//
N = 10000;
digest = new Digest();
for (i = 0 ; i < N ; i += 1) {
    digest.push(i/N * 10 - 5);
}
digest.compress();
console.log(digest.summary());
for (p = 0 ; p <= 1.0 ; p += 0.1) {
    console.log("p = "+p.toFixed(2)+", x ~ "+(digest.percentile(p)));
}
for (x = -5 ; x <= 5 ; x += 1.0) {
    console.log("x = "+x+", p ~ "+(digest.p_rank(x)));
}

//
// force the digest to store all unique samples, regardless of number
//
N = 10000;
digest = new Digest({mode:'disc'});
for (i = 0 ; i < N ; i += 1) {
    digest.push(i/N * 10 - 5);
}
console.log(digest.summary());
for (p = 0 ; p <= 1.0 ; p += 0.1) {
    console.log("p = "+p.toFixed(2)+", x == "+(digest.percentile(p)));
}
for (x = -5 ; x <= 5 ; x += 1.0) {
    console.log("x = "+x+", p == "+(digest.p_rank(x)));
}

package/gruntfile.js

module.exports = function(grunt) {
    var pkg = grunt.file.readJSON('package.json');
    grunt.initConfig({
        pure_cjs: {
            options: {
                exports: 'tdigest',
                comments: true
            },
            'dist/tdigest.js': 'tdigest.js'
        }
    });
    grunt.task.loadNpmTasks('grunt-pure-cjs');
    grunt.registerTask('dist', ['pure_cjs']);
};
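The exponential() sampler in distributions.js above was rewritten to draw from an actual unit-rate exponential via the inverse CDF. A quick standalone check of that fix (a hypothetical snippet, not part of the package): the sample mean should approach 1 and the digest's median should approach ln 2.

```javascript
// sketch: sanity-check the inverse-CDF exponential sampler
var TDigest = require('./tdigest').TDigest;

function exponential() {
    return -Math.log(1 - Math.random()); // unit rate: mean 1, median ln 2
}

var td = new TDigest(), sum = 0, N = 100000;
for (var i = 0; i < N; i++) {
    var x = exponential();
    sum += x;
    td.push(x);
}
td.compress();
console.log(sum / N);            // ~1.0
console.log(td.percentile(0.5)); // ~0.693 (ln 2)
```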
package/dist/tdigest.js

(generated UMD bundle: the `grunt dist` task wraps the tdigest.js source below, together with its bundled bintrees dependency, for standalone browser use; omitted here as a build artifact duplicating that source)
package/tdigest.js

//
// TDigest:
//
// approximate distribution percentiles from a stream of reals
//
var RBTree = require('bintrees').RBTree;

function TDigest(delta, K, CX) {
    // allocate a TDigest structure.
    //
    // delta is the compression factor, the max fraction of mass that
    // can be owned by one centroid (bigger, up to 1.0, means more
    // compression). delta=false switches off TDigest behavior and treats
    // the distribution as discrete, with no merging and exact values
    // reported.
    //
    // K is a size threshold that triggers recompression as the TDigest
    // grows during input. (Set it to 0 to disable automatic recompression)
    //
    // CX specifies how often to update cached cumulative totals used
    // for quantile estimation during ingest (see cumulate()). Set to
    // 0 to use exact quantiles for each new point.
    //
    this.discrete = (delta === false);
    this.delta = delta || 0.01;
    this.K = (K === undefined) ? 25 : K;
    this.CX = (CX === undefined) ? 1.1 : CX;
    this.centroids = new RBTree(compare_centroid_means);
    this.nreset = 0;
    this.reset();
}

TDigest.prototype.reset = function() {
    // prepare to digest new points.
    //
    this.centroids.clear();
    this.n = 0;
    this.nreset += 1;
    this.last_cumulate = 0;
};

TDigest.prototype.size = function() {
    return this.centroids.size;
};

TDigest.prototype.toArray = function(everything) {
    // return {mean,n} of centroids as an array ordered by mean.
    //
    var result = [];
    if (everything) {
        this._cumulate(true); // be sure cumns are exact
        this.centroids.each(function(c) { result.push(c); });
    } else {
        this.centroids.each(function(c) {
            result.push({mean:c.mean, n:c.n});
        });
    }
    return result;
};

TDigest.prototype.summary = function() {
    var approx = (this.discrete) ? "exact " : "approximating ";
    var s = [approx + this.n + " samples using " + this.size() + " centroids",
             "min = "+this.percentile(0),
             "Q1 = "+this.percentile(0.25),
             "Q2 = "+this.percentile(0.5),
             "Q3 = "+this.percentile(0.75),
             "max = "+this.percentile(1.0)];
    return s.join('\n');
};

function compare_centroid_means(a, b) {
    // order two centroids by mean.
    //
    return (a.mean > b.mean) ? 1 : (a.mean < b.mean) ? -1 : 0;
}

function compare_centroid_mean_cumns(a, b) {
    // order two centroids by mean_cumn.
    //
    return (a.mean_cumn - b.mean_cumn);
}

TDigest.prototype.push = function(x, n) {
    // incorporate value or array of values x, having count n into the
    // TDigest. n defaults to 1.
    //
    n = n || 1;
    x = Array.isArray(x) ? x : [x];
    for (var i = 0 ; i < x.length ; i++) {
        this._digest(x[i], n);
    }
};

TDigest.prototype.push_centroid = function(c) {
    // incorporate centroid or array of centroids c
    //
    c = Array.isArray(c) ? c : [c];
    for (var i = 0 ; i < c.length ; i++) {
        this._digest(c[i].mean, c[i].n);
    }
};

TDigest.prototype._cumulate = function(exact) {
    // update cumulative counts for each centroid
    //
    // exact: falsey means only cumulate after sufficient
    // growth. During ingest, these counts are used as quantile
    // estimates, and they work well even when somewhat out of
    // date. (this is a departure from the publication, you may set CX
    // to 0 to disable).
    //
    if (this.n === this.last_cumulate ||
        !exact && this.CX && this.CX > (this.n / this.last_cumulate)) {
        return;
    }
    var cumn = 0;
    this.centroids.each(function(c) {
        c.mean_cumn = cumn + c.n / 2; // half of n at the mean
        cumn = c.cumn = cumn + c.n;
    });
    this.n = this.last_cumulate = cumn;
};

TDigest.prototype.find_nearest = function(x) {
    // find the centroid closest to x. The assumption of
    // unique means and a unique nearest centroid departs from the
    // paper, see _digest() below
    //
    if (this.size() === 0) {
        return null;
    }
    var iter = this.centroids.lowerBound({mean:x}); // x <= iter || iter==null
    var c = (iter.data() === null) ? iter.prev() : iter.data();
    if (c.mean === x || this.discrete) {
        return c; // c is either x or a neighbor (discrete: no distance func)
    }
    var prev = iter.prev();
    if (prev && Math.abs(prev.mean - x) < Math.abs(c.mean - x)) {
        return prev;
    } else {
        return c;
    }
};

TDigest.prototype._new_centroid = function(x, n, cumn) {
    // create and insert a new centroid into the digest (don't update
    // cumulatives).
    //
    var c = {mean:x, n:n, cumn:cumn};
    this.centroids.insert(c);
    this.n += n;
    return c;
};

TDigest.prototype._addweight = function(nearest, x, n) {
    // add weight at location x to nearest centroid. adding x to
    // nearest will not shift its relative position in the tree and
    // require reinsertion.
    //
    if (x !== nearest.mean) {
        nearest.mean += n * (x - nearest.mean) / (nearest.n + n);
    }
    nearest.cumn += n;
    nearest.mean_cumn += n / 2;
    nearest.n += n;
    this.n += n;
};

TDigest.prototype._digest = function(x, n) {
    // incorporate value x, having count n into the TDigest.
    //
    var min = this.centroids.min();
    var max = this.centroids.max();
    var nearest = this.find_nearest(x);
    if (nearest && nearest.mean === x) {
        // accumulate exact matches into the centroid without
        // limit. this is a departure from the paper, made so
        // centroids remain unique and code can be simple.
        this._addweight(nearest, x, n);
    } else if (nearest === min) {
        this._new_centroid(x, n, 0); // new point around min boundary
    } else if (nearest === max) {
        this._new_centroid(x, n, this.n); // new point around max boundary
    } else if (this.discrete) {
        this._new_centroid(x, n, nearest.cumn); // never merge
    } else {
        // consider a merge based on nearest centroid's capacity. if
        // there's not room for all of n, don't bother merging any of
        // it into nearest, as we'll have to make a new centroid
        // anyway for the remainder (departure from the paper).
        var p = nearest.mean_cumn / this.n;
        var max_n = Math.floor(4 * this.n * this.delta * p * (1 - p));
        if (max_n - nearest.n >= n) {
            this._addweight(nearest, x, n);
        } else {
            this._new_centroid(x, n, nearest.cumn);
        }
    }
    this._cumulate(false);
    if (!this.discrete && this.K && this.size() > this.K / this.delta) {
        // re-process the centroids and hope for some compression.
        this.compress();
    }
};

TDigest.prototype.bound_mean = function(x) {
    // find centroids lower and upper such that lower.mean < x <
    // upper.mean or lower.mean === x === upper.mean. Don't call
    // this for x out of bounds.
    //
    var iter = this.centroids.upperBound({mean:x}); // x < iter
    var lower = iter.prev();      // lower <= x
    var upper = (lower.mean === x) ? lower : iter.next();
    return [lower, upper];
};

TDigest.prototype.p_rank = function(x_or_xlist) {
    // return approximate percentile-ranks (0..1) for data value x,
    // or list of x. calculated according to
    // https://en.wikipedia.org/wiki/Percentile_rank
    //
    // (Note that in continuous mode, boundary sample values will
    // report half their centroid weight inward from 0/1 as the
    // percentile-rank. X values outside the observed range return
    // 0/1)
    //
    // this triggers cumulate() if cumn's are out of date.
    //
    var xs = Array.isArray(x_or_xlist) ? x_or_xlist : [x_or_xlist];
    var ps = xs.map(this._p_rank, this);
    return Array.isArray(x_or_xlist) ? ps : ps[0];
};

TDigest.prototype._p_rank = function(x) {
    if (this.size() === 0) {
        return undefined;
    } else if (x < this.centroids.min().mean) {
        return 0.0;
    } else if (x > this.centroids.max().mean) {
        return 1.0;
    }
    // find centroids that bracket x and interpolate x's cumn from
    // their cumn's.
    this._cumulate(true); // be sure cumns are exact
    var bound = this.bound_mean(x);
    var lower = bound[0], upper = bound[1];
    if (this.discrete) {
        return lower.cumn / this.n;
    } else {
        var cumn = lower.mean_cumn;
        if (lower !== upper) {
            cumn += (x - lower.mean) * (upper.mean_cumn - lower.mean_cumn)
                / (upper.mean - lower.mean);
        }
        return cumn / this.n;
    }
};

TDigest.prototype.bound_mean_cumn = function(cumn) {
    // find centroids lower and upper such that lower.mean_cumn < cumn <
    // upper.mean_cumn or lower.mean_cumn === cumn === upper.mean_cumn.
    // Don't call this for cumn out of bounds.
    //
    // XXX because mean and mean_cumn give rise to the same sort order
    // (up to identical means), use the mean rbtree for our search.
    this.centroids._comparator = compare_centroid_mean_cumns;
    var iter = this.centroids.upperBound({mean_cumn:cumn}); // cumn < iter
    this.centroids._comparator = compare_centroid_means;
    var lower = iter.prev();      // lower <= cumn
    var upper = (lower && lower.mean_cumn === cumn) ? lower : iter.next();
    return [lower, upper];
};

TDigest.prototype.percentile = function(p_or_plist) {
    // for percentage p (0..1), or for each p in a list of ps, return
    // the smallest data value q at which at least p percent of the
    // observations <= q.
    //
    // for discrete distributions, this selects q using the Nearest
    // Rank Method
    // (https://en.wikipedia.org/wiki/Percentile#The_Nearest_Rank_method)
    // (in scipy, same as percentile(..., interpolation='higher'))
    //
    // for continuous distributions, interpolates data values between
    // count-weighted bracketing means.
    //
    // this triggers cumulate() if cumn's are out of date.
    //
    var ps = Array.isArray(p_or_plist) ? p_or_plist : [p_or_plist];
    var qs = ps.map(this._percentile, this);
    return Array.isArray(p_or_plist) ? qs : qs[0];
};

TDigest.prototype._percentile = function(p) {
    if (this.size() === 0) {
        return undefined;
    }
    this._cumulate(true); // be sure cumns are exact
    var h = this.n * p;
    var bound = this.bound_mean_cumn(h);
    var lower = bound[0], upper = bound[1];
    if (upper === lower || lower === null || upper === null) {
        return (lower || upper).mean;
    } else if (!this.discrete) {
        return lower.mean + (h - lower.mean_cumn) * (upper.mean - lower.mean)
            / (upper.mean_cumn - lower.mean_cumn);
    } else if (h <= lower.cumn) {
        return lower.mean;
    } else {
        return upper.mean;
    }
};

function pop_random(choices) {
    // remove and return an item randomly chosen from the array of choices
    // (mutates choices)
    //
    var idx = Math.floor(Math.random() * choices.length);
    return choices.splice(idx, 1)[0];
}

TDigest.prototype.compress = function() {
    // TDigests experience worst case compression (none) when input
    // increases monotonically. Improve on any bad luck by
    // reconsuming digest centroids as if they were weighted points
    // while shuffling their order (and hope for the best).
    //
    if (this.compressing) {
        return;
    }
    var points = this.toArray();
    this.reset();
    this.compressing = true;
    while (points.length > 0) {
        this.push_centroid(pop_random(points));
    }
    this._cumulate(true);
    this.compressing = false;
};

function Digest(config) {
    // allocate a distribution digest structure. This is an extension
    // of a TDigest structure that starts in exact histogram (discrete)
    // mode, and automatically switches to TDigest mode for large
    // samples that appear to be from a continuous distribution.
    //
    this.config = config || {};
    this.mode = this.config.mode || 'auto'; // disc, cont, auto
    TDigest.call(this, this.mode === 'cont' ? config.delta : false);
    this.digest_ratio = this.config.ratio || 0.9;
    this.digest_thresh = this.config.thresh || 1000;
    this.n_unique = 0;
}
Digest.prototype = Object.create(TDigest.prototype);
Digest.prototype.constructor = Digest;

Digest.prototype.push = function(x_or_xlist) {
    TDigest.prototype.push.call(this, x_or_xlist);
    this.check_continuous();
};

Digest.prototype._new_centroid = function(x, n, cumn) {
    this.n_unique += 1;
    TDigest.prototype._new_centroid.call(this, x, n, cumn);
};

Digest.prototype._addweight = function(nearest, x, n) {
    if (nearest.n === 1) {
        this.n_unique -= 1;
    }
    TDigest.prototype._addweight.call(this, nearest, x, n);
};

Digest.prototype.check_continuous = function() {
    // while in 'auto' mode, if there are many unique elements, assume
    // they are from a continuous distribution and switch to 'cont'
    // mode (tdigest behavior). Return true on transition from
    // discrete to continuous.
    if (this.mode !== 'auto' || this.size() < this.digest_thresh) {
        return false;
    }
    if (this.n_unique / this.size() > this.digest_ratio) {
        this.mode = 'cont';
        this.discrete = false;
        this.delta = this.config.delta || 0.01;
        this.compress();
        return true;
    }
    return false;
};

module.exports = {
    'TDigest': TDigest,
    'Digest': Digest
};
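A minimal end-to-end sketch of the API defined above (not part of the package; run from the package root, and outputs are approximate):

```javascript
// sketch: exercise the TDigest API with its documented parameters
var TDigest = require('./tdigest').TDigest;

// delta=0.01 caps each centroid's mass fraction; K=25 triggers
// recompression when size exceeds K/delta centroids; CX=1.1 refreshes
// cached cumulative counts only after 10% growth.
var td = new TDigest(0.01, 25, 1.1);
for (var i = 0; i < 100000; i++) {
    td.push(Math.random());
}
td.compress();
console.log(td.size());                  // far fewer centroids than samples
console.log(td.percentile([0.5, 0.99])); // ~[0.5, 0.99] for uniform input
console.log(td.p_rank(0.25));            // ~0.25
```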
package/specs/tdigest.spec.js

var TDigest = require('../tdigest').TDigest;
var assert = require('better-assert');
assert.deepEqual = require('chai').assert.deepEqual;

describe('T-Digests in which each point becomes a centroid', function(){
    it('consumes a point', function(){
        var tdigest = new TDigest();
        tdigest.push(0);
        var points = tdigest.toArray();
        assert.deepEqual(points, [{mean:0, n:1}]);
    });
    it('consumes two points', function(){
        var tdigest = new TDigest();
        tdigest.push([0, 1]);
        var points = tdigest.toArray();
        assert.deepEqual(points, [{mean:0, n:1}, {mean:1, n:1}]);
    });
    it('consumes three points', function(){
        var tdigest = new TDigest();
        tdigest.push([0, 1, -1]);
        var points = tdigest.toArray();
        assert.deepEqual(points, [{mean:-1, n:1}, {mean:0, n:1}, {mean:1, n:1}]);
    });
    it('consumes increasing-valued points', function(){
        var tdigest = new TDigest(0.001, 0); // force a new centroid for each pt
        var i, N = 100;
        for (i = 0 ; i < N ; i += 1) {
            tdigest.push(i*10);
        }
        var points = tdigest.toArray();
        for (i = 0 ; i < N ; i += 1) {
            assert(points[i].mean === i*10);
        }
    });
    it('consumes decreasing-valued points', function(){
        var tdigest = new TDigest(0.001, 0); // force a new centroid for each pt
        var i, N = 100;
        for (i = N - 1 ; i >= 0 ; i = i - 1) {
            tdigest.push(i*10);
        }
        var points = tdigest.toArray();
        for (i = 0 ; i < N ; i += 1) {
            assert(points[i].mean === i*10);
        }
    });
});
describe('T-Digests in which points are merged into centroids', function(){
    it('consumes same-valued points into a single point', function(){
        var tdigest = new TDigest();
        var i, N = 100;
        for (i = 0 ; i < N ; i = i + 1) {
            tdigest.push(1000);
        }
        var points = tdigest.toArray();
        assert.deepEqual(points, [{mean: 1000, n:N}]);
    });
    it('handles multiple duplicates', function(){
        var tdigest = new TDigest(1, 0, 0);
        var i, N = 10;
        for (i = 0 ; i < N ; i++) {
            tdigest.push(0.0);
            tdigest.push(1.0);
            tdigest.push(0.5);
        }
        assert.deepEqual(
            tdigest.toArray(),
            [{mean:0.0, n:N},
             {mean:0.5, n:N},
             {mean:1.0, n:N}]
        );
    });
});
describe('compress', function(){
    it('compresses points and preserves bounds', function(){
        var tdigest = new TDigest(0.001, 0);
        var i, N = 100;
        for (i = 0 ; i < N ; i += 1) {
            tdigest.push(i*10);
        }
        assert(tdigest.size() === 100);
        tdigest.delta = 0.1; // encourage merging (don't do this!)
        tdigest.compress();
        var points = tdigest.toArray();
        assert(points.length < 100);
        assert(points[0].mean === 0);
        assert(points[points.length-1].mean === (N - 1) * 10);
    });
    it('K automatically compresses during ingest', function(){
        var tdigest = new TDigest();
        var i, N = 10000;
        for (i = 0 ; i < N ; i += 1) {
            tdigest.push(i*10);
        }
        var points = tdigest.toArray();
        assert(tdigest.nreset > 1);
        assert(points.length < 10000);
        assert(points[0].mean === 0);
        assert(points[points.length-1].mean === 99990);
    });
});
describe('percentile ranks', function(){
    //
    // TDigests are really meant for large datasets and continuous
    // distributions. On small or categorical sets, results can seem
    // strange because mass exists at boundary points. The small tests
    // here verify some precise behaviors that may not be relevant at
    // scale.
    //
    it('reports undefined when given no points', function(){
        var tdigest = new TDigest();
        var x = [1, 2, 3];
        assert.deepEqual(tdigest.p_rank(1), undefined);
        assert.deepEqual(tdigest.p_rank(x), [undefined, undefined, undefined]);
    });
    it('from a single point', function(){
        var tdigest = new TDigest();
        tdigest.push(0);
        var x = [-0.5, 0, 0.5, 1.0, 1.5];
        var q = [0, 0.5, 1, 1, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from two points', function(){
        var tdigest = new TDigest();
        tdigest.push([0, 1]);
        var x = [-0.5, 0, 0.5, 1.0, 1.5];
        var q = [0, 0.25, 0.5, 0.75, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from three points', function(){
        var tdigest = new TDigest();
        tdigest.push([-1, 0, 1]);
        var x = [-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5];
        var q = [0, 1/6, 2/6, 3/6, 4/6, 5/6, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from three points is same as from multiples of those points', function(){
        var tdigest = new TDigest();
        tdigest.push([0, 1, -1]);
        var x = [-1.5, -1.0, -0.5, 0, 0.5, 1.0, 1.5];
        var result1 = tdigest.p_rank(x);
        tdigest.push([0, 1, -1]);
        tdigest.push([0, 1, -1]);
        var result2 = tdigest.p_rank(x);
        assert.deepEqual(result1, result2);
    });
    it('from four points away from the origin', function(){
        var tdigest = new TDigest();
        tdigest.push([10, 11, 12, 13]);
        var x = [9, 10, 11, 12, 13, 14];
        var q = [0, 1/8, 3/8, 5/8, 7/8, 1];
        assert.deepEqual(tdigest.p_rank(x), q);
    });
    it('from four points is same as from multiples of those points', function(){
        var tdigest = new TDigest(0, 0);
        tdigest.push([10, 11, 12, 13]);
        var x = [9, 10, 11, 12, 13, 14];
        var result1 = tdigest.p_rank(x);
        tdigest.push([10, 11, 12, 13]);
        tdigest.push([10, 11, 12, 13]);
        var result2 = tdigest.p_rank(x);
        assert.deepEqual(result1, result2);
    });
    it('from lots of uniformly distributed points', function(){
        var tdigest = new TDigest();
        var i, x = [], N = 100000;
        var maxerr = 0;
        for (i = 0 ; i < N ; i += 1) {
            x.push(Math.random());
        }
        tdigest.push(x);
        tdigest.compress();
        for (i = 0.01 ; i <= 1 ; i += 0.01) {
            var q = tdigest.p_rank(i);
            maxerr = Math.max(maxerr, Math.abs(i - q));
        }
        assert(maxerr < 0.01);
    });
    it('from an exact match', function(){
        var tdigest = new TDigest(0.001, 0); // no compression
        var i, N = 10;
        var maxerr = 0;
        for (i = 0 ; i < N ; i += 1) {
            tdigest.push([10, 20, 30]);
        }
        assert(tdigest.p_rank(20) === 0.5);
    });
});
describe('percentiles', function(){
    it('reports undefined when given no points', function(){
        var tdigest = new TDigest();
        var p = [0, 0.5, 1.0];
        assert.deepEqual(tdigest.percentile(0.5), undefined);
        assert.deepEqual(tdigest.percentile(p), [undefined, undefined, undefined]);
    });
    it('from a single point', function(){
        var tdigest = new TDigest();
        tdigest.push(0);
        var p = [0, 0.5, 1.0];
        var x = [0, 0, 0];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from two points', function(){
        var tdigest = new TDigest();
        tdigest.push([0, 1]);
        var p = [-1/4, 0, 1/4, 1/2, 5/8, 3/4, 1, 1.25];
        var x = [ 0, 0, 0, 0.5, 0.75, 1, 1, 1];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from three points', function(){
        var tdigest = new TDigest();
        tdigest.push([0, 0.5, 1]);
        var p = [0, 1/4, 1/2, 3/4, 1];
        var x = [0, 0.125, 0.5, 0.875, 1.0];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from four points', function(){
        var tdigest = new TDigest();
        tdigest.push([10, 11, 12, 13]);
        var p = [0, 1/4, 1/2, 3/4, 1];
        var x = [10.0, 10.5, 11.5, 12.5, 13.0];
        assert.deepEqual(tdigest.percentile(p), x);
    });
    it('from lots of uniformly distributed points', function(){
        var tdigest = new TDigest();
        var i, x = [], N = 100000;
        var maxerr = 0;
        for (i = 0 ; i < N ; i += 1) {
            x.push(Math.random());
        }
        tdigest.push(x);
        tdigest.compress();
        for (i = 0.01 ; i <= 1 ; i += 0.01) {
            var q = tdigest.percentile(i);
            maxerr = Math.max(maxerr, Math.abs(i - q));
        }
        assert(maxerr < 0.01);
    });
});

package/package.json

{
  "name": "tdigest",
  "version": "0.1.2",
  "description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation",
  "main": "tdigest.js",
  "scripts": {
    "test": "mocha specs"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/welch/tdigest.git"
  },
  "keywords": [
    "tdigest",
    "percentile",
    "quantile",
    "histogram",
    "approximation"
  ],
  "author": "Will Welch (http://quietplease.com/)",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/welch/tdigest/issues"
  },
  "homepage": "https://github.com/welch/tdigest",
  "dependencies": {
    "bintrees": "1.0.2"
  },
  "devDependencies": {
    "better-assert": "^1.0.2",
    "chai": "^3.0.0",
    "grunt": "^0.4.5",
    "grunt-pure-cjs": "^1.4.0",
    "mocha": "^2.1.0"
  }
}
package/README.md

tdigest
============
[![Build Status][travis-image]][travis-url] [![NPM version][npm-image]][npm-url] [![NPM download][download-image]][npm-url]

Javascript implementation of Dunning's T-Digest for streaming quantile approximation

The T-Digest is a data structure and algorithm for constructing an
approximate distribution for a collection of real numbers presented as a
stream. The algorithm makes no guarantees, but behaves well enough in
practice that implementations have been included in Apache Mahout and
ElasticSearch for computing summaries and approximate order statistics
over a stream.

For an overview of T-Digest's behavior, see Davidson-Pilon's
[blog post](http://dataorigami.net/blogs/napkin-folding/19055451-percentile-and-quantile-estimation-of-big-data-the-t-digest)
regarding a python implementation. For more details, there are the
[tdigest paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf)
and [reference implementation](https://github.com/tdunning/t-digest) (Java).
This javascript implementation is based on a reading of the paper, with
some boundary and performance tweaks.

**changes in 0.1.2:**

Updated the bintrees dependency to 1.0.2 to pick up its licensing declaration.

**changes in 0.1.1:**

1. percentile on an empty digest returns *undefined* or array of *undefined* instead of NaN
2. upgraded bintrees to get bugfix.
3. bugfix for discrete percentile and p_rank, make boundary conditions conform to standard definition.

**changes in 0.1.0:**

Discrete mode: when a TDigest is created with delta=false, the sample
distribution is treated as discrete. TDigest behavior is disabled:
differing samples are never merged (they needn't even be numeric), and
percentiles are reported as nearest exact data values rather than
interpolated.

Digest: distribution digest structure. Starts in exact histogram
(discrete) mode, remains in exact mode for reasonable numbers of
distinct values as sample size increases, and automatically switches
to TDigest mode for large samples that appear to be from a continuous
distribution.

Renamed quantile() -> p_rank(), Percentile Rank.

percentile() and p_rank() now accept arrays or singleton arguments.

**changes in 0.0.7:**

A `grunt dist` task has been added to create a UMD-wrapped version of
tdigest and dependencies for importing as a standalone module in
client-side javascript.

bugfixes and speed improvements.

**changes in 0.0.5:**

API Overhaul:
* asArray() -> toArray()
* redigest() -> compress()
* digest() -> push()
* pushing an array no longer triggers compression

bugfixes and speed improvements.

quickstart
------------
#### node.js:

```
npm install tdigest
```

```javascript
var TDigest = require('tdigest').TDigest;
var x = [], N = 100000;
for (var i = 0 ; i < N ; i += 1) {
    x.push(Math.random() * 10 - 5);
}
var td = new TDigest();
td.push(x);
td.compress();
console.log(td.summary());
console.log("median ~ "+td.percentile(0.5));
```

See also [example.js](https://github.com/welch/tdigest/blob/master/example.js) in this package.

#### In the browser:

The `grunt dist` task has been configured to generate a self-contained
[UMD-wrapped](https://github.com/umdjs/umd) version of tdigest in
dist/tdigest.js.

Embed it in HTML like this:

```
<script src="dist/tdigest.js"></script>
```

See also [example.html](https://github.com/welch/tdigest/blob/master/example.html) in this package.

dependencies
-------------
`bintrees`: [https://www.npmjs.com/package/bintrees](https://www.npmjs.com/package/bintrees)

[travis-image]: https://travis-ci.org/welch/tdigest.svg?branch=master
[travis-url]: https://travis-ci.org/welch/tdigest
[npm-image]: http://img.shields.io/npm/v/tdigest.svg
[download-image]: http://img.shields.io/npm/dm/tdigest.svg
[npm-url]: https://www.npmjs.org/package/tdigest

package/.travis.yml

language: node_js
node_js: "stable"
before_install: npm install -g grunt-cli
install: npm install
before_script: grunt dist
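Complementing the README's TDigest quickstart, a sketch of the auto-switching Digest described in the 0.1.0 notes (not part of the package; values shown are illustrative):

```javascript
// sketch: Digest keeps exact counts for few-valued streams,
// and switches itself to t-digest approximation otherwise
var Digest = require('tdigest').Digest;

var dice = new Digest();
for (var i = 0; i < 5000; i++) {
    dice.push(Math.floor(Math.random() * 6) + 1); // six distinct values
}
console.log(dice.summary());     // stays "exact ... using 6 centroids"

var latencies = new Digest();
for (i = 0; i < 5000; i++) {
    latencies.push(Math.random() * 100); // effectively all-unique values
}
console.log(latencies.mode);     // 'cont': switched to approximation
console.log(latencies.percentile(0.99));
```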