# ===== astroML-0.3/astroML/__init__.py =====

__version__ = '0.3'

__citation__ = """@INPROCEEDINGS{astroML,
 author={{Vanderplas}, J.T. and {Connolly}, A.J. and {Ivezi{\'c}}, {\v Z}. and {Gray}, A.},
 booktitle={Conference on Intelligent Data Understanding (CIDU)},
 title={Introduction to astroML: Machine learning for astrophysics},
 month={Oct.},
 pages={47--54},
 doi={10.1109/CIDU.2012.6382200},
 year={2012}
}"""


# ===== astroML-0.3/astroML/classification/__init__.py =====

from .gmm_bayes import GMMBayes


# ===== astroML-0.3/astroML/classification/gmm_bayes.py =====

"""
GMM Bayes
---------

This implements generative classification based on mixtures of Gaussians
to model the probability density of each class.
"""

import numpy as np
from sklearn.mixture import GMM
from sklearn.naive_bayes import BaseNB


class GMMBayes(BaseNB):
    """GMM Bayes Classifier

    This is a generalization of the naive Bayes classifier: rather than
    modeling the distribution of each class with axis-aligned Gaussians,
    GMMBayes models the distribution of each class with a mixture of
    Gaussians.  This can lead to better classification in some cases.

    Parameters
    ----------
    n_components : int or list
        number of components to use in the GMM.  If specified as a list,
        it must match the number of class labels.

    Other keyword arguments are passed directly to GMM.
    """
    def __init__(self, n_components=1, **kwargs):
        self.n_components = np.atleast_1d(n_components)
        self.kwargs = kwargs

    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        n_samples, n_features = X.shape

        if n_samples != y.shape[0]:
            raise ValueError("X and y have incompatible shapes")

        self.classes_ = np.unique(y)
        self.classes_.sort()
        unique_y = self.classes_

        n_classes = unique_y.shape[0]

        if self.n_components.size not in (1, len(unique_y)):
            raise ValueError("n_components must be compatible with "
                             "the number of classes")

        self.gmms_ = [None for i in range(n_classes)]
        self.class_prior_ = np.zeros(n_classes)

        n_comp = np.zeros(len(self.classes_), dtype=int) + self.n_components

        for i, y_i in enumerate(unique_y):
            self.gmms_[i] = GMM(n_comp[i], **self.kwargs).fit(X[y == y_i])
            # use float() rather than the deprecated np.float alias, and
            # guard against integer division on Python 2
            self.class_prior_[i] = float(np.sum(y == y_i)) / n_samples

        return self

    def _joint_log_likelihood(self, X):
        X = np.asarray(np.atleast_2d(X))
        logprobs = np.array([g.score(X) for g in self.gmms_]).T
        return logprobs + np.log(self.class_prior_)


# ===== astroML-0.3/astroML/clustering/__init__.py =====

from .mst_clustering import HierarchicalClustering, get_graph_segments


# ===== astroML-0.3/astroML/clustering/mst_clustering.py =====

"""
Minimum Spanning Tree Clustering
"""
import numpy as np

from scipy import sparse
from sklearn.neighbors import kneighbors_graph

try:
    from scipy.sparse.csgraph import \
        minimum_spanning_tree, connected_components
except ImportError:
    # catch ImportError specifically (a bare except would mask unrelated
    # errors) and re-raise with a version hint
    raise ImportError("scipy v0.11 or greater required "
                      "for minimum spanning tree")
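
# A minimal sketch of the approximate-MST construction used by the class
# below, assuming scipy >= 0.11 and scikit-learn are installed (the random
# data is purely illustrative):
#
#     import numpy as np
#     from sklearn.neighbors import kneighbors_graph
#     from scipy.sparse.csgraph import minimum_spanning_tree
#
#     X = np.random.random((100, 2))  # toy 2D points
#     G = kneighbors_graph(X, n_neighbors=10, mode='distance')
#     T = minimum_spanning_tree(G)    # sparse approximate Euclidean MST
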
class HierarchicalClustering(object):
    """Hierarchical Clustering via Approximate Euclidean Minimum Spanning Tree

    Parameters
    ----------
    n_neighbors : int
        number of neighbors of each point used for the approximate
        Euclidean minimum spanning tree (MST) algorithm.  See Notes below.
    edge_cutoff : float
        specify a fraction of edges to keep when selecting clusters.
        edge_cutoff should be between 0 and 1.
    min_cluster_size : int, optional
        specify a minimum number of points per cluster.
        If not specified, all clusters will be kept.

    Attributes
    ----------
    X_train_ : ndarray
        the training data
    full_tree_ : sparse graph
        the full approximate Euclidean MST spanning the data
    cluster_graph_ : sparse graph
        the final (truncated) graph showing clusters
    n_components_ : int
        the number of clusters found.
    labels_ : ndarray
        the cluster labels for each training point.  Labels range from -1
        to n_components_ - 1: points labeled -1 are in the background
        (i.e. their clusters were smaller than min_cluster_size)

    Notes
    -----
    This routine uses an approximate Euclidean minimum spanning tree (MST)
    to perform hierarchical clustering.  A true Euclidean minimum spanning
    tree naively costs O[N^3].  Graph traversal algorithms only help so
    much, because all N^2 edges must be used as candidates.  In this
    approximate algorithm, we use k < N edges from each point, so that the
    cost is only O[Nk log(Nk)].  For k = N, the approximation is exact;
    in practice for well-behaved data sets, the result is exact for k << N.
    """
    def __init__(self, n_neighbors=20, edge_cutoff=0.9, min_cluster_size=1):
        self.n_neighbors = n_neighbors
        self.edge_cutoff = edge_cutoff
        self.min_cluster_size = min_cluster_size

    def fit(self, X):
        """Fit the clustering model

        Parameters
        ----------
        X : array_like
            the data to be clustered: shape = [n_samples, n_features]
        """
        X = np.asarray(X, dtype=float)
        self.X_train_ = X

        # generate a sparse graph using the k nearest neighbors of each point
        G = kneighbors_graph(X, n_neighbors=self.n_neighbors, mode='distance')

        # Compute the minimum spanning tree of this graph
        self.full_tree_ = minimum_spanning_tree(G, overwrite=True)

        # Find the cluster labels
        self.n_components_, self.labels_, self.cluster_graph_ = \
            self.compute_clusters()

        return self
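
    # A hedged usage sketch (variable names are illustrative only):
    #
    #     model = HierarchicalClustering(n_neighbors=10, edge_cutoff=0.9)
    #     model.fit(X)  # builds and caches the approximate MST
    #     n, labels, graph = model.compute_clusters(edge_cutoff=0.5)
    #
    # Because the MST is cached in ``full_tree_``, compute_clusters() can
    # be re-run cheaply with different cutoffs after a single fit().
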
    def compute_clusters(self, edge_cutoff=None, min_cluster_size=None):
        """Compute the clusters given a trained tree

        After fit() is called, this method may be called to obtain a
        clustering result with a new edge_cutoff and min_cluster_size.

        Parameters
        ----------
        edge_cutoff : float, optional
            specify a fraction of edges to keep when selecting clusters.
            edge_cutoff should be between 0 and 1.  If not specified,
            self.edge_cutoff will be used.
        min_cluster_size : int, optional
            specify a minimum number of points per cluster.
            If not specified, self.min_cluster_size will be used.

        Returns
        -------
        n_components : int
            the number of clusters found
        labels : ndarray
            the labels of each point.  Labels range from -1 to
            n_components_ - 1: points labeled -1 are in the background
            (i.e. their clusters were smaller than min_cluster_size)
        T_trunc : sparse matrix
            the truncated minimum spanning tree
        """
        if edge_cutoff is None:
            edge_cutoff = self.edge_cutoff
        if min_cluster_size is None:
            min_cluster_size = self.min_cluster_size

        if not hasattr(self, 'full_tree_'):
            raise ValueError("must call fit() before calling "
                             "compute_clusters()")

        T_trunc = self.full_tree_.copy()

        # cut off edges at the percentile given by edge_cutoff
        cutoff = np.percentile(T_trunc.data, 100 * edge_cutoff)
        T_trunc.data[T_trunc.data > cutoff] = 0
        T_trunc.eliminate_zeros()

        # find connected components
        n_components, labels = connected_components(T_trunc, directed=False)
        counts = np.bincount(labels)

        # for all components with fewer than min_cluster_size points, set
        # to background, and re-label the clusters
        i_bg = np.where(counts < min_cluster_size)[0]

        for i in i_bg:
            labels[labels == i] = -1

        if len(i_bg) > 0:
            _, labels = np.unique(labels, return_inverse=True)
            labels -= 1
            n_components = labels.max() + 1

        # eliminate links in T_trunc which are not clusters
        I = sparse.eye(len(labels), len(labels))
        I.data[0, labels < 0] = 0
        T_trunc = I * T_trunc * I

        return n_components, labels, T_trunc


def get_graph_segments(X, G):
    """Get graph segments for plotting a 2D graph

    Parameters
    ----------
    X : array_like
        the data, of shape [n_samples, 2]
    G : array_like or sparse graph
        the [n_samples, n_samples] matrix encoding the graph of
        connections on X

    Returns
    -------
    x_coords, y_coords : ndarrays
        the x and y coordinates for plotting the graph.  They are of size
        [2, n_links], and can be visualized using
        ``plt.plot(x_coords, y_coords, '-k')``
    """
    X = np.asarray(X)
    if (X.ndim != 2) or (X.shape[1] != 2):
        raise ValueError('shape of X should be (n_samples, 2)')

    n_samples = X.shape[0]

    G = sparse.coo_matrix(G)
    A = X[G.row].T
    B = X[G.col].T

    x_coords = np.vstack([A[0], B[0]])
    y_coords = np.vstack([A[1], B[1]])

    return x_coords, y_coords


# ===== astroML-0.3/astroML/clustering/tests/__init__.py =====
# (empty file)


# ===== astroML-0.3/astroML/clustering/tests/test_MST_clustering.py =====

import numpy as np
from numpy.testing import assert_, assert_allclose

from astroML.clustering import HierarchicalClustering, get_graph_segments


def test_simple_clustering():
    np.random.seed(0)
    N = 10
    X = np.random.random((N, 2))

    model = HierarchicalClustering(8, edge_cutoff=0.5)
    model.fit(X)

    assert_(model.n_components_ == N / 2)
    assert_(np.sum(model.full_tree_.toarray() > 0) == N - 1)
    assert_(np.sum(model.cluster_graph_.toarray() > 0) == N / 2)
    assert_allclose(np.unique(model.labels_), np.arange(N / 2))


def test_cluster_cutoff():
    np.random.seed(0)
    N = 100
    X = np.random.random((N, 2))

    model = HierarchicalClustering(8, edge_cutoff=0.9, min_cluster_size=10)
    model.fit(X)

    assert_allclose(np.unique(model.labels_),
                    np.arange(-1, model.n_components_))


def test_graph_segments():
    np.random.seed(0)
    N = 4
    X = np.random.random((N, 2))

    G = np.zeros([N, N])
    G[0, 1] = 1
    G[2, 1] = 1
    G[2, 3] = 1

    ind = np.array([[0, 2, 2],
                    [1, 1, 3]])

    xseg_check = X[ind, 0]
    yseg_check = X[ind, 1]

    xseg, yseg = get_graph_segments(X, G)

    assert_allclose(xseg, xseg_check)
    assert_allclose(yseg, yseg_check)


# ===== astroML-0.3/astroML/correlation.py =====

"""
Tools for computing two-point correlation
functions. """ import warnings import numpy as np from sklearn.neighbors import BallTree from .utils import check_random_state # Check if scikit-learn's two-point functionality is available. # This was added in scikit-learn version 0.14 try: from sklearn.neighbors import KDTree sklearn_has_two_point = True except ImportError: import warnings sklearn_has_two_point = False def uniform_sphere(RAlim, DEClim, size=1): """Draw a uniform sample on a sphere Parameters ---------- RAlim : tuple select Right Ascension between RAlim[0] and RAlim[1] units are degrees DEClim : tuple select Declination between DEClim[0] and DEClim[1] size : int (optional) the size of the random arrays to return (default = 1) Returns ------- RA, DEC : ndarray the random sample on the sphere within the given limits. arrays have shape equal to size. """ zlim = np.sin(np.pi * np.asarray(DEClim) / 180.) z = zlim[0] + (zlim[1] - zlim[0]) * np.random.random(size) DEC = (180. / np.pi) * np.arcsin(z) RA = RAlim[0] + (RAlim[1] - RAlim[0]) * np.random.random(size) return RA, DEC def ra_dec_to_xyz(ra, dec): """Convert ra & dec to Euclidean points Parameters ---------- ra, dec : ndarrays Returns x, y, z : ndarrays """ sin_ra = np.sin(ra * np.pi / 180.) cos_ra = np.cos(ra * np.pi / 180.) sin_dec = np.sin(np.pi / 2 - dec * np.pi / 180.) cos_dec = np.cos(np.pi / 2 - dec * np.pi / 180.) return (cos_ra * sin_dec, sin_ra * sin_dec, cos_dec) def angular_dist_to_euclidean_dist(D, r=1): """convert angular distances to euclidean distances""" return 2 * r * np.sin(0.5 * D * np.pi / 180.) def two_point(data, bins, method='standard', data_R=None, random_state=None): """Two-point correlation function Parameters ---------- data : array_like input data, shape = [n_samples, n_features] bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 method : string "standard" or "landy-szalay". data_R : array_like (optional) if specified, use this as the random comparison sample random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr : ndarray the estimate of the correlation function within each bin shape = Nbins """ data = np.asarray(data) bins = np.asarray(bins) rng = check_random_state(random_state) if method not in ['standard', 'landy-szalay']: raise ValueError("method must be 'standard' or 'landy-szalay'") if bins.ndim != 1: raise ValueError("bins must be a 1D array") if data.ndim == 1: data = data[:, np.newaxis] elif data.ndim != 2: raise ValueError("data should be 1D or 2D") n_samples, n_features = data.shape Nbins = len(bins) - 1 # shuffle all but one axis to get background distribution if data_R is None: data_R = data.copy() for i in range(n_features - 1): rng.shuffle(data_R[:, i]) else: data_R = np.asarray(data_R) if (data_R.ndim != 2) or (data_R.shape[-1] != n_features): raise ValueError('data_R must have same n_features as data') factor = len(data_R) * 1. / len(data) if sklearn_has_two_point: # Fast two-point correlation functions added in scikit-learn v. 0.14 KDT_D = KDTree(data) KDT_R = KDTree(data_R) counts_DD = KDT_D.two_point_correlation(data, bins) counts_RR = KDT_R.two_point_correlation(data_R, bins) else: warnings.warn("Version 0.3 of astroML will require scikit-learn " "version 0.14 or higher for correlation function " "calculations. 
Upgrade to sklearn 0.14+ now for much " "faster correlation function calculations.") BT_D = BallTree(data) BT_R = BallTree(data_R) counts_DD = np.zeros(Nbins + 1) counts_RR = np.zeros(Nbins + 1) for i in range(Nbins + 1): counts_DD[i] = np.sum(BT_D.query_radius(data, bins[i], count_only=True)) counts_RR[i] = np.sum(BT_R.query_radius(data_R, bins[i], count_only=True)) DD = np.diff(counts_DD) RR = np.diff(counts_RR) # check for zero in the denominator RR_zero = (RR == 0) RR[RR_zero] = 1 if method == 'standard': corr = factor ** 2 * DD / RR - 1 elif method == 'landy-szalay': if sklearn_has_two_point: counts_DR = KDT_R.two_point_correlation(data, bins) else: counts_DR = np.zeros(Nbins + 1) for i in range(Nbins + 1): counts_DR[i] = np.sum(BT_R.query_radius(data, bins[i], count_only=True)) DR = np.diff(counts_DR) corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR corr[RR_zero] = np.nan return corr def bootstrap_two_point(data, bins, Nbootstrap=10, method='standard', return_bootstraps=False, random_state=None): """Bootstrapped two-point correlation function Parameters ---------- data : array_like input data, shape = [n_samples, n_features] bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 Nbootstrap : integer number of bootstrap resamples to perform (default = 10) method : string "standard" or "landy-szalay". return_bootstraps: bool if True, return full bootstrapped samples random_state : integer, np.random.RandomState, or None specify the random state to use for generating background Returns ------- corr, corr_err : ndarrays the estimate of the correlation function and the bootstrap error within each bin. shape = Nbins """ data = np.asarray(data) bins = np.asarray(bins) rng = check_random_state(random_state) if method not in ['standard', 'landy-szalay']: raise ValueError("method must be 'standard' or 'landy-szalay'") if bins.ndim != 1: raise ValueError("bins must be a 1D array") if data.ndim == 1: data = data[:, np.newaxis] elif data.ndim != 2: raise ValueError("data should be 1D or 2D") if Nbootstrap < 2: raise ValueError("Nbootstrap must be greater than 1") n_samples, n_features = data.shape # get the baseline estimate corr = two_point(data, bins, method=method, random_state=rng) bootstraps = np.zeros((Nbootstrap, len(corr))) for i in range(Nbootstrap): indices = rng.randint(0, n_samples, n_samples) bootstraps[i] = two_point(data[indices, :], bins, method=method, random_state=rng) # use masked std dev in case of NaNs corr_err = np.asarray(np.ma.masked_invalid(bootstraps).std(0, ddof=1)) if return_bootstraps: return corr, corr_err, bootstraps else: return corr, corr_err def two_point_angular(ra, dec, bins, method='standard', random_state=None): """Angular two-point correlation function A separate function is needed because angular distances are not euclidean, and random sampling needs to take into account the spherical volume element. Parameters ---------- ra : array_like input right ascention, shape = (n_samples,) dec : array_like input declination bins : array_like bins within which to compute the 2-point correlation. shape = Nbins + 1 method : string "standard" or "landy-szalay". 
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    """
    ra = np.asarray(ra)
    dec = np.asarray(dec)
    # ensure bins is an array before checking bins.ndim below
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if (ra.ndim != 1) or (dec.ndim != 1) or (ra.shape != dec.shape):
        raise ValueError('ra and dec must be 1-dimensional '
                         'arrays of the same length')

    n_features = len(ra)
    Nbins = len(bins) - 1

    # draw a random background sample with 2N points
    ra_R, dec_R = uniform_sphere((min(ra), max(ra)),
                                 (min(dec), max(dec)),
                                 2 * len(ra))

    data = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T
    data_R = np.asarray(ra_dec_to_xyz(ra_R, dec_R), order='F').T

    # convert spherical bins to cartesian bins
    bins_transform = angular_dist_to_euclidean_dist(bins)

    return two_point(data, bins_transform, method=method,
                     data_R=data_R, random_state=rng)


def bootstrap_two_point_angular(ra, dec, bins, method='standard',
                                Nbootstraps=10, random_state=None):
    """Angular two-point correlation function

    A separate function is needed because angular distances are not
    euclidean, and random sampling needs to take into account the
    spherical volume element.

    Parameters
    ----------
    ra : array_like
        input right ascension, shape = (n_samples,)
    dec : array_like
        input declination
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    method : string
        "standard" or "landy-szalay".
    Nbootstraps : int
        number of bootstrap resamples
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins
    dcorr : ndarray
        error estimate on corr (sample standard deviation of
        bootstrap resamples)
    bootstraps : ndarray
        The full sample of bootstraps used to compute corr and dcorr
    """
    ra = np.asarray(ra)
    dec = np.asarray(dec)
    # ensure bins is an array before checking bins.ndim below
    bins = np.asarray(bins)
    rng = check_random_state(random_state)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if (ra.ndim != 1) or (dec.ndim != 1) or (ra.shape != dec.shape):
        raise ValueError('ra and dec must be 1-dimensional '
                         'arrays of the same length')

    n_features = len(ra)
    Nbins = len(bins) - 1
    data = np.asarray(ra_dec_to_xyz(ra, dec), order='F').T

    # convert spherical bins to cartesian bins
    bins_transform = angular_dist_to_euclidean_dist(bins)

    bootstraps = []

    for i in range(Nbootstraps):
        # draw a random background sample with 2N points
        ra_R, dec_R = uniform_sphere((min(ra), max(ra)),
                                     (min(dec), max(dec)),
                                     2 * len(ra))

        data_R = np.asarray(ra_dec_to_xyz(ra_R, dec_R), order='F').T

        if i > 0:
            # random sample of the data; use rng rather than np.random
            # so that the random_state argument is honored
            ind = rng.randint(0, data.shape[0], data.shape[0])
            data_b = data[ind]
        else:
            data_b = data

        bootstraps.append(two_point(data_b, bins_transform, method=method,
                                    data_R=data_R, random_state=rng))

    bootstraps = np.asarray(bootstraps)
    corr = np.mean(bootstraps, 0)
    corr_err = np.std(bootstraps, 0, ddof=1)

    return corr, corr_err, bootstraps


# ===== astroML-0.3/astroML/cosmology.py =====

import numpy as np
from scipy import integrate


class Cosmology(object):
    """Class to enable simple cosmological calculations.
For a more full-featured cosmology package, see CosmoloPy [1]_ Parameters ---------- omegaM : float Matter Density. 0 <= omegaM <= 1 omegaL : float Dark energy density. 0 <= omegaL <= 1 h : float Hubble parameter, in units of 100 km/s/Mpc References ---------- [1] http://roban.github.com/CosmoloPy/ """ def __init__(self, omegaM=0.27, omegaL=0.73, h=0.71): self.omegaM = omegaM self.omegaL = omegaL self.omegaK = 1. - omegaM - omegaL self.h = h # compute hubble distance in Mpc self.Dh = 2.9979E5 / (100 * h) def _hinv(self, z): """ dimensionless Hubble constant at redshift z This is used in integration routines Defined as in equation 14 from Hogg 1999, and modified for non-constant w parameterized linearly with z ( w = w0 + w1*z ) """ if np.isinf(z): return np.inf return np.sqrt(self.omegaM * (1. + z) ** 3 + self.omegaK * (1. + z) ** 2 + self.omegaL) def Dc(self, z): """ Line of sight comoving distance at redshift z Remains constant with epoch if objects are in the Hubble flow """ if z == 0: return 0 else: f = lambda z: 1.0 / self._hinv(z) I = integrate.quad(f, 0, z) return self.Dh * I[0] def Dm(self, z): """ Transverse comoving distance at redshift z At same redshift but separated by angle dtheta; Dm * dtheta is transverse comoving distance """ sOk = np.sqrt(abs(self.omegaK)) if self.omegaK < 0.0: return self.Dh * np.sin(sOk * self.Dc(z) / self.Dh) / sOk elif self.omegaK == 0.0: return self.Dc(z) else: return self.Dh * np.sinh(sOk * self.Dc(z) / self.Dh) / sOk def Dl(self, z): """Luminosity distance (Mpc) at redshift z""" return (1. + z) * self.Dm(z) def mu(self, z): """Distance Modulus at redshift z""" return 5. * np.log10(self.Dl(z) * 1E6) - 5. astroML-0.3/astroML/crossmatch.py0000644000076500000240000000567112252721253017617 0ustar jakevdpstaff00000000000000import numpy as np from scipy.spatial import cKDTree def crossmatch(X1, X2, max_distance=np.inf): """Cross-match the values between X1 and X2 By default, this uses a KD Tree for speed. Parameters ---------- X1 : array_like first dataset, shape(N1, D) X2 : array_like second dataset, shape(N2, D) max_distance : float (optional) maximum radius of search. If no point is within the given radius, then inf will be returned. Returns ------- dist, ind: ndarrays The distance and index of the closest point in X2 to each point in X1 Both arrays are length N1. Locations with no match are indicated by dist[i] = inf, ind[i] = N2 """ X1 = np.asarray(X1, dtype=float) X2 = np.asarray(X2, dtype=float) N1, D = X1.shape N2, D2 = X2.shape if D != D2: raise ValueError('Arrays must have the same second dimension') kdt = cKDTree(X2) dist, ind = kdt.query(X1, k=1, distance_upper_bound=max_distance) return dist, ind def crossmatch_angular(X1, X2, max_distance=np.inf): """Cross-match angular values between X1 and X2 by default, this uses a KD Tree for speed. Because the KD Tree only handles cartesian distances, the angles are projected onto a 3D sphere. Parameters ---------- X1 : array_like first dataset, shape(N1, 2). X1[:, 0] is the RA, X1[:, 1] is the DEC, both measured in degrees X2 : array_like second dataset, shape(N2, 2). X2[:, 0] is the RA, X2[:, 1] is the DEC, both measured in degrees max_distance : float (optional) maximum radius of search, measured in degrees. If no point is within the given radius, then inf will be returned. Returns ------- dist, ind: ndarrays The angular distance and index of the closest point in X2 to each point in X1. Both arrays are length N1. 
Locations with no match are indicated by dist[i] = inf, ind[i] = N2 """ X1 = X1 * (np.pi / 180.) X2 = X2 * (np.pi / 180.) max_distance = max_distance * (np.pi / 180.) # Convert 2D RA/DEC to 3D cartesian coordinates Y1 = np.transpose(np.vstack([np.cos(X1[:, 0]) * np.cos(X1[:, 1]), np.sin(X1[:, 0]) * np.cos(X1[:, 1]), np.sin(X1[:, 1])])) Y2 = np.transpose(np.vstack([np.cos(X2[:, 0]) * np.cos(X2[:, 1]), np.sin(X2[:, 0]) * np.cos(X2[:, 1]), np.sin(X2[:, 1])])) # law of cosines to compute 3D distance max_y = np.sqrt(2 - 2 * np.cos(max_distance)) dist, ind = crossmatch(Y1, Y2, max_y) # convert distances back to angles using the law of tangents not_inf = ~np.isinf(dist) x = 0.5 * dist[not_inf] dist[not_inf] = (180. / np.pi * 2 * np.arctan2(x, np.sqrt(np.maximum(0, 1 - x ** 2)))) return dist, ind astroML-0.3/astroML/datasets/0000755000076500000240000000000012462244012016672 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/datasets/__init__.py0000644000076500000240000000202312115147567021014 0ustar jakevdpstaff00000000000000""" Astronomy Datasets ------------------ """ from .tools import get_data_home from .sdss_S82standards import fetch_sdss_S82standards from .dr7_quasar import fetch_dr7_quasar from .moving_objects import fetch_moving_objects from .sdss_galaxy_colors import fetch_sdss_galaxy_colors from .sdss_spectrum import fetch_sdss_spectrum from .sdss_corrected_spectra import fetch_sdss_corrected_spectra from .nasa_atlas import fetch_nasa_atlas from .sdss_sspp import fetch_sdss_sspp from .sdss_specgals import fetch_sdss_specgals, fetch_great_wall from .imaging_sample import fetch_imaging_sample from .wmap_temperatures import fetch_wmap_temperatures from .rrlyrae_mags import fetch_rrlyrae_mags, fetch_rrlyrae_combined from .LINEAR_sample import fetch_LINEAR_sample, fetch_LINEAR_geneva from .LIGO_bigdog import fetch_LIGO_bigdog, fetch_LIGO_large from .generated import generate_mu_z from .hogg2010test import fetch_hogg2010test from .rrlyrae_templates import fetch_rrlyrae_templates from .sdss_filters import fetch_sdss_filter, fetch_vega_spectrum astroML-0.3/astroML/datasets/dr7_quasar.py0000644000076500000240000000677612421516756021350 0ustar jakevdpstaff00000000000000""" SDSS DR7 Quasar Dataset Loader. This implements a loader for the DR7 quasar dataset, located at http://www.sdss.org/dr7/products/value added/qsocat_dr7.html """ from __future__ import print_function, division import os from gzip import GzipFile import numpy as np from .tools import download_with_progress_bar from ..py3k_compat import BytesIO from . import get_data_home DATA_URL = 'http://das.sdss.org/va/qsocat/dr7qso.dat.gz' ARCHIVE_FILE = 'dr7_quasar.npy' #column numbers for extraction DR7_DTYPE = [('sdssID', 'a14'), ('RA', 'f8'), ('dec', 'f8'), ('redshift', 'f4'), ('mag_u', 'f4'), ('err_u', 'f4'), ('mag_g', 'f4'), ('err_g', 'f4'), ('mag_r', 'f4'), ('err_r', 'f4'), ('mag_i', 'f4'), ('err_i', 'f4'), ('mag_z', 'f4'), ('err_z', 'f4'), ('mag_J', 'f4'), ('err_J', 'f4'), ('mag_H', 'f4'), ('err_H', 'f4'), ('mag_K', 'f4'), ('err_K', 'f4'), ('specobjid', 'i8')] COLUMN_NUMBERS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 22, 23, 24, 25, 26, 27, 72] # length of header information SKIP_ROWS = 80 def fetch_dr7_quasar(data_home=None, download_if_missing=True): """Loader for SDSS DR7 quasar catalog Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. 
download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : ndarray, shape = (105783,) numpy record array containing the quasar catalog Examples -------- >>> from astroML.datasets import fetch_dr7_quasar >>> data = fetch_dr7_quasar() >>> u_g = data['mag_u'] - data['mag_g'] >>> u_g[:3] # first three u-g colors array([-0.07699966, 0.03600121, 0.10900116], dtype=float32) Notes ----- Not all available data is extracted and saved. The extracted columns are: sdssID, RA, DEC, redshift, mag_u, err_u, mag_g, err_g, mag_r, err_r, mag_i, err_i, mag_z, err_z, mag_J, err_J, mag_H, err_H, mag_K, err_K, specobjid many of the objects are missing 2mass photometry. More information at http://www.sdss.org/dr7/products/value_added/qsocat_dr7.html """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, ARCHIVE_FILE) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading DR7 quasar dataset from %s to %s" % (DATA_URL, data_home)) zipped_buf = download_with_progress_bar(DATA_URL, return_buffer=True) gzf = GzipFile(fileobj=zipped_buf, mode='rb') extracted_buf = BytesIO(gzf.read()) data = np.loadtxt(extracted_buf, skiprows=SKIP_ROWS, usecols=COLUMN_NUMBERS, dtype=DR7_DTYPE) np.save(archive_file, data) else: data = np.load(archive_file) return data astroML-0.3/astroML/datasets/generated.py0000644000076500000240000000271212420767763021225 0ustar jakevdpstaff00000000000000import numpy as np from ..cosmology import Cosmology from ..density_estimation import FunctionDistribution from ..utils import check_random_state def redshift_distribution(z, z0): return (z / z0) ** 2 * np.exp(-1.5 * (z / z0)) def generate_mu_z(size=1000, z0=0.3, dmu_0=0.1, dmu_1=0.02, random_state=None, **kwargs): """Generate a dataset of distance modulus vs redshift. 
Parameters ---------- size : int or tuple size of generated data z0 : float parameter in redshift distribution: p(z) ~ (z / z0)^2 exp[-1.5 (z / z0)] dmu_0, dmu_1 : float specify the error in mu, dmu = dmu_0 + dmu_1 * mu random_state : None, int, or np.random.RandomState instance random seed or random number generator **kwargs : additional keyword arguments are passed to the Cosmology function Returns ------- z, mu, dmu : ndarrays arrays of shape `size` """ random_state = check_random_state(random_state) cosmo = Cosmology(**kwargs) zdist = FunctionDistribution(redshift_distribution, func_args=dict(z0=z0), xmin=0.1 * z0, xmax=10 * z0, random_state=random_state) z_sample = zdist.rvs(size) mu_sample = np.reshape([cosmo.mu(z) for z in z_sample.ravel()], size) dmu = dmu_0 + dmu_1 * mu_sample mu_sample = random_state.normal(mu_sample, dmu) return z_sample, mu_sample, dmu astroML-0.3/astroML/datasets/hogg2010test.py0000644000076500000240000000322412115147567021410 0ustar jakevdpstaff00000000000000""" Data from Hogg et al 2010; useful for testing robust regression methods """ import numpy as np def fetch_hogg2010test(structured=False): """Fetch the Hogg et al 2010 test data """ data = np.array([[1, 201, 592, 61, 9, -0.84], [2, 244, 401, 25, 4, 0.31], [3, 47, 583, 38, 11, 0.64], [4, 287, 402, 15, 7, -0.27], [5, 203, 495, 21, 5, -0.33], [6, 58, 173, 15, 9, 0.67], [7, 210, 479, 27, 4, -0.02], [8, 202, 504, 14, 4, -0.05], [9, 198, 510, 30, 11, -0.84], [10, 158, 416, 16, 7, -0.69], [11, 165, 393, 14, 5, 0.30], [12, 201, 442, 25, 5, -0.46], [13, 157, 317, 52, 5, -0.03], [14, 131, 311, 16, 6, 0.50], [15, 166, 400, 34, 6, 0.73], [16, 160, 337, 31, 5, -0.52], [17, 186, 423, 42, 9, 0.90], [18, 125, 334, 26, 8, 0.40], [19, 218, 533, 16, 6, -0.78], [20, 146, 344, 22, 5, -0.56]]) dtype = [("ID", np.int32), ("x", np.float64), ("y", np.float64), ("sigma_x", np.float64), ("sigma_y", np.float64), ("rho_xy", np.float64)] recarray = np.empty(data.shape[0], dtype=dtype) recarray['ID'] = data[:, 0] recarray['x'] = data[:, 1] recarray['y'] = data[:, 2] recarray['sigma_x'] = data[:, 4] recarray['sigma_y'] = data[:, 3] recarray['rho_xy'] = data[:, 5] return recarray astroML-0.3/astroML/datasets/imaging_sample.py0000644000076500000240000001021212420767763022235 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import os import numpy as np from . import get_data_home from .tools import download_with_progress_bar DATA_URL = ("http://www.astro.washington.edu/users/" "ivezic/DMbook/data/imagingSample_20sqdeg.fit") DATA_URL = ("http://www.astro.washington.edu/users/" "ivezic/DMbook/data/sgSDSSimagingSample.fit") def fetch_imaging_sample(data_home=None, download_if_missing=True): """Loader for SDSS Imaging sample data Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. 
Returns ------- data : recarray, shape = (330753,) record array containing imaging data Examples -------- >>> from astroML.datasets import fetch_imaging_sample >>> data = fetch_imaging_sample() >>> data.shape # number of objects in dataset (330753,) >>> print(data.names[:5]) # names of the first five columns ['ra', 'dec', 'run', 'rExtSFD', 'uRaw'] >>> print(data['ra'][:2]) [ 0.265165 0.265413] >>> print(data['dec'][:2]) [-0.444861 -0.62201 ] Notes ----- This data was selected from the SDSS database using the following SQL query:: SELECT round(p.ra,6) as ra, round(p.dec,6) as dec, p.run, --- comments are preceded by --- round(p.extinction_r,3) as rExtSFD, --- r band extinction from SFD round(p.modelMag_u,3) as uRaw, --- ISM-uncorrected model mags round(p.modelMag_g,3) as gRaw, --- rounding up model magnitudes round(p.modelMag_r,3) as rRaw, round(p.modelMag_i,3) as iRaw, round(p.modelMag_z,3) as zRaw, round(p.modelMagErr_u,3) as uErr, --- errors are important! round(p.modelMagErr_g,3) as gErr, round(p.modelMagErr_r,3) as rErr, round(p.modelMagErr_i,3) as iErr, round(p.modelMagErr_z,3) as zErr, round(p.psfMag_u,3) as psfRaw, --- psf magnitudes round(p.psfMag_g,3) as psfRaw, round(p.psfMag_r,3) as psfRaw, round(p.psfMag_i,3) as psfRaw, round(p.psfMag_z,3) as psfRaw, round(p.psfMagErr_u,3) as psfuErr, round(p.psfMagErr_g,3) as psfgErr, round(p.psfMagErr_r,3) as psfrErr, round(p.psfMagErr_i,3) as psfiErr, round(p.psfMagErr_z,3) as psfzErr, p.type, --- tells if a source is resolved or not (case when (p.flags & '16') = 0 then 1 else 0 end) as ISOLATED INTO mydb.SDSSimagingSample FROM PhotoTag p WHERE --- 10x2 sq.deg. p.ra > 0.0 and p.ra < 10.0 and p.dec > -1 and p.dec < 1 --- resolved and unresolved sources and (p.type = 3 OR p.type = 6) and --- '4295229440' is magic code for no --- DEBLENDED_AS_MOVING or SATURATED objects (p.flags & '4295229440') = 0 and --- PRIMARY objects only, which implies --- !BRIGHT && (!BLENDED || NODEBLEND || nchild == 0)] p.mode = 1 and --- adopted faint limit (same as about SDSS limit) p.modelMag_r < 22.5 --- the end of query """ # fits is an optional dependency: don't import globally from astropy.io import fits data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, os.path.basename(DATA_URL)) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') fitsdata = download_with_progress_bar(DATA_URL) open(archive_file, 'wb').write(fitsdata) hdulist = fits.open(archive_file) return np.asarray(hdulist[1].data) astroML-0.3/astroML/datasets/LIGO_bigdog.py0000644000076500000240000000775312421517100021321 0ustar jakevdpstaff00000000000000""" Fetch the LIGO BigDog time-domain dataset """ from __future__ import print_function, division import os from ..py3k_compat import BytesIO from gzip import GzipFile import numpy as np from . import get_data_home from .tools import download_with_progress_bar DATA_URL_LARGE = ('http://www.astro.washington.edu/users/ivezic/' 'DMbook/LIGO/hoft.968653908-968655956.H1.dat.gz') LOCAL_FILE_LARGE = 'LIGO_large.npy' DATA_URL = 'http://www.ligo.org/science/GW100916/HLV-strain.txt' LOCAL_FILE = 'LIGO_bigdog.npy' def fetch_LIGO_large(data_home=None, download_if_missing=True): """Loader for LIGO large dataset Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. 
By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : ndarray dt : float data represents ~2000s of amplitude data from LIGO hanford; dt is the time spacing between measurements in seconds. """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) local_file = os.path.join(data_home, LOCAL_FILE_LARGE) if os.path.exists(local_file): data = np.load(local_file) else: if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading LIGO bigdog data from %s to %s" % (DATA_URL_LARGE, local_file)) zipped_buf = download_with_progress_bar(DATA_URL_LARGE, return_buffer=True) gzf = GzipFile(fileobj=zipped_buf, mode='rb') print("uncompressing file...") extracted_buf = BytesIO(gzf.read()) data = np.loadtxt(extracted_buf) np.save(local_file, data) return data, 1. / 4096 def fetch_LIGO_bigdog(data_home=None, download_if_missing=True): """Loader for LIGO bigdog event Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : record array The data is 10 seconds of measurements from three sites, along with the time of each measurement. Examples -------- >>> from astroML.datasets import fetch_LIGO_bigdog >>> data = fetch_LIGO_bigdog() >>> print(data.dtype.names) ('t', 'Hanford', 'Livingston', 'Virgo') >>> print(data['t'][:3]) [ 0.00000000e+00 6.10400000e-05 1.22070000e-04] >>> print(data['Hanford'][:3]) [ 1.26329846e-17 1.26846778e-17 1.19187381e-17] """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) local_file = os.path.join(data_home, LOCAL_FILE) if os.path.exists(local_file): data = np.load(local_file) else: if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading LIGO bigdog data from %s to %s" % (DATA_URL, local_file)) buffer = download_with_progress_bar(DATA_URL, return_buffer=True) data = np.loadtxt(buffer, skiprows=2, dtype=[('t', 'f8'), ('Hanford', 'f8'), ('Livingston', 'f8'), ('Virgo', 'f8')]) np.save(local_file, data) return data astroML-0.3/astroML/datasets/LINEAR_sample.py0000644000076500000240000001511212422725616021571 0ustar jakevdpstaff00000000000000import os from ..py3k_compat import BytesIO import tarfile import numpy as np from . 
import get_data_home from .tools import download_with_progress_bar TARGETLIST_URL = ("http://www.astro.washington.edu/users/ivezic/" "linear/allDataFinal/allLINEARfinal_targets.dat") DATA_URL = ("http://www.astro.washington.edu/users/ivezic/" "linear/allDataFinal/allLINEARfinal_dat.tar.gz") # old version of the data #GENEVA_URL = ("http://www.astro.washington.edu/users/ivezic/" # "DMbook/data/LINEARattributes.dat" #GENEVA_ARCHIVE = 'LINEARattributes.npy' #ARCHIVE_DTYPE = [(s, 'f8') for s in ('RA', 'Dec', 'ug', 'gi', 'iK', # 'JK', 'logP', 'amp', 'skew')] GENEVA_URL = ("http://www.astro.washington.edu/users/ivezic/" "DMbook/data/LINEARattributesFinalApr2013.dat") GENEVA_ARCHIVE = 'LINEARattributesFinalApr2013.npy' ARCHIVE_DTYPE = ([(s, 'f8') for s in ('RA', 'Dec', 'ug', 'gi', 'iK', 'JK', 'logP', 'amp', 'skew', 'kurt', 'magMed', 'nObs')] + [('LCtype', 'i4'), ('LINEARobjectID', '|S20')]) target_names = ['objectID', 'raLIN', 'decLIN', 'raSDSS', 'decSDSS', 'r', 'ug', 'gr', 'ri', 'iz', 'JK', '', 'std', 'rms', 'Lchi2', 'LP1', 'phi1', 'S', 'prior'] class LINEARdata(object): """A container class for the linear dataset. Because the dataset is often not needed all at once, this class offers tools to access just the needed components Example ------- >>> data = fetch_LINEAR_sample() >>> lightcurve = data[data.ids[0]] """ @staticmethod def _name_to_id(name): return int(name.split('.')[0]) @staticmethod def _id_to_name(id): return str(id) + '.dat' def __init__(self, data_file, targetlist_file): self.targets = np.recfromtxt(targetlist_file) self.targets.dtype.names = target_names self.dataF = tarfile.open(data_file) self.ids = np.array(list(map(self._name_to_id, self.dataF.getnames()))) # rearrange targets so lists are in the same order self.targets = self.targets[self.targets['objectID'].argsort()] ind = self.targets['objectID'].searchsorted(self.ids) self.targets = self.targets[ind] def get_light_curve(self, id): """Get a light curve with the given id. Parameters ---------- id: integer LINEAR id of the desired object Returns ------- lightcurve: ndarray a size (n_observations, 3) light-curve. columns are [MJD, flux, flux_err] """ return self[id] def get_target_parameter(self, id, param): """Get a target parameter associated with the given id. Parameters ---------- id: integer LINEAR id of the desired object param: string parameter name of the desired object (see below) Returns ------- val: scalar value of the requested target parameter Notes ----- Target parameters are one of the following: ['objectID', 'raLIN', 'decLIN', 'raSDSS', 'decSDSS', 'r', 'ug', 'gr', 'ri', 'iz', 'JK', '', 'std', 'rms', 'Lchi2', 'LP1', 'phi1', 'S', 'prior'] """ i = np.where(self.targets['objectID'] == id)[0] try: val = self.targets[param][i[0]] except: raise KeyError(id) return val def __getitem__(self, id): try: lc = np.loadtxt(self.dataF.extractfile(self._id_to_name(id))) except: raise KeyError(id) return lc def fetch_LINEAR_sample(data_home=None, download_if_missing=True): """Loader for LINEAR data sample Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : LINEARdata object A custom object which provides access to 7010 selected LINEAR light curves. 
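    Examples
    --------
    A hypothetical access pattern (the id and values depend on the
    downloaded data, so the doctest is skipped):

    >>> data = fetch_LINEAR_sample()        # doctest: +SKIP
    >>> lc = data[data.ids[0]]              # doctest: +SKIP
    >>> mjd, flux, flux_err = lc.T          # doctest: +SKIP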
""" data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) targetlist_file = os.path.join(data_home, os.path.basename(TARGETLIST_URL)) data_file = os.path.join(data_home, os.path.basename(DATA_URL)) if not os.path.exists(targetlist_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') targets = download_with_progress_bar(TARGETLIST_URL) open(targetlist_file, 'wb').write(targets) if not os.path.exists(data_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') databuffer = download_with_progress_bar(DATA_URL) open(data_file, 'wb').write(databuffer) return LINEARdata(data_file, targetlist_file) def fetch_LINEAR_geneva(data_home=None, download_if_missing=True): """Loader for LINEAR geneva data. This supplements the LINEAR data above with well-determined periods and other light curve characteristics. Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : record array data on 7000+ LINEAR stars from the Geneva catalog """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, GENEVA_ARCHIVE) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') databuffer = download_with_progress_bar(GENEVA_URL) data = np.loadtxt(BytesIO(databuffer), dtype=ARCHIVE_DTYPE) np.save(archive_file, data) else: data = np.load(archive_file) return data astroML-0.3/astroML/datasets/moving_objects.py0000644000076500000240000001143712421516675022276 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import os from gzip import GzipFile import numpy as np from .tools import download_with_progress_bar from ..py3k_compat import BytesIO from . 
import get_data_home DATA_URL = 'http://www.astro.washington.edu/users/ivezic/sdssmoc/ADR3.dat.gz' ARCHIVE_FILE = 'moving_objects.npy' ADR4_dtype = [('moID', 'a6'), ('sdss_run', 'i4'), ('sdss_col', 'i4'), ('sdss_field', 'i4'), ('sdss_obj', 'i4'), ('rowc', 'f4'), ('colc', 'f4'), ('mjd', 'f8'), ('ra', 'f8'), ('dec', 'f8'), ('lambda', 'f8'), ('beta', 'f8'), ('phi', 'f8'), ('vmu', 'f4'), ('vmu_err', 'f4'), ('vnu', 'f4'), ('vnu_err', 'f4'), ('vlambda', 'f4'), ('vbeta', 'f4'), ('mag_u', 'f4'), ('err_u', 'f4'), ('mag_g', 'f4'), ('err_g', 'f4'), ('mag_r', 'f4'), ('err_r', 'f4'), ('mag_i', 'f4'), ('err_i', 'f4'), ('mag_z', 'f4'), ('err_z', 'f4'), ('mag_a', 'f4'), ('err_a', 'f4'), ('mag_V', 'f4'), ('mag_B', 'f4'), ('ast_flag', 'i4'), ('ast_num', 'i8'), ('ast_designation', 'a17'), ('ast_det_count', 'i4'), ('ast_det_total', 'i4'), ('ast_flags', 'i8'), ('ra_comp', 'f8'), ('dec_comp', 'f8'), ('mag_comp', 'f4'), ('r_helio', 'f4'), ('r_geo', 'f4'), ('phase', 'f4'), ('cat_id', 'a15'), ('H', 'f4'), ('G', 'f4'), ('Arc', 'f4'), ('Epoch', 'f8'), ('a', 'f8'), ('e', 'f8'), ('i', 'f8'), ('asc_node', 'f8'), ('arg_peri', 'f8'), ('M', 'f8'), ('PEcat_id', 'a17'), ('aprime', 'f8'), ('eprime', 'f8'), ('sin_iprime', 'f8')] def fetch_moving_objects(data_home=None, download_if_missing=True, Parker2008_cuts=False): """Loader for SDSS moving objects datasets Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Parker2008_cuts : bool (optional) If true, apply cuts on magnitudes and orbital parameters used in Parker et al. 2008 Returns ------- data : recarray, shape = (??,) record array containing 60 values for each item Notes ----- See http://www.astro.washington.edu/users/ivezic/sdssmoc/sdssmoc3.html Columns 0, 35, 45, and 56 are left out of the fetch: they are string parameters. Only columns with known orbital parameters are saved. Examples -------- >>> data = fetch_moving_objects() >>> print(len(data)) # number of objects 104686 >>> u_g = data['mag_u'] - data['mag_g'] >>> print(u_g[:5]) # first five u-g colors of the dataset [ 1.48999977 1.80000114 1.78000069 1.65000153 2.01000023] """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, ARCHIVE_FILE) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. 
' 'set download_if_missing=True to download') print("downloading moving object catalog from %s to %s" % (DATA_URL, data_home)) zipped_buf = download_with_progress_bar(DATA_URL, return_buffer=True) gzf = GzipFile(fileobj=zipped_buf, mode='rb') print("uncompressing file...") extracted_buf = BytesIO(gzf.read()) data = np.loadtxt(extracted_buf, dtype=ADR4_dtype) # Select unique sources with known orbital elements flag = (data['ast_flag'] == 1) & (data['ast_det_count'] == 1) data = data[flag] np.save(archive_file, data) else: data = np.load(archive_file) if Parker2008_cuts: i_z = data['mag_i'] - data['mag_z'] flag = ((data['aprime'] >= 0.01) & (data['aprime'] <= 100) & (data['mag_a'] <= 0.4) & (data['mag_a'] >= -0.3) & (i_z <= 0.6) & (i_z >= -0.8)) data = data[flag] return data astroML-0.3/astroML/datasets/nasa_atlas.py0000644000076500000240000000421612420767763021376 0ustar jakevdpstaff00000000000000""" NASA Sloan Atlas dataset size reduction --------------------------------------- The NASA Sloan Atlas dataset is contained in a ~0.5GB available at http://www.nsatlas.org/data This function fetches a ~50MB subset of that data. This subset is created using the code that can be found at examples/datasets/truncate_nsa_data.py """ from __future__ import print_function, division import os import numpy as np from .tools import download_with_progress_bar from . import get_data_home DATA_URL = ('http://www.astro.washington.edu/users/ivezic/' 'DMbook/nsa_v0_1_2_reduced.npy') ARCHIVE_FILE = os.path.basename(DATA_URL) def fetch_nasa_atlas(data_home=None, download_if_missing=True): """Loader for NASA galaxy atlas data Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : ndarray The data, in the form of a numpy record array. Notes ----- This is the file created by the example script at examples/datasets/truncate_nsa_data.py For an explanation of the meaning of the fields, see the description at http://www.nsatlas.org/data """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, ARCHIVE_FILE) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading NASA atlas data from %s to %s" % (DATA_URL, data_home)) buf = download_with_progress_bar(DATA_URL, return_buffer=True) data = np.load(buf) np.save(archive_file, data) else: data = np.load(archive_file) return data astroML-0.3/astroML/datasets/rrlyrae_mags.py0000644000076500000240000001054612420767763021762 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import os import numpy as np from . import get_data_home from . import fetch_sdss_S82standards from .tools import download_with_progress_bar DATA_URL = ("http://www.astro.washington.edu/users/" "ivezic/DMbook/data/RRLyrae.fit") def fetch_rrlyrae_mags(data_home=None, download_if_missing=True): """Loader for RR-Lyrae data Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. 
download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : recarray, shape = (483,) record array containing imaging data Examples -------- >>> from astroML.datasets import fetch_rrlyrae_mags >>> data = fetch_rrlyrae_mags() >>> data.shape # number of objects in dataset (483,) >>> print(data.names[:5]) # names of the first five columns ['ra', 'dec', 'run', 'rExtSFD', 'uRaw'] >>> print(data['ra'][:2]) [ 0.265165 0.265413] >>> print(data['dec'][:2]) [-0.444861 -0.62201 ] Notes ----- This data is from table 1 of Sesar et al 2010 ApJ 708:717 """ # fits is an optional dependency: don't import globally from astropy.io import fits data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, os.path.basename(DATA_URL)) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') fitsdata = download_with_progress_bar(DATA_URL) open(archive_file, 'wb').write(fitsdata) hdulist = fits.open(archive_file) return np.asarray(hdulist[1].data) def fetch_rrlyrae_combined(data_home=None, download_if_missing=True): """Loader for RR-Lyrae combined data This returns the combined RR-Lyrae colors and SDSS standards colors. The RR-Lyrae sample is confirmed through time-domain observations; this result in a nice dataset for testing classification routines. Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- X : ndarray a shape (n_samples, 4) array. Columns are u-g, g-r, r-i, i-z y : ndarray a shape (n_samples,) array of labels. 1 indicates an RR Lyrae, 0 indicates a background star. """ #---------------------------------------------------------------------- # Load data kwds = dict(data_home=data_home, download_if_missing=download_if_missing) rrlyrae = fetch_rrlyrae_mags(**kwds) standards = fetch_sdss_S82standards(**kwds) #------------------------------------------------------------ # perform color cuts on standard stars # these come from eqns 1-4 of Sesar et al 2010, ApJ 708:717 u_g = standards['mmu_u'] - standards['mmu_g'] g_r = standards['mmu_g'] - standards['mmu_r'] r_i = standards['mmu_r'] - standards['mmu_i'] i_z = standards['mmu_i'] - standards['mmu_z'] standards = standards[(u_g > 0.7) & (u_g < 1.35) & (g_r > -0.15) & (g_r < 0.4) & (r_i > -0.15) & (r_i < 0.22) & (i_z > -0.21) & (i_z < 0.25)] #---------------------------------------------------------------------- # get magnitudes and colors; split into train and test sets mags_rr = np.vstack([rrlyrae[f + 'mag'] for f in 'ugriz']) colors_rr = mags_rr[:-1] - mags_rr[1:] mags_st = np.vstack([standards['mmu_' + f] for f in 'ugriz']) colors_st = mags_st[:-1] - mags_st[1:] # stack the two sets of colors together X = np.vstack((colors_st.T, colors_rr.T)) y = np.zeros(X.shape[0]) y[-colors_rr.shape[1]:] = 1 return X, y astroML-0.3/astroML/datasets/rrlyrae_templates.py0000644000076500000240000000310712420767763023024 0ustar jakevdpstaff00000000000000import os import tarfile import numpy as np from . 
import get_data_home
from .tools import download_with_progress_bar

DATA_URL = ("http://www.astro.washington.edu/users/bsesar/"
            "S82_RRLyr/RRLyr_ugriz_templates.tar.gz")


def fetch_rrlyrae_templates(data_home=None, download_if_missing=True):
    """Loader for RR-Lyrae template data

    These are the light-curve templates from Sesar et al 2010, ApJ 708:717

    Parameters
    ----------
    data_home : optional, default=None
        Specify another download and cache folder for the datasets.
        By default all astroML data is stored in '~/astroML_data'
        subfolders.

    download_if_missing : optional, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : numpy record array
        record array containing the templates
    """
    data_home = get_data_home(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    data_file = os.path.join(data_home, os.path.basename(DATA_URL))

    if not os.path.exists(data_file):
        if not download_if_missing:
            raise IOError('data not present on disk. '
                          'set download_if_missing=True to download')

        databuffer = download_with_progress_bar(DATA_URL)
        open(data_file, 'wb').write(databuffer)

    data = tarfile.open(data_file)

    # use os.path.splitext rather than str.strip: strip('.dat') would
    # remove any leading/trailing '.', 'd', 'a', 't' characters rather
    # than the '.dat' suffix
    return dict([(os.path.splitext(name)[0],
                  np.loadtxt(data.extractfile(name)))
                 for name in data.getnames()])


# ===== astroML-0.3/astroML/datasets/sdss_corrected_spectra.py =====

from __future__ import print_function, division

import os

import numpy as np

from . import get_data_home
from .tools import download_with_progress_bar

DATA_URL = 'http://www.astro.washington.edu/users/vanderplas/spec4000.npz'
ARCHIVE_FILE = 'spec4000.npz'


def reconstruct_spectra(data):
    """Compute the reconstructed spectra.

    Parameters
    ----------
    data : NpzFile
        numpy data object returned by fetch_sdss_corrected_spectra.

    Returns
    -------
    spec_recons : ndarray
        Reconstructed spectra, using principal components to interpolate
        across the masked region.
    """
    spectra = data['spectra']
    coeffs = data['coeffs']
    evecs = data['evecs']
    mask = data['mask']
    mu = data['mu']
    norms = data['norms']

    spec_recons = spectra.copy()
    nev = coeffs.shape[1]

    spec_fill = mu + np.dot(coeffs, evecs[:nev])
    spec_fill *= norms[:, np.newaxis]

    spec_recons[mask] = spec_fill[mask]

    return spec_recons


def compute_wavelengths(data):
    """Compute the wavelength associated with spectra.

    Parameters
    ----------
    data : NpzFile
        numpy data object returned by fetch_sdss_corrected_spectra.

    Returns
    -------
    wavelength : ndarray
        One-dimensional wavelength array for spectra.
    """
    return 10 ** (data['coeff0']
                  + data['coeff1'] * np.arange(data['spectra'].shape[1]))
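
# A hedged usage sketch for the helpers above (assumes the spec4000.npz
# download succeeds; shapes follow the docstrings):
#
#     data = fetch_sdss_corrected_spectra()
#     wavelength = compute_wavelengths(data)  # shape (n_wavelengths,)
#     spectra = reconstruct_spectra(data)     # shape (n_spectra, n_wavelengths)
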
Returns ------- data : NpzFile The data dictionary Notes ----- This is the file created by the example script examples/datasets/compute_sdss_pca.py """ data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, ARCHIVE_FILE) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading PCA-processed SDSS spectra from %s to %s" % (DATA_URL, data_home)) buf = download_with_progress_bar(DATA_URL, return_buffer=True) data = np.load(buf) data_dict = dict([(key, data[key]) for key in data.files]) np.savez(archive_file, **data_dict) else: data = np.load(archive_file) return data astroML-0.3/astroML/datasets/sdss_filters.py0000644000076500000240000000651112421254726021763 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import os import numpy as np from astroML.datasets import get_data_home from ..py3k_compat import urlopen # Info on vega spectrum: http://www.stsci.edu/hst/observatory/cdbs/calspec.html VEGA_URL = 'http://www.astro.washington.edu/users/ivezic/DMbook/data/1732526_nic_002.ascii' FILTER_URL = 'http://www.sdss.org/dr7/instruments/imager/filters/%s.dat' def fetch_sdss_filter(fname, data_home=None, download_if_missing=True): """Loader for SDSS Filter profiles Parameters ---------- fname : str filter name: must be one of 'ugriz' data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : ndarray data is an array of shape (5, Nlam) first row: wavelength in angstroms second row: sensitivity to point source, airmass 1.3 third row: sensitivity to extended source, airmass 1.3 fourth row: sensitivity to extended source, airmass 0.0 fifth row: assumed atmospheric extinction, airmass 1.0 """ if fname not in 'ugriz': raise ValueError("Unrecognized filter name '%s'" % fname) url = FILTER_URL % fname data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, '%s.dat' % fname) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') print("downloading from %s" % url) F = urlopen(url) open(archive_file, 'wb').write(F.read()) F = open(archive_file) return np.loadtxt(F, unpack=True) def fetch_vega_spectrum(data_home=None, download_if_missing=True): """Loader for Vega reference spectrum Parameters ---------- fname : str filter name: must be one of 'ugriz' data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. 
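    Examples
    --------
    A minimal usage sketch (skipped as a doctest: the spectrum file is
    downloaded on the first call and cached in data_home afterwards):

    >>> wavelength, flux = fetch_vega_spectrum()  # doctest: +SKIP
    >>> wavelength.shape == flux.shape            # doctest: +SKIP
    True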
    Returns
    -------
    data : ndarray
        data[0] is the array of wavelength in angstroms
        data[1] is the array of fluxes in Jy (F_nu, not F_lambda)
    """
    data_home = get_data_home(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    archive_name = os.path.join(data_home, VEGA_URL.split('/')[-1])

    if not os.path.exists(archive_name):
        if not download_if_missing:
            raise IOError('data not present on disk. '
                          'set download_if_missing=True to download')
        print("downloading from %s" % VEGA_URL)
        F = urlopen(VEGA_URL)
        open(archive_name, 'wb').write(F.read())

    F = open(archive_name, 'r')

    return np.loadtxt(F, unpack=True)
astroML-0.3/astroML/datasets/sdss_galaxy_colors.py0000644000076500000240000000504312420767763023165 0ustar jakevdpstaff00000000000000from __future__ import print_function, division

import os
import numpy as np

from . import get_data_home
from .tools import sql_query

SPECCLASS = ['UNKNOWN', 'STAR', 'GALAXY', 'QSO',
             'HIZ_QSO', 'SKY', 'STAR_LATE', 'GAL_EM']

NOBJECTS = 50000

GAL_COLORS_DTYPE = [('u', float),
                    ('g', float),
                    ('r', float),
                    ('i', float),
                    ('z', float),
                    ('specClass', int),
                    ('redshift', float),
                    ('redshift_err', float)]

ARCHIVE_FILE = 'sdss_galaxy_colors.npy'


def fetch_sdss_galaxy_colors(data_home=None, download_if_missing=True):
    """Loader for SDSS galaxy colors.

    This function directly queries the sdss SQL database at
    http://cas.sdss.org/

    Parameters
    ----------
    data_home : optional, default=None
        Specify another download and cache folder for the datasets. By
        default all scikit learn data is stored in '~/astroML_data'
        subfolders.

    download_if_missing : optional, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : recarray, shape = (50000,)
        record array containing magnitudes and redshift for each galaxy
    """
    data_home = get_data_home(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    archive_file = os.path.join(data_home, ARCHIVE_FILE)

    query_text = ('\n'.join(
        ("SELECT TOP %i" % NOBJECTS,
         "   p.u, p.g, p.r, p.i, p.z, s.specClass, s.z, s.zerr",
         "FROM PhotoObj AS p",
         "   JOIN SpecObj AS s ON s.bestobjid = p.objid",
         "WHERE ",
         "   p.u BETWEEN 0 AND 19.6",
         "   AND p.g BETWEEN 0 AND 20",
         "   AND s.specClass > 1 -- not UNKNOWN or STAR",
         "   AND s.specClass <> 5 -- not SKY",
         "   AND s.specClass <> 6 -- not STAR_LATE")))

    if not os.path.exists(archive_file):
        if not download_if_missing:
            raise IOError('data not present on disk. '
                          'set download_if_missing=True to download')
        print("querying for %i objects" % NOBJECTS)
        print(query_text)
        output = sql_query(query_text)
        print("finished.")

        data = np.loadtxt(output, delimiter=',',
                          skiprows=1, dtype=GAL_COLORS_DTYPE)
        np.save(archive_file, data)
    else:
        data = np.load(archive_file)

    return data
astroML-0.3/astroML/datasets/sdss_S82standards.py0000644000076500000240000001173012421516615022570 0ustar jakevdpstaff00000000000000from __future__ import print_function, division

import os
from gzip import GzipFile
import numpy as np

from .tools import download_with_progress_bar
from ..py3k_compat import BytesIO
from . import get_data_home
DATA_URL = ('http://www.astro.washington.edu/users/ivezic/'
            'sdss/catalogs/stripe82calibStars_v2.6.dat.gz')
DATA_URL_2MASS = ('http://www.astro.washington.edu/users/ivezic/'
                  'sdss/catalogs/stripe82calibStars_2MASS_v2.6.dat.gz')

ARCHIVE_FILE = 'sdss_S82standards.npy'
ARCHIVE_FILE_2MASS = 'sdss_S82standards_2mass.npy'

DTYPE = [('RA', 'f8'),
         ('DEC', 'f8'),
         ('RArms', 'f4'),
         ('DECrms', 'f4'),
         ('Ntot', 'i4'),
         ('A_r', 'f4')]

for band in 'ugriz':
    DTYPE += [('Nobs_%s' % band, 'i4')]
    DTYPE += map(lambda s: (s + '_' + band, 'f4'),
                 ['mmed', 'mmu', 'msig', 'mrms', 'mchi2'])

DTYPE_2MASS = DTYPE + [('ra2MASS', 'f4'),
                       ('dec2MASS', 'f4'),
                       ('J', 'f4'),
                       ('Jerr', 'f4'),
                       ('H', 'f4'),
                       ('Herr', 'f4'),
                       ('K', 'f4'),
                       ('Kerr', 'f4'),
                       ('theta', 'f4')]

# first column is 'CALIBSTARS'.  We'll ignore this.
COLUMNS = range(1, len(DTYPE) + 1)


def fetch_sdss_S82standards(data_home=None, download_if_missing=True,
                            crossmatch_2mass=False):
    """Loader for SDSS stripe82 standard star catalog

    Parameters
    ----------
    data_home : optional, default=None
        Specify another download and cache folder for the datasets. By
        default all scikit learn data is stored in '~/astroML_data'
        subfolders.

    download_if_missing : bool, optional, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    crossmatch_2mass: bool, optional, default=False
        If True, return the standard star catalog cross-matched with 2mass
        magnitudes

    Returns
    -------
    data : ndarray, shape = (313859,)
        record array containing sdss standard stars (see notes below)

    Notes
    -----
    Information on the data can be found at
    http://www.astro.washington.edu/users/ivezic/sdss/catalogs/stripe82.html
    Data is described in Ivezic et al. 2007 (Astronomical Journal, 134, 973).
    Columns are as follows:

      RA       Right-ascension of source (degrees)
      DEC      Declination of source (degrees)
      RArms    rms of right-ascension (arcsec)
      DECrms   rms of declination (arcsec)
      Ntot     total number of epochs
      A_r      SFD ISM extinction (mags)

      for each band in (u g r i z):
        Nobs_   number of observations in this band
        mmed_   median magnitude in this band
        mmu_    mean magnitude in this band
        msig_   standard error on the mean (1.25 times larger for median)
        mrms_   root-mean-square scatter
        mchi2_  chi2 per degree of freedom for mean magnitude

      For 2-MASS, the following columns are added:

      ra2MASS  2-mass right-ascension
      dec2MASS 2-mass declination
      J        J-band magnitude
      Jerr     J-band error
      H        H-band magnitude
      Herr     H-band error
      K        K-band magnitude
      Kerr     K-band error
      theta    difference between SDSS and 2MASS position (arcsec)

    Examples
    --------
    >>> data = fetch_sdss_S82standards()
    >>> u_g = data['mmed_u'] - data['mmed_g']
    >>> print(u_g[:5])
    [-22.23500061   1.34900093   1.43799973   2.08200073 -23.03800011]

    References
    ----------
    Ivezic et al. 2007, AJ, 134, 973
    """
    data_home = get_data_home(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    if crossmatch_2mass:
        archive_file = os.path.join(data_home, ARCHIVE_FILE_2MASS)
        data_url = DATA_URL_2MASS
        kwargs = dict(dtype=DTYPE_2MASS)
    else:
        archive_file = os.path.join(data_home, ARCHIVE_FILE)
        data_url = DATA_URL
        kwargs = dict(usecols=COLUMNS, dtype=DTYPE)

    if not os.path.exists(archive_file):
        if not download_if_missing:
            raise IOError('data not present on disk. 
' 'set download_if_missing=True to download') print("downloading cross-matched SDSS/2MASS dataset from %s to %s" % (data_url, data_home)) zipped_buf = download_with_progress_bar(data_url, return_buffer=True) gzf = GzipFile(fileobj=zipped_buf, mode='rb') print("uncompressing file...") extracted_buf = BytesIO(gzf.read()) data = np.loadtxt(extracted_buf, **kwargs) np.save(archive_file, data) else: data = np.load(archive_file) return data astroML-0.3/astroML/datasets/sdss_specgals.py0000644000076500000240000001467212420767763022134 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import os import numpy as np from . import get_data_home from .tools import download_with_progress_bar DATA_URL = ("http://www.astro.washington.edu/users/ivezic/" "DMbook/data/SDSSspecgalsDR8.fit") def fetch_sdss_specgals(data_home=None, download_if_missing=True): """Loader for SDSS Galaxies with spectral information Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. Returns ------- data : recarray, shape = (327260,) record array containing pipeline parameters Notes ----- These were compiled from the SDSS database using the following SQL query:: SELECT G.ra, G.dec, S.mjd, S.plate, S.fiberID, --- basic identifiers --- basic spectral data S.z, S.zErr, S.rChi2, S.velDisp, S.velDispErr, --- some useful imaging parameters G.extinction_r, G.petroMag_r, G.psfMag_r, G.psfMagErr_r, G.modelMag_u, modelMagErr_u, G.modelMag_g, modelMagErr_g, G.modelMag_r, modelMagErr_r, G.modelMag_i, modelMagErr_i, G.modelMag_z, modelMagErr_z, G.petroR50_r, G.petroR90_r, --- line fluxes for BPT diagram and other derived spec. 
parameters GSL.nii_6584_flux, GSL.nii_6584_flux_err, GSL.h_alpha_flux, GSL.h_alpha_flux_err, GSL.oiii_5007_flux, GSL.oiii_5007_flux_err, GSL.h_beta_flux, GSL.h_beta_flux_err, GSL.h_delta_flux, GSL.h_delta_flux_err, GSX.d4000, GSX.d4000_err, GSE.bptclass, GSE.lgm_tot_p50, GSE.sfr_tot_p50, G.objID, GSI.specObjID INTO mydb.SDSSspecgalsDR8 FROM SpecObj S CROSS APPLY dbo.fGetNearestObjEQ(S.ra, S.dec, 0.06) N, Galaxy G, GalSpecInfo GSI, GalSpecLine GSL, GalSpecIndx GSX, GalSpecExtra GSE WHERE N.objID = G.objID AND GSI.specObjID = S.specObjID AND GSL.specObjID = S.specObjID AND GSX.specObjID = S.specObjID AND GSE.specObjID = S.specObjID --- add some quality cuts to get rid of obviously bad measurements AND (G.petroMag_r > 10 AND G.petroMag_r < 18) AND (G.modelMag_u-G.modelMag_r) > 0 AND (G.modelMag_u-G.modelMag_r) < 6 AND (modelMag_u > 10 AND modelMag_u < 25) AND (modelMag_g > 10 AND modelMag_g < 25) AND (modelMag_r > 10 AND modelMag_r < 25) AND (modelMag_i > 10 AND modelMag_i < 25) AND (modelMag_z > 10 AND modelMag_z < 25) AND S.rChi2 < 2 AND (S.zErr > 0 AND S.zErr < 0.01) AND S.z > 0.02 --- end of query --- Examples -------- >>> from astroML.datasets import fetch_sdss_specgals >>> data = fetch_sdss_specgals() >>> data.shape # number of objects in dataset (661598,) >>> data.names[:5] # first five column names ['ra', 'dec', 'mjd', 'plate', 'fiberID'] >>> print(data['ra'][:3]) # first three RA values [ 146.71419105 146.74414186 146.62857334] >>> print(data['dec'][:3]) # first three declination values [-1.04127639 -0.6522198 -0.7651468 ] """ # fits is an optional dependency: don't import globally from astropy.io import fits data_home = get_data_home(data_home) if not os.path.exists(data_home): os.makedirs(data_home) archive_file = os.path.join(data_home, os.path.basename(DATA_URL)) if not os.path.exists(archive_file): if not download_if_missing: raise IOError('data not present on disk. ' 'set download_if_missing=True to download') fitsdata = download_with_progress_bar(DATA_URL) open(archive_file, 'wb').write(fitsdata) hdulist = fits.open(archive_file) return np.asarray(hdulist[1].data) def fetch_great_wall(data_home=None, download_if_missing=True, xlim=(-375, -175), ylim=(-300, 200)): """Get the 2D SDSS "Great Wall" distribution, following Cowan et al 2008 Parameters ---------- data_home : optional, default=None Specify another download and cache folder for the datasets. By default all scikit learn data is stored in '~/astroML_data' subfolders. download_if_missing : optional, default=True If False, raise a IOError if the data is not locally available instead of trying to download the data from the source site. xlim, ylim : tuples or None the limits in Mpc of the data: default values are the same as that used for the plots in Cowan 2008. If set to None, no cuts will be performed. 
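    Examples
    --------
    A minimal sketch (skipped as a doctest: the underlying spectroscopic
    sample is downloaded on first use; the two columns are the projected
    x and y positions in Mpc):

    >>> locs = fetch_great_wall()  # doctest: +SKIP
    >>> locs.shape[1]              # doctest: +SKIP
    2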
    Returns
    -------
    data : ndarray, shape = (Ngals, 2)
        grid of projected (x, y) locations of galaxies in Mpc
    """
    # local imports so we don't need dependencies for loading module
    from scipy.interpolate import interp1d
    from ..cosmology import Cosmology

    data = fetch_sdss_specgals(data_home, download_if_missing)

    # cut to the part of the sky with the "great wall"
    data = data[(data['dec'] > -7) & (data['dec'] < 7)]
    data = data[(data['ra'] > 80) & (data['ra'] < 280)]

    # do a redshift cut, following Cowan et al 2008
    z = data['z']
    data = data[(z > 0.01) & (z < 0.12)]

    # use redshift to compute absolute r-band magnitude
    cosmo = Cosmology(omegaM=0.27, omegaL=0.73, h=0.732)

    # first sample the distance modulus on a grid
    zgrid = np.linspace(min(data['z']), max(data['z']), 100)
    mugrid = np.array([cosmo.mu(z) for z in zgrid])
    f = interp1d(zgrid, mugrid)
    mu = f(data['z'])

    # do an absolute magnitude cut at -21
    Mr = data['petroMag_r'] + data['extinction_r'] - mu
    data = data[Mr < -21]

    # compute distances in the equatorial plane
    # first sample comoving distance
    Dcgrid = np.array([cosmo.Dc(z) for z in zgrid])
    f = interp1d(zgrid, Dcgrid)
    dist = f(data['z'])

    locs = np.vstack([dist * np.cos(data['ra'] * np.pi / 180.),
                      dist * np.sin(data['ra'] * np.pi / 180.)]).T

    # cut on x and y limits if specified
    if xlim is not None:
        locs = locs[(locs[:, 0] > xlim[0]) & (locs[:, 0] < xlim[1])]
    if ylim is not None:
        locs = locs[(locs[:, 1] > ylim[0]) & (locs[:, 1] < ylim[1])]

    return locs
astroML-0.3/astroML/datasets/sdss_spectrum.py0000644000076500000240000000364312421516407022155 0ustar jakevdpstaff00000000000000from __future__ import print_function, division

import os
import numpy as np

from .tools import get_data_home, download_with_progress_bar,\
    SDSSfits, sdss_fits_url, sdss_fits_filename


def fetch_sdss_spectrum(plate, mjd, fiber, data_home=None,
                        download_if_missing=True, cache_to_disk=True):
    """Fetch an SDSS spectrum from the Data Archive Server

    Parameters
    ----------
    plate: integer
        plate number of desired spectrum
    mjd: integer
        mean julian date of desired spectrum
    fiber: integer
        fiber number of desired spectrum

    Other Parameters
    ----------------
    data_home: string (optional)
        directory in which to cache downloaded fits files.  If not
        specified, it will be set to ~/astroML_data

    download_if_missing: boolean (default = True)
        download the fits file if it is not cached locally

    cache_to_disk: boolean (default = True)
        cache downloaded file to data_home

    Returns
    -------
    spec: :class:`astroML.tools.SDSSfits` object
        An object wrapper for the fits data
    """
    data_home = get_data_home(data_home)

    target_url = sdss_fits_url(plate, mjd, fiber)
    target_file = os.path.join(data_home, 'SDSSspec', '%04i' % plate,
                               sdss_fits_filename(plate, mjd, fiber))

    if not os.path.exists(target_file):
        if not download_if_missing:
            raise IOError("SDSS colors training data not found")

        buf = download_with_progress_bar(target_url, return_buffer=True)

        if cache_to_disk:
            print("caching to %s" % target_file)
            if not os.path.exists(os.path.dirname(target_file)):
                os.makedirs(os.path.dirname(target_file))
            fhandler = open(target_file, 'wb')
            fhandler.write(buf.read())
            buf.seek(0)
    else:
        buf = target_file

    return SDSSfits(buf)
astroML-0.3/astroML/datasets/sdss_sspp.py0000644000076500000240000001161012420767763021305 0ustar jakevdpstaff00000000000000from __future__ import print_function

import os
import numpy as np

from . import get_data_home
from .tools import download_with_progress_bar

DATA_URL = ("http://www.astro.washington.edu/users/ivezic/"
            "DMbook/data/SDSSssppDR9_rerun122.fit")


def compute_distances(data):
    """Compute the distances to select stars in the sdss_sspp sample.

    Distances are determined using empirical color/magnitude fits from
    Ivezic et al 2008, ApJ 684:287

    Extinction corrections come from Berry et al 2011, arXiv 1111.4985

    This distance only works for stars with log(g) > 3.3
    Other stars will have distance=-1
    """
    # extinction terms from Berry et al
    Ar = data['Ar']
    Au = 1.810 * Ar
    Ag = 1.400 * Ar
    Ai = 0.759 * Ar
    Az = 0.561 * Ar

    # compute corrected mags and colors
    gmag = data['gpsf'] - Ag
    rmag = data['rpsf'] - Ar
    imag = data['ipsf'] - Ai
    gi = gmag - imag

    # compute distance fit from Ivezic et al
    FeH = data['FeH']
    Mr0 = (-5.06 + 14.32 * gi - 12.97 * gi ** 2
           + 6.127 * gi ** 3 - 1.267 * gi ** 4 + 0.0967 * gi ** 5)
    FeHoffset = 4.50 - 1.11 * FeH - 0.18 * FeH ** 2
    Mr = Mr0 + FeHoffset

    dist = 0.01 * 10 ** (0.2 * (rmag - Mr))

    # stars with log(g) < 3.3 don't work for this fit: set distance to -1
    dist[data['logg'] < 3.3] = -1

    return dist


def fetch_sdss_sspp(data_home=None, download_if_missing=True, cleaned=False):
    """Loader for SDSS SEGUE Stellar Parameter Pipeline data

    Parameters
    ----------
    data_home : optional, default=None
        Specify another download and cache folder for the datasets. By
        default all scikit learn data is stored in '~/astroML_data'
        subfolders.

    download_if_missing : bool (optional) default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    cleaned : bool (optional) default=False
        if True, then return a cleaned catalog where objects with extreme
        values are removed.

    Returns
    -------
    data : recarray, shape = (327260,)
        record array containing pipeline parameters

    Notes
    -----
    Here are the comments from the fits file header:

    Imaging data and spectrum identifiers for a sample of 327,260
    stars with SDSS spectra, selected as:

      1) available SSPP parameters in SDSS Data Release 9
         (SSPP rerun 122, file from Y.S. Lee)
      2) 14 < r < 21 (psf magnitudes, uncorrected for ISM extinction)
      3) 10 < u < 25 & 10 < z < 25 (same as above)
      4) errors in ugriz well measured (>0) and <10
      5) 0 < u-g < 3 (all color cuts based on psf mags, dereddened)
      6) -0.5 < g-r < 1.5 & -0.5 < r-i < 1.0 & -0.5 < i-z < 1.0
      7) -200 < pmL < 200 & -200 < pmB < 200 (proper motion in mas/yr)
      8) pmErr < 10 mas/yr (proper motion error)
      9) 1 < log(g) < 5
      10) TeffErr < 300 K

    Teff and TeffErr are given in Kelvin, radVel and radVelErr in km/s.
    (ZI, Feb 2012, ivezic@astro.washington.edu)

    Examples
    --------
    >>> from astroML.datasets import fetch_sdss_sspp
    >>> data = fetch_sdss_sspp()
    >>> data.shape  # number of objects in dataset
    (327260,)
    >>> print(data.names[:5])  # names of the first five columns
    ['ra', 'dec', 'Ar', 'upsf', 'uErr']
    >>> print(data['ra'][:2])  # first two RA values
    [ 49.62750244  40.27209091]
    >>> print(data['dec'][:2])  # first two DEC values
    [-1.04175591 -0.64250112]
    """
    # fits is an optional dependency: don't import globally
    from astropy.io import fits

    data_home = get_data_home(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    archive_file = os.path.join(data_home, os.path.basename(DATA_URL))

    if not os.path.exists(archive_file):
        if not download_if_missing:
            raise IOError('data not present on disk. '
                          'set download_if_missing=True to download')

        fitsdata = download_with_progress_bar(DATA_URL)
        open(archive_file, 'wb').write(fitsdata)

    hdulist = fits.open(archive_file)

    data = np.asarray(hdulist[1].data)

    if cleaned:
        # -1.1 < FeH < 0.1
        data = data[(data['FeH'] > -1.1) & (data['FeH'] < 0.1)]

        # -0.03 < alpha/Fe < 0.57
        data = data[(data['alphFe'] > -0.03) & (data['alphFe'] < 0.57)]

        # 5000 < Teff < 6500
        data = data[(data['Teff'] > 5000) & (data['Teff'] < 6500)]

        # 3.5 < log(g) < 5
        data = data[(data['logg'] > 3.5) & (data['logg'] < 5)]

        # 0 < error for FeH < 0.1
        data = data[(data['FeHErr'] > 0) & (data['FeHErr'] < 0.1)]

        # 0 < error for alpha/Fe < 0.05
        data = data[(data['alphFeErr'] > 0) & (data['alphFeErr'] < 0.05)]

        # 15 < g mag < 18
        data = data[(data['gpsf'] > 15) & (data['gpsf'] < 18)]

        # abs(radVel) < 100 km/s
        data = data[(abs(data['radVel']) < 100)]

    return data
astroML-0.3/astroML/datasets/tools/0000755000076500000240000000000012462244012020032 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/datasets/tools/__init__.py0000644000076500000240000000166612115147567022160 0ustar jakevdpstaff00000000000000"""
tools for the dataset loaders
"""
from .download import download_with_progress_bar
from .sql_query import sql_query
from .cas_query import *
from .sdss_fits import sdss_fits_url, sdss_fits_filename, SDSSfits


def get_data_home(data_home=None):
    """Get the home data directory.

    By default the data dir is set to a folder named 'astroML_data'
    in the user home folder.

    Alternatively, it can be set by the 'ASTROML_DATA' environment
    variable or programmatically by giving an explicit folder path.
    The '~' symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.
    """
    import os
    if data_home is None:
        data_home = os.environ.get('ASTROML_DATA',
                                   os.path.join('~', 'astroML_data'))
    data_home = os.path.expanduser(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)
    return data_home
astroML-0.3/astroML/datasets/tools/cas_query.py0000644000076500000240000000557412115147567022412 0ustar jakevdpstaff00000000000000import numpy as np

from .
import sql_query # SDSS primtarget codes TARGET_QSO_HIZ = int('0x00000001', 16) TARGET_QSO_CAP = int('0x00000002', 16) TARGET_QSO_SKIRT = int('0x00000004', 16) TARGET_QSO_FIRST_CAP = int('0x00000008', 16) TARGET_QSO_FIRST_SKIRT = int('0x00000010', 16) TARGET_GALAXY_RED = int('0x00000020', 16) TARGET_GALAXY = int('0x00000040', 16) TARGET_GALAXY_BIG = int('0x00000080', 16) TARGET_GALAXY_BRIGHT_CORE = int('0x00000100', 16) TARGET_ROSAT_A = int('0x00000200', 16) TARGET_ROSAT_B = int('0x00000400', 16) TARGET_ROSAT_C = int('0x00000800', 16) TARGET_ROSAT_D = int('0x00001000', 16) TARGET_STAR_BHB = int('0x00002000', 16) TARGET_STAR_CARBON = int('0x00004000', 16) TARGET_STAR_BROWN_DWARF = int('0x00008000', 16) TARGET_STAR_SUB_DWARF = int('0x00010000', 16) TARGET_STAR_CATY_VAR = int('0x00020000', 16) TARGET_STAR_RED_DWARF = int('0x00040000', 16) TARGET_STAR_WHITE_DWARF = int('0x00080000', 16) TARGET_SERENDIP_BLUE = int('0x00100000', 16) TARGET_SERENDIP_FIRST = int('0x00200000', 16) TARGET_SERENDIP_RED = int('0x00400000', 16) TARGET_SERENDIP_DISTANT = int('0x00800000', 16) TARGET_SERENDIP_MANUAL = int('0x01000000', 16) TARGET_QSO_FAINT = int('0x02000000', 16) TARGET_GALAXY_RED_II = int('0x04000000', 16) TARGET_ROSAT_E = int('0x08000000', 16) TARGET_STAR_PN = int('0x10000000', 16) TARGET_QSO_REJECT = int('0x20000000', 16) DEFAULT_TARGET = TARGET_GALAXY # main galaxy sample def query_plate_mjd_fiber(n_spectra, primtarget=DEFAULT_TARGET, zmin=0, zmax=0.7): """Query the SDSS server for plate, mjd, and fiber numbers Parameters ---------- n_spectra: int number of spectra to query. Max is 100,000 (set by CAS server) primtarget: int prime target flag. See notes below zmin, zmax: float minimum and maximum redshift range for query Returns ------- plate, mjd, fiber : ndarrays, size=n_spectra The plate numbers MJD, and fiber numbers of the spectra Notes ----- Primtarget flag values can be found at http://cas.sdss.org/dr7/en/help/browser/enum.asp?n=PrimTarget """ query_text = '\n'.join(( "SELECT TOP %(n_spectra)i ", " plate, mjd, fiberid ", "FROM specObj ", "WHERE ((PrimTarget & %(primtarget)i) > 0) ", " AND (z > %(zmin)f) AND (z <= %(zmax)f) ")) % locals() output = sql_query(query_text).readlines() keys = output[0] res = np.zeros((n_spectra, 3), dtype=int) for i, line in enumerate(output[1:]): try: res[i] = map(int, line.strip().split(',')) except: raise ValueError('\n'.join(output)) ntot = i + 1 return res[:ntot].T astroML-0.3/astroML/datasets/tools/download.py0000644000076500000240000000313712421516047022224 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import sys from ...py3k_compat import urlopen, BytesIO, url_content_length def bytes_to_string(nbytes): if nbytes < 1024: return '%ib' % nbytes nbytes /= 1024. if nbytes < 1024: return '%.1fkb' % nbytes nbytes /= 1024. if nbytes < 1024: return '%.2fMb' % nbytes nbytes /= 1024. 
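    # after three divisions by 1024, anything remaining is reported in Gb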
return '%.1fGb' % nbytes def download_with_progress_bar(data_url, return_buffer=False): """Download a file, showing progress Parameters ---------- data_url : string web address return_buffer : boolean (optional) if true, return a BytesIO buffer rather than a string Returns ------- s : string content of the file """ num_units = 40 fhandle = urlopen(data_url) content_length = url_content_length(fhandle) chunk_size = content_length // num_units print("Downloading %s" % data_url) nchunks = 0 buf = BytesIO() content_length_str = bytes_to_string(content_length) while True: next_chunk = fhandle.read(chunk_size) nchunks += 1 if next_chunk: buf.write(next_chunk) s = ('[' + nchunks * '=' + (num_units - 1 - nchunks) * ' ' + '] %s / %s \r' % (bytes_to_string(buf.tell()), content_length_str)) else: sys.stdout.write('\n') break sys.stdout.write(s) sys.stdout.flush() buf.seek(0) if return_buffer: return buf else: return buf.getvalue() astroML-0.3/astroML/datasets/tools/sdss_fits.py0000644000076500000240000003003712462227412022415 0ustar jakevdpstaff00000000000000""" Tools to download and process SDSS fits files. More information can be found at http://www.sdss.org/dr7/products/spectra/index.html """ import gc # garbage collection import numpy as np from scipy.ndimage.filters import gaussian_filter1d, uniform_filter1d from scipy import interpolate from . import download_with_progress_bar # This is the URL of the sdss fits spectra FITS_FILENAME = 'spSpec-%(mjd)05i-%(plate)04i-%(fiber)03i.fit' SDSS_URL = ('http://das.sdss.org/spectro/1d_26/%(plate)04i/' '1d/spSpec-%(mjd)05i-%(plate)04i-%(fiber)03i.fit') # lines used to generate line-index labeling LINES = dict(Ha=6564.61, Hb=4862.68, OI=6302.05, OIII=5008.24, NIIa=6549.86, NIIb=6585.27, SIIa=6718.29, SIIb=6732.67) def sdss_fits_url(plate, mjd, fiber): """Return the URL of the spectrum FITS file""" return SDSS_URL % dict(plate=plate, mjd=mjd, fiber=fiber) def sdss_fits_filename(plate, mjd, fiber): """Return the name of the spectrum FITS file""" return FITS_FILENAME % dict(plate=plate, mjd=mjd, fiber=fiber) spec_cln_dict = ['SPEC_UNKNOWN', 'SPEC_STAR', 'SPEC_GALAXY', 'SPEC_QSO', 'SPEC_HIZ_QSO', # high redshift QSO, z>2.3 'SPEC_SKY', 'STAR_LATE', # Type M or later (molecular bands dominate) 'GAL_EM'] # emission line galaxy class SDSSfits(object): """A class to open and interact with fits files from SDSS Parameters ---------- buf : string or file buffer (optional) file path, buffer, or url of SDSS spectra fits file if None, then initialize an empty instance. Notes ----- This class only provides access to a subset of the information available in the sdss spectra fits file. The raw fits data can be accessed using the fits object directly. This can be found in the attribute ``hdulist``. 
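    A hedged sketch of direct use (the plate/mjd/fiber triple below is an
    illustrative placeholder rather than a vetted object, so the example
    is not run):

    >>> url = sdss_fits_url(plate=274, mjd=51913, fiber=102)  # doctest: +SKIP
    >>> spec = SDSSfits(url)                                  # doctest: +SKIP
    >>> lam, flux = spec.wavelength(), spec.spectrum          # doctest: +SKIP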
For details, please refer to the data description: http://www.sdss.org/dr7/dm/flatFiles/spSpec.html """ def __init__(self, source=None): if source is None: pass elif isinstance(source, str): if source.startswith('http://'): self._load_fits_url(source) else: self._load_fits_file(source) else: self._load_fits_file(source) def _load_fits_url(self, url): # fits is an optional dependency: don't import globally from astropy.io import fits buffer = download_with_progress_bar(url, return_buffer=True) self._initialize(fits.open(buffer)) def _load_fits_file(self, file_or_buffer): # fits is an optional dependency: don't import globally from astropy.io import fits self._initialize(fits.open(file_or_buffer)) def _initialize(self, hdulist): data = hdulist[0].data self.name = hdulist[0].header['NAME'] self.spec_cln = hdulist[0].header['SPEC_CLN'] self.coeff0 = hdulist[0].header['COEFF0'] self.coeff1 = hdulist[0].header['COEFF1'] self.z = hdulist[0].header['Z'] self.zerr = hdulist[0].header['Z_ERR'] self.zconf = hdulist[0].header['Z_CONF'] self.spectrum = data[0] self.spectrum_cont = data[1] self.error = data[2] self.mask = data[3] self.large_err = self.error.max() * 2 self.hdulist = hdulist def get_line_ew(self, wavelength): i = np.where(abs(self.hdulist[2].data['restWave'] - wavelength) < 1) return self.hdulist[2].data['ew'][i] def __del__(self): if hasattr(self, 'hdulist'): del self.hdulist gc.collect() def copy(self): snew = self.__class__() for param in ['name', 'spec_cln', 'coeff0', 'coeff1', 'z', 'zerr', 'zconf', 'spectrum', 'spectrum_cont', 'error', 'large_err', 'mask', 'hdulist']: setattr(snew, param, getattr(self, param)) return snew def restframe(self): snew = self.copy() snew.coeff0 = self.coeff0_restframe() snew.z = 0 return snew def __len__(self): return len(self.spectrum) def log_w_min(self, i=None): """ if i is specified, return log_w_min of bin i otherwise, return log_w_min of the spectrum """ if i is None: i = 0 return self.coeff0 + (i - 0.5) * self.coeff1 def log_w_max(self, i=None): """ if i is specified, return log_w_max of bin i otherwise, return log_max of the spectrum """ if i is None: i = len(self) - 1 return self.coeff0 + (i + 0.5) * self.coeff1 def w_min(self, i=None): return 10 ** self.log_w_min(i) def w_max(self, i=None): return 10 ** self.log_w_max(i) def coeff0_restframe(self): return self.coeff0 - np.log10(1 + self.z) def wavelength(self, restframe=False): """ return the wavelength of the spectrum in angstroms """ if restframe: coeff0 = self.coeff0_restframe() else: coeff0 = self.coeff0 return 10 ** (coeff0 + self.coeff1 * np.arange(len(self.spectrum))) def compute_mask(self, frac=0.5, filtwidth=5): """ return a mask showing where noise spikes to frac over the local background """ smoothed_noise = gaussian_filter1d(self.error, filtwidth) mask = ((self.error >= (1 + frac) * smoothed_noise) | (self.error <= 0) | (self.error >= self.large_err) | (self.spectrum == 0)) mask_filtered = uniform_filter1d(mask.astype(float), max(3, filtwidth)) return mask_filtered > 0.5 / filtwidth def rebin(self, rebin_coeff0, rebin_coeff1, rebin_length): """Rebin the spectrum to a new grid. 
Parameters ---------- rebin_coeff0: float log minimum wavelength rebin_coeff1: float log wavelength bin width rebin_length: int number of bins Returns ------- S_new: SDSSfits object The new spectrum, rebinned to the desired wavelength binning """ snew = self.copy() snew.spectrum = np.zeros(rebin_length) snew.error = np.zeros(rebin_length) snew.coeff0 = rebin_coeff0 snew.coeff1 = rebin_coeff1 N_old = len(self.spectrum) N_new = len(snew.spectrum) log_w_old = self.coeff0 + (np.arange(N_old + 1) - 0.5) * self.coeff1 log_w_new = snew.coeff0 + (np.arange(N_new + 1) - 0.5) * snew.coeff1 # Perform the interpolation. We'll interpolate the cumulative sum # so that the total flux of the spectrum is conserved. # interpolate spectrum spec_cuml_old = self.spectrum.cumsum() tck = interpolate.splrep(log_w_old, np.hstack(([0], spec_cuml_old))) spec_cuml_new = interpolate.splev(log_w_new, tck) spec_cuml_new[log_w_new >= log_w_old[-1]] = log_w_old[-1] spec_cuml_new[log_w_new <= log_w_old[0]] = 0 snew.spectrum = np.diff(spec_cuml_new) snew.spectrum *= self.coeff1 / snew.coeff1 # interpolate error err_cuml_old = self.error.cumsum() tck = interpolate.splrep(log_w_old, np.hstack(([0], err_cuml_old))) err_cuml_new = interpolate.splev(log_w_new, tck) err_cuml_new[log_w_new >= log_w_old[-1]] = log_w_old[-1] err_cuml_new[log_w_new <= log_w_old[0]] = 0 snew.error = np.diff(err_cuml_new) snew.error *= self.coeff1 / snew.coeff1 return snew def _get_line_strength(self, line): lam = LINES.get(line) if lam is None: lam1 = LINES.get(line + 'a') ind1 = np.where(abs(self.hdulist[2].data['restWave'] - lam1) < 1)[0] lam2 = LINES.get(line + 'b') ind2 = np.where(abs(self.hdulist[2].data['restWave'] - lam2) < 1)[0] if len(ind1) == 0: s1 = h1 = 0 nsig1 = 0 else: s1 = self.hdulist[2].data['sigma'][ind1] h1 = self.hdulist[2].data['height'][ind1] nsig1 = self.hdulist[2].data['nsigma'][ind1] if len(ind2) == 0: s2 = h2 = 0 nsig2 = 0 else: s2 = self.hdulist[2].data['sigma'][ind2] h2 = self.hdulist[2].data['height'][ind2] nsig2 = self.hdulist[2].data['nsigma'][ind2] strength = s1 * h1 + s2 * h2 nsig = max(nsig1, nsig2) else: ind = np.where(abs(self.hdulist[2].data['restWave'] - lam) < 1)[0] if len(ind) == 0: strength = 0 nsig = 0 else: s = self.hdulist[2].data['sigma'][ind] h = self.hdulist[2].data['height'][ind] nsig = self.hdulist[2].data['nsigma'][ind] strength = s * h return strength, nsig def lineratio_index(self, indicator='NII'): """Return the line ratio index for the given galaxy. This is the index used in Vanderplas et al 2009, and makes use of line-ratio fits from Kewley et al 2001 Parameters ---------- indicator: string ['NII'|'OI'|'SII'] The emission line to use as an indicator Returns ------- cln: integer The classification of the spectrum based on SDSS pipeline and the line ratios. 
0 : unknown (SPEC_CLN = 0) 1 : star (SPEC_CLN = 1) 2 : absorption galaxy (H-alpha seen in absorption) 3 : normal galaxy (no significant H-alpha emission or absorption) 4 : emission line galaxies (below line-ratio curve) 5 : narrow-line QSO (above line-ratio curve) 6 : broad-line QSO (SPEC_CLN = 3) 7 : Sky (SPEC_CLN = 4) 8 : Hi-z QSO (SPEC_CLN = 5) 9 : Late-type star (SPEC_CLN = 6) 10 : Emission galaxy (SPEC_CLN = 7) ratios: tuple The line ratios used to compute this """ assert indicator in ['NII', 'OI', 'SII'] if self.spec_cln < 2: return self.spec_cln, (0, 0) elif self.spec_cln > 2: return self.spec_cln + 3, (0, 0) strength_Ha, nsig_Ha = self._get_line_strength('Ha') strength_Hb, nsig_Hb = self._get_line_strength('Hb') if nsig_Ha < 3 or nsig_Hb < 3: return 3, (0, 0) if strength_Ha < 0 or strength_Hb < 0: return 2, (0, 0) # all that's left is choosing between 4 and 5 # we do this based on line-ratios strength_I, nsig_I = self._get_line_strength(indicator) strength_OIII, nsig_OIII = self._get_line_strength('OIII') log_OIII_Hb = np.log10(strength_OIII / strength_Hb) I_Ha = np.log10(strength_I / strength_Ha) if indicator == 'NII': if I_Ha >= 0.47 or log_OIII_Hb >= log_OIII_Hb_NII(I_Ha): return 5, (I_Ha, log_OIII_Hb) else: return 4, (I_Ha, log_OIII_Hb) elif indicator == 'OI': if I_Ha >= -0.59 or log_OIII_Hb >= log_OIII_Hb_OI(I_Ha): return 5, (I_Ha, log_OIII_Hb) else: return 4, (I_Ha, log_OIII_Hb) else: if I_Ha >= 0.32 or log_OIII_Hb >= log_OIII_Hb_SII(I_Ha): return 5, (I_Ha, log_OIII_Hb) else: return 4, (I_Ha, log_OIII_Hb) #---------------------------------------------------------------------- # Empirical fits from Kewley et al 2001 def log_OIII_Hb_NII(log_NII_Ha, eps=0): return 1.19 + eps + 0.61 / (log_NII_Ha - eps - 0.47) def log_OIII_Hb_OI(log_OI_Ha, eps=0): return 1.33 + eps + 0.73 / (log_OI_Ha - eps + 0.59) def log_OIII_Hb_SII(log_SII_Ha, eps=0): return 1.30 + eps + 0.72 / (log_SII_Ha - eps - 0.32) astroML-0.3/astroML/datasets/tools/sql_query.py0000644000076500000240000000164012420767763022452 0ustar jakevdpstaff00000000000000""" Tools to perform a SQL queries to an online server. Default values are provided for http://cas.sdss.org """ from ...py3k_compat import urlencode, urlopen PUBLIC_URL = 'http://cas.sdss.org/public/en/tools/search/x_sql.asp' DEFAULT_FMT = 'csv' def remove_sql_comments(sql): """Strip SQL comments starting with --""" return ' \n'.join(map(lambda x: x.split('--')[0], sql.split('\n'))) def sql_query(sql_str, url=PUBLIC_URL, format='csv'): """Execute query Parameters ---------- sql_str : string valid sql query url: string (optional) query url. Default is http://cas.sdss.org query script format: string (default='csv') query output format Returns ------- F: file object results of the query """ sql_str = remove_sql_comments(sql_str) params = urlencode(dict(cmd=sql_str, format=format)) return urlopen(url + '?%s' % params) astroML-0.3/astroML/datasets/wmap_temperatures.py0000644000076500000240000000467612462227466023043 0ustar jakevdpstaff00000000000000import os import numpy as np from . 
import get_data_home
from .tools import download_with_progress_bar

DATA_URL = ('http://lambda.gsfc.nasa.gov/data/map/dr4/'
            'skymaps/7yr/raw/wmap_band_imap_r9_7yr_W_v4.fits')

MASK_URL = ('http://lambda.gsfc.nasa.gov/data/map/dr4/'
            'ancillary/masks/wmap_temperature_analysis_mask_r9_7yr_v4.fits')


def fetch_wmap_temperatures(masked=False, data_home=None,
                            download_if_missing=True):
    """Loader for WMAP temperature map data

    Parameters
    ----------
    masked : optional, default=False
        If True, then return the foreground-masked healpix array of data
        If False, then return the raw temperature array

    data_home : optional, default=None
        Specify another download and cache folder for the datasets. By
        default all scikit learn data is stored in '~/astroML_data'
        subfolders.

    download_if_missing : optional, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    data : np.ndarray or np.ma.MaskedArray
        record array containing (masked) temperature data
    """
    # because of a bug in healpy, pylab must be imported before healpy is
    # or else a segmentation fault can result.
    import pylab
    import healpy as hp

    data_home = get_data_home(data_home)
    if not os.path.exists(data_home):
        os.makedirs(data_home)

    data_file = os.path.join(data_home, os.path.basename(DATA_URL))
    mask_file = os.path.join(data_home, os.path.basename(MASK_URL))

    if not os.path.exists(data_file):
        if not download_if_missing:
            raise IOError('data not present on disk. '
                          'set download_if_missing=True to download')

        data_buffer = download_with_progress_bar(DATA_URL)
        open(data_file, 'wb').write(data_buffer)

    data = hp.read_map(data_file)

    if masked:
        if not os.path.exists(mask_file):
            if not download_if_missing:
                raise IOError('mask data not present on disk. '
                              'set download_if_missing=True to download')

            mask_buffer = download_with_progress_bar(MASK_URL)
            # write in binary mode, for consistency with the data file above
            open(mask_file, 'wb').write(mask_buffer)

        mask = hp.read_map(mask_file)

        data = hp.ma(data)
        data.mask = np.logical_not(mask)  # WMAP mask has 0=bad. We need 1=bad

    return data
astroML-0.3/astroML/decorators.py0000644000076500000240000000723712426547020017613 0ustar jakevdpstaff00000000000000from __future__ import print_function

import os
from . import py3k_compat as pickle

import numpy as np


def pickle_results(filename=None, verbose=True):
    """Generator for decorator which allows pickling the results of a function

    Pickle is python's built-in object serialization.  This decorator, when
    used on a function, saves the results of the computation in the function
    to a pickle file.  If the function is called a second time with the
    same inputs, then the computation will not be repeated and the previous
    results will be used.

    This functionality is useful for computations which take a long time,
    but will need to be repeated (such as the first step of a data analysis).

    Parameters
    ----------
    filename : string (optional)
        pickle file to which results will be saved.
        If not specified, then the file is '<funcname>_output.pkl'
        where '<funcname>' is replaced by the name of the decorated function.

    verbose : boolean (optional)
        if True, then print a message to standard out specifying when the
        pickle file is written or read.

    Examples
    --------
    >>> @pickle_results('tmp.pkl', verbose=True)
    ... def f(x):
    ...
return x * x >>> f(4) @pickle_results: computing results and saving to 'tmp.pkl' 16 >>> f(4) @pickle_results: using precomputed results from 'tmp.pkl' 16 >>> f(6) @pickle_results: computing results and saving to 'tmp.pkl' 36 >>> import os; os.remove('tmp.pkl') """ def pickle_func(f, filename=filename, verbose=verbose): if filename is None: filename = '%s_output.pkl' % f.__name__ def new_f(*args, **kwargs): try: D = pickle.load(open(filename, 'rb')) cache_exists = True except: D = {} cache_exists = False # simple comparison doesn't work in the case of numpy arrays Dargs = D.get('args') Dkwargs = D.get('kwargs') try: args_match = (args == Dargs) except: args_match = np.all([np.all(a1 == a2) for (a1, a2) in zip(Dargs, args)]) try: kwargs_match = (kwargs == Dkwargs) except: kwargs_match = ((sorted(Dkwargs.keys()) == sorted(kwargs.keys())) and (np.all([np.all(Dkwargs[key] == kwargs[key]) for key in kwargs]))) if (type(D) == dict and D.get('funcname') == f.__name__ and args_match and kwargs_match): if verbose: print("@pickle_results: using precomputed " "results from '%s'" % filename) retval = D['retval'] else: if verbose: print("@pickle_results: computing results " "and saving to '%s'" % filename) if cache_exists: print(" warning: cache file '%s' exists" % filename) print(" - args match: %s" % args_match) print(" - kwargs match: %s" % kwargs_match) retval = f(*args, **kwargs) funcdict = dict(funcname=f.__name__, retval=retval, args=args, kwargs=kwargs) with open(filename, 'wb') as outfile: pickle.dump(funcdict, outfile) return retval return new_f return pickle_func astroML-0.3/astroML/density_estimation/0000755000076500000240000000000012462244012020775 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/density_estimation/__init__.py0000644000076500000240000000051412420767763023127 0ustar jakevdpstaff00000000000000from .density_estimation import KDE, KNeighborsDensity from .xdeconv import XDGMM from .histtools import\ scotts_bin_width, freedman_bin_width, knuth_bin_width, histogram from .bayesian_blocks import bayesian_blocks from .empirical import FunctionDistribution, EmpiricalDistribution from .gauss_mixture import GaussianMixture1D astroML-0.3/astroML/density_estimation/bayesian_blocks.py0000644000076500000240000002733312420767763024530 0ustar jakevdpstaff00000000000000""" Bayesian Block implementation ============================= Dynamic programming algorithm for finding the optimal adaptive-width histogram. Based on Scargle et al 2012 [1]_ References ---------- .. [1] http://adsabs.harvard.edu/abs/2012arXiv1207.5578S """ import numpy as np # TODO: implement other fitness functions from appendix B of Scargle 2012 class FitnessFunc(object): """Base class for fitness functions Each fitness function class has the following: - fitness(...) : compute fitness function. Arguments accepted by fitness must be among [T_k, N_k, a_k, b_k, c_k] - prior(N, Ntot) : compute prior on N given a total number of points Ntot """ def __init__(self, p0=0.05, gamma=None): self.p0 = p0 self.gamma = gamma def validate_input(self, t, x, sigma): """Check that input is valid""" pass def fitness(**kwargs): raise NotImplementedError() def prior(self, N, Ntot): if self.gamma is None: return self.p0_prior(N, Ntot) else: return self.gamma_prior(N, Ntot) def p0_prior(self, N, Ntot): # eq. 21 from Scargle 2012 return 4 - np.log(73.53 * self.p0 * (N ** -0.478)) def gamma_prior(self, N, Ntot): """Basic prior, parametrized by gamma (eq. 
3 in Scargle 2012)""" if self.gamma == 1: return 0 else: return (np.log(1 - self.gamma) - np.log(1 - self.gamma ** (Ntot + 1)) + N * np.log(self.gamma)) # the fitness_args property will return the list of arguments accepted by # the method fitness(). This allows more efficient computation below. @property def args(self): try: # Python 2 return self.fitness.func_code.co_varnames[1:] except AttributeError: return self.fitness.__code__.co_varnames[1:] class Events(FitnessFunc): """Fitness for binned or unbinned events Parameters ---------- p0 : float False alarm probability, used to compute the prior on N (see eq. 21 of Scargle 2012). Default prior is for p0 = 0. gamma : float or None If specified, then use this gamma to compute the general prior form, p ~ gamma^N. If gamma is specified, p0 is ignored. """ def fitness(self, N_k, T_k): # eq. 19 from Scargle 2012 return N_k * (np.log(N_k) - np.log(T_k)) def prior(self, N, Ntot): if self.gamma is not None: return self.gamma_prior(N, Ntot) else: # eq. 21 from Scargle 2012 return 4 - np.log(73.53 * self.p0 * (N ** -0.478)) class RegularEvents(FitnessFunc): """Fitness for regular events This is for data which has a fundamental "tick" length, so that all measured values are multiples of this tick length. In each tick, there are either zero or one counts. Parameters ---------- dt : float tick rate for data gamma : float specifies the prior on the number of bins: p ~ gamma^N """ def __init__(self, dt, p0=0.05, gamma=None): self.dt = dt self.p0 = p0 self.gamma = gamma def validate_input(self, t, x, sigma): unique_x = np.unique(x) if list(unique_x) not in ([0], [1], [0, 1]): raise ValueError("Regular events must have only 0 and 1 in x") def fitness(self, T_k, N_k): # Eq. 75 of Scargle 2012 M_k = T_k / self.dt N_over_M = N_k * 1. / M_k eps = 1E-8 if np.any(N_over_M > 1 + eps): import warnings warnings.warn('regular events: N/M > 1. ' 'Is the time step correct?') one_m_NM = 1 - N_over_M N_over_M[N_over_M <= 0] = 1 one_m_NM[one_m_NM <= 0] = 1 return N_k * np.log(N_over_M) + (M_k - N_k) * np.log(one_m_NM) class PointMeasures(FitnessFunc): """Fitness for point measures Parameters ---------- gamma : float specifies the prior on the number of bins: p ~ gamma^N if gamma is not specified, then a prior based on simulations will be used (see sec 3.3 of Scargle 2012) """ def __init__(self, p0=None, gamma=None): self.p0 = p0 self.gamma = gamma def fitness(self, a_k, b_k): # eq. 41 from Scargle 2012 return (b_k * b_k) / (4 * a_k) def prior(self, N, Ntot): if self.gamma is not None: return self.gamma_prior(N, Ntot) elif self.p0 is not None: return self.p0_prior(N, Ntot) else: # eq. at end of sec 3.3 in Scargle 2012 return 1.32 + 0.577 * np.log10(N) def bayesian_blocks(t, x=None, sigma=None, fitness='events', **kwargs): """Bayesian Blocks Implementation This is a flexible implementation of the Bayesian Blocks algorithm described in Scargle 2012 [1]_ Parameters ---------- t : array_like data times (one dimensional, length N) x : array_like (optional) data values sigma : array_like or float (optional) data errors fitness : str or object the fitness function to use. If a string, the following options are supported: - 'events' : binned or unbinned event data extra arguments are `p0`, which gives the false alarm probability to compute the prior, or `gamma` which gives the slope of the prior on the number of bins. - 'regular_events' : non-overlapping events measured at multiples of a fundamental tick rate, `dt`, which must be specified as an additional argument. 
The prior can be specified through `gamma`, which gives the slope of the prior on the number of bins. - 'measures' : fitness for a measured sequence with Gaussian errors The prior can be specified using `gamma`, which gives the slope of the prior on the number of bins. If `gamma` is not specified, then a simulation-derived prior will be used. Alternatively, the fitness can be a user-specified object of type derived from the FitnessFunc class. Returns ------- edges : ndarray array containing the (N+1) bin edges Examples -------- Event data: >>> t = np.random.normal(size=100) >>> bins = bayesian_blocks(t, fitness='events', p0=0.01) Event data with repeats: >>> t = np.random.normal(size=100) >>> t[80:] = t[:20] >>> bins = bayesian_blocks(t, fitness='events', p0=0.01) Regular event data: >>> dt = 0.01 >>> t = dt * np.arange(1000) >>> x = np.zeros(len(t)) >>> x[np.random.randint(0, len(t), len(t) / 10)] = 1 >>> bins = bayesian_blocks(t, fitness='regular_events', dt=dt, gamma=0.9) Measured point data with errors: >>> t = 100 * np.random.random(100) >>> x = np.exp(-0.5 * (t - 50) ** 2) >>> sigma = 0.1 >>> x_obs = np.random.normal(x, sigma) >>> bins = bayesian_blocks(t, fitness='measures') References ---------- .. [1] Scargle, J `et al.` (2012) http://adsabs.harvard.edu/abs/2012arXiv1207.5578S See Also -------- astroML.plotting.hist : histogram plotting function which can make use of bayesian blocks. """ # validate array input t = np.asarray(t, dtype=float) if x is not None: x = np.asarray(x) if sigma is not None: sigma = np.asarray(sigma) # verify the fitness function if fitness == 'events': if x is not None and np.any(x % 1 > 0): raise ValueError("x must be integer counts for fitness='events'") fitfunc = Events(**kwargs) elif fitness == 'regular_events': if x is not None and (np.any(x % 1 > 0) or np.any(x > 1)): raise ValueError("x must be 0 or 1 for fitness='regular_events'") fitfunc = RegularEvents(**kwargs) elif fitness == 'measures': if x is None: raise ValueError("x must be specified for fitness='measures'") fitfunc = PointMeasures(**kwargs) else: if not (hasattr(fitness, 'args') and hasattr(fitness, 'fitness') and hasattr(fitness, 'prior')): raise ValueError("fitness not understood") fitfunc = fitness # find unique values of t t = np.array(t, dtype=float) assert t.ndim == 1 unq_t, unq_ind, unq_inv = np.unique(t, return_index=True, return_inverse=True) # if x is not specified, x will be counts at each time if x is None: if sigma is not None: raise ValueError("If sigma is specified, x must be specified") if len(unq_t) == len(t): x = np.ones_like(t) else: x = np.bincount(unq_inv) t = unq_t sigma = 1 # if x is specified, then we need to sort t and x together else: x = np.asarray(x) if len(t) != len(x): raise ValueError("Size of t and x does not match") if len(unq_t) != len(t): raise ValueError("Repeated values in t not supported when " "x is specified") t = unq_t x = x[unq_ind] # verify the given sigma value N = t.size if sigma is not None: sigma = np.asarray(sigma) if sigma.shape not in [(), (1,), (N,)]: raise ValueError('sigma does not match the shape of x') else: sigma = 1 # validate the input fitfunc.validate_input(t, x, sigma) # compute values needed for computation, below if 'a_k' in fitfunc.args: ak_raw = np.ones_like(x) / sigma / sigma if 'b_k' in fitfunc.args: bk_raw = x / sigma / sigma if 'c_k' in fitfunc.args: ck_raw = x * x / sigma / sigma # create length-(N + 1) array of cell edges edges = np.concatenate([t[:1], 0.5 * (t[1:] + t[:-1]), t[-1:]]) block_length = t[-1] - edges # 
arrays to store the best configuration best = np.zeros(N, dtype=float) last = np.zeros(N, dtype=int) #----------------------------------------------------------------- # Start with first data cell; add one cell at each iteration #----------------------------------------------------------------- for R in range(N): # Compute fit_vec : fitness of putative last block (end at R) kwds = {} # T_k: width/duration of each block if 'T_k' in fitfunc.args: kwds['T_k'] = block_length[:R + 1] - block_length[R + 1] # N_k: number of elements in each block if 'N_k' in fitfunc.args: kwds['N_k'] = np.cumsum(x[:R + 1][::-1])[::-1] # a_k: eq. 31 if 'a_k' in fitfunc.args: kwds['a_k'] = 0.5 * np.cumsum(ak_raw[:R + 1][::-1])[::-1] # b_k: eq. 32 if 'b_k' in fitfunc.args: kwds['b_k'] = - np.cumsum(bk_raw[:R + 1][::-1])[::-1] # c_k: eq. 33 if 'c_k' in fitfunc.args: kwds['c_k'] = 0.5 * np.cumsum(ck_raw[:R + 1][::-1])[::-1] # evaluate fitness function fit_vec = fitfunc.fitness(**kwds) A_R = fit_vec - fitfunc.prior(R + 1, N) A_R[1:] += best[:R] i_max = np.argmax(A_R) last[R] = i_max best[R] = A_R[i_max] #----------------------------------------------------------------- # Now find changepoints by iteratively peeling off the last block #----------------------------------------------------------------- change_points = np.zeros(N, dtype=int) i_cp = N ind = N while True: i_cp -= 1 change_points[i_cp] = ind if ind == 0: break ind = last[ind - 1] change_points = change_points[i_cp:] return edges[change_points] astroML-0.3/astroML/density_estimation/density_estimation.py0000644000076500000240000001715412252721253025276 0ustar jakevdpstaff00000000000000""" Tools for density estimation See also: - sklearn.mixture.gmm : gaussian mixture models - sklearn.neighbors.KernelDensity : Kernel Density Estimation (version 0.14+) - astroML.density_estimation.XDGMM : extreme deconvolution - scipy.spatial.gaussian_kde : a gaussian KDE implementation """ import warnings import numpy as np from scipy import special from sklearn.metrics import pairwise_kernels, pairwise_distances from sklearn.neighbors import BallTree # TODO: # - KDE with errors (chp 6.1.2) def n_volume(r, n): """compute the n-volume of a sphere of radius r in n dimensions""" return np.pi ** (0.5 * n) / special.gamma(0.5 * n + 1) * (r ** n) class KDE(object): """Kernel Density Estimate .. note:: Deprecated in astroML 0.2 Scikit-learn version 0.14 added a KernelDensity estimator class which has much better performance than this class. The ``KDE`` class will be removed in astroML version 0.3. Parameters ---------- metric : string or callable ['gaussian'|'tophat'|'exponential'] or one of the options in sklearn.metrics.pairwise_kernels. See pairwise_kernels documentation for more information. For 'gaussian' or 'tophat', 'exponential', and 'quadratic', the results will be properly normalized in D dimensions. This may not be the case for other metrics. h : float (optional) if metric is 'gaussian' or 'tophat', h gives the width of the kernel. Otherwise, h is not referenced. **kwargs : other keywords will be passed to the sklearn.metrics.pairwise_kernels function. Notes ----- Kernel forms are as follows: - 'gaussian' : K(x, y) ~ exp( -0.5 (x - y)^2 / h^2 ) - 'tophat' : K(x, y) ~ 1 if abs(x - y) < h ~ 0 otherwise - 'exponential' : K(x, y) ~ exp(- abs(x - y) / h) - 'quadratic' : K(x, y) ~ (1 - (x - y)^2) if abs(x) < 1 ~ 0 otherwise All are properly normalized, so that their integral over all space is 1. 
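    A short usage sketch of this (deprecated) interface, on randomly
    generated stand-in data:

    >>> import numpy as np
    >>> X = np.random.normal(size=(100, 2))
    >>> kde = KDE(metric='gaussian', h=0.5)  # doctest: +SKIP
    >>> dens = kde.fit(X).eval(X)            # doctest: +SKIP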
See Also -------- - sklearn.mixture.gmm : gaussian mixture models - KNeighborsDenstiy: nearest neighbors density estimation - scipy.spatial.gaussian_kde : a gaussian KDE implementation """ def __init__(self, metric='gaussian', h=None, **kwargs): warnings.warn("The KDE class is deprecated as of version 0.2 and will " "be removed in version 0.3. Upgrade to scikit-learn " "version >0.14 and use sklearn.neighbors.KernelDensity " "instead.") self.metric = metric self.kwargs = kwargs self.h = h self.factor = lambda ndim: 1 def fit(self, X): """Train the kernel density estimator Parameters ---------- X : array_like array of points to use to train the KDE. Shape is (n_points, n_dim) """ self.X_ = np.atleast_2d(X) if self.X_.ndim != 2: raise ValueError('X must be two-dimensional') return self def eval(self, X): """Evaluate the kernel density estimation Parameters ---------- X : array_like array of points at which to evaluate the KDE. Shape is (n_points, n_dim), where n_dim matches the dimension of the training points. Returns ------- dens : ndarray array of shape (n_points,) giving the density at each point. The density will be normalized for metric='gaussian' or metric='tophat', and will be unnormalized otherwise. """ X = np.atleast_2d(X) if X.ndim != 2: raise ValueError('X must be two-dimensional') if X.shape[1] != self.X_.shape[1]: raise ValueError('dimensions of X do not match training dimension') if self.metric == 'gaussian': # wrangle gaussian into scikit-learn's 'rbf' kernel gamma = 0.5 / self.h / self.h D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma) D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1])) dens = D.sum(1) elif self.metric == 'tophat': # use Ball Tree to efficiently count neighbors bt = BallTree(self.X_) counts = bt.query_radius(X, self.h, count_only=True) dens = counts / n_volume(self.h, X.shape[1]) elif self.metric == 'exponential': D = pairwise_distances(X, self.X_) dens = np.exp(-abs(D) / self.h) dens = dens.sum(1) dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1]) elif self.metric == 'quadratic': D = pairwise_distances(X, self.X_) dens = (1 - (D / self.h) ** 2) dens[D > self.h] = 0 dens = dens.sum(1) dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2) else: D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs) dens = D.sum(1) return dens class KNeighborsDensity(object): """K-neighbors density estimation Parameters ---------- method : string method to use. Must be one of ['simple'|'bayesian'] (see below) n_neighbors : int number of neighbors to use Notes ----- The two methods are as follows: - simple: The density at a point x is estimated by n(x) ~ k / r_k^n - bayesian: The density at a point x is estimated by n(x) ~ sum_{i=1}^k[1 / r_i^n]. See Also -------- KDE : kernel density estimation """ def __init__(self, method='bayesian', n_neighbors=10): if method not in ['simple', 'bayesian']: raise ValueError("method = %s not recognized" % method) self.n_neighbors = n_neighbors self.method = method def fit(self, X): """Train the K-neighbors density estimator Parameters ---------- X : array_like array of points to use to train the KDE. Shape is (n_points, n_dim) """ self.X_ = np.atleast_2d(X) if self.X_.ndim != 2: raise ValueError('X must be two-dimensional') self.bt_ = BallTree(self.X_) return self def eval(self, X): """Evaluate the kernel density estimation Parameters ---------- X : array_like array of points at which to evaluate the KDE. Shape is (n_points, n_dim), where n_dim matches the dimension of the training points. 
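        Examples
        --------
        A short sketch using randomly generated stand-in data (the output
        shape follows directly from the input shape):

        >>> import numpy as np
        >>> X = np.random.normal(size=(100, 2))
        >>> knd = KNeighborsDensity(method='bayesian', n_neighbors=5).fit(X)
        >>> knd.eval(X).shape
        (100,)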
Returns ------- dens : ndarray array of shape (n_points,) giving the density at each point. The density will be normalized for metric='gaussian' or metric='tophat', and will be unnormalized otherwise. """ X = np.atleast_2d(X) if X.ndim != 2: raise ValueError('X must be two-dimensional') if X.shape[1] != self.X_.shape[1]: raise ValueError('dimensions of X do not match training dimension') dist, ind = self.bt_.query(X, self.n_neighbors, return_distance=True) k = float(self.n_neighbors) ndim = X.shape[1] if self.method == 'simple': return k / n_volume(dist[:, -1], ndim) elif self.method == 'bayesian': # XXX this may be wrong in more than 1 dimension! return (k * (k + 1) * 0.5 / n_volume(1, ndim) / (dist ** ndim).sum(1)) else: raise ValueError("Unrecognized method '%s'" % self.method) return dens astroML-0.3/astroML/density_estimation/empirical.py0000644000076500000240000000626212252721253023326 0ustar jakevdpstaff00000000000000import numpy as np from scipy import interpolate from ..utils import check_random_state class FunctionDistribution(object): """Generate random variables distributed according to an arbitrary function Parameters ---------- func : function func should take an array of x values, and return an array proportional to the probability density at each value xmin : float minimum value of interest xmax : float maximum value of interest Nx : int (optional) number of samples to draw. Default is 1000 random_state : None, int, or np.random.RandomState instance random seed or random number generator func_args : dictionary (optional) additional keyword arguments to be passed to func """ def __init__(self, func, xmin, xmax, Nx=1000, random_state=None, func_args=None): self.random_state = check_random_state(random_state) if func_args is None: func_args = {} x = np.linspace(xmin, xmax, Nx) Px = func(x, **func_args) # if there are too many zeros, interpolation will fail positive = (Px > 1E-10 * Px.max()) x = x[positive] Px = Px[positive].cumsum() Px /= Px[-1] self._tck = interpolate.splrep(Px, x) def rvs(self, shape): """Draw random variables from the distribution Parameters ---------- shape : integer or tuple shape of desired array Returns ------- rv : ndarray, shape=shape random variables """ # generate uniform variables between 0 and 1 y = self.random_state.random_sample(shape) return interpolate.splev(y, self._tck) class EmpiricalDistribution(object): """Empirically learn a distribution from one-dimensional data Parameters ---------- data : one-dimensional array input data Examples -------- >>> import numpy as np >>> np.random.seed(0) >>> x = np.random.normal(size=10000) # normally-distributed variables >>> x.mean(), x.std() (-0.018433720158265783, 0.98755656817612003) >>> x2 = EmpiricalDistribution(x).rvs(10000) >>> x2.mean(), x2.std() (-0.020293716681613363, 1.0039249294845276) Notes ----- This function works by approximating the inverse of the cumulative distribution using an efficient spline fit to the sorted values. 
""" def __init__(self, data): # copy, because we'll need to sort in-place data = np.array(data, copy=True) if data.ndim != 1: raise ValueError("data should be one-dimensional") data.sort() # set up spline y = np.linspace(0, 1, data.size) self._tck = interpolate.splrep(y, data) def rvs(self, shape): """Draw random variables from the distribution Parameters ---------- shape : integer or tuple shape of desired array Returns ------- rv : ndarray, shape=shape random variables """ # generate uniform variables between 0 and 1 y = np.random.random(shape) return interpolate.splev(y, self._tck) astroML-0.3/astroML/density_estimation/gauss_mixture.py0000644000076500000240000000240312252721253024251 0ustar jakevdpstaff00000000000000import numpy as np from sklearn.mixture import GMM class GaussianMixture1D(object): """ Simple class to work with 1D mixtures of Gaussians Parameters ---------- means : array_like means of component distributions (default = 0) sigmas : array_like standard deviations of component distributions (default = 1) weights : array_like weight of component distributions (default = 1) """ def __init__(self, means=0, sigmas=1, weights=1): data = np.array([t for t in np.broadcast(means, sigmas, weights)]) self._gmm = GMM(data.shape[0]) self._gmm.fit = None # disable fit method for safety self._gmm.means_ = data[:, :1] self._gmm.covars_ = data[:, 1:2] ** 2 self._gmm.weights = data[:, 2] / data[:, 2].sum() def sample(self, size): """Random sample""" return self._gmm.sample(size) def pdf(self, x): """Compute probability distribution""" logprob, responsibilities = self._gmm.eval(x) return np.exp(logprob) def pdf_individual(self, x): """Compute probability distribution of each component""" logprob, responsibilities = self._gmm.eval(x) return responsibilities * np.exp(logprob[:, np.newaxis]) astroML-0.3/astroML/density_estimation/histtools.py0000644000076500000240000001746112437206505023417 0ustar jakevdpstaff00000000000000""" Tools for working with distributions """ import numpy as np from astroML.density_estimation import bayesian_blocks from scipy.special import gammaln from scipy import optimize def scotts_bin_width(data, return_bins=False): r"""Return the optimal histogram bin width using Scott's rule: Parameters ---------- data : array-like, ndim=1 observed (one-dimensional) data return_bins : bool (optional) if True, then return the bin edges Returns ------- width : float optimal bin width using Scott's rule bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal bin width is .. math:: \Delta_b = \frac{3.5\sigma}{n^{1/3}} where :math:`\sigma` is the standard deviation of the data, and :math:`n` is the number of data points. See Also -------- knuth_bin_width freedman_bin_width astroML.plotting.hist """ data = np.asarray(data) if data.ndim != 1: raise ValueError("data should be one-dimensional") n = data.size sigma = np.std(data) dx = 3.5 * sigma * 1. / (n ** (1. / 3)) if return_bins: Nbins = np.ceil((data.max() - data.min()) * 1. 
/ dx) Nbins = max(1, Nbins) bins = data.min() + dx * np.arange(Nbins + 1) return dx, bins else: return dx def freedman_bin_width(data, return_bins=False): r"""Return the optimal histogram bin width using the Freedman-Diaconis rule Parameters ---------- data : array-like, ndim=1 observed (one-dimensional) data return_bins : bool (optional) if True, then return the bin edges Returns ------- width : float optimal bin width using Scott's rule bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal bin width is .. math:: \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}} where :math:`q_{N}` is the :math:`N` percent quartile of the data, and :math:`n` is the number of data points. See Also -------- knuth_bin_width scotts_bin_width astroML.plotting.hist """ data = np.asarray(data) if data.ndim != 1: raise ValueError("data should be one-dimensional") n = data.size if n < 4: raise ValueError("data should have more than three entries") dsorted = np.sort(data) v25 = dsorted[n / 4 - 1] v75 = dsorted[(3 * n) / 4 - 1] dx = 2 * (v75 - v25) * 1. / (n ** (1. / 3)) if return_bins: Nbins = np.ceil((dsorted[-1] - dsorted[0]) * 1. / dx) Nbins = max(1, Nbins) bins = dsorted[0] + dx * np.arange(Nbins + 1) return dx, bins else: return dx class KnuthF(object): r"""Class which implements the function minimized by knuth_bin_width Parameters ---------- data : array-like, one dimension data to be histogrammed Notes ----- the function F is given by .. math:: F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2}) - M\log\Gamma(\frac{1}{2}) - \log\Gamma(\frac{2n+M}{2}) + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2}) where :math:`\Gamma` is the Gamma function, :math:`n` is the number of data points, :math:`n_k` is the number of measurements in bin :math:`k`. See Also -------- knuth_bin_width astroML.plotting.hist """ def __init__(self, data): self.data = np.array(data, copy=True) if self.data.ndim != 1: raise ValueError("data should be 1-dimensional") self.data.sort() self.n = self.data.size def bins(self, M): """Return the bin edges given a width dx""" return np.linspace(self.data[0], self.data[-1], int(M) + 1) def __call__(self, M): return self.eval(M) def eval(self, M): """Evaluate the Knuth function Parameters ---------- dx : float Width of bins Returns ------- F : float evaluation of the negative Knuth likelihood function: smaller values indicate a better fit. """ M = int(M) if M <= 0: return np.inf bins = self.bins(M) nk, bins = np.histogram(self.data, bins) return -(self.n * np.log(M) + gammaln(0.5 * M) - M * gammaln(0.5) - gammaln(self.n + 0.5 * M) + np.sum(gammaln(nk + 0.5))) def knuth_bin_width(data, return_bins=False, disp=True): r"""Return the optimal histogram bin width using Knuth's rule [1]_ Parameters ---------- data : array-like, ndim=1 observed (one-dimensional) data return_bins : bool (optional) if True, then return the bin edges Returns ------- dx : float optimal bin width. Bins are measured starting at the first data point. bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal number of bins is the value M which maximizes the function .. math:: F(M|x,I) = n\log(M) + \log\Gamma(\frac{M}{2}) - M\log\Gamma(\frac{1}{2}) - \log\Gamma(\frac{2n+M}{2}) + \sum_{k=1}^M \log\Gamma(n_k + \frac{1}{2}) where :math:`\Gamma` is the Gamma function, :math:`n` is the number of data points, :math:`n_k` is the number of measurements in bin :math:`k`. References ---------- .. [1] Knuth, K.H. "Optimal Data-Based Binning for Histograms". 
arXiv:0605197, 2006 See Also -------- KnuthF freedman_bin_width scotts_bin_width """ knuthF = KnuthF(data) dx0, bins0 = freedman_bin_width(data, True) M0 = len(bins0) - 1 M = optimize.fmin(knuthF, len(bins0), disp=disp)[0] bins = knuthF.bins(M) dx = bins[1] - bins[0] if return_bins: return dx, bins else: return dx def histogram(a, bins=10, range=None, **kwargs): """Enhanced histogram This is a histogram function that enables the use of more sophisticated algorithms for determining bins. Aside from the `bins` argument allowing a string specified how bins are computed, the parameters are the same as numpy.histogram(). Parameters ---------- a : array_like array of data to be histogrammed bins : int or list or str (optional) If bins is a string, then it must be one of: 'blocks' : use bayesian blocks for dynamic bin widths 'knuth' : use Knuth's rule to determine bins 'scotts' : use Scott's rule to determine bins 'freedman' : use the Freedman-diaconis rule to determine bins range : tuple or None (optional) the minimum and maximum range for the histogram. If not specified, it will be (x.min(), x.max()) other keyword arguments are described in numpy.hist(). Returns ------- hist : array The values of the histogram. See `normed` and `weights` for a description of the possible semantics. bin_edges : array of dtype float Return the bin edges ``(length(hist)+1)``. See Also -------- numpy.histogram astroML.plotting.hist """ a = np.asarray(a) # if range is specified, we need to truncate the data for # the bin-finding routines if (range is not None and (bins in ['blocks', 'knuth', 'scotts', 'freedman'])): a = a[(a >= range[0]) & (a <= range[1])] if bins == 'blocks': bins = bayesian_blocks(a) elif bins == 'knuth': da, bins = knuth_bin_width(a, True) elif bins == 'scotts': da, bins = scotts_bin_width(a, True) elif bins == 'freedman': da, bins = freedman_bin_width(a, True) elif isinstance(bins, str): raise ValueError("unrecognized bin code: '%s'" % bins) return np.histogram(a, bins, range, **kwargs) astroML-0.3/astroML/density_estimation/tests/0000755000076500000240000000000012462244012022137 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/density_estimation/tests/__init__.py0000644000076500000240000000000012252721253024242 0ustar jakevdpstaff00000000000000astroML-0.3/astroML/density_estimation/tests/test_bayesian_blocks.py0000644000076500000240000000314112115147567026713 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose, assert_ from astroML.density_estimation import bayesian_blocks def test_single_change_point(): np.random.seed(0) x = np.concatenate([np.random.random(100), 1 + np.random.random(200)]) bins = bayesian_blocks(x) assert_(len(bins) == 3) assert_allclose(bins[1], 1, rtol=0.02) def test_duplicate_events(): t = np.random.random(100) t[80:] = t[:20] x = np.ones_like(t) x[:20] += 1 bins1 = bayesian_blocks(t) bins2 = bayesian_blocks(t[:80], x[:80]) assert_allclose(bins1, bins2) def test_measures_fitness_homoscedastic(): np.random.seed(0) t = np.linspace(0, 1, 11) x = np.exp(-0.5 * (t - 0.5) ** 2 / 0.01 ** 2) sigma = 0.05 x = np.random.normal(x, sigma) bins = bayesian_blocks(t, x, sigma, fitness='measures') assert_allclose(bins, [0, 0.45, 0.55, 1]) def test_measures_fitness_heteroscedastic(): np.random.seed(1) t = np.linspace(0, 1, 11) x = np.exp(-0.5 * (t - 0.5) ** 2 / 0.01 ** 2) sigma = 0.02 + 0.02 * np.random.random(len(x)) x = np.random.normal(x, sigma) bins = bayesian_blocks(t, x, sigma, fitness='measures') assert_allclose(bins, [0, 0.45, 0.55, 1]) 
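# Editor's sketch (not part of the original test suite): a minimal usage
# example showing the typical end-to-end call pattern for bayesian_blocks
# on hypothetical event data -- the returned edges feed directly into
# np.histogram.
def example_bayesian_blocks_usage():
    np.random.seed(42)
    # events drawn from two regions of different rates
    t = np.concatenate([np.random.random(100), 2 + np.random.random(100)])
    edges = bayesian_blocks(t)              # optimal, variable-width edges
    counts, edges = np.histogram(t, bins=edges)
    assert counts.sum() == len(t)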
def test_regular_events(): np.random.seed(0) dt = 0.01 steps = np.concatenate([np.unique(np.random.randint(0, 500, 100)), np.unique(np.random.randint(500, 1000, 200))]) t = dt * steps bins = bayesian_blocks(t, fitness='regular_events', dt=dt) assert_(len(bins) == 3) assert_allclose(bins[1], 5, rtol=0.05) astroML-0.3/astroML/density_estimation/tests/test_density.py0000644000076500000240000000165612115147567025253 0ustar jakevdpstaff00000000000000""" Test density estimation techniques """ import numpy as np from numpy.testing import assert_allclose from scipy.stats import norm from astroML.density_estimation import KDE, KNeighborsDensity def check_1D_density(clf, X, X2, true_dens, atol): clf.fit(X) dens = clf.eval(X2) assert_allclose(dens, true_dens, atol=atol) def test_1D_density(): np.random.seed(0) dist = norm(0, 1) X = dist.rvs((5000, 1)) X2 = np.linspace(-5, 5, 10).reshape((10, 1)) true_dens = dist.pdf(X2[:, 0]) * X.shape[0] classifiers = [KDE('gaussian', h=0.1), KDE('tophat', h=0.2), KDE('exponential', h=0.1), KDE('quadratic', h=0.2), KNeighborsDensity(method='simple', n_neighbors=250), KNeighborsDensity(method='bayesian', n_neighbors=250)] for clf in classifiers: yield (check_1D_density, clf, X, X2, true_dens, 100) astroML-0.3/astroML/density_estimation/tests/test_empirical.py0000644000076500000240000000151312115147567025531 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from scipy.stats import norm from astroML.density_estimation import\ EmpiricalDistribution, FunctionDistribution def test_empirical_distribution(N=1000, rseed=0): np.random.seed(rseed) X = norm.rvs(0, 1, size=N) dist = EmpiricalDistribution(X) X2 = dist.rvs(N) meanX = X.mean() meanX2 = X2.mean() stdX = X.std() stdX2 = X2.std() assert_allclose([meanX, stdX], [meanX2, stdX2], atol=3 / np.sqrt(N)) def test_function_distribution(N=1000, rseed=0): f = norm(0, 1).pdf # go from -10 to 10 to check interpolation in presence of zeros dist = FunctionDistribution(f, -10, 10) np.random.seed(rseed) X = dist.rvs(N) meanX = X.mean() stdX = X.std() assert_allclose([meanX, stdX], [0, 1], atol=3 / np.sqrt(N)) astroML-0.3/astroML/density_estimation/tests/test_hist_binwidth.py0000644000076500000240000000224412115147567026425 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose, assert_ from astroML.density_estimation import \ scotts_bin_width, freedman_bin_width, knuth_bin_width, histogram def test_scotts_bin_width(N=10000, rseed=0): np.random.seed(rseed) X = np.random.normal(size=N) delta = scotts_bin_width(X) assert_allclose(delta, 3.5 * np.std(X) / N ** (1. / 3)) def test_freedman_bin_width(N=10000, rseed=0): np.random.seed(rseed) X = np.random.normal(size=N) delta = freedman_bin_width(X) indices = np.argsort(X) i25 = indices[N / 4 - 1] i75 = indices[(3 * N) / 4 - 1] assert_allclose(delta, 2 * (X[i75] - X[i25]) / N ** (1. 
/ 3)) def test_knuth_bin_width(N=10000, rseed=0): np.random.seed(0) X = np.random.normal(size=N) dx, bins = knuth_bin_width(X, return_bins=True) assert_allclose(len(bins), 59) def test_histogram(N=1000, rseed=0): np.random.seed(0) x = np.random.normal(0, 1, N) for bins in [30, np.linspace(-5, 5, 31), 'knuth', 'scotts', 'freedman']: counts, bins = histogram(x, bins) assert_(counts.sum() == len(x)) assert_(len(counts) == len(bins) - 1) astroML-0.3/astroML/density_estimation/tests/test_xdeconv.py0000644000076500000240000000241112115147567025230 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from astroML.density_estimation import XDGMM def test_XDGMM_1D_gaussian(N=100, sigma=0.1): np.random.seed(0) mu = 0 V = 1 X = np.random.normal(mu, V, size=(N, 1)) X += np.random.normal(0, sigma, size=(N, 1)) Xerr = sigma ** 2 * np.ones((N, 1, 1)) xdgmm = XDGMM(1).fit(X, Xerr) # because of sample variance, results will be similar # but not identical. We'll use a fudge factor of 0.1 assert_allclose(mu, xdgmm.mu[0], atol=0.1) assert_allclose(V, xdgmm.V[0], atol=0.1) def check_single_gaussian(N=100, D=3, sigma=0.1): np.random.seed(0) mu = np.random.random(D) V = np.random.random((D, D)) V = np.dot(V, V.T) X = np.random.multivariate_normal(mu, V, size=N) Xerr = np.zeros((N, D, D)) Xerr[:, range(D), range(D)] = sigma ** 2 X += np.random.normal(0, sigma, X.shape) xdgmm = XDGMM(1) xdgmm.fit(X, Xerr) # because of sample variance, results will be similar # but not identical. We'll use a fudge factor of 0.1 assert_allclose(mu, xdgmm.mu[0], atol=0.1) assert_allclose(V, xdgmm.V[0], atol=0.1) def test_single_gaussian(N=100, sigma=0.1): for D in (1, 2, 3): yield (check_single_gaussian, N, D, sigma) astroML-0.3/astroML/density_estimation/xdeconv.py0000644000076500000240000001535212420767763023044 0ustar jakevdpstaff00000000000000""" Extreme deconvolution solver This follows Bovy et al. http://arxiv.org/pdf/0905.2979v2.pdf Arbitrary mixing matrices R are not yet implemented: currently, this only works with R = I. """ from __future__ import print_function, division from time import time import numpy as np from scipy import linalg from sklearn.mixture import GMM from ..utils import logsumexp, log_multivariate_gaussian, check_random_state class XDGMM(object): """Extreme Deconvolution Fit an extreme deconvolution (XD) model to the data Parameters ---------- n_components: integer number of gaussian components to fit to the data n_iter: integer (optional) number of EM iterations to perform (default=100) tol: float (optional) stopping criterion for EM iterations (default=1E-5) Notes ----- This implementation follows Bovy et al. arXiv 0905.2979 """ def __init__(self, n_components, n_iter=100, tol=1E-5, verbose=False, random_state = None): self.n_components = n_components self.n_iter = n_iter self.tol = tol self.verbose = verbose self.random_state = random_state # model parameters: these are set by the fit() method self.V = None self.mu = None self.alpha = None def fit(self, X, Xerr, R=None): """Fit the XD model to data Parameters ---------- X: array_like Input data. shape = (n_samples, n_features) Xerr: array_like Error on input data. shape = (n_samples, n_features, n_features) R : array_like (TODO: not implemented) Transformation matrix from underlying to observed data. If unspecified, then it is assumed to be the identity matrix. 
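Examples
--------
A minimal sketch with synthetic, hypothetical data (the exact fitted
parameters vary with the random draw)::

    import numpy as np
    np.random.seed(0)
    X = np.random.normal(0, 1, size=(100, 1))     # observed data
    Xerr = 0.1 ** 2 * np.ones((100, 1, 1))        # per-point covariances
    model = XDGMM(n_components=1).fit(X, Xerr)
    logL = model.logL(X, Xerr)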
""" if R is not None: raise NotImplementedError("mixing matrix R is not yet implemented") X = np.asarray(X) Xerr = np.asarray(Xerr) n_samples, n_features = X.shape # assume full covariances of data assert Xerr.shape == (n_samples, n_features, n_features) # initialize components via a few steps of GMM # this doesn't take into account errors, but is a fast first-guess gmm = GMM(self.n_components, n_iter=10, covariance_type='full', random_state=self.random_state).fit(X) self.mu = gmm.means_ self.alpha = gmm.weights_ self.V = gmm.covars_ logL = self.logL(X, Xerr) for i in range(self.n_iter): t0 = time() self._EMstep(X, Xerr) logL_next = self.logL(X, Xerr) t1 = time() if self.verbose: print("%i: log(L) = %.5g" % (i + 1, logL_next)) print(" (%.2g sec)" % (t1 - t0)) if logL_next < logL + self.tol: break logL = logL_next return self def logprob_a(self, X, Xerr): """ Evaluate the probability for a set of points Parameters ---------- X: array_like Input data. shape = (n_samples, n_features) Xerr: array_like Error on input data. shape = (n_samples, n_features, n_features) Returns ------- p: ndarray Probabilities. shape = (n_samples,) """ X = np.asarray(X) Xerr = np.asarray(Xerr) n_samples, n_features = X.shape # assume full covariances of data assert Xerr.shape == (n_samples, n_features, n_features) X = X[:, np.newaxis, :] Xerr = Xerr[:, np.newaxis, :, :] T = Xerr + self.V return log_multivariate_gaussian(X, self.mu, T) def logL(self, X, Xerr): """Compute the log-likelihood of data given the model Parameters ---------- X: array_like data, shape = (n_samples, n_features) Xerr: array_like errors, shape = (n_samples, n_features, n_features) Returns ------- logL : float log-likelihood """ return np.sum(logsumexp(self.logprob_a(X, Xerr), -1)) def _EMstep(self, X, Xerr): """ Perform the E-step (eq 16 of Bovy et al) """ n_samples, n_features = X.shape X = X[:, np.newaxis, :] Xerr = Xerr[:, np.newaxis, :, :] w_m = X - self.mu T = Xerr + self.V #------------------------------------------------------------ # compute inverse of each covariance matrix T Tshape = T.shape T = T.reshape([n_samples * self.n_components, n_features, n_features]) Tinv = np.array([linalg.inv(T[i]) for i in range(T.shape[0])]).reshape(Tshape) T = T.reshape(Tshape) #------------------------------------------------------------ # evaluate each mixture at each point N = np.exp(log_multivariate_gaussian(X, self.mu, T, Vinv=Tinv)) #------------------------------------------------------------ # E-step: # compute q_ij, b_ij, and B_ij q = (N * self.alpha) / np.dot(N, self.alpha)[:, None] tmp = np.sum(Tinv * w_m[:, :, np.newaxis, :], -1) b = self.mu + np.sum(self.V * tmp[:, :, np.newaxis, :], -1) tmp = np.sum(Tinv[:, :, :, :, np.newaxis] * self.V[:, np.newaxis, :, :], -2) B = self.V - np.sum(self.V[:, :, :, np.newaxis] * tmp[:, :, np.newaxis, :, :], -2) #------------------------------------------------------------ # M-step: # compute alpha, m, V qj = q.sum(0) self.alpha = qj / n_samples self.mu = np.sum(q[:, :, np.newaxis] * b, 0) / qj[:, np.newaxis] m_b = self.mu - b tmp = m_b[:, :, np.newaxis, :] * m_b[:, :, :, np.newaxis] tmp += B tmp *= q[:, :, np.newaxis, np.newaxis] self.V = tmp.sum(0) / qj[:, np.newaxis, np.newaxis] def sample(self, size=1, random_state=None): if random_state is None: random_state = self.random_state rng = check_random_state(random_state) shape = tuple(np.atleast_1d(size)) + (self.mu.shape[1],) npts = np.prod(size) alpha_cs = np.cumsum(self.alpha) r = np.atleast_1d(np.random.random(size)) r.sort() ind = 
r.searchsorted(alpha_cs) ind = np.concatenate(([0], ind)) if ind[-1] != size: ind[-1] = size draw = np.vstack([np.random.multivariate_normal(self.mu[i], self.V[i], (ind[i + 1] - ind[i],)) for i in range(len(self.alpha))]) return draw.reshape(shape) astroML-0.3/astroML/dimensionality/0000755000076500000240000000000012462244012020112 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/dimensionality/__init__.py0000644000076500000240000000005112420767763022240 0ustar jakevdpstaff00000000000000from .iterative_pca import iterative_pca astroML-0.3/astroML/dimensionality/iterative_pca.py0000644000076500000240000001147612462227504023323 0ustar jakevdpstaff00000000000000import sys import numpy as np from scipy.linalg import solve def iterative_pca(X, M, n_ev=5, n_iter=15, norm=None, full_output=False): """ Parameters ---------- X: ndarray, shape = (n_samples, n_features) input data M: ndarray, bool, shape = (n_samples, n_features) mask for input data. where mask == True, the spectrum is unconstrained n_ev: int number of eigenvectors to use in reconstructing masked regions n_iter: int number of iterations to find eigenvectors norm: string what type of normalization to use on the data. Options are - None : no normalization - 'L1' : L1-norm - 'L2' : L2-norm full_output: boolean (optional) if False (default) return only the reconstructed data X_recons if True, return the full information (see below) Returns ------- X_recons: ndarray, shape = (n_samples, n_features) data with masked regions reconstructed mu: ndarray, shape = (n_features,) mean of data evecs: ndarray, shape = (min(n_samples, n_features), n_features) eigenvectors of the reconstructed data evals: ndarray, size = min(n_samples, n_features) eigenvalues of the reconstructed data norms: ndarray, size = n_samples normalization of each input coeffs: ndarray, size = (n_samples, n_ev) coefficients used to reconstruct X """ X = np.asarray(X, dtype=np.float) M = np.asarray(M, dtype=np.bool) if X.shape != M.shape: raise ValueError('X and M must have the same shape') n_samples, n_features = X.shape if np.any(M.sum(0) == n_samples): raise ValueError('Some features are masked in all samples') if type(norm) == str: norm = norm.upper() if norm not in (None, 'none', 'L1', 'L2'): raise ValueError('unrecognized norm: %s' % norm) notM = (~M) X_recons = X.copy() X_recons[M] = 0 # as an initial guess, we'll fill-in masked regions with the mean # of the rest of the sample if norm is None: mu = (X_recons * notM).sum(0) / notM.sum(0) mu = mu * np.ones([n_samples, 1]) X_recons[M] = mu[M] else: # since we're normalizing each spectrum, and the norm depends on # the filled-in values, we need to iterate a few times to make # sure things are consistent. for i in range(n_iter): # normalize if norm == 'L1': X_recons /= np.sum(X_recons, 1)[:, None] else: X_recons /= np.sqrt(np.sum(X_recons ** 2, 1))[:, None] # find the mean mu = (X_recons * notM).sum(0) / notM.sum(0) mu = mu * np.ones([n_samples, 1]) X_recons[M] = mu[M] # Matrix of coefficients coeffs = np.zeros((n_samples, n_ev)) # Now we iterate through, using the principal components to reconstruct # these regions. 
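    # Each pass below: (1) re-normalize the current reconstruction,
    # (2) take an SVD of the centered data, (3) fit the leading n_ev
    # eigenvectors to the unmasked entries of each sample by weighted
    # least squares, and (4) overwrite the masked entries with the fit.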
for i in range(n_iter): sys.stdout.write(' PCA iteration %i / %i\r' % (i + 1, n_iter)) sys.stdout.flush() # normalize the data if norm == 'L1': X_recons /= np.sum(X_recons, 1)[:, None] else: X_recons /= np.sqrt(np.sum(X_recons ** 2, 1))[:, None] # now compute the principal components mu = X_recons.mean(0) X_centered = X_recons - mu U, S, VT = np.linalg.svd(X_centered, full_matrices=False) # perform a least-squares fit to estimate the coefficients of the # first n_ev eigenvectors for each data point. # The eigenvectors are in the rows of the matrix VT. # The coefficients are given by # a_n = [V_n^T W V_n]^(-1) V_n W x # Such that x can be reconstructed via # x_n = V_n a_n # Variables here are: # x : vector length n_features. This is a data point to be # reconstructed # a_n : vector of length n. These are the reconstruction weights # V_n : eigenvector matrix of size (n_features, n). # W : diagonal weight matrix of size (n_features, n_features) # such that W[i,i] = notM[i] # x_n : vector of length n_features which approximates x VWx = np.dot(VT[:n_ev], (notM * X_centered).T) for j in range(n_samples): VWV = np.dot(VT[:n_ev], (notM[j] * VT[:n_ev]).T) coeffs[j] = solve(VWV, VWx[:, j], sym_pos=True, overwrite_a=True) X_fill = mu + np.dot(coeffs, VT[:n_ev]) X_recons[M] = X_fill[M] sys.stdout.write('\n') # un-normalize X_recons norms = np.zeros(n_samples) for i in range(n_samples): ratio_i = X[i][notM[i]] / X_recons[i][notM[i]] norms[i] = ratio_i[~np.isnan(ratio_i)][0] X_recons[i] *= norms[i] if full_output: return X_recons, mu, VT, S, norms, coeffs else: return X_recons astroML-0.3/astroML/dimensionality/tests/0000755000076500000240000000000012462244012021254 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/dimensionality/tests/__init__.py0000644000076500000240000000000012252721253023357 0ustar jakevdpstaff00000000000000astroML-0.3/astroML/dimensionality/tests/test_iterative_PCA.py0000644000076500000240000000126712115147567025366 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_array_almost_equal from astroML.dimensionality import iterative_pca def test_iterative_PCA(n_samples=50, n_features=40): np.random.seed(0) # construct some data that is well-approximated # by two principal components x = np.linspace(0, np.pi, n_features) x0 = np.linspace(0, np.pi, n_samples) X = np.sin(x) * np.cos(0.5 * (x - x0[:, None])) # mask 10% of the pixels M = (np.random.random(X.shape) > 0.9) # reconstruct and check accuracy for norm in (None, 'L1', 'L2'): X_recons = iterative_pca(X, M, n_ev=2, n_iter=10, norm=norm) assert_array_almost_equal(X, X_recons, decimal=2) astroML-0.3/astroML/filters.py0000644000076500000240000002345012420767763017121 0ustar jakevdpstaff00000000000000import numpy as np from scipy import optimize, fftpack, signal # Note: there is a scipy PR to include an improved SG filter within the # scipy.signal submodule. It should replace this when it's finished. # see http://github.com/scipy/scipy/pull/304 def savitzky_golay(y, window_size, order, deriv=0, use_fft=True): r"""Smooth (and optionally differentiate) data with a Savitzky-Golay filter This implementation is based on [1]_. The Savitzky-Golay filter removes high frequency noise from data. It has the advantage of preserving the original shape and features of the signal better than other types of filtering approaches, such as moving average techniques. Parameters ---------- y : array_like, shape (N,) the values of the time history of the signal. window_size : int the length of the window.
Must be an odd integer number. order : int the order of the polynomial used in the filtering. Must be less than `window_size` - 1. deriv: int the order of the derivative to compute (default = 0 means only smoothing) use_fft : bool if True (default) then convolve using FFT for speed Returns ------- y_smooth : ndarray, shape (N) the smoothed signal (or its n-th derivative). Notes ----- The Savitzky-Golay filter is a type of low-pass filter, particularly suited for smoothing noisy data. The main idea behind this approach is to make, for each point, a least-squares fit with a polynomial of high order over an odd-sized window centered at the point. Examples -------- >>> t = np.linspace(-4, 4, 500) >>> y = np.exp(-t ** 2) >>> np.random.seed(0) >>> y_noisy = y + np.random.normal(0, 0.05, t.shape) >>> y_smooth = savitzky_golay(y_noisy, window_size=31, order=4) >>> rms = lambda x: np.sqrt(np.mean(x ** 2)) >>> print(rms(y_noisy - y)) >>> print(rms(y_smooth - y)) References ---------- .. [1] http://www.scipy.org/Cookbook/SavitzkyGolay .. [2] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of Data by Simplified Least Squares Procedures. Analytical Chemistry, 1964, 36 (8), pp 1627-1639. .. [3] Numerical Recipes 3rd Edition: The Art of Scientific Computing W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery Cambridge University Press ISBN-13: 9780521880688 """ try: window_size = np.abs(int(window_size)) order = np.abs(int(order)) except ValueError: raise ValueError("window_size and order have to be of type int") if window_size % 2 != 1 or window_size < 1: raise TypeError("window_size must be a positive odd number") if window_size < order + 2: raise TypeError("window_size is too small for the polynomial's order") order_range = range(order + 1) half_window = (window_size - 1) // 2 # precompute coefficients b = np.mat([[k ** i for i in order_range] for k in range(-half_window, half_window + 1)]) m = np.linalg.pinv(b).A[deriv] # pad the signal at the extremes with # values taken from the signal itself firstvals = y[0] - np.abs(y[1:half_window + 1][::-1] - y[0]) lastvals = y[-1] + np.abs(y[-half_window - 1:-1][::-1] - y[-1]) y = np.concatenate((firstvals, y, lastvals)) if use_fft: return signal.fftconvolve(y, m, mode='valid') else: return np.convolve(y, m, mode='valid') def wiener_filter(t, h, signal='gaussian', noise='flat', return_PSDs=False, signal_params=None, noise_params=None): """Compute a Wiener-filtered time-series Parameters ---------- t : array_like evenly-sampled time series, length N h : array_like observations at each t signal : str (optional) currently only 'gaussian' is supported noise : str (optional) currently only 'flat' is supported return_PSDs : bool (optional) if True, then return (PSD, P_S, P_N) signal_params : tuple (optional) A starting guess at the parameters for the signal. If not specified, a suitable guess will be estimated from the data itself. (see Notes below) noise_params : tuple (optional) A starting guess at the parameters for the noise. If not specified, a suitable guess will be estimated from the data itself. (see Notes below) Returns ------- h_smooth : ndarray a smoothed version of h, length N Notes ----- The Wiener filter operates by fitting a functional form to the PSD:: PSD = P_S + P_N The resulting frequency-space filter is given by:: Phi = P_S / (P_S + P_N) This entire operation is equivalent to a kernel smoothing by a kernel whose Fourier transform is Phi.
Choosing Signal/Noise Parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The arguments ``signal_params`` and ``noise_params`` specify the initial guess for the characteristics of signal and noise used in the minimization. They are generally expected to be tuples, and the meaning varies depending on the form of signal and noise used. For ``gaussian``, the params are (amplitude, width). For ``flat``, the params are (amplitude,). See Also -------- scipy.signal.wiener : a static (non-adaptive) wiener filter """ # Validate signal if signal != 'gaussian': raise ValueError("only signal='gaussian' is supported") if signal_params is not None and len(signal_params) != 2: raise ValueError("signal_params should be length 2") # Validate noise if noise != 'flat': raise ValueError("only noise='flat' is supported") if noise_params is not None and len(noise_params) != 1: raise ValueError("noise_params should be length 1") # Validate t and h t = np.asarray(t) h = np.asarray(h) if (t.ndim != 1) or (t.shape != h.shape): raise ValueError('t and h must be equal-length 1-dimensional arrays') # compute the PSD of the input N = len(t) Df = 1. / N / (t[1] - t[0]) f = fftpack.ifftshift(Df * (np.arange(N) - N / 2)) H = fftpack.fft(h) PSD = abs(H) ** 2 # fit signal/noise params if necessary if signal_params is None: amp_guess = np.max(PSD[1:]) width_guess = np.min(np.abs(f[1:][PSD[1:] < np.mean(PSD[1:])])) signal_params = (amp_guess, width_guess) if noise_params is None: noise_params = (np.mean(PSD[1:]),) # Set up the Wiener filter: # fit a model to the PSD: sum of signal form and noise form def signal(x, A, width): width = abs(width) + 1E-99 # prevent divide-by-zero errors return A * np.exp(-0.5 * (x / width) ** 2) def noise(x, n): return n * np.ones(x.shape) # use [1:] here to remove the zero-frequency term: we don't want to # fit to this for data with an offset. min_func = lambda v: np.sum((PSD[1:] - signal(f[1:], v[0], v[1]) - noise(f[1:], v[2])) ** 2) v0 = tuple(signal_params) + tuple(noise_params) v = optimize.fmin(min_func, v0) P_S = signal(f, v[0], v[1]) P_N = noise(f, v[2]) Phi = P_S / (P_S + P_N) Phi[0] = 1 # correct for DC offset # Use Phi to filter and smooth the values h_smooth = fftpack.ifft(Phi * H) if not np.iscomplexobj(h): h_smooth = h_smooth.real if return_PSDs: return h_smooth, PSD, P_S, P_N, Phi else: return h_smooth def min_component_filter(x, y, feature_mask, p=1, fcut=None, Q=None): """Minimum component filtering Minimum component filtering is useful for determining the background component of a signal in the presence of spikes. Parameters ---------- x : array_like 1D array of evenly spaced x values y : array_like 1D array of y values corresponding to x feature_mask : array_like 1D mask array giving the locations of features in the data which should be ignored for smoothing p : integer (optional) polynomial degree to be used for the fit (default = 1) fcut : float (optional) the cutoff frequency for the low-pass filter. Default value is f_nyq / sqrt(N) Q : float (optional) the strength of the low-pass filter. Larger Q means a steeper cutoff. Default value is 0.1 * fcut Returns ------- y_filtered : ndarray The filtered version of y.
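Examples
--------
A minimal sketch with a hypothetical spectrum containing one strong
feature; the mask tells the filter which pixels to ignore when fitting
the background::

    import numpy as np
    np.random.seed(0)
    x = np.linspace(0, 1, 1024)
    y = 1.0 + 0.1 * x + np.random.normal(0, 0.01, x.shape)
    y[480:520] -= 0.5                     # a strong absorption feature
    mask = np.zeros(x.shape, dtype=bool)
    mask[470:530] = True                  # ignore the feature region
    y_background = min_component_filter(x, y, mask)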
Notes ----- This code follows the procedure explained in the book "Practical Statistics for Astronomers" by Wall & Jenkins, as well as in Wall, J, A&A 122:371, 1997 """ x = np.asarray(x, dtype=float) y = np.asarray(y, dtype=float) feature_mask = np.asarray(feature_mask, dtype=bool) if ((x.ndim != 1) or (x.shape != y.shape) or (y.shape != feature_mask.shape)): raise ValueError('x, y, and feature_mask must be 1 dimensional ' 'with matching lengths') if fcut is None: f_nyquist = 1. / (x[1] - x[0]) fcut = f_nyquist / np.sqrt(len(x)) if Q is None: Q = 0.1 * fcut # compute polynomial features XX = x[:, None] ** np.arange(p + 1) # compute least-squares fit to non-masked data beta = np.linalg.lstsq(XX[~feature_mask], y[~feature_mask])[0] # subtract polynomial fit and mask the data y_mask = y - np.dot(XX, beta) y_mask[feature_mask] = 0 # get Fourier transforms of arrays yFT_mask = fftpack.fft(y_mask) # compute (shifted) frequency array for filter N = len(x) f = fftpack.ifftshift((np.arange(N) - N / 2.) * 1. / N / (x[1] - x[0])) # construct low-pass filter filt = np.exp(- (Q * (abs(f) - fcut) / fcut) ** 2) filt[abs(f) < fcut] = 1 # reconstruct filtered signal y_filtered = fftpack.ifft(yFT_mask * filt).real + np.dot(XX, beta) return y_filtered astroML-0.3/astroML/fourier.py0000644000076500000240000001550612115147567017122 0ustar jakevdpstaff00000000000000import numpy as np try: # use scipy if available: it's faster from scipy.fftpack import fft, ifft, fftshift, ifftshift except ImportError: from numpy.fft import fft, ifft, fftshift, ifftshift def FT_continuous(t, h, axis=-1, method=1): """Approximate a continuous 1D Fourier Transform with sampled data. This function uses the Fast Fourier Transform to approximate the continuous fourier transform of a sampled function, using the convention .. math:: H(f) = \int h(t) \exp(-2 \pi i f t) dt It returns f and H, which approximate H(f). Parameters ---------- t : array_like regularly sampled array of times t is assumed to be regularly spaced, i.e. t = t0 + Dt * np.arange(N) h : array_like real or complex signal at each time axis : int axis along which to perform fourier transform. This axis must be the same length as t. Returns ------- f : ndarray frequencies of result. Units are the same as 1/t H : ndarray Fourier coefficients at each frequency. """ assert t.ndim == 1 assert h.shape[axis] == t.shape[0] N = len(t) if N % 2 != 0: raise ValueError("number of samples must be even") Dt = t[1] - t[0] Df = 1. / (N * Dt) t0 = t[N // 2] f = Df * (np.arange(N) - N / 2) shape = np.ones(h.ndim, dtype=int) shape[axis] = N phase = np.ones(N) phase[1::2] = -1 phase = phase.reshape(shape) if method == 1: H = Dt * fft(h * phase, axis=axis) else: H = Dt * fftshift(fft(h, axis=axis), axes=axis) H *= phase H *= np.exp(-2j * np.pi * t0 * f.reshape(shape)) H *= np.exp(-1j * np.pi * N / 2) return f, H def IFT_continuous(f, H, axis=-1, method=1): """Approximate a continuous 1D Inverse Fourier Transform with sampled data. This function uses the Fast Fourier Transform to approximate the continuous inverse fourier transform of a sampled function, using the convention .. math:: H(f) = integral[ h(t) exp(-2 pi i f t) dt] h(t) = integral[ H(f) exp(2 pi i f t) df] It returns t and h, which approximate h(t). Parameters ---------- f : array_like regularly sampled array of frequencies. f is assumed to be regularly spaced, i.e. f = f0 + Df * np.arange(N) H : array_like real or complex Fourier coefficients at each frequency axis : int axis along which to perform fourier transform. This axis must be the same length as f.
Returns ------- t : ndarray times of result. Units are the same as 1/f h : ndarray approximation of the inverse Fourier transform h(t). """ assert f.ndim == 1 assert H.shape[axis] == f.shape[0] N = len(f) if N % 2 != 0: raise ValueError("number of samples must be even") f0 = f[0] Df = f[1] - f[0] t0 = -0.5 / Df Dt = 1. / (N * Df) t = t0 + Dt * np.arange(N) shape = np.ones(H.ndim, dtype=int) shape[axis] = N t_calc = t.reshape(shape) f_calc = f.reshape(shape) H_prime = H * np.exp(2j * np.pi * t0 * f_calc) h_prime = ifft(H_prime, axis=axis) h = N * Df * np.exp(2j * np.pi * f0 * (t_calc - t0)) * h_prime return t, h def PSD_continuous(t, h, axis=-1, method=1): """Approximate a continuous 1D Power Spectral Density of sampled data. This function uses the Fast Fourier Transform to approximate the continuous fourier transform of a sampled function, using the convention .. math:: H(f) = \int h(t) \exp(-2 \pi i f t) dt It returns f and PSD, which approximate PSD(f) where .. math:: PSD(f) = |H(f)|^2 + |H(-f)|^2 Parameters ---------- t : array_like regularly sampled array of times t is assumed to be regularly spaced, i.e. t = t0 + Dt * np.arange(N) h : array_like real or complex signal at each time axis : int axis along which to perform fourier transform. This axis must be the same length as t. Returns ------- f : ndarray frequencies of result. Units are the same as 1/t PSD : ndarray Fourier coefficients at each frequency. """ assert t.ndim == 1 assert h.shape[axis] == t.shape[0] N = len(t) if N % 2 != 0: raise ValueError("number of samples must be even") ax = axis % h.ndim if method == 1: # use FT_continuous f, Hf = FT_continuous(t, h, axis) Hf = np.rollaxis(Hf, ax) f = -f[N // 2::-1] PSD = abs(Hf[N // 2::-1]) ** 2 PSD[:-1] += abs(Hf[N // 2:]) ** 2 PSD = np.rollaxis(PSD, 0, ax + 1) else: # A faster way to do it is with fftshift # take advantage of the fact that phases go away Dt = t[1] - t[0] Df = 1. / (N * Dt) f = Df * np.arange(N // 2 + 1) Hf = fft(h, axis=axis) Hf = np.rollaxis(Hf, ax) PSD = abs(Hf[:N // 2 + 1]) ** 2 PSD[-1] = 0 PSD[1:] += abs(Hf[N // 2:][::-1]) ** 2 PSD[0] *= 2 PSD = Dt ** 2 * np.rollaxis(PSD, 0, ax + 1) return f, PSD def sinegauss(t, t0, f0, Q): """Sine-gaussian wavelet""" a = (f0 * 1. / Q) ** 2 return (np.exp(-a * (t - t0) ** 2) * np.exp(2j * np.pi * f0 * (t - t0))) def sinegauss_FT(f, t0, f0, Q): """Fourier transform of the sine-gaussian wavelet. This uses the convention .. math:: H(f) = integral[ h(t) exp(-2pi i f t) dt] """ a = (f0 * 1. / Q) ** 2 return (np.sqrt(np.pi / a) * np.exp(-2j * np.pi * f * t0) * np.exp(-np.pi ** 2 * (f - f0) ** 2 / a)) def sinegauss_PSD(f, t0, f0, Q): """Compute the PSD of the sine-gaussian function at frequency f .. math:: PSD(f) = |H(f)|^2 + |H(-f)|^2 """ a = (f0 * 1. / Q) ** 2 Pf = np.pi / a * np.exp(-2 * np.pi ** 2 * (f - f0) ** 2 / a) Pmf = np.pi / a * np.exp(-2 * np.pi ** 2 * (-f - f0) ** 2 / a) return Pf + Pmf def wavelet_PSD(t, h, f0, Q=1.0): """Compute the wavelet PSD as a function of f0 and t Parameters ---------- t : array_like array of times, length N h : array_like array of observed values, length N f0 : array_like array of candidate frequencies, length Nf Q : float Q-parameter for wavelet Returns ------- PSD : ndarray The 2-dimensional PSD, of shape (Nf, N), corresponding with frequencies f0 and times t.
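Examples
--------
A minimal sketch with a hypothetical monochromatic signal; the result
is a 2D time-frequency map of shape (len(f0), len(t))::

    import numpy as np
    t = np.linspace(0, 10, 1000)          # N must be even
    h = np.sin(2 * np.pi * 1.5 * t)       # 1.5 Hz tone
    f0 = np.linspace(0.5, 3, 50)          # candidate frequencies
    PSD = wavelet_PSD(t, h, f0, Q=1.0)    # PSD.shape == (50, 1000)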
""" t, h, f0 = map(np.asarray, (t, h, f0)) if (t.ndim != 1) or (t.shape != h.shape): raise ValueError('t and h must be one dimensional and the same shape') if f0.ndim != 1: raise ValueError('f0 must be one dimensional') Q = Q + np.zeros_like(f0) f, H = FT_continuous(t, h) W = np.conj(sinegauss_FT(f, 0, f0[:, None], Q[:, None])) _, HW = IFT_continuous(f, H * W) return abs(HW) ** 2 astroML-0.3/astroML/linear_model/0000755000076500000240000000000012462244012017514 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/linear_model/__init__.py0000644000076500000240000000025512422070472021632 0ustar jakevdpstaff00000000000000from .linear_regression import \ LinearRegression, PolynomialRegression, BasisFunctionRegression from .kernel_regression import NadarayaWatson from .TLS import TLS_logL astroML-0.3/astroML/linear_model/kernel_regression.py0000644000076500000240000000304012252721253023607 0ustar jakevdpstaff00000000000000import numpy as np from .linear_regression import gaussian_basis from sklearn.metrics import pairwise_kernels class NadarayaWatson(object): """Nadaraya-Watson Kernel Regression This is basically a gaussian-weighted moving average of points Parameters ---------- kernel : string kernel is either "gaussian", or one of the kernels available in sklearn.metrics.pairwise. h : float or array_like width of kernel. If array, its length must be the number of dimensions in the training data Additional keyword arguments are passed to the kernel. """ def __init__(self, kernel='gaussian', h=None, **kwargs): self.kernel = kernel self.h = h self.kwargs = kwargs def fit(self, X, y, dy=1): self.X = np.asarray(X) self.y = np.asarray(y) self.dy = np.atleast_1d(dy) return self def predict(self, X): X = np.asarray(X) if X.ndim != 2: raise ValueError('X must be two-dimensional') if X.shape[1] != self.X.shape[1]: raise ValueError('dimensions of X do not match training dimension') if self.kernel == 'gaussian': # wrangle gaussian into scikit-learn's 'rbf' kernel h = np.asarray(self.h) gamma = 0.5 / h / h K = pairwise_kernels(X, self.X, metric='rbf', gamma=gamma) else: K = pairwise_kernels(X, self.X, metric=self.kernel, **self.kwargs) K /= self.dy ** 2 return (K * self.y).sum(1) / K.sum(1) astroML-0.3/astroML/linear_model/linear_regression.py0000644000076500000240000001534512422072150023606 0ustar jakevdpstaff00000000000000import numpy as np from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, Lasso, Ridge #------------------------------------------------------------ # Basis functions def gaussian_basis(X, mu, sigma): """Gaussian Basis function Parameters ---------- X : array_like input data: shape = (n_samples, n_features) mu : array_like means of bases, shape = (n_bases, n_features) sigma : float or array_like must broadcast to shape of mu Returns ------- Xg : ndarray shape = (n_samples, n_bases) """ X = np.asarray(X) mu = np.atleast_2d(mu) sigma = np.atleast_2d(sigma) n_samples, n_features = X.shape n_bases = mu.shape[0] if mu.shape[1] != n_features: raise ValueError('shape of mu must match shape of X') r = (((X[:, None, :] - mu) / sigma) ** 2).sum(2) Xg = np.exp(-0.5 * r) Xg *= 1. 
/ (2 * np.pi) ** (0.5 * n_features) / sigma.prod(1) return Xg class LinearRegression(object): """Simple Linear Regression with errors in y This is a stripped-down version of sklearn.linear_model.LinearRegression which correctly accounts for errors in the y variable Parameters ---------- fit_intercept : bool (optional) if True (default) then fit the intercept of the data regularization : string (optional) ['l1'|'l2'|'none'] Use L1 (Lasso) or L2 (Ridge) regression kwds: dict additional keyword arguments passed to sklearn estimators: LinearRegression, Lasso (L1), or Ridge (L2) Notes ----- This implementation may be compared to that in sklearn.linear_model.LinearRegression. The difference is that here measurement errors on y are taken into account when fitting, via the `y_error` argument to ``fit``. """ # note: at class-definition time these names resolve to the sklearn # estimators imported above, since this class's own name is not yet bound _regressors = {'none': LinearRegression, 'l1': Lasso, 'l2': Ridge} def __init__(self, fit_intercept=True, regularization='none', kwds=None): if regularization.lower() not in ['l1', 'l2', 'none']: raise ValueError("regularization='{0}' not recognized" "".format(regularization)) self.fit_intercept = fit_intercept self.regularization = regularization self.kwds = kwds def _transform_X(self, X): X = np.asarray(X) if self.fit_intercept: X = np.hstack([np.ones([X.shape[0], 1]), X]) return X @staticmethod def _scale_by_error(X, y, y_error=1): """Scale regression by error on y""" X = np.atleast_2d(X) y = np.asarray(y) y_error = np.asarray(y_error) assert X.ndim == 2 assert y.ndim == 1 assert X.shape[0] == y.shape[0] if y_error.ndim == 0: return X / y_error, y / y_error elif y_error.ndim == 1: assert y_error.shape == y.shape X_out, y_out = X / y_error[:, None], y / y_error elif y_error.ndim == 2: assert y_error.shape == (y.size, y.size) evals, evecs = np.linalg.eigh(y_error) X_out = np.dot(evecs * (evals ** -0.5), np.dot(evecs.T, X)) y_out = np.dot(evecs * (evals ** -0.5), np.dot(evecs.T, y)) else: raise ValueError("shape of y_error does not match that of y") return X_out, y_out def _choose_regressor(self): model = self._regressors.get(self.regularization.lower(), None) if model is None: raise ValueError("regularization='{0}' unrecognized" "".format(self.regularization)) return model def fit(self, X, y, y_error=1): kwds = {} if self.kwds is not None: kwds.update(self.kwds) kwds['fit_intercept'] = False model = self._choose_regressor() self.clf_ = model(**kwds) X = self._transform_X(X) X, y = self._scale_by_error(X, y, y_error) self.clf_.fit(X, y) return self def predict(self, X): X = self._transform_X(X) return self.clf_.predict(X) @property def coef_(self): return self.clf_.coef_ class PolynomialRegression(LinearRegression): """Polynomial Regression with errors in y Parameters ---------- degree : int degree of the polynomial. interaction_only : bool (optional) If true, only interaction features are produced: features that are products of at most ``degree`` *distinct* input features (so not ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.).
fit_intercept : bool (optional) if True (default) then fit the intercept of the data regularization : string (optional) ['l1'|'l2'|'none'] Use L1 (Lasso) or L2 (Ridge) regression kwds: dict additional keyword arguments passed to sklearn estimators: LinearRegression, Lasso (L1), or Ridge (L2) """ def __init__(self, degree=1, interaction_only=False, fit_intercept=True, regularization='none', kwds=None): self.degree = degree self.interaction_only = interaction_only LinearRegression.__init__(self, fit_intercept, regularization, kwds) def _transform_X(self, X): trans = PolynomialFeatures(degree=self.degree, interaction_only=self.interaction_only, include_bias=self.fit_intercept) return trans.fit_transform(X) class BasisFunctionRegression(LinearRegression): """Basis Function with errors in y Parameters ---------- basis_func : str or function specify the basis function to use. This should take an input matrix of size (n_samples, n_features), along with optional parameters, and return a matrix of size (n_samples, n_bases). fit_intercept : bool (optional) if True (default) then fit the intercept of the data regularization : string (optional) ['l1'|'l2'|'none'] Use L1 (Lasso) or L2 (Ridge) regression kwds: dict additional keyword arguments passed to sklearn estimators: LinearRegression, Lasso (L1), or Ridge (L2) """ _basis_funcs = {'gaussian': gaussian_basis} def __init__(self, basis_func='gaussian', fit_intercept=True, regularization='none', kwds=None, **kwargs): self.basis_func = basis_func self.kwargs = kwargs LinearRegression.__init__(self, fit_intercept, regularization, kwds) def _transform_X(self, X): if callable(self.basis_func): basis_func = self.basis_func else: basis_func = self._basis_funcs.get(self.basis_func, None) X = basis_func(X, **self.kwargs) if self.fit_intercept: X = np.hstack([np.ones((X.shape[0], 1)), X]) return X astroML-0.3/astroML/linear_model/tests/0000755000076500000240000000000012462244012020656 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/linear_model/tests/__init__.py0000644000076500000240000000000012252721253022761 0ustar jakevdpstaff00000000000000astroML-0.3/astroML/linear_model/tests/test_kernel_regression.py0000644000076500000240000000067212115147567026030 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from astroML.linear_model import NadarayaWatson def test_NW_simple(): X = np.arange(11.) 
y = X + 1 dy = 1 # by symmetry, NW regression should get these exactly correct Xfit = np.array([4, 5, 6])[:, None] y_true = np.ravel(Xfit + 1) clf = NadarayaWatson(h=0.5).fit(X[:, None], y, dy) y_fit = clf.predict(Xfit) assert_allclose(y_fit, y_true) astroML-0.3/astroML/linear_model/tests/test_linear_regression.py0000644000076500000240000000565712422070406026016 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from sklearn.linear_model import LinearRegression as skLinearRegression from astroML.linear_model import \ LinearRegression, PolynomialRegression, BasisFunctionRegression def test_error_transform_diag(N=20, rseed=0): rng = np.random.RandomState(rseed) X = rng.rand(N, 2) yerr = 0.05 * (1 + rng.rand(N)) y = (X[:, 0] ** 2 + X[:, 1]) + yerr * rng.randn(N) Sigma = np.eye(N) * yerr ** 2 X1, y1 = LinearRegression._scale_by_error(X, y, yerr) X2, y2 = LinearRegression._scale_by_error(X, y, Sigma) assert_allclose(X1, X2) assert_allclose(y1, y2) def test_error_transform_full(N=20, rseed=0): rng = np.random.RandomState(rseed) X = rng.rand(N, 2) # generate a pos-definite error matrix Sigma = 0.05 * rng.randn(N, N) u, s, v = np.linalg.svd(Sigma) Sigma = np.dot(u * s, u.T) # draw y from this error distribution y = (X[:, 0] ** 2 + X[:, 1]) y = rng.multivariate_normal(y, Sigma) X2, y2 = LinearRegression._scale_by_error(X, y, Sigma) # check that the form entering the chi^2 is correct assert_allclose(np.dot(X2.T, X2), np.dot(X.T, np.linalg.solve(Sigma, X))) assert_allclose(np.dot(y2, y2), np.dot(y, np.linalg.solve(Sigma, y))) def test_LinearRegression_simple(): """ Test a simple linear regression """ x = np.arange(10.).reshape((10, 1)) y = np.arange(10.) + 1 dy = 1 clf = LinearRegression().fit(x, y, dy) y_true = clf.predict(x) assert_allclose(y, y_true, atol=1E-10) def test_LinearRegression_err(): """ Test that errors are correctly accounted for By comparing to scikit-learn LinearRegression """ np.random.seed(0) X = np.random.random((10, 1)) y = np.random.random(10) + 1 dy = 0.1 y = np.random.normal(y, dy) X_fit = np.linspace(0, 1, 10)[:, None] clf1 = LinearRegression().fit(X, y, dy) clf2 = skLinearRegression().fit(X / dy, y / dy) assert_allclose(clf1.coef_[1:], clf2.coef_) assert_allclose(clf1.coef_[0], clf2.intercept_ * dy) def test_LinearRegression_fit_intercept(): np.random.seed(0) X = np.random.random((10, 1)) y = np.random.random(10) clf1 = LinearRegression(fit_intercept=False).fit(X, y) clf2 = skLinearRegression(fit_intercept=False).fit(X, y) assert_allclose(clf1.coef_, clf2.coef_) def test_PolynomialRegression_simple(): x = np.arange(10.).reshape((10, 1)) y = np.arange(10.) dy = 1 clf = PolynomialRegression(2).fit(x, y, dy) y_true = clf.predict(x) assert_allclose(y, y_true, atol=1E-10) def test_BasisfunctionRegression_simple(): x = np.arange(10.).reshape((10, 1)) y = np.arange(10.) 
+ 1 dy = 1 mu = np.arange(11.)[:, None] sigma = 1.0 clf = BasisFunctionRegression(mu=mu, sigma=sigma).fit(x, y, dy) y_true = clf.predict(x) assert_allclose(y, y_true, atol=1E-10) astroML-0.3/astroML/linear_model/tests/test_TLS.py0000644000076500000240000000071012421234344022731 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from ..TLS import TLS_logL def test_TLS_likelihood_diagonal(rseed=0): """Test Total-Least-Squares fit with diagonal covariance""" np.random.seed(rseed) X = np.random.rand(10, 2) dX1 = 0.1 * np.ones((10, 2)) dX2 = 0.1 * np.array([np.eye(2) for i in range(10)]) v = np.random.random(2) assert_allclose(TLS_logL(v, X, dX1), TLS_logL(v, X, dX2)) astroML-0.3/astroML/linear_model/TLS.py0000644000076500000240000000265512421233731020541 0ustar jakevdpstaff00000000000000import numpy as np from scipy import optimize def TLS_logL(v, X, dX): """Compute the total least squares log-likelihood This uses Hogg et al eq. 29-32 Parameters ---------- v : ndarray The normal vector to the linear best fit. shape=(D,). Note that the magnitude |v| is a stand-in for the intercept. X : ndarray The input data. shape = [N, D] dX : ndarray The covariance of the errors for each point. For diagonal errors, the shape = (N, D) and the entries are dX[i] = [sigma_x1, sigma_x2 ... sigma_xD] For full covariance, the shape = (N, D, D) and the entries are dX[i] = Cov(X[i], X[i]), the full error covariance. Returns ------- logL : float The log-likelihood of the model v given the data. Notes ----- This implementation follows Hogg 2010, arXiv 1008.4686 """ # check inputs X, dX, v = map(np.asarray, (X, dX, v)) N, D = X.shape assert v.shape == (D,) assert dX.shape in ((N, D), (N, D, D)) v_norm = np.linalg.norm(v) v_hat = v / v_norm # eq. 30 Delta = np.dot(X, v_hat) - v_norm # eq. 31 if dX.ndim == 2: # diagonal covariance Sig2 = np.sum(dX * v_hat ** 2, 1) else: # full covariance Sig2 = np.dot(np.dot(v_hat, dX), v_hat) return (-0.5 * np.sum(np.log(2 * np.pi * Sig2)) - np.sum(0.5 * Delta ** 2 / Sig2)) astroML-0.3/astroML/lumfunc.py0000644000076500000240000001230312115147567017120 0ustar jakevdpstaff00000000000000import numpy as np def _sorted_interpolate(x, y, x_eval): """utility function for binned_Cminus""" # note that x should be sorted N = len(x) ind = x.searchsorted(x_eval) ind[ind == N] = N - 1 y_eval = np.zeros(x_eval.shape) # find perfect matches match = (x[ind] == x_eval) | (x_eval > x[-1]) | (x_eval < x[0]) y_eval[match] = y[ind[match]] ind = ind[~match] # take care of extrapolation ind[ind == 0] = 1 x_lo = x[ind - 1] x_up = x[ind] y_lo = y[ind - 1] y_up = y[ind] # take care of places where x_lo = x_up y_eval[~match] = (y_lo + (x_eval[~match] - x_lo) * (y_up - y_lo) / (x_up - x_lo)) return y_eval def Cminus(x, y, xmax, ymax): """Lynden-Bell's C-minus method Parameters ---------- x : array_like array of x values y : array_like array of y values xmax : array_like array of maximum x values for each y value ymax : array_like array of maximum y values for each x value Returns ------- Nx, Ny, cuml_x, cuml_y: ndarrays Nx and cuml_x are in the order of the sorted x array Ny and cuml_y are in the order of the sorted y array """ # make copies of input x, y, xmax, ymax = map(np.array, (x, y, xmax, ymax)) Nall = len(x) cuml_x = np.zeros(x.shape) cuml_y = np.zeros(y.shape) Nx = np.zeros(x.shape) Ny = np.zeros(y.shape) # first the y direction. 
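    # (after sorting by y, the "comparable set" of object j consists of the
    #  j objects with smaller y; Ny[j] counts how many of them also satisfy
    #  x < xmax[j], as required by Lynden-Bell's C-minus construction)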
i_sort = np.argsort(y) x = x[i_sort] y = y[i_sort] xmax = xmax[i_sort] ymax = ymax[i_sort] for j in range(1, Nall): Ny[j] = np.sum(x[:j] < xmax[j]) Ny[0] = np.inf cuml_y = np.cumprod(1. + 1. / Ny) Ny[0] = 0 # renormalize cuml_y *= Nall / cuml_y[-1] #now the x direction i_sort = np.argsort(x) x = x[i_sort] y = y[i_sort] xmax = xmax[i_sort] ymax = ymax[i_sort] for i in range(1, Nall): Nx[i] = np.sum(y[:i] < ymax[i]) Nx[0] = np.inf cuml_x = np.cumprod(1. + 1. / Nx) Nx[0] = 0 # renormalize cuml_x *= Nall / cuml_x[-1] return Nx, Ny, cuml_x, cuml_y def binned_Cminus(x, y, xmax, ymax, xbins, ybins, normalize=False): """Compute the binned distributions using the Cminus method Parameters ---------- x : array_like array of x values y : array_like array of y values xmax : array_like array of maximum x values for each y value ymax : array_like array of maximum y values for each x value xbins : array_like array of bin edges for the x function: size=Nbins_x + 1 ybins : array_like array of bin edges for the y function: size=Nbins_y + 1 normalize : boolean if true, then returned distributions are normalized. Default is False. Returns ------- dist_x, dist_y : ndarrays distributions of size Nbins_x and Nbins_y """ Nx, Ny, cuml_x, cuml_y = Cminus(x, y, xmax, ymax) # simple linear interpolation using a binary search # interpolate the cumulative distributions x_sort = np.sort(x) y_sort = np.sort(y) Ix_edges = _sorted_interpolate(x_sort, cuml_x, xbins) Iy_edges = _sorted_interpolate(y_sort, cuml_y, ybins) if xbins[0] < x_sort[0]: Ix_edges[0] = cuml_x[0] if xbins[-1] > x_sort[-1]: Ix_edges[-1] = cuml_x[-1] if ybins[0] < y_sort[0]: Iy_edges[0] = cuml_y[0] if ybins[-1] > y_sort[-1]: Iy_edges[-1] = cuml_y[-1] x_dist = np.diff(Ix_edges) / np.diff(xbins) y_dist = np.diff(Iy_edges) / np.diff(ybins) if normalize: x_dist /= len(x) y_dist /= len(y) return x_dist, y_dist def bootstrap_Cminus(x, y, xmax, ymax, xbins, ybins, Nbootstraps=10, normalize=False): """ Compute the binned distributions using the Cminus method, with bootstrapped estimates of the errors Parameters ---------- x : array_like array of x values y : array_like array of y values xmax : array_like array of maximum x values for each y value ymax : array_like array of maximum y values for each x value xbins : array_like array of bin edges for the x function: size=Nbins_x + 1 ybins : array_like array of bin edges for the y function: size=Nbins_y + 1 Nbootstraps : int number of bootstrap resamplings to perform normalize : boolean if true, then returned distributions are normalized. Default is False. 
    Returns
    -------
    dist_x, err_x, dist_y, err_y : ndarrays
        distributions of size Nbins_x and Nbins_y
    """
    x, y, xmax, ymax = map(np.asarray, (x, y, xmax, ymax))

    x_dist = np.zeros((Nbootstraps, len(xbins) - 1))
    y_dist = np.zeros((Nbootstraps, len(ybins) - 1))

    for i in range(Nbootstraps):
        ind = np.random.randint(0, len(x), len(x))
        x_dist[i], y_dist[i] = binned_Cminus(x[ind], y[ind],
                                             xmax[ind], ymax[ind],
                                             xbins, ybins,
                                             normalize=normalize)

    return (x_dist.mean(0), x_dist.std(0, ddof=1),
            y_dist.mean(0), y_dist.std(0, ddof=1))
astroML-0.3/astroML/plotting/0000755000076500000240000000000012462244012016722 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/plotting/__init__.py0000644000076500000240000000032712252721253021041 0ustar jakevdpstaff00000000000000from .hist_tools import hist
from .scatter_contour import scatter_contour
from .mcmc import plot_mcmc
from .ellipse import plot_tissot_ellipse
from .multiaxes import MultiAxes
from .settings import setup_text_plots
astroML-0.3/astroML/plotting/ellipse.py0000644000076500000240000000170012462237653020744 0ustar jakevdpstaff00000000000000import numpy as np


def plot_tissot_ellipse(longitude, latitude, radius, ax=None, **kwargs):
    """Plot Tissot Ellipse/Tissot Indicatrix

    Parameters
    ----------
    longitude : float or array_like
        longitude of ellipse centers (radians)
    latitude : float or array_like
        latitude of ellipse centers (radians)
    radius : float or array_like
        radius of ellipses
    ax : Axes object (optional)
        matplotlib axes instance on which to draw ellipses.

    Other Parameters
    ----------------
    other keyword arguments will be passed to matplotlib.patches.Ellipse.
    """
    # Import here so that testing with Agg will work
    from matplotlib import pyplot as plt
    from matplotlib.patches import Ellipse

    if ax is None:
        ax = plt.gca()

    # draw one ellipse for each broadcast (lon, lat, rad) triple
    for lon, lat, rad in np.broadcast(longitude, latitude, radius):
        el = Ellipse((lon, lat), rad / np.cos(lat), rad, **kwargs)
        ax.add_patch(el)
astroML-0.3/astroML/plotting/hist_tools.py0000644000076500000240000000477612462237365021506 0ustar jakevdpstaff00000000000000import warnings

import numpy as np

from astroML.density_estimation import\
    scotts_bin_width, freedman_bin_width,\
    knuth_bin_width, bayesian_blocks


def hist(x, bins=10, range=None, *args, **kwargs):
    """Enhanced histogram

    This is a histogram function that enables the use of more sophisticated
    algorithms for determining bins.  Aside from the `bins` argument allowing
    a string specifying how bins are computed, the parameters are the same
    as pylab.hist().

    Parameters
    ----------
    x : array_like
        array of data to be histogrammed

    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'blocks' : use bayesian blocks for dynamic bin widths
        'knuth' : use Knuth's rule to determine bins
        'scott' : use Scott's rule to determine bins
        'freedman' : use the Freedman-Diaconis rule to determine bins

    range : tuple or None (optional)
        the minimum and maximum range for the histogram.  If not specified,
        it will be (x.min(), x.max())

    ax : Axes instance (optional)
        specify the Axes on which to draw the histogram.  If not specified,
        then the current active axes will be used.

    **kwargs :
        other keyword arguments are described in pylab.hist().
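
    Examples
    --------
    A sketch of typical usage (illustrative only; the random data below
    is invented for demonstration, and matplotlib must be installed)::

        import numpy as np
        from astroML.plotting import hist

        x = np.random.normal(size=1000)
        hist(x, bins='knuth', histtype='stepfilled')   # Knuth's rule
        hist(x, bins='blocks', color='black')          # Bayesian blocks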
""" if isinstance(bins, str) and "weights" in kwargs: warnings.warn("weights argument is not supported: it will be ignored.") kwargs.pop('weights') x = np.asarray(x) if 'ax' in kwargs: ax = kwargs['ax'] del kwargs['ax'] else: # import here so that testing with Agg will work from matplotlib import pyplot as plt ax = plt.gca() # if range is specified, we need to truncate the data for # the bin-finding routines if (range is not None and (bins in ['blocks', 'knuth', 'knuths', 'scott', 'scotts', 'freedman', 'freedmans'])): x = x[(x >= range[0]) & (x <= range[1])] if bins in ['blocks']: bins = bayesian_blocks(x) elif bins in ['knuth', 'knuths']: dx, bins = knuth_bin_width(x, True, disp=False) elif bins in ['scott', 'scotts']: dx, bins = scotts_bin_width(x, True) elif bins in ['freedman', 'freedmans']: dx, bins = freedman_bin_width(x, True) elif isinstance(bins, str): raise ValueError("unrecognized bin code: '%s'" % bins) return ax.hist(x, bins, range, **kwargs) astroML-0.3/astroML/plotting/mcmc.py0000644000076500000240000001026712462237460020232 0ustar jakevdpstaff00000000000000import numpy as np def convert_to_stdev(logL): """ Given a grid of log-likelihood values, convert them to cumulative standard deviation. This is useful for drawing contours from a grid of likelihoods. """ sigma = np.exp(logL) shape = sigma.shape sigma = sigma.ravel() # obtain the indices to sort and unsort the flattened array i_sort = np.argsort(sigma)[::-1] i_unsort = np.argsort(i_sort) sigma_cumsum = sigma[i_sort].cumsum() sigma_cumsum /= sigma_cumsum[-1] return sigma_cumsum[i_unsort].reshape(shape) def plot_mcmc(traces, labels=None, limits=None, true_values=None, fig=None, contour=True, scatter=False, levels=[0.683, 0.955], bins=20, bounds=[0.08, 0.08, 0.95, 0.95], **kwargs): """Plot a grid of MCMC results Parameters ---------- traces : array_like the MCMC chain traces. shape is [Ndim, Nchain] labels : list of strings (optional) if specified, the label associated with each trace limits : list of tuples (optional) if specified, the axes limits for each trace true_values : list of floats (optional) if specified, the true value for each trace (will be indicated with an 'X' on the plot) fig : matplotlib.Figure (optional) the figure on which to draw the axes. If not specified, a new one will be created. contour : bool (optional) if True, then draw contours in each subplot. Default=True. scatter : bool (optional) if True, then scatter points in each subplot. Default=False. levels : list of floats the list of percentile levels at which to plot contours. Each entry should be between 0 and 1 bins : int, tuple, array, or tuple of arrays the binning parameter passed to np.histogram2d. It is assumed that the point density is constant on the scale of the bins bounds : list of floats the bounds of the set of axes used for plotting additional keyword arguments are passed to scatter() and contour() Returns ------- axes_list : list of matplotlib.Axes instances the list of axes created by the routine """ # Import here so that testing with Agg will work from matplotlib import pyplot as plt if fig is None: fig = plt.figure(figsize=(8, 8)) if limits is None: limits = [(t.min(), t.max()) for t in traces] if labels is None: labels = ['' for t in traces] num_traces = len(traces) bins = [np.linspace(limits[i][0], limits[i][1], bins + 1) for i in range(num_traces)] xmin, xmax = bounds[0], bounds[2] ymin, ymax = bounds[1], bounds[3] dx = (xmax - xmin) * 1. / (num_traces - 1) dy = (ymax - ymin) * 1. 
/ (num_traces - 1) axes_list = [] for j in range(1, num_traces): for i in range(j): ax = fig.add_axes([xmin + i * dx, ymin + (num_traces - 1 - j) * dy, dx, dy]) if scatter: plt.scatter(traces[i], traces[j], **kwargs) if contour: H, xbins, ybins = np.histogram2d(traces[i], traces[j], bins=(bins[i], bins[j])) H[H == 0] = 1E-16 Nsigma = convert_to_stdev(np.log(H)) ax.contour(0.5 * (xbins[1:] + xbins[:-1]), 0.5 * (ybins[1:] + ybins[:-1]), Nsigma.T, levels=levels, **kwargs) if i == 0: ax.set_ylabel(labels[j]) else: ax.yaxis.set_major_formatter(plt.NullFormatter()) if j == num_traces - 1: ax.set_xlabel(labels[i]) else: ax.xaxis.set_major_formatter(plt.NullFormatter()) if true_values is not None: ax.plot(limits[i], [true_values[j], true_values[j]], ':k', lw=1) ax.plot([true_values[i], true_values[i]], limits[j], ':k', lw=1) ax.set_xlim(limits[i]) ax.set_ylim(limits[j]) axes_list.append(ax) return axes_list astroML-0.3/astroML/plotting/multiaxes.py0000644000076500000240000002453712462237576021343 0ustar jakevdpstaff00000000000000""" Multi-panel plotting """ from copy import deepcopy import numpy as np class MultiAxes(object): """Visualize Multiple-dimensional data This class enables the visualization of multi-dimensional data, using a triangular grid of 2D plots. Parameters ---------- ndim : integer Number of data dimensions inner_labels : bool If true, then label the inner axes. If false, then only the outer axes will be labeled fig : matplotlib.Figure if specified, draw the plot on this figure. Otherwise, use the current active figure. left, bottom, right, top, wspace, hspace : floats these parameters control the layout of the plots. They behave have an identical effect as the arguments to plt.subplots_adjust. If not specified, default values from the rc file will be used. Examples -------- A grid of scatter plots can be created as follows:: x = np.random.normal((4, 1000)) R = np.random.random((4, 4)) # projection matrix x = np.dot(R, x) ax = MultiAxes(4) ax.scatter(x) ax.set_labels(['x1', 'x2', 'x3', 'x4']) Alternatively, the scatter plot can be visualized as a density:: ax = MultiAxes(4) ax.density(x, bins=[20, 20, 20, 20]) """ def __init__(self, ndim, inner_labels=False, fig=None, left=None, bottom=None, right=None, top=None, wspace=None, hspace=None): # Import here so that testing with Agg will work from matplotlib import pyplot as plt if fig is None: fig = plt.gcf() self.fig = fig self.ndim = ndim self.inner_labels = inner_labels self._update('left', left) self._update('bottom', bottom) self._update('right', right) self._update('top', top) self._update('wspace', wspace) self._update('hspace', hspace) self.axes = self._draw_panels() def _update(self, s, val): # Import here so that testing with Agg will work from matplotlib import rcParams if val is None: val = getattr(self, s, None) if val is None: key = 'figure.subplot.' 
+ s
                val = rcParams[key]

        setattr(self, s, val)

    def _check_data(self, data):
        data = np.asarray(data)

        if data.ndim != 2:
            raise ValueError("data dimension should be 2")

        if data.shape[1] != self.ndim:
            raise ValueError("leading dimension of data should match ndim")

        return data

    def _draw_panels(self):
        # Import here so that testing with Agg will work
        from matplotlib import pyplot as plt
        if self.top <= self.bottom:
            raise ValueError('top must be larger than bottom')
        if self.right <= self.left:
            raise ValueError('right must be larger than left')

        ndim = self.ndim

        panel_width = ((self.right - self.left)
                       / (ndim - 1 + self.wspace * (ndim - 2)))
        panel_height = ((self.top - self.bottom)
                        / (ndim - 1 + self.hspace * (ndim - 2)))

        full_panel_width = (1 + self.wspace) * panel_width
        full_panel_height = (1 + self.hspace) * panel_height

        axes = np.empty((ndim, ndim), dtype=object)
        axes.fill(None)

        for j in range(1, ndim):
            for i in range(j):
                left = self.left + i * full_panel_width
                bottom = self.bottom + (ndim - 1 - j) * full_panel_height
                ax = self.fig.add_axes([left, bottom,
                                        panel_width, panel_height])
                axes[i, j] = ax

        if not self.inner_labels:
            # remove unneeded x labels
            for i in range(ndim):
                for j in range(ndim - 1):
                    ax = axes[i, j]
                    if ax is not None:
                        ax.xaxis.set_major_formatter(plt.NullFormatter())

            # remove unneeded y labels
            for i in range(1, ndim):
                for j in range(ndim):
                    ax = axes[i, j]
                    if ax is not None:
                        ax.yaxis.set_major_formatter(plt.NullFormatter())

        return np.asarray(axes, dtype=object)

    def set_limits(self, limits):
        """Set the axes limits

        Parameters
        ----------
        limits : list of tuples
            a list of plot limits for each dimension, each in the form
            (xmin, xmax).  The length of `limits` should match the data
            dimension.
        """
        if len(limits) != self.ndim:
            raise ValueError("limits do not match number of dimensions")

        for i in range(self.ndim):
            for j in range(self.ndim):
                ax = self.axes[i, j]
                if ax is not None:
                    ax.set_xlim(limits[i])
                    ax.set_ylim(limits[j])

    def set_labels(self, labels):
        """Set the axes labels

        Parameters
        ----------
        labels : list of strings
            a list of axis labels for each dimension.  The length of
            `labels` should match the data dimension.
        """
        if len(labels) != self.ndim:
            raise ValueError("labels do not match number of dimensions")

        for i in range(self.ndim):
            ax = self.axes[i, self.ndim - 1]
            if ax is not None:
                ax.set_xlabel(labels[i])

        for j in range(self.ndim):
            ax = self.axes[0, j]
            if ax is not None:
                ax.set_ylabel(labels[j])

    def set_locators(self, locators):
        """Set the tick locators for the plots

        Parameters
        ----------
        locators : list or plt.Locator object
            If a list, then the length should match the data dimension.
            If a single Locator instance, then each axes will be given
            the same locator.
        """
        # Import here so that testing with Agg will work
        from matplotlib import pyplot as plt
        if isinstance(locators, plt.Locator):
            locators = [deepcopy(locators) for i in range(self.ndim)]
        elif len(locators) != self.ndim:
            raise ValueError("locators do not match number of dimensions")

        for i in range(self.ndim):
            for j in range(self.ndim):
                ax = self.axes[i, j]
                if ax is not None:
                    ax.xaxis.set_major_locator(locators[i])
                    ax.yaxis.set_major_locator(locators[j])

    def set_formatters(self, formatters):
        """Set the tick formatters for the outer edge of plots

        Parameters
        ----------
        formatters : list or plt.Formatter object
            If a list, then the length should match the data dimension.
            If a single Formatter instance, then each axes will be given
            the same formatter.
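
        Examples
        --------
        A sketch of intended use (illustrative; ``FormatStrFormatter``
        is just one possible choice of matplotlib formatter)::

            from matplotlib.ticker import FormatStrFormatter
            ax = MultiAxes(3)
            ax.set_formatters(FormatStrFormatter('%.2f'))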
""" # Import here so that testing with Agg will work from matplotlib import pyplot as plt if isinstance(formatters, plt.Formatter): formatters = [deepcopy(formatters) for i in range(self.ndim)] elif len(formatters) != self.ndim: raise ValueError("formatters do not match number of dimensions") for i in range(self.ndim): ax = self.axes[i, self.ndim - 1] if ax is not None: ax.xaxis.set_major_formatter(formatters[i]) for j in range(self.ndim): ax = self.axes[0, j] if ax is not None: ax.xaxis.set_major_formatter(formatters[i]) def plot(self, data, *args, **kwargs): """Plot data This function calls plt.plot() on each axes. All arguments or keyword arguments are passed to the plt.plot function. Parameters ---------- data : ndarray shape of data is [n_samples, ndim], and ndim should match that passed to the MultiAxes constructor. """ data = self._check_data(data) for i in range(self.ndim): for j in range(self.ndim): ax = self.axes[i, j] if ax is None: continue ax.plot(data[:, i], data[:, j], *args, **kwargs) def scatter(self, data, *args, **kwargs): """Scatter plot data This function calls plt.scatter() on each axes. All arguments or keyword arguments are passed to the plt.scatter function. Parameters ---------- data : ndarray shape of data is [n_samples, ndim], and ndim should match that passed to the MultiAxes constructor. """ data = self._check_data(data) for i in range(self.ndim): for j in range(self.ndim): ax = self.axes[i, j] if ax is None: continue ax.scatter(data[:, i], data[:, j], *args, **kwargs) def density(self, data, bins=20, **kwargs): """Density plot of data This function calls np.histogram2D to bin the data in each axes, then calls plt.imshow() on the result. All extra arguments or keyword arguments are passed to the plt.imshow function. Parameters ---------- data : ndarray shape of data is [n_samples, ndim], and ndim should match that passed to the MultiAxes constructor. bins : int, array, list of ints, or list of arrays specify the bins for each dimension. If bins is a list, then the length must match the data dimension """ data = self._check_data(data) if not hasattr(bins, '__len__'): bins = [bins for i in range(self.ndim)] elif len(bins) != self.ndim: bins = [bins for i in range(self.ndim)] for i in range(self.ndim): for j in range(self.ndim): ax = self.axes[i, j] if ax is None: continue H, xbins, ybins = np.histogram2d(data[:, i], data[:, j], (bins[i], bins[j])) ax.imshow(H.T, origin='lower', aspect='auto', extent=(xbins[0], xbins[-1], ybins[0], ybins[-1]), **kwargs) ax.set_xlim(xbins[0], xbins[-1]) ax.set_ylim(ybins[0], ybins[-1]) astroML-0.3/astroML/plotting/scatter_contour.py0000644000076500000240000000721112462237631022524 0ustar jakevdpstaff00000000000000import numpy as np def scatter_contour(x, y, levels=10, threshold=100, log_counts=False, histogram2d_args=None, plot_args=None, contour_args=None, filled_contour=True, ax=None): """Scatter plot with contour over dense regions Parameters ---------- x, y : arrays x and y data for the contour plot levels : integer or array (optional, default=10) number of contour levels, or array of contour levels threshold : float (default=100) number of points per 2D bin at which to begin drawing contours log_counts :boolean (optional) if True, contour levels are the base-10 logarithm of bin counts. histogram2d_args : dict keyword arguments passed to numpy.histogram2d see doc string of numpy.histogram2d for more information plot_args : dict keyword arguments passed to plt.plot. By default it will use dict(marker='.', linestyle='none'). 
see doc string of pylab.plot for more information contour_args : dict keyword arguments passed to plt.contourf or plt.contour see doc string of pylab.contourf for more information filled_contour : bool If True (default) use filled contours. Otherwise, use contour outlines. ax : pylab.Axes instance the axes on which to plot. If not specified, the current axes will be used Returns ------- points, contours : points is the return value of ax.plot() contours is the return value of ax.contour or ax.contourf """ x = np.asarray(x) y = np.asarray(y) default_plot_args = dict(marker='.', linestyle='none') if plot_args is not None: default_plot_args.update(plot_args) plot_args = default_plot_args if histogram2d_args is None: histogram2d_args = {} if contour_args is None: contour_args = {} if ax is None: # Import here so that testing with Agg will work from matplotlib import pyplot as plt ax = plt.gca() H, xbins, ybins = np.histogram2d(x, y, **histogram2d_args) Nx = len(xbins) Ny = len(ybins) if log_counts: H = np.log10(1 + H) threshold = np.log10(1 + threshold) levels = np.asarray(levels) if levels.size == 1: levels = np.linspace(threshold, H.max(), levels) extent = [xbins[0], xbins[-1], ybins[0], ybins[-1]] i_min = np.argmin(levels) # draw a zero-width line: this gives us the outer polygon to # reduce the number of points we draw # somewhat hackish... we could probably get the same info from # the full contour plot below. outline = ax.contour(H.T, levels[i_min:i_min + 1], linewidths=0, extent=extent) if filled_contour: contours = ax.contourf(H.T, levels, extent=extent, **contour_args) else: contours = ax.contour(H.T, levels, extent=extent, **contour_args) X = np.hstack([x[:, None], y[:, None]]) if len(outline.allsegs[0]) > 0: outer_poly = outline.allsegs[0][0] try: # this works in newer matplotlib versions from matplotlib.path import Path points_inside = Path(outer_poly).contains_points(X) except: # this works in older matplotlib versions import matplotlib.nxutils as nx points_inside = nx.points_inside_poly(X, outer_poly) Xplot = X[~points_inside] else: Xplot = X points = ax.plot(Xplot[:, 0], Xplot[:, 1], zorder=1, **plot_args) return points, contours astroML-0.3/astroML/plotting/settings.py0000644000076500000240000000121712252721253021141 0ustar jakevdpstaff00000000000000def setup_text_plots(fontsize=8, usetex=True): """ This function adjusts matplotlib settings so that all figures in the textbook have a uniform format and look. 
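
    Examples
    --------
    Typical use is a single call before any figures are created
    (illustrative; a working LaTeX installation is required when
    usetex=True)::

        from astroML.plotting import setup_text_plots
        setup_text_plots(fontsize=8, usetex=True)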
""" import matplotlib matplotlib.rc('legend', fontsize=fontsize, handlelength=3) matplotlib.rc('axes', titlesize=fontsize) matplotlib.rc('axes', labelsize=fontsize) matplotlib.rc('xtick', labelsize=fontsize) matplotlib.rc('ytick', labelsize=fontsize) matplotlib.rc('text', usetex=usetex) matplotlib.rc('font', size=fontsize, family='serif', style='normal', variant='normal', stretch='normal', weight='normal') astroML-0.3/astroML/plotting/tests/0000755000076500000240000000000012462244012020064 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/plotting/tests/__init__.py0000644000076500000240000000005012462237111022172 0ustar jakevdpstaff00000000000000import matplotlib matplotlib.use('Agg') astroML-0.3/astroML/plotting/tests/test_devectorize.py0000644000076500000240000000163212420767763024043 0ustar jakevdpstaff00000000000000import matplotlib matplotlib.use('Agg') # don't display plots import numpy as np from numpy.testing import assert_ from matplotlib import image import matplotlib.pyplot as plt from astroML.py3k_compat import BytesIO from astroML.plotting.tools import devectorize_axes def test_devectorize_axes(): np.random.seed(0) x, y = np.random.random((2, 1000)) # save vectorized version fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(x, y) output = BytesIO() fig.savefig(output) output.seek(0) im1 = image.imread(output) plt.close() # save devectorized version fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(x, y) devectorize_axes(ax, dpi=200) output = BytesIO() fig.savefig(output) output.seek(0) im2 = image.imread(output) plt.close() assert_(im1.shape == im2.shape) assert_((im1 != im2).sum() < 0.1 * im1.size) astroML-0.3/astroML/plotting/tools.py0000644000076500000240000001125312420767763020457 0ustar jakevdpstaff00000000000000import os import numpy as np from matplotlib import pyplot as plt from scipy import interpolate from matplotlib import image from matplotlib.colors import LinearSegmentedColormap from matplotlib.transforms import Bbox from matplotlib.patches import Ellipse from ..py3k_compat import BytesIO def devectorize_axes(ax=None, dpi=None, transparent=True): """Convert axes contents to a png. This is useful when plotting many points, as the size of the saved file can become very large otherwise. Parameters ---------- ax : Axes instance (optional) Axes to de-vectorize. If None, this uses the current active axes (plt.gca()) dpi: int (optional) resolution of the png image. If not specified, the default from 'savefig.dpi' in rcParams will be used transparent : bool (optional) if True (default) then the PNG will be made transparent Returns ------- ax : Axes instance the in-place modified Axes instance Examples -------- The code can be used in the following way:: import matplotlib.pyplot as plt fig, ax = plt.subplots() x, y = np.random.random((2, 10000)) ax.scatter(x, y) devectorize_axes(ax) plt.savefig('devectorized.pdf') The resulting figure will be much smaller than the vectorized version. 
""" if ax is None: ax = plt.gca() fig = ax.figure axlim = ax.axis() # setup: make all visible spines (axes & ticks) & text invisible # we need to set these back later, so we save their current state _sp = {} _txt_vis = [t.get_visible() for t in ax.texts] for k in ax.spines: _sp[k] = ax.spines[k].get_visible() ax.spines[k].set_visible(False) for t in ax.texts: t.set_visible(False) _xax = ax.xaxis.get_visible() _yax = ax.yaxis.get_visible() _patch = ax.axesPatch.get_visible() ax.axesPatch.set_visible(False) ax.xaxis.set_visible(False) ax.yaxis.set_visible(False) # convert canvas to PNG extents = ax.bbox.extents / fig.dpi output = BytesIO() plt.savefig(output, format='png', dpi=dpi, transparent=transparent, bbox_inches=Bbox([extents[:2], extents[2:]])) output.seek(0) im = image.imread(output) # clear everything on axis (but not text) ax.lines = [] ax.patches = [] ax.tables = [] ax.artists = [] ax.images = [] ax.collections = [] # Show the image ax.imshow(im, extent=axlim, aspect='auto', interpolation='nearest') # restore all the spines & text for k in ax.spines: ax.spines[k].set_visible(_sp[k]) for t, v in zip(ax.texts, _txt_vis): t.set_visible(v) ax.axesPatch.set_visible(_patch) ax.xaxis.set_visible(_xax) ax.yaxis.set_visible(_yax) if plt.isinteractive(): plt.draw() return ax def discretize_cmap(cmap, N): """Return a discrete colormap from the continuous colormap cmap. Parameters ---------- cmap: colormap instance, eg. cm.jet. N: Number of colors. Returns ------- cmap_d: discretized colormap Example ------- >>> x = resize(arange(100), (5,100)) >>> djet = cmap_discretize(cm.jet, 5) """ cdict = cmap._segmentdata.copy() # N colors colors_i = np.linspace(0, 1., N) # N+1 indices indices = np.linspace(0, 1., N + 1) for key in ('red', 'green', 'blue'): # Find the N colors D = np.array(cdict[key]) I = interpolate.interp1d(D[:, 0], D[:, 1]) colors = I(colors_i) # Place these colors at the correct indices. A = np.zeros((N + 1, 3), float) A[:, 0] = indices A[1:, 1] = colors A[:-1, 2] = colors # Create a tuple for the dictionary. L = [] for l in A: L.append(tuple(l)) cdict[key] = tuple(L) # Return colormap object. return LinearSegmentedColormap('colormap', cdict, 1024) def draw_ellipse(mu, C, scales=[1, 2, 3], ax=None, **kwargs): if ax is None: ax = plt.gca() # find principal components and rotation angle of ellipse sigma_x2 = C[0, 0] sigma_y2 = C[1, 1] sigma_xy = C[0, 1] alpha = 0.5 * np.arctan2(2 * sigma_xy, (sigma_x2 - sigma_y2)) tmp1 = 0.5 * (sigma_x2 + sigma_y2) tmp2 = np.sqrt(0.25 * (sigma_x2 - sigma_y2) ** 2 + sigma_xy ** 2) sigma1 = np.sqrt(tmp1 + tmp2) sigma2 = np.sqrt(tmp1 - tmp2) for scale in scales: ax.add_patch(Ellipse((mu[0], mu[1]), 2 * scale * sigma1, 2 * scale * sigma2, alpha * 180. 
/ np.pi, **kwargs)) astroML-0.3/astroML/py3k_compat.py0000644000076500000240000000176112421515344017676 0ustar jakevdpstaff00000000000000""" Compatibility utilities for Python 2 & 3 """ import sys py3k = (sys.version_info[0] == 3) #---------------------------------------------------------------------- # urllib stuff if py3k: from urllib.request import urlopen from urllib.error import HTTPError from urllib.parse import urlencode else: from urllib2 import urlopen from urllib2 import HTTPError from urllib import urlencode def url_content_length(fhandle): if py3k: length = dict(fhandle.info())['Content-Length'] else: length = fhandle.info().getheader('Content-Length') return int(length.strip()) #---------------------------------------------------------------------- # pickle stuff if py3k: from pickle import load, dump else: from cPickle import load, dump #---------------------------------------------------------------------- # StringIO if py3k: from io import StringIO, BytesIO else: from cStringIO import StringIO from cStringIO import StringIO as BytesIO astroML-0.3/astroML/resample.py0000644000076500000240000001350012420767763017264 0ustar jakevdpstaff00000000000000import numpy as np from astroML.utils import check_random_state def bootstrap(data, n_bootstraps, user_statistic, kwargs=None, pass_indices=False, random_state=None): """Compute bootstraped statistics of a dataset. Parameters ---------- data : array_like A 1-dimensional data array of size n_samples n_bootstraps : integer the number of bootstrap samples to compute. Note that internally, two arrays of size (n_bootstraps, n_samples) will be allocated. For very large numbers of bootstraps, this can cause memory issues. user_statistic : function The statistic to be computed. This should take an array of data of size (n_bootstraps, n_samples) and return the row-wise statistics of the data. kwargs : dictionary (optional) A dictionary of keyword arguments to be passed to the user_statistic function. pass_indices : boolean (optional) if True, then the indices of the points rather than the points themselves are passed to `user_statistic` random_state: RandomState or an int seed (0 by default) A random number generator instance Returns ------- distribution : ndarray the bootstrapped distribution of statistics (length = n_bootstraps) """ # we don't set kwargs={} by default in the argument list, because using # a mutable type as a default argument can lead to strange results if kwargs is None: kwargs = {} rng = check_random_state(random_state) data = np.asarray(data) n_samples = data.size if data.ndim != 1: raise ValueError("bootstrap expects 1-dimensional data") # Generate random indices with repetition ind = rng.randint(n_samples, size=(n_bootstraps, n_samples)) # Call the function if pass_indices: stat_bootstrap = user_statistic(ind, **kwargs) else: stat_bootstrap = user_statistic(data[ind], **kwargs) # compute the statistic on the data return stat_bootstrap def jackknife(data, user_statistic, kwargs=None, return_raw_distribution=False, pass_indices=False): """Compute first-order jackknife statistics of the data. Parameters ---------- data : array_like A 1-dimensional data array of size n_samples user_statistic : function The statistic to be computed. This should take an array of data of size (n_samples, n_samples - 1) and return an array of size n_samples or tuple of arrays of size n_samples, representing the row-wise statistics of the input. 
    kwargs : dictionary (optional)
        A dictionary of keyword arguments to be passed to the
        user_statistic function.
    return_raw_distribution : boolean (optional)
        if True, return the raw jackknife distribution.  Be aware that
        this distribution is not reflective of the true distribution: it
        is simply an intermediate step in the jackknife calculation
    pass_indices : boolean (optional)
        if True, then the indices of the points rather than the points
        themselves are passed to `user_statistic`

    Returns
    -------
    mean, stdev : floats
        The mean and standard deviation of the jackknifed distribution
    raw_distribution : ndarray
        Returned only if `return_raw_distribution` is True.
        The array containing the raw distribution (length n_samples).
        Be aware that this distribution is not reflective of the true
        distribution: it is simply an intermediate step in the jackknife
        calculation

    Notes
    -----
    This implementation is a leave-one-out jackknife.  Jackknife resampling
    is known to fail on rank-based statistics (e.g. median, quartiles,
    etc.)  It works well on smooth statistics (e.g. mean, standard
    deviation, etc.)
    """
    # we don't set kwargs={} by default in the argument list, because using
    # a mutable type as a default argument can lead to strange results
    if kwargs is None:
        kwargs = {}

    data = np.asarray(data)
    n_samples = data.size

    if data.ndim != 1:
        raise ValueError("jackknife expects 1-dimensional data")

    # generate indices for the entire dataset, converting to row vector
    ind0 = np.arange(n_samples)[np.newaxis, :]

    # generate sets of indices where a single datapoint is left-out
    ind = np.arange(n_samples, dtype=int)
    ind = np.vstack([np.hstack((ind[:i], ind[i + 1:]))
                     for i in ind])

    # compute the statistic for the whole dataset
    if pass_indices:
        stat_data = user_statistic(ind0, **kwargs)
        stat_jackknife = user_statistic(ind, **kwargs)
    else:
        stat_data = user_statistic(data[ind0], **kwargs)
        stat_jackknife = user_statistic(data[ind], **kwargs)

    # handle multiple statistics:
    # if ndim=0, then the statistic is not operating on rows (error).
    # if ndim=1, then it's a single statistic returned
    # if ndim=2, then a tuple has been returned
    stat_data = np.asarray(stat_data)
    ndim = stat_data.ndim
    if ndim == 0:
        raise ValueError("user_statistic should return row-wise statistics")

    stat_data = np.atleast_2d(stat_data).T
    stat_jackknife = np.atleast_2d(stat_jackknife)

    # compute the jackknife correction formula
    delta_stat = (n_samples - 1) * (stat_data - stat_jackknife.mean(1))
    stat_corrected = (stat_data + delta_stat)[0]
    sigma_stat = np.sqrt(1.
/ n_samples / (n_samples + 1) * np.sum((n_samples * stat_data - stat_corrected - (n_samples - 1) * stat_jackknife.T) ** 2, 0)) if return_raw_distribution: results = tuple(zip(stat_corrected, sigma_stat, stat_jackknife)) else: results = tuple(zip(stat_corrected, sigma_stat)) if ndim == 1: return results[0] else: return results astroML-0.3/astroML/stats/0000755000076500000240000000000012462244012016220 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/stats/__init__.py0000644000076500000240000000042312420767763020351 0ustar jakevdpstaff00000000000000from ._binned_statistic import (binned_statistic, binned_statistic_2d, binned_statistic_dd) from ._point_statistics import \ mean_sigma, sigmaG, median_sigmaG, fit_bivariate_normal from .random import bivariate_normal, trunc_exp, linear astroML-0.3/astroML/stats/_binned_statistic.py0000644000076500000240000003201212115147567022271 0ustar jakevdpstaff00000000000000import numpy as np def binned_statistic(x, values, statistic='mean', bins=10, range=None): """ Compute a binned statistic for a set of data. This is a generalization of a histogram function. A histogram divides the space into bins, and returns the count of the number of points in each bin. This function allows the computation of the sum, mean, median, or other statistic of the values within each bin. Parameters ---------- x : array_like A sequence of values to be binned. values : array_like The values on which the statistic will be computed. This must be the same shape as x. statistic : string or callable, optional The statistic to compute (default is 'mean'). The following statistics are available: * 'mean' : compute the mean of values for points within each bin. Empty bins will be represented by NaN. * 'median' : compute the median of values for points within each bin. Empty bins will be represented by NaN. * 'count' : compute the count of points within each bin. This is identical to an unweighted histogram. `values` array is not referenced. * 'sum' : compute the sum of values for points within each bin. This is identical to a weighted histogram. * function : a user-defined function which takes a 1D array of values, and outputs a single numerical statistic. This function will be called on the values in each bin. Empty bins will be represented by function([]), or NaN if this returns an error. bins : int or sequence of scalars, optional If `bins` is an int, it defines the number of equal-width bins in the given range (10, by default). If `bins` is a sequence, it defines the bin edges, including the rightmost edge, allowing for non-uniform bin widths. range : (float, float), optional The lower and upper range of the bins. If not provided, range is simply ``(x.min(), x.max())``. Values outside the range are ignored. Returns ------- statistic : array The values of the selected statistic in each bin. bin_edges : array of dtype float Return the bin edges ``(length(statistic)+1)``. Notes ----- All but the last (righthand-most) bin is half-open. In other words, if `bins` is:: [1, 2, 3, 4] then the first bin is ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes* 4. 
    Examples
    --------
    >>> binned_statistic([1, 2, 1], [1, 1, 1], statistic='count',
    ...                  bins=[0, 1, 2, 3])
    (array([ 0.,  2.,  1.]), array([ 0.,  1.,  2.,  3.]))

    See Also
    --------
    np.histogram, binned_statistic_2d, binned_statistic_dd
    """
    try:
        N = len(bins)
    except TypeError:
        N = 1

    if N != 1:
        bins = [np.asarray(bins, float)]

    medians, edges = binned_statistic_dd([x], values, statistic, bins, range)
    return medians, edges[0]


def binned_statistic_2d(x, y, values, statistic='mean',
                        bins=10, range=None):
    """
    Compute a bidimensional binned statistic for a set of data.

    This is a generalization of a histogram2d function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.

    Parameters
    ----------
    x : array_like
        A sequence of values to be binned along the first dimension.
    y : array_like
        A sequence of values to be binned along the second dimension.
    values : array_like
        The values on which the statistic will be computed.  This must be
        the same shape as x.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').
        The following statistics are available:

        * 'mean' : compute the mean of values for points within each bin.
          Empty bins will be represented by NaN.
        * 'median' : compute the median of values for points within each
          bin. Empty bins will be represented by NaN.
        * 'count' : compute the count of points within each bin.  This is
          identical to an unweighted histogram.  `values` array is not
          referenced.
        * 'sum' : compute the sum of values for points within each bin.
          This is identical to a weighted histogram.
        * function : a user-defined function which takes a 1D array of
          values, and outputs a single numerical statistic. This function
          will be called on the values in each bin.  Empty bins will be
          represented by function([]), or NaN if this returns an error.

    bins : int or [int, int] or array-like or [array, array], optional
        The bin specification:

        * the number of bins for the two dimensions (nx=ny=bins),
        * the number of bins in each dimension (nx, ny = bins),
        * the bin edges for the two dimensions (x_edges=y_edges=bins),
        * the bin edges in each dimension (x_edges, y_edges = bins).

    range : array_like, shape(2,2), optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in the `bins` parameters):
        [[xmin, xmax], [ymin, ymax]]. All values outside of this range
        will be considered outliers and not tallied in the histogram.

    Returns
    -------
    statistic : ndarray, shape(nx, ny)
        The values of the selected statistic in each two-dimensional bin
    xedges : ndarray, shape(nx + 1,)
        The bin edges along the first dimension.
    yedges : ndarray, shape(ny + 1,)
        The bin edges along the second dimension.

    See Also
    --------
    np.histogram2d, binned_statistic, binned_statistic_dd
    """
    # This code is based on np.histogram2d
    try:
        N = len(bins)
    except TypeError:
        N = 1

    if N != 1 and N != 2:
        xedges = yedges = np.asarray(bins, float)
        bins = [xedges, yedges]

    medians, edges = binned_statistic_dd([x, y], values, statistic,
                                         bins, range)
    return medians, edges[0], edges[1]


def binned_statistic_dd(sample, values, statistic='mean',
                        bins=10, range=None):
    """
    Compute a multidimensional binned statistic for a set of data.

    This is a generalization of a histogramdd function.  A histogram divides
    the space into bins, and returns the count of the number of points in
    each bin.  This function allows the computation of the sum, mean, median,
    or other statistic of the values within each bin.
Parameters ---------- sample : array_like Data to histogram passed as a sequence of D arrays of length N, or as an (N,D) array. values : array_like The values on which the statistic will be computed. This must be the same shape as x. statistic : string or callable, optional The statistic to compute (default is 'mean'). The following statistics are available: * 'mean' : compute the mean of values for points within each bin. Empty bins will be represented by NaN. * 'median' : compute the median of values for points within each bin. Empty bins will be represented by NaN. * 'count' : compute the count of points within each bin. This is identical to an unweighted histogram. `values` array is not referenced. * 'sum' : compute the sum of values for points within each bin. This is identical to a weighted histogram. * function : a user-defined function which takes a 1D array of values, and outputs a single numerical statistic. This function will be called on the values in each bin. Empty bins will be represented by function([]), or NaN if this returns an error. bins : sequence or int, optional The bin specification: * A sequence of arrays describing the bin edges along each dimension. * The number of bins for each dimension (nx, ny, ... =bins) * The number of bins for all dimensions (nx=ny=...=bins). range : sequence, optional A sequence of lower and upper bin edges to be used if the edges are not given explicitely in `bins`. Defaults to the minimum and maximum values along each dimension. Returns ------- statistic : ndarray, shape(nx1, nx2, nx3,...) The values of the selected statistic in each two-dimensional bin edges : list of ndarrays A list of D arrays describing the (nxi + 1) bin edges for each dimension See Also -------- np.histogramdd, binned_statistic, binned_statistic_2d """ if type(statistic) == str: if statistic not in ['mean', 'median', 'count', 'sum']: raise ValueError('unrecognized statistic "%s"' % statistic) elif callable(statistic): pass else: raise ValueError("statistic not understood") # This code is based on np.histogramdd try: # Sample is an ND-array. N, D = sample.shape except (AttributeError, ValueError): # Sample is a sequence of 1D arrays. sample = np.atleast_2d(sample).T N, D = sample.shape nbin = np.empty(D, int) edges = D * [None] dedges = D * [None] try: M = len(bins) if M != D: raise AttributeError('The dimension of bins must be equal ' 'to the dimension of the sample x.') except TypeError: bins = D * [bins] # Select range for each dimension # Used only if number of bins is given. if range is None: smin = np.atleast_1d(np.array(sample.min(0), float)) smax = np.atleast_1d(np.array(sample.max(0), float)) else: smin = np.zeros(D) smax = np.zeros(D) for i in np.arange(D): smin[i], smax[i] = range[i] # Make sure the bins have a finite width. for i in np.arange(len(smin)): if smin[i] == smax[i]: smin[i] = smin[i] - .5 smax[i] = smax[i] + .5 # Create edge arrays for i in np.arange(D): if np.isscalar(bins[i]): nbin[i] = bins[i] + 2 # +2 for outlier bins edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1) else: edges[i] = np.asarray(bins[i], float) nbin[i] = len(edges[i]) + 1 # +1 for outlier bins dedges[i] = np.diff(edges[i]) nbin = np.asarray(nbin) # Compute the bin number each sample falls into. Ncount = {} for i in np.arange(D): Ncount[i] = np.digitize(sample[:, i], edges[i]) # Using digitize, values that fall on an edge are put in the right bin. # For the rightmost bin, we want values equal to the right # edge to be counted in the last bin, and not as an outlier. 
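    # np.digitize assigns samples exactly equal to the rightmost edge an
    # index of len(edges[i]), which is the outlier bin; the rounding-based
    # comparison below shifts those samples back into the last valid bin.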
outliers = np.zeros(N, int) for i in np.arange(D): # Rounding precision decimal = int(-np.log10(dedges[i].min())) + 6 # Find which points are on the rightmost edge. on_edge = np.where(np.around(sample[:, i], decimal) == np.around(edges[i][-1], decimal))[0] # Shift these points one bin to the left. Ncount[i][on_edge] -= 1 # Compute the sample indices in the flattened statistic matrix. ni = nbin.argsort() shape = [] xy = np.zeros(N, int) for i in np.arange(0, D - 1): xy += Ncount[ni[i]] * nbin[ni[i + 1:]].prod() xy += Ncount[ni[-1]] result = np.empty(nbin.prod(), float) if statistic == 'mean': result.fill(np.nan) flatcount = np.bincount(xy, None) flatsum = np.bincount(xy, values) a = np.arange(len(flatcount)) result[a] = flatsum result[a] /= flatcount elif statistic == 'count': result.fill(0) flatcount = np.bincount(xy, None) a = np.arange(len(flatcount)) result[a] = flatcount elif statistic == 'sum': result.fill(0) flatsum = np.bincount(xy, values) a = np.arange(len(flatsum)) result[a] = flatsum elif statistic == 'median': result.fill(np.nan) for i in np.unique(xy): result[i] = np.median(values[xy == i]) elif callable(statistic): try: null = statistic([]) except: null = np.nan result.fill(null) for i in np.unique(xy): result[i] = statistic(values[xy == i]) # Shape into a proper matrix result = result.reshape(np.sort(nbin)) for i in np.arange(nbin.size): j = ni.argsort()[i] result = result.swapaxes(i, j) ni[i], ni[j] = ni[j], ni[i] # Remove outliers (indices 0 and -1 for each dimension). core = D * [slice(1, -1)] result = result[core] if (result.shape != nbin - 2).any(): raise RuntimeError('Internal Shape Error') return result, edges astroML-0.3/astroML/stats/_point_statistics.py0000644000076500000240000002110112252721253022333 0ustar jakevdpstaff00000000000000import numpy as np from scipy import stats #from scipy.special import erfinv #sigmaG_factor = 1. / (2 * np.sqrt(2) * erfinv(0.5)) sigmaG_factor = 0.74130110925280102 def mean_sigma(a, axis=None, dtype=None, ddof=0, keepdims=False): """Compute mean and standard deviation for an array Parameters ---------- a : array_like Array containing numbers whose mean is desired. If `a` is not an array, a conversion is attempted. axis : int, optional Axis along which the means are computed. The default is to compute the mean of the flattened array. dtype : dtype, optional Type to use in computing the standard deviation. For arrays of integer type the default is float64, for arrays of float types it is the same as the array type. keepdims : bool, optional If this is set to True, the axes which are reduced are left in the result as dimensions with size one. With this option, the result will broadcast correctly against the original `arr`. Returns ------- mu : ndarray, see dtype parameter above array containing the mean values sigma : ndarray, see dtype parameter above. array containing the standard deviation See Also -------- median_sigmaG : robust rank-based version of this calculation. Notes ----- This routine simply calls ``np.mean`` and ``np.std``, passing the keyword arguments to them. 
    It is provided for ease of comparison with the function median_sigmaG()
    """
    mu = np.mean(a, axis=axis, dtype=dtype)
    sigma = np.std(a, axis=axis, dtype=dtype, ddof=ddof)

    if keepdims:
        if axis is None:
            newshape = a.ndim * (1,)
        else:
            newshape = np.asarray(a.shape)
            newshape[axis] = 1
        mu = mu.reshape(newshape)
        sigma = sigma.reshape(newshape)

    return mu, sigma


def median_sigmaG(a, axis=None, overwrite_input=False, keepdims=False):
    """Compute median and rank-based estimate of the standard deviation

    Parameters
    ----------
    a : array_like
        Array containing numbers whose mean is desired. If `a` is not an
        array, a conversion is attempted.
    axis : int, optional
        Axis along which the means are computed. The default is to compute
        the mean of the flattened array.
    overwrite_input : bool, optional
        If True, then allow use of memory of input array `a` for
        calculations. The input array will be modified by the call to
        median. This will save memory when you do not need to preserve
        the contents of the input array. Treat the input as undefined,
        but it will probably be fully or partially sorted. Default is
        False. Note that, if `overwrite_input` is True and the input is
        not already an array, an error will be raised.
    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original `arr`.

    Returns
    -------
    median : ndarray, see dtype parameter above
        array containing the median values
    sigmaG : ndarray, see dtype parameter above.
        array containing the robust estimator of the standard deviation

    See Also
    --------
    mean_sigma : non-robust version of this calculation
    sigmaG : robust rank-based estimate of standard deviation

    Notes
    -----
    This routine uses a single call to ``np.percentile`` to find the
    quartiles along the given axis, and uses these to compute the
    median and sigmaG:

    median = q50

    sigmaG = (q75 - q25) * 0.7413

    where 0.7413 ~ 1 / (2 sqrt(2) erf^-1(0.5))
    """
    q25, median, q75 = np.percentile(a, [25, 50, 75],
                                     axis=axis,
                                     overwrite_input=overwrite_input)
    sigmaG = sigmaG_factor * (q75 - q25)

    if keepdims:
        if axis is None:
            newshape = a.ndim * (1,)
        else:
            newshape = np.asarray(a.shape)
            newshape[axis] = 1
        median = median.reshape(newshape)
        sigmaG = sigmaG.reshape(newshape)

    return median, sigmaG


def sigmaG(a, axis=None, overwrite_input=False, keepdims=False):
    """Compute the rank-based estimate of the standard deviation

    Parameters
    ----------
    a : array_like
        Array containing numbers whose spread is desired. If `a` is not
        an array, a conversion is attempted.
    axis : int, optional
        Axis along which the means are computed. The default is to compute
        the mean of the flattened array.
    overwrite_input : bool, optional
        If True, then allow use of memory of input array `a` for
        calculations. The input array will be modified by the call to
        median. This will save memory when you do not need to preserve
        the contents of the input array. Treat the input as undefined,
        but it will probably be fully or partially sorted. Default is
        False. Note that, if `overwrite_input` is True and the input is
        not already an array, an error will be raised.
    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original `arr`.

    Returns
    -------
    sigmaG : ndarray, see dtype parameter above.
array containing the robust estimator of the standard deviation See Also -------- median_sigmaG : robust rank-based estimate of mean and standard deviation Notes ----- This routine uses a single call to ``np.percentile`` to find the quartiles along the given axis, and uses these to compute the sigmaG, a robust estimate of the standard deviation sigma: sigmaG = 0.7413 * (q75 - q25) where 0.7413 ~ 1 / (2 sqrt(2) erf^-1(0.5)) """ q25, q75 = np.percentile(a, [25, 75], axis=axis, overwrite_input=overwrite_input) sigmaG = sigmaG_factor * (q75 - q25) if keepdims: if axis is None: newshape = a.ndim * (1,) else: newshape = np.asarray(a.shape) newshape[axis] = 1 sigmaG = sigmaG.reshape(newshape) return sigmaG def fit_bivariate_normal(x, y, robust=False): """Fit bivariate normal parameters to a 2D distribution of points Parameters ---------- x, y : array_like The x, y coordinates of the points robust : boolean (optional, default=False) If True, then use rank-based statistics which are robust to outliers Otherwise, use mean/std statistics which are not robust Returns ------- mu : tuple (x, y) location of the best-fit bivariate normal sigma_1, sigma_2 : float The best-fit gaussian widths in the uncorrelated frame alpha : float The rotation angle in radians of the uncorrelated frame """ x = np.asarray(x) y = np.asarray(y) assert x.shape == y.shape if robust: # use quartiles to compute center and spread med_x, sigmaG_x = median_sigmaG(x) med_y, sigmaG_y = median_sigmaG(y) # define the principal variables from Shevlyakov & Smirnov (2011) sx = 2 * sigmaG_x sy = 2 * sigmaG_y u = (x / sx + y / sy) / np.sqrt(2) v = (x / sx - y / sy) / np.sqrt(2) med_u, sigmaG_u = median_sigmaG(u) med_v, sigmaG_v = median_sigmaG(v) r_xy = ((sigmaG_u ** 2 - sigmaG_v ** 2) / (sigmaG_u ** 2 + sigmaG_v ** 2)) # rename estimators mu_x, mu_y = med_x, med_y sigma_x, sigma_y = sigmaG_x, sigmaG_y else: mu_x = np.mean(x) sigma_x = np.std(x) mu_y = np.mean(y) sigma_y = np.std(y) r_xy = stats.pearsonr(x, y)[0] # We need to use the full (-180, 180) version of arctan: this is # np.arctan2(x, y) = np.arctan(x / y), modulo 180 degrees sigma_xy = r_xy * sigma_x * sigma_y alpha = 0.5 * np.arctan2(2 * sigma_xy, sigma_x ** 2 - sigma_y ** 2) sigma1 = np.sqrt((0.5 * (sigma_x ** 2 + sigma_y ** 2) + np.sqrt(0.25 * (sigma_x ** 2 - sigma_y ** 2) ** 2 + sigma_xy ** 2))) sigma2 = np.sqrt((0.5 * (sigma_x ** 2 + sigma_y ** 2) - np.sqrt(0.25 * (sigma_x ** 2 - sigma_y ** 2) ** 2 + sigma_xy ** 2))) return [mu_x, mu_y], sigma1, sigma2, alpha astroML-0.3/astroML/stats/random.py0000644000076500000240000000746212115147567020077 0ustar jakevdpstaff00000000000000""" Statistics for astronomy """ import numpy as np from scipy.stats.distributions import rv_continuous def bivariate_normal(mu=[0, 0], sigma_1=1, sigma_2=1, alpha=0, size=None, return_cov=False): """Sample points from a 2D normal distribution Parameters ---------- mu : array-like (length 2) The mean of the distribution sigma_1 : float The unrotated x-axis width sigma_2 : float The unrotated y-axis width alpha : float The rotation counter-clockwise about the origin size : tuple of ints, optional Given a shape of, for example, ``(m,n,k)``, ``m*n*k`` samples are generated, and packed in an `m`-by-`n`-by-`k` arrangement. Because each sample is `N`-dimensional, the output shape is ``(m,n,k,N)``. If no shape is specified, a single (`N`-D) sample is returned. return_cov : boolean, optional If True, return the computed covariance matrix. 
    Returns
    -------
    out : ndarray
        The drawn samples, of shape *size*, if that was provided.  If not,
        the shape is ``(N,)``.  In other words, each entry
        ``out[i,j,...,:]`` is an N-dimensional value drawn from the
        distribution.
    cov : ndarray
        The 2x2 covariance matrix.  Returned only if return_cov == True.

    Notes
    -----
    This function works by computing a covariance matrix from the inputs,
    and calling ``np.random.multivariate_normal()``.  If the covariance
    matrix is available, ``np.random.multivariate_normal`` can be called
    directly.
    """
    # compute covariance matrix
    sigma_xx = ((sigma_1 * np.cos(alpha)) ** 2
                + (sigma_2 * np.sin(alpha)) ** 2)
    sigma_yy = ((sigma_1 * np.sin(alpha)) ** 2
                + (sigma_2 * np.cos(alpha)) ** 2)
    sigma_xy = (sigma_1 ** 2 - sigma_2 ** 2) * np.sin(alpha) * np.cos(alpha)

    cov = np.array([[sigma_xx, sigma_xy],
                    [sigma_xy, sigma_yy]])

    # draw points from the distribution
    x = np.random.multivariate_normal(mu, cov, size)

    if return_cov:
        return x, cov
    else:
        return x


#----------------------------------------------------------------------
# Define some new distributions based on rv_continuous
class trunc_exp_gen(rv_continuous):
    """A truncated positive exponential continuous random variable.

    The probability distribution is::

        p(x) ~ exp(k * x)    between a and b
             = 0             otherwise

    The arguments are (a, b, k)

    %(before_notes)s

    %(example)s
    """
    def _argcheck(self, a, b, k):
        self._const = k / (np.exp(k * b) - np.exp(k * a))
        return (a != b) and not np.isinf(k)

    def _pdf(self, x, a, b, k):
        pdf = self._const * np.exp(k * x)
        pdf[(x < a) | (x > b)] = 0
        return pdf

    def _rvs(self, a, b, k):
        y = np.random.random(self._size)
        # inverse-CDF sampling: y * k / self._const = y * (exp(k*b) - exp(k*a)),
        # so this maps y in [0, 1] onto x in [a, b]
        return (1. / k) * np.log(np.exp(k * a) + y * k / self._const)

trunc_exp = trunc_exp_gen(name="trunc_exp", shapes='a, b, k')


class linear_gen(rv_continuous):
    """A truncated linear continuous random variable.

    The probability distribution is::

        p(x) ~ c * x + d    between a and b
             = 0            otherwise

    The arguments are (a, b, c).  d is set by the normalization

    %(before_notes)s

    %(example)s
    """
    def _argcheck(self, a, b, c):
        return (a != b) and not np.isinf(c)

    def _pdf(self, x, a, b, c):
        d = 1. / (b - a) - 0.5 * c * (b + a)
        pdf = c * x + d
        pdf[(x < a) | (x > b)] = 0
        return pdf

    def _rvs(self, a, b, c):
        mu = 0.5 * (a + b)
        W = (b - a)
        x0 = 1. / c / W - mu
        r = np.random.random(self._size)
        return -x0 + np.sqrt(2. * r / c + a * a + 2.
* a * x0 + x0 * x0) linear = linear_gen(name="linear", shapes='a, b, c') astroML-0.3/astroML/stats/tests/0000755000076500000240000000000012462244012017362 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/stats/tests/__init__.py0000644000076500000240000000000012252721253021465 0ustar jakevdpstaff00000000000000astroML-0.3/astroML/stats/tests/test_binned_statistic.py0000644000076500000240000000774112115147567024346 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_array_almost_equal from astroML.stats import \ binned_statistic, binned_statistic_2d, binned_statistic_dd def test_1d_count(): x = np.random.random(100) v = np.random.random(100) count1, edges1 = binned_statistic(x, v, 'count', bins=10) count2, edges2 = np.histogram(x, bins=10) assert_array_almost_equal(count1, count2) assert_array_almost_equal(edges1, edges2) def test_1d_sum(): x = np.random.random(100) v = np.random.random(100) sum1, edges1 = binned_statistic(x, v, 'sum', bins=10) sum2, edges2 = np.histogram(x, bins=10, weights=v) assert_array_almost_equal(sum1, sum2) assert_array_almost_equal(edges1, edges2) def test_1d_mean(): x = np.random.random(100) v = np.random.random(100) stat1, edges1 = binned_statistic(x, v, 'mean', bins=10) stat2, edges2 = binned_statistic(x, v, np.mean, bins=10) assert_array_almost_equal(stat1, stat2) assert_array_almost_equal(edges1, edges2) def test_1d_median(): x = np.random.random(100) v = np.random.random(100) stat1, edges1 = binned_statistic(x, v, 'median', bins=10) stat2, edges2 = binned_statistic(x, v, np.median, bins=10) assert_array_almost_equal(stat1, stat2) assert_array_almost_equal(edges1, edges2) def test_2d_count(): x = np.random.random(100) y = np.random.random(100) v = np.random.random(100) count1, binx1, biny1 = binned_statistic_2d(x, y, v, 'count', bins=5) count2, binx2, biny2 = np.histogram2d(x, y, bins=5) assert_array_almost_equal(count1, count2) assert_array_almost_equal(binx1, binx2) assert_array_almost_equal(biny1, biny2) def test_2d_sum(): x = np.random.random(100) y = np.random.random(100) v = np.random.random(100) sum1, binx1, biny1 = binned_statistic_2d(x, y, v, 'sum', bins=5) sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v) assert_array_almost_equal(sum1, sum2) assert_array_almost_equal(binx1, binx2) assert_array_almost_equal(biny1, biny2) def test_2d_mean(): x = np.random.random(100) y = np.random.random(100) v = np.random.random(100) stat1, binx1, biny1 = binned_statistic_2d(x, y, v, 'mean', bins=5) stat2, binx2, biny2 = binned_statistic_2d(x, y, v, np.mean, bins=5) assert_array_almost_equal(stat1, stat2) assert_array_almost_equal(binx1, binx2) assert_array_almost_equal(biny1, biny2) def test_2d_median(): x = np.random.random(100) y = np.random.random(100) v = np.random.random(100) stat1, binx1, biny1 = binned_statistic_2d(x, y, v, 'median', bins=5) stat2, binx2, biny2 = binned_statistic_2d(x, y, v, np.median, bins=5) assert_array_almost_equal(stat1, stat2) assert_array_almost_equal(binx1, binx2) assert_array_almost_equal(biny1, biny2) def test_dd_count(): X = np.random.random((100, 3)) v = np.random.random(100) count1, edges1 = binned_statistic_dd(X, v, 'count', bins=3) count2, edges2 = np.histogramdd(X, bins=3) assert_array_almost_equal(count1, count2) assert_array_almost_equal(edges1, edges2) def test_dd_sum(): X = np.random.random((100, 3)) v = np.random.random(100) sum1, edges1 = binned_statistic_dd(X, v, 'sum', bins=3) sum2, edges2 = np.histogramdd(X, bins=3, weights=v) assert_array_almost_equal(sum1, sum2) 
assert_array_almost_equal(edges1, edges2) def test_dd_mean(): X = np.random.random((100, 3)) v = np.random.random(100) stat1, edges1 = binned_statistic_dd(X, v, 'mean', bins=3) stat2, edges2 = binned_statistic_dd(X, v, np.mean, bins=3) assert_array_almost_equal(stat1, stat2) assert_array_almost_equal(edges1, edges2) def test_dd_median(): X = np.random.random((100, 3)) v = np.random.random(100) stat1, edges1 = binned_statistic_dd(X, v, 'median', bins=3) stat2, edges2 = binned_statistic_dd(X, v, np.median, bins=3) assert_array_almost_equal(stat1, stat2) assert_array_almost_equal(edges1, edges2) astroML-0.3/astroML/stats/tests/test_stats.py0000644000076500000240000001114012420767763022147 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import numpy as np from numpy.testing import (assert_array_almost_equal, assert_array_equal, assert_equal, assert_allclose) from astroML.stats import (mean_sigma, median_sigmaG, sigmaG, fit_bivariate_normal) from astroML.stats.random import bivariate_normal #--------------------------------------------------------------------------- # Check that mean_sigma() returns the same values as np.mean() and np.std() def check_mean_sigma(a, axis=None, ddof=0): mu1, sigma1 = mean_sigma(a, axis=axis, ddof=ddof) mu2 = np.mean(a, axis=axis) sigma2 = np.std(a, axis=axis, ddof=ddof) assert_array_almost_equal(mu1, mu2) assert_array_almost_equal(sigma1, sigma2) def test_mean_sigma(): np.random.seed(0) for shape in [(4, ), (4, 5), (4, 5, 6)]: a = np.random.random(shape) for axis in (None, 0): for ddof in (0, 1): yield (check_mean_sigma, a, axis, ddof) #--------------------------------------------------------------------------- # Check that the keepdims argument works as expected # we'll later compare median_sigmaG to these results, so that # is effectively tested as well. def check_mean_sigma_keepdims(a, axis): mu1, sigma1 = mean_sigma(a, axis, keepdims=False) mu2, sigma2 = mean_sigma(a, axis, keepdims=True) assert_array_equal(mu1.ravel(), mu2.ravel()) assert_array_equal(sigma1.ravel(), sigma2.ravel()) assert_array_equal(np.broadcast(a, mu2).shape, a.shape) assert_array_equal(np.broadcast(a, sigma2).shape, a.shape) def test_mean_sigma_keepdims(): np.random.seed(0) a = np.random.random((4, 5, 6)) for axis in [None, 0, 1, 2]: yield (check_mean_sigma_keepdims, a, axis) #--------------------------------------------------------------------------- # Check that median_sigmaG matches the values computed using np.percentile # and np.median def check_median_sigmaG(a, axis): from scipy.special import erfinv factor = 1. / (2 * np.sqrt(2) * erfinv(0.5)) med1, sigmaG1 = median_sigmaG(a, axis=axis) med2 = np.median(a, axis=axis) q25, q75 = np.percentile(a, [25, 75], axis=axis) sigmaG2 = factor * (q75 - q25) assert_array_almost_equal(med1, med2) assert_array_almost_equal(sigmaG1, sigmaG2) def test_median_sigmaG(): np.random.seed(0) a = np.random.random((20, 40, 60)) for axis in [None, 0, 1, 2]: yield (check_median_sigmaG, a, axis) def check_sigmaG(a, axis): from scipy.special import erfinv factor = 1. 
/ (2 * np.sqrt(2) * erfinv(0.5)) sigmaG1 = sigmaG(a, axis=axis) q25, q75 = np.percentile(a, [25, 75], axis=axis) sigmaG2 = factor * (q75 - q25) assert_array_almost_equal(sigmaG1, sigmaG2)
def test_sigmaG(): np.random.seed(0) a = np.random.random((20, 40, 60)) for axis in [None, 0, 1, 2]: yield (check_sigmaG, a, axis)
#--------------------------------------------------------------------------- # Check that median_sigmaG() is a good approximation of mean_sigma() # for normally-distributed data.
def check_median_sigmaG_approx(a, axis, keepdims, atol=0.15): med, sigmaG = median_sigmaG(a, axis=axis, keepdims=keepdims) mu, sigma = mean_sigma(a, axis=axis, ddof=1, keepdims=keepdims) assert_allclose(med, mu, atol=atol) assert_allclose(sigmaG, sigma, atol=atol)
def test_median_sigmaG_approx(): np.random.seed(0) a = np.random.normal(0, 1, size=(10, 10000)) for axis in (None, 1): for keepdims in (True, False): yield (check_median_sigmaG_approx, a, axis, keepdims, 0.02)
#--------------------------------------------------------------------------- # Check the bivariate normal fit
def check_fit_bivariate_normal(sigma1, sigma2, mu, alpha, N=1000): # poisson stats rtol = 2 * np.sqrt(N) / N x, y = bivariate_normal(mu, sigma1, sigma2, alpha, N).T mu_fit, sigma1_fit, sigma2_fit, alpha_fit = fit_bivariate_normal(x, y) if alpha_fit > np.pi / 2: alpha_fit -= np.pi elif alpha_fit < -np.pi / 2: alpha_fit += np.pi # Circular degeneracy in alpha: test sin(2*alpha) instead assert_allclose(np.sin(2 * alpha_fit), np.sin(2 * alpha), atol=2 * rtol) assert_allclose(mu, mu_fit, rtol=rtol) assert_allclose(sigma1_fit, sigma1, rtol=rtol) assert_allclose(sigma2_fit, sigma2, rtol=rtol)
def test_fit_bivariate_normal(sigma1=2.0, sigma2=1.0, N=1000): np.random.seed(0) mu = [10, 10] for alpha in np.linspace(-np.pi / 2, np.pi / 2, 7): yield check_fit_bivariate_normal, sigma1, sigma2, mu, alpha, N
astroML-0.3/astroML/sum_of_norms.py0000644000076500000240000001043212115147567020156 0ustar jakevdpstaff00000000000000""" Functions for regression using sums-of-norms """ import numpy as np
def norm(x, x0, sigma): return (1. / np.sqrt(2 * np.pi) / sigma * np.exp(-0.5 * (x - x0) ** 2 / sigma ** 2))
def sum_of_norms(x, y, num_gaussians=None, locs=None, widths=None, spacing='linear', full_output=False): r"""Approximate a function with a sum of gaussians Parameters ---------- x : array-like, shape = n_training The x-value of the input function y : array-like, shape = n_training The y-value of the input function num_gaussians : integer (optional) The number of gaussians to use. If this is not specified, then the number of items in `locs` is used. If neither is specified, this defaults to 30 locs : array-like (optional) The locations of the gaussians to use. If not specified, locations will be uniformly spaced between the end-points of x. widths : float or array-like (optional) The widths of the gaussians to use. If a single value, use this for all widths. If multiple values, the length must be equal to len(locs), if specified, and/or num_gaussians, if specified. If widths is not provided, each width defaults to the spacing between adjacent gaussian locations. full_output : boolean (default = False) if True, return the rms error of the best-fit, the list of locations, and the list of widths spacing : string, ['linear'|'log'] spacing to use for automatic determination of locs. 
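With spacing='log', both endpoints of x must be positive.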
Not referenced if locs is specified Returns ------- weights if full_output == False (weights, rms, locs, widths) if full_output == True weights : array-like, length = num_gaussians The weights which best approximate the spectrum. The reconstruction is given by sum_{i=1}^{num_gaussians} weights[i] * norm(locs[i], widths[i]) rms : float the root-mean-square error of the best-fit solution locs : array the locations of the gaussians used for the fit widths : array the widths of the gaussians used for the fit Notes ----- This is solved using linear regression. Our matrix :math:`X` has shape :math:`(m, n)` where :math:`m` is the number of training points, and :math:`n` is the number of gaussians in the fit. We seek the linear combination of these :math:`n` gaussians which minimizes the squared residual error, which in matrix form can be expressed .. math:: \epsilon = \min\left|y - Xw \right| here the vector :math:`w` encodes the linear combination. The vector :math:`w` which minimizes :math:`\epsilon` can be shown to be .. math:: w = (X^T X)^{-1} X^T y This is the result returned by this function. """ x, y = map(np.asarray, (x, y)) assert x.ndim == 1 assert y.shape == x.shape n_training = x.shape[0] if locs is None: if num_gaussians is None: num_gaussians = 30 if spacing == 'linear': locs = np.linspace(x[0], x[-1], num_gaussians) elif spacing == 'log': locs = np.logspace(np.log10(x[0]), np.log10(x[-1]), num_gaussians) else: locs = np.asarray(locs) if num_gaussians is None: num_gaussians = len(locs) if num_gaussians is not None: assert len(locs) == num_gaussians if widths is None: widths = np.zeros(num_gaussians) widths[:-1] = locs[1:] - locs[:-1] if len(widths) > 1: widths[-1] = widths[-2] else: widths[-1] = x[-1] - x[0] else: widths = np.atleast_1d(widths) assert widths.size in (1, num_gaussians) widths = widths + np.zeros(num_gaussians) # broadcast to shape # use broadcasting to compute X in one go, without slow loops X = norm(x.reshape(n_training, 1), locs.reshape(1, num_gaussians), widths.reshape(1, num_gaussians)) # use pinv rather than inv for numerical stability w_best = np.dot(np.linalg.pinv(np.dot(X.T, X)), np.dot(X.T, y)) if not full_output: return w_best else: rms = np.sqrt(np.mean((y - np.dot(X, w_best)) ** 2)) return w_best, rms, locs, widths
astroML-0.3/astroML/tests/0000755000076500000240000000000012462244012016224 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/tests/__init__.py0000644000076500000240000000000012252721253020327 0ustar jakevdpstaff00000000000000astroML-0.3/astroML/tests/test_filters.py0000644000076500000240000000201412252721253021306 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from astroML.filters import savitzky_golay, wiener_filter
def test_savitzky_golay(): y = np.zeros(100) y[::2] = 1 f = savitzky_golay(y, window_size=3, order=1) assert_allclose(f, (2 - y) / 3.)
def test_savitzky_golay_fft(): y = np.random.normal(size=100) for width in [3, 5]: for order in range(width - 1): f1 = savitzky_golay(y, width, order, use_fft=False) f2 = savitzky_golay(y, width, order, use_fft=True) assert_allclose(f1, f2)
def test_wiener_filter_simple(): t = np.linspace(0, 1, 256) h = np.zeros_like(t) h[::2] = 1000 s = wiener_filter(t, h) assert_allclose(s, np.mean(h))
def test_wiener_filter_spike(): np.random.seed(0) N = 2048 dt = 0.05 t = dt * np.arange(N) h = np.exp(-0.5 * ((t - 20.) 
/ 1.0) ** 2) + 10 hN = h + np.random.normal(0, 0.05, size=h.shape) h_smooth = wiener_filter(t, hN) assert_allclose(h, h_smooth, atol=0.03) astroML-0.3/astroML/tests/test_fourier.py0000644000076500000240000000543012115147567021326 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_allclose from astroML.fourier import\ FT_continuous, IFT_continuous, PSD_continuous, sinegauss, sinegauss_FT def check_wavelets(t0, f0, Q, t): h = sinegauss(t, t0, f0, Q) f, H = FT_continuous(t, h) H2 = sinegauss_FT(f, t0, f0, Q) assert_allclose(H, H2, atol=1E-8) def test_wavelets(): t = np.linspace(-10, 10, 10000) for t0 in (-1, 0, 1): for f0 in (1, 2): for Q in (1, 2): yield (check_wavelets, t0, f0, Q, t) def sinegauss(t, t0, f0, a): """Sine-gaussian wavelet""" return (np.exp(-a * (t - t0) ** 2) * np.exp(2j * np.pi * f0 * (t - t0))) def sinegauss_FT(f, t0, f0, a): """Fourier transform of the sine-gaussian wavelet. This uses the convention H(f) = integral[ h(t) exp(-2pi i f t) dt] """ return (np.sqrt(np.pi / a) * np.exp(-2j * np.pi * f * t0) * np.exp(-np.pi ** 2 * (f - f0) ** 2 / a)) def sinegauss_PSD(f, t0, f0, a): """PSD of the sine-gaussian wavelet PSD(f) = |H(f)|^2 + |H(-f)|^2 """ Pf = np.pi / a * np.exp(-2 * np.pi ** 2 * (f - f0) ** 2 / a) Pmf = np.pi / a * np.exp(-2 * np.pi ** 2 * (-f - f0) ** 2 / a) return Pf + Pmf def check_FT_continuous(a, t0, f0, method, t): h = sinegauss(t, t0, f0, a) f, H = FT_continuous(t, h, method=method) assert_allclose(H, sinegauss_FT(f, t0, f0, a), atol=1E-12) def test_FT_continuous(): t = np.linspace(-9, 10, 10000) for a in (1, 2): for t0 in (-2, 0, 2): for f0 in (-1, 0, 1): for method in (1, 2): yield (check_FT_continuous, a, t0, f0, method, t) def check_PSD_continuous(a, t0, f0, method, t): h = sinegauss(t, t0, f0, a) f, P = PSD_continuous(t, h, method=method) assert_allclose(P, sinegauss_PSD(f, t0, f0, a), atol=1E-12) def test_PSD_continuous(): t = np.linspace(-9, 10, 10000) for a in (1, 2): for t0 in (-2, 0, 2): for f0 in (-1, 0, 1): for method in (1, 2): yield (check_PSD_continuous, a, t0, f0, method, t) def check_IFT_continuous(a, t0, f0, method, f): H = sinegauss_FT(f, t0, f0, a) t, h = IFT_continuous(f, H, method=method) assert_allclose(h, sinegauss(t, t0, f0, a), atol=1E-12) def test_IFT_continuous(): f = np.linspace(-9, 10, 10000) for a in (1, 2): for t0 in (-2, 0, 2): for f0 in (-1, 0, 1): for method in (1, 2): yield (check_IFT_continuous, a, t0, f0, method, f) def test_IFT_FT(): # Test IFT(FT(x)) = x np.random.seed(0) t = -50 + 0.01 * np.arange(10000.) 
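# added note (not in the original test): the grid above spans [-50, 50) with dt = 0.01, so FT_continuous can resolve frequencies up to the Nyquist limit 1 / (2 * dt) = 50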
x = np.random.random(10000) f, y = FT_continuous(t, x) t, xp = IFT_continuous(f, y) assert_allclose(x, xp, atol=1E-7) astroML-0.3/astroML/tests/test_pickle_results.py0000644000076500000240000000141612115147567022703 0ustar jakevdpstaff00000000000000import os from astroML.decorators import pickle_results def test_pickle_results(): filename = 'tmp.pkl' @pickle_results('tmp.pkl') def foo(x): foo.called = True return x * x # cleanup if necessary if os.path.exists(filename): os.remove(filename) # initial calculation: function should be executed foo.called = False assert foo(4) == 16 assert foo.called is True # recalculation: function should not be executed foo.called = False assert foo(4) == 16 assert foo.called is False # recalculation with different input: function should be executed foo.called = False assert foo(5) == 25 assert foo.called is True # cleanup assert os.path.exists(filename) os.remove(filename) astroML-0.3/astroML/tests/test_resample.py0000644000076500000240000000454712420767763021500 0ustar jakevdpstaff00000000000000from __future__ import print_function, division import numpy as np from numpy.testing import assert_allclose, run_module_suite from astroML.resample import bootstrap, jackknife from astroML.stats import mean_sigma def test_jackknife_results(): np.random.seed(0) x = np.random.normal(0, 1, 100) mu1, sig1 = jackknife(x, np.mean, kwargs=dict(axis=1)) mu2, sig2 = jackknife(x, np.std, kwargs=dict(axis=1)) assert_allclose([mu1, sig1, mu2, sig2], [0.0598080155345, 0.100288031685, 1.01510470168, 0.0649020337599]) def test_jackknife_multiple(): np.random.seed(0) x = np.random.normal(0, 1, 100) mu1, sig1 = jackknife(x, np.mean, kwargs=dict(axis=1)) mu2, sig2 = jackknife(x, np.std, kwargs=dict(axis=1)) res = jackknife(x, mean_sigma, kwargs=dict(axis=1)) assert_allclose(res[0], (mu1, sig1)) assert_allclose(res[1], (mu2, sig2)) def test_bootstrap_results(): np.random.seed(0) x = np.random.normal(0, 1, 100) distribution = bootstrap(x, 100, np.mean, kwargs=dict(axis=1), random_state=0) mu, sigma = mean_sigma(distribution) assert_allclose([mu, sigma], [0.08139846, 0.10465327]) def test_bootstrap_multiple(): np.random.seed(0) x = np.random.normal(0, 1, 100) dist_mean = bootstrap(x, 100, np.mean, kwargs=dict(axis=1), random_state=0) dist_std = bootstrap(x, 100, np.std, kwargs=dict(axis=1), random_state=0) res = bootstrap(x, 100, mean_sigma, kwargs=dict(axis=1), random_state=0) assert_allclose(res[0], dist_mean) assert_allclose(res[1], dist_std) def test_bootstrap_pass_indices(): np.random.seed(0) x = np.random.normal(0, 1, 100) dist1 = bootstrap(x, 100, np.mean, kwargs=dict(axis=1), random_state=0) dist2 = bootstrap(x, 100, lambda i: np.mean(x[i], axis=1), pass_indices=True, random_state=0) assert_allclose(dist1, dist2) def test_jackknife_pass_indices(): np.random.seed(0) x = np.random.normal(0, 1, 100) res1 = jackknife(x, np.mean, kwargs=dict(axis=1)) res2 = jackknife(x, lambda i: np.mean(x[i], axis=1), pass_indices=True) assert_allclose(res1, res2) if __name__ == '__main__': run_module_suite() astroML-0.3/astroML/tests/test_utils.py0000644000076500000240000000441112115147567021011 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_almost_equal, assert_array_almost_equal from astroML.utils import logsumexp, log_multivariate_gaussian, convert_2D_cov def positive_definite_matrix(N, M=None): """return an array of M positive-definite matrices with shape (N, N)""" if M is None: V = np.random.random((N, N)) V = np.dot(V, V.T) else: V = np.random.random((M, 
N, N)) for i in range(M): V[i] = np.dot(V[i], V[i].T) return V def test_logsumexp(): np.random.seed(0) X = np.random.random((100, 100)) for axis in (None, 0, 1): np_result = np.log(np.sum(np.exp(X), axis=axis)) aML_result = logsumexp(X, axis=axis) assert_array_almost_equal(np_result, aML_result) def test_log_multivariate_gaussian_methods(): np.random.seed(0) x = np.random.random(3) mu = np.random.random(3) V = positive_definite_matrix(3, M=10) res1 = log_multivariate_gaussian(x, mu, V, method=0) res2 = log_multivariate_gaussian(x, mu, V, method=1) assert_array_almost_equal(res1, res2) def test_log_multivariate_gaussian(): np.random.seed(0) x = np.random.random((2, 1, 1, 3)) mu = np.random.random((3, 1, 3)) V = positive_definite_matrix(3, M=4) res1 = log_multivariate_gaussian(x, mu, V) assert res1.shape == (2, 3, 4) res2 = np.zeros_like(res1) for i in range(2): for j in range(3): for k in range(4): res2[i, j, k] = log_multivariate_gaussian(x[i, 0, 0], mu[j, 0], V[k]) assert_array_almost_equal(res1, res2) def test_log_multivariate_gaussian_Vinv(): np.random.seed(0) x = np.random.random((2, 1, 1, 3)) mu = np.random.random((3, 1, 3)) V = positive_definite_matrix(3, M=4) Vinv = np.array([np.linalg.inv(Vi) for Vi in V]) res1 = log_multivariate_gaussian(x, mu, V) res2 = log_multivariate_gaussian(x, mu, V, Vinv=Vinv) assert_array_almost_equal(res1, res2) def test_2D_cov(): s1 = 1.3 s2 = 1.0 alpha = 0.2 cov = convert_2D_cov(s1, s2, alpha) assert_array_almost_equal([s1, s2, alpha], convert_2D_cov(cov)) astroML-0.3/astroML/time_series/0000755000076500000240000000000012462244012017372 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/time_series/__init__.py0000644000076500000240000000041112420767763021520 0ustar jakevdpstaff00000000000000from .ACF import ACF_scargle, ACF_EK from .periodogram import lomb_scargle, lomb_scargle_bootstrap, \ lomb_scargle_AIC, lomb_scargle_BIC, multiterm_periodogram, \ search_frequencies, MultiTermFit from .generate import generate_power_law, generate_damped_RW astroML-0.3/astroML/time_series/_periodogram.py0000644000076500000240000001015612115147567022432 0ustar jakevdpstaff00000000000000import numpy as np def lomb_scargle(t, y, dy, omega, generalized=True, subtract_mean=True, significance=None): """ (Generalized) Lomb-Scargle Periodogram with Floating Mean Parameters ---------- t : array_like sequence of times y : array_like sequence of observations dy : array_like sequence of observational errors omega : array_like frequencies at which to evaluate p(omega) generalized : bool if True (default) use generalized lomb-scargle method otherwise, use classic lomb-scargle. subtract_mean : bool if True (default) subtract the sample mean from the data before computing the periodogram. Only referenced if generalized is False significance : None or float or ndarray if specified, then this is a list of significances to compute for the results. Returns ------- p : array_like Lomb-Scargle power associated with each frequency omega z : array_like if significance is specified, this gives the levels corresponding to the desired significance (using the Scargle 1982 formalism) Notes ----- The algorithm is based on reference [1]_. The result for generalized=False is given by equation 4 of this work, while the result for generalized=True is given by equation 20. Note that the normalization used in this reference is different from that used in other places in the literature (e.g. [2]_). For a discussion of normalization and false-alarm probability, see [1]_. 
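In this normalization the power is dimensionless, ranging from 0 (no improvement over a constant model) to 1 (a perfect fit to the data).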
To recover the normalization used in Scargle [3]_, the results should be multiplied by (N - 1) / 2 where N is the number of data points. References ---------- .. [1] M. Zechmeister and M. Kurster, A&A 496, 577-584 (2009) .. [2] W. Press et al, Numerical Recipies in C (2002) .. [3] Scargle, J.D. 1982, ApJ 263:835-853 """ t = np.asarray(t) y = np.asarray(y) dy = np.asarray(dy) * np.ones_like(y) assert t.ndim == 1 assert y.ndim == 1 assert dy.ndim == 1 assert t.shape == y.shape assert y.shape == dy.shape w = 1. / dy / dy w /= w.sum() # the generalized method takes care of offset automatically, # while the classic method requires centered data. if (not generalized) and subtract_mean: # subtract MLE for mean in the presence of noise. y = y - np.dot(w, y) omega = np.asarray(omega) shape = omega.shape omega = omega.ravel()[np.newaxis, :] t = t[:, np.newaxis] y = y[:, np.newaxis] dy = dy[:, np.newaxis] w = w[:, np.newaxis] sin_omega_t = np.sin(omega * t) cos_omega_t = np.cos(omega * t) # compute time-shift tau # S2 = np.dot(w.T, np.sin(2 * omega * t) S2 = 2 * np.dot(w.T, sin_omega_t * cos_omega_t) # C2 = np.dot(w.T, np.cos(2 * omega * t) C2 = 2 * np.dot(w.T, 0.5 - sin_omega_t ** 2) if generalized: S = np.dot(w.T, sin_omega_t) C = np.dot(w.T, cos_omega_t) S2 -= (2 * S * C) C2 -= (C * C - S * S) tan_2omega_tau = S2 / C2 tau = np.arctan(tan_2omega_tau) tau *= 0.5 tau /= omega # compute components needed for the fit omega_t_tau = omega * (t - tau) sin_omega_t_tau = np.sin(omega_t_tau) cos_omega_t_tau = np.cos(omega_t_tau) Y = np.dot(w.T, y) YY = np.dot(w.T, y * y) - Y * Y wy = w * y YCtau = np.dot(wy.T, cos_omega_t_tau) YStau = np.dot(wy.T, sin_omega_t_tau) CCtau = np.dot(w.T, cos_omega_t_tau * cos_omega_t_tau) SStau = np.dot(w.T, sin_omega_t_tau * sin_omega_t_tau) if generalized: Ctau = np.dot(w.T, cos_omega_t_tau) Stau = np.dot(w.T, sin_omega_t_tau) YCtau -= Y * Ctau YStau -= Y * Stau CCtau -= Ctau * Ctau SStau -= Stau * Stau p_omega = (YCtau * YCtau / CCtau + YStau * YStau / SStau) / YY p_omega = p_omega.reshape(shape) if significance is not None: N = t.size M = 2 * N z = (-2.0 / (N - 1.) * np.log(1 - (1 - np.asarray(significance)) ** (1. / M))) return p_omega, z else: return p_omega astroML-0.3/astroML/time_series/ACF.py0000644000076500000240000000723312420767763020363 0ustar jakevdpstaff00000000000000""" Auto-correlation functions """ from __future__ import division import numpy as np from scipy import fftpack from .periodogram import lomb_scargle def ACF_scargle(t, y, dy, n_omega=2 ** 10, omega_max=100): """Compute the Auto-correlation function via Scargle's method Parameters ---------- t : array_like times of observation. Assumed to be in increasing order. y : array_like values of each observation. Should be same shape as t dy : float or array_like errors in each observation. n_omega : int (optional) number of angular frequencies at which to evaluate the periodogram default is 2^10 omega_max : float (optional) maximum value of omega at which to evaluate the periodogram default is 100 Returns ------- ACF, t : ndarrays The auto-correlation function and associated times """ t = np.asarray(t) y = np.asarray(y) if y.shape != t.shape: raise ValueError("shapes of t and y must match") dy = np.asarray(dy) * np.ones(y.shape) d_omega = omega_max * 1. 
/ (n_omega + 1) omega = d_omega * np.arange(1, n_omega + 1) # recall that P(omega = 0) = (chi^2(0) - chi^2(0)) / chi^2(0) # = 0 # compute P and shifted full-frequency array P = lomb_scargle(t, y, dy, omega, generalized=True) P = np.concatenate([[0], P, P[-2::-1]]) # compute PW, the power of the window function PW = lomb_scargle(t, np.ones(len(t)), dy, omega, generalized=False, subtract_mean=False) PW = np.concatenate([[0], PW, PW[-2::-1]]) # compute the inverse fourier transform of P and PW rho = fftpack.ifft(P).real rhoW = fftpack.ifft(PW).real ACF = fftpack.fftshift(rho / rhoW) / np.sqrt(2) N = len(ACF) dt = 2 * np.pi / N / (omega[1] - omega[0]) t = dt * (np.arange(N) - N // 2) return ACF, t
def ACF_EK(t, y, dy, bins=20): """Auto-correlation function via the Edelson-Krolik method Parameters ---------- t : array_like times of observation. Assumed to be in increasing order. y : array_like values of each observation. Should be same shape as t dy : float or array_like errors in each observation. bins : int or array_like (optional) if integer, the number of bins to use in the analysis. if array, the (nbins + 1) bin edges. Default is bins=20. Returns ------- ACF : ndarray The auto-correlation function err : ndarray the error in the ACF bins : ndarray bin edges used in computation """ t = np.asarray(t) y = np.asarray(y) if y.shape != t.shape: raise ValueError("shapes of t and y must match") if t.ndim != 1: raise ValueError("t should be a 1-dimensional array") dy = np.asarray(dy) * np.ones(y.shape) # compute mean and standard deviation of y w = 1. / dy / dy w /= w.sum() mu = np.dot(w, y) sigma = np.std(y, ddof=1) dy2 = dy[:, None] dt = t - t[:, None] UDCF = ((y - mu) * (y - mu)[:, None] / np.sqrt((sigma ** 2 - dy ** 2) * (sigma ** 2 - dy2 ** 2))) # determine binning bins = np.asarray(bins) if bins.size == 1: dt_min = dt.min() dt_max = dt.max() bins = np.linspace(dt_min, dt_max + 1E-10, bins + 1) ACF = np.zeros(len(bins) - 1) M = np.zeros(len(bins) - 1) for i in range(len(bins) - 1): flag = (dt >= bins[i]) & (dt < bins[i + 1]) M[i] = flag.sum() ACF[i] = np.sum(UDCF[flag]) ACF /= M return ACF, np.sqrt(2. / M), bins
astroML-0.3/astroML/time_series/generate.py0000644000076500000240000000721012420767763021557 0ustar jakevdpstaff00000000000000import numpy as np from ..utils import check_random_state
def generate_power_law(N, dt, beta, generate_complex=False, random_state=None): """Generate a power-law light curve This uses the method from Timmer & Koenig [1]_ Parameters ---------- N : integer Number of equal-spaced time steps to generate dt : float Spacing between time-steps beta : float Power-law index. The spectrum will be (1 / f)^beta generate_complex : boolean (optional) if True, generate a complex time series rather than a real time series random_state : None, int, or np.random.RandomState instance (optional) random seed or random number generator Returns ------- x : ndarray the length-N generated light curve References ---------- .. [1] Timmer, J. & Koenig, M. On Generating Power Law Noise. A&A 300:707 """ random_state = check_random_state(random_state) dt = float(dt) N = int(N) Npos = int(N / 2) Nneg = int((N - 1) / 2) domega = (2 * np.pi / dt / N) if generate_complex: omega = domega * np.fft.ifftshift(np.arange(N) - int(N / 2)) else: omega = domega * np.arange(Npos + 1) x_fft = np.zeros(len(omega), dtype=complex) x_fft.real[1:] = random_state.normal(0, 1, len(omega) - 1) x_fft.imag[1:] = random_state.normal(0, 1, len(omega) - 1) x_fft[1:] *= (1. / omega[1:]) ** (0.5 * beta) x_fft[1:] *= (1. 
/ np.sqrt(2)) # by symmetry, the Nyquist frequency is real if x is real if (not generate_complex) and (N % 2 == 0): x_fft.imag[-1] = 0 if generate_complex: x = np.fft.ifft(x_fft) else: x = np.fft.irfft(x_fft, N) return x def generate_damped_RW(t_rest, tau=300., z=2.0, xmean=0, SFinf=0.3, random_state=None): """Generate a damped random walk light curve This uses a damped random walk model to generate a light curve similar to that of a QSO [1]_. Parameters ---------- t_rest : array_like rest-frame time. Should be in increasing order tau : float relaxation time z : float redshift xmean : float (optional) mean value of random walk; default=0 SFinf : float (optional Structure function at infinity; default=0.3 random_state : None, int, or np.random.RandomState instance (optional) random seed or random number generator Returns ------- x : ndarray the sampled values corresponding to times t_rest Notes ----- The differential equation is (with t = time/tau): dX = -X(t) * dt + sigma * sqrt(tau) * e(t) * sqrt(dt) + b * tau * dt where e(t) is white noise with zero mean and unit variance, and Xmean = b * tau SFinf = sigma * sqrt(tau / 2) so dX(t) = -X(t) * dt + sqrt(2) * SFint * e(t) * sqrt(dt) + Xmean * dt References ---------- .. [1] Kelly, B., Bechtold, J. & Siemiginowska, A. (2009) Are the Variations in Quasar Optical Flux Driven by Thermal Fluctuations? ApJ 698:895 (2009) """ # Xmean = b * tau # SFinf = sigma * sqrt(tau / 2) t_rest = np.atleast_1d(t_rest) if t_rest.ndim != 1: raise ValueError('t_rest should be a 1D array') random_state = check_random_state(random_state) N = len(t_rest) t_obs = t_rest * (1. + z) / tau x = np.zeros(N) x[0] = random_state.normal(xmean, SFinf) E = random_state.normal(0, 1, N) for i in range(1, N): dt = t_obs[i] - t_obs[i - 1] x[i] = (x[i - 1] - dt * (x[i - 1] - xmean) + np.sqrt(2) * SFinf * E[i] * np.sqrt(dt)) return x astroML-0.3/astroML/time_series/periodogram.py0000644000076500000240000002605312252721253022266 0ustar jakevdpstaff00000000000000import numpy as np from ..utils import check_random_state try: from astroML_addons.periodogram import lomb_scargle except ImportError: import warnings warnings.warn("Using slow version of lomb_scargle. Install astroML_addons " "to use an optimized version") from astroML.time_series._periodogram import lomb_scargle def lomb_scargle_bootstrap(t, y, dy, omega, generalized=True, subtract_mean=True, N_bootstraps=100, random_state=None): """Use a bootstrap analysis to compute Lomb-Scargle significance Parameters ---------- The first set of parameters are passed to the lomb_scargle algorithm t : array_like sequence of times y : array_like sequence of observations dy : array_like sequence of observational errors omega : array_like frequencies at which to evaluate p(omega) generalized : bool if True (default) use generalized lomb-scargle method otherwise, use classic lomb-scargle. subtract_mean : bool if True (default) subtract the sample mean from the data before computing the periodogram. 
Only referenced if generalized is False Remaining parameters control the bootstrap N_bootstraps : int number of bootstraps random_state : None, int, or RandomState object random seed, or random number generator Returns ------- D : ndarray distribution of the height of the highest peak """ random_state = check_random_state(random_state) t = np.asarray(t) y = np.asarray(y) dy = np.asarray(dy) + np.zeros_like(y) D = np.zeros(N_bootstraps) for i in range(N_bootstraps): ind = random_state.randint(0, len(y), len(y)) p = lomb_scargle(t, y[ind], dy[ind], omega, generalized=generalized, subtract_mean=subtract_mean) D[i] = p.max() return D
def lomb_scargle_AIC(P, y, dy, n_harmonics=1): """Compute the AIC for a Lomb-Scargle Periodogram Parameters ---------- P : array_like lomb-scargle power y : array_like observations dy : array_like errors n_harmonics : int (optional) the number of harmonics used in the Lomb-Scargle fit. Default is 1 Returns ------- AIC : ndarray AIC value corresponding to values in P """ P, y, dy = map(np.asarray, (P, y, dy)) w = 1. / dy ** 2 mu = np.dot(w, y) / w.sum() N = len(y) return np.sum(((y - mu) / dy) ** 2) * P - (2 * n_harmonics + 1) * 2
def lomb_scargle_BIC(P, y, dy, n_harmonics=1): """Compute the BIC for a Lomb-Scargle Periodogram Parameters ---------- P : array_like lomb-scargle power y : array_like observations dy : array_like errors n_harmonics : int (optional) the number of harmonics used in the Lomb-Scargle fit. Default is 1 Returns ------- BIC : ndarray BIC value corresponding to values in P """ P, y, dy = map(np.asarray, (P, y, dy)) w = 1. / dy ** 2 mu = np.dot(w, y) / w.sum() N = len(y) return np.sum(((y - mu) / dy) ** 2) * P - (2 * n_harmonics + 1) * np.log(N)
def multiterm_periodogram(t, y, dy, omega, n_terms=3): """Perform a multiterm periodogram at each omega This calculates the chi2 for the best-fit least-squares solution for each frequency omega. Parameters ---------- t : array_like sequence of times y : array_like sequence of observations dy : array_like sequence of observational errors omega : float or array_like frequencies at which to evaluate p(omega) n_terms : int (optional) number of Fourier terms to use in the fit. Default is 3 Returns ------- power : ndarray P = 1. - chi2 / chi2_0 where chi2_0 is the chi-square for a simple mean fit to the data """ # TODO: this is a slow implementation. A Lomb-Scargle-type implementation # could be faster. It would also gain from cythonization and the # use of trig identities to compute higher-order sines & cosines.
t = np.asarray(t) y = np.array(y, copy=True) dy = np.asarray(dy) assert t.ndim == 1 assert y.ndim == 1 assert dy.ndim == 1 assert t.shape == y.shape assert y.shape == dy.shape omega = np.asarray(omega) shape = omega.shape omega = omega.ravel() # compute chi2_0, the chi2 for a simple fit to the mean mu = np.sum(y / dy ** 2) / np.sum(1. / dy ** 2) chi2_0 = np.sum(((y - mu) / dy) ** 2) chi2 = np.zeros(omega.shape) X = np.empty((y.shape[0], 1 + 2 * n_terms), dtype=float) y /= dy dy_inv = 1. / dy[:, None] for i, omega_i in enumerate(omega): X[:, 0] = 1 for m in range(1, n_terms + 1): X[:, 2 * m - 1] = np.sin(m * omega_i * t) X[:, 2 * m] = np.cos(m * omega_i * t) X *= dy_inv M, chi2[i], rank, s = np.linalg.lstsq(X, y) return 1. 
- chi2.reshape(shape) / chi2_0 def search_frequencies(t, y, dy, LS_func=lomb_scargle, LS_kwargs=None, initial_guess=25, limit_fractions=[0.04, 0.3, 0.9, 0.99], n_eval=10000, n_retry=5, n_save=50): """Utility Routine to find the best frequencies To find the best frequency with a Lomb-Scargle periodogram requires searching a large range of frequencies at a very fine resolution. This is an iterative routine that searches progressively finer grids to narrow-in on the best result. Parameters ---------- t: array_like observed times y: array_like observed fluxes or magnitudes dy: array_like observed errors on y Other Parameters ---------------- LS_func : function Function used to perform Lomb-Scargle periodogram. The call signature should be LS_func(t, y, dy, omega, **kwargs) (Default is astroML.periodogram.lomb_scargle) LS_kwargs : dict dictionary of keyword arguments to pass to LS_func in addition to (t, y, dy, omega) initial_guess : float the initial guess of the best period limit_fractions : array_like the list of fractions to use when zooming in on peak possibilities. On the i^th iteration, with f_i = limit_fractions[i], the range probed around each candidate will be (candidate * f_i, candidate / f_i). n_eval : integer or list The number of point to evaluate in the range on each iteration. If n_eval is a list, it should have the same length as limit_fractions. n_retry : integer or list Number of top points to search on each iteration. If n_retry is a list, it should have the same length as limit_fractions. n_save : integer or list Number of evaluations to save on each iteration. If n_save is a list, it should have the same length as limit_fractions. Returns ------- omega_top, power_top: ndarrays The saved values of omega and power. These will have size 1 + n_save * (1 + n_retry * len(limit_fractions)) as long as n_save > n_retry """ if LS_kwargs is None: LS_kwargs = dict() omega_best = [initial_guess] power_best = LS_func(t, y, dy, omega_best, **LS_kwargs) for (Ne, Nr, Ns, frac) in np.broadcast(n_eval, n_retry, n_save, limit_fractions): # make sure we explore differing regions log_ob = np.log(omega_best) width = 0.1 * np.log(frac) log_ob = np.floor(-log_ob / width).astype(int) indices = np.arange(len(log_ob)) for i in range(Nr): if len(indices) == 0: break omega_try = omega_best[indices[-1]] non_duplicates = (log_ob != log_ob[-1]) log_ob = log_ob[non_duplicates] indices = indices[non_duplicates] omega = np.linspace(omega_try * frac, omega_try / frac, Ne) power = LS_func(t, y, dy, omega, **LS_kwargs) i = np.argsort(power)[-Ns:] power_best = np.concatenate([power_best, power[i]]) omega_best = np.concatenate([omega_best, omega[i]]) i = np.argsort(power_best) power_best = power_best[i] omega_best = omega_best[i] i = np.argsort(omega_best) return omega_best[i], power_best[i] class MultiTermFit(object): """Multi-term Fourier fit to a light curve Parameters ---------- omega : float angular frequency of the fundamental mode n_terms : int the number of Fourier modes to use in the fit """ def __init__(self, omega, n_terms): self.omega = omega self.n_terms = n_terms def _make_X(self, t): t = np.asarray(t) k = np.arange(1, self.n_terms + 1) X = np.hstack([np.ones(t[:, None].shape), np.sin(k * self.omega * t[:, None]), np.cos(k * self.omega * t[:, None])]) return X def fit(self, t, y, dy): """Fit multiple Fourier terms to the data Parameters ---------- t: array_like observed times y: array_like observed fluxes or magnitudes dy: array_like observed errors on y Returns ------- self : The MultiTermFit 
object is returned """ t = np.asarray(t) y = np.asarray(y) dy = np.asarray(dy) X_scaled = self._make_X(t) / dy[:, None] y_scaled = y / dy self.t_ = t self.w_ = np.linalg.solve(np.dot(X_scaled.T, X_scaled), np.dot(X_scaled.T, y_scaled)) return self def predict(self, Nphase, return_phased_times=False, adjust_offset=True): """Compute the phased fit, and optionally return phased times Parameters ---------- Nphase : int Number of terms to use in the phased fit return_phased_times : bool If True, then return a phased version of the input times adjust_offset : bool If true, then shift results so that the minimum value is at phase 0 Returns ------- phase, y_fit : ndarrays The phase and y value of the best-fit light curve phased_times : ndarray The phased version of the training times. Returned if return_phased_times is set to True. """ phase_fit = np.linspace(0, 1, Nphase + 1)[:-1] X_fit = self._make_X(2 * np.pi * phase_fit / self.omega) y_fit = np.dot(X_fit, self.w_) i_offset = np.argmin(y_fit) if adjust_offset: y_fit = np.concatenate([y_fit[i_offset:], y_fit[:i_offset]]) if return_phased_times: if adjust_offset: offset = phase_fit[i_offset] else: offset = 0 phased_times = (self.t_ * self.omega * 0.5 / np.pi - offset) % 1 return phase_fit, y_fit, phased_times else: return phase_fit, y_fit astroML-0.3/astroML/time_series/tests/0000755000076500000240000000000012462244012020534 5ustar jakevdpstaff00000000000000astroML-0.3/astroML/time_series/tests/__init__.py0000644000076500000240000000000012252721253022637 0ustar jakevdpstaff00000000000000astroML-0.3/astroML/time_series/tests/test_generate.py0000644000076500000240000000076312115147567023761 0ustar jakevdpstaff00000000000000import numpy as np from numpy.testing import assert_ from astroML.time_series import generate_power_law def check_generate_args(N, dt, beta, generate_complex): x = generate_power_law(N, dt, beta, generate_complex) assert_(bool(generate_complex) == np.iscomplexobj(x)) assert_(len(x) == N) def test_generate_args(): dt = 0.1 beta = 2 for N in [10, 11]: for generate_complex in [True, False]: yield (check_generate_args, N, dt, beta, generate_complex) astroML-0.3/astroML/utils.py0000644000076500000240000002075112305431600016576 0ustar jakevdpstaff00000000000000import numpy as np from scipy import linalg try: # exists in python 2.7+ from itertools import combinations_with_replacement except: def combinations_with_replacement(iterable, r): """ Return successive r-length combinations of elements in the iterable allowing individual elements to have successive repeats. combinations_with_replacement('ABC', 2) --> AA AB AC BB BC CC """ from itertools import product pool = tuple(iterable) n = len(pool) for indices in product(range(n), repeat=r): if sorted(indices) == list(indices): yield tuple(pool[i] for i in indices) def logsumexp(arr, axis=None): """Computes the sum of arr assuming arr is in the log domain. Returns log(sum(exp(arr))) while minimizing the possibility of over/underflow. 
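The implementation factors out the maximum, computing vmax + log(sum(exp(arr - vmax))) along the axis so that the exponentials cannot overflow.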
Examples -------- >>> import numpy as np >>> a = np.arange(10) >>> np.log(np.sum(np.exp(a))) 9.4586297444267107 >>> logsumexp(a) 9.4586297444267107 """ # if axis is specified, roll axis to 0 so that broadcasting works below if axis is not None: arr = np.rollaxis(arr, axis) axis = 0 # Use the max to normalize, as with the log this is what accumulates # the fewest errors vmax = arr.max(axis=axis) out = np.log(np.sum(np.exp(arr - vmax), axis=axis)) out += vmax return out def log_multivariate_gaussian(x, mu, V, Vinv=None, method=1): """Evaluate a multivariate gaussian N(x|mu, V) This allows for multiple evaluations at once, using array broadcasting Parameters ---------- x: array_like points, shape[-1] = n_features mu: array_like centers, shape[-1] = n_features V: array_like covariances, shape[-2:] = (n_features, n_features) Vinv: array_like or None pre-computed inverses of V: should have the same shape as V method: integer, optional method = 0: use cholesky decompositions of V method = 1: use explicit inverse of V Returns ------- values: ndarray shape = broadcast(x.shape[:-1], mu.shape[:-1], V.shape[:-2]) Examples -------- >>> x = [1, 2] >>> mu = [0, 0] >>> V = [[2, 1], [1, 2]] >>> log_multivariate_gaussian(x, mu, V) -3.3871832107434003 """ x = np.asarray(x, dtype=float) mu = np.asarray(mu, dtype=float) V = np.asarray(V, dtype=float) ndim = x.shape[-1] x_mu = x - mu if V.shape[-2:] != (ndim, ndim): raise ValueError("Shape of (x-mu) and V do not match") Vshape = V.shape V = V.reshape([-1, ndim, ndim]) if Vinv is not None: assert Vinv.shape == Vshape method = 1 if method == 0: Vchol = np.array([linalg.cholesky(V[i], lower=True) for i in range(V.shape[0])]) # we may be more efficient by using scipy.linalg.solve_triangular # with each cholesky decomposition VcholI = np.array([linalg.inv(Vchol[i]) for i in range(V.shape[0])]) logdet = np.array([2 * np.sum(np.log(np.diagonal(Vchol[i]))) for i in range(V.shape[0])]) VcholI = VcholI.reshape(Vshape) logdet = logdet.reshape(Vshape[:-2]) VcIx = np.sum(VcholI * x_mu.reshape(x_mu.shape[:-1] + (1,) + x_mu.shape[-1:]), -1) xVIx = np.sum(VcIx ** 2, -1) elif method == 1: if Vinv is None: Vinv = np.array([linalg.inv(V[i]) for i in range(V.shape[0])]).reshape(Vshape) else: assert Vinv.shape == Vshape logdet = np.log(np.array([linalg.det(V[i]) for i in range(V.shape[0])])) logdet = logdet.reshape(Vshape[:-2]) xVI = np.sum(x_mu.reshape(x_mu.shape + (1,)) * Vinv, -2) xVIx = np.sum(xVI * x_mu, -1) else: raise ValueError("unrecognized method %s" % method) return -0.5 * ndim * np.log(2 * np.pi) - 0.5 * (logdet + xVIx) # From scikit-learn utilities: def check_random_state(seed): """Turn seed into a np.random.RandomState instance If seed is None, return the RandomState singleton used by np.random. If seed is an int, return a new RandomState instance seeded with seed. If seed is already a RandomState instance, return it. Otherwise raise ValueError. """ if seed is None or seed is np.random: return np.random.mtrand._rand if isinstance(seed, (int, np.integer)): return np.random.RandomState(seed) if isinstance(seed, np.random.RandomState): return seed raise ValueError('%r cannot be used to seed a numpy.random.RandomState' ' instance' % seed) def split_samples(X, y, fractions=[0.75, 0.25], random_state=None): """Split samples into training, test, and cross-validation sets Parameters ---------- X, y : array_like leading dimension n_samples fraction : array_like length n_splits. If the fractions do not add to 1, they will be re-normalized. 
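For example, the default fractions=[0.75, 0.25] yields a 75%/25% train/test split.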
random_state : None, int, or RandomState object random seed, or random number generator """ X = np.asarray(X) y = np.asarray(y) if X.shape[0] != y.shape[0]: raise ValueError("X and y should have the same leading dimension") n_samples = X.shape[0] fractions = np.asarray(fractions).ravel().cumsum() fractions /= fractions[-1] fractions *= n_samples N = np.concatenate([[0], fractions.astype(int)]) N[-1] = n_samples # in case of roundoff errors random_state = check_random_state(random_state) indices = np.arange(len(y)) random_state.shuffle(indices) X_divisions = tuple(X[indices[N[i]:N[i + 1]]] for i in range(len(fractions))) y_divisions = tuple(y[indices[N[i]:N[i + 1]]] for i in range(len(fractions))) return X_divisions, y_divisions def completeness_contamination(predicted, true): """Compute the completeness and contamination values Parameters ---------- predicted_value, true_value : array_like integer arrays of predicted and true values. This assumes that 'false' values are given by 0, and 'true' values are nonzero. Returns ------- completeness, contamination : float or array_like the completeness and contamination of the results. shape is np.broadcast(predicted, true).shape[:-1] """ predicted = np.asarray(predicted) true = np.asarray(true) outshape = np.broadcast(predicted, true).shape[:-1] predicted = np.atleast_2d(predicted) true = np.atleast_2d(true) matches = (predicted == true) tp = np.sum(matches & (true != 0), -1) tn = np.sum(matches & (true == 0), -1) fp = np.sum(~matches & (true == 0), -1) fn = np.sum(~matches & (true != 0), -1) tot = (tp + fn) tot[tot == 0] = 1 completeness = tp * 1. / tot tot = (tp + fp) tot[tot == 0] = 1 contamination = fp * 1. / tot completeness[np.isnan(completeness)] = 0 contamination[np.isnan(contamination)] = 0 return completeness.reshape(outshape), contamination.reshape(outshape) def convert_2D_cov(*args): """Convert a 2D covariance from matrix form to principal form, and back if one parameter is passed, it is a covariance matrix, and the principal axes and rotation (sigma1, sigma2, alpha) are returned. 
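Here alpha is the position angle of the first principal axis, in radians.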
if three parameters are passed, they are assumed to be (sigma1, sigma2, alpha) and the covariance is returned """ if len(args) == 1: C = np.asarray(args[0]) if C.shape != (2, 2): raise ValueError("Input not understood") sigma_x2 = C[0, 0] sigma_y2 = C[1, 1] sigma_xy = C[0, 1] alpha = 0.5 * np.arctan2(2 * sigma_xy, (sigma_x2 - sigma_y2)) tmp1 = 0.5 * (sigma_x2 + sigma_y2) tmp2 = np.sqrt(0.25 * (sigma_x2 - sigma_y2) ** 2 + sigma_xy ** 2) sigma1 = np.sqrt(tmp1 + tmp2) sigma2 = np.sqrt(tmp1 - tmp2) return (sigma1, sigma2, alpha) elif len(args) == 3: sigma1, sigma2, alpha = args s = np.sin(alpha) c = np.cos(alpha) sigma_x2 = (sigma1 * c) ** 2 + (sigma2 * s) ** 2 sigma_y2 = (sigma1 * s) ** 2 + (sigma2 * c) ** 2 sigma_xy = (sigma1 ** 2 - sigma2 ** 2) * s * c return np.array([[sigma_x2, sigma_xy], [sigma_xy, sigma_y2]]) else: raise ValueError("Input not understood") astroML-0.3/book_figures/0000755000076500000240000000000012462244012016157 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/appendix/0000755000076500000240000000000012462244012017767 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/appendix/fig_ball_dualtree.py0000644000076500000240000000472712252721253024003 0ustar jakevdpstaff00000000000000""" Ball Dual-tree Diagram ---------------------- """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
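#----------------------------------------------------------------------
# Added sketch (not part of the original figure script): a quick numeric
# check of the dual-tree distance bounds annotated in this figure. The
# centers and radii below mirror the two circles drawn later in the script;
# the variable names are illustrative only.
import numpy as np  # repeated module import, so the sketch is self-contained
mu_Q, r_Q = np.array([-0.5, 0.5]), 0.3
mu_R, r_R = np.array([0.5, 0.5]), 0.35
d = np.sqrt(np.sum((mu_Q - mu_R) ** 2))  # |mu_Q - mu_R| = 1.0
D_lower = d - r_Q - r_R  # = 0.35: no pair (q in Q, r in R) can be closer
D_upper = d + r_Q + r_R  # = 1.65: no pair (q in Q, r in R) can be farther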
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.95) #------------------------------------------------------------ ax = fig.add_subplot(111, xticks=[], yticks=[], aspect='equal') Qx = np.array([[-0.3, -0.5, -0.7, -0.35, -0.58, -0.33], [0.4, 0.36, 0.68, 0.44, 0.77, 0.65]]) Rx = np.array([[0.24, 0.63, 0.7, 0.35, 0.58, 0.33], [0.34, 0.36, 0.68, 0.44, 0.78, 0.65]]) ax.plot([-0.5, 0.5], [0.5, 0.5], 'kx', ms=8) ax.scatter(Qx[0], Qx[1], c='r', s=30) ax.scatter(Rx[0], Rx[1], c='b', s=30) ax.add_patch(plt.Circle((-0.5, 0.5), 0.3, fc='none', lw=2)) ax.add_patch(plt.Circle((0.5, 0.5), 0.35, fc='none', lw=2)) ax.arrow(-0.5, 0.5, -0.16, 0.26, width=0.01, lw=0, color='gray', length_includes_head=True, zorder=1) ax.arrow(0.5, 0.5, 0.19, 0.29, width=0.01, lw=0, color='gray', length_includes_head=True, zorder=1) ax.text(-0.8, 0.7, r'$Q$', ha='left', va='bottom', fontsize=12) ax.text(0.8, 0.7, r'$R$', ha='left', va='bottom', fontsize=12) ax.text(-0.55, 0.6, r'$r_Q$', ha='left', va='bottom', fontsize=12) ax.text(0.5, 0.65, r'$r_R$', ha='left', va='bottom', fontsize=12) ax.text(-0.5, 0.48, r'$\vec{\mu}_Q$', ha='left', va='top', fontsize=12) ax.text(0.5, 0.48, r'$\vec{\mu}_R$', ha='left', va='top', fontsize=12) ax.text(0, -0.08, r'$D^l(Q, R) = |\vec{\mu}_Q - \vec{\mu}_R| - r_Q - r_R$', va='bottom', ha='center', fontsize=12) ax.text(0, 0.02, r'$D^u(Q, R) = |\vec{\mu}_Q - \vec{\mu}_R| + r_Q + r_R$', va='bottom', ha='center', fontsize=12) ax.set_xlim(-1, 1) ax.set_ylim(-0.1, 1) plt.show() astroML-0.3/book_figures/appendix/fig_broadcast_visual.py0000644000076500000240000002066012252721253024523 0ustar jakevdpstaff00000000000000""" Broadcast Visualization ----------------------- Figure A.1 A visualization of NumPy array broadcasting. Note that the extra memory indicated by the dotted boxes is never allocated, but it can be convenient to think about the operations as if it is. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Draw a figure and axis with no boundary fig = plt.figure(figsize=(5, 3.75), facecolor='w') ax = plt.axes([0, 0, 1, 1], xticks=[], yticks=[], frameon=False) def draw_cube(ax, xy, size, depth=0.4, edges=None, label=None, label_kwargs=None, **kwargs): """draw and label a cube. 
edges is a list of numbers between 1 and 12, specifying which of the 12 cube edges to draw""" if edges is None: edges = range(1, 13) x, y = xy if 1 in edges: ax.plot([x, x + size], [y + size, y + size], **kwargs) if 2 in edges: ax.plot([x + size, x + size], [y, y + size], **kwargs) if 3 in edges: ax.plot([x, x + size], [y, y], **kwargs) if 4 in edges: ax.plot([x, x], [y, y + size], **kwargs) if 5 in edges: ax.plot([x, x + depth], [y + size, y + depth + size], **kwargs) if 6 in edges: ax.plot([x + size, x + size + depth], [y + size, y + depth + size], **kwargs) if 7 in edges: ax.plot([x + size, x + size + depth], [y, y + depth], **kwargs) if 8 in edges: ax.plot([x, x + depth], [y, y + depth], **kwargs) if 9 in edges: ax.plot([x + depth, x + depth + size], [y + depth + size, y + depth + size], **kwargs) if 10 in edges: ax.plot([x + depth + size, x + depth + size], [y + depth, y + depth + size], **kwargs) if 11 in edges: ax.plot([x + depth, x + depth + size], [y + depth, y + depth], **kwargs) if 12 in edges: ax.plot([x + depth, x + depth], [y + depth, y + depth + size], **kwargs) if label: if label_kwargs is None: label_kwargs = {} ax.text(x + 0.5 * size, y + 0.5 * size, label, ha='center', va='center', **label_kwargs) solid = dict(c='black', ls='-', lw=1, label_kwargs=dict(color='k')) dotted = dict(c='black', ls=':', lw=0.5, label_kwargs=dict(color='gray')) depth = 0.3 #------------------------------------------------------------ # Draw top operation: vector plus scalar draw_cube(ax, (1, 10), 1, depth, [1, 2, 3, 4, 5, 6, 9], '0', **solid) draw_cube(ax, (2, 10), 1, depth, [1, 2, 3, 6, 9], '1', **solid) draw_cube(ax, (3, 10), 1, depth, [1, 2, 3, 6, 7, 9, 10], '2', **solid) draw_cube(ax, (6, 10), 1, depth, [1, 2, 3, 4, 5, 6, 7, 9, 10], '5', **solid) draw_cube(ax, (7, 10), 1, depth, [1, 2, 3, 6, 7, 9, 10, 11], '5', **dotted) draw_cube(ax, (8, 10), 1, depth, [1, 2, 3, 6, 7, 9, 10, 11], '5', **dotted) draw_cube(ax, (12, 10), 1, depth, [1, 2, 3, 4, 5, 6, 9], '5', **solid) draw_cube(ax, (13, 10), 1, depth, [1, 2, 3, 6, 9], '6', **solid) draw_cube(ax, (14, 10), 1, depth, [1, 2, 3, 6, 7, 9, 10], '7', **solid) ax.text(5, 10.5, '+', size=12, ha='center', va='center') ax.text(10.5, 10.5, '=', size=12, ha='center', va='center') ax.text(1, 11.5, r'${\tt np.arange(3) + 5}$', size=12, ha='left', va='bottom') #------------------------------------------------------------ # Draw middle operation: matrix plus vector # first block draw_cube(ax, (1, 7.5), 1, depth, [1, 2, 3, 4, 5, 6, 9], '1', **solid) draw_cube(ax, (2, 7.5), 1, depth, [1, 2, 3, 6, 9], '1', **solid) draw_cube(ax, (3, 7.5), 1, depth, [1, 2, 3, 6, 7, 9, 10], '1', **solid) draw_cube(ax, (1, 6.5), 1, depth, [2, 3, 4], '1', **solid) draw_cube(ax, (2, 6.5), 1, depth, [2, 3], '1', **solid) draw_cube(ax, (3, 6.5), 1, depth, [2, 3, 7, 10], '1', **solid) draw_cube(ax, (1, 5.5), 1, depth, [2, 3, 4], '1', **solid) draw_cube(ax, (2, 5.5), 1, depth, [2, 3], '1', **solid) draw_cube(ax, (3, 5.5), 1, depth, [2, 3, 7, 10], '1', **solid) # second block draw_cube(ax, (6, 7.5), 1, depth, [1, 2, 3, 4, 5, 6, 9], '0', **solid) draw_cube(ax, (7, 7.5), 1, depth, [1, 2, 3, 6, 9], '1', **solid) draw_cube(ax, (8, 7.5), 1, depth, [1, 2, 3, 6, 7, 9, 10], '2', **solid) draw_cube(ax, (6, 6.5), 1, depth, range(2, 13), '0', **dotted) draw_cube(ax, (7, 6.5), 1, depth, [2, 3, 6, 7, 9, 10, 11], '1', **dotted) draw_cube(ax, (8, 6.5), 1, depth, [2, 3, 6, 7, 9, 10, 11], '2', **dotted) draw_cube(ax, (6, 5.5), 1, depth, [2, 3, 4, 7, 8, 10, 11, 12], '0', **dotted) draw_cube(ax, (7, 5.5), 1, 
depth, [2, 3, 7, 10, 11], '1', **dotted) draw_cube(ax, (8, 5.5), 1, depth, [2, 3, 7, 10, 11], '2', **dotted) # third block draw_cube(ax, (12, 7.5), 1, depth, [1, 2, 3, 4, 5, 6, 9], '1', **solid) draw_cube(ax, (13, 7.5), 1, depth, [1, 2, 3, 6, 9], '2', **solid) draw_cube(ax, (14, 7.5), 1, depth, [1, 2, 3, 6, 7, 9, 10], '3', **solid) draw_cube(ax, (12, 6.5), 1, depth, [2, 3, 4], '1', **solid) draw_cube(ax, (13, 6.5), 1, depth, [2, 3], '2', **solid) draw_cube(ax, (14, 6.5), 1, depth, [2, 3, 7, 10], '3', **solid) draw_cube(ax, (12, 5.5), 1, depth, [2, 3, 4], '1', **solid) draw_cube(ax, (13, 5.5), 1, depth, [2, 3], '2', **solid) draw_cube(ax, (14, 5.5), 1, depth, [2, 3, 7, 10], '3', **solid) ax.text(5, 7.0, '+', size=12, ha='center', va='center') ax.text(10.5, 7.0, '=', size=12, ha='center', va='center') ax.text(1, 9.0, r'${\tt np.ones((3,\, 3)) + np.arange(3)}$', size=12, ha='left', va='bottom') #------------------------------------------------------------ # Draw bottom operation: vector plus vector, double broadcast # first block draw_cube(ax, (1, 3), 1, depth, [1, 2, 3, 4, 5, 6, 7, 9, 10], '0', **solid) draw_cube(ax, (1, 2), 1, depth, [2, 3, 4, 7, 10], '1', **solid) draw_cube(ax, (1, 1), 1, depth, [2, 3, 4, 7, 10], '2', **solid) draw_cube(ax, (2, 3), 1, depth, [1, 2, 3, 6, 7, 9, 10, 11], '0', **dotted) draw_cube(ax, (2, 2), 1, depth, [2, 3, 7, 10, 11], '1', **dotted) draw_cube(ax, (2, 1), 1, depth, [2, 3, 7, 10, 11], '2', **dotted) draw_cube(ax, (3, 3), 1, depth, [1, 2, 3, 6, 7, 9, 10, 11], '0', **dotted) draw_cube(ax, (3, 2), 1, depth, [2, 3, 7, 10, 11], '1', **dotted) draw_cube(ax, (3, 1), 1, depth, [2, 3, 7, 10, 11], '2', **dotted) # second block draw_cube(ax, (6, 3), 1, depth, [1, 2, 3, 4, 5, 6, 9], '0', **solid) draw_cube(ax, (7, 3), 1, depth, [1, 2, 3, 6, 9], '1', **solid) draw_cube(ax, (8, 3), 1, depth, [1, 2, 3, 6, 7, 9, 10], '2', **solid) draw_cube(ax, (6, 2), 1, depth, range(2, 13), '0', **dotted) draw_cube(ax, (7, 2), 1, depth, [2, 3, 6, 7, 9, 10, 11], '1', **dotted) draw_cube(ax, (8, 2), 1, depth, [2, 3, 6, 7, 9, 10, 11], '2', **dotted) draw_cube(ax, (6, 1), 1, depth, [2, 3, 4, 7, 8, 10, 11, 12], '0', **dotted) draw_cube(ax, (7, 1), 1, depth, [2, 3, 7, 10, 11], '1', **dotted) draw_cube(ax, (8, 1), 1, depth, [2, 3, 7, 10, 11], '2', **dotted) # third block draw_cube(ax, (12, 3), 1, depth, [1, 2, 3, 4, 5, 6, 9], '0', **solid) draw_cube(ax, (13, 3), 1, depth, [1, 2, 3, 6, 9], '1', **solid) draw_cube(ax, (14, 3), 1, depth, [1, 2, 3, 6, 7, 9, 10], '2', **solid) draw_cube(ax, (12, 2), 1, depth, [2, 3, 4], '1', **solid) draw_cube(ax, (13, 2), 1, depth, [2, 3], '2', **solid) draw_cube(ax, (14, 2), 1, depth, [2, 3, 7, 10], '3', **solid) draw_cube(ax, (12, 1), 1, depth, [2, 3, 4], '2', **solid) draw_cube(ax, (13, 1), 1, depth, [2, 3], '3', **solid) draw_cube(ax, (14, 1), 1, depth, [2, 3, 7, 10], '4', **solid) ax.text(5, 2.5, '+', size=12, ha='center', va='center') ax.text(10.5, 2.5, '=', size=12, ha='center', va='center') ax.text(1, 4.5, r'${\tt np.arange(3).reshape((3,\, 1)) + np.arange(3)}$', ha='left', size=12, va='bottom') ax.set_xlim(0, 16) ax.set_ylim(0.5, 12.5) plt.show() astroML-0.3/book_figures/appendix/fig_fft_text_example.py0000644000076500000240000000451012252721253024530 0ustar jakevdpstaff00000000000000""" Example of a Fourier Transform ------------------------------ Figure E.1 An example of approximating the continuous Fourier transform of a function using the fast Fourier transform. 
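The agreement is quantified by the RMS error between the FFT-based estimate and the analytic Fourier transform of the sine-Gaussian wavelet.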
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import fftpack from astroML.fourier import FT_continuous, sinegauss, sinegauss_FT #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Choose parameters for the wavelet N = 10000 t0 = 5 f0 = 2 Q = 2 #------------------------------------------------------------ # Compute the wavelet on a grid of times Dt = 0.01 t = t0 + Dt * (np.arange(N) - N / 2) h = sinegauss(t, t0, f0, Q) #------------------------------------------------------------ # Approximate the continuous Fourier Transform f, H = FT_continuous(t, h) rms_err = np.sqrt(np.mean(abs(H - sinegauss_FT(f, t0, f0, Q)) ** 2)) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.25) # plot the wavelet ax = fig.add_subplot(211) ax.plot(t, h.real, '-', c='black', label='$Re[h]$', lw=1) ax.plot(t, h.imag, ':', c='black', label='$Im[h]$', lw=1) ax.legend() ax.set_xlim(2, 8) ax.set_ylim(-1.2, 1.2) ax.set_xlabel('$t$') ax.set_ylabel('$h(t)$') # plot the Fourier transform ax = fig.add_subplot(212) ax.plot(f, H.real, '-', c='black', label='$Re[H]$', lw=1) ax.plot(f, H.imag, ':', c='black', label='$Im[H]$', lw=1) ax.text(0.55, 1.5, "RMS Error = %.2g" % rms_err) ax.legend() ax.set_xlim(0.5, 3.5) ax.set_ylim(-1.9, 1.9) ax.set_xlabel('$f$') ax.set_ylabel('$H(f)$') plt.show() astroML-0.3/book_figures/appendix/fig_gauss_kernel.py0000644000076500000240000000516512252721253023663 0ustar jakevdpstaff00000000000000""" Gaussian Kernel Expansion Diagram --------------------------------- """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) plt.figure(figsize=(5, 3.75), facecolor='w') ax = plt.axes([0, 0, 1, 1], frameon=False, xticks=[], yticks=[]) ax.add_patch(plt.Rectangle((-0.5, -0.25), 0.8, 0.4, fc='none', ec='k', lw=2)) ax.add_patch(plt.Rectangle((-1.75, 0.1), 0.8, 0.4, fc='none', ec='k', lw=2, linestyle='dashed')) ax.add_patch(plt.Rectangle((0.8, -0.55), 0.8, 0.4, fc='none', ec='k', lw=2, linestyle='dashed')) ax.add_patch(plt.Rectangle((-1.3, -0.95), 0.8, 0.4, fc='none', ec='k', lw=2, linestyle='dashed')) red_pts = np.array([[-0.163, 0.093], [-0.123, -0.22], [0.194, 0.035], [0.146, -0.178], [-0.387, -0.143]]) blue_pts = np.array([[-1.51, 0.17], [-1.17, 0.36], [-1.23, -0.68], [-0.80, -0.83], [1.28, -0.45], [1.41, -0.26]]) x0 = -0.5 + 0.4 y0 = -0.25 + 0.2 ax.scatter(red_pts[:, 0], red_pts[:, 1], c='r') ax.scatter(blue_pts[:, 0], blue_pts[:, 1], c='b') ax.scatter([x0], [y0], c='gray') for pt in blue_pts: ax.annotate("", pt, (x0, y0), arrowprops=dict(arrowstyle='->', linestyle='dashed')) for i, pt in enumerate(red_pts): ax.annotate("", pt, (x0, y0), arrowprops=dict(arrowstyle='<-')) ax.text(pt[0] + 0.03, pt[1] + 0.03, '$r_{j%i}$' % (i + 1), bbox=dict(boxstyle='round', ec='k', fc='w', alpha=0.7)) ax.annotate("R.c", (x0, y0), (0.2, 0.2), arrowprops=dict(arrowstyle='-', color='gray'), bbox=dict(boxstyle='round', ec='k', fc='w')) ax.set_xlim(-1.9, 1.9) ax.set_ylim(-1.2, 0.8) plt.show() astroML-0.3/book_figures/appendix/fig_kd_dualtree.py0000644000076500000240000000562612252721253023466 0ustar jakevdpstaff00000000000000""" KD Dual-tree Diagram ---------------------- """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.95, hspace=0.05) #------------------------------------------------------------ ax = fig.add_subplot(211, xticks=[], yticks=[], aspect='equal') x = np.array([[0.5], [0.5]]) Rx = np.array([[1.25, 1.30, 1.40, 1.52, 1.56], [0.50, 0.78, 0.22, 0.45, 0.64]]) ax.add_patch(plt.Rectangle((1.2, 0.2), 0.4, 0.6, fc='none', lw=2, zorder=2)) ax.scatter(x[0], x[1], c='r', s=30, zorder=2) ax.scatter(Rx[0], Rx[1], c='b', s=30, zorder=2) ax.arrow(0.5, 0.5, 0.7, 0, width=0.01, lw=0, color='gray', length_includes_head=True, zorder=1) ax.arrow(0.5, 0.5, 1.1, -0.3, width=0.01, lw=0, color='gray', length_includes_head=True, zorder=1) ax.text(x[0], x[1], r'$x_i$ ', ha='right', va='bottom', fontsize=12) ax.text(1.65, 0.7, r' $R$', ha='left', va='bottom', fontsize=12) ax.text(0.8, 0.55, r'$D^l(x_i, R)$', ha='left', va='bottom', fontsize=12) ax.text(0.8, 0.25, r'$D^u(x_i, R)$', ha='left', va='bottom', fontsize=12) ax.set_xlim(0.2, 1.8) ax.set_ylim(0.1, 0.9) #---------------------------------------------------------------------- ax = fig.add_subplot(212, xticks=[], yticks=[], aspect='equal') Qx = Rx.copy() Qx[0] -= 0.8 Qx[1] = 1.1 - Qx[1] ax.add_patch(plt.Rectangle((0.4, 0.3), 0.4, 0.6, fc='none', lw=2, zorder=2)) ax.add_patch(plt.Rectangle((1.2, 0.2), 0.4, 0.6, fc='none', lw=2, zorder=2)) ax.scatter(Qx[0], Qx[1], c='r', s=30, zorder=2) ax.scatter(Rx[0], Rx[1], c='b', s=30, zorder=2) ax.arrow(0.8, 0.3, 0.4, 0, width=0.01, lw=0, color='gray', length_includes_head=True, zorder=1) ax.arrow(0.4, 0.9, 1.2, -0.7, width=0.01, lw=0, color='gray', length_includes_head=True, zorder=1) ax.text(0.35, 0.8, r'$Q$ ', ha='right', va='bottom', fontsize=12) ax.text(1.65, 0.7, r' $R$', ha='left', va='bottom', fontsize=12) ax.text(0.86, 0.65, r'$D^l(Q, R)$', ha='left', va='bottom', fontsize=12) ax.text(0.86, 0.2, r'$D^u(Q, R)$', ha='left', va='bottom', fontsize=12) ax.set_xlim(0.2, 1.8) ax.set_ylim(0.15, 0.95) plt.show() astroML-0.3/book_figures/appendix/fig_LIGO_bandpower.py0000644000076500000240000000411212420767763024000 0ustar jakevdpstaff00000000000000""" Plot the band power of the LIGO big dog event --------------------------------------------- """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import division import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_LIGO_bigdog from astroML.fourier import FT_continuous #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
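#----------------------------------------------------------------------
# The preceding dual-tree figure illustrates the distance bounds
# D^l(Q, R) and D^u(Q, R); for axis-aligned boxes they have a simple
# closed form.  A self-contained sketch (the helper name and the
# lower/upper-corner convention are illustrative, not part of astroML):
import numpy as np

def box_distance_bounds(lo1, hi1, lo2, hi2):
    """Min/max Euclidean distance between two axis-aligned boxes."""
    lo1, hi1, lo2, hi2 = map(np.asarray, (lo1, hi1, lo2, hi2))
    gap = np.maximum(0, np.maximum(lo1 - hi2, lo2 - hi1))  # zero if boxes overlap
    span = np.maximum(hi1 - lo2, hi2 - lo1)  # farthest corner separation
    return np.sqrt((gap ** 2).sum()), np.sqrt((span ** 2).sum())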
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def multiple_power_spectrum(t, x, window_size=10000, step_size=1000):
    assert x.shape == t.shape
    assert x.ndim == 1
    assert len(x) > window_size
    N_steps = (len(x) - window_size) // step_size
    indices = np.arange(window_size) + step_size * np.arange(N_steps)[:, None]
    X = x[indices].astype(complex)
    f, H = FT_continuous(t[:window_size], X)
    i = (f > 0)
    return f[i], abs(H[:, i])

X = fetch_LIGO_bigdog()
t = X['t']
x = X['Hanford']

window_size = 10000
step_size = 500

f, P = multiple_power_spectrum(t, x, window_size=window_size,
                               step_size=step_size)
i = (f > 50) & (f < 1500)
P = P[:, i]
f = f[i]

fig = plt.figure(figsize=(5, 3.75))
# note: integer division (//) is required below: with
# "from __future__ import division", window_size / 2 is a float,
# which cannot be used as an array index
plt.imshow(np.log10(P).T, origin='lower', aspect='auto',
           extent=[t[window_size // 2],
                   t[window_size // 2 + step_size * P.shape[0]],
                   f[0], f[-1]])
plt.xlabel('t (s)')
plt.ylabel('f (Hz) derived from %.2fs window' % (t[window_size] - t[0]))
plt.colorbar().set_label('$|H(f)|$')
plt.show()
astroML-0.3/book_figures/appendix/fig_LIGO_wavelets.py0000644000076500000240000000423612252721253023643 0ustar jakevdpstaff00000000000000"""
LIGO wavelet transform
----------------------
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt

from astroML.datasets import fetch_LIGO_bigdog
from astroML.fourier import FT_continuous, IFT_continuous

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
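#----------------------------------------------------------------------
# The multiple_power_spectrum() helper in the preceding band-power
# script extracts all overlapping windows at once via broadcast
# indexing.  A tiny demonstration of that trick (values made up):
import numpy as np
_x = np.arange(10)
_idx = np.arange(4) + 2 * np.arange(3)[:, None]  # 3 windows of 4 samples
assert (_x[_idx] == [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7]]).all()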
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def wavelet(t, t0, f0, Q):
    return (np.exp(-(f0 * (t - t0) / Q) ** 2)
            * np.exp(2j * np.pi * f0 * (t - t0)))


def wavelet_FT(f, t0, f0, Q):
    # this is its fourier transform using
    # H(f) = integral[ h(t) exp(-2pi i f t) dt]
    return (np.sqrt(np.pi) * Q / f0
            * np.exp(-2j * np.pi * f * t0)
            * np.exp(-(np.pi * (f - f0) * Q / f0) ** 2))


def check_funcs(t0=1, f0=2, Q=3):
    t = np.linspace(-10, 10, 10000)
    h = wavelet(t, t0, f0, Q)

    f, H = FT_continuous(t, h)
    assert np.allclose(H, wavelet_FT(f, t0, f0, Q))

X = fetch_LIGO_bigdog()
t = X['t']
h = X['Hanford']
dt = t[1] - t[0]

Q = np.sqrt(22)
f0 = 2 ** np.linspace(5, 8, 50)  # grid of center frequencies, 32-256 Hz

f, H = FT_continuous(t, h)
W = np.conj(wavelet_FT(f, 0, f0[:, None], Q))
t, HW = IFT_continuous(f, H * W)

t = t[::100]
HW = HW[:, ::100]

fig = plt.figure(figsize=(5, 3.75))
plt.imshow(abs(HW), origin='lower', aspect='auto',
           extent=[t[0], t[-1], np.log2(f0[0]), np.log2(f0[-1])])
plt.colorbar()
plt.gca().yaxis.set_major_locator(plt.MultipleLocator(1))
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, *args:
                                                      "%i" % (2 ** x)))
plt.show()
astroML-0.3/book_figures/appendix/fig_neural_network.py0000644000076500000240000000602212252721253024231 0ustar jakevdpstaff00000000000000"""
Neural Network Diagram
----------------------
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
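#----------------------------------------------------------------------
# The preceding wavelet script defines check_funcs() but never calls it;
# the same consistency check can be run against astroML's built-in
# sine-Gaussian pair.  A sketch (the tolerance here is a guess and may
# need loosening):
#
#   import numpy as np
#   from astroML.fourier import FT_continuous, sinegauss, sinegauss_FT
#   t = np.linspace(-10, 10, 10000)
#   f, H = FT_continuous(t, sinegauss(t, t0=0, f0=2, Q=3))
#   assert np.allclose(H, sinegauss_FT(f, t0=0, f0=2, Q=3), atol=1e-5)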
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) fig = plt.figure(figsize=(5, 3.75), facecolor='w') ax = fig.add_axes([0, 0, 1, 1], xticks=[], yticks=[]) plt.box(False) circ = plt.Circle((1, 1), 2) radius = 0.3 arrow_kwargs = dict(head_width=0.05, fc='black') # function to draw arrows def draw_connecting_arrow(ax, circ1, rad1, circ2, rad2): theta = np.arctan2(circ2[1] - circ1[1], circ2[0] - circ1[0]) starting_point = (circ1[0] + rad1 * np.cos(theta), circ1[1] + rad1 * np.sin(theta)) length = (circ2[0] - circ1[0] - (rad1 + 1.4 * rad2) * np.cos(theta), circ2[1] - circ1[1] - (rad1 + 1.4 * rad2) * np.sin(theta)) ax.arrow(starting_point[0], starting_point[1], length[0], length[1], **arrow_kwargs) # function to draw circles def draw_circle(ax, center, radius): circ = plt.Circle(center, radius, fc='none', lw=2) ax.add_patch(circ) x1 = -2 x2 = 0 x3 = 2 y3 = 0 #------------------------------------------------------------ # draw circles for i, y1 in enumerate(np.linspace(1.5, -1.5, 4)): draw_circle(ax, (x1, y1), radius) ax.text(x1 - 0.9, y1, 'Input #%i' % (i + 1), ha='right', va='center', fontsize=16) draw_connecting_arrow(ax, (x1 - 0.9, y1), 0.1, (x1, y1), radius) for y2 in np.linspace(-2, 2, 5): draw_circle(ax, (x2, y2), radius) draw_circle(ax, (x3, y3), radius) ax.text(x3 + 0.8, y3, 'Output', ha='left', va='center', fontsize=16) draw_connecting_arrow(ax, (x3, y3), radius, (x3 + 0.8, y3), 0.1) #------------------------------------------------------------ # draw connecting arrows for y1 in np.linspace(-1.5, 1.5, 4): for y2 in np.linspace(-2, 2, 5): draw_connecting_arrow(ax, (x1, y1), radius, (x2, y2), radius) for y2 in np.linspace(-2, 2, 5): draw_connecting_arrow(ax, (x2, y2), radius, (x3, y3), radius) #------------------------------------------------------------ # Add text labels plt.text(x1, 2.7, "Input\nLayer", ha='center', va='top', fontsize=16) plt.text(x2, 2.7, "Hidden Layer", ha='center', va='top', fontsize=16) plt.text(x3, 2.7, "Output\nLayer", ha='center', va='top', fontsize=16) ax.set_aspect('equal') plt.xlim(-4, 4) plt.ylim(-3, 3) plt.show() astroML-0.3/book_figures/appendix/fig_plotting_examples.py0000644000076500000240000000451512252721253024735 0ustar jakevdpstaff00000000000000""" Examples of Plotting with Matplotlib ------------------------------------ Figures A.2, A.3, A.4, A.5 These scripts generate the output of the plotting examples in the appendix. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) np.random.seed(0) #------------------------------------------------------------ # First Example: simple plot plt.figure(1, figsize=(5, 3.75)) x = np.linspace(0, 2 * np.pi, 1000) y = np.sin(x) plt.plot(x, y) plt.xlim(0, 2 * np.pi) plt.ylim(-1.3, 1.3) plt.xlabel('x') plt.ylabel('y') plt.title('Simple Sinusoid Plot') #------------------------------------------------------------ # Second Example: error-bars over simple plot plt.figure(2, figsize=(5, 3.75)) x = np.linspace(0, 2 * np.pi, 1000) y = np.sin(x) plt.plot(x, y) plt.xlim(0, 2 * np.pi) plt.ylim(-1.3, 1.3) plt.xlabel('x') plt.ylabel('y') plt.title('Simple Sinusoid Plot') x_obs = 2 * np.pi * np.random.random(50) y_obs = np.sin(x_obs) y_obs += np.random.normal(0, 0.1, 50) plt.errorbar(x_obs, y_obs, 0.1, fmt='.', color='black') #------------------------------------------------------------ # Third Example: histogram plt.figure(3, figsize=(5, 3.75)) x = np.random.normal(size=1000) plt.hist(x, bins=50) plt.xlabel('x') plt.ylabel('N(x)') #------------------------------------------------------------ # Fourth Example: spline fitting from scipy import interpolate x = np.linspace(0, 16, 30) y = np.sin(x) x2 = np.linspace(0, 16, 1000) spl = interpolate.UnivariateSpline(x, y, s=0) plt.figure(4, figsize=(5, 3.75)) plt.plot(x, y, 'ok') plt.plot(x2, spl(x2), '-k') plt.ylim(-1.3, 1.3) plt.show() astroML-0.3/book_figures/appendix/fig_sdss_filters.py0000644000076500000240000000417712252721253023707 0ustar jakevdpstaff00000000000000r""" SDSS Filters ------------ Figure C.1 The five SDSS filters, showing the total transmission taking into account atmospheric transmission and instrumental effects such as CCD efficiency. Shown for reference is the spectrum (:math:`F_\lambda`) of a star similar to Vega (alpha-Lyr), which for many years was used as a reference flux for magnitude calibration. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_filter, fetch_vega_spectrum #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
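#----------------------------------------------------------------------
# In the spline example of the preceding script, s=0 makes
# UnivariateSpline interpolate exactly through every point; a positive
# smoothing factor trades that fidelity for smoothness.  A sketch
# (sample values are made up):
import numpy as np
from scipy import interpolate
_x = np.linspace(0, 16, 30)
_y = np.sin(_x)
_spl_exact = interpolate.UnivariateSpline(_x, _y, s=0)     # interpolating
_spl_smooth = interpolate.UnivariateSpline(_x, _y, s=0.5)  # smoothing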
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up figure and axes fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) #---------------------------------------------------------------------- # Fetch and plot the Vega spectrum spec = fetch_vega_spectrum() lam = spec[0] spectrum = spec[1] / 2.1 / spec[1].max() ax.plot(lam, spectrum, '-k') #------------------------------------------------------------ # Fetch and plot the five filters text_kwargs = dict(ha='center', va='center', alpha=0.5, fontsize=14) for f, c, loc in zip('ugriz', 'bgrmk', [3500, 4600, 6100, 7500, 8800]): data = fetch_sdss_filter(f) ax.fill(data[0], data[1], ec=c, fc=c, alpha=0.4) ax.text(loc, 0.02, f, color=c, **text_kwargs) ax.set_xlim(3000, 11000) ax.set_title('SDSS Filters and Reference Spectrum') ax.set_xlabel('Wavelength (Angstroms)') ax.set_ylabel('normalized flux / filter transmission') plt.show() astroML-0.3/book_figures/appendix/README.rst0000644000076500000240000000022212115147567021466 0ustar jakevdpstaff00000000000000Appendix -------- There are several appendices, covering topics from the efficient use of Python code to the practical use of Fourier transforms. astroML-0.3/book_figures/chapter1/0000755000076500000240000000000012462244012017666 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter1/fig_dr7_quasar.py0000644000076500000240000000320712252721253023143 0ustar jakevdpstaff00000000000000""" SDSS DR7 Quasars ---------------- Figure 1.4. The r-i color vs. redshift diagram for the first 10,000 entries from the SDSS Data Release 7 Quasar Catalog. The color variation is due to emission lines entering and exiting the r and i band wavelength windows. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt from astroML.datasets import fetch_dr7_quasar #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the quasar data data = fetch_dr7_quasar() # select the first 10000 points data = data[:10000] r = data['mag_r'] i = data['mag_i'] z = data['redshift'] #------------------------------------------------------------ # Plot the quasar data fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(z, r - i, marker='.', markersize=2, linestyle='none', color='black') ax.set_xlim(0, 5) ax.set_ylim(-0.5, 1.0) ax.set_xlabel(r'${\rm redshift}$') ax.set_ylabel(r'${\rm r-i}$') plt.show() astroML-0.3/book_figures/chapter1/fig_healpix_ex.py0000644000076500000240000000465112420767763023242 0ustar jakevdpstaff00000000000000""" Example of HealPix pixellization -------------------------------- Figure 1.15. The top panel shows HEALPix pixels in nested order. The 12 fundamental sky divisions can be seen, as well as the hierarchical nature of the smaller pixels. 
This shows a pixelization with nside = 4, that is, each of the 12 large
regions has 4 x 4 pixels, for a total of 192 pixels. The lower panel shows a
seven-year co-add of raw WMAP data, plotted in the HEALPix projection using
the HealPy package. This particular realization has nside = 512, for a total
of 3,145,728 pixels. The pixels are roughly 6.8 arcminutes on a side.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
from __future__ import print_function

import numpy as np
from matplotlib import pyplot as plt

# warning: due to a bug in healpy, importing it before pylab can cause
# a segmentation fault in some circumstances.
import healpy as hp

from astroML.datasets import fetch_wmap_temperatures

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# First plot an example pixellization

# Prepare the healpix pixels
NSIDE = 4
m = np.arange(hp.nside2npix(NSIDE))
print("number of pixels:", len(m))

# Plot the pixelization
fig = plt.figure(1, figsize=(5, 3.75))
hp.mollview(m, nest=True, title="HEALPix Pixels (Mollweide)", fig=1)

# remove colorbar: we don't need it for this plot
fig.delaxes(fig.axes[1])

#------------------------------------------------------------
# Next plot the wmap pixellization
wmap_unmasked = fetch_wmap_temperatures(masked=False)

# plot the unmasked map
fig = plt.figure(2, figsize=(5, 3.75))
hp.mollview(wmap_unmasked, min=-1, max=1, title='Raw WMAP data',
            unit=r'$\Delta$T (mK)', fig=2)
fig.axes[1].texts[0].set_fontsize(8)
plt.show()
astroML-0.3/book_figures/chapter1/fig_LINEAR_sample.py0000644000076500000240000000735012420767763023416 0ustar jakevdpstaff00000000000000"""
Phased LINEAR Light Curve
-------------------------
Figure 1.7. An example of the type of data available in the LINEAR dataset.
The scatter plots show the g-r and r-i colors, and the variability period
determined using a Lomb-Scargle periodogram (for details see chapter 10).
The upper-right panel shows a phased light curve for one of the over 7000
objects.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt

from astroML.datasets import fetch_LINEAR_sample, fetch_LINEAR_geneva

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
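#----------------------------------------------------------------------
# The light curve below is phase-folded: each observation time t is
# mapped to (omega * t / (2 pi) + offset) mod 1, where omega is the
# angular frequency and the offset merely shifts the plot.  A standalone
# sketch of that step (the helper name is illustrative):
import numpy as np

def phase_fold(t, omega, offset=0.0):
    """Fold times t at angular frequency omega onto phases in [0, 1)."""
    return (t * omega / (2 * np.pi) + offset) % 1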
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Get data for the plot
data = fetch_LINEAR_sample()
geneva = fetch_LINEAR_geneva()  # contains well-measured periods

# Compute the phased light curve for a single object.
# the best-fit period in the file is not accurate enough
# for light curve phasing.  The frequency below is
# calculated using Lomb Scargle (see chapter10/fig_LINEAR_LS.py)
id = 18525697
omega = 10.82722481
t, y, dy = data[id].T
phase = (t * omega * 0.5 / np.pi + 0.1) % 1

# Select colors, magnitudes, and periods from the global set
targets = data.targets[data.targets['LP1'] < 2]
r = targets['r']
gr = targets['gr']
ri = targets['ri']
logP = targets['LP1']

# Cross-match by ID with the geneva catalog to get more accurate periods
targetIDs = [str(ID).lstrip('0') for ID in targets['objectID']]
genevaIDs = [ID.decode('utf-8').lstrip('0')
             for ID in geneva['LINEARobjectID']]


def safe_index(L, val):
    try:
        return L.index(val)
    except ValueError:
        return -1

ind = np.array([safe_index(genevaIDs, ID) for ID in targetIDs])
mask = (ind >= 0)

logP = geneva['logP'][ind[mask]]
r = r[mask]
gr = gr[mask]
ri = ri[mask]

#------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0.1, wspace=0.1, top=0.95, right=0.95)

ax = fig.add_axes((0.64, 0.62, 0.3, 0.25))
plt.errorbar(phase, y, dy, fmt='.', color='black', ecolor='gray',
             lw=1, ms=4, capsize=1.5)
plt.ylim(plt.ylim()[::-1])
plt.xlabel('phase')
plt.ylabel('magnitude')
ax.yaxis.set_major_locator(plt.MultipleLocator(0.5))
plt.title("Example of\nphased light curve")

ax = fig.add_subplot(223)
ax.plot(gr, ri, '.', color='black', markersize=2)
ax.set_xlim(-0.3, 1.5)
ax.set_ylim(-1.0, 1.5)
ax.xaxis.set_major_locator(plt.MultipleLocator(1.0))
ax.yaxis.set_major_locator(plt.MultipleLocator(1.0))
ax.set_xlabel(r'${\rm g-r}$')
ax.set_ylabel(r'${\rm r-i}$')

ax = fig.add_subplot(221, yscale='log')
ax.plot(gr, 10 ** logP, '.', color='black', markersize=2)
ax.set_xlim(-0.3, 1.5)
ax.set_ylim(3E-2, 1E1)
ax.xaxis.set_major_locator(plt.MultipleLocator(1.0))
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.set_ylabel('Period (days)')

ax = fig.add_subplot(224, xscale='log')
ax.plot(10 ** logP, ri, '.', color='black', markersize=2)
ax.set_xlim(3E-2, 1E1)
ax.set_ylim(-1.0, 1.5)
ax.yaxis.set_major_formatter(plt.NullFormatter())
ax.yaxis.set_major_locator(plt.MultipleLocator(1.0))
ax.set_xlabel('Period (days)')

plt.show()
astroML-0.3/book_figures/chapter1/fig_mercator.py0000644000076500000240000000431112252721253022704 0ustar jakevdpstaff00000000000000"""
Mercator Projection
-------------------
Figure 1.13. The Mercator projection. Shown are the projections of circles of
constant radius 10 degrees across the sky. Note that the area is not
preserved by the Mercator projection: the projection increases the size of
finite regions on the sphere, with a magnitude which increases at high
latitudes.
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.plotting import plot_tissot_ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # generate a latitude/longitude grid circ_long = np.linspace(-np.pi, np.pi, 13)[1:-1] circ_lat = np.linspace(-np.pi / 2, np.pi / 2, 7)[1:-1] radius = 10 * np.pi / 180. #------------------------------------------------------------ # plot Mercator projection: we need to set this up manually def mercator_axes(): ax = plt.axes(aspect=1.0) ax.set_xticks(np.pi / 6 * np.linspace(-5, 5, 11)) ax.set_yticks(np.pi / 12 * np.linspace(-5, 5, 11)) for axy in (ax.xaxis, ax.yaxis): axy.set_major_formatter(plt.FuncFormatter(lambda s, a: r'$%i^\circ$' % np.round(s * 180 / np.pi))) ax.set_xlim(-np.pi, np.pi) ax.set_ylim(-np.pi / 2, np.pi / 2) return ax plt.figure(figsize=(5, 3.75)) ax = mercator_axes() ax.grid(True) plot_tissot_ellipse(circ_long[:, None], circ_lat, radius, ax=ax, fc='k', alpha=0.3, lw=0) ax.set_title('Mercator projection') plt.show() astroML-0.3/book_figures/chapter1/fig_moving_objects.py0000644000076500000240000000356712252721253024114 0ustar jakevdpstaff00000000000000""" SDSS Moving Object Data ----------------------- Figure 1.8. The orbital semimajor axis vs. the orbital inclination angle diagram for the first 10,000 catalog entries from the SDSS Moving Object Catalog (after applying several quality cuts). The gaps at approximately 2.5, 2.8, and 3.3 AU are called the Kirkwood gaps and are due to orbital resonances with Jupiter. The several distinct clumps are called asteroid families and represent remnants from collisions of larger asteroids. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt from astroML.datasets import fetch_moving_objects #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the moving object data data = fetch_moving_objects(Parker2008_cuts=True) # Use only the first 10000 points data = data[:10000] a = data['aprime'] sini = data['sin_iprime'] #------------------------------------------------------------ # Plot the results fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(a, sini, '.', markersize=2, color='black') ax.set_xlim(2.0, 3.6) ax.set_ylim(-0.01, 0.31) ax.set_xlabel('Semimajor Axis (AU)') ax.set_ylabel('Sine of Inclination Angle') plt.show() astroML-0.3/book_figures/chapter1/fig_moving_objects_multicolor.py0000644000076500000240000001074412420577220026360 0ustar jakevdpstaff00000000000000""" SDSS Stripe 82 Moving Object Catalog ------------------------------------ Figure 1.12. A multicolor scatter plot of the properties of asteroids from the SDSS Moving Object Catalog (cf. figure 1.8). The left panel shows observational markers of the chemical properties of the asteroids: two colors a* and i-z. The right panel shows the orbital parameters: semimajor axis a vs. the sine of the inclination. The color of points in the right panel reflects their position in the left panel. This plot is similar to that used in figures 3-4 of Parker et al 2008. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_moving_objects from astroML.plotting.tools import devectorize_axes #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def black_bg_subplot(*args, **kwargs): """Create a subplot with black background""" kwargs['axisbg'] = 'k' ax = plt.subplot(*args, **kwargs) # set ticks and labels to white for spine in ax.spines.values(): spine.set_color('w') for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks(): for child in tick.get_children(): child.set_color('w') return ax def compute_color(mag_a, mag_i, mag_z, a_crit=-0.1): """ Compute the scatter-plot color using code adapted from TCL source used in Parker 2008. """ # define the base color scalings R = np.ones_like(mag_i) G = 0.5 * 10 ** (-2 * (mag_i - mag_z - 0.01)) B = 1.5 * 10 ** (-8 * (mag_a + 0.0)) # enhance green beyond the a_crit cutoff G += 10. 
/ (1 + np.exp((mag_a - a_crit) / 0.02)) # normalize color of each point to its maximum component RGB = np.vstack([R, G, B]) RGB /= RGB.max(0) # return an array of RGB colors, which is shape (n_points, 3) return RGB.T #------------------------------------------------------------ # Fetch data and extract the desired quantities data = fetch_moving_objects(Parker2008_cuts=True) mag_a = data['mag_a'] mag_i = data['mag_i'] mag_z = data['mag_z'] a = data['aprime'] sini = data['sin_iprime'] # dither: magnitudes are recorded only to +/- 0.01 np.random.seed(0) mag_a += -0.005 + 0.01 * np.random.random(size=mag_a.shape) mag_i += -0.005 + 0.01 * np.random.random(size=mag_i.shape) mag_z += -0.005 + 0.01 * np.random.random(size=mag_z.shape) # compute RGB color based on magnitudes color = compute_color(mag_a, mag_i, mag_z) #------------------------------------------------------------ # set up the plot fig = plt.figure(figsize=(5, 2.2), facecolor='k') fig.subplots_adjust(left=0.1, right=0.95, wspace=0.3, bottom=0.2, top=0.93) # plot the color-magnitude plot ax = black_bg_subplot(121) ax.scatter(mag_a, mag_i - mag_z, c=color, s=0.5, lw=0) devectorize_axes(ax, dpi=400) ax.plot([0, 0], [-0.8, 0.6], '--w', lw=1) ax.plot([0, 0.4], [-0.15, -0.15], '--w', lw=1) ax.set_xlim(-0.3, 0.4) ax.set_ylim(-0.8, 0.6) ax.set_xlabel(r'${\rm a*}$', color='w') ax.set_ylabel(r'${\rm i-z}$', color='w') # plot the orbital parameters plot ax = black_bg_subplot(122) ax.scatter(a, sini, c=color, s=0.5, lw=0, edgecolor='none') devectorize_axes(ax, dpi=400) ax.plot([2.5, 2.5], [-0.02, 0.3], '--w', lw=1) ax.plot([2.82, 2.82], [-0.02, 0.3], '--w', lw=1) ax.set_xlim(2.0, 3.3) ax.set_ylim(-0.02, 0.3) ax.set_xlabel(r'${\rm a (AU)}$', color='w') ax.set_ylabel(r'${\rm sin(i)}$', color='w') # label the plot text_kwargs = dict(color='w', transform=plt.gca().transAxes, ha='center', va='bottom') ax.text(0.25, 1.02, 'Inner', **text_kwargs) ax.text(0.53, 1.02, 'Mid', **text_kwargs) ax.text(0.83, 1.02, 'Outer', **text_kwargs) # Saving the black-background figure requires some extra arguments: #fig.savefig('moving_objects.png', # facecolor='black', # edgecolor='none') plt.show() astroML-0.3/book_figures/chapter1/fig_projections.py0000644000076500000240000000502712252721253023434 0ustar jakevdpstaff00000000000000""" Spherical Projections --------------------- Figure 1.14. Four common full-sky projections. The shaded ellipses represent the distortion across the sky: each is projected from a circle of radius 10 degrees on the sphere. The extent to which these are distorted and/or magnified shows the distortion inherent to the mapping. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.plotting import plot_tissot_ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
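#----------------------------------------------------------------------
# The preceding multicolor script dithers magnitudes quantized to
# +/- 0.01 mag by adding uniform noise in [-0.005, 0.005).  The same
# idea as a reusable sketch (the helper name is illustrative):
import numpy as np

def dither(values, quantum=0.01, rng=np.random):
    """Spread values recorded to the nearest `quantum` uniformly."""
    return values + quantum * (rng.random_sample(np.shape(values)) - 0.5)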
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # generate a latitude/longitude grid circ_long = np.linspace(-np.pi, np.pi, 13)[1:-1] circ_lat = np.linspace(-np.pi / 2, np.pi / 2, 7)[1:-1] radius = 10 * np.pi / 180. #------------------------------------------------------------ # Plot the built-in projections plt.figure(figsize=(5, 4)) plt.subplots_adjust(hspace=0, wspace=0.12, left=0.08, right=0.95, bottom=0.05, top=1.0) for (i, projection) in enumerate(['Hammer', 'Aitoff', 'Mollweide', 'Lambert']): ax = plt.subplot(221 + i, projection=projection.lower()) ax.xaxis.set_major_locator(plt.FixedLocator(np.pi / 3 * np.linspace(-2, 2, 5))) ax.xaxis.set_minor_locator(plt.FixedLocator(np.pi / 6 * np.linspace(-5, 5, 11))) ax.yaxis.set_major_locator(plt.FixedLocator(np.pi / 6 * np.linspace(-2, 2, 5))) ax.yaxis.set_minor_locator(plt.FixedLocator(np.pi / 12 * np.linspace(-5, 5, 11))) ax.grid(True, which='minor') plot_tissot_ellipse(circ_long[:, None], circ_lat, radius, ax=ax, fc='k', alpha=0.3, linewidth=0) ax.set_title('%s projection' % projection) plt.show() astroML-0.3/book_figures/chapter1/fig_S82_hess.py0000644000076500000240000000414212252721253022470 0ustar jakevdpstaff00000000000000""" SDSS Stripe 82 Hess Diagram --------------------------- Figure 1.10. A Hess diagram of the r-i vs. g-r colors for the entire set of SDSS Stripe 82 standard stars. The pixels are colored with a logarithmic scaling; cf. figures 1.6 and 1.9. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_S82standards #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the stripe 82 data data = fetch_sdss_S82standards() g = data['mmu_g'] r = data['mmu_r'] i = data['mmu_i'] #------------------------------------------------------------ # Compute and plot the 2D histogram H, xbins, ybins = np.histogram2d(g - r, r - i, bins=(np.linspace(-0.5, 2.5, 50), np.linspace(-0.5, 2.5, 50))) # Create a black and white color map where bad data (NaNs) are white cmap = plt.cm.binary cmap.set_bad('w', 1.) # Use the image display function imshow() to plot the result fig, ax = plt.subplots(figsize=(5, 3.75)) H[H == 0] = 1 # prevent warnings in log10 ax.imshow(np.log10(H).T, origin='lower', extent=[xbins[0], xbins[-1], ybins[0], ybins[-1]], cmap=cmap, interpolation='nearest', aspect='auto') ax.set_xlabel(r'${\rm g - r}$') ax.set_ylabel(r'${\rm r - i}$') ax.set_xlim(-0.6, 2.5) ax.set_ylim(-0.6, 2.5) plt.show() astroML-0.3/book_figures/chapter1/fig_S82_scatter_contour.py0000644000076500000240000000344012252721253024744 0ustar jakevdpstaff00000000000000""" SDSS Stripe 82 Standard Stars ----------------------------- Figure 1.9. 
Scatter plot with contours over dense regions. This is a color-color diagram
of the entire set of SDSS Stripe 82 standard stars; cf. figure 1.6.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
from matplotlib import pyplot as plt

from astroML.plotting import scatter_contour
from astroML.datasets import fetch_sdss_S82standards

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Fetch the Stripe 82 standard star catalog
data = fetch_sdss_S82standards()

g = data['mmu_g']
r = data['mmu_r']
i = data['mmu_i']

#------------------------------------------------------------
# plot the results
fig, ax = plt.subplots(figsize=(5, 3.75))
scatter_contour(g - r, r - i, threshold=200, log_counts=True, ax=ax,
                histogram2d_args=dict(bins=40),
                plot_args=dict(marker=',', linestyle='none', color='black'),
                contour_args=dict(cmap=plt.cm.bone))

ax.set_xlabel(r'${\rm g - r}$')
ax.set_ylabel(r'${\rm r - i}$')

ax.set_xlim(-0.6, 2.5)
ax.set_ylim(-0.6, 2.5)

plt.show()
astroML-0.3/book_figures/chapter1/fig_SDSS_imaging.py0000644000076500000240000000635212252721253023346 0ustar jakevdpstaff00000000000000"""
SDSS Imaging
============
Figure 1.1
The r vs. g-r color-magnitude diagrams and the r-i vs. g-r color-color
diagrams for galaxies (left column) and stars (right column) from the SDSS
imaging catalog. Only the first 5000 entries for each subset are shown in
order to minimize the blending of points (various more sophisticated
visualization methods are discussed in Section 1.6).
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt

from astroML.datasets import fetch_imaging_sample

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def get_stars_and_galaxies(Nstars=5000, Ngals=5000):
    """Get the subset of star/galaxy data to plot"""
    data = fetch_imaging_sample()

    objtype = data['type']

    stars = data[objtype == 6][:Nstars]
    galaxies = data[objtype == 3][:Ngals]

    return stars, galaxies


def plot_stars_and_galaxies(stars, galaxies):
    """Plot the star and galaxy data"""
    # Note: we use plot() rather than scatter() because it's more efficient
    # for large numbers of points.
# Scatter should be used only when points need to be different colors # and/or sizes plot_kwargs = dict(color='k', linestyle='none', marker=',') fig = plt.figure(figsize=(5, 3.75)) ax1 = fig.add_subplot(221) ax1.plot(galaxies['gRaw'] - galaxies['rRaw'], galaxies['rRaw'], **plot_kwargs) ax2 = fig.add_subplot(223, sharex=ax1) ax2.plot(galaxies['gRaw'] - galaxies['rRaw'], galaxies['rRaw'] - galaxies['iRaw'], **plot_kwargs) ax3 = fig.add_subplot(222, sharey=ax1) ax3.plot(stars['gRaw'] - stars['rRaw'], stars['rRaw'], **plot_kwargs) ax4 = fig.add_subplot(224, sharex=ax3, sharey=ax2) ax4.plot(stars['gRaw'] - stars['rRaw'], stars['rRaw'] - stars['iRaw'], **plot_kwargs) # set labels and titles ax1.set_ylabel(r'${\rm r}$') ax2.set_ylabel(r'${\rm r - i}$') ax2.set_xlabel(r'${\rm g - r}$') ax4.set_xlabel(r'${\rm g - r}$') ax1.set_title('Galaxies') ax3.set_title('Stars') # set axis limits ax2.set_xlim(-1, 3) ax3.set_ylim(22.5, 14) ax4.set_xlim(-1, 3) ax4.set_ylim(-1, 2) # adjust tick spacings on all axes for ax in (ax1, ax2, ax3, ax4): ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(1)) #------------------------------------------------------------ # Generate and show the plot stars, galaxies = get_stars_and_galaxies() plot_stars_and_galaxies(stars, galaxies) plt.show() astroML-0.3/book_figures/chapter1/fig_sdss_S82standards.py0000644000076500000240000000352412252721253024411 0ustar jakevdpstaff00000000000000""" SDSS Stripe 82 Standard Stars ----------------------------- Figure 1.6. The g-r vs. r-i color-color diagram for the first 10,000 entries in the Stripe 82 Standard Star Catalog. The region with the highest point density is dominated by main sequence stars. The thin extension toward the lower-left corner is dominated by the so-called blue horizontal branch stars and white dwarf stars. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_S82standards #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the stripe 82 data data = fetch_sdss_S82standards() # select the first 10000 points data = data[:10000] # select the mean magnitudes for g, r, i g = data['mmu_g'] r = data['mmu_r'] i = data['mmu_i'] #------------------------------------------------------------ # Plot the g-r vs r-i colors fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(g - r, r - i, marker='.', markersize=2, color='black', linestyle='none') ax.set_xlim(-0.6, 2.0) ax.set_ylim(-0.6, 2.5) ax.set_xlabel(r'${\rm g - r}$') ax.set_ylabel(r'${\rm r - i}$') plt.show() astroML-0.3/book_figures/chapter1/fig_SDSS_specgals.py0000644000076500000240000000377112252721253023536 0ustar jakevdpstaff00000000000000""" SDSS Spectroscopic Galaxy Sample -------------------------------- Figure 1.3. The r vs. 
u-r color-magnitude diagram for the first 10,000 entries in the catalog of spectroscopically observed galaxies from the Sloan Digital Sky Survey (SDSS). Note two "clouds" of points with different morphologies separated by u-r ~ 2.3. The abrupt decrease of the point density for r > 17.7 (the bottom of the diagram) is due to the selection function for the spectroscopic galaxy sample from SDSS. This example shows how to fetch photometric data from the SDSS spectroscopic sample and plot a simple color-magnitude diagram. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_specgals #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch spectroscopic galaxy data data = fetch_sdss_specgals() data = data[:10000] u = data['modelMag_u'] r = data['modelMag_r'] rPetro = data['petroMag_r'] #------------------------------------------------------------ # Plot the galaxy colors and magnitudes fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(u - r, rPetro, '.k', markersize=2) ax.set_xlim(1, 4.5) ax.set_ylim(18.1, 13.5) ax.set_xlabel(r'$\mathrm{u - r}$') ax.set_ylabel(r'$\mathrm{r_{petrosian}}$') plt.show() astroML-0.3/book_figures/chapter1/fig_sdss_spectrum.py0000644000076500000240000000332012252721253023765 0ustar jakevdpstaff00000000000000""" SDSS Spectrum Example --------------------- Figure 1.2. An example of an SDSS spectrum (the specific flux plotted as a function of wavelength) loaded from the SDSS SQL server in real time using Python tools provided here (this spectrum is uniquely described by SDSS parameters plate=1615, fiber=513, and mjd=53166). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_spectrum #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
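#----------------------------------------------------------------------
# SDSS spectra are sampled uniformly in log10(wavelength); the
# spec.wavelength() call below rebuilds the grid from the header
# coefficients, roughly as in this sketch (the coefficient values are
# typical numbers shown only for illustration):
#
#   import numpy as np
#   coeff0, coeff1 = 3.5796, 1e-4   # log10(Angstrom) zero point and step
#   lam = 10 ** (coeff0 + coeff1 * np.arange(n_pixels))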
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch single spectrum plate = 1615 mjd = 53166 fiber = 513 spec = fetch_sdss_spectrum(plate, mjd, fiber) #------------------------------------------------------------ # Plot the resulting spectrum fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(spec.wavelength(), spec.spectrum, '-k', lw=1) ax.set_xlim(3000, 10000) ax.set_ylim(25, 300) ax.set_xlabel(r'$\lambda {(\rm \AA)}$') ax.set_ylabel('Flux') ax.set_title('Plate = %(plate)i, MJD = %(mjd)i, Fiber = %(fiber)i' % locals()) plt.show() astroML-0.3/book_figures/chapter1/fig_SDSS_sspp.py0000644000076500000240000000444412252721253022720 0ustar jakevdpstaff00000000000000""" SDSS Segue Stellar Parameter Pipeline Data ------------------------------------------ Figure 1.5. The surface gravity vs. effective temperature plot for the first 10,000 entries from the catalog of stars with SDSS spectra. The rich substructure reflects both stellar physics and the SDSS selection criteria for spectroscopic follow-up. The plume of points centered on Teff ~ 5300 K and log g ~ 3 is dominated by red giant stars, and the locus of points with Teff < 6500 K and log g > 4.5 is dominated by main sequence stars. Stars to the left from the main sequence locus are dominated by the so-called blue horizontal branch stars. The axes are plotted backward for ease of comparison with the classical Hertzsprung-Russell diagram: the luminosity of a star approximately increases upward in this diagram. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_sspp #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the data data = fetch_sdss_sspp() # select the first 10000 points data = data[:10000] # do some reasonable magnitude cuts rpsf = data['rpsf'] data = data[(rpsf > 15) & (rpsf < 19)] # get the desired data logg = data['logg'] Teff = data['Teff'] #------------------------------------------------------------ # Plot the data fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(Teff, logg, marker='.', markersize=2, linestyle='none', color='black') ax.set_xlim(8000, 4500) ax.set_ylim(5.1, 1) ax.set_xlabel(r'$\mathrm{T_{eff}\ (K)}$') ax.set_ylabel(r'$\mathrm{log_{10}[g / (cm/s^2)]}$') plt.show() astroML-0.3/book_figures/chapter1/fig_SSPP_metallicity.py0000644000076500000240000001076312252721253024265 0ustar jakevdpstaff00000000000000""" Stellar Parameters Hess Diagram ------------------------------- Figure 1.11. A Hess diagram of the number per pixel (left) and [Fe/H] metallicity (center, right) of SEGUE Stellar Parameters Pipeline stars. In the center and right panels, contours representing the number density are overplotted for comparison. 
These two panels show identical data, but compare a grayscale and multicolor plotting scheme. This is an example of a situation in which multiple colors are very helpful in distinguishing close metallicity levels. This is the same data as shown in figure 1.5. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get SDSS SSPP data from astroML.datasets import fetch_sdss_sspp data = fetch_sdss_sspp() # do some reasonable magnitude cuts rpsf = data['rpsf'] data = data[(rpsf > 15) & (rpsf < 19)] # get the desired data logg = data['logg'] Teff = data['Teff'] FeH = data['FeH'] #------------------------------------------------------------ # Plot the results using the binned_statistic function from astroML.stats import binned_statistic_2d N, xedges, yedges = binned_statistic_2d(Teff, logg, FeH, 'count', bins=100) FeH_mean, xedges, yedges = binned_statistic_2d(Teff, logg, FeH, 'mean', bins=100) # Define custom colormaps: Set pixels with no sources to white cmap = plt.cm.copper cmap.set_bad('w', 1.) cmap_multicolor = plt.cm.jet cmap_multicolor.set_bad('w', 1.) 
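#----------------------------------------------------------------------
# Pixels that receive no stars have an undefined mean, which
# binned_statistic_2d returns as NaN; the set_bad('w', 1.) calls above
# are what render those pixels white.  Roughly (made-up values,
# sketch only):
#
#   N, _, _ = binned_statistic_2d([0., 1.], [0., 1.], [2., 4.],
#                                 'mean', bins=2)
#   # N == [[2., nan], [nan, 4.]]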
# Create figure and subplots fig = plt.figure(figsize=(5, 2)) fig.subplots_adjust(wspace=0.22, left=0.1, right=0.95, bottom=0.12, top=0.95) #-------------------- # First axes: ax = plt.subplot(131, xticks=[4000, 5000, 6000, 7000, 8000]) plt.imshow(np.log10(N.T), origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], aspect='auto', interpolation='nearest', cmap=cmap) plt.xlim(xedges[-1], xedges[0]) plt.ylim(yedges[-1], yedges[0]) plt.xlabel(r'$\mathrm{T_{eff}}$') plt.ylabel(r'$\mathrm{log(g)}$') cb = plt.colorbar(ticks=[0, 1, 2, 3], pad=0.2, format=r'$10^{%i}$', orientation='horizontal') cb.set_label(r'$\mathrm{number\ in\ pixel}$') plt.clim(0, 3) #-------------------- # Second axes: ax = plt.subplot(132, xticks=[4000, 5000, 6000, 7000, 8000]) plt.imshow(FeH_mean.T, origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], aspect='auto', interpolation='nearest', cmap=cmap) plt.xlim(xedges[-1], xedges[0]) plt.ylim(yedges[-1], yedges[0]) plt.xlabel(r'$\mathrm{T_{eff}}$') cb = plt.colorbar(ticks=np.arange(-2.5, 1, 1), pad=0.2, format=r'$%.1f$', orientation='horizontal') cb.set_label(r'$\mathrm{mean\ [Fe/H]\ in\ pixel}$') plt.clim(-2.5, 0.5) # Draw density contours over the colors levels = np.linspace(0, np.log10(N.max()), 7)[2:] plt.contour(np.log10(N.T), levels, colors='k', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]]) #-------------------- # Third axes: ax = plt.subplot(133, xticks=[4000, 5000, 6000, 7000, 8000]) plt.imshow(FeH_mean.T, origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], aspect='auto', interpolation='nearest', cmap=cmap_multicolor) plt.xlim(xedges[-1], xedges[0]) plt.ylim(yedges[-1], yedges[0]) plt.xlabel(r'$\mathrm{T_{eff}}$') cb = plt.colorbar(ticks=np.arange(-2.5, 1, 1), pad=0.2, format=r'$%.1f$', orientation='horizontal') cb.set_label(r'$\mathrm{mean\ [Fe/H]\ in\ pixel}$') plt.clim(-2.5, 0.5) # Draw density contours over the colors levels = np.linspace(0, np.log10(N.max()), 7)[2:] plt.contour(np.log10(N.T), levels, colors='k', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]]) plt.show() astroML-0.3/book_figures/chapter1/README.rst0000644000076500000240000000030312115147567021365 0ustar jakevdpstaff00000000000000Chapter 1: Introduction ----------------------- This chapter consists of introductory figures, including several of the datasets available in astroML, and some strategies for data visualization. 
astroML-0.3/book_figures/chapter10/0000755000076500000240000000000012462244012017746 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter10/compute_periods.py0000644000076500000240000000336712420767763023543 0ustar jakevdpstaff00000000000000"""
Compute periods for the LINEAR data
-----------------------------------
"""
from __future__ import print_function

from time import time
import numpy as np
from astroML.datasets import fetch_LINEAR_sample
from astroML.time_series import lomb_scargle, multiterm_periodogram, \
    search_frequencies
import sqlite3

Ngrid = 50000
DATABASE = 'periods.db'

data = fetch_LINEAR_sample()

# set up a database to hold periods
con = sqlite3.connect(DATABASE)

with con:
    cur = con.cursor()
    try:
        cur.execute("CREATE TABLE Periods(id INT, omega FLOAT)")
    except sqlite3.OperationalError:
        # the table already exists: reuse it
        pass

    for count, id in enumerate(data.ids):
        # only compute the period if it hasn't been computed before
        cur.execute("SELECT * FROM Periods WHERE id = ?", (id,))
        res = cur.fetchall()

        if len(res) > 0:
            print(res[0])
        else:
            print("computing period for id = {0} ({1} / {2})"
                  "".format(id, count + 1, len(data.ids)))
            lc = data[id]

            t0 = time()
            omega, power = search_frequencies(lc[:, 0], lc[:, 1], lc[:, 2],
                                              LS_func=multiterm_periodogram,
                                              n_save=5, n_retry=5,
                                              n_eval=10000,
                                              LS_kwargs=dict(n_terms=5))
            omega_best = omega[np.argmax(power)]
            t1 = time()
            print(" - execution time: %.2g sec" % (t1 - t0))

            # insert the value and commit to disk
            cur.execute("INSERT INTO Periods VALUES(?, ?)",
                        (id, omega_best))
            con.commit()

con.close()

#cur.execute("SELECT * from Periods")
#print(cur.fetchall())
astroML-0.3/book_figures/chapter10/fig_arrival_time.py0000644000076500000240000001120712420767763023645 0ustar jakevdpstaff00000000000000"""
Arrival Time Analysis
---------------------
Figure 10.24

Modeling time-dependent flux based on arrival time data. The top-right panel
shows the rate r(t) = r0[1 + a sin(omega t + phi)], along with the locations
of the 104 detected photons. The remaining panels show the model contours
calculated via MCMC; dotted lines indicate the input parameters. The
likelihood used is from eq. 10.83. Note the strong covariance between phi
and omega in the bottom-right panel.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
from __future__ import print_function, division

import numpy as np
from matplotlib import pyplot as plt

# Hack to fix import issue in older versions of pymc
import scipy
import scipy.misc
scipy.derivative = scipy.misc.derivative
import pymc

from astroML.plotting.mcmc import plot_mcmc
from astroML.decorators import pickle_results

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
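#------------------------------------------------------------
# Added aside (not part of the original script): the binned likelihood
# used below (eq. 10.83) is a discretization of the inhomogeneous
# Poisson arrival-time log-likelihood,
#     ln L = sum_i ln r(t_i) - integral r(t) dt,
# on a uniform grid with spacing Dt. A minimal numpy-only sketch,
# assuming hypothetical event times `t_events` and a rate `r_grid`
# tabulated on `t_grid` (names are illustrative):
def poisson_arrival_loglike(t_events, t_grid, r_grid):
    """ln-likelihood of event times under the rate function r(t)."""
    Dt_grid = t_grid[1] - t_grid[0]
    r_at_events = np.interp(t_events, t_grid, r_grid)
    return np.sum(np.log(r_at_events)) - np.sum(r_grid) * Dt_grid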
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Create some data np.random.seed(1) N_expected = 100 # define our rate function def rate_func(t, r0, a, omega, phi): return r0 * (1 + a * np.sin(omega * t + phi)) # define the time steps t = np.linspace(0, 10, 10000) Dt = t[1] - t[0] # compute the total rate in each bin r0_true = N_expected / (t[-1] - t[0]) a_true = 0.8 phi_true = np.pi / 4 omega_true = 4 r = rate_func(t, r0_true, a_true, omega_true, phi_true) # randomly sample photon arrivals from the rate x = np.random.random(t.shape) obs = (x < r * Dt).astype(int) print("Number of observed photons:", np.sum(obs)) #---------------------------------------------------------------------- # Set up our MCMC model r0 = pymc.Uniform('r0', 0, 1000, value=10) a = pymc.Uniform('a', 0, 1, value=0.5) phi = pymc.Uniform('phi', -np.pi, np.pi, value=0) log_omega = pymc.Uniform('log_omega', 0, np.log(10), value=np.log(4)) # uniform prior on log(omega) @pymc.deterministic def omega(log_omega=log_omega): return np.exp(log_omega) @pymc.deterministic def rate(r0=r0, a=a, omega=omega, phi=phi): return rate_func(t, r0, a, omega, phi) def arrival_like(obs, rate, Dt): """likelihood for arrival time""" N = np.sum(obs) return (N * np.log(Dt) - np.sum(rate) * Dt + np.sum(np.log(rate[obs > 0]))) Arrival = pymc.stochastic_from_dist('arrival', logp=arrival_like, dtype=np.float, mv=True) obs_dist = Arrival('obs_dist', rate=rate, Dt=Dt, observed=True, value=obs) model = dict(obs_dist=obs_dist, r0=r0, a=a, phi=phi, log_omega=log_omega, omega=omega, rate=rate) #------------------------------------------------------------ # Compute results (and save to a pickle file) @pickle_results('arrival_times.pkl') def compute_model(niter=20000, burn=2000): S = pymc.MCMC(model) S.sample(iter=niter, burn=burn) traces = [S.trace(s)[:] for s in ['r0', 'a', 'phi', 'omega']] return traces traces = compute_model() labels = ['$r_0$', '$a$', r'$\phi$', r'$\omega$'] limits = [(6.5, 13.5), (0.55, 1.1), (-0.3, 1.7), (3.75, 4.25)] true = [r0_true, a_true, phi_true, omega_true] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) # This function plots multiple panels with the traces plot_mcmc(traces, labels=labels, limits=limits, true_values=true, fig=fig, bins=30, colors='k') # Plot the model of arrival times ax = fig.add_axes([0.5, 0.75, 0.45, 0.2]) ax.fill_between(t, 0, rate_func(t, r0_true, a_true, omega_true, phi_true), facecolor='#DDDDDD', edgecolor='black') ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_xlim(t[0], t[-1]) ax.set_ylim(0, 20) ax.set_ylabel('$r(t)$') # Plot the actual data ax = fig.add_axes([0.5, 0.7, 0.45, 0.04], yticks=[]) t_obs = t[obs > 0] ax.scatter(t_obs, np.random.RandomState(0).rand(len(t_obs)), marker='+', color='k') ax.set_xlim(t[0], t[-1]) ax.set_ylim(-0.3, 1.3) ax.set_xlabel('$t$') plt.show() astroML-0.3/book_figures/chapter10/fig_autocorrelation.py0000644000076500000240000000570712420767763024411 0ustar jakevdpstaff00000000000000""" Autocorrelation Function ------------------------ Figure 10.30 Example of the autocorrelation function for a stochastic process. The top panel shows a simulated light curve generated using a damped random walk model (Section 10.5.4). The bottom panel shows the corresponding autocorrelation function computed using Edelson and Krolik's DCF method and the Scargle method. 
The solid line shows the input autocorrelation function used to generate the light curve. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import lomb_scargle, generate_damped_RW from astroML.time_series import ACF_scargle, ACF_EK #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate time-series data: # we'll do 1000 days worth of magnitudes t = np.arange(0, 1E3) z = 2.0 tau = 300 tau_obs = tau / (1. + z) np.random.seed(6) y = generate_damped_RW(t, tau=tau, z=z, xmean=20) # randomly sample 100 of these ind = np.arange(len(t)) np.random.shuffle(ind) ind = ind[:100] ind.sort() t = t[ind] y = y[ind] # add errors dy = 0.1 y_obs = np.random.normal(y, dy) #------------------------------------------------------------ # compute ACF via scargle method C_S, t_S = ACF_scargle(t, y_obs, dy, n_omega=2. ** 12, omega_max=np.pi / 5.0) ind = (t_S >= 0) & (t_S <= 500) t_S = t_S[ind] C_S = C_S[ind] #------------------------------------------------------------ # compute ACF via E-K method C_EK, C_EK_err, bins = ACF_EK(t, y_obs, dy, bins=np.linspace(0, 500, 51)) t_EK = 0.5 * (bins[1:] + bins[:-1]) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) # plot the input data ax = fig.add_subplot(211) ax.errorbar(t, y_obs, dy, fmt='.k', lw=1) ax.set_xlabel('t (days)') ax.set_ylabel('observed flux') # plot the ACF ax = fig.add_subplot(212) ax.plot(t_S, C_S, '-', c='gray', lw=1, label='Scargle') ax.errorbar(t_EK, C_EK, C_EK_err, fmt='.k', lw=1, label='Edelson-Krolik') ax.plot(t_S, np.exp(-abs(t_S) / tau_obs), '-k', label='True') ax.legend(loc=3) ax.plot(t_S, 0 * t_S, ':', lw=1, c='gray') ax.set_xlim(0, 500) ax.set_ylim(-1.0, 1.1) ax.set_xlabel('t (days)') ax.set_ylabel('ACF(t)') plt.show() astroML-0.3/book_figures/chapter10/fig_chirp2_PSD.py0000644000076500000240000000551112420577220023050 0ustar jakevdpstaff00000000000000""" Chirp wavelet PSD ----------------- Figure 10.28 A wavelet PSD of the ten-parameter chirp signal similar to that analyzed in figure 10.27. Here, the signal with an amplitude of A = 0.8 is sampled in 4096 evenly spaced bins, and with Gaussian noise with sigma = 1. The two-dimensional wavelet PSD easily recovers the increase of characteristic chirp frequency with time. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.fourier import FT_continuous, IFT_continuous, wavelet_PSD #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the chirp signal def chirp(t, T, A, phi, omega, beta): signal = A * np.sin(phi + omega * (t - T) + beta * (t - T) ** 2) signal[t < T] = 0 return signal def background(t, b0, b1, Omega1, Omega2): return b0 + b1 * np.sin(Omega1 * t) * np.sin(Omega2 * t) np.random.seed(42) N = 4096 t = np.linspace(-50, 50, N) h_true = chirp(t, -20, 0.8, 0, 0.2, 0.02) h = h_true + np.random.normal(0, 1, N) #------------------------------------------------------------ # Compute the wavelet PSD f0 = np.linspace(0.04, 0.6, 100) wPSD = wavelet_PSD(t, h, f0, Q=1.0) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.05, left=0.1, right=0.95, bottom=0.1, top=0.95) # Top: plot the data ax = fig.add_subplot(211) ax.plot(t + 50, h, '-', c='#AAAAAA') ax.plot(t + 50, h_true, '-k') ax.text(0.02, 0.95, "Input Signal: chirp", ha='left', va='top', transform=ax.transAxes, bbox=dict(boxstyle='round', fc='w', ec='k')) ax.set_xlim(0, 100) ax.set_ylim(-2.9, 2.9) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('$h(t)$') # Bottom: plot the 2D PSD ax = fig.add_subplot(212) ax.imshow(wPSD, origin='lower', aspect='auto', extent=[t[0] + 50, t[-1] + 50, f0[0], f0[-1]], cmap=plt.cm.binary) ax.text(0.02, 0.95, ("Wavelet PSD"), color='w', ha='left', va='top', transform=ax.transAxes) ax.set_xlim(0, 100) ax.set_ylim(0.04, 0.6001) ax.set_xlabel('$t$') ax.set_ylabel('$f_0$') plt.show() astroML-0.3/book_figures/chapter10/fig_convolution_diagram.py0000644000076500000240000001364512252721253025225 0ustar jakevdpstaff00000000000000""" Plot a Diagram explaining a Convolution --------------------------------------- Figure 10.2 A schematic of how the convolution of two functions works. The top-left panel shows simulated data (black line); this time series is convolved with a top-hat function (gray boxes); see eq. 10.8. The top-right panels show the Fourier transform of the data and the window function. These can be multiplied together (bottom-right panel) and inverse transformed to find the convolution (bottom-left panel), which amounts to integrating the data over copies of the window at all locations. The result in the bottom-left panel can be viewed as the signal shown in the top-left panel smoothed with the window (top-hat) function. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.signal import fftconvolve #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate random x, y with a given covariance length np.random.seed(1) x = np.linspace(0, 1, 500) h = 0.01 C = np.exp(-0.5 * (x - x[:, None]) ** 2 / h ** 2) y = 0.8 + 0.3 * np.random.multivariate_normal(np.zeros(len(x)), C) #------------------------------------------------------------ # Define a normalized top-hat window function w = np.zeros_like(x) w[(x > 0.12) & (x < 0.28)] = 1 #------------------------------------------------------------ # Perform the convolution y_norm = np.convolve(np.ones_like(y), w, mode='full') valid_indices = (y_norm != 0) y_norm = y_norm[valid_indices] y_w = np.convolve(y, w, mode='full')[valid_indices] / y_norm # trick: convolve with x-coordinate to find the center of the window at # each point. x_w = np.convolve(x, w, mode='full')[valid_indices] / y_norm #------------------------------------------------------------ # Compute the Fourier transforms of the signal and window y_fft = np.fft.fft(y) w_fft = np.fft.fft(w) yw_fft = y_fft * w_fft yw_final = np.fft.ifft(yw_fft) #------------------------------------------------------------ # Set up the plots fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.09, bottom=0.09, right=0.95, top=0.95, hspace=0.05, wspace=0.05) #---------------------------------------- # plot the data and window function ax = fig.add_subplot(221) ax.plot(x, y, '-k', label=r'data $D(x)$') ax.fill(x, w, color='gray', alpha=0.5, label=r'window $W(x)$') ax.fill(x, w[::-1], color='gray', alpha=0.5) ax.legend() ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('$D$') ax.set_xlim(0.01, 0.99) ax.set_ylim(0, 2.0) #---------------------------------------- # plot the convolution ax = fig.add_subplot(223) ax.plot(x_w, y_w, '-k') ax.text(0.5, 0.95, "Convolution:\n" + r"$[D \ast W](x)$", ha='center', va='top', transform=ax.transAxes, bbox=dict(fc='w', ec='k', pad=8), zorder=2) ax.text(0.5, 0.05, (r'$[D \ast W](x)$' + r'$= \mathcal{F}^{-1}\{\mathcal{F}[D] \cdot \mathcal{F}[W]\}$'), ha='center', va='bottom', transform=ax.transAxes) for x_loc in (0.2, 0.8): y_loc = y_w[x_w <= x_loc][-1] ax.annotate('', (x_loc, y_loc), (x_loc, 2.0), zorder=1, arrowprops=dict(arrowstyle='->', color='gray', lw=2)) ax.set_xlabel('$x$') ax.set_ylabel('$D_W$') ax.set_xlim(0.01, 0.99) ax.set_ylim(0, 1.99) #---------------------------------------- # plot the Fourier transforms N = len(x) k = - 0.5 * N + np.arange(N) * 1. 
/ N / (x[1] - x[0]) ax = fig.add_subplot(422) ax.plot(k, abs(np.fft.fftshift(y_fft)), '-k') ax.text(0.95, 0.95, r'$\mathcal{F}(D)$', ha='right', va='top', transform=ax.transAxes) ax.set_xlim(-100, 100) ax.set_ylim(-5, 85) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.yaxis.set_major_formatter(plt.NullFormatter()) ax = fig.add_subplot(424) ax.plot(k, abs(np.fft.fftshift(w_fft)), '-k') ax.text(0.95, 0.95, r'$\mathcal{F}(W)$', ha='right', va='top', transform=ax.transAxes) ax.set_xlim(-100, 100) ax.set_ylim(-5, 85) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.yaxis.set_major_formatter(plt.NullFormatter()) #---------------------------------------- # plot the product of Fourier transforms ax = fig.add_subplot(224) ax.plot(k, abs(np.fft.fftshift(yw_fft)), '-k') ax.text(0.95, 0.95, ('Pointwise\nproduct:\n' + r'$\mathcal{F}(D) \cdot \mathcal{F}(W)$'), ha='right', va='top', transform=ax.transAxes, bbox=dict(fc='w', ec='k', pad=8), zorder=2) ax.set_xlim(-100, 100) ax.set_ylim(-100, 3500) ax.set_xlabel('$k$') ax.yaxis.set_major_formatter(plt.NullFormatter()) #------------------------------------------------------------ # Plot flow arrows ax = fig.add_axes([0, 0, 1, 1], xticks=[], yticks=[], frameon=False) arrowprops = dict(arrowstyle="simple", color="gray", alpha=0.5, shrinkA=5, shrinkB=5, patchA=None, patchB=None, connectionstyle="arc3,rad=-0.35") ax.annotate('', [0.57, 0.57], [0.47, 0.57], arrowprops=arrowprops, transform=ax.transAxes) ax.annotate('', [0.57, 0.47], [0.57, 0.57], arrowprops=arrowprops, transform=ax.transAxes) ax.annotate('', [0.47, 0.47], [0.57, 0.47], arrowprops=arrowprops, transform=ax.transAxes) plt.show() astroML-0.3/book_figures/chapter10/fig_FFT_aliasing.py0000644000076500000240000001200612252721253023436 0ustar jakevdpstaff00000000000000""" The effect of Sampling ---------------------- Figure 10.3 A visualization of aliasing in the Fourier transform. In each set of four panels, the top-left panel shows a signal and a regular sampling function, the top-right panel shows the Fourier transform of the signal and sampling function, the bottom-left panel shows the sampled data, and the bottom-right panel shows the convolution of the Fourier-space representations (cf. figure 10.2). In the top four panels, the data is well sampled, and there is little to no aliasing. In the bottom panels, the data is not well sampled (the spacing between two data points is larger) which leads to aliasing, as seen in the overlap of the convolved Fourier transforms (figure adapted from Greg05). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
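#------------------------------------------------------------
# Added aside (not part of the original script): a numerical check of
# the analytic Gaussian transform pair defined below, using the
# continuous-FT convention H(f) = integral h(t) exp(-2 pi i f t) dt.
# The FFT approximates this integral once t=0 is rotated to the first
# sample; names ending in _chk are local to this sketch.
def _check_gaussian_pair(a_chk=1.0, n_chk=2048, T_chk=40.0):
    t_chk = np.linspace(-T_chk / 2, T_chk / 2, n_chk, endpoint=False)
    dt_chk = t_chk[1] - t_chk[0]
    h_chk = np.exp(-0.5 * (t_chk / a_chk) ** 2)
    # ifftshift puts t=0 first so the FFT approximates the continuous FT
    H_num = dt_chk * np.fft.fftshift(np.fft.fft(np.fft.ifftshift(h_chk)))
    f_chk = np.fft.fftshift(np.fft.fftfreq(n_chk, dt_chk))
    H_ana = (np.sqrt(2 * np.pi * a_chk ** 2)
             * np.exp(-2 * (np.pi * a_chk * f_chk) ** 2))
    assert np.allclose(H_num.real, H_ana, atol=1E-8)

_check_gaussian_pair()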
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def gaussian(x, a=1.0):
    return np.exp(-0.5 * (x / a) ** 2)


def gaussian_FT(f, a=1.0):
    return np.sqrt(2 * np.pi * a ** 2) * np.exp(-2 * (np.pi * a * f) ** 2)

#------------------------------------------------------------
# Define our terms
a = 1.0
t = np.linspace(-5, 5, 1000)
h = gaussian(t, a)

f = np.linspace(-2, 2, 1000)
H = gaussian_FT(f, a)

#------------------------------------------------------------
# Two plots: one well-sampled, one undersampled
N = 12

for dt in (0.9, 1.5):
    # define time-space sampling
    t_sample = dt * (np.arange(N) - N / 2)
    h_sample = gaussian(t_sample, a)

    # Fourier transform of time-space sampling
    df = 1. / dt
    f_sample = df * (np.arange(N) - N / 2)

    # Plot the results
    fig = plt.figure(figsize=(5, 2.5))
    fig.subplots_adjust(left=0.07, right=0.95, wspace=0.16,
                        bottom=0.1, top=0.85, hspace=0.05)

    # First plot: sampled time-series
    ax = fig.add_subplot(221)
    ax.plot(t, h, '-k')

    for ts in t_sample:
        ax.annotate('', (ts, 0.5), (ts, 0), ha='center', va='center',
                    arrowprops=dict(arrowstyle='->'))
    ax.text(0.03, 0.95,
            ("Signal and Sampling Window\n" +
             r"Sampling Rate $\Delta t$"),
            ha='left', va='top', transform=ax.transAxes)
    ax.set_ylabel('$h(t)$')
    ax.set_xlim(-5, 5)
    ax.set_ylim(0, 1.4)
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.yaxis.set_major_formatter(plt.NullFormatter())
    ax.set_title('Time Domain: Multiplication')

    # second plot: frequency space
    ax = fig.add_subplot(222)
    ax.plot(f, H, '-k')
    for fs in f_sample:
        ax.annotate('', (fs, 1.5), (fs, 0), ha='center', va='center',
                    arrowprops=dict(arrowstyle='->'))
    ax.text(0.03, 0.95,
            ("FT of Signal and Sampling Window\n" +
             r"$\Delta f = 1 / \Delta t$"),
            ha='left', va='top', transform=ax.transAxes)
    ax.set_ylabel('$H(f)$')
    ax.set_xlim(-1.5, 1.5)
    ax.set_ylim(0, 3.8)
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.yaxis.set_major_formatter(plt.NullFormatter())
    ax.set_title('Frequency Domain: Convolution')

    # third plot: windowed function
    ax = fig.add_subplot(223)
    for (ts, hs) in zip(t_sample, h_sample):
        if hs < 0.1:
            continue
        ax.annotate('', (ts, hs), (ts, 0), ha='center', va='center',
                    arrowprops=dict(arrowstyle='->'))
    ax.plot(t, h, ':k')
    ax.text(0.03, 0.95, "Sampled signal: pointwise\nmultiplication",
            ha='left', va='top', transform=ax.transAxes)
    ax.set_xlabel('$t$')
    ax.set_ylabel('$h(t)$')
    ax.set_xlim(-5, 5)
    ax.set_ylim(0, 1.4)
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.yaxis.set_major_formatter(plt.NullFormatter())

    # fourth plot: convolved PSD
    ax = fig.add_subplot(224)
    window = np.array([gaussian_FT(f - fs, a) for fs in f_sample])
    ax.plot(f, window.sum(0), '-k')
    if dt > 1:
        ax.plot(f, window.T, ':k')
    ax.text(0.03, 0.95, "Convolution of signal FT\nand window FT",
            ha='left', va='top', transform=ax.transAxes)
    ax.set_xlabel('$f$')
    ax.set_ylabel('$H(f)$')
    ax.set_xlim(-1.5, 1.5)
    ax.set_ylim(0, 3.8)
    ax.xaxis.set_major_formatter(plt.NullFormatter())
    ax.yaxis.set_major_formatter(plt.NullFormatter())

    if dt > 1:
        fig.suptitle(r"Undersampled data: $\Delta t > t_c$")
    else:
        fig.suptitle(r"Well-sampled data: $\Delta t < t_c$")

plt.show()
astroML-0.3/book_figures/chapter10/fig_fft_example.py0000644000076500000240000000574712252721253023454 0ustar jakevdpstaff00000000000000"""
Fast Fourier Transform Example
------------------------------
Figure 10.5

The discrete Fourier transform (bottom panel) for two noisy data sets shown
in the top panel.
For 512 evenly sampled times t (dt ~ 0.098), points are drawn from
h(t) = a + sin(t)G(t), where G(t) is a Gaussian N(mu = 0, sigma = 10).
Gaussian noise with sigma = 0.05 (top data set) and 0.005 (bottom data set)
is added to the signal h(t). The value of the offset a is 0.15 and 0,
respectively. The discrete Fourier transform is computed as described in
Section 10.2.3. For both noise realizations, the correct frequency
f = 1/(2 pi) ~ 0.159 is easily discernible in the bottom panel. Note that
the height of the peaks is the same for both noise realizations. The large
value of abs(H(f = 0)) for the data with larger noise is due to the
vertical offset.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy.fftpack import fft
from scipy.stats import norm

from astroML.fourier import PSD_continuous

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Draw the data
np.random.seed(1)

tj = np.linspace(-25, 25, 512)
hj = np.sin(tj)
hj *= norm(0, 10).pdf(tj)

#------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(5, 3.75))
fig.subplots_adjust(hspace=0.25)
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

offsets = (0, 0.15)
colors = ('black', 'gray')
linewidths = (1, 2)
errors = (0.005, 0.05)

for (offset, color, error, linewidth) in zip(offsets, colors,
                                             errors, linewidths):
    # compute the PSD
    err = np.random.normal(0, error, size=hj.shape)
    hj_N = hj + err + offset
    fk, PSD = PSD_continuous(tj, hj_N)

    # plot the data and PSD
    ax1.scatter(tj, hj_N, s=4, c=color, lw=0)
    ax1.plot(tj, 0 * tj + offset, '-', c=color, lw=1)
    ax2.plot(fk, PSD, '-', c=color, lw=linewidth)

# vertical line marking the expected peak location
ax2.plot([0.5 / np.pi, 0.5 / np.pi], [-0.1, 1], ':k', lw=1)

ax1.set_xlim(-25, 25)
ax1.set_ylim(-0.1, 0.3001)

ax1.set_xlabel('$t$')
ax1.set_ylabel('$h(t)$')

ax1.yaxis.set_major_locator(plt.MultipleLocator(0.1))

ax2.set_xlim(0, 0.8)
ax2.set_ylim(-0.101, 0.801)

ax2.set_xlabel('$f$')
ax2.set_ylabel('$PSD(f)$')

plt.show()
astroML-0.3/book_figures/chapter10/fig_FFT_sampling.py0000644000076500000240000000732712252721253023473 0ustar jakevdpstaff00000000000000"""
The effect of Sampling
----------------------
Figure 10.4

An illustration of the impact of a sampling window function on the resulting
PSD. The top-left panel shows a simulated data set with 40 points drawn from
the function y(t|P) = sin(t) (i.e., f = 1/(2pi) ~ 0.16). The sampling is
random, and illustrated by the vertical lines in the bottom-left panel.
The PSD of sampling times, or spectral window, is shown in the bottom-right
panel.
The PSD computed for the data set from the top-left panel is shown in the top-right panel; it is equal to a convolution of the single peak (shaded in gray) with the window PSD shown in the bottom-right panel (e.g., the peak at f ~ 0.42 in the top-right panel can be traced to a peak at f ~ 0.26 in the bottom-right panel). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate the data Nbins = 2 ** 15 Nobs = 40 f = lambda t: np.sin(np.pi * t / 3) t = np.linspace(-100, 200, Nbins) dt = t[1] - t[0] y = f(t) # select observations np.random.seed(42) t_obs = 100 * np.random.random(40) D = abs(t_obs[:, np.newaxis] - t) i = np.argmin(D, 1) t_obs = t[i] y_obs = y[i] window = np.zeros(Nbins) window[i] = 1 #------------------------------------------------------------ # Compute PSDs Nfreq = Nbins / 2 dt = t[1] - t[0] df = 1. / (Nbins * dt) f = df * np.arange(Nfreq) PSD_window = abs(np.fft.fft(window)[:Nfreq]) ** 2 PSD_y = abs(np.fft.fft(y)[:Nfreq]) ** 2 PSD_obs = abs(np.fft.fft(y * window)[:Nfreq]) ** 2 # normalize the true PSD so it can be shown in the plot: # in theory it's a delta function, so normalization is # arbitrary # scale PSDs for plotting PSD_window /= 500 PSD_y /= PSD_y.max() PSD_obs /= 500 #------------------------------------------------------------ # Prepare the figures fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, hspace=0.2, wspace=0.25, left=0.12, right=0.95) # First panel: data vs time ax = fig.add_subplot(221) ax.plot(t, y, '-', c='gray') ax.plot(t_obs, y_obs, '.k', ms=4) ax.text(0.95, 0.93, "Data", ha='right', va='top', transform=ax.transAxes) ax.set_ylabel('$y(t)$') ax.set_xlim(0, 100) ax.set_ylim(-1.5, 1.8) # Second panel: PSD of data ax = fig.add_subplot(222) ax.fill(f, PSD_y, fc='gray', ec='gray') ax.plot(f, PSD_obs, '-', c='black') ax.text(0.95, 0.93, "Data PSD", ha='right', va='top', transform=ax.transAxes) ax.set_ylabel('$P(f)$') ax.set_xlim(0, 1.0) ax.set_ylim(-0.1, 1.1) # Third panel: window vs time ax = fig.add_subplot(223) ax.plot(t, window, '-', c='black') ax.text(0.95, 0.93, "Window", ha='right', va='top', transform=ax.transAxes) ax.set_xlabel('$t$') ax.set_ylabel('$y(t)$') ax.set_xlim(0, 100) ax.set_ylim(-0.2, 1.5) # Fourth panel: PSD of window ax = fig.add_subplot(224) ax.plot(f, PSD_window, '-', c='black') ax.text(0.95, 0.93, "Window PSD", ha='right', va='top', transform=ax.transAxes) ax.set_xlabel('$f$') ax.set_ylabel('$P(f)$') ax.set_xlim(0, 1.0) ax.set_ylim(-0.1, 1.1) plt.show() astroML-0.3/book_figures/chapter10/fig_gaussian_reconstruct.py0000644000076500000240000000372212252721253025422 0ustar jakevdpstaff00000000000000""" Fourier Reconstruction of a Gaussian ------------------------------------ This figure 
demonstrates Fourier decomposition of a Gaussian """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) x = np.linspace(-50, 50, 10000) y = norm.pdf(x, 0, 1) fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0) kvals = [20, 30, 50] subplots = [311, 312, 313] for (k, subplot) in zip(kvals, subplots): ax = fig.add_subplot(subplot) # Use FFT to fit a truncated Fourier series y_fft = np.fft.fft(y) y_fft[k + 1:-k] = 0 y_fit = np.fft.ifft(y_fft).real ax.plot(x, y, color='gray') ax.plot(x, y_fit, color='black') if k == 1: ax.text(0.01, 0.95, "1 mode", ha='left', va='top', transform=ax.transAxes) else: ax.text(0.01, 0.95, "%i modes" % k, ha='left', va='top', transform=ax.transAxes) if subplot == subplots[-1]: ax.set_xlabel('phase') else: ax.xaxis.set_major_formatter(plt.NullFormatter()) if subplot == subplots[1]: ax.set_ylabel('amplitude') ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.set_xlim(-5, 5) ax.set_ylim(-0.05, 0.5) plt.show() astroML-0.3/book_figures/chapter10/fig_LIGO_power_spectrum.py0000644000076500000240000000763012252721253025047 0ustar jakevdpstaff00000000000000""" Plot the power spectrum of LIGO data ------------------------------------ Figure 10.6 LIGO data and its noise power spectrum. The upper panel shows a 2-second-long stretch of data (~8000 points; essentially noise without signal) from LIGO Hanford. The middle and bottom panels show the power spectral density computed for 2048 seconds of data, sampled at 4096 Hz (~8 million data values). The gray line shows the PSD computed using a naive FFT approach; the dark line uses Welch's method of overlapping windows to smooth noise; the middle panel uses a 1-second-wide top-hat window and the bottom panel the so-called Hanning (cosine) window with the same width. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import fftpack from matplotlib import mlab from astroML.datasets import fetch_LIGO_large #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
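#------------------------------------------------------------
# Added aside (not part of the original script): Welch's method, used
# below via matplotlib.mlab.psd, averages periodograms of overlapping,
# windowed segments. A minimal numpy-only sketch of the idea (one-sided,
# density normalization; one-sided conventions usually also double all
# but the DC/Nyquist bins). The function and its arguments are
# illustrative, not mlab's API:
def welch_psd_sketch(x, nfft, fs, window=None):
    if window is None:
        window = np.hanning(nfft)
    step = nfft // 2  # 50% overlap, as in the calls below
    segments = [x[j:j + nfft] * window
                for j in range(0, len(x) - nfft + 1, step)]
    psd = np.mean([np.abs(np.fft.rfft(seg)) ** 2 for seg in segments],
                  axis=0)
    psd /= fs * np.sum(window ** 2)  # normalize to spectral density
    return psd, np.fft.rfftfreq(nfft, 1. / fs)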
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the LIGO hanford data data, dt = fetch_LIGO_large() # subset of the data to plot t0 = 646 T = 2 tplot = dt * np.arange(T * 4096) dplot = data[4096 * t0: 4096 * (t0 + T)] tplot = tplot[::10] dplot = dplot[::10] fmin = 40 fmax = 2060 #------------------------------------------------------------ # compute PSD using simple FFT N = len(data) df = 1. / (N * dt) PSD = abs(dt * fftpack.fft(data)[:N / 2]) ** 2 f = df * np.arange(N / 2) cutoff = ((f >= fmin) & (f <= fmax)) f = f[cutoff] PSD = PSD[cutoff] f = f[::100] PSD = PSD[::100] #------------------------------------------------------------ # compute PSD using Welch's method -- no window function PSDW1, fW1 = mlab.psd(data, NFFT=4096, Fs=1. / dt, window=mlab.window_none, noverlap=2048) dfW1 = fW1[1] - fW1[0] cutoff = (fW1 >= fmin) & (fW1 <= fmax) fW1 = fW1[cutoff] PSDW1 = PSDW1[cutoff] #------------------------------------------------------------ # compute PSD using Welch's method -- hanning window function PSDW2, fW2 = mlab.psd(data, NFFT=4096, Fs=1. / dt, window=mlab.window_hanning, noverlap=2048) dfW2 = fW2[1] - fW2[0] cutoff = (fW2 >= fmin) & (fW2 <= fmax) fW2 = fW2[cutoff] PSDW2 = PSDW2[cutoff] #------------------------------------------------------------ # Plot the data fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(bottom=0.1, top=0.9, hspace=0.3) # top panel: time series ax = fig.add_subplot(311) ax.plot(tplot, dplot, '-k') ax.set_xlabel('time (s)') ax.set_ylabel('$h(t)$') ax.set_ylim(-1.2E-18, 1.2E-18) # middle panel: non-windowed filter ax = fig.add_subplot(312) ax.loglog(f, PSD, '-', c='#AAAAAA') ax.loglog(fW1, PSDW1, '-k') ax.text(0.98, 0.95, "Top-hat window", ha='right', va='top', transform=ax.transAxes) ax.set_xlabel('frequency (Hz)') ax.set_ylabel(r'$PSD(f)$') ax.set_xlim(40, 2060) ax.set_ylim(1E-46, 1E-36) ax.yaxis.set_major_locator(plt.LogLocator(base=100)) # bottom panel: hanning window ax = fig.add_subplot(313) ax.loglog(f, PSD, '-', c='#AAAAAA') ax.loglog(fW2, PSDW2, '-k') ax.text(0.98, 0.95, "Hanning (cosine) window", ha='right', va='top', transform=ax.transAxes) ax.set_xlabel('frequency (Hz)') ax.set_ylabel(r'$PSD(f)$') ax.set_xlim(40, 2060) ax.set_ylim(1E-46, 1E-36) ax.yaxis.set_major_locator(plt.LogLocator(base=100)) plt.show() astroML-0.3/book_figures/chapter10/fig_line_wavelet_PSD.py0000644000076500000240000001041012252721253024331 0ustar jakevdpstaff00000000000000""" Wavelet transform of a Noisy Spike ---------------------------------- Figure 10.8 Localized frequency analysis using the wavelet transform. The upper panel shows the input signal, which consists of a Gaussian spike in the presence of white (Gaussian) noise (see figure 10.10). The middle panel shows an example wavelet. The lower panel shows the power spectral density as a function of the frequency f0 and the time t0, for Q = 0.3. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.fourier import FT_continuous, IFT_continuous #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def wavelet(t, t0, f0, Q): return (np.exp(-(f0 / Q * (t - t0)) ** 2) * np.exp(2j * np.pi * f0 * (t - t0))) def wavelet_FT(f, t0, f0, Q): # this is its fourier transform using # H(f) = integral[ h(t) exp(-2pi i f t) dt] return (np.sqrt(np.pi) * Q / f0 * np.exp(-2j * np.pi * f * t0) * np.exp(-(np.pi * (f - f0) * Q / f0) ** 2)) def check_funcs(t0=1, f0=2, Q=3): t = np.linspace(-5, 5, 10000) h = wavelet(t, t0, f0, Q) f, H = FT_continuous(t, h) assert np.allclose(H, wavelet_FT(f, t0, f0, Q)) #------------------------------------------------------------ # Create the simulated dataset np.random.seed(5) t = np.linspace(-40, 40, 2001)[:-1] h = np.exp(-0.5 * ((t - 20.) / 1.0) ** 2) hN = h + np.random.normal(0, 0.5, size=h.shape) #------------------------------------------------------------ # Compute the convolution via the continuous Fourier transform # This is more exact than using the discrete transform, because # we have an analytic expression for the FT of the wavelet. 
Q = 0.3 f0 = 2 ** np.linspace(-3, -1, 100) f, H = FT_continuous(t, hN) W = np.conj(wavelet_FT(f, 0, f0[:, None], Q)) t, HW = IFT_continuous(f, H * W) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(hspace=0.05, left=0.12, right=0.95, bottom=0.08, top=0.95) # First panel: the signal ax = fig.add_subplot(311) ax.plot(t, hN, '-k', lw=1) ax.text(0.02, 0.95, ("Input Signal:\n" "Localized spike plus noise"), ha='left', va='top', transform=ax.transAxes) ax.set_xlim(-40, 40) ax.set_ylim(-1.2, 2.2) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('$h(t)$') # Second panel: the wavelet ax = fig.add_subplot(312) W = wavelet(t, 0, 0.125, Q) ax.plot(t, W.real, '-k', label='real part', lw=1) ax.plot(t, W.imag, '--k', label='imag part', lw=1) ax.legend(loc=1) ax.text(0.02, 0.95, ("Example Wavelet\n" "$t_0 = 0$, $f_0=1/8$, $Q=0.3$"), ha='left', va='top', transform=ax.transAxes) ax.text(0.98, 0.05, (r"$w(t; t_0, f_0, Q) = e^{-[f_0 (t - t_0) / Q]^2}" "e^{2 \pi i f_0 (t - t_0)}$"), ha='right', va='bottom', transform=ax.transAxes) ax.set_xlim(-40, 40) ax.set_ylim(-1.4, 1.4) ax.set_ylabel('$w(t; t_0, f_0, Q)$') ax.xaxis.set_major_formatter(plt.NullFormatter()) # Third panel: the spectrogram ax = fig.add_subplot(313) ax.imshow(abs(HW) ** 2, origin='lower', aspect='auto', cmap=plt.cm.binary, extent=[t[0], t[-1], np.log2(f0)[0], np.log2(f0)[-1]]) ax.set_xlim(-40, 40) ax.text(0.02, 0.95, ("Wavelet PSD"), color='w', ha='left', va='top', transform=ax.transAxes) ax.set_ylim(np.log2(f0)[0], np.log2(f0)[-1]) ax.set_xlabel('$t$') ax.set_ylabel('$f_0$') ax.yaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, *args: ("1/%i" % (2 ** -x)))) plt.show() astroML-0.3/book_figures/chapter10/fig_LINEAR_BIC.py0000644000076500000240000000651012252721253022602 0ustar jakevdpstaff00000000000000""" BIC for LINEAR light curve -------------------------- Figure 10.19 BIC as a function of the number of frequency components for the light curve shown in figure 10.18. BIC for the two prominent frequency peaks is shown. The inset panel details the area near the maximum. For both frequencies, the BIC peaks at between 10 and 15 terms; note that a high value of BIC is achieved already with 6 components. Comparing the two, the longer period model (bottom panel) is much more significant. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import multiterm_periodogram, lomb_scargle_BIC from astroML.datasets import fetch_LINEAR_sample #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
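#------------------------------------------------------------
# Added aside (not part of the original script): lomb_scargle_BIC below
# returns a Delta(BIC) comparing the harmonic model to a constant-flux
# model. Generically BIC = -2 ln L_max + k ln N, so for Gaussian errors
#     Delta(BIC) = chi2(constant) - chi2(model) - k ln N,
# with k the number of free parameters the periodic model adds beyond
# the constant model. A numpy-only sketch under those assumptions
# (names are illustrative):
def delta_BIC_sketch(y, dy, y_model, n_extra_params):
    w = dy ** -2
    chi2_const = np.sum(w * (y - np.average(y, weights=w)) ** 2)
    chi2_model = np.sum(((y - y_model) / dy) ** 2)
    return chi2_const - chi2_model - n_extra_params * np.log(len(y))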
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the data data = fetch_LINEAR_sample() t, y, dy = data[14752041].T omega0 = 17.217 # focus only on the region with the peak omega1 = np.linspace(17.213, 17.220, 100) omega2 = 0.5 * omega1 #------------------------------------------------------------ # Compute the delta BIC terms = np.arange(1, 21) BIC_max = np.zeros((2, len(terms))) for i, omega in enumerate([omega1, omega2]): for j in range(len(terms)): P = multiterm_periodogram(t, y, dy, omega, terms[j]) BIC = lomb_scargle_BIC(P, y, dy, n_harmonics=terms[j]) BIC_max[i, j] = BIC.max() #---------------------------------------------------------------------- # Plot the results fig = plt.figure(figsize=(5, 3.75)) ax = [fig.add_axes((0.15, 0.53, 0.8, 0.37)), fig.add_axes((0.15, 0.1, 0.8, 0.37))] ax_inset = [fig.add_axes((0.15 + 7 * 0.04, 0.55, 0.79 - 7 * 0.04, 0.17)), fig.add_axes((0.15 + 7 * 0.04, 0.12, 0.79 - 7 * 0.04, 0.17))] ylims = [(22750, 22850), (26675, 26775)] omega0 = [17.22, 8.61] for i in range(2): # Plot full panel ax[i].plot(terms, BIC_max[i], '-k') ax[i].set_xlim(0, 20) ax[i].set_ylim(0, 30000) ax[i].text(0.02, 0.95, r"$\omega_0 = %.2f$" % omega0[i], ha='left', va='top', transform=ax[i].transAxes) ax[i].set_ylabel(r'$\Delta BIC$') if i == 1: ax[i].set_xlabel('N frequencies') ax[i].grid(color='gray') # plot inset ax_inset[i].plot(terms, BIC_max[i], '-k') ax_inset[i].xaxis.set_major_locator(plt.MultipleLocator(5)) ax_inset[i].xaxis.set_major_formatter(plt.NullFormatter()) ax_inset[i].yaxis.set_major_locator(plt.MultipleLocator(25)) ax_inset[i].yaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax_inset[i].set_xlim(7, 19.75) ax_inset[i].set_ylim(ylims[i]) ax_inset[i].set_title('zoomed view') ax_inset[i].grid(color='gray') plt.show() astroML-0.3/book_figures/chapter10/fig_LINEAR_clustering.py0000644000076500000240000002165512420767763024410 0ustar jakevdpstaff00000000000000""" Clustering of LINEAR data ------------------------- Figure 10.20 ~~~~~~~~~~~~ Unsupervised clustering analysis of periodic variable stars from the LINEAR data set. The top row shows clusters derived using two attributes (g - i and log P) and a mixture of 12 Gaussians. The colorized symbols mark the five most significant clusters. The bottom row shows analogous diagrams for clustering based on seven attributes (colors u - g, g - i, i - K, and J - K; log P, light-curve amplitude, and light-curve skewness), and a mixture of 15 Gaussians. See figure 10.21 for data projections in the space of other attributes for the latter case. Figure 10.21 ~~~~~~~~~~~~ Unsupervised clustering analysis of periodic variable stars from the LINEAR data set. Clusters are derived using seven attributes (colors u - g, g - i, i - K, and J - K; log P , light-curve amplitude, and light-curve skewness), and a mixture of 15 Gaussians. The log P vs. g - i diagram and log P vs. light-curve amplitude diagram for the same clusters are shown in the lower panels of figure 10.20. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from sklearn.mixture import GMM from astroML.decorators import pickle_results from astroML.datasets import fetch_LINEAR_geneva #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get the Geneva periods data data = fetch_LINEAR_geneva() #---------------------------------------------------------------------- # compute Gaussian Mixture models filetemplate = 'gmm_res_%i_%i.pkl' attributes = [('gi', 'logP'), ('ug', 'gi', 'iK', 'JK', 'logP', 'amp', 'skew')] components = np.arange(1, 21) #------------------------------------------------------------ # Create attribute arrays Xarrays = [] for attr in attributes: Xarrays.append(np.vstack([data[a] for a in attr]).T) #------------------------------------------------------------ # Compute the results (and save to pickle file) @pickle_results('LINEAR_clustering.pkl') def compute_GMM_results(components, attributes): clfs = [] for attr, X in zip(attributes, Xarrays): clfs_i = [] for comp in components: print(" - {0} component fit".format(comp)) clf = GMM(comp, covariance_type='full', random_state=0, n_iter=500) clf.fit(X) clfs_i.append(clf) if not clf.converged_: print(" NOT CONVERGED!") clfs.append(clfs_i) return clfs clfs = compute_GMM_results(components, attributes) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(hspace=0.1, wspace=0.1) class_labels = [] for i in range(2): # Grab the best classifier, based on the BIC X = Xarrays[i] BIC = [c.bic(X) for c in clfs[i]] i_best = np.argmin(BIC) print("number of components:", components[i_best]) clf = clfs[i][i_best] n_components = clf.n_components # Predict the cluster labels with the classifier c = clf.predict(X) classes = np.unique(c) class_labels.append(c) # sort the cluster by normalized density of points counts = np.sum(c == classes[:, None], 1) size = np.array([np.linalg.det(C) for C in clf.covars_]) weights = clf.weights_ density = counts / size # Clusters with very few points are less interesting: # set their density to zero so they'll go to the end of the list density[counts < 5] = 0 isort = np.argsort(density)[::-1] # find statistics of the top 10 clusters Nclusters = 6 means = [] stdevs = [] counts = [] names = [name for name in data.dtype.names[2:] if name != 'LINEARobjectID'] labels = ['$u-g$', '$g-i$', '$i-K$', '$J-K$', r'$\log(P)$', 'amplitude', 'skew', 'kurtosis', 'median mag', r'$N_{\rm obs}$', 'Visual Class'] assert len(names) == len(labels) i_logP = names.index('logP') for j in range(Nclusters): flag = (c == isort[j]) counts.append(np.sum(flag)) means.append([np.mean(data[n][flag]) for n in names]) stdevs.append([data[n][flag].std() for n in 
names])

    counts = np.array(counts)
    means = np.array(means)
    stdevs = np.array(stdevs)

    # define colors based on median of logP
    j_ordered = np.argsort(-means[:, i_logP])

    # tweak colors by hand
    if i == 1:
        j_ordered[3], j_ordered[2] = j_ordered[2], j_ordered[3]

    color = np.zeros(c.shape)
    for j in range(Nclusters):
        flag = (c == isort[j_ordered[j]])
        color[flag] = j + 1

    # separate into foreground and background
    back = (color == 0)
    fore = ~back

    # Plot the resulting clusters
    ax1 = fig.add_subplot(221 + 2 * i)
    ax1.scatter(data['gi'][back], data['logP'][back],
                c='gray', edgecolors='none', s=4, linewidths=0)
    ax1.scatter(data['gi'][fore], data['logP'][fore],
                c=color[fore], edgecolors='none', s=4, linewidths=0)
    ax1.set_ylabel(r'$\log(P)$')

    ax2 = plt.subplot(222 + 2 * i)
    ax2.scatter(data['amp'][back], data['logP'][back],
                c='gray', edgecolors='none', s=4, linewidths=0)
    ax2.scatter(data['amp'][fore], data['logP'][fore],
                c=color[fore], edgecolors='none', s=4, linewidths=0)

    #------------------------------
    # set axis limits
    ax1.set_xlim(-0.6, 2.1)
    ax2.set_xlim(0.1, 1.5)
    ax1.set_ylim(-1.5, 0.5)
    ax2.set_ylim(-1.5, 0.5)

    ax2.yaxis.set_major_formatter(plt.NullFormatter())
    if i == 0:
        ax1.xaxis.set_major_formatter(plt.NullFormatter())
        ax2.xaxis.set_major_formatter(plt.NullFormatter())
    else:
        ax1.set_xlabel(r'$g-i$')
        ax2.set_xlabel(r'$A$')

    #------------------------------
    # print a table of means and standard deviations directly in
    # LaTeX format
    print(r"\begin{tabular}{|l|lllllll|}")
    print(r" \hline")
    for j in range(7):
        print(' &', labels[j], end=" ")
    print(r"\\")
    print(r" \hline")
    for j in range(Nclusters):
        print(" {0} ".format(j + 1), end=' ')
        for k in range(7):
            print(" & $%.2f \pm %.2f$ " % (means[j, k], stdevs[j, k]),
                  end=' ')
        print(r"\\")
    print(r"\hline")
    print(r"\end{tabular}")

#------------------------------------------------------------
# Second figure
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(left=0.11, right=0.95, wspace=0.3)

attrs = ['skew', 'ug', 'iK', 'JK']
labels = ['skew', '$u-g$', '$i-K$', '$J-K$']
ylims = [(-1.8, 2.2), (0.6, 2.9), (0.1, 2.6), (-0.2, 1.2)]

for i in range(4):
    ax = fig.add_subplot(221 + i)
    ax.scatter(data['gi'][back], data[attrs[i]][back],
               c='gray', edgecolors='none', s=4, linewidths=0)
    ax.scatter(data['gi'][fore], data[attrs[i]][fore],
               c=color[fore], edgecolors='none', s=4, linewidths=0)
    ax.set_xlabel('$g-i$')
    ax.set_ylabel(labels[i])

    ax.set_xlim(-0.6, 2.1)
    ax.set_ylim(ylims[i])

#------------------------------------------------------------
# Save the results
#
#   run the script as
#
#   >$ python fig_LINEAR_clustering.py --save
#
#   to output the data file showing the cluster labels of each point
import sys

if len(sys.argv) > 1 and sys.argv[1] == '--save':
    filename = 'cluster_labels.dat'
    print("Saving cluster labels to", filename)

    from astroML.datasets.LINEAR_sample import ARCHIVE_DTYPE
    new_data = np.zeros(len(data),
                        dtype=(ARCHIVE_DTYPE + [('2D_cluster_ID', 'i4'),
                                                ('7D_cluster_ID', 'i4')]))

    for name in data.dtype.names:
        new_data[name] = data[name]
    new_data['2D_cluster_ID'] = class_labels[0]
    new_data['7D_cluster_ID'] = class_labels[1]

    fmt = ('%.6f %.6f %.3f %.3f %.3f %.3f %.7f %.3f %.3f '
           '%.3f %.2f %i %i %s %i %i\n')

    F = open(filename, 'w')
    F.write('# ra dec ug gi iK JK '
            'logP Ampl skew kurt magMed nObs LCtype '
            'LINEARobjectID 2D_cluster_ID 7D_cluster_ID\n')
    for line in new_data:
        F.write(fmt % tuple(line[col] for col in line.dtype.names))
    F.close()

plt.show()
astroML-0.3/book_figures/chapter10/fig_LINEAR_GMMBayes.py0000644000076500000240000001416512420767763023623 0ustar
jakevdpstaff00000000000000""" GMMBayes classification of LINEAR data -------------------------------------- Figure 10.22 Supervised classification of periodic variable stars from the LINEAR data set using a Gaussian mixture model Bayes classifier. The training sample includes five input classes. The top row shows clusters derived using two attributes (g - i and log P) and the bottom row shows analogous diagrams for classification based on seven attributes (colors u - g, g - i, i - K, and J - K; log P, light-curve amplitude, and light-curve skewness). See table 10.2 for the classification performance. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from astroML.classification import GMMBayes from sklearn.cross_validation import train_test_split from astroML.decorators import pickle_results from astroML.datasets import fetch_LINEAR_geneva #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) data = fetch_LINEAR_geneva() attributes = [('gi', 'logP'), ('gi', 'logP', 'ug', 'iK', 'JK', 'amp', 'skew')] labels = ['$u-g$', '$g-i$', '$i-K$', '$J-K$', r'$\log(P)$', 'amplitude', 'skew'] cls = 'LCtype' Ntrain = 3000 #------------------------------------------------------------ # Create attribute arrays X = [] y = [] for attr in attributes: X.append(np.vstack([data[a] for a in attr]).T) LCtype = data[cls].copy() # there is no #3. 
For a better color scheme in plots,
    # we'll set 6 -> 3
    LCtype[LCtype == 6] = 3
    y.append(LCtype)


#@pickle_results('LINEAR_GMMBayes.pkl')
def compute_GMMBayes_results(i_train, i_test, n_components=5):
    # (renamed from compute_SVM_results, a copy-paste leftover from the
    # companion SVM script: this function fits GMMBayes classifiers)
    classifiers = []
    predictions = []

    for i in range(len(attributes)):
        Xtrain = X[i][i_train]
        Xtest = X[i][i_test]
        ytrain = y[i][i_train]
        ytest = y[i][i_test]

        clf = GMMBayes(n_components, min_covar=1E-5,
                       covariance_type='full', random_state=0)
        clf.fit(Xtrain, ytrain)
        y_pred = clf.predict(Xtest)

        classifiers.append(clf)
        predictions.append(y_pred)

    return classifiers, predictions

i = np.arange(len(data))
i_train, i_test = train_test_split(i, random_state=0, train_size=2000)

clfs, ypred = compute_GMMBayes_results(i_train, i_test)

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

class_labels = []

for i in range(2):
    Xtest = X[i][i_test]
    ytest = y[i][i_test]
    amp = data['amp'][i_test]

    # Plot the resulting classifications
    ax1 = fig.add_subplot(221 + 2 * i)
    ax1.scatter(Xtest[:, 0], Xtest[:, 1],
                c=ypred[i], edgecolors='none', s=4, linewidths=0)
    ax1.set_ylabel(r'$\log(P)$')

    ax2 = plt.subplot(222 + 2 * i)
    ax2.scatter(amp, Xtest[:, 1],
                c=ypred[i], edgecolors='none', s=4, lw=0)

    #------------------------------
    # set axis limits
    ax1.set_xlim(-0.6, 2.1)
    ax2.set_xlim(0.1, 1.5)
    ax1.set_ylim(-1.5, 0.5)
    ax2.set_ylim(-1.5, 0.5)

    ax2.yaxis.set_major_formatter(plt.NullFormatter())
    if i == 0:
        ax1.xaxis.set_major_formatter(plt.NullFormatter())
        ax2.xaxis.set_major_formatter(plt.NullFormatter())
    else:
        ax1.set_xlabel(r'$g-i$')
        ax2.set_xlabel(r'$A$')

#------------------------------------------------------------
# Second figure
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(left=0.11, right=0.95, wspace=0.3)

attrs = ['skew', 'ug', 'iK', 'JK']
labels = ['skew', '$u-g$', '$i-K$', '$J-K$']
ylims = [(-1.8, 2.2), (0.6, 2.9), (0.1, 2.6), (-0.2, 1.2)]

for i in range(4):
    ax = fig.add_subplot(221 + i)
    ax.scatter(data['gi'][i_test], data[attrs[i]][i_test],
               c=ypred[1], edgecolors='none', s=4, lw=0)
    ax.set_xlabel('$g-i$')
    ax.set_ylabel(labels[i])

    ax.set_xlim(-0.6, 2.1)
    ax.set_ylim(ylims[i])

#------------------------------------------------------------
# Save the results
#
#   run the script as
#
#   >$ python fig_LINEAR_GMMBayes.py --save
#
#   to output the data file showing the cluster labels of each point
import sys

if len(sys.argv) > 1 and sys.argv[1] == '--save':
    filename = 'cluster_labels_gmm.dat'
    print("Saving cluster labels to", filename)

    from astroML.datasets.LINEAR_sample import ARCHIVE_DTYPE
    new_data = np.zeros(len(data),
                        dtype=(ARCHIVE_DTYPE + [('2D_cluster_ID', 'i4'),
                                                ('7D_cluster_ID', 'i4')]))

    # switch the labels back 3->6
    for i in range(2):
        ypred[i][ypred[i] == 3] = 6

    # need to put labels back in order
    class_labels = [-999 * np.ones(len(data)) for i in range(2)]
    for i in range(2):
        class_labels[i][i_test] = ypred[i]

    for name in data.dtype.names:
        new_data[name] = data[name]
    new_data['2D_cluster_ID'] = class_labels[0]
    new_data['7D_cluster_ID'] = class_labels[1]

    fmt = ('%.6f %.6f %.3f %.3f %.3f %.3f %.7f %.3f %.3f '
           '%.3f %.2f %i %i %s %i %i\n')

    F = open(filename, 'w')
    F.write('# ra dec ug gi iK JK '
            'logP Ampl skew kurt magMed nObs LCtype '
            'LINEARobjectID 2D_cluster_ID 7D_cluster_ID\n')
    for line in new_data:
        F.write(fmt % tuple(line[col] for col in line.dtype.names))
    F.close()

plt.show()
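#------------------------------------------------------------
# Added aside (not part of the original script): the classification
# performance summarized in table 10.2 can be derived from the test-set
# predictions above via a confusion matrix; per-class completeness and
# contamination follow from its rows and columns. A sketch, assuming
# the script was run without --save (so ypred is unmodified):
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y[1][i_test], ypred[1])
completeness = cm.diagonal() / cm.sum(axis=1).astype(float)
contamination = 1. - (cm.diagonal()
                      / np.maximum(cm.sum(axis=0), 1).astype(float))
print("completeness by class:", completeness)
print("contamination by class:", contamination)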
astroML-0.3/book_figures/chapter10/fig_LINEAR_LS.py0000644000076500000240000000721712420767763022545 0ustar jakevdpstaff00000000000000""" Phased LINEAR Light Curve ------------------------- Figure 10.17 Phased light curves for six of the periodic objects from the LINEAR data set. The lines show the best fit to the phased light curve using the first four terms of the Fourier expansion (eq. 10.68), with the omega_0 selected using the Lomb-Scargle periodogram. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from astroML.decorators import pickle_results from astroML.time_series import search_frequencies, lomb_scargle, MultiTermFit from astroML.datasets import fetch_LINEAR_sample #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Load the dataset data = fetch_LINEAR_sample() ids = [14752041, 1009459, 10022663, 10025796, 11375941, 18525697] #------------------------------------------------------------ # Compute the best frequencies @pickle_results('LINEAR_LS.pkl') def compute_best_frequencies(ids, n_eval=10000, n_retry=5, generalized=True): results = {} for i in ids: t, y, dy = data[i].T print(" - computing power for {0} ({1} points)".format(i, len(t))) kwargs = dict(generalized=generalized) omega, power = search_frequencies(t, y, dy, n_eval=n_eval, n_retry=n_retry, LS_kwargs=kwargs) results[i] = [omega, power] return results results = compute_best_frequencies(ids, n_eval=10000, n_retry=5) #------------------------------------------------------------ # Plot the phased light-curves fig = plt.figure(figsize=(5, 6.5)) fig.subplots_adjust(hspace=0.1, bottom=0.06, top=0.94, left=0.12, right=0.94) for i in range(6): # get the data and best-fit angular frequency t, y, dy = data[ids[i]].T omega, power = results[ids[i]] omega_best = omega[np.argmax(power)] print(" - omega_0 = %.10g" % omega_best) # do a fit to the first 4 Fourier components mtf = MultiTermFit(omega_best, 4) mtf.fit(t, y, dy) phase_fit, y_fit, phased_t = mtf.predict(1000, return_phased_times=True) # plot the phased data and best-fit curves ax = fig.add_subplot(321 + i) ax.errorbar(phased_t, y, dy, fmt='.k', ecolor='gray', lw=1, ms=4, capsize=1.5) ax.plot(phase_fit, y_fit, '-b', lw=2) ax.set_xlim(0, 1) ax.set_ylim(plt.ylim()[::-1]) ax.yaxis.set_major_locator(plt.MaxNLocator(4)) ax.text(0.03, 0.04, "ID = %i" % ids[i], ha='left', va='bottom', transform=ax.transAxes) ax.text(0.03, 0.96, "P = %.2f hr" % (2 * np.pi / omega_best * 24.), ha='left', va='top', transform=ax.transAxes) ylim = ax.get_ylim() ax.set_ylim(ylim[0], ylim[0] + 1.1 * (ylim[1] - ylim[0])) if i < 4: ax.xaxis.set_major_formatter(plt.NullFormatter()) if i % 2 == 0: ax.set_ylabel('mag') if i in (4, 5): ax.set_xlabel('phase') plt.show() 
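#------------------------------------------------------------
# Editor's aside: MultiTermFit (used above) solves a linear
# least-squares problem for the truncated Fourier series of eq. 10.68,
#     y(t) = c0 + sum_{k=1..M} [a_k sin(k w0 t) + b_k cos(k w0 t)].
# A minimal self-contained sketch of that idea on synthetic data; this
# is not the astroML implementation:
import numpy as np

def fourier_design_matrix(t, omega0, M):
    cols = [np.ones_like(t)]
    for k in range(1, M + 1):
        cols.append(np.sin(k * omega0 * t))
        cols.append(np.cos(k * omega0 * t))
    return np.vstack(cols).T

rng = np.random.RandomState(0)
t_demo = 100 * rng.random_sample(50)
w0 = 17.2
y_demo = 10 + np.sin(w0 * t_demo) + 0.3 * np.cos(2 * w0 * t_demo)
y_demo += rng.normal(0, 0.1, t_demo.shape)

A_design = fourier_design_matrix(t_demo, w0, M=4)
coeffs = np.linalg.lstsq(A_design, y_demo)[0]   # best-fit Fourier terms
y_model = np.dot(A_design, coeffs)              # evaluated model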
astroML-0.3/book_figures/chapter10/fig_LINEAR_SVM.py0000644000076500000240000001377712420767763022704 0ustar jakevdpstaff00000000000000""" SVM classification of LINEAR data --------------------------------- Figure 10.23 Supervised classification of periodic variable stars from the LINEAR data set using a support vector machines method. The training sample includes five input classes. The top row shows clusters derived using two attributes (g - i and log P) and the bottom row shows analogous diagrams for classification based on seven attributes (colors u - g, g - i, i - K, and J - K; log P, light-curve amplitude, and light-curve skewness). See table 10.3 for the classification performance. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.svm import SVC from sklearn.cross_validation import train_test_split from astroML.decorators import pickle_results from astroML.datasets import fetch_LINEAR_geneva #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) data = fetch_LINEAR_geneva() attributes = [('gi', 'logP'), ('gi', 'logP', 'ug', 'iK', 'JK', 'amp', 'skew')] labels = ['$u-g$', '$g-i$', '$i-K$', '$J-K$', r'$\log(P)$', 'amplitude', 'skew'] cls = 'LCtype' Ntrain = 3000 #------------------------------------------------------------ # Create attribute arrays X = [] y = [] for attr in attributes: X.append(np.vstack([data[a] for a in attr]).T) LCtype = data[cls].copy() # there is no #3. 
For a better color scheme in plots, # we'll set 6->3 LCtype[LCtype == 6] = 3 y.append(LCtype) #@pickle_results('LINEAR_SVM.pkl') def compute_SVM_results(i_train, i_test): classifiers = [] predictions = [] for i in range(len(attributes)): Xtrain = X[i][i_train] Xtest = X[i][i_test] ytrain = y[i][i_train] clf = SVC(kernel='linear', class_weight=None) clf.fit(Xtrain, ytrain) y_pred = clf.predict(Xtest) classifiers.append(clf) predictions.append(y_pred) return classifiers, predictions i = np.arange(len(data)) i_train, i_test = train_test_split(i, random_state=0, train_size=2000) clfs, ypred = compute_SVM_results(i_train, i_test) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(hspace=0.1, wspace=0.1) class_labels = [] for i in range(2): Xtest = X[i][i_test] ytest = y[i][i_test] amp = data['amp'][i_test] # Plot the resulting classifications ax1 = fig.add_subplot(221 + 2 * i) ax1.scatter(Xtest[:, 0], Xtest[:, 1], c=ypred[i], edgecolors='none', s=4, linewidths=0) ax1.set_ylabel(r'$\log(P)$') ax2 = plt.subplot(222 + 2 * i) ax2.scatter(amp, Xtest[:, 1], c=ypred[i], edgecolors='none', s=4, lw=0) #------------------------------ # set axis limits ax1.set_xlim(-0.6, 2.1) ax2.set_xlim(0.1, 1.5) ax1.set_ylim(-1.5, 0.5) ax2.set_ylim(-1.5, 0.5) ax2.yaxis.set_major_formatter(plt.NullFormatter()) if i == 0: ax1.xaxis.set_major_formatter(plt.NullFormatter()) ax2.xaxis.set_major_formatter(plt.NullFormatter()) else: ax1.set_xlabel(r'$g-i$') ax2.set_xlabel(r'$A$') #------------------------------------------------------------ # Second figure fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.11, right=0.95, wspace=0.3) attrs = ['skew', 'ug', 'iK', 'JK'] labels = ['skew', '$u-g$', '$i-K$', '$J-K$'] ylims = [(-1.8, 2.2), (0.6, 2.9), (0.1, 2.6), (-0.2, 1.2)] for i in range(4): ax = fig.add_subplot(221 + i) ax.scatter(data['gi'][i_test], data[attrs[i]][i_test], c=ypred[1], edgecolors='none', s=4, lw=0) ax.set_xlabel('$g-i$') ax.set_ylabel(labels[i]) ax.set_xlim(-0.6, 2.1) ax.set_ylim(ylims[i]) #------------------------------------------------------------ # Save the results # # run the script as # # >$ python fig_LINEAR_clustering.py --save # # to output the data file showing the cluster labels of each point import sys if len(sys.argv) > 1 and sys.argv[1] == '--save': filename = 'cluster_labels_svm.dat' print("Saving cluster labels to", filename) from astroML.datasets.LINEAR_sample import ARCHIVE_DTYPE new_data = np.zeros(len(data), dtype=(ARCHIVE_DTYPE + [('2D_cluster_ID', 'i4'), ('7D_cluster_ID', 'i4')])) # switch the labels back 3->6 for i in range(2): ypred[i][ypred[i] == 3] = 6 # need to put labels back in order class_labels = [-999 * np.ones(len(data)) for i in range(2)] for i in range(2): class_labels[i][i_test] = ypred[i] for name in data.dtype.names: new_data[name] = data[name] new_data['2D_cluster_ID'] = class_labels[0] new_data['7D_cluster_ID'] = class_labels[1] fmt = ('%.6f %.6f %.3f %.3f %.3f %.3f %.7f %.3f %.3f ' '%.3f %.2f %i %i %s %i %i\n') F = open(filename, 'w') F.write('# ra dec ug gi iK JK ' 'logP Ampl skew kurt magMed nObs LCtype ' 'LINEARobjectID 2D_cluster_ID 7D_cluster_ID\n') for line in new_data: F.write(fmt % tuple(line[col] for col in line.dtype.names)) F.close() plt.show() astroML-0.3/book_figures/chapter10/fig_LS_comparison.py0000644000076500000240000000412012420767763023733 0ustar jakevdpstaff00000000000000""" Comparison of
Lomb-Scargle Methods ---------------------------------- This shows a comparison of the Lomb-Scargle periodogram and the generalized Lomb-Scargle periodogram for a single star, along with the multi-term results. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from astroML.time_series import\ lomb_scargle, search_frequencies, multiterm_periodogram from astroML.datasets import fetch_LINEAR_sample #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #id, omega0 = 11375941, 58.4 id, omega0 = 18525697, 17.05 # omega0 is an angular frequency, not a period data = fetch_LINEAR_sample() t, y, dy = data[id].T omega = np.linspace(omega0, omega0 + 0.1, 1000) fig = plt.figure(figsize=(5, 3.75)) ax = plt.subplot(211) for n_terms in [1, 2, 3]: P1 = multiterm_periodogram(t, y, dy, omega, n_terms=n_terms) plt.plot(omega, P1, lw=1, label='m = %i' % n_terms) plt.legend(loc=2) plt.xlim(omega0, omega0 + 0.1) plt.ylim(0, 1.0) plt.ylabel(r'$1 - \chi^2(\omega) / \chi^2_{ref}$') plt.subplot(212, sharex=ax) for generalized in [True, False]: if generalized: label = 'generalized LS' else: label = 'standard LS' P2 = lomb_scargle(t, y, dy, omega, generalized=generalized) plt.plot(omega, P2, lw=1, label=label) plt.legend(loc=2) plt.xlim(omega0, omega0 + 0.1) plt.ylim(0, 1.0) plt.xlabel(r'frequency $\omega$') plt.ylabel(r'$P_{LS}(\omega)$') plt.show() astroML-0.3/book_figures/chapter10/fig_LS_double_eclipse.py0000644000076500000240000001126512252721253024532 0ustar jakevdpstaff00000000000000""" Lomb-Scargle Aliasing --------------------- Figure 10.18 Analysis of a light curve where the standard Lomb-Scargle periodogram fails to find the correct period (the same star as in the top-left panel in figure 10.17). The two top panels show the periodograms (left) and phased light curves (right) for the truncated Fourier series model with M = 1 and M = 6 terms. Phased light curves are computed using the incorrect aliased period favored by the M = 1 model. The correct period is favored by the M = 6 model but unrecognized by the M = 1 model (bottom-left panel). The phased light curve constructed with the correct period is shown in the bottom-right panel. This case demonstrates that the Lomb-Scargle periodogram may easily fail when the signal shape significantly differs from a single sinusoid.
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import multiterm_periodogram, MultiTermFit from astroML.datasets import fetch_LINEAR_sample #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get data data = fetch_LINEAR_sample() t, y, dy = data[14752041].T #------------------------------------------------------------ # Do a single-term and multi-term fit around the peak omega0 = 17.217 nterms_fit = 6 # hack to get better phases: this doesn't change results, # except for how the phase plots are displayed t -= 0.4 * np.pi / omega0 width = 0.03 omega = np.linspace(omega0 - width - 0.01, omega0 + width - 0.01, 1000) #------------------------------------------------------------ # Compute periodograms and best-fit solutions # factor gives the factor that we're dividing the fundamental frequency by factors = [1, 2] nterms = [1, 6] # Compute PSDs for factors & nterms PSDs = dict() for f in factors: for n in nterms: PSDs[(f, n)] = multiterm_periodogram(t, y, dy, omega / f, n) # Compute the best-fit omega from the 6-term fit omega_best = dict() for f in factors: omegaf = omega / f PSDf = PSDs[(f, 6)] omega_best[f] = omegaf[np.argmax(PSDf)] # Compute the best-fit solution based on the fundamental frequency best_fit = dict() for f in factors: for n in nterms: mtf = MultiTermFit(omega_best[f], n) mtf.fit(t, y, dy) phase_best, y_best = mtf.predict(1000, adjust_offset=False) best_fit[(f, n)] = (phase_best, y_best) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.25, bottom=0.12, top=0.95, hspace=0.2) for i, f in enumerate(factors): P_best = 2 * np.pi / omega_best[f] phase_best = (t / P_best) % 1 # first column: plot the PSD ax1 = fig.add_subplot(221 + 2 * i) ax1.plot(omega / f, PSDs[(f, 6)], '-', c='black', label='6 terms') ax1.plot(omega / f, PSDs[(f, 1)], '-', c='gray', label='1 term') ax1.grid(color='gray') ax1.legend(loc=2) ax1.axis('tight') ax1.set_ylim(-0.05, 1.001) ax1.xaxis.set_major_locator(plt.MultipleLocator(0.01)) ax1.xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f')) # second column: plot the phased data & fit ax2 = fig.add_subplot(222 + 2 * i) ax2.errorbar(phase_best, y, dy, fmt='.k', ms=4, ecolor='gray', lw=1, capsize=1.5) ax2.plot(best_fit[(f, 1)][0], best_fit[(f, 1)][1], '-', c='gray') ax2.plot(best_fit[(f, 6)][0], best_fit[(f, 6)][1], '-', c='black') ax2.text(0.02, 0.02, (r"$\omega_0 = %.2f$" % omega_best[f] + "\n" + r"$P_0 = %.2f\ {\rm hours}$" % (24 * P_best)), ha='left', va='bottom', transform=ax2.transAxes) ax2.grid(color='gray') ax2.set_xlim(0, 1) ax2.set_ylim(plt.ylim()[::-1]) ax2.yaxis.set_major_locator(plt.MultipleLocator(0.4)) # label both axes 
ax1.set_ylabel(r'$P_{\rm LS}(\omega)$') ax2.set_ylabel(r'${\rm mag}$') if i == 1: ax1.set_xlabel(r'$\omega$') ax2.set_xlabel(r'${\rm phase}$') plt.show() astroML-0.3/book_figures/chapter10/fig_LS_example.py0000644000076500000240000000740612252721253023211 0ustar jakevdpstaff00000000000000""" Example of Lomb-Scargle Algorithm --------------------------------- Figure 10.15 Example of a Lomb-Scargle periodogram. The data include 30 points drawn from the function y(t|P) = 10 + sin(2pi t/P) with P = 0.3. Heteroscedastic Gaussian noise is added to the observations, with a width drawn from a uniform distribution with 0.5 < sigma < 1.0. Data are shown in the top panel and the resulting Lomb-Scargle periodogram is shown in the bottom panel. The arrow marks the location of the true period. The dotted lines show the 1% and 5% significance levels for the highest peak, determined by 1000 bootstrap resamplings (see Section 10.3.2). The change in BIC compared to a nonvarying source (eq. 10.55) is shown on the right y-axis. The maximum power corresponds to a delta-BIC = 26.1, indicating the presence of a periodic signal. Bootstrapping indicates the period is detected at ~ 5% significance. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import\ lomb_scargle, lomb_scargle_BIC, lomb_scargle_bootstrap #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
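# Editor's aside: lomb_scargle_bootstrap (used below) estimates the
# significance of the highest peak by resampling the observed fluxes
# with replacement while keeping the observation times fixed, and
# recording the highest peak of each resampled periodogram. A minimal
# sketch of the idea -- not the astroML implementation; `periodogram`
# stands for any callable with the lomb_scargle call signature:
def max_peak_bootstrap(t, y, dy, omega, periodogram, n_boot=1000, seed=0):
    rng = np.random.RandomState(seed)
    peaks = np.empty(n_boot)
    for i in range(n_boot):
        ind = rng.randint(0, len(y), len(y))  # resample y; t stays fixed
        peaks[i] = periodogram(t, y[ind], dy[ind], omega).max()
    return peaks  # e.g. np.percentile(peaks, [95, 99]) -> 5%/1% levels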
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate Data np.random.seed(0) N = 30 P = 0.3 t = np.random.randint(100, size=N) + 0.3 + 0.4 * np.random.random(N) y = 10 + np.sin(2 * np.pi * t / P) dy = 0.5 + 0.5 * np.random.random(N) y_obs = np.random.normal(y, dy) #------------------------------------------------------------ # Compute periodogram period = 10 ** np.linspace(-1, 0, 10000) omega = 2 * np.pi / period PS = lomb_scargle(t, y_obs, dy, omega, generalized=True) #------------------------------------------------------------ # Get significance via bootstrap D = lomb_scargle_bootstrap(t, y_obs, dy, omega, generalized=True, N_bootstraps=1000, random_state=0) sig1, sig5 = np.percentile(D, [99, 95]) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(left=0.1, right=0.9, hspace=0.25) # First panel: the data ax = fig.add_subplot(211) ax.errorbar(t, y_obs, dy, fmt='.k', lw=1, ecolor='gray') ax.set_xlabel('time (days)') ax.set_ylabel('flux') ax.set_xlim(-5, 105) # Second panel: the periodogram & significance levels ax1 = fig.add_subplot(212, xscale='log') ax1.plot(period, PS, '-', c='black', lw=1, zorder=1) ax1.plot([period[0], period[-1]], [sig1, sig1], ':', c='black') ax1.plot([period[0], period[-1]], [sig5, sig5], ':', c='black') ax1.annotate("", (0.3, 0.65), (0.3, 0.85), ha='center', arrowprops=dict(arrowstyle='->')) ax1.set_xlim(period[0], period[-1]) ax1.set_ylim(-0.05, 0.85) ax1.set_xlabel(r'period (days)') ax1.set_ylabel('power') # Twin axis: label BIC on the right side ax2 = ax1.twinx() ax2.set_ylim(tuple(lomb_scargle_BIC(ax1.get_ylim(), y_obs, dy))) ax2.set_ylabel(r'$\Delta BIC$') ax1.xaxis.set_major_formatter(plt.FormatStrFormatter('%.1f')) ax1.xaxis.set_minor_formatter(plt.FormatStrFormatter('%.1f')) ax1.xaxis.set_major_locator(plt.LogLocator(10)) ax1.xaxis.set_major_formatter(plt.FormatStrFormatter('%.3g')) plt.show() astroML-0.3/book_figures/chapter10/fig_LS_sg_comparison.py0000644000076500000240000001560112252721253024415 0ustar jakevdpstaff00000000000000""" Generalized vs Standard Lomb-Scargle ------------------------------------ Figure 10.16 A comparison of the standard and generalized Lomb-Scargle periodograms for a signal y(t) = 10 + sin(2pi t/P) with P = 0.3, corresponding to omega_0 ~ 21. This example is, in some sense, a worst-case scenario for the standard Lomb-Scargle algorithm because there are no sampled points during the times when ytrue < 10, which leads to a gross overestimation of the mean. The bottom panel shows the Lomb-Scargle and generalized Lomb-Scargle periodograms for these data; the generalized method recovers the expected peak as the highest peak, while the standard method incorrectly chooses the peak at omega ~ 17.6 (because it is higher than the true peak at omega_0 ~ 21). The dotted lines show the 1% and 5% significance levels for the highest peak in the generalized periodogram, determined by 1000 bootstrap resamplings (see Section 10.3.2). Note: This Plot Contains an Error ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ After the book was in press, a reader pointed out that this plot contains a typo. Instead of passing the noisy data to the Lomb-Scargle routine, we had passed the underlying, non-noisy data. This caused an over-estimate of the Lomb-Scargle power. Because of this, we add two extra plots to this script: the first reproduces the current plot without the typo. 
In it, we see that for the noisy data, the period is not detected when there are just ~30 points within ten periods. In the second additional plot, we increase the baseline and the number of points by a factor of ten. With this configuration, the peak is detected, and the qualitative aspects of the above discussion hold true. We regret the error! """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import \ lomb_scargle, lomb_scargle_BIC, lomb_scargle_bootstrap #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate data where y is positive np.random.seed(0) N = 30 P = 0.3 t = P / 2 * np.random.random(N) + P * np.random.randint(100, size=N) y = 10 + np.sin(2 * np.pi * t / P) dy = 0.5 + 0.5 * np.random.random(N) y_obs = y + np.random.normal(0, dy) # heteroscedastic errors of width dy omega_0 = 2 * np.pi / P ####################################################################### # Generate the plot with and without the original typo for typo in [True, False]: #------------------------------------------------------------ # Compute the Lomb-Scargle Periodogram sig = np.array([0.1, 0.01, 0.001]) omega = np.linspace(17, 22, 1000) # Notice the typo: we used y rather than y_obs if typo is True: P_S = lomb_scargle(t, y, dy, omega, generalized=False) P_G = lomb_scargle(t, y, dy, omega, generalized=True) else: P_S = lomb_scargle(t, y_obs, dy, omega, generalized=False) P_G = lomb_scargle(t, y_obs, dy, omega, generalized=True) #------------------------------------------------------------ # Get significance via bootstrap D = lomb_scargle_bootstrap(t, y_obs, dy, omega, generalized=True, N_bootstraps=1000, random_state=0) sig1, sig5 = np.percentile(D, [99, 95]) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) # First panel: input data ax = fig.add_subplot(211) ax.errorbar(t, y_obs, dy, fmt='.k', lw=1, ecolor='gray') ax.plot([-2, 32], [10, 10], ':k', lw=1) ax.set_xlim(-2, 32) ax.set_xlabel('$t$') ax.set_ylabel('$y(t)$') if typo is False: ax.set_title('Corrected version') # Second panel: periodogram ax = fig.add_subplot(212) ax.plot(omega, P_S, '--k', lw=1, label='standard') ax.plot(omega, P_G, '-k', lw=1, label='generalized') ax.legend(loc=2) # plot the significance lines.
xlim = (omega[0], omega[-1]) ax.plot(xlim, [sig1, sig1], ':', c='black') ax.plot(xlim, [sig5, sig5], ':', c='black') # label BIC on the right side ax2 = ax.twinx() ax2.set_ylim(tuple(lomb_scargle_BIC(ax.get_ylim(), y_obs, dy))) ax2.set_ylabel(r'$\Delta BIC$') ax.set_xlabel('$\omega$') ax.set_ylabel(r'$P_{\rm LS}(\omega)$') ax.set_ylim(0, 1.1) ####################################################################### # Redo the plot without the typo # We need a larger data range to actually get significant power # with actual noisy data #------------------------------------------------------------ # Generate data where y is positive np.random.seed(0) N = 300 P = 0.3 t = P / 2 * np.random.random(N) + P * np.random.randint(1000, size=N) y = 10 + np.sin(2 * np.pi * t / P) dy = 0.1 + 0.1 * np.random.random(N) y_obs = y + np.random.normal(0, dy) # heteroscedastic errors of width dy omega_0 = 2 * np.pi / P #------------------------------------------------------------ # Compute the Lomb-Scargle Periodogram sig = np.array([0.1, 0.01, 0.001]) omega = np.linspace(20.5, 21.1, 1000) P_S = lomb_scargle(t, y_obs, dy, omega, generalized=False) P_G = lomb_scargle(t, y_obs, dy, omega, generalized=True) #------------------------------------------------------------ # Get significance via bootstrap D = lomb_scargle_bootstrap(t, y_obs, dy, omega, generalized=True, N_bootstraps=1000, random_state=0) sig1, sig5 = np.percentile(D, [99, 95]) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) # First panel: input data ax = fig.add_subplot(211) ax.errorbar(t, y_obs, dy, fmt='.k', lw=1, ecolor='gray') ax.plot([-20, 320], [10, 10], ':k', lw=1) ax.set_xlim(-20, 320) ax.set_xlabel('$t$') ax.set_ylabel('$y(t)$') # Second panel: periodogram ax = fig.add_subplot(212) ax.plot(omega, P_S, '--k', lw=1, label='standard') ax.plot(omega, P_S, '-', c='gray', lw=1) ax.plot(omega, P_G, '-k', lw=1, label='generalized') ax.legend(loc=2) # plot the significance lines. xlim = (omega[0], omega[-1]) ax.plot(xlim, [sig1, sig1], ':', c='black') ax.plot(xlim, [sig5, sig5], ':', c='black') # label BIC on the right side ax2 = ax.twinx() ax2.set_ylim(tuple(lomb_scargle_BIC(ax.get_ylim(), y_obs, dy))) ax2.set_ylabel(r'$\Delta BIC$') ax.set_xlabel('$\omega$') ax.set_ylabel(r'$P_{\rm LS}(\omega)$') ax.set_xlim(xlim) ax.set_ylim(0, 0.12) plt.show() astroML-0.3/book_figures/chapter10/fig_matchedfilt_burst.py0000644000076500000240000000773012252721253024663 0ustar jakevdpstaff00000000000000""" Matched Filter Burst Search --------------------------- Figure 10.25 A matched filter search for a burst signal in time series data. A simulated data set generated from a model of the form y(t) = b0 for t < T and y = b0 + A exp[-alpha (t - T)] for t > T, with homoscedastic Gaussian errors with sigma = 1, is shown in the top-right panel. The posterior pdf for the four model parameters is determined using MCMC and shown in the other panels.
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt # Hack to fix import issue in older versions of pymc import scipy import scipy.misc scipy.derivative = scipy.misc.derivative import pymc from astroML.plotting.mcmc import plot_mcmc from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # Set up toy dataset def burst(t, b0, A, alpha, T): """Burst model""" y = np.empty(t.shape) y.fill(b0) mask = (t >= T) y[mask] += A * np.exp(-alpha * (t[mask] - T)) return y np.random.seed(0) N = 100 b0_true = 10 A_true = 5 alpha_true = 0.1 T_true = 50 sigma = 1.0 t = 100 * np.random.random(N) y_true = burst(t, b0_true, A_true, alpha_true, T_true) y_obs = np.random.normal(y_true, sigma) #---------------------------------------------------------------------- # Set up MCMC sampling b0 = pymc.Uniform('b0', 0, 50, value=50 * np.random.random()) A = pymc.Uniform('A', 0, 50, value=50 * np.random.random()) T = pymc.Uniform('T', 0, 100, value=100 * np.random.random()) log_alpha = pymc.Uniform('log_alpha', -10, 10, value=0) # uniform prior on log(alpha) @pymc.deterministic def alpha(log_alpha=log_alpha): return np.exp(log_alpha) @pymc.deterministic def y_model(t=t, b0=b0, A=A, alpha=alpha, T=T): return burst(t, b0, A, alpha, T) y = pymc.Normal('y', mu=y_model, tau=sigma ** -2, observed=True, value=y_obs) model = dict(b0=b0, A=A, T=T, log_alpha=log_alpha, alpha=alpha, y_model=y_model, y=y) #---------------------------------------------------------------------- # Run the MCMC sampling @pickle_results('matchedfilt_burst.pkl') def compute_MCMC_results(niter=25000, burn=4000): S = pymc.MCMC(model) S.sample(iter=niter, burn=burn) traces = [S.trace(s)[:] for s in ['b0', 'A', 'T', 'alpha']] M = pymc.MAP(model) M.fit() fit_vals = (M.b0.value, M.A.value, M.alpha.value, M.T.value) return traces, fit_vals traces, fit_vals = compute_MCMC_results() labels = ['$b_0$', '$A$', '$T$', r'$\alpha$'] limits = [(9.2, 11.2), (2, 12), (45, 55), (0.0, 0.25)] true = [b0_true, A_true, T_true, alpha_true] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(bottom=0.1, top=0.95, left=0.1, right=0.95, hspace=0.05, wspace=0.05) # This function plots multiple panels with the traces plot_mcmc(traces, labels=labels, limits=limits, true_values=true, fig=fig, bins=30, colors='k') # Plot the model fit ax = fig.add_axes([0.5, 0.7, 0.45, 0.25]) t_fit = np.linspace(0, 100, 101) y_fit = burst(t_fit, *fit_vals) ax.scatter(t, y_obs, s=9, lw=0, c='k') ax.plot(t_fit, y_fit, '-k') ax.set_xlim(0, 100) ax.set_xlabel('$t$') ax.set_ylabel(r'$h_{\rm obs}$') plt.show() 
astroML-0.3/book_figures/chapter10/fig_matchedfilt_chirp.py0000644000076500000240000001013012420767326024625 0ustar jakevdpstaff00000000000000""" Matched Filter Chirp Search --------------------------- Figure 10.26 A matched filter search for a chirp signal in time series data. A simulated data set generated from a model of the form y = b0+Asin[omega t + beta t^2], with homoscedastic Gaussian errors with sigma = 2, is shown in the top-right panel. The posterior pdf for the four model parameters is determined using MCMC and shown in the other panels. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt # Hack to fix import issue in older versions of pymc import scipy import scipy.misc scipy.derivative = scipy.misc.derivative import pymc from astroML.plotting.mcmc import plot_mcmc from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # Set up toy dataset def chirp(t, b0, beta, A, omega): return b0 + A * np.sin(omega * t + beta * t * t) np.random.seed(0) N = 100 b0_true = 10 A_true = 5 beta_true = 0.01 omega_true = 0.1 sigma = 2.0 t = 100 * np.random.random(N) y_true = chirp(t, b0_true, beta_true, A_true, omega_true) y_obs = np.random.normal(y_true, sigma) t_fit = np.linspace(0, 100, 1000) y_fit = chirp(t_fit, b0_true, beta_true, A_true, omega_true) i = np.argsort(t) #---------------------------------------------------------------------- # Set up MCMC sampling b0 = pymc.Uniform('b0', 0, 50, value=50 * np.random.random()) A = pymc.Uniform('A', 0, 50, value=50 * np.random.random()) log_beta = pymc.Uniform('log_beta', -10, 10, value=-4.6) log_omega = pymc.Uniform('log_omega', -10, 10, value=-2.3) # uniform prior on log(beta) @pymc.deterministic def beta(log_beta=log_beta): return np.exp(log_beta) # uniform prior on log(omega) @pymc.deterministic def omega(log_omega=log_omega): return np.exp(log_omega) @pymc.deterministic def y_model(t=t, b0=b0, A=A, beta=beta, omega=omega): return chirp(t, b0, beta, A, omega) y = pymc.Normal('y', mu=y_model, tau=sigma ** -2, observed=True, value=y_obs) model = dict(b0=b0, A=A, log_beta=log_beta, beta=beta, log_omega=log_omega, omega=omega, y_model=y_model, y=y) #---------------------------------------------------------------------- # Run the MCMC sampling (saving results to a pickle) @pickle_results('matchedfilt_chirp.pkl') def compute_MCMC_results(niter=20000, burn=2000): S = pymc.MCMC(model) S.sample(iter=niter, burn=burn) traces = [S.trace(s)[:] for s in ['b0', 'A', 'omega', 'beta']] M = pymc.MAP(model) M.fit() fit_vals = (M.b0.value, M.beta.value, M.A.value, M.omega.value) return traces, fit_vals traces, fit_vals = compute_MCMC_results() labels = ['$b_0$', '$A$', r'$\omega$', r'$\beta$'] limits = [(9.5, 11.3), (3.6, 6.4), (0.065, 0.115), 
(0.00975, 0.01045)] true = [b0_true, A_true, omega_true, beta_true] #---------------------------------------------------------------------- # Find the Maximum a posteriori values fig = plt.figure(figsize=(5, 5)) ax = plt.axes([0.5, 0.7, 0.45, 0.25]) t_fit = np.linspace(0, 100, 1001) y_fit = chirp(t_fit, *fit_vals) plt.scatter(t, y_obs, s=9, lw=0, c='k') plt.plot(t_fit, y_fit, '-k') plt.xlim(0, 100) plt.xlabel('$t$') plt.ylabel(r'$h_{\rm obs}$') # This function plots multiple panels with the traces plot_mcmc(traces, labels=labels, limits=limits, true_values=true, fig=fig, bins=30, bounds=[0.12, 0.08, 0.95, 0.91], colors='k') plt.show() astroML-0.3/book_figures/chapter10/fig_matchedfilt_chirp2.py0000644000076500000240000001200312252721253024700 0ustar jakevdpstaff00000000000000""" Matched Filter Chirp Search --------------------------- Figure 10.27 A ten-parameter chirp model (see eq. 10.87) fit to a time series. Seven of the parameters can be considered nuisance parameters, and we marginalize over them in the likelihood contours shown here. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt # Hack to fix import issue in older versions of pymc import scipy import scipy.misc scipy.derivative = scipy.misc.derivative import pymc from astroML.decorators import pickle_results from astroML.plotting.mcmc import plot_mcmc #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
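# Editor's aside: "marginalizing over nuisance parameters" with MCMC
# output amounts to ignoring the nuisance columns of the chain: a
# histogram of the retained parameters' samples approximates their
# marginal posterior. A minimal sketch -- not part of the original
# script; `trace` stands for a 1D sample array such as those returned
# by compute_MCMC below:
def marginal_hist(trace, bins=30):
    hist, edges = np.histogram(trace, bins=bins, density=True)
    return hist, 0.5 * (edges[:-1] + edges[1:])  # density, bin centers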
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # Set up toy dataset def chirp(t, T, A, phi, omega, beta): """chirp signal""" signal = A * np.sin(phi + omega * (t - T) + beta * (t - T) ** 2) signal[t < T] = 0 return signal def background(t, b0, b1, Omega1, Omega2): """background signal""" return b0 + b1 * np.sin(Omega1 * t) * np.sin(Omega2 * t) np.random.seed(0) N = 500 T_true = 30 A_true = 0.8 phi_true = np.pi / 2 omega_true = 0.1 beta_true = 0.02 b0_true = 0.5 b1_true = 1.0 Omega1_true = 0.3 Omega2_true = 0.4 sigma = 0.1 t = 100 * np.random.random(N) signal = chirp(t, T_true, A_true, phi_true, omega_true, beta_true) bg = background(t, b0_true, b1_true, Omega1_true, Omega2_true) y_true = signal + bg y_obs = np.random.normal(y_true, sigma) t_fit = np.linspace(0, 100, 1000) y_fit = (chirp(t_fit, T_true, A_true, phi_true, omega_true, beta_true) + background(t_fit, b0_true, b1_true, Omega1_true, Omega2_true)) #---------------------------------------------------------------------- # Set up MCMC sampling T = pymc.Uniform('T', 0, 100, value=T_true) A = pymc.Uniform('A', 0, 100, value=A_true) phi = pymc.Uniform('phi', -np.pi, np.pi, value=phi_true) log_omega = pymc.Uniform('log_omega', -4, 0, value=np.log(omega_true)) log_beta = pymc.Uniform('log_beta', -6, 0, value=np.log(beta_true)) b0 = pymc.Uniform('b0', 0, 100, value=b0_true) b1 = pymc.Uniform('b1', 0, 100, value=b1_true) log_Omega1 = pymc.Uniform('log_Omega1', -3, 0, value=np.log(Omega1_true)) log_Omega2 = pymc.Uniform('log_Omega2', -3, 0, value=np.log(Omega2_true)) omega = pymc.Uniform('omega', 0.001, 1, value=omega_true) beta = pymc.Uniform('beta', 0.001, 1, value=beta_true) # note: log_omega and log_beta above are not tied to omega and beta; # they are sampled but unused, and do not affect the fit # uniform prior on log(Omega1) @pymc.deterministic def Omega1(log_Omega1=log_Omega1): return np.exp(log_Omega1) # uniform prior on log(Omega2) @pymc.deterministic def Omega2(log_Omega2=log_Omega2): return np.exp(log_Omega2) @pymc.deterministic def y_model(t=t, T=T, A=A, phi=phi, omega=omega, beta=beta, b0=b0, b1=b1, Omega1=Omega1, Omega2=Omega2): return (chirp(t, T, A, phi, omega, beta) + background(t, b0, b1, Omega1, Omega2)) y = pymc.Normal('y', mu=y_model, tau=sigma ** -2, observed=True, value=y_obs) model = dict(T=T, A=A, phi=phi, b0=b0, b1=b1, log_omega=log_omega, omega=omega, log_beta=log_beta, beta=beta, log_Omega1=log_Omega1, Omega1=Omega1, log_Omega2=log_Omega2, Omega2=Omega2, y_model=y_model, y=y) #---------------------------------------------------------------------- # Run the MCMC sampling (saving the results to a pickle file) @pickle_results('matchedfilt_chirp2.pkl') def compute_MCMC(niter=30000, burn=2000): S = pymc.MCMC(model) S.sample(iter=niter, burn=burn) traces = [S.trace(s)[:] for s in ['T', 'A', 'omega', 'beta']] return traces traces = compute_MCMC() labels = ['$T$', '$A$', r'$\omega$', r'$\beta$'] limits = [(29.75, 30.25), (0.75, 0.83), (0.085, 0.115),
100) ax.set_xlabel('$t$') ax.set_ylabel(r'$h_{\rm obs}$') plt.show() astroML-0.3/book_figures/chapter10/fig_mincomp.py0000644000076500000240000000627312252721253022623 0ustar jakevdpstaff00000000000000""" Example of Minimum Component Filtering -------------------------------------- Figure 10.13 A minimum component filter applied to the spectrum of a white dwarf from the SDSS data set (mjd=52199, plate=659, fiber=381). The upper panel shows a portion of the input spectrum, along with the continuum computed via the minimum component filtering procedure described in Section 10.2.5 (see figure 10.12). The lower panel shows the PSD for both the input spectrum and the filtered result. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.fourier import PSD_continuous from astroML.datasets import fetch_sdss_spectrum from astroML.filters import min_component_filter #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the spectrum from SDSS database & pre-process plate = 659 mjd = 52199 fiber = 381 data = fetch_sdss_spectrum(plate, mjd, fiber) lam = data.wavelength() spec = data.spectrum # wavelengths are logarithmically spaced: we'll work in log(lam) loglam = np.log10(lam) flag = (lam > 4000) & (lam < 5000) lam = lam[flag] loglam = loglam[flag] spec = spec[flag] lam = lam[:-1] loglam = loglam[:-1] spec = spec[:-1] #---------------------------------------------------------------------- # Mask-out significant features and compute filtered version feature_mask = (((lam > 4080) & (lam < 4130)) | ((lam > 4315) & (lam < 4370)) | ((lam > 4830) & (lam < 4900))) spec_filtered = min_component_filter(loglam, spec, feature_mask, fcut=100) #------------------------------------------------------------ # Compute PSD of filtered and unfiltered versions f, spec_filt_PSD = PSD_continuous(loglam, spec_filtered) f, spec_PSD = PSD_continuous(loglam, spec) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.25) # Top panel: plot noisy and smoothed spectrum ax = fig.add_subplot(211) ax.plot(lam, spec, '-', c='gray', lw=1) ax.plot(lam, spec_filtered, '-k') ax.text(0.97, 0.93, "SDSS white dwarf\n %i-%i-%i" % (mjd, plate, fiber), ha='right', va='top', transform=ax.transAxes) ax.set_ylim(25, 110) ax.set_xlabel(r'$\lambda\ {\rm (\AA)}$') ax.set_ylabel('flux') # Bottom panel: plot noisy and smoothed PSD ax = fig.add_subplot(212, yscale='log') ax.plot(f, spec_PSD, '-', c='gray', lw=1) ax.plot(f, spec_filt_PSD, '-k') ax.set_xlabel(r'$f$') ax.set_ylabel('$PSD(f)$') ax.set_xlim(0, 2000) plt.show() astroML-0.3/book_figures/chapter10/fig_mincomp_procedure.py0000644000076500000240000001056212252721253024667 0ustar jakevdpstaff00000000000000""" Minimum component
fitting procedure ----------------------------------- Figure 10.12 The intermediate steps of the minimum component filter procedure applied to the spectrum of a white dwarf from the SDSS data set (mjd=52199, plate=659, fiber=381). The top panel shows the input spectrum; the masked sections of the input spectrum are shown by thin lines (i.e., step 1 of the process in Section 10.2.5). The bottom panel shows the PSD of the masked spectrum, after the linear fit has been subtracted (gray line). A simple low-pass filter (dashed line) is applied, and the resulting filtered spectrum (dark line) is used to construct the result shown in figure 10.13. Minimum component filtering is explained in Wall & Jenkins, as well as Wall 1997, A&A 122:371. The minimum component algorithm is implemented in astroML.filters.min_component_filter """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import fftpack from astroML.fourier import PSD_continuous from astroML.datasets import fetch_sdss_spectrum #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the spectrum from SDSS database & pre-process plate = 659 mjd = 52199 fiber = 381 data = fetch_sdss_spectrum(plate, mjd, fiber) lam = data.wavelength() spec = data.spectrum # wavelengths are logarithmically spaced: we'll work in log(lam) loglam = np.log10(lam) flag = (lam > 4000) & (lam < 5000) lam = lam[flag] loglam = loglam[flag] spec = spec[flag] lam = lam[:-1] loglam = loglam[:-1] spec = spec[:-1] #---------------------------------------------------------------------- # First step: mask-out significant features feature_mask = (((lam > 4080) & (lam < 4130)) | ((lam > 4315) & (lam < 4370)) | ((lam > 4830) & (lam < 4900))) #---------------------------------------------------------------------- # Second step: fit a line to the unmasked portion of the spectrum XX = loglam[:, None] ** np.arange(2) beta = np.linalg.lstsq(XX[~feature_mask], spec[~feature_mask])[0] spec_fit = np.dot(XX, beta) spec_patched = spec - spec_fit spec_patched[feature_mask] = 0 #---------------------------------------------------------------------- # Third step: Fourier transform the patched spectrum N = len(loglam) df = 1.
/ N / (loglam[1] - loglam[0]) f = fftpack.ifftshift(df * (np.arange(N) - N / 2.)) spec_patched_FT = fftpack.fft(spec_patched) #---------------------------------------------------------------------- # Fourth step: Low-pass filter on the transform filt = np.exp(- (0.01 * (abs(f) - 100.)) ** 2) filt[abs(f) < 100] = 1 spec_filt_FT = spec_patched_FT * filt #---------------------------------------------------------------------- # Fifth step: inverse Fourier transform, and add back the fit spec_filt = fftpack.ifft(spec_filt_FT) spec_filt += spec_fit #---------------------------------------------------------------------- # plot results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.25) ax = fig.add_subplot(211) ax.plot(lam, spec, '-', c='gray') ax.plot(lam, spec_patched + spec_fit, '-k') ax.set_ylim(25, 110) ax.set_xlabel(r'$\lambda\ {\rm(\AA)}$') ax.set_ylabel('flux') ax = fig.add_subplot(212) factor = 15 * (loglam[1] - loglam[0]) ax.plot(fftpack.fftshift(f), factor * fftpack.fftshift(abs(spec_patched_FT) ** 1), '-', c='gray', label='masked/shifted spectrum') ax.plot(fftpack.fftshift(f), factor * fftpack.fftshift(abs(spec_filt_FT) ** 1), '-k', label='filtered spectrum') ax.plot(fftpack.fftshift(f), fftpack.fftshift(filt), '--k', label='filter') ax.set_xlim(0, 2000) ax.set_ylim(0, 1.1) ax.set_xlabel('$f$') ax.set_ylabel('scaled $PSD(f)$') plt.show() astroML-0.3/book_figures/chapter10/fig_powerlaw.py0000644000076500000240000000505412252721253023015 0ustar jakevdpstaff00000000000000""" Generating Power-law Light Curves --------------------------------- Figure 10.29 Examples of stochastic time series generated from power-law PSDs (left: 1/ f; right: 1/f^2) using the method from [1]. The top panels show the generated data, while the bottom panels show the corresponding PSD (dashed lines: input PSD; solid lines: determined from time series shown in the top panels). References ~~~~~~~~~~ .. [1] Timmer, J. & Koenig, M. On Generating Power Law Noise. A&A 300:707 """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import generate_power_law from astroML.fourier import PSD_continuous #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
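# Editor's aside: generate_power_law (used below) follows Timmer &
# Koenig (1995): draw Fourier coefficients whose real and imaginary
# parts are Gaussian with variance proportional to the target PSD
# ~ 1/f^beta, then inverse transform. A minimal sketch of the idea --
# not the astroML implementation:
def power_law_noise_sketch(N, dt, beta, rng=np.random):
    f = np.fft.rfftfreq(N, dt)[1:]       # positive frequencies
    amp = f ** (-beta / 2.)              # |FT| ~ sqrt(PSD)
    coeffs = np.concatenate([[0.], amp * (rng.normal(size=f.shape) +
                                          1j * rng.normal(size=f.shape))])
    return np.fft.irfft(coeffs, N)       # real, zero-mean time series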
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) N = 1024 dt = 0.01 factor = 100 t = dt * np.arange(N) random_state = np.random.RandomState(1) fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(wspace=0.05) for i, beta in enumerate([1.0, 2.0]): # Generate the light curve and compute the PSD x = factor * generate_power_law(N, dt, beta, random_state=random_state) f, PSD = PSD_continuous(t, x) # First axes: plot the time series ax1 = fig.add_subplot(221 + i) ax1.plot(t, x, '-k') ax1.text(0.95, 0.05, r"$P(f) \propto f^{-%i}$" % beta, ha='right', va='bottom', transform=ax1.transAxes) ax1.set_xlim(0, 10.24) ax1.set_ylim(-1.5, 1.5) ax1.set_xlabel(r'$t$') # Second axes: plot the PSD ax2 = fig.add_subplot(223 + i, xscale='log', yscale='log') ax2.plot(f, PSD, '-k') ax2.plot(f[1:], (factor * dt) ** 2 * (2 * np.pi * f[1:]) ** -beta, '--k') ax2.set_xlim(1E-1, 60) ax2.set_ylim(1E-6, 1E1) ax2.set_xlabel(r'$f$') if i == 1: ax1.yaxis.set_major_formatter(plt.NullFormatter()) ax2.yaxis.set_major_formatter(plt.NullFormatter()) else: ax1.set_ylabel(r'${\rm counts}$') ax2.set_ylabel(r'$PSD(f)$') plt.show() astroML-0.3/book_figures/chapter10/fig_rrlyrae_reconstruct.py0000644000076500000240000000465012252721253025271 0ustar jakevdpstaff00000000000000""" Fourier Reconstruction of RR-Lyrae Templates -------------------------------------------- Figure 10.1 An example of a truncated Fourier representation of an RR Lyrae light curve. The thick dashed line shows the true curve; the gray lines show the approximation based on 1, 3, and 8 Fourier modes (sinusoids). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_rrlyrae_templates #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
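# Editor's note: in the reconstruction below, zeroing y_fft[k + 1:-k]
# keeps the DC term, the k lowest positive-frequency modes, and their
# complex conjugates at the top of the array; because the FFT of a real
# signal is Hermitian-symmetric, the inverse transform is then (up to
# roundoff) a real k-mode reconstruction.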
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Load the RR Lyrae template templates = fetch_rrlyrae_templates() x, y = templates['115r'].T #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(hspace=0) kvals = [1, 3, 8] subplots = [311, 312, 313] for (k, subplot) in zip(kvals, subplots): ax = fig.add_subplot(subplot) # Use FFT to fit a truncated Fourier series y_fft = np.fft.fft(y) y_fft[k + 1:-k] = 0 y_fit = np.fft.ifft(y_fft).real # plot the true value and the k-term reconstruction ax.plot(np.concatenate([x, 1 + x]), np.concatenate([y, y]), '--k', lw=2) ax.plot(np.concatenate([x, 1 + x]), np.concatenate([y_fit, y_fit]), color='gray') label = "%i mode" % k if k > 1: label += 's' ax.text(0.02, 0.1, label, ha='left', va='bottom', transform=ax.transAxes) if subplot == subplots[-1]: ax.set_xlabel('phase') else: ax.xaxis.set_major_formatter(plt.NullFormatter()) if subplot == subplots[1]: ax.set_ylabel('amplitude') ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.set_xlim(0, 2) ax.set_ylim(1.1, -0.1) plt.show() astroML-0.3/book_figures/chapter10/fig_sampling.py0000644000076500000240000001030312252721253022760 0ustar jakevdpstaff00000000000000""" The effect of Sampling ---------------------- Figure 10.14 An illustration of the impact of measurement errors on the Lomb-Scargle power (cf. figure 10.4). The top-left panel shows a simulated data set with 40 points drawn from the function y(t|P) = sin(t) (i.e., f = 1/(2pi) ~ 0.16) with random sampling. Heteroscedastic Gaussian noise is added to the observations, with a width drawn from a uniform distribution with 0.1 < sigma < 0.2 (this error level is negligible compared to the amplitude of variation). The spectral window function (PSD of sampling times) is shown in the bottom-left panel. The PSD (:math:`P_{LS}`) computed for the data set from the top-left panel is shown in the top-right panel; it is equal to a convolution of the single peak (shaded in gray) with the window PSD shown in the bottom-left panel (e.g., the peak at f ~ 0.42 in the top-right panel can be traced to a peak at f ~ 0.26 in the bottom-left panel). The bottom-right panel shows the PSD for a data set with errors increased by a factor of 10. Note that the peak f ~ 0.16 is now much shorter, in agreement with eq. 10.47. In addition, errors now exceed the amplitude of variation and the data PSD is no longer a simple convolution of a single peak and the spectral window. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.time_series import lomb_scargle #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
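# Editor's note: the "window PSD" computed below feeds y = 1 at each
# observing time to lomb_scargle with generalized=False and
# subtract_mean=False, so the periodogram reflects the sampling pattern
# alone; by the convolution theorem, the data PSD is then approximately
# the true single-peak PSD convolved with this window function.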
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate the data np.random.seed(42) t_obs = 100 * np.random.random(40) # 40 observations in 100 days y_obs1 = np.sin(np.pi * t_obs / 3) dy1 = 0.1 + 0.1 * np.random.random(y_obs1.shape) y_obs1 += np.random.normal(0, dy1) y_obs2 = np.sin(np.pi * t_obs / 3) dy2 = 10 * dy1 y_obs2 += np.random.normal(0, dy2) # 10x larger heteroscedastic errors y_window = np.ones_like(y_obs1) t = np.linspace(0, 100, 10000) y = np.sin(np.pi * t / 3) #------------------------------------------------------------ # Compute the periodogram omega = np.linspace(0, 5, 1001)[1:] P_obs1 = lomb_scargle(t_obs, y_obs1, dy1, omega) P_obs2 = lomb_scargle(t_obs, y_obs2, dy2, omega) P_window = lomb_scargle(t_obs, y_window, 1, omega, generalized=False, subtract_mean=False) P_true = lomb_scargle(t, y, 1, omega) omega /= 2 * np.pi #------------------------------------------------------------ # Prepare the figures fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, hspace=0.35, wspace=0.25, left=0.11, right=0.95) ax = fig.add_subplot(221) ax.plot(t, y, '-', c='gray') ax.errorbar(t_obs, y_obs1, dy1, fmt='.k', capsize=1, ecolor='#444444') ax.text(0.96, 0.92, "Data", ha='right', va='top', transform=ax.transAxes) ax.set_ylim(-1.5, 1.8) ax.set_xlabel('$t$') ax.set_ylabel('$y(t)$') ax = fig.add_subplot(223) ax.plot(omega, P_window, '-', c='black') ax.text(0.96, 0.92, "Window PSD", ha='right', va='top', transform=ax.transAxes) ax.set_ylim(-0.1, 1.1) ax.set_xlabel('$f$') ax.set_ylabel(r'$P_{\rm LS}(f)$') ax = fig.add_subplot(222) ax.fill(omega, P_true, fc='gray', ec='gray') ax.plot(omega, P_obs1, '-', c='black') ax.text(0.96, 0.92, "Data PSD", ha='right', va='top', transform=ax.transAxes) ax.set_ylim(-0.1, 1.1) ax.set_xlabel('$f$') ax.set_ylabel(r'$P_{\rm LS}(f)$') ax = fig.add_subplot(224) ax.fill(omega, P_true, fc='gray', ec='gray') ax.plot(omega, P_obs2, '-', c='black') ax.text(0.96, 0.92, "Data PSD\n(10x errors)", ha='right', va='top', transform=ax.transAxes) ax.set_ylim(-0.1, 1.1) ax.set_xlabel('$f$') ax.set_ylabel(r'$P_{\rm LS}(f)$') plt.show() astroML-0.3/book_figures/chapter10/fig_wavelet_PSD.py0000644000076500000240000000647212252721253023337 0ustar jakevdpstaff00000000000000""" Wavelet transform of Gaussian Noise ----------------------------------- Figure 10.7 Localized frequency analysis using the wavelet transform. The upper panel shows the input signal, which consists of localized Gaussian noise. The middle panel shows an example wavelet. The lower panel shows the power spectral density as a function of the frequency f0 and the time t0, for Q = 1.0. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.fourier import\ FT_continuous, IFT_continuous, sinegauss, sinegauss_FT, wavelet_PSD #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
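# Editor's aside: the sine-Gaussian wavelet used here has the form
#     w(t; t0, f0, Q) = exp[-(f0 (t - t0) / Q)^2] exp[2 pi i f0 (t - t0)]
# (see the middle panel of the figure). A minimal sketch equivalent in
# spirit to astroML.fourier.sinegauss, though not its implementation:
def sinegauss_sketch(t, t0, f0, Q):
    arg = f0 * (t - t0)
    return np.exp(-(arg / Q) ** 2) * np.exp(2j * np.pi * arg)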
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Sample the function: localized noise
np.random.seed(0)

N = 1024
t = np.linspace(-5, 5, N)
h = np.random.normal(0, 1, len(t))
h *= np.exp(-0.5 * (t / 0.5) ** 2)

#------------------------------------------------------------
# Compute an example wavelet
W = sinegauss(t, 0, 1.5, Q=1.0)

#------------------------------------------------------------
# Compute the wavelet PSD
f0 = np.linspace(0.5, 7.5, 100)
wPSD = wavelet_PSD(t, h, f0, Q=1.0)

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0.05, left=0.12, right=0.95,
                    bottom=0.08, top=0.95)

# First panel: the signal
ax = fig.add_subplot(311)
ax.plot(t, h, '-k', lw=1)

ax.text(0.02, 0.95, ("Input Signal:\n"
                     "Localized Gaussian noise"),
        ha='left', va='top', transform=ax.transAxes)

ax.set_xlim(-4, 4)
ax.set_ylim(-2.9, 2.9)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.set_ylabel('$h(t)$')

# Second panel: an example wavelet
ax = fig.add_subplot(312)
ax.plot(t, W.real, '-k', label='real part', lw=1)
ax.plot(t, W.imag, '--k', label='imag part', lw=1)

ax.text(0.02, 0.95, ("Example Wavelet\n"
                     "$t_0 = 0$, $f_0=1.5$, $Q=1.0$"),
        ha='left', va='top', transform=ax.transAxes)
ax.text(0.98, 0.05,
        (r"$w(t; t_0, f_0, Q) = e^{-[f_0 (t - t_0) / Q]^2}"
         r"e^{2 \pi i f_0 (t - t_0)}$"),
        ha='right', va='bottom', transform=ax.transAxes)

ax.legend(loc=1)
ax.set_xlim(-4, 4)
ax.set_ylim(-1.4, 1.4)
ax.set_ylabel('$w(t; t_0, f_0, Q)$')
ax.xaxis.set_major_formatter(plt.NullFormatter())

# Third panel: the spectrogram
ax = plt.subplot(313)
ax.imshow(wPSD, origin='lower', aspect='auto', cmap=plt.cm.jet,
          extent=[t[0], t[-1], f0[0], f0[-1]])

ax.text(0.02, 0.95, ("Wavelet PSD"), color='w',
        ha='left', va='top', transform=ax.transAxes)

ax.set_xlim(-4, 4)
ax.set_ylim(0.5, 7.5)
ax.set_xlabel('$t$')
ax.set_ylabel('$f_0$')

plt.show()
astroML-0.3/book_figures/chapter10/fig_wavelets.py0000644000076500000240000000423112252721253023003 0ustar jakevdpstaff00000000000000"""
Examples of Wavelets
--------------------
Figure 10.9

Wavelets for several values of wavelet parameters Q and f0. Solid lines
show the real part and dashed lines show the imaginary part (see eq. 10.16).
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from astroML.fourier import sinegauss

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
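# For reference, the sinegauss wavelet imported above follows eq. 10.16;
# a minimal stand-alone version consistent with that formula (a sketch with
# a hypothetical name, not the library routine itself) would be:
def _sinegauss_reference(t, t0, f0, Q):
    """Complex sine-Gaussian wavelet: Gaussian envelope times e^{2 pi i f0 (t - t0)}"""
    a = (f0 / Q) ** 2
    return np.exp(-a * (t - t0) ** 2) * np.exp(2j * np.pi * f0 * (t - t0))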
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up the wavelets t0 = 0 t = np.linspace(-0.4, 0.4, 10000) f0 = np.array([5, 5, 10, 10]) Q = np.array([1, 0.5, 1, 0.5]) # compute wavelets all at once W = sinegauss(t, t0, f0[:, None], Q[:, None]) #------------------------------------------------------------ # Plot the wavelets fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.05, wspace=0.05) # in each panel, plot and label a different wavelet for i in range(4): ax = fig.add_subplot(221 + i) ax.plot(t, W[i].real, '-k') ax.plot(t, W[i].imag, '--k') ax.text(0.04, 0.95, "$f_0 = %i$\n$Q = %.1f$" % (f0[i], Q[i]), ha='left', va='top', transform=ax.transAxes) ax.set_ylim(-1.2, 1.2) ax.set_xlim(-0.35, 0.35) ax.xaxis.set_major_locator(plt.MultipleLocator(0.2)) if i in (0, 1): ax.xaxis.set_major_formatter(plt.NullFormatter()) else: ax.set_xlabel('$t$') if i in (1, 3): ax.yaxis.set_major_formatter(plt.NullFormatter()) else: ax.set_ylabel('$w(t)$') plt.show() astroML-0.3/book_figures/chapter10/fig_wiener_filter.py0000644000076500000240000001075412252721253024016 0ustar jakevdpstaff00000000000000""" Wiener Filter Example --------------------- Figure 10.10 An example of data filtering using a Wiener filter. The upper-left panel shows noisy input data (200 evenly spaced points) with a narrow Gaussian peak centered at x = 20. The bottom panels show the input (left) and Wiener-filtered (right) power spectral density (PSD) distributions. The two curves in the bottom-left panel represent two-component fit to PSD given by eq. 10.20. The upper-right panel shows the result of the Wiener filtering on the input: the Gaussian peak is clearly seen. For comparison, we also plot the result of a fourth-order Savitzky-Golay filter with a window size of lambda = 10. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import optimize, fftpack from astroML.filters import savitzky_golay, wiener_filter #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Create the noisy data np.random.seed(5) N = 2000 dt = 0.05 t = dt * np.arange(N) h = np.exp(-0.5 * ((t - 20.) / 1.0) ** 2) hN = h + np.random.normal(0, 0.5, size=h.shape) Df = 1. 
/ N / dt
f = fftpack.ifftshift(Df * (np.arange(N) - N // 2))
HN = fftpack.fft(hN)

#------------------------------------------------------------
# Set up the Wiener filter:
# fit a model to the PSD consisting of the sum of a
# gaussian and white noise
h_smooth, PSD, P_S, P_N, Phi = wiener_filter(t, hN, return_PSDs=True)

#------------------------------------------------------------
# Use the Savitzky-Golay filter to filter the values
h_sg = savitzky_golay(hN, window_size=201, order=4, use_fft=False)

#------------------------------------------------------------
# Plot the results (f, Df, and HN are already computed above)
fig = plt.figure(figsize=(5, 3.75))
fig.subplots_adjust(wspace=0.05, hspace=0.25,
                    bottom=0.1, top=0.95,
                    left=0.12, right=0.95)

# First plot: noisy signal
ax = fig.add_subplot(221)
ax.plot(t, hN, '-', c='gray')
ax.plot(t, np.zeros_like(t), ':k')
ax.text(0.98, 0.95, "Input Signal", ha='right', va='top',
        transform=ax.transAxes, bbox=dict(fc='w', ec='none'))

ax.set_xlim(0, 90)
ax.set_ylim(-0.5, 1.5)

ax.xaxis.set_major_locator(plt.MultipleLocator(20))
ax.set_xlabel(r'$\lambda$')
ax.set_ylabel('flux')

# Second plot: filtered signal
ax = plt.subplot(222)
ax.plot(t, np.zeros_like(t), ':k', lw=1)
ax.plot(t, h_smooth, '-k', lw=1.5, label='Wiener')
ax.plot(t, h_sg, '-', c='gray', lw=1, label='Savitzky-Golay')

ax.text(0.98, 0.95, "Filtered Signal", ha='right', va='top',
        transform=ax.transAxes)
ax.legend(loc='upper right', bbox_to_anchor=(0.98, 0.9), frameon=False)

ax.set_xlim(0, 90)
ax.set_ylim(-0.5, 1.5)

ax.yaxis.set_major_formatter(plt.NullFormatter())
ax.xaxis.set_major_locator(plt.MultipleLocator(20))
ax.set_xlabel(r'$\lambda$')

# Third plot: Input PSD
ax = fig.add_subplot(223)
ax.scatter(f[:N // 2], PSD[:N // 2], s=9, c='k', lw=0)
ax.plot(f[:N // 2], P_S[:N // 2], '-k')
ax.plot(f[:N // 2], P_N[:N // 2], '-k')

ax.text(0.98, 0.95, "Input PSD", ha='right', va='top',
        transform=ax.transAxes)

ax.set_ylim(-100, 3500)
ax.set_xlim(0, 0.9)

ax.yaxis.set_major_locator(plt.MultipleLocator(1000))
ax.xaxis.set_major_locator(plt.MultipleLocator(0.2))
ax.set_xlabel('$f$')
ax.set_ylabel('$PSD(f)$')

# Fourth plot: Filtered PSD
ax = fig.add_subplot(224)
filtered_PSD = (Phi * abs(HN)) ** 2
ax.scatter(f[:N // 2], filtered_PSD[:N // 2], s=9, c='k', lw=0)

ax.text(0.98, 0.95, "Filtered PSD", ha='right', va='top',
        transform=ax.transAxes)

ax.set_ylim(-100, 3500)
ax.set_xlim(0, 0.9)

ax.yaxis.set_major_locator(plt.MultipleLocator(1000))
ax.yaxis.set_major_formatter(plt.NullFormatter())
ax.xaxis.set_major_locator(plt.MultipleLocator(0.2))
ax.set_xlabel('$f$')

plt.show()
astroML-0.3/book_figures/chapter10/fig_wiener_kernel.py0000644000076500000240000000706112252721253024006 0ustar jakevdpstaff00000000000000"""
Wiener Filter / Kernel Smoothing Connection
-------------------------------------------
Figure 10.11

The left panel shows the inverse Fourier transform of the Wiener filter
Phi(f) applied in figure 10.10. By the convolution theorem, the
Wiener-filtered result is equivalent to the convolution of the unfiltered
signal with the kernel shown above, and thus Wiener filtering and kernel
smoothing are directly related. The right panel shows the data smoothed by
this kernel, which is equivalent to the Wiener filter smoothing in
figure 10.10.
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import optimize, fftpack, interpolate from astroML.fourier import IFT_continuous from astroML.filters import wiener_filter #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # sample the same data as the previous Wiener filter figure np.random.seed(5) t = np.linspace(0, 100, 2001)[:-1] h = np.exp(-0.5 * ((t - 20.) / 1.0) ** 2) hN = h + np.random.normal(0, 0.5, size=h.shape) #---------------------------------------------------------------------- # compute the PSD N = len(t) Df = 1. / N / (t[1] - t[0]) f = fftpack.ifftshift(Df * (np.arange(N) - N / 2)) h_wiener, PSD, P_S, P_N, Phi = wiener_filter(t, hN, return_PSDs=True) #------------------------------------------------------------ # inverse fourier transform Phi to find the effective kernel t_plot, kernel = IFT_continuous(f, Phi) #------------------------------------------------------------ # perform kernel smoothing on the data. This is faster in frequency # space (ie using the standard Wiener filter above) but we will do # it in the slow & simple way here to demonstrate the equivalence # explicitly kernel_func = interpolate.interp1d(t_plot, kernel.real) t_eval = np.linspace(0, 90, 1000) t_KDE = t_eval[:, np.newaxis] - t t_KDE[t_KDE < t_plot[0]] = t_plot[0] t_KDE[t_KDE > t_plot[-1]] = t_plot[-1] F = kernel_func(t_KDE) h_smooth = np.dot(F, hN) / np.sum(F, 1) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.2)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.25, bottom=0.15, top=0.9) # First plot: the equivalent Kernel to the WF ax = fig.add_subplot(121) ax.plot(t_plot, kernel.real, '-k') ax.text(0.95, 0.95, "Effective Wiener\nFilter Kernel", ha='right', va='top', transform=ax.transAxes) ax.set_xlim(-10, 10) ax.set_ylim(-0.05, 0.45) ax.set_xlabel(r'$\lambda$') ax.set_ylabel(r'$K(\lambda)$') # Second axes: Kernel smoothed results ax = fig.add_subplot(122) ax.plot(t_eval, h_smooth, '-k', lw=1) ax.plot(t_eval, 0 * t_eval, '-k', lw=1) ax.text(0.95, 0.95, "Kernel smoothing\nresult", ha='right', va='top', transform=ax.transAxes) ax.set_xlim(0, 90) ax.set_ylim(-0.5, 1.5) ax.set_xlabel('$\lambda$') ax.set_ylabel('flux') plt.show() astroML-0.3/book_figures/chapter10/README.rst0000644000076500000240000000030212115147567021444 0ustar jakevdpstaff00000000000000Chapter 10: Time Series Analysis -------------------------------- This chapter covers the analysis of both periodic and non-periodic time series, for both regularly and irregularly spaced data. 
astroML-0.3/book_figures/chapter2/0000755000076500000240000000000012462244012017667 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter2/fig_balltree_example.py0000644000076500000240000000744112252721253024405 0ustar jakevdpstaff00000000000000""" Ball Tree Example ----------------- Figure 2.5. This example creates a simple Ball tree partition of a two-dimensional parameter space, and plots a visualization of the result. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Circle #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) # We'll create a BallTree class which will recursively subdivide the # space into circular regions. Note that this is just an example and # shouldn't be used for real computation; instead use the optimized # code in scipy.spatial.cKDTree or sklearn.neighbors.BallTree class BallTree: """Simple Ball tree class""" # class initialization function def __init__(self, data): self.data = np.asarray(data) # data should be two-dimensional assert self.data.shape[1] == 2 self.loc = data.mean(0) self.radius = np.sqrt(np.max(np.sum((self.data - self.loc) ** 2, 1))) self.child1 = None self.child2 = None if len(self.data) > 1: # sort on the dimension with the largest spread largest_dim = np.argmax(self.data.max(0) - self.data.min(0)) i_sort = np.argsort(self.data[:, largest_dim]) self.data[:] = self.data[i_sort, :] # find split point N = self.data.shape[0] split_point = 0.5 * (self.data[N / 2, largest_dim] + self.data[N / 2 - 1, largest_dim]) # recursively create subnodes self.child1 = BallTree(self.data[N / 2:]) self.child2 = BallTree(self.data[:N / 2]) def draw_circle(self, ax, depth=None): """Recursively plot a visualization of the Ball tree region""" if depth is None or depth == 0: circ = Circle(self.loc, self.radius, ec='k', fc='none') ax.add_patch(circ) if self.child1 is not None: if depth is None: self.child1.draw_circle(ax) self.child2.draw_circle(ax) elif depth > 0: self.child1.draw_circle(ax, depth - 1) self.child2.draw_circle(ax, depth - 1) #------------------------------------------------------------ # Create a set of structured random points in two dimensions np.random.seed(0) X = np.random.random((30, 2)) * 2 - 1 X[:, 1] *= 0.1 X[:, 1] += X[:, 0] ** 2 #------------------------------------------------------------ # Use our Ball Tree class to recursively divide the space BT = BallTree(X) #------------------------------------------------------------ # Plot four different levels of the Ball tree fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(wspace=0.1, hspace=0.15, left=0.1, right=0.9, bottom=0.05, top=0.9) for level in range(1, 5): ax = fig.add_subplot(2, 2, level, xticks=[], yticks=[]) ax.scatter(X[:, 0], X[:, 1], s=9) BT.draw_circle(ax, depth=level - 1) ax.set_xlim(-1.35, 1.35) ax.set_ylim(-1.0, 1.7) 
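    # (For real neighbor queries, an optimized tree should be used instead
    #  of this demo class; a minimal sketch with scikit-learn, assuming it
    #  is installed:
    #      from sklearn.neighbors import BallTree as SKBallTree
    #      tree = SKBallTree(X)
    #      dist, ind = tree.query(X[:1], k=3)
    #  which returns distances and indices of the 3 nearest points.)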
ax.set_title('level %i' % level) # suptitle() adds a title to the entire figure fig.suptitle('Ball-tree Example') plt.show() astroML-0.3/book_figures/chapter2/fig_kdtree_example.py0000644000076500000240000001015212252721253024062 0ustar jakevdpstaff00000000000000""" KD Tree Example --------------- Figure 2.4. This example creates a simple KD-tree partition of a two-dimensional parameter space, and plots a visualization of the result. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) # We'll create a KDTree class which will recursively subdivide the # space into rectangular regions. Note that this is just an example # and shouldn't be used for real computation; instead use the optimized # code in scipy.spatial.cKDTree or sklearn.neighbors.BallTree class KDTree: """Simple KD tree class""" # class initialization function def __init__(self, data, mins, maxs): self.data = np.asarray(data) # data should be two-dimensional assert self.data.shape[1] == 2 if mins is None: mins = data.min(0) if maxs is None: maxs = data.max(0) self.mins = np.asarray(mins) self.maxs = np.asarray(maxs) self.sizes = self.maxs - self.mins self.child1 = None self.child2 = None if len(data) > 1: # sort on the dimension with the largest spread largest_dim = np.argmax(self.sizes) i_sort = np.argsort(self.data[:, largest_dim]) self.data[:] = self.data[i_sort, :] # find split point N = self.data.shape[0] split_point = 0.5 * (self.data[N / 2, largest_dim] + self.data[N / 2 - 1, largest_dim]) # create subnodes mins1 = self.mins.copy() mins1[largest_dim] = split_point maxs2 = self.maxs.copy() maxs2[largest_dim] = split_point # Recursively build a KD-tree on each sub-node self.child1 = KDTree(self.data[N / 2:], mins1, self.maxs) self.child2 = KDTree(self.data[:N / 2], self.mins, maxs2) def draw_rectangle(self, ax, depth=None): """Recursively plot a visualization of the KD tree region""" if depth == 0: rect = plt.Rectangle(self.mins, *self.sizes, ec='k', fc='none') ax.add_patch(rect) if self.child1 is not None: if depth is None: self.child1.draw_rectangle(ax) self.child2.draw_rectangle(ax) elif depth > 0: self.child1.draw_rectangle(ax, depth - 1) self.child2.draw_rectangle(ax, depth - 1) #------------------------------------------------------------ # Create a set of structured random points in two dimensions np.random.seed(0) X = np.random.random((30, 2)) * 2 - 1 X[:, 1] *= 0.1 X[:, 1] += X[:, 0] ** 2 #------------------------------------------------------------ # Use our KD Tree class to recursively divide the space KDT = KDTree(X, [-1.1, -0.1], [1.1, 1.1]) #------------------------------------------------------------ # Plot four different levels of the KD tree fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(wspace=0.1, hspace=0.15, left=0.1, right=0.9, bottom=0.05, 
top=0.9)

for level in range(1, 5):
    ax = fig.add_subplot(2, 2, level, xticks=[], yticks=[])
    ax.scatter(X[:, 0], X[:, 1], s=9)
    KDT.draw_rectangle(ax, depth=level - 1)

    ax.set_xlim(-1.2, 1.2)
    ax.set_ylim(-0.15, 1.15)
    ax.set_title('level %i' % level)

# suptitle() adds a title to the entire figure
fig.suptitle('$k$d-tree Example')
plt.show()
astroML-0.3/book_figures/chapter2/fig_quadtree_example.py0000644000076500000240000001203712252721253024422 0ustar jakevdpstaff00000000000000"""
Quad Tree Example
-----------------
Figure 2.3.

This example creates a simple quad-tree partition of a two-dimensional
parameter space, and plots a visualization of the result.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


# We'll create a QuadTree class which will recursively subdivide the
# space into quadrants
class QuadTree:
    """Simple Quad-tree class"""

    # class initialization function
    def __init__(self, data, mins, maxs, depth=3):
        self.data = np.asarray(data)

        # data should be two-dimensional
        assert self.data.shape[1] == 2

        if mins is None:
            mins = data.min(0)
        if maxs is None:
            maxs = data.max(0)

        self.mins = np.asarray(mins)
        self.maxs = np.asarray(maxs)
        self.sizes = self.maxs - self.mins

        self.children = []

        mids = 0.5 * (self.mins + self.maxs)
        xmin, ymin = self.mins
        xmax, ymax = self.maxs
        xmid, ymid = mids

        if depth > 0:
            # split the data into four quadrants
            data_q1 = data[(data[:, 0] < mids[0])
                           & (data[:, 1] < mids[1])]
            data_q2 = data[(data[:, 0] < mids[0])
                           & (data[:, 1] >= mids[1])]
            data_q3 = data[(data[:, 0] >= mids[0])
                           & (data[:, 1] < mids[1])]
            data_q4 = data[(data[:, 0] >= mids[0])
                           & (data[:, 1] >= mids[1])]

            # recursively build a quad tree on each quadrant which has data
            if data_q1.shape[0] > 0:
                self.children.append(QuadTree(data_q1,
                                              [xmin, ymin], [xmid, ymid],
                                              depth - 1))
            if data_q2.shape[0] > 0:
                self.children.append(QuadTree(data_q2,
                                              [xmin, ymid], [xmid, ymax],
                                              depth - 1))
            if data_q3.shape[0] > 0:
                self.children.append(QuadTree(data_q3,
                                              [xmid, ymin], [xmax, ymid],
                                              depth - 1))
            if data_q4.shape[0] > 0:
                self.children.append(QuadTree(data_q4,
                                              [xmid, ymid], [xmax, ymax],
                                              depth - 1))

    def draw_rectangle(self, ax, depth):
        """Recursively plot a visualization of the quad tree region"""
        if depth is None or depth == 0:
            rect = plt.Rectangle(self.mins, *self.sizes, zorder=2,
                                 ec='#000000', fc='none')
            ax.add_patch(rect)

        if depth is None or depth > 0:
            for child in self.children:
                # guard against depth=None, which should draw all levels
                # (depth - 1 would raise a TypeError for None)
                child.draw_rectangle(ax, None if depth is None else depth - 1)


def draw_grid(ax, xlim, ylim, Nx, Ny, **kwargs):
    """ draw a background grid for the quad tree"""
    for x in np.linspace(xlim[0], xlim[1], Nx):
        ax.plot([x, x], ylim, **kwargs)
    for y in np.linspace(ylim[0], ylim[1], Ny):
        ax.plot(xlim, [y, y], **kwargs)


#------------------------------------------------------------
# Create a set of structured random points in two
dimensions np.random.seed(0) X = np.random.random((30, 2)) * 2 - 1 X[:, 1] *= 0.1 X[:, 1] += X[:, 0] ** 2 #------------------------------------------------------------ # Use our Quad Tree class to recursively divide the space mins = (-1.1, -0.1) maxs = (1.1, 1.1) QT = QuadTree(X, mins, maxs, depth=3) #------------------------------------------------------------ # Plot four different levels of the quad tree fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(wspace=0.1, hspace=0.15, left=0.1, right=0.9, bottom=0.05, top=0.9) for level in range(1, 5): ax = fig.add_subplot(2, 2, level, xticks=[], yticks=[]) ax.scatter(X[:, 0], X[:, 1]) QT.draw_rectangle(ax, depth=level - 1) Nlines = 1 + 2 ** (level - 1) draw_grid(ax, (mins[0], maxs[0]), (mins[1], maxs[1]), Nlines, Nlines, linewidth=1, color='#CCCCCC', zorder=0) ax.set_xlim(-1.2, 1.2) ax.set_ylim(-0.15, 1.15) ax.set_title('level %i' % level) # suptitle() adds a title to the entire figure fig.suptitle('Quad-tree Example') plt.show() astroML-0.3/book_figures/chapter2/fig_search_scaling.py0000644000076500000240000000543712252721253024050 0ustar jakevdpstaff00000000000000""" Search Algorithm Scaling ------------------------ Figure 2.1. The scaling of two methods to search for an item in an ordered list: a linear method which performs a comparison on all N items, and a binary search which uses a more sophisticated algorithm. The theoretical scalings are shown by dashed lines. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from time import time import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
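# np.searchsorted, timed below, is a vectorized binary search; the standard
# library provides a scalar equivalent.  A minimal illustration (the names
# here are demo values, not part of the timing experiment):
from bisect import bisect_left
_demo_sorted = [1, 3, 5, 7, 9]
_demo_index = bisect_left(_demo_sorted, 5)   # == 2, found in O(log N) steps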
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Compute the execution times as a function of array size Nsamples = 10 ** np.linspace(6.0, 7.8, 17) time_linear = np.zeros_like(Nsamples) time_binary = np.zeros_like(Nsamples) for i in range(len(Nsamples)): # create a sorted array x = np.arange(Nsamples[i], dtype=int) # Linear search: choose a single item in the array item = int(0.4 * Nsamples[i]) t0 = time() j = np.where(x == item) t1 = time() time_linear[i] = t1 - t0 # Binary search: this is much faster, so choose 1000 items to search for items = np.linspace(0, Nsamples[i], 1000).astype(int) t0 = time() j = np.searchsorted(x, items) t1 = time() time_binary[i] = (t1 - t0) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(bottom=0.15) ax = plt.axes(xscale='log', yscale='log') ax.grid() # plot the observed times ax.plot(Nsamples, time_linear, 'ok', color='gray', markersize=5, label=r'linear search $(\mathcal{O}[N])$') ax.plot(Nsamples, time_binary, 'sk', color='gray', markersize=5, label=r'efficient search $(\mathcal{O}[\log N])$') # plot the expected scaling scale = 10 ** np.linspace(5, 8, 100) scaling_N = scale * time_linear[7] / Nsamples[7] scaling_logN = np.log(scale) * time_binary[7] / np.log(Nsamples[7]) ax.plot(scale, scaling_N, '--k') ax.plot(scale, scaling_logN, '--k') ax.set_xlim(9E5, 1E8) # add text and labels ax.set_title("Scaling of Search Algorithms") ax.set_xlabel('Length of Array') ax.set_ylabel('Relative search time') ax.legend(loc='upper left') plt.show() astroML-0.3/book_figures/chapter2/fig_sort_scaling.py0000644000076500000240000000551112252721253023563 0ustar jakevdpstaff00000000000000""" Sort Algorithm Scaling ---------------------- Figure 2.2. The scaling of the quicksort algorithm. Plotted for comparison are lines showing O(N) and O(N log N) scaling. The quicksort algorithm falls along the O(N log N) line, as expected. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from time import time import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
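# The timings below rely on NumPy's in-place sort, which accepts an
# algorithm keyword.  For example (a demo array, not part of the timing):
_demo = np.random.random(16)
_demo.sort(kind='quicksort')       # 'mergesort' and 'heapsort' also exist
_demo_order = np.argsort(_demo)    # indices that would sort the array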
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Compute the execution times as a function of array size # time quick-sort of a numpy array N_npy = 10 ** np.linspace(5, 7, 10) time_npy = np.zeros_like(N_npy) for i in range(len(N_npy)): x = np.random.random(int(N_npy[i])) t0 = time() x.sort(kind='quicksort') t1 = time() time_npy[i] = t1 - t0 # time built-in sort of python list N_list = N_npy[:-3] time_list = np.zeros_like(N_list) for i in range(len(N_list)): x = list(np.random.random(int(N_list[i]))) t0 = time() x.sort() t1 = time() time_list[i] = t1 - t0 #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(bottom=0.15) ax = plt.axes(xscale='log', yscale='log') ax.grid() # plot the observed times ax.plot(N_list, time_list, 'sk', color='gray', ms=5, label='list sort') ax.plot(N_npy, time_npy, 'ok', color='gray', ms=5, label='NumPy sort') # plot the expected scalings scale = np.linspace(N_npy[0] / 2, N_npy[-1] * 2, 100) scaling_N = scale * time_npy[0] / N_npy[0] scaling_NlogN = (scale * np.log2(scale) * time_npy[0] / N_npy[0] / np.log2(N_npy[0])) ax.plot(scale, scaling_NlogN, '--k', label=r'$\mathcal{O}[N \log N]$') ax.plot(scale, scaling_N, ':k', label=r'$\mathcal{O}[N]$') scaling_N = scale * time_list[0] / N_list[0] scaling_NlogN = (scale * np.log2(scale) * time_list[0] / N_list[0] / np.log2(N_list[0])) ax.plot(scale, scaling_NlogN, '--k') ax.plot(scale, scaling_N, ':k') # Create titles and labels ax.set_title("Scaling of Sort Algorithms") ax.set_xlabel('Length of Array') ax.set_ylabel('Relative sort time') plt.legend(loc='upper left') ax.set_xlim(scale[0], scale[-1]) plt.show() astroML-0.3/book_figures/chapter2/README.rst0000644000076500000240000000024612115147567021374 0ustar jakevdpstaff00000000000000Chapter 2: Fast Computation and Massive Datasets ------------------------------------------------ This chapter discusses computational strategies for large datasets. astroML-0.3/book_figures/chapter3/0000755000076500000240000000000012462244012017670 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter3/fig_beta_distribution.py0000644000076500000240000000460112252721253024606 0ustar jakevdpstaff00000000000000""" Example of a Beta distribution ------------------------------ Figure 3.17. This shows an example of a beta distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.beta(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import beta from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted alpha_values = [0.5, 1.5, 3.0, 0.5] beta_values = [0.5, 1.5, 3.0, 1.5] linestyles = ['-', '--', ':', '-.'] x = np.linspace(0, 1, 1002)[1:-1] #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for a, b, ls in zip(alpha_values, beta_values, linestyles): dist = beta(a, b) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$\alpha=%.1f,\ \beta=%.1f$' % (a, b)) plt.xlim(0, 1) plt.ylim(0, 3) plt.xlabel('$x$') plt.ylabel(r'$p(x|\alpha,\beta)$') plt.title('Beta Distribution') plt.legend(loc=0) plt.show() astroML-0.3/book_figures/chapter3/fig_binomial_distribution.py0000644000076500000240000000462612420767763025511 0ustar jakevdpstaff00000000000000""" Example of a Binomial distribution ---------------------------------- Figure 3.9. This shows an example of a binomial distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.binom(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import binom from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted n_values = [20, 20, 40] b_values = [0.2, 0.6, 0.6] linestyles = ['-', '--', ':'] x = np.arange(-1, 200) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for (n, b, ls) in zip(n_values, b_values, linestyles): # create a binomial distribution dist = binom(n, b) plt.plot(x, dist.pmf(x), color='black', linestyle='steps-mid' + ls, label=r'$b=%.1f,\ n=%i$' % (b, n)) plt.xlim(-0.5, 35) plt.ylim(0, 0.25) plt.xlabel('$x$') plt.ylabel(r'$p(x|b, n)$') plt.title('Binomial Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_bivariate_gaussian.py0000644000076500000240000000600112252721253024730 0ustar jakevdpstaff00000000000000""" Bivariate Gaussian ------------------ Figure 3.22. An example of data generated from a bivariate Gaussian distribution. The shaded pixels are a Hess diagram showing the density of points at each position. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Ellipse from astroML.stats.random import bivariate_normal #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the mean, principal axes, and rotation of the ellipse mean = np.array([0, 0]) sigma_1 = 2 sigma_2 = 1 alpha = np.pi / 4 #------------------------------------------------------------ # Draw 10^5 points from a multivariate normal distribution # # we use the bivariate_normal function from astroML. A more # general function for this is numpy.random.multivariate_normal(), # which requires the user to specify the full covariance matrix. # bivariate_normal() generates this covariance matrix for the # given inputs. np.random.seed(0) x, cov = bivariate_normal(mean, sigma_1, sigma_2, alpha, size=100000, return_cov=True) sigma_x = np.sqrt(cov[0, 0]) sigma_y = np.sqrt(cov[1, 1]) sigma_xy = cov[0, 1] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) ax = fig.add_subplot(111) # plot a 2D histogram/hess diagram of the points H, bins = np.histogramdd(x, bins=2 * [np.linspace(-4.5, 4.5, 51)]) ax.imshow(H, origin='lower', cmap=plt.cm.binary, interpolation='nearest', extent=[bins[0][0], bins[0][-1], bins[1][0], bins[1][-1]]) # draw 1, 2, 3-sigma ellipses over the distribution for N in (1, 2, 3): ax.add_patch(Ellipse(mean, N * sigma_1, N * sigma_2, angle=alpha * 180. 
/ np.pi, lw=1, ec='k', fc='none')) kwargs = dict(ha='left', va='top', transform=ax.transAxes) ax.text(0.02, 0.98, r"$\sigma_1 = %i$" % sigma_1, **kwargs) ax.text(0.02, 0.93, r"$\sigma_2 = %i$" % sigma_2, **kwargs) ax.text(0.02, 0.88, r"$\alpha = \pi / %i$" % (np.pi / alpha), **kwargs) ax.text(0.15, 0.98, r"$\sigma_x = %.2f$" % sigma_x, **kwargs) ax.text(0.15, 0.93, r"$\sigma_y = %.2f$" % sigma_y, **kwargs) ax.text(0.15, 0.88, r"$\sigma_{xy} = %.2f$" % sigma_xy, **kwargs) ax.set_xlabel('$x$') ax.set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter3/fig_cauchy_distribution.py0000644000076500000240000000454412252721253025155 0ustar jakevdpstaff00000000000000""" Example of a Cauchy distribution -------------------------------- Figure 3.11. This shows an example of a Cauchy distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.cauchy(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import cauchy from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted gamma_values = [0.5, 1.0, 2.0] linestyles = ['-', '--', ':'] mu = 0 x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for gamma, ls in zip(gamma_values, linestyles): dist = cauchy(mu, gamma) plt.plot(x, dist.pdf(x), ls=ls, color='black', label=r'$\mu=%i,\ \gamma=%.1f$' % (mu, gamma)) plt.xlim(-4.5, 4.5) plt.ylim(0, 0.65) plt.xlabel('$x$') plt.ylabel(r'$p(x|\mu,\gamma)$') plt.title('Cauchy Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_cauchy_median_mean.py0000644000076500000240000001042212252721253024663 0ustar jakevdpstaff00000000000000r""" Median and Mean for Cauchy distribution --------------------------------------- Figure 3.12. The bottom panel shows a sample of N points drawn from a Cauchy distribution with :math:`\mu = 0` and :math:`\gamma=2`. 
The top panel shows the sample median, sample mean, and two robust estimates of the location parameter (see text) as a function of the sample size (only points to the left from a given sample size are used). Note that the sample mean is not a good estimator of the distribution's location parameter. Though the mean appears to converge as N increases, this is deceiving: because of the large tails in the Cauchy distribution, there is always a high likelihood of a far-flung point affecting the sample mean. This behavior is markedly different from a Gaussian distribution where the probability of such "outliers" is much smaller. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import optimize from scipy.stats import cauchy, norm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def robust_mean_mixture(x): """Compute the mean via a mixture of two Gaussians One Gaussian accounts for outliers, and one Gaussian accounts for the true distribution. This cannot be computed analytically, so it uses scipy's function optimization """ if len(x) == 1: return x x = x.ravel() mu_bg = np.mean(x) sig_bg = 3 * np.std(x) likelihood = lambda v: -np.sum(np.log(norm.pdf(x, v[0], v[1]) + norm.pdf(x, mu_bg, sig_bg))) v0 = np.array([0, 30]) v_best = optimize.fmin(likelihood, v0, disp=False) return v_best[0] def robust_mean_iterated(x, sigma_cut=3): """Compute the robust mean iteratively After computing the mean, points further than 3 sigma from the mean are removed and the result is repeated until convergence. 
""" flag = np.ones(x.shape, dtype=bool) n_to_keep = x.size while True: xf = x[flag] mu = xf.mean() sig = xf.std() if len(xf) == 1: break x_sig = abs((x - mu) / sig) too_far = (x_sig > sigma_cut) flag[too_far] = False n_flag = flag.sum() if n_flag == n_to_keep: break else: n_to_keep = n_flag return mu #------------------------------------------------------------ # Create the distribution and compute means and medians np.random.seed(6) mu = 0 gamma = 2 xi = cauchy(mu, gamma).rvs(100) Nrange = np.arange(1, len(xi) + 1) mean = [np.mean(xi[:N]) for N in Nrange] median = [np.median(xi[:N]) for N in Nrange] mean_mixture = [robust_mean_mixture(xi[:N]) for N in Nrange] mean_iter = [robust_mean_iterated(xi[:N]) for N in Nrange] #------------------------------------------------------------ # Plot the results as a function of number of points fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.05) # first plot the mean ax = fig.add_subplot(211) ax.plot(Nrange, mean, '-.b', label='mean') ax.plot(Nrange, median, '-k', label='median') ax.plot(Nrange, mean_mixture, ':r', label='robust mean (mixture)') ax.plot(Nrange, mean_iter, '--g', label='robust mean (sigma-clip)') ax.plot(Nrange, 0 * Nrange, '-', c='gray', lw=0.5) ax.set_xlim(0, 100) ax.set_ylim(-7, 7) ax.legend(loc=4, ncol=2, frameon=False) ax.set_ylabel('Value') ax.xaxis.set_major_formatter(plt.NullFormatter()) # now plot the median ax = fig.add_subplot(212) ax.scatter(Nrange, xi, lw=0, s=10, c='k') ax.plot(Nrange, 0 * Nrange, '-', c='gray') ax.set_xlim(0, 100) ax.set_ylim(-75, 75) ax.set_xlabel('Sample Size') ax.set_ylabel('Value') plt.show() astroML-0.3/book_figures/chapter3/fig_central_limit.py0000644000076500000240000000543512252721253023730 0ustar jakevdpstaff00000000000000r""" Example of central limit theorem -------------------------------- Figure 3.20. An illustration of the central limit theorem. The histogram in each panel shows the distribution of the mean value of N random variables drawn from the (0, 1) range (a uniform distribution with :math:`\mu = 0.5` and W = 1; see eq. 3.39). The distribution for N = 2 has a triangular shape and as N increases it becomes increasingly similar to a Gaussian, in agreement with the central limit theorem. The predicted normal distribution with :math:`\mu = 0.5` and :math:`\sigma = 1/ \sqrt{12 N}` is shown by the line. Already for N = 10, the "observed" distribution is essentially the same as the predicted distribution. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate the uniform samples N = [2, 3, 10] np.random.seed(42) x = np.random.random((max(N), 1E6)) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(hspace=0.05) for i in range(len(N)): ax = fig.add_subplot(3, 1, i + 1) # take the mean of the first N[i] samples x_i = x[:N[i], :].mean(0) # histogram the data ax.hist(x_i, bins=np.linspace(0, 1, 101), histtype='stepfilled', alpha=0.5, normed=True) # plot the expected gaussian pdf mu = 0.5 sigma = 1. / np.sqrt(12 * N[i]) dist = norm(mu, sigma) x_pdf = np.linspace(-0.5, 1.5, 1000) ax.plot(x_pdf, dist.pdf(x_pdf), '-k') ax.set_xlim(0.0, 1.0) ax.set_ylim(0.001, None) ax.xaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.yaxis.set_major_locator(plt.MaxNLocator(5)) ax.text(0.99, 0.95, r"$N = %i$" % N[i], ha='right', va='top', transform=ax.transAxes) if i == len(N) - 1: ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.4f')) ax.set_xlabel(r'$x$') else: ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('$p(x)$') plt.show() astroML-0.3/book_figures/chapter3/fig_chi2_distribution.py0000644000076500000240000000454712252721253024531 0ustar jakevdpstaff00000000000000r""" Example of a chi-squared distribution --------------------------------------- Figure 3.14. This shows an example of a :math:`\chi^2` distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.chi2(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import chi2 from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
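# A chi-square variate with k degrees of freedom is the sum of k squared
# standard normal draws; a quick empirical check (demo values only):
_demo_k = 5
_demo_Q = (np.random.normal(size=(_demo_k, 100000)) ** 2).sum(0)
# _demo_Q.mean() should be close to _demo_k, the mean of chi2(k)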
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted k_values = [1, 2, 5, 7] linestyles = ['-', '--', ':', '-.'] mu = 0 x = np.linspace(-1, 20, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) fig.subplots_adjust(bottom=0.12) for k, ls in zip(k_values, linestyles): dist = chi2(k, mu) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$k=%i$' % k) plt.xlim(0, 10) plt.ylim(0, 0.5) plt.xlabel('$Q$') plt.ylabel(r'$p(Q|k)$') plt.title(r'$\chi^2\ \mathrm{Distribution}$') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_clone_distribution.py0000644000076500000240000001173712420767763025020 0ustar jakevdpstaff00000000000000r""" Random Values from an Empirical Distribution -------------------------------------------- Figure 3.25. A demonstration of how to empirically clone a distribution, using a spline interpolation to approximate the inverse of the observed cumulative distribution. This allows us to nonparametrically select new random samples approximating an observed distribution. First the list of points is sorted, and the rank of each point is used to approximate the cumulative distribution (upper right). Flipping the axes gives the inverse cumulative distribution on a regular grid (lower left). After performing a cubic spline fit to the inverse distribution, a uniformly sampled x value maps to a y value which approximates the observed pdf. The lower-right panel shows the result. The K-S test (see section 4.7.2) indicates that the samples are consistent with being drawn from the same distribution. This method, while fast and effective, cannot be easily extended to multiple dimensions. This example uses the routine :class:`astroML.density_estimation.EmpiricalDistribution` to clone the distribution """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from scipy import stats, interpolate from astroML.plotting import hist from astroML.density_estimation import EmpiricalDistribution #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
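# The cloning demonstrated below is inverse-transform sampling: if F is the
# cumulative distribution of x, then F^{-1}(u) for u ~ Uniform(0, 1) is
# distributed like x.  A minimal sketch with a plain linear interpolant
# (not the spline-based astroML implementation), using the x and Nclone
# defined below:
#
#     u = np.random.random(Nclone)
#     x_clone = np.interp(u, np.linspace(0, 1, len(x)), np.sort(x))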
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Create a distribution and clone it
Ndata = 1000
Nclone = 100000
np.random.seed(0)

# generate an 'observed' bimodal distribution with 1000 values
dists = (stats.norm(-1.3, 0.5),
         stats.norm(1.3, 0.5))
fracs = (0.6, 0.4)
x = np.hstack([d.rvs(int(f * Ndata)) for d, f in zip(dists, fracs)])

# We can clone the distribution easily with this function
x_cloned = EmpiricalDistribution(x).rvs(Nclone)

# compute the KS test to check if they're the same
D, p = stats.ks_2samp(x, x_cloned)
print("KS test: D = %.2g; p = %.2g" % (D, p))

#------------------------------------------------------------
# For the sake of this example, we need to calculate some
# of the partial steps used by EmpiricalDistribution

# create a cumulative distribution
x.sort()
Px_cuml = np.linspace(0, 1, Ndata)

# set up an interpolation of the inverse cumulative distribution
tck = interpolate.splrep(Px_cuml, x)

# sample evenly along the cumulative distribution, and interpolate
Px_cuml_sample = np.linspace(0, 1, 10 * Ndata)
x_sample = interpolate.splev(Px_cuml_sample, tck)

#------------------------------------------------------------
# Plot the cloned distribution and the procedure for obtaining it
fig = plt.figure(figsize=(5, 5))
fig.subplots_adjust(hspace=0.3, left=0.1, right=0.95,
                    bottom=0.08, top=0.92)

indices = np.linspace(0, Ndata - 1, 20).astype(int)

# plot a histogram of the input
ax = fig.add_subplot(221)
hist(x, bins='knuth', ax=ax,
     histtype='stepfilled', ec='k', fc='#AAAAAA')
ax.set_ylim(0, 300)
ax.set_title('Input data distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$N(x)$')

# plot the cumulative distribution
ax = fig.add_subplot(222)
ax.scatter(x[indices], Px_cuml[indices], lw=0, c='k', s=9)
ax.plot(x, Px_cuml, '-k')
ax.set_xlim(-3, 3)
ax.set_ylim(-0.05, 1.05)
ax.set_title('Cumulative Distribution')
ax.set_xlabel('$x$')
ax.set_ylabel('$p(<x)$')
astroML-0.3/book_figures/chapter3/fig_conditional_probability.py

def banana_distribution(N=10000):
    """Generate random points in a banana shape"""
    # create a truncated normal distribution of angles
    theta = np.random.normal(0, np.pi / 8, 10000)
    theta[theta >= np.pi / 4] /= 2
    theta[theta <= -np.pi / 4] /= 2

    # define the curve parametrically
    r = np.sqrt(1.
/ abs(np.cos(theta) ** 2 - np.sin(theta) ** 2)) r += np.random.normal(0, 0.08, size=10000) x = r * np.cos(theta + np.pi / 4) y = r * np.sin(theta + np.pi / 4) return (x, y) #------------------------------------------------------------ # Generate the data and compute the normalized 2D histogram np.random.seed(1) x, y = banana_distribution(10000) Ngrid = 41 grid = np.linspace(0, 2, Ngrid + 1) H, xbins, ybins = np.histogram2d(x, y, grid) H /= np.sum(H) #------------------------------------------------------------ # plot the result fig = plt.figure(figsize=(5, 2.5)) # define axes ax_Pxy = plt.axes((0.2, 0.34, 0.27, 0.52)) ax_Px = plt.axes((0.2, 0.14, 0.27, 0.2)) ax_Py = plt.axes((0.1, 0.34, 0.1, 0.52)) ax_cb = plt.axes((0.48, 0.34, 0.01, 0.52)) ax_Px_y = [plt.axes((0.65, 0.62, 0.32, 0.23)), plt.axes((0.65, 0.38, 0.32, 0.23)), plt.axes((0.65, 0.14, 0.32, 0.23))] # set axis label formatters ax_Px_y[0].xaxis.set_major_formatter(NullFormatter()) ax_Px_y[1].xaxis.set_major_formatter(NullFormatter()) ax_Pxy.xaxis.set_major_formatter(NullFormatter()) ax_Pxy.yaxis.set_major_formatter(NullFormatter()) ax_Px.yaxis.set_major_formatter(NullFormatter()) ax_Py.xaxis.set_major_formatter(NullFormatter()) # draw the joint probability plt.axes(ax_Pxy) H *= 1000 plt.imshow(H, interpolation='nearest', origin='lower', aspect='auto', extent=[0, 2, 0, 2], cmap=plt.cm.binary) cb = plt.colorbar(cax=ax_cb) cb.set_label('$p(x, y)$') plt.text(0, 1.02, r'$\times 10^{-3}$', transform=ax_cb.transAxes) # draw p(x) distribution ax_Px.plot(xbins[1:], H.sum(0), '-k', drawstyle='steps') # draw p(y) distribution ax_Py.plot(H.sum(1), ybins[1:], '-k', drawstyle='steps') # define axis limits ax_Pxy.set_xlim(0, 2) ax_Pxy.set_ylim(0, 2) ax_Px.set_xlim(0, 2) ax_Py.set_ylim(0, 2) # label axes ax_Pxy.set_xlabel('$x$') ax_Pxy.set_ylabel('$y$') ax_Px.set_xlabel('$x$') ax_Px.set_ylabel('$p(x)$') ax_Px.yaxis.set_label_position('right') ax_Py.set_ylabel('$y$') ax_Py.set_xlabel('$p(y)$') ax_Py.xaxis.set_label_position('top') # draw marginal probabilities iy = [3 * Ngrid / 4, Ngrid / 2, Ngrid / 4] colors = 'rgc' axis = ax_Pxy.axis() for i in range(3): # overplot range on joint probability ax_Pxy.plot([0, 2, 2, 0], [ybins[iy[i] + 1], ybins[iy[i] + 1], ybins[iy[i]], ybins[iy[i]]], c=colors[i], lw=1) Px_y = H[iy[i]] / H[iy[i]].sum() ax_Px_y[i].plot(xbins[1:], Px_y, drawstyle='steps', c=colors[i]) ax_Px_y[i].yaxis.set_major_formatter(NullFormatter()) ax_Px_y[i].set_ylabel('$p(x | %.1f)$' % ybins[iy[i]]) ax_Pxy.axis(axis) ax_Px_y[2].set_xlabel('$x$') ax_Pxy.set_title('Joint Probability') ax_Px_y[0].set_title('Conditional Probability') plt.show() astroML-0.3/book_figures/chapter3/fig_contingency_table.py0000644000076500000240000000314612252721253024566 0ustar jakevdpstaff00000000000000""" A 2x2 Contingency Table ----------------------- Figure 3.3. A contingency table showing p(T|D). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. 
In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) fig = plt.figure(figsize=(2, 2), facecolor='w') ax = fig.add_axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False) for i in [-1, 0, 1]: ax.plot([i, i], [-1, 1], '-k') ax.plot([-1, 1], [i, i], '-k') kwds = dict(ha='center', va='center', size=11) ax.text(-0.5, 1.15, '0', **kwds) ax.text(0.5, 1.15, '1', **kwds) ax.text(0, 1.25, 'T', **kwds) ax.text(-1.15, 0.5, '0', **kwds) ax.text(-1.15, -0.5, '1', **kwds) ax.text(-1.25, 0, 'D', **kwds) kwds['size'] = 14 ax.text(0.5, 0.5, '$\epsilon_{fP}$', **kwds) ax.text(-0.5, 0.5, '$1 - \epsilon_{fP}$', **kwds) ax.text(-0.5, -0.5, '$\epsilon_{fN}$', **kwds) ax.text(0.5, -0.5, '$1 - \epsilon_{fN}$', **kwds) ax.set_xlim(-1.5, 1.2) ax.set_ylim(-1.2, 1.5) plt.show() astroML-0.3/book_figures/chapter3/fig_correlations.py0000644000076500000240000001005312252721253023576 0ustar jakevdpstaff00000000000000""" Correlation estimates --------------------- Figure 3.24. Bootstrap estimates of the distribution of Pearson's, Spearman's, and Kendall's correlation coefficients based on 2000 resamplings of the 1000 points shown in figure 3.23. The true values are shown by the dashed lines. It is clear that Pearson's correlation coefficient is not robust to contamination. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy import stats from matplotlib import pyplot as plt from astroML.stats.random import bivariate_normal from astroML.decorators import pickle_results # percent sign must be escaped if usetex=True import matplotlib if matplotlib.rcParams.get('text.usetex'): pct = '\%' else: pct = '%' #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set parameters for the distributions Nbootstraps = 5000 N = 1000 sigma1 = 2.0 sigma2 = 1.0 mu = (10.0, 10.0) alpha_deg = 45.0 alpha = alpha_deg * np.pi / 180 f = 0.01 #------------------------------------------------------------ # sample the distribution # without outliers and with outliers np.random.seed(0) X = bivariate_normal(mu, sigma1, sigma2, alpha, N) X_out = X.copy() X_out[:int(f * N)] = bivariate_normal(mu, 2, 5, 45 * np.pi / 180., int(f * N)) # true values of rho (pearson/spearman r) and tau # tau value comes from Eq. 41 of arXiv:1011.2009 rho_true = 0.6 tau_true = 2 / np.pi * np.arcsin(rho_true) #------------------------------------------------------------ # Create a function to compute the statistics. 
Since this # takes a while, we'll use the "pickle_results" decorator # to save the results of the computation to disk @pickle_results('fig_correlations_dump.pkl') def compute_results(N, Nbootstraps): results = np.zeros((3, 2, Nbootstraps)) for k in range(Nbootstraps): ind = np.random.randint(N, size=N) for j, data in enumerate([X, X_out]): x = data[ind, 0] y = data[ind, 1] for i, statistic in enumerate([stats.pearsonr, stats.spearmanr, stats.kendalltau]): results[i, j, k] = statistic(x, y)[0] return results results = compute_results(N, Nbootstraps) #------------------------------------------------------------ # Plot the results in a three-panel plot fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(bottom=0.1, top=0.95, hspace=0.25) histargs = (dict(alpha=0.5, label='No Outliers'), dict(alpha=0.8, label='%i%s Outliers' % (int(f * 100), pct))) distributions = ['Pearson-r', 'Spearman-r', r'Kendall-$\tau$'] xlabels = ['r_p', 'r_s', r'\tau'] for i in range(3): ax = fig.add_subplot(311 + i) for j in range(2): ax.hist(results[i, j], 40, histtype='stepfilled', fc='gray', normed=True, **histargs[j]) if i == 0: ax.legend(loc=2) ylim = ax.get_ylim() if i < 2: ax.plot([rho_true, rho_true], ylim, '--k', lw=1) ax.set_xlim(0.34, 0.701) else: ax.plot([tau_true, tau_true], ylim, '--k', lw=1) ax.set_xlim(0.31, 0.48) ax.set_ylim(ylim) ax.text(0.98, 0.95, distributions[i], ha='right', va='top', transform=ax.transAxes, bbox=dict(fc='w', ec='w')) ax.set_xlabel('$%s$' % xlabels[i]) ax.set_ylabel('$N(%s)$' % xlabels[i]) plt.show() astroML-0.3/book_figures/chapter3/fig_fisher_f_distribution.py0000644000076500000240000000446112252721253025464 0ustar jakevdpstaff00000000000000""" Example of Fisher's F distribution ------------------------------------ Figure 3.16. This shows an example of Fisher's F distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.f(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import f as fisher_f from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
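# A minimal usage sketch of the generic scipy.stats pattern described in the
# docstring above, applied to Fisher's F distribution.  The parameter values
# here (d1 = 5, d2 = 2, the 95% level) are illustrative only, not those used
# in the figure:
from scipy import stats as _stats
_f_dist = _stats.f(5, 2)         # d1 = 5, d2 = 2 degrees of freedom
_f_samples = _f_dist.rvs(1000)   # 1000 random draws
_f_crit = _f_dist.ppf(0.95)      # 95th percentile (one-sided critical value)
print("F(5, 2): 95%% critical value = %.2f" % _f_crit)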
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted mu = 0 d1_values = [1, 5, 2, 10] d2_values = [1, 2, 5, 50] linestyles = ['-', '--', ':', '-.'] x = np.linspace(0, 5, 1001)[1:] fig, ax = plt.subplots(figsize=(5, 3.75)) for (d1, d2, ls) in zip(d1_values, d2_values, linestyles): dist = fisher_f(d1, d2, mu) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$d_1=%i,\ d_2=%i$' % (d1, d2)) plt.xlim(0, 4) plt.ylim(0.0, 1.0) plt.xlabel('$x$') plt.ylabel(r'$p(x|d_1, d_2)$') plt.title("Fisher's Distribution") plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_flux_errors.py0000644000076500000240000000564012252721253023452 0ustar jakevdpstaff00000000000000""" Flux Errors ----------- Figure 3.5. An example of Gaussian flux errors becoming non-Gaussian magnitude errors. The dotted line shows the location of the mean flux; note that this is not coincident with the peak of the magnitude distribution. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
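# The skewed magnitude distribution computed below can be anticipated with
# standard error propagation: for mag = -2.5 log10(flux), a small flux error
# sigma_f maps to sigma_mag ~ (2.5 / ln 10) * sigma_f / flux, and the
# linearization breaks down when sigma_f / flux is not small.  A quick check
# using the sigma_f = 0.25, flux = 1 values from the code below:
import numpy as np
_sigma_mag = 2.5 / np.log(10) * 0.25 / 1.0
print("linearized magnitude error: %.3f mag" % _sigma_mag)  # ~0.27 mag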
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Create our data # generate 10000 normally distributed points np.random.seed(1) dist = norm(1, 0.25) flux = dist.rvs(10000) flux_fit = np.linspace(0.001, 2, 1000) pdf_flux_fit = dist.pdf(flux_fit) # transform this distribution into magnitude space mag = -2.5 * np.log10(flux) mag_fit = -2.5 * np.log10(flux_fit) pdf_mag_fit = pdf_flux_fit.copy() pdf_mag_fit[1:] /= abs(mag_fit[1:] - mag_fit[:-1]) pdf_mag_fit /= np.dot(pdf_mag_fit[1:], abs(mag_fit[1:] - mag_fit[:-1])) #------------------------------------------------------------ # Plot the result fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.17, top=0.9, left=0.12, right=0.95, wspace=0.3) # first plot the flux distribution ax = fig.add_subplot(121) ax.hist(flux, bins=np.linspace(0, 2, 50), histtype='stepfilled', fc='gray', alpha=0.5, normed=True) ax.plot(flux_fit, pdf_flux_fit, '-k') ax.plot([1, 1], [0, 2], ':k', lw=1) ax.set_xlim(-0.1, 2.1) ax.set_ylim(0, 1.8) ax.set_xlabel(r'${\rm flux}$') ax.set_ylabel(r'$p({\rm flux})$') ax.yaxis.set_major_locator(plt.MultipleLocator(0.4)) ax.text(0.04, 0.98, r'${\rm 20\%\ flux\ error}$', ha='left', va='top', transform=ax.transAxes, bbox=dict(ec='none', fc='w')) # next plot the magnitude distribution ax = fig.add_subplot(122) ax.hist(mag, bins=np.linspace(-1, 2, 50), histtype='stepfilled', fc='gray', alpha=0.5, normed=True) ax.plot(mag_fit, pdf_mag_fit, '-k') ax.plot([0, 0], [0, 2], ':k', lw=1) ax.set_xlim(-1.1, 1.1) ax.set_ylim(0, 1.8) ax.yaxis.set_major_locator(plt.MultipleLocator(0.4)) ax.text(0.04, 0.98, r'${\rm mag} = -2.5\log_{10}({\rm flux})$', ha='left', va='top', transform=ax.transAxes, bbox=dict(ec='none', fc='w')) ax.set_xlabel(r'${\rm mag}$') ax.set_ylabel(r'$p({\rm mag})$') plt.show() astroML-0.3/book_figures/chapter3/fig_gamma_distribution.py0000644000076500000240000000452612252721253024763 0ustar jakevdpstaff00000000000000""" Example of a Gamma distribution ------------------------------- Figure 3.18. This shows an example of a gamma distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.gamma(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import gamma from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. 
This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # plot the distributions k_values = [1, 2, 3, 5] theta_values = [2, 1, 1, 0.5] linestyles = ['-', '--', ':', '-.'] x = np.linspace(1E-6, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for k, t, ls in zip(k_values, theta_values, linestyles): dist = gamma(k, 0, t) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$k=%.1f,\ \theta=%.1f$' % (k, t)) plt.xlim(0, 10) plt.ylim(0, 0.45) plt.xlabel('$x$') plt.ylabel(r'$p(x|k,\theta)$') plt.title('Gamma Distribution') plt.legend(loc=0) plt.show() astroML-0.3/book_figures/chapter3/fig_gaussian_distribution.py0000644000076500000240000000460112252721253025505 0ustar jakevdpstaff00000000000000""" Example of a Gaussian distribution ---------------------------------- Figure 3.8. This shows an example of a gaussian distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.norm(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import norm from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
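# A quick numerical check of the distribution-object methods listed in the
# docstring above, using the standard normal: the familiar 68-95-99.7 rule
# follows directly from dist.cdf.
from scipy.stats import norm as _norm
_std_normal = _norm(0, 1)
for _n in (1, 2, 3):
    # prints ~0.6827, 0.9545, 0.9973
    print("P(|x| < %i sigma) = %.4f"
          % (_n, _std_normal.cdf(_n) - _std_normal.cdf(-_n)))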
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distributions to be plotted sigma_values = [0.5, 1.0, 2.0] linestyles = ['-', '--', ':'] mu = 0 x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for sigma, ls in zip(sigma_values, linestyles): # create a gaussian / normal distribution dist = norm(mu, sigma) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$\mu=%i,\ \sigma=%.1f$' % (mu, sigma)) plt.xlim(-5, 5) plt.ylim(0, 0.85) plt.xlabel('$x$') plt.ylabel(r'$p(x|\mu,\sigma)$') plt.title('Gaussian Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_kurtosis_skew.py0000644000076500000240000000552612252721253024017 0ustar jakevdpstaff00000000000000r""" Kurtosis and Skew ----------------- Figure 3.6. An example of distributions with different skewness :math:`\Sigma` (top panel) and kurtosis K (bottom panel). The modified Gaussian in the upper panel is a normal distribution multiplied by a Gram-Charlier series (see eq. 4.70), with a0 = 2, a1 = 1, and a2 = 0.5. The log-normal has :math:`\sigma = 1.2`. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy import stats from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
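# Sample-based estimates of the skewness and kurtosis illustrated below can
# be computed with scipy.stats.  A small sketch (the log-normal sigma = 1.2
# matches the figure; the seed and sample size are arbitrary, and sample
# skewness converges slowly for such heavy-tailed distributions):
import numpy as np
from scipy import stats as _stats
np.random.seed(0)
_ln_sample = _stats.lognorm(1.2).rvs(100000)
print("sample skewness: %.1f" % _stats.skew(_ln_sample))
print("sample kurtosis: %.1f" % _stats.kurtosis(_ln_sample))  # 0 for a Gaussian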
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) fig = plt.figure(figsize=(5, 6.25)) fig.subplots_adjust(right=0.95, hspace=0.05, bottom=0.07, top=0.95) # First show distributions with different skew ax = fig.add_subplot(211) x = np.linspace(-8, 8, 1000) N = stats.norm(0, 1) l1, = ax.plot(x, N.pdf(x), '-k', label=r'${\rm Gaussian,}\ \Sigma=0$') l2, = ax.plot(x, 0.5 * N.pdf(x) * (2 + x + 0.5 * (x * x - 1)), '--k', label=r'${\rm mod.\ Gauss,}\ \Sigma=-0.36$') l3, = ax.plot(x[499:], stats.lognorm(1.2).pdf(x[499:]), '-.k', label=r'$\rm log\ normal,\ \Sigma=11.2$') ax.set_xlim(-5, 5) ax.set_ylim(0, 0.7001) ax.set_ylabel('$p(x)$') ax.xaxis.set_major_formatter(plt.NullFormatter()) # trick to show multiple legends leg1 = ax.legend([l1], [l1.get_label()], loc=1) leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2) ax.add_artist(leg1) ax.set_title(r'Skew $\Sigma$ and Kurtosis $K$') # next show distributions with different kurtosis ax = fig.add_subplot(212) x = np.linspace(-5, 5, 1000) l1, = ax.plot(x, stats.laplace(0, 1).pdf(x), '--k', label=r'${\rm Laplace,}\ K=+3$') l2, = ax.plot(x, stats.norm(0, 1).pdf(x), '-k', label=r'${\rm Gaussian,}\ K=0$') l3, = ax.plot(x, stats.cosine(0, 1).pdf(x), '-.k', label=r'${\rm Cosine,}\ K=-0.59$') l4, = ax.plot(x, stats.uniform(-2, 4).pdf(x), ':k', label=r'${\rm Uniform,}\ K=-1.2$') ax.set_xlim(-5, 5) ax.set_ylim(0, 0.55) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') # trick to show multiple legends leg1 = ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=2) leg2 = ax.legend((l3, l4), (l3.get_label(), l4.get_label()), loc=1) ax.add_artist(leg1) plt.show() astroML-0.3/book_figures/chapter3/fig_laplace_distribution.py0000644000076500000240000000454312252721253025301 0ustar jakevdpstaff00000000000000""" Example of a Laplace distribution ---------------------------------- Figure 3.13. This shows an example of a Laplace distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.laplace(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import laplace from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
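# One practical property of the Laplace distribution: the maximum-likelihood
# estimate of the location parameter mu is the sample median, not the sample
# mean.  A minimal numerical sketch (seed and sample size are arbitrary):
import numpy as np
from scipy.stats import laplace as _laplace
np.random.seed(0)
_lap_sample = _laplace(0, 1).rvs(10000)
print("mean: %+.3f   median: %+.3f"
      % (np.mean(_lap_sample), np.median(_lap_sample)))
print("ML fit of (loc, scale):", _laplace.fit(_lap_sample))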
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted delta_values = [0.5, 1.0, 2.0] linestyles = ['-', '--', ':'] mu = 0 x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for delta, ls in zip(delta_values, linestyles): dist = laplace(mu, delta) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta)) plt.xlim(-6, 6) plt.ylim(0, 1.0) plt.xlabel('$x$') plt.ylabel(r'$p(x|\mu,\Delta)$') plt.title('Laplace Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_poisson_distribution.py0000644000076500000240000000542112420767763025373 0ustar jakevdpstaff00000000000000""" Example of a Poisson distribution ---------------------------------- Figure 3.10. This shows an example of a Poisson distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.poisson(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import poisson from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted mu_values = [1, 5, 15] linestyles = ['-', '--', ':'] #------------------------------------------------------------ # plot the distributions # we generate it using scipy.stats.poisson(). Once the distribution # object is created, we have many options: for example # - dist.pmf(x) evaluates the probability mass function in the case of # discrete distributions. # - dist.pdf(x) evaluates the probability density function for # continuous distributions. fig, ax = plt.subplots(figsize=(5, 3.75)) for mu, ls in zip(mu_values, linestyles): # create a poisson distribution # we could generate a random sample from this distribution using, e.g.
# rand = dist.rvs(1000) dist = poisson(mu) x = np.arange(-1, 200) plt.plot(x, dist.pmf(x), color='black', linestyle='steps-mid' + ls, label=r'$\mu=%i$' % mu) plt.xlim(-0.5, 30) plt.ylim(0, 0.4) plt.xlabel('$x$') plt.ylabel(r'$p(x|\mu)$') plt.title('Poisson Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_prob_sum.py0000644000076500000240000000305612252721253022725 0ustar jakevdpstaff00000000000000""" Sum of Probabilities -------------------- Figure 3.1. A representation of the sum of probabilities shown in eq.3.1. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) # create plot fig = plt.figure(figsize=(5, 3.75), facecolor='w') ax = plt.axes([0, 0, 1, 1], xticks=[], yticks=[], frameon=False) # draw intersecting circles ax.add_patch(plt.Circle((1.5, 0.2), 2.2, fc='gray', ec='black', alpha=0.5)) ax.add_patch(plt.Circle((-1.5, 0.2), 2.2, fc='gray', ec='black', alpha=0.5)) # add text text_kwargs = dict(ha='center', va='center', fontsize=12) ax.text(-1.6, 0.2, "$p(A)$", **text_kwargs) ax.text(1.6, 0.2, "$p(B)$", **text_kwargs) ax.text(0.0, 0.2, "$p(A \cap B)$", **text_kwargs) ax.text(0, -2.3, "$p(A \cup B) = p(A) + p(B) - p(A \cap B)$", **text_kwargs) ax.set_xlim(-4, 4) ax.set_ylim(-3, 3) plt.show() astroML-0.3/book_figures/chapter3/fig_robust_pca.py0000644000076500000240000001111512252721253023233 0ustar jakevdpstaff00000000000000r""" Bivariate Gaussian: Robust Parameter Estimation ----------------------------------------------- Figure 3.23. An example of computing the components of a bivariate Gaussian using a sample with 1000 data values (points), with two levels of contamination. The core of the distribution is a bivariate Gaussian with :math:`(\mu_x, \mu_y, \sigma_1, \sigma_2, \alpha) = (10, 10, 2, 1, 45^\odot)` The "contaminating" subsample contributes 5% (left) and 15% (right) of points centered on the same :math:`(\mu_x, \mu_y)`, and with :math:`\sigma_1 = \sigma_2 = 5`. Ellipses show the 1- and 3-sigma contours. The solid lines correspond to the input distribution. The thin dotted lines show the nonrobust estimate, and the dashed lines show the robust estimate of the best-fit distribution parameters (see Section 3.5.3 for details). 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy import stats from matplotlib import pyplot as plt from matplotlib.patches import Ellipse from astroML.stats import fit_bivariate_normal from astroML.stats.random import bivariate_normal # percent sign needs to be escaped if usetex is activated import matplotlib if matplotlib.rcParams.get('text.usetex'): pct = r'\%' else: pct = r'%' #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) N = 1000 sigma1 = 2.0 sigma2 = 1.0 mu = [10, 10] alpha_deg = 45.0 alpha = alpha_deg * np.pi / 180 #------------------------------------------------------------ # Draw N points from a multivariate normal distribution # # we use the bivariate_normal function from astroML. A more # general function for this is numpy.random.multivariate_normal(), # which requires the user to specify the full covariance matrix. # bivariate_normal() generates this covariance matrix for the # given inputs np.random.seed(0) X = bivariate_normal(mu, sigma1, sigma2, alpha, N) #------------------------------------------------------------ # Create the figure showing the fits fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.05, bottom=0.15, top=0.95) # We'll create two figures, with two levels of contamination for i, f in enumerate([0.05, 0.15]): ax = fig.add_subplot(1, 2, i + 1) # add outliers distributed using a bivariate normal. X[:int(f * N)] = bivariate_normal((10, 10), 2, 4, 45 * np.pi / 180., int(f * N)) x, y = X.T # compute the non-robust statistics (mu_nr, sigma1_nr, sigma2_nr, alpha_nr) = fit_bivariate_normal(x, y, robust=False) # compute the robust statistics (mu_r, sigma1_r, sigma2_r, alpha_r) = fit_bivariate_normal(x, y, robust=True) # scatter the points ax.scatter(x, y, s=2, lw=0, c='k', alpha=0.5) # Draw elipses showing the fits for Nsig in [1, 3]: # True fit E = Ellipse((10, 10), sigma1 * Nsig, sigma2 * Nsig, alpha_deg, ec='k', fc='none') ax.add_patch(E) # Non-robust fit E = Ellipse(mu_nr, sigma1_nr * Nsig, sigma2_nr * Nsig, (alpha_nr * 180. / np.pi), ec='k', fc='none', linestyle='dotted') ax.add_patch(E) # Robust fit E = Ellipse(mu_r, sigma1_r * Nsig, sigma2_r * Nsig, (alpha_r * 180. 
/ np.pi), ec='k', fc='none', linestyle='dashed') ax.add_patch(E) ax.text(0.04, 0.96, '%i%s outliers' % (f * 100, pct), ha='left', va='top', transform=ax.transAxes) ax.set_xlim(5.5, 14.5) ax.set_ylim(5.5, 14.5) ax.set_xlabel('$x$') # This is a bit of a hack: # We'll draw some lines off the picture to make our legend look better ax.plot([0], [0], '-k', label='Input') ax.plot([0], [0], ':k', label='Fit') ax.plot([0], [0], '--k', label='Robust Fit') ax.legend(loc='lower right') if i == 0: ax.set_ylabel('$y$') else: ax.yaxis.set_major_formatter(plt.NullFormatter()) plt.show() astroML-0.3/book_figures/chapter3/fig_student_t_distribution.py0000644000076500000240000000465612252721253025716 0ustar jakevdpstaff00000000000000""" Example of Student's t distribution ----------------------------------- Figure 3.15. This shows an example of Student's t distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.student_t(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import t as student_t from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted mu = 0 k_values = [1E10, 2, 1, 0.5] linestyles = ['-', '--', ':', '-.'] x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for k, ls in zip(k_values, linestyles): dist = student_t(k, 0) if k >= 1E10: label = r'$\mathrm{t}(k=\infty)$' else: label = r'$\mathrm{t}(k=%.1f)$' % k plt.plot(x, dist.pdf(x), ls=ls, c='black', label=label) plt.xlim(-5, 5) plt.ylim(0.0, 0.45) plt.xlabel('$x$') plt.ylabel(r'$p(x|k)$') plt.title("Student's $t$ Distribution") plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_transform_distribution.py0000644000076500000240000000474112252721253025713 0ustar jakevdpstaff00000000000000r""" Transformation of Distribution ------------------------------ Figure 3.4. An example of transforming a uniform distribution. In the left panel, x is sampled from a uniform distribution of unit width centered on x = 0.5 (:math:`\mu` = 0 and W = 1; see Section 3.3.1). 
In the right panel, the distribution is transformed via y = exp(x). The form of the resulting pdf is computed from eq. 3.20. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy import stats from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up the data np.random.seed(0) # create a uniform distribution uniform_dist = stats.uniform(0, 1) x_sample = uniform_dist.rvs(1000) x = np.linspace(-0.5, 1.5, 1000) Px = uniform_dist.pdf(x) # transform the data y_sample = np.exp(x_sample) y = np.exp(x) Py = Px / y #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.11, right=0.95, wspace=0.3, bottom=0.17, top=0.9) ax = fig.add_subplot(121) ax.hist(x_sample, 20, histtype='stepfilled', fc='#CCCCCC', normed=True) ax.plot(x, Px, '-k') ax.set_xlim(-0.2, 1.2) ax.set_ylim(0, 1.4001) ax.xaxis.set_major_locator(plt.MaxNLocator(6)) ax.text(0.95, 0.95, r'$p_x(x) = {\rm Uniform}(x)$', va='top', ha='right', transform=ax.transAxes) ax.set_xlabel('$x$') ax.set_ylabel('$p_x(x)$') ax = fig.add_subplot(122) ax.hist(y_sample, 20, histtype='stepfilled', fc='#CCCCCC', normed=True) ax.plot(y, Py, '-k') ax.set_xlim(0.85, 2.9) ax.xaxis.set_major_locator(plt.MaxNLocator(6)) ax.text(0.95, 0.95, '$y=\exp(x)$\n$p_y(y)=p_x(\ln y) / y$', va='top', ha='right', transform=ax.transAxes) ax.set_xlabel('$y$') ax.set_ylabel('$p_y(y)$') plt.show() astroML-0.3/book_figures/chapter3/fig_uniform_distribution.py0000644000076500000240000000453612252721253025361 0ustar jakevdpstaff00000000000000""" Example of a uniform distribution --------------------------------- Figure 3.7. This shows an example of a uniform distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.uniform(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import uniform from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted W_values = [1.0, 2.0, 3.0] linestyles = ['-', '--', ':'] mu = 0 x = np.linspace(-2, 2, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for W, ls in zip(W_values, linestyles): left = mu - 0.5 * W dist = uniform(left, W) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$\mu=%i,\ W=%i$' % (mu, W)) plt.xlim(-1.7, 1.7) plt.ylim(0, 1.2) plt.xlabel('$x$') plt.ylabel(r'$p(x|\mu, W)$') plt.title('Uniform Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/fig_uniform_mean.py0000644000076500000240000001044512420767763023573 0ustar jakevdpstaff00000000000000r""" Convergence of mean for uniformly distributed values ---------------------------------------------------- Figure 3.21. A comparison of the sample-size dependence of two estimators for the location parameter of a uniform distribution, with the sample size ranging from N = 100 to N =10,000. The estimator in the top panel is the sample mean, and the estimator in the bottom panel is the mean value of two extreme values. The theoretical 1-, 2-, and 3-sigma contours are shown for comparison. When using the sample mean to estimate the location parameter, the uncertainty decreases proportionally to 1/ N, and when using the mean of two extreme values as 1/N. Note different vertical scales for the two panels. The two methods of estimating the mean :math:`\mu` are: - :math:`\bar\mu = \mathrm{mean}(x)`, with an error that scales as :math:`1/\sqrt{N}`. - :math:`\bar\mu = \frac{1}{2}[\mathrm{max}(x) + \mathrm{min}(x)]`, with an error that scales as :math:`1/N`. The shaded regions on the plot show the expected 1, 2, and 3-:math:`\sigma` error. Notice the difference in scale between the y-axes of the two plots. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import uniform #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate the random distribution np.random.seed(0) N = (10 ** np.linspace(2, 4, 1000)).astype(int) mu = 0 W = 2 rng = uniform(mu - 0.5 * W, W) # uniform distribution between mu - W/2 and mu + W/2 #------------------------------------------------------------ # Compute the cumulative mean and min/max estimator of the sample mu_estimate_mean = np.zeros(N.shape) mu_estimate_minmax = np.zeros(N.shape) for i in range(len(N)): x = rng.rvs(N[i]) # generate N[i] uniformly distributed values mu_estimate_mean[i] = np.mean(x) mu_estimate_minmax[i] = 0.5 * (np.min(x) + np.max(x)) # compute the expected scalings of the estimator uncertainties N_scaling = 2. * W / N / np.sqrt(12) root_N_scaling = W / np.sqrt(N * 12) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0, bottom=0.15, left=0.15) # upper plot: mean statistic ax = fig.add_subplot(211, xscale='log') ax.scatter(N, mu_estimate_mean, c='b', lw=0, s=4) # draw shaded sigma contours for nsig in (1, 2, 3): ax.fill(np.hstack((N, N[::-1])), np.hstack((nsig * root_N_scaling, -nsig * root_N_scaling[::-1])), 'b', alpha=0.2) ax.set_xlim(N[0], N[-1]) ax.set_ylim(-0.199, 0.199) ax.set_ylabel(r'$\bar{\mu}$') ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.text(0.99, 0.95, r'$\bar\mu = \mathrm{mean}(x)$', ha='right', va='top', transform=ax.transAxes) ax.text(0.99, 0.02, r'$\sigma = \frac{1}{\sqrt{12}}\cdot\frac{W}{\sqrt{N}}$', ha='right', va='bottom', transform=ax.transAxes) # lower plot: min/max statistic ax = fig.add_subplot(212, xscale='log') ax.scatter(N, mu_estimate_minmax, c='g', lw=0, s=4) # draw shaded sigma contours for nsig in (1, 2, 3): ax.fill(np.hstack((N, N[::-1])), np.hstack((nsig * N_scaling, -nsig * N_scaling[::-1])), 'g', alpha=0.2) ax.set_xlim(N[0], N[-1]) ax.set_ylim(-0.0399, 0.0399) ax.set_xlabel('$N$') ax.set_ylabel(r'$\bar{\mu}$') ax.text(0.99, 0.95, r'$\bar\mu = \frac{1}{2}[\mathrm{max}(x) + \mathrm{min}(x)]$', ha='right', va='top', transform=ax.transAxes) ax.text(0.99, 0.02, r'$\sigma = \frac{1}{\sqrt{12}}\cdot\frac{2W}{N}$', ha='right', va='bottom', transform=ax.transAxes) plt.show() astroML-0.3/book_figures/chapter3/fig_weibull_distribution.py0000644000076500000240000000461412252721253025342 0ustar jakevdpstaff00000000000000""" Example of a Weibull distribution ---------------------------------- Figure 3.19. This shows an example of a Weibull distribution with various parameters. We'll generate the distribution using:: dist = scipy.stats.dweibull(...) Where ... should be filled in with the desired distribution parameters Once we have defined the distribution parameters in this way, these distribution objects have many useful methods; for example: * ``dist.pmf(x)`` computes the Probability Mass Function at values ``x`` in the case of discrete distributions * ``dist.pdf(x)`` computes the Probability Density Function at values ``x`` in the case of continuous distributions * ``dist.rvs(N)`` computes ``N`` random variables distributed according to the given distribution Many further options exist; refer to the documentation of ``scipy.stats`` for more details.
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import dweibull from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted k_values = [0.5, 1, 2, 2] lam_values = [1, 1, 1, 2] linestyles = ['-', '--', ':', '-.', '--'] mu = 0 x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for (k, lam, ls) in zip(k_values, lam_values, linestyles): dist = dweibull(k, mu, lam) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$k=%.1f,\ \lambda=%i$' % (k, lam)) plt.xlim(0, 5) plt.ylim(0, 0.6) plt.xlabel('$x$') plt.ylabel(r'$p(x|k,\lambda)$') plt.title('Weibull Distribution') plt.legend() plt.show() astroML-0.3/book_figures/chapter3/README.rst0000644000076500000240000000044112115147567021372 0ustar jakevdpstaff00000000000000Chapter 3: Probability and Statistical Distributions ---------------------------------------------------- This chapter discusses basic foundational principles, including probability and basic descriptive statistics. It also introduces several important probability distribution functions. astroML-0.3/book_figures/chapter4/0000755000076500000240000000000012462244012017671 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter4/fig_anderson_darling.py0000644000076500000240000000720212420767763024423 0ustar jakevdpstaff00000000000000""" Gaussianity Tests ----------------- Figure 4.7. The results of the Anderson-Darling test, the Kolmogorov-Smirnov test, and the Shapiro-Wilk test when applied to a sample of 10,000 values drawn from a normal distribution (upper panel) and from a combination of two Gaussian distributions (lower panel). The functions are available in the ``scipy`` package: - The Anderson-Darling test (``scipy.stats.anderson``) - The Kolmogorov-Smirnov test (``scipy.stats.kstest``) - The Shapiro-Wilk test (``scipy.stats.shapiro``) """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from scipy import stats from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) from astroML.stats import mean_sigma, median_sigmaG # create some distributions np.random.seed(1) normal_vals = stats.norm(loc=0, scale=1).rvs(10000) dual_vals = stats.norm(0, 1).rvs(10000) dual_vals[:4000] = stats.norm(loc=3, scale=2).rvs(4000) x = np.linspace(-4, 10, 1000) normal_pdf = stats.norm(0, 1).pdf(x) dual_pdf = 0.6 * stats.norm(0, 1).pdf(x) + 0.4 * stats.norm(3, 2).pdf(x) vals = [normal_vals, dual_vals] pdf = [normal_pdf, dual_pdf] xlims = [(-4, 4), (-4, 10)] #------------------------------------------------------------ # Compute the statistics and plot the results fig = plt.figure(figsize=(5, 7)) fig.subplots_adjust(left=0.13, right=0.95, bottom=0.06, top=0.95, hspace=0.1) for i in range(2): ax = fig.add_subplot(2, 1, 1 + i) # 2 x 1 subplot # compute some statistics # scipy.stats.anderson returns (statistic, critical values, significance # levels), in that order A2, crit, sig = stats.anderson(vals[i]) D, pD = stats.kstest(vals[i], "norm") W, pW = stats.shapiro(vals[i]) mu, sigma = mean_sigma(vals[i], ddof=1) median, sigmaG = median_sigmaG(vals[i]) N = len(vals[i]) Z1 = 1.3 * abs(mu - median) / sigma * np.sqrt(N) Z2 = 1.1 * abs(sigma / sigmaG - 1) * np.sqrt(N) print(70 * '_') print(" Kolmogorov-Smirnov test: D = %.2g p = %.2g" % (D, pD)) print(" Anderson-Darling test: A^2 = %.2g" % A2) print(" significance | critical value ") print(" --------------|----------------") for j in range(len(sig)): print(" {0:.1f}% | {1:.2f}".format(sig[j], crit[j])) print(" Shapiro-Wilk test: W = %.2g p = %.2g" % (W, pW)) print(" Z_1 = %.1f" % Z1) print(" Z_2 = %.1f" % Z2) # plot a histogram ax.hist(vals[i], bins=50, normed=True, histtype='stepfilled', alpha=0.5) ax.plot(x, pdf[i], '-k') ax.set_xlim(xlims[i]) # print information on the plot info = "Anderson-Darling: $A^2 = %.2f$\n" % A2 info += "Kolmogorov-Smirnov: $D = %.2g$\n" % D info += "Shapiro-Wilk: $W = %.2g$\n" % W info += "$Z_1 = %.1f$\n$Z_2 = %.1f$" % (Z1, Z2) ax.text(0.97, 0.97, info, ha='right', va='top', transform=ax.transAxes) if i == 0: ax.set_ylim(0, 0.55) else: ax.set_ylim(0, 0.35) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') plt.show() astroML-0.3/book_figures/chapter4/fig_benjamini_method.py0000644000076500000240000000573212420577220024377 0ustar jakevdpstaff00000000000000r""" Example of Benjamini & Hochberg Method -------------------------------------- Figure 4.6. Illustration of the Benjamini and Hochberg method for :math:`10^6` points drawn from the distribution shown in figure 4.5. The solid line shows the cumulative distribution of observed p values, normalized by the sample size. The dashed lines show the cutoff for various limits on contamination rate :math:`\varepsilon` computed using eq. 4.44 (the accepted measurements are those with p smaller than that corresponding to the intersection of solid and dashed curves). The dotted line shows how the distribution would look in the absence of sources. The value of the cumulative distribution at p = 0.5 is 0.55, and yields a correction factor :math:`\lambda = 1.11` (see eq. 4.46).
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import norm from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up the background and foreground distributions background = norm(100, 10) foreground = norm(150, 12) f = 0.1 # Draw from the distribution np.random.seed(42) N = 1E6 X = np.random.random(N) mask = (X < 0.1) X[mask] = foreground.rvs(np.sum(mask)) X[~mask] = background.rvs(np.sum(~mask)) #------------------------------------------------------------ # Perform Benjamini-Hochberg method p = 1 - background.cdf(X) p_sorted = np.sort(p) #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(bottom=0.15) ax = plt.axes(xscale='log', yscale='log') # only plot every 1000th; plotting all 1E6 takes too long ax.plot(p_sorted[::1000], np.linspace(0, 1, 1000), '-k') ax.plot(p_sorted[::1000], p_sorted[::1000], ':k', lw=1) # plot the cutoffs for various values of expsilon p_reg_over_eps = 10 ** np.linspace(-3, 0, 100) for (i, epsilon) in enumerate([0.1, 0.01, 0.001, 0.0001]): x = p_reg_over_eps * epsilon y = p_reg_over_eps ax.plot(x, y, '--k') ax.text(x[1], y[1], r'$\epsilon = %.1g$' % epsilon, ha='center', va='bottom', rotation=70) ax.xaxis.set_major_locator(plt.LogLocator(base=100)) ax.set_xlim(1E-12, 1) ax.set_ylim(1E-3, 1) ax.set_xlabel('$p = 1 - H_B(i)$') ax.set_ylabel('normalized $C(p)$') plt.show() astroML-0.3/book_figures/chapter4/fig_bootstrap_gaussian.py0000644000076500000240000000543312252721253025010 0ustar jakevdpstaff00000000000000r""" Bootstrap Calculations of Error on Mean --------------------------------------- Figure 4.3. The bootstrap uncertainty estimates for the sample standard deviation :math:`\sigma` (dashed line; see eq. 3.32) and :math:`\sigma_G` (solid line; see eq. 3.36). The sample consists of N = 1000 values drawn from a Gaussian distribution with :math:`\mu = 0` and :math:`\sigma = 1`. The bootstrap estimates are based on 10,000 samples. The thin lines show Gaussians with the widths determined as :math:`s / \sqrt{2(N - 1)}` (eq. 3.35) for :math:`\sigma` and :math:`1.06 s / \sqrt{N}` (eq. 3.37) for :math:`\sigma_G`. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import norm from matplotlib import pyplot as plt from astroML.resample import bootstrap from astroML.stats import sigmaG #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) m = 1000 # number of points n = 10000 # number of bootstraps #------------------------------------------------------------ # sample values from a normal distribution np.random.seed(123) data = norm(0, 1).rvs(m) #------------------------------------------------------------ # Compute bootstrap resamplings of data mu1_bootstrap = bootstrap(data, n, np.std, kwargs=dict(axis=1, ddof=1)) mu2_bootstrap = bootstrap(data, n, sigmaG, kwargs=dict(axis=1)) #------------------------------------------------------------ # Compute the theoretical expectations for the two distributions x = np.linspace(0.8, 1.2, 1000) sigma1 = 1. / np.sqrt(2 * (m - 1)) pdf1 = norm(1, sigma1).pdf(x) sigma2 = 1.06 / np.sqrt(m) pdf2 = norm(1, sigma2).pdf(x) #------------------------------------------------------------ # Plot the results fig, ax = plt.subplots(figsize=(5, 3.75)) ax.hist(mu1_bootstrap, bins=50, normed=True, histtype='step', color='blue', ls='dashed', label=r'$\sigma\ {\rm (std. dev.)}$') ax.plot(x, pdf1, color='gray') ax.hist(mu2_bootstrap, bins=50, normed=True, histtype='step', color='red', label=r'$\sigma_G\ {\rm (quartile)}$') ax.plot(x, pdf2, color='gray') ax.set_xlim(0.82, 1.18) ax.set_xlabel(r'$\sigma$') ax.set_ylabel(r'$p(\sigma|x,I)$') ax.legend() plt.show() astroML-0.3/book_figures/chapter4/fig_chi2_eval.py0000644000076500000240000000725112252721253022735 0ustar jakevdpstaff00000000000000r""" Evaluating a model fit with chi-square -------------------------------------- Figure 4.1. The use of the :math:`\chi^2` statistic for evaluating the goodness of fit. The data here are a series of observations of the luminosity of a star, with known error bars. Our model assumes that the brightness of the star does not vary; that is, all the scatter in the data is due to measurement error. :math:`\chi^2_{\rm dof} \approx 1` indicates that the model fits the data well (upper-left panel). :math:`\chi^2_{\rm dof}` much smaller than 1 (upper-right panel) is an indication that the errors are overestimated. :math:`\chi^2_{\rm dof}` much larger than 1 is an indication either that the errors are underestimated (lower-left panel) or that the model is not a good description of the data (lower-right panel). In this last case, it is clear from the data that the star's luminosity is varying with time: this situation is be treated more fully in chapter 10. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy import stats from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate Dataset np.random.seed(1) N = 50 L0 = 10 dL = 0.2 t = np.linspace(0, 1, N) L_obs = np.random.normal(L0, dL, N) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.05, bottom=0.1, top=0.95, hspace=0.05) y_vals = [L_obs, L_obs, L_obs, L_obs + 0.5 - t ** 2] y_errs = [dL, dL * 2, dL / 2, dL] titles = ['correct errors', 'overestimated errors', 'underestimated errors', 'incorrect model'] for i in range(4): ax = fig.add_subplot(2, 2, 1 + i, xticks=[]) # compute the mean and the chi^2/dof mu = np.mean(y_vals[i]) z = (y_vals[i] - mu) / y_errs[i] chi2 = np.sum(z ** 2) chi2dof = chi2 / (N - 1) # compute the standard deviations of chi^2/dof sigma = np.sqrt(2. / (N - 1)) nsig = (chi2dof - 1) / sigma # plot the points with errorbars ax.errorbar(t, y_vals[i], y_errs[i], fmt='.k', ecolor='gray', lw=1) ax.plot([-0.1, 1.3], [L0, L0], ':k', lw=1) # Add labels and text ax.text(0.95, 0.95, titles[i], ha='right', va='top', transform=ax.transAxes, bbox=dict(boxstyle='round', fc='w', ec='k')) ax.text(0.02, 0.02, r'$\hat{\mu} = %.2f$' % mu, ha='left', va='bottom', transform=ax.transAxes) ax.text(0.98, 0.02, r'$\chi^2_{\rm dof} = %.2f\, (%.2g\,\sigma)$' % (chi2dof, nsig), ha='right', va='bottom', transform=ax.transAxes) # set axis limits ax.set_xlim(-0.05, 1.05) ax.set_ylim(8.6, 11.4) # set ticks and labels ax.yaxis.set_major_locator(plt.MultipleLocator(1)) if i > 1: ax.set_xlabel('observations') if i % 2 == 0: ax.set_ylabel('Luminosity') else: ax.yaxis.set_major_formatter(plt.NullFormatter()) plt.show() astroML-0.3/book_figures/chapter4/fig_classification_example.py0000644000076500000240000000416112252721253025604 0ustar jakevdpstaff00000000000000""" Example of classification ------------------------- Figure 4.5. An example of a simple classification problem between two Gaussian distributions. Given a value of x, we need to assign that measurement to one of the two distributions (background vs. source). The cut at xc = 120 leads to very few Type II errors (i.e., false negatives: points from the distribution hS with x < xc being classified as background), but this comes at the cost of a significant number of Type I errors (i.e., false positives: points from the distribution :math:`h_B` with x > xc being classified as sources). 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import norm from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate and draw the curves x = np.linspace(50, 200, 1000) p1 = 0.9 * norm(100, 10).pdf(x) p2 = 0.1 * norm(150, 12).pdf(x) fig, ax = plt.subplots(figsize=(5, 3.75)) ax.fill(x, p1, ec='k', fc='#AAAAAA', alpha=0.5) ax.fill(x, p2, '-k', fc='#AAAAAA', alpha=0.5) ax.plot([120, 120], [0.0, 0.04], '--k') ax.text(100, 0.036, r'$h_B(x)$', ha='center', va='bottom') ax.text(150, 0.0035, r'$h_S(x)$', ha='center', va='bottom') ax.text(122, 0.039, r'$x_c=120$', ha='left', va='top') ax.text(125, 0.01, r'$(x > x_c\ {\rm classified\ as\ sources})$') ax.set_xlim(50, 200) ax.set_ylim(0, 0.04) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') plt.show() astroML-0.3/book_figures/chapter4/fig_GMM_1D.py0000644000076500000240000001034412252721253022042 0ustar jakevdpstaff00000000000000""" 1D Gaussian Mixture Example --------------------------- Figure 4.2. Example of a one-dimensional Gaussian mixture model with three components. The left panel shows a histogram of the data, along with the best-fit model for a mixture with three components. The center panel shows the model selection criteria AIC (see Section 4.3) and BIC (see Section 5.4) as a function of the number of components. Both are minimized for a three-component model. The right panel shows the probability that a given point is drawn from each class as a function of its position. For a given x value, the vertical extent of each region is proportional to that probability. Note that extreme values are most likely to belong to class 1. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from matplotlib import pyplot as plt import numpy as np from sklearn.mixture import GMM #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up the dataset. # We'll use scikit-learn's Gaussian Mixture Model to sample # data from a mixture of Gaussians. The usual way of using # this involves fitting the mixture to data: we'll see that # below. 
Here we'll set the internal means, covariances, # and weights by-hand. np.random.seed(1) gmm = GMM(3, n_iter=1) gmm.means_ = np.array([[-1], [0], [3]]) gmm.covars_ = np.array([[1.5], [1], [0.5]]) ** 2 gmm.weights_ = np.array([0.3, 0.5, 0.2]) X = gmm.sample(1000) #------------------------------------------------------------ # Learn the best-fit GMM models # Here we'll use GMM in the standard way: the fit() method # uses an Expectation-Maximization approach to find the best # mixture of Gaussians for the data # fit models with 1-10 components N = np.arange(1, 11) models = [None for i in range(len(N))] for i in range(len(N)): models[i] = GMM(N[i]).fit(X) # compute the AIC and the BIC AIC = [m.aic(X) for m in models] BIC = [m.bic(X) for m in models] #------------------------------------------------------------ # Plot the results # We'll use three panels: # 1) data + best-fit mixture # 2) AIC and BIC vs number of components # 3) probability that a point came from each component fig = plt.figure(figsize=(5, 1.7)) fig.subplots_adjust(left=0.12, right=0.97, bottom=0.21, top=0.9, wspace=0.5) # plot 1: data + best-fit mixture ax = fig.add_subplot(131) M_best = models[np.argmin(AIC)] x = np.linspace(-6, 6, 1000) logprob, responsibilities = M_best.eval(x) pdf = np.exp(logprob) pdf_individual = responsibilities * pdf[:, np.newaxis] ax.hist(X, 30, normed=True, histtype='stepfilled', alpha=0.4) ax.plot(x, pdf, '-k') ax.plot(x, pdf_individual, '--k') ax.text(0.04, 0.96, "Best-fit Mixture", ha='left', va='top', transform=ax.transAxes) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') # plot 2: AIC and BIC ax = fig.add_subplot(132) ax.plot(N, AIC, '-k', label='AIC') ax.plot(N, BIC, '--k', label='BIC') ax.set_xlabel('n. components') ax.set_ylabel('information criterion') ax.legend(loc=2) # plot 3: posterior probabilities for each component ax = fig.add_subplot(133) p = M_best.predict_proba(x) p = p[:, (1, 0, 2)] # rearrange order so the plot looks better p = p.cumsum(1).T ax.fill_between(x, 0, p[0], color='gray', alpha=0.3) ax.fill_between(x, p[0], p[1], color='gray', alpha=0.5) ax.fill_between(x, p[1], 1, color='gray', alpha=0.7) ax.set_xlim(-6, 6) ax.set_ylim(0, 1) ax.set_xlabel('$x$') ax.set_ylabel(r'$p({\rm class}|x)$') ax.text(-5, 0.3, 'class 1', rotation='vertical') ax.text(0, 0.5, 'class 2', rotation='vertical') ax.text(3, 0.3, 'class 3', rotation='vertical') plt.show() astroML-0.3/book_figures/chapter4/fig_jackknife_gaussian.py0000644000076500000240000001023212420767763024726 0ustar jakevdpstaff00000000000000r""" Jackknife Calculations of Error on Mean --------------------------------------- Figure 4.4. The jackknife uncertainty estimates for the width of a Gaussian distribution. This example uses the same data as figure 4.3. The upper panel shows a histogram of the widths determined using the sample standard deviation, and using the interquartile range. The lower panel shows the corrected jackknife estimates (eqs. 4.33 and 4.35) for the two methods. The gray lines show the theoretical results, given by eq. 3.35 for :math:`\sigma` and eq. 3.37 for :math:`\sigma_G`. The result for :math:`\sigma` matches the theoretical result almost exactly, but note the failure of the jackknife to correctly estimate :math:`\sigma_G` (see the text for a discussion of this result). 
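In the standard formulation, given an estimator :math:`\theta` computed from :math:`N` points, the jackknife forms the :math:`N` leave-one-out estimates :math:`\theta^*_i` with mean :math:`\bar{\theta}^*`; the error estimate is :math:`\sigma_\theta = \sqrt{[(N - 1)/N] \sum_i (\theta^*_i - \bar{\theta}^*)^2}` and the bias-corrected estimate is :math:`N\theta - (N - 1)\bar{\theta}^*`. These are the quantities computed by ``astroML.resample.jackknife`` in the code below.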
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from scipy.stats import norm from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # sample values from a normal distribution np.random.seed(123) m = 1000 # number of points data = norm(0, 1).rvs(m) #------------------------------------------------------------ # Compute jackknife resamplings of data from astroML.resample import jackknife from astroML.stats import sigmaG # mu1 is the mean of the standard-deviation-based width mu1, sigma_mu1, mu1_raw = jackknife(data, np.std, kwargs=dict(axis=1, ddof=1), return_raw_distribution=True) pdf1_theory = norm(1, 1. / np.sqrt(2 * (m - 1))) pdf1_jackknife = norm(mu1, sigma_mu1) # mu2 is the mean of the interquartile-based width # WARNING: do not use the following in practice. This example # shows that jackknife fails for rank-based statistics. mu2, sigma_mu2, mu2_raw = jackknife(data, sigmaG, kwargs=dict(axis=1), return_raw_distribution=True) pdf2_theory = norm(data.std(), 1.06 / np.sqrt(m)) pdf2_jackknife = norm(mu2, sigma_mu2) print(mu2, sigma_mu2) #------------------------------------------------------------ # plot the results print("mu_1 mean: %.2f +- %.2f" % (mu1, sigma_mu1)) print("mu_2 mean: %.2f +- %.2f" % (mu2, sigma_mu2)) fig = plt.figure(figsize=(5, 2)) fig.subplots_adjust(left=0.11, right=0.95, bottom=0.2, top=0.9, wspace=0.25) ax = fig.add_subplot(121) ax.hist(mu1_raw, np.linspace(0.996, 1.008, 100), label=r'$\sigma^*\ {\rm (std.\ dev.)}$', histtype='stepfilled', fc='white', normed=False) ax.hist(mu2_raw, np.linspace(0.996, 1.008, 100), label=r'$\sigma_G^*\ {\rm (quartile)}$', histtype='stepfilled', fc='gray', normed=False) ax.legend(loc='upper left', handlelength=2) ax.xaxis.set_major_locator(plt.MultipleLocator(0.004)) ax.set_xlabel(r'$\sigma^*$') ax.set_ylabel(r'$N(\sigma^*)$') ax.set_xlim(0.998, 1.008) ax.set_ylim(0, 550) ax = fig.add_subplot(122) x = np.linspace(0.45, 1.15, 1000) ax.plot(x, pdf1_jackknife.pdf(x), color='blue', ls='dashed', label=r'$\sigma\ {\rm (std.\ dev.)}$', zorder=2) ax.plot(x, pdf1_theory.pdf(x), color='gray', zorder=1) ax.plot(x, pdf2_jackknife.pdf(x), color='red', label=r'$\sigma_G\ {\rm (quartile)}$', zorder=2) ax.plot(x, pdf2_theory.pdf(x), color='gray', zorder=1) plt.legend(loc='upper left', handlelength=2) ax.set_xlabel(r'$\sigma$') ax.set_ylabel(r'$p(\sigma|x,I)$') ax.set_xlim(0.45, 1.15) ax.set_ylim(0, 24) plt.show() astroML-0.3/book_figures/chapter4/fig_lyndenbell_gals.py0000644000076500000240000001621412420767763024253 0ustar jakevdpstaff00000000000000""" Lynden-Bell Luminosity function ------------------------------- Figure 4.10. 
An example of computing the luminosity function for two u-r color-selected subsamples of SDSS galaxies using Lynden-Bell's C- method. The galaxies are selected from the SDSS spectroscopic sample, with redshift in the range 0.08 < z < 0.12 and flux limited to r < 17.7. The left panels show the distribution of sources as a function of redshift and absolute magnitude. The distribution p(z, M) = rho(z) Phi(m) is obtained using Lynden-Bell's method, with errors determined by 20 bootstrap resamples. The results are shown in the right panels. For the redshift distribution, we multiply the result by z^2 for clarity. Note that the most luminous galaxies belong to the photometrically red subsample, as discernible in the bottom-right panel. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import os import numpy as np from matplotlib import pyplot as plt from scipy import interpolate, stats from astroML.lumfunc import binned_Cminus, bootstrap_Cminus from astroML.cosmology import Cosmology from astroML.datasets import fetch_sdss_specgals #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get the data and perform redshift/magnitude cuts data = fetch_sdss_specgals() z_min = 0.08 z_max = 0.12 m_max = 17.7 # redshift and magnitude cuts data = data[data['z'] > z_min] data = data[data['z'] < z_max] data = data[data['petroMag_r'] < m_max] # divide red sample and blue sample based on u-r color ur = data['modelMag_u'] - data['modelMag_r'] flag_red = (ur > 2.22) flag_blue = ~flag_red data_red = data[flag_red] data_blue = data[flag_blue] # truncate sample (optional: speeds up computation) #data_red = data_red[::10] #data_blue = data_blue[::10] print(data_red.size, "red galaxies") print(data_blue.size, "blue galaxies") #------------------------------------------------------------ # Distance Modulus calculation: # We need functions approximating mu(z) and z(mu) # where z is redshift and mu is distance modulus. # We'll accomplish this using the cosmology class and # scipy's cubic spline interpolation. cosmo = Cosmology() z_sample = np.linspace(0.01, 1.5, 100) mu_sample = [cosmo.mu(z) for z in z_sample] mu_z = interpolate.interp1d(z_sample, mu_sample) z_mu = interpolate.interp1d(mu_sample, z_sample) data = [data_red, data_blue] titles = ['$u-r > 2.22$', '$u-r < 2.22$'] markers = ['o', '^'] archive_files = ['lumfunc_red.npz', 'lumfunc_blue.npz'] def compute_luminosity_function(z, m, M, m_max, archive_file): """Compute the luminosity function and archive in the given file. If the file exists, then the saved results are returned. 
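Parameters ---------- z : array_like redshifts of the galaxies in the subsample m : array_like apparent magnitudes M : array_like absolute magnitudes m_max : float limiting apparent magnitude of the sample archive_file : str path to the npz file in which bootstrapped results are cached Returns ------- zbins, dist_z, err_z, Mbins, dist_M, err_M : ndarray bin edges, bootstrapped distributions, and errors in z and M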
""" Mmax = m_max - (m - M) zmax = z_mu(m_max - M) if not os.path.exists(archive_file): print("- computing bootstrapped luminosity function ", "for {0} points".format(len(z))) zbins = np.linspace(0.08, 0.12, 21) Mbins = np.linspace(-24, -20.2, 21) dist_z, err_z, dist_M, err_M = bootstrap_Cminus(z, M, zmax, Mmax, zbins, Mbins, Nbootstraps=20, normalize=True) np.savez(archive_file, zbins=zbins, dist_z=dist_z, err_z=err_z, Mbins=Mbins, dist_M=dist_M, err_M=err_M) else: print("- using precomputed bootstrapped luminosity function results") archive = np.load(archive_file) zbins = archive['zbins'] dist_z = archive['dist_z'] err_z = archive['err_z'] Mbins = archive['Mbins'] dist_M = archive['dist_M'] err_M = archive['err_M'] return zbins, dist_z, err_z, Mbins, dist_M, err_M #------------------------------------------------------------ # Perform the computation and plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.13, right=0.95, wspace=0.3, bottom=0.08, top=0.95, hspace=0.2) for i in range(2): m = data[i]['petroMag_r'] z = data[i]['z'] M = m - mu_z(z) # compute the luminosity function for the given subsample zbins, dist_z, err_z, Mbins, dist_M, err_M = \ compute_luminosity_function(z, m, M, m_max, archive_files[i]) #------------------------------------------------------------ # First axes: plot the observed 2D distribution of (z, M) ax = fig.add_subplot(2, 2, 1 + 2 * i) H, xbins, ybins = np.histogram2d(z, M, bins=(np.linspace(0.08, 0.12, 31), np.linspace(-23, -20, 41))) ax.imshow(H.T, origin='lower', aspect='auto', interpolation='nearest', cmap=plt.cm.binary, extent=(xbins[0], xbins[-1], ybins[0], ybins[-1])) # plot the cutoff curve zrange = np.linspace(0.07, 0.13, 100) Mmax = m_max - mu_z(zrange) ax.plot(zrange, Mmax, '-k') ax.text(0.95, 0.95, titles[i] + "\n$N = %i$" % len(z), ha='right', va='top', transform=ax.transAxes) ax.set_xlim(0.075, 0.125) ax.set_ylim(-22, -19.8) ax.set_xlabel('$z$') ax.set_ylabel('$M$') #------------------------------------------------------------ # Second axes: plot the inferred 1D distribution in z ax2 = fig.add_subplot(2, 2, 2) factor = 0.08 ** 2 / (0.5 * (zbins[1:] + zbins[:-1])) ** 2 ax2.errorbar(0.5 * (zbins[1:] + zbins[:-1]), factor * dist_z, factor * err_z, fmt='-k' + markers[i], ecolor='gray', lw=1, ms=4, label=titles[i]) #------------------------------------------------------------ # Third axes: plot the inferred 1D distribution in M ax3 = fig.add_subplot(224, yscale='log') # truncate the bins so the plot looks better Mbins = Mbins[3:-1] dist_M = dist_M[3:-1] err_M = err_M[3:-1] ax3.errorbar(0.5 * (Mbins[1:] + Mbins[:-1]), dist_M, err_M, fmt='-k' + markers[i], ecolor='gray', lw=1, ms=4, label=titles[i]) #------------------------------------------------------------ # set labels and limits ax2.legend(loc=1) ax2.xaxis.set_major_locator(plt.MultipleLocator(0.01)) ax2.set_xlabel(r'$z$') ax2.set_ylabel(r'$\rho(z) / [z / 0.08]^2$') ax2.set_xlim(0.075, 0.125) ax2.set_ylim(10, 25) ax3.legend(loc=3) ax3.xaxis.set_major_locator(plt.MultipleLocator(1.0)) ax3.set_xlabel(r'$M$') ax3.set_ylabel(r'$\Phi(M)$') ax3.set_xlim(-20, -23.5) ax3.set_ylim(1E-5, 2) plt.show() astroML-0.3/book_figures/chapter4/fig_lyndenbell_setup.py0000644000076500000240000000546212252721253024453 0ustar jakevdpstaff00000000000000""" Lynden-Bell C- setup -------------------- Figure 4.8. Illustration for the definition of a truncated data set, and for the comparable or associated subset used by the Lynden-Bell C- method. 
The sample is limited by x < xmax and y < ymax(x) (light-shaded area). Associated sets Ji and Jk are shown by the dark-shaded area. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Rectangle #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Draw the schematic fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.06, right=0.95, wspace=0.12) ax1 = fig.add_subplot(121, xticks=[], yticks=[]) ax2 = fig.add_subplot(122, xticks=[], yticks=[]) # define a convenient function max_func = lambda t: 1. / (0.5 + t) - 0.5 x = np.linspace(0, 1.0, 100) ymax = max_func(x) ymax[ymax > 1] = 1 # draw and label the common background for ax in (ax1, ax2): ax.fill_between(x, 0, ymax, color='gray', alpha=0.5) ax.plot([-0.1, 1], [1, 1], '--k', lw=1) ax.text(0.7, 0.35, r'$y_{\rm max}(x)$', rotation=-30) ax.plot([1, 1], [0, 1], '--k', lw=1) ax.text(1.01, 0.5, r'$x_{\rm max}$', ha='left', va='center', rotation=90) # draw and label J_i in the first axes xi = 0.4 yi = 0.35 ax1.scatter([xi], [yi], s=16, lw=0, c='k') ax1.text(xi + 0.02, yi + 0.02, ' $(x_i, y_i)$', ha='left', va='center') ax1.add_patch(Rectangle((0, 0), xi, max_func(xi), ec='k', fc='gray', linestyle='dashed', lw=1, alpha=0.5)) ax1.text(0.5 * xi, 0.5 * max_func(xi), '$J_i$', ha='center', va='center') # draw and label J_k in the second axes ax2.scatter([xi], [yi], s=16, lw=0, c='k') ax2.text(xi + 0.02, yi + 0.02, ' $(x_k, y_k)$', ha='center', va='bottom') ax2.add_patch(Rectangle((0, 0), max_func(yi), yi, ec='k', fc='gray', linestyle='dashed', lw=1, alpha=0.5)) ax2.text(0.5 * max_func(yi), 0.5 * yi, '$J_k$', ha='center', va='center') # adjust the limits of both axes for ax in (ax1, ax2): ax.set_xlim(0, 1.1) ax.set_ylim(0, 1.1) ax.set_xlabel('$x$') ax.set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter4/fig_lyndenbell_toy.py0000644000076500000240000000776212252721253024133 0ustar jakevdpstaff00000000000000""" Luminosity function code on toy data ------------------------------------ Figure 4.9. An example of using Lynden-Bell's C- method to estimate a bivariate distribution from a truncated sample. The lines in the left panel show the true one-dimensional distributions of x and y (truncated Gaussian distributions). The two-dimensional distribution is assumed to be separable; see eq. 4.85. A realization of the distribution is shown in the right panel, with a truncation given by the solid line. The points in the left panel are computed from the truncated data set using the C- method, with error bars from 20 bootstrap resamples. 
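Separability here means that the joint density factors as the product of the two one-dimensional distributions, p(x, y) = p(x) p(y); this is the key assumption behind the C- method, and it holds by construction for the independent truncated Gaussians sampled below.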
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats from astroML.lumfunc import bootstrap_Cminus #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define and sample our distributions N = 10000 np.random.seed(42) # Define the input distributions for x and y x_pdf = stats.truncnorm(-2, 1, 0.66666, 0.33333) y_pdf = stats.truncnorm(-1, 2, 0.33333, 0.33333) x = x_pdf.rvs(N) y = y_pdf.rvs(N) # define the truncation: we'll design this to be symmetric # so that xmax(y) = max_func(y) # and ymax(x) = max_func(x) max_func = lambda t: 1. / (0.5 + t) - 0.5 xmax = max_func(y) xmax[xmax > 1] = 1 # cutoff at x=1 ymax = max_func(x) ymax[ymax > 1] = 1 # cutoff at y=1 # truncate the data flag = (x < xmax) & (y < ymax) x = x[flag] y = y[flag] xmax = xmax[flag] ymax = ymax[flag] x_fit = np.linspace(0, 1, 21) y_fit = np.linspace(0, 1, 21) #------------------------------------------------------------ # compute the Cminus distributions (with bootstrap) x_dist, dx_dist, y_dist, dy_dist = bootstrap_Cminus(x, y, xmax, ymax, x_fit, y_fit, Nbootstraps=20, normalize=True) x_mid = 0.5 * (x_fit[1:] + x_fit[:-1]) y_mid = 0.5 * (y_fit[1:] + y_fit[:-1]) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2)) fig.subplots_adjust(bottom=0.2, top=0.95, left=0.1, right=0.92, wspace=0.25) # First subplot is the true & inferred 1D distributions ax = fig.add_subplot(121) ax.plot(x_mid, x_pdf.pdf(x_mid), '-k', label='$p(x)$') ax.plot(y_mid, y_pdf.pdf(y_mid), '--k', label='$p(y)$') ax.legend(loc='lower center') ax.errorbar(x_mid, x_dist, dx_dist, fmt='ok', ecolor='k', lw=1, ms=4) ax.errorbar(y_mid, y_dist, dy_dist, fmt='^k', ecolor='k', lw=1, ms=4) ax.set_ylim(0, 1.8) ax.set_xlim(0, 1) ax.set_xlabel('$x$, $y$') ax.set_ylabel('normalized distribution') # Second subplot is the "observed" 2D distribution ax = fig.add_subplot(122) H, xb, yb = np.histogram2d(x, y, bins=np.linspace(0, 1, 41)) plt.imshow(H.T, origin='lower', interpolation='nearest', extent=[0, 1, 0, 1], cmap=plt.cm.binary) cb = plt.colorbar() x_limit = np.linspace(-0.1, 1.1, 1000) y_limit = max_func(x_limit) x_limit[y_limit > 1] = 0 y_limit[x_limit > 1] = 0 ax.plot(x_limit, y_limit, '-k') ax.set_xlim(0, 1.1) ax.set_ylim(0, 1.1) ax.set_xlabel('$x$') ax.set_ylabel('$y$') cb.set_label('counts per pixel') ax.text(0.93, 0.93, '%i points' % len(x), ha='right', va='top', transform=ax.transAxes) plt.show() astroML-0.3/book_figures/chapter4/README.rst0000644000076500000240000000024412115147567021374 0ustar jakevdpstaff00000000000000Chapter 4: Classical Statistical Inference ------------------------------------------ This chapter develops the classical or "frequentist" approach to statistics. 
astroML-0.3/book_figures/chapter5/0000755000076500000240000000000012462244012017672 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter5/fig_bayes_blocks.py0000644000076500000240000000545412252721253023545 0ustar jakevdpstaff00000000000000""" Distribution Representation Comparison -------------------------------------- Figure 5.21 Comparison of Knuth's histogram and a Bayesian blocks histogram. The adaptive bin widths of the Bayesian blocks histogram yield a better representation of the underlying data, especially with fewer points. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats from astroML.plotting import hist #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate our data: a mix of several Cauchy distributions np.random.seed(0) N = 10000 mu_gamma_f = [(5, 1.0, 0.1), (7, 0.5, 0.5), (9, 0.1, 0.1), (12, 0.5, 0.2), (14, 1.0, 0.1)] true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x) for (mu, gamma, f) in mu_gamma_f]) x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N)) for (mu, gamma, f) in mu_gamma_f]) np.random.shuffle(x) x = x[x > -10] x = x[x < 30] #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(bottom=0.08, top=0.95, right=0.95, hspace=0.1) N_values = (500, 5000) subplots = (211, 212) for N, subplot in zip(N_values, subplots): ax = fig.add_subplot(subplot) xN = x[:N] t = np.linspace(-10, 30, 1000) # plot the results ax.plot(xN, -0.005 * np.ones(len(xN)), '|k') hist(xN, bins='knuth', ax=ax, normed=True, histtype='stepfilled', alpha=0.3, label='Knuth Histogram') hist(xN, bins='blocks', ax=ax, normed=True, histtype='step', color='k', label="Bayesian Blocks") ax.plot(t, true_pdf(t), '-', color='black', label="Generating Distribution") # label the plot ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top', transform=ax.transAxes) ax.set_ylabel('$p(x)$') ax.legend(loc='upper right', prop=dict(size=8)) if subplot == 212: ax.set_xlabel('$x$') ax.set_xlim(0, 20) ax.set_ylim(-0.01, 0.4001) plt.show() astroML-0.3/book_figures/chapter5/fig_cauchy_mcmc.py0000644000076500000240000001160412252721253023352 0ustar jakevdpstaff00000000000000""" MCMC for the Cauchy distribution -------------------------------- Figure 5.22 Markov chain Monte Carlo (MCMC) estimates of the posterior pdf for parameters describing the Cauchy distribution. The data are the same as those used in figure 5.10: the dashed curves in the top-right panel show the results of direct computation on a regular grid from that diagram. The solid curves are the corresponding MCMC estimates using 10,000 sample points. The left and the bottom panels show marginalized distributions.
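Note that the sampler below works with log(gamma) under a uniform prior; by the usual change of variables this is equivalent to a scale-invariant prior p(gamma) proportional to 1/gamma on the scale parameter itself.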
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy.stats import cauchy from matplotlib import pyplot as plt from astroML.plotting.mcmc import convert_to_stdev # this fixes a problem when using older versions of pymc with newer # versions of scipy import scipy scipy.derivative = scipy.misc.derivative import pymc #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def cauchy_logL(xi, sigma, mu): """Equation 5.74: cauchy likelihood""" xi = np.asarray(xi) n = xi.size shape = np.broadcast(sigma, mu).shape xi = xi.reshape(xi.shape + tuple([1 for s in shape])) return ((n - 1) * np.log(sigma) - np.sum(np.log(sigma ** 2 + (xi - mu) ** 2), 0)) #---------------------------------------------------------------------- # Draw the sample from a Cauchy distribution np.random.seed(44) mu_0 = 0 gamma_0 = 2 xi = cauchy(mu_0, gamma_0).rvs(10) #---------------------------------------------------------------------- # Perform MCMC: # set up our Stochastic variables, mu and gamma mu = pymc.Uniform('mu', -5, 5) log_gamma = pymc.Uniform('log_gamma', -10, 10, value=0) @pymc.deterministic def gamma(log_gamma=log_gamma): return np.exp(log_gamma) # set up our observed variable x x = pymc.Cauchy('x', mu, gamma, observed=True, value=xi) # set up our model dictionary model = dict(mu=mu, log_gamma=log_gamma, gamma=gamma, x=x) # perform the MCMC S = pymc.MCMC(model) S.sample(iter=50000, burn=5000) # extract the traces we're interested in trace_mu = S.trace('mu')[:] trace_gamma = S.trace('gamma')[:] # compute histogram of results to plot below L_MCMC, mu_bins, gamma_bins = np.histogram2d(trace_mu, trace_gamma, bins=(np.linspace(-5, 5, 41), np.linspace(0, 5, 41))) L_MCMC[L_MCMC == 0] = 1E-16 # prevents zero-division errors #---------------------------------------------------------------------- # Compute likelihood analytically for comparison mu = np.linspace(-5, 5, 70) gamma = np.linspace(0.1, 5, 70) logL = cauchy_logL(xi, gamma[:, np.newaxis], mu) logL -= logL.max() p_mu = np.exp(logL).sum(0) p_mu /= p_mu.sum() * (mu[1] - mu[0]) p_gamma = np.exp(logL).sum(1) p_gamma /= p_gamma.sum() * (gamma[1] - gamma[0]) hist_mu, bins_mu = np.histogram(trace_mu, bins=mu_bins, normed=True) hist_gamma, bins_gamma = np.histogram(trace_gamma, bins=gamma_bins, normed=True) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 5)) # first axis: likelihood contours ax1 = fig.add_axes((0.4, 0.4, 0.55, 0.55)) ax1.xaxis.set_major_formatter(plt.NullFormatter()) ax1.yaxis.set_major_formatter(plt.NullFormatter()) ax1.contour(mu, gamma, convert_to_stdev(logL), levels=(0.683, 0.955, 0.997), colors='b', linestyles='dashed') ax1.contour(0.5 * (mu_bins[:-1] + mu_bins[1:]), 0.5 * (gamma_bins[:-1] + gamma_bins[1:]), convert_to_stdev(np.log(L_MCMC.T)), levels=(0.683, 0.955, 0.997), colors='k') # second axis: marginalized 
over mu ax2 = fig.add_axes((0.1, 0.4, 0.29, 0.55)) ax2.xaxis.set_major_formatter(plt.NullFormatter()) ax2.plot(hist_gamma, 0.5 * (bins_gamma[1:] + bins_gamma[:-1] - bins_gamma[1] + bins_gamma[0]), '-k', drawstyle='steps') ax2.plot(p_gamma, gamma, '--b') ax2.set_ylabel(r'$\gamma$') ax2.set_ylim(0, 5) # third axis: marginalized over gamma ax3 = fig.add_axes((0.4, 0.1, 0.55, 0.29)) ax3.yaxis.set_major_formatter(plt.NullFormatter()) ax3.plot(0.5 * (bins_mu[1:] + bins_mu[:-1]), hist_mu, '-k', drawstyle='steps-mid') ax3.plot(mu, p_mu, '--b') ax3.set_xlabel(r'$\mu$') plt.xlim(-5, 5) plt.show() astroML-0.3/book_figures/chapter5/fig_distribution_gaussgauss.py0000644000076500000240000000504612420767763026073 0ustar jakevdpstaff00000000000000""" Gaussian/Gaussian distribution ------------------------------ Figure 5.6 The distribution of :math:`10^6` points drawn from :math:`\mathcal{N}(0,1)` and sampled with heteroscedastic Gaussian errors with widths :math:`e_i` uniformly distributed between 0 and 3. A linear superposition of these Gaussian distributions with widths equal to :math:`\sqrt{1 + e_i^2}` results in a non-Gaussian distribution. The best-fit Gaussians centered on the sample median with widths equal to sample standard deviation and quartile-based :math:`\sigma_G` (eq. 3.36) are shown for comparison. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm, anderson from astroML.stats import mean_sigma, median_sigmaG #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Create distributions # draw underlying points np.random.seed(0) Npts = 1E6 x = np.random.normal(loc=0, scale=1, size=Npts) # add error for each point e = 3 * np.random.random(Npts) x += np.random.normal(0, e) # compute anderson-darling test A2, sig, crit = anderson(x) print("anderson-darling A^2 = %.1f" % A2) # compute point statistics mu_sample, sig_sample = mean_sigma(x, ddof=1) med_sample, sigG_sample = median_sigmaG(x) #------------------------------------------------------------ # plot the results fig, ax = plt.subplots(figsize=(5, 3.75)) ax.hist(x, 100, histtype='stepfilled', alpha=0.2, color='k', normed=True) # plot the fitting normal curves x_sample = np.linspace(-15, 15, 1000) ax.plot(x_sample, norm(mu_sample, sig_sample).pdf(x_sample), '-k', label='$\sigma$ fit') ax.plot(x_sample, norm(med_sample, sigG_sample).pdf(x_sample), '--k', label='$\sigma_G$ fit') ax.legend() ax.set_xlim(-7.5, 7.5) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') plt.show() astroML-0.3/book_figures/chapter5/fig_gaussgauss_mcmc.py0000644000076500000240000001054712252721253024270 0ustar jakevdpstaff00000000000000""" Gaussian Distribution with Gaussian Errors ------------------------------------------ Figure 5.25 The posterior pdf for mu and sigma for a Gaussian distribution with heteroscedastic errors. This is the same data set as used in figure 5.7, but here each measurement error is assumed unknown, treated as a model parameter with a scale-invariant prior, and marginalized over to obtain the distribution of mu and sigma shown by contours. For comparison, the posterior pdf from figure 5.7 is shown by shaded contours. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt # Hack to fix import issue in older versions of pymc import scipy import scipy.misc scipy.derivative = scipy.misc.derivative import pymc from astroML.plotting.mcmc import convert_to_stdev from astroML.plotting import plot_mcmc #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def gaussgauss_logL(xi, ei, mu, sigma): """Equation 5.22: gaussian likelihood""" ndim = len(np.broadcast(sigma, mu).shape) xi = xi.reshape(xi.shape + tuple(ndim * [1])) ei = ei.reshape(ei.shape + tuple(ndim * [1])) s2_e2 = sigma ** 2 + ei ** 2 return -0.5 * np.sum(np.log(s2_e2) + (xi - mu) ** 2 / s2_e2, 0) #------------------------------------------------------------ # Select the data np.random.seed(5) mu_true = 1. sigma_true = 1. 
N = 10 ei = 3 * np.random.random(N) xi = np.random.normal(mu_true, np.sqrt(sigma_true ** 2 + ei ** 2)) #---------------------------------------------------------------------- # Set up MCMC for our model parameters: (mu, sigma, ei) mu = pymc.Uniform('mu', -10, 10, value=0) log_sigma = pymc.Uniform('log_sigma', -10, 10, value=0) log_error = pymc.Uniform('log_error', -10, 10, value=np.zeros(N)) @pymc.deterministic def sigma(log_sigma=log_sigma): return np.exp(log_sigma) @pymc.deterministic def error(log_error=log_error): return np.exp(log_error) def gaussgauss_like(x, mu, sigma, error): """likelihood of gaussian with gaussian errors""" sig2 = sigma ** 2 + error ** 2 x_mu2 = (x - mu) ** 2 return -0.5 * np.sum(np.log(sig2) + x_mu2 / sig2) GaussGauss = pymc.stochastic_from_dist('gaussgauss', logp=gaussgauss_like, dtype=np.float, mv=True) M = GaussGauss('M', mu, sigma, error, observed=True, value=xi) model = dict(mu=mu, log_sigma=log_sigma, sigma=sigma, log_error=log_error, error=error, M=M) #------------------------------------------------------------ # perform the MCMC sampling np.random.seed(0) S = pymc.MCMC(model) S.sample(iter=25000, burn=2000) #------------------------------------------------------------ # Extract the MCMC traces trace_mu = S.trace('mu')[:] trace_sigma = S.trace('sigma')[:] fig = plt.figure(figsize=(5, 3.75)) ax, = plot_mcmc([trace_mu, trace_sigma], fig=fig, limits=[(-3.2, 4.2), (0, 5)], bounds=(0.08, 0.12, 0.95, 0.95), labels=(r'$\mu$', r'$\sigma$'), levels=[0.683, 0.955, 0.997], colors='k') #---------------------------------------------------------------------- # Compute and plot likelihood with known ei for comparison # (Same as fig_likelihood_gaussgauss) sigma = np.linspace(0.01, 5, 41) mu = np.linspace(-3.2, 4.2, 41) logL = gaussgauss_logL(xi, ei, mu, sigma[:, np.newaxis]) logL -= logL.max() im = ax.contourf(mu, sigma, convert_to_stdev(logL), levels=(0, 0.683, 0.955, 0.997), cmap=plt.cm.binary_r, alpha=0.5) im.set_clim(0, 1.1) ax.set_xlabel(r'$\mu$') ax.set_ylabel(r'$\sigma$') ax.set_xlim(-3.2, 4.2) ax.set_ylim(0, 5) ax.set_aspect(1. / ax.get_data_ratio()) plt.show() astroML-0.3/book_figures/chapter5/fig_hist_binsize.py0000644000076500000240000000757712252721253023607 0ustar jakevdpstaff00000000000000""" Selection of Histogram bin size ------------------------------- Figure 5.20 The results of Scott's rule, the Freedman-Diaconis rule, and Knuth's rule for selecting the optimal bin width for a histogram. These histograms are based on 5000 points drawn from the shown pdfs. On the left is a simple normal distribution. On the right is a Laplacian distribution at the center, with two small Gaussian peaks added in the wings. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats from astroML.plotting import hist #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def plot_labeled_histogram(style, data, name, x, pdf_true, ax=None, hide_x=False, hide_y=False): if ax is not None: ax = plt.axes(ax) counts, bins, patches = hist(data, bins=style, ax=ax, color='k', histtype='step', normed=True) ax.text(0.95, 0.93, '%s:\n%i bins' % (name, len(counts)), transform=ax.transAxes, ha='right', va='top') ax.fill(x, pdf_true, '-', color='#CCCCCC', zorder=0) if hide_x: ax.xaxis.set_major_formatter(plt.NullFormatter()) if hide_y: ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.set_xlim(-5, 5) return ax #------------------------------------------------------------ # Set up distributions: Npts = 5000 np.random.seed(0) x = np.linspace(-6, 6, 1000) # Gaussian distribution data_G = stats.norm(0, 1).rvs(Npts) pdf_G = stats.norm(0, 1).pdf(x) # Non-Gaussian distribution distributions = [stats.laplace(0, 0.4), stats.norm(-4.0, 0.2), stats.norm(4.0, 0.2)] weights = np.array([0.8, 0.1, 0.1]) weights /= weights.sum() data_NG = np.hstack(d.rvs(int(w * Npts)) for (d, w) in zip(distributions, weights)) pdf_NG = sum(w * d.pdf(x) for (d, w) in zip(distributions, weights)) #------------------------------------------------------------ # Plot results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(hspace=0, left=0.07, right=0.95, wspace=0.05, bottom=0.15) ax = [fig.add_subplot(3, 2, i + 1) for i in range(6)] # first column: Gaussian distribution plot_labeled_histogram('scotts', data_G, 'Scott\'s Rule', x, pdf_G, ax=ax[0], hide_x=True, hide_y=True) plot_labeled_histogram('freedman', data_G, 'Freed.-Diac.', x, pdf_G, ax=ax[2], hide_x=True, hide_y=True) plot_labeled_histogram('knuth', data_G, 'Knuth\'s Rule', x, pdf_G, ax=ax[4], hide_x=False, hide_y=True) ax[0].set_title('Gaussian distribution') ax[2].set_ylabel('$p(x)$') ax[4].set_xlabel('$x$') # second column: non-gaussian distribution plot_labeled_histogram('scotts', data_NG, 'Scott\'s Rule', x, pdf_NG, ax=ax[1], hide_x=True, hide_y=True) plot_labeled_histogram('freedman', data_NG, 'Freed.-Diac.', x, pdf_NG, ax=ax[3], hide_x=True, hide_y=True) plot_labeled_histogram('knuth', data_NG, 'Knuth\'s Rule', x, pdf_NG, ax=ax[5], hide_x=False, hide_y=True) ax[1].set_title('non-Gaussian distribution') ax[5].set_xlabel('$x$') plt.show() astroML-0.3/book_figures/chapter5/fig_likelihood_cauchy.py0000644000076500000240000000622312420767763024574 0ustar jakevdpstaff00000000000000""" Log-likelihood for Cauchy Distribution -------------------------------------- Figure 5.10 An illustration of the logarithm of the posterior probability distribution for :math:`\mu` and :math:`\gamma`, :math:`L(\mu,\gamma)` (see eq. 5.75) for N = 10 (the sample is generated using the Cauchy distribution with :math:`\mu = 0` and :math:`\gamma = 2`). The maximum of L is renormalized to 0, and color coded as shown in the legend. The contours enclose the regions that contain 0.683, 0.955, and 0.997 of the cumulative (integrated) posterior probability.
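Up to an additive constant, the quantity mapped here is :math:`\ln L = (n - 1) \ln \gamma - \sum_{i=1}^n \ln[\gamma^2 + (x_i - \mu)^2]`, exactly as implemented in ``cauchy_logL`` below.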
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from scipy.stats import cauchy from astroML.plotting.mcmc import convert_to_stdev from astroML.stats import median_sigmaG #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def cauchy_logL(xi, gamma, mu): """Equation 5.74: cauchy likelihood""" xi = np.asarray(xi) n = xi.size shape = np.broadcast(gamma, mu).shape xi = xi.reshape(xi.shape + tuple([1 for s in shape])) return ((n - 1) * np.log(gamma) - np.sum(np.log(gamma ** 2 + (xi - mu) ** 2), 0)) #------------------------------------------------------------ # Define the grid and compute logL gamma = np.linspace(0.1, 5, 70) mu = np.linspace(-5, 5, 70) np.random.seed(44) mu0 = 0 gamma0 = 2 xi = cauchy(mu0, gamma0).rvs(10) logL = cauchy_logL(xi, gamma[:, np.newaxis], mu) logL -= logL.max() #------------------------------------------------------------ # Find the max and print some information i, j = np.where(logL >= np.max(logL)) print("mu from likelihood:", mu[j]) print("gamma from likelihood:", gamma[i]) print() med, sigG = median_sigmaG(xi) print("mu from median", med) print("gamma from quartiles:", sigG / 1.483) # Equation 3.54 print() #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) plt.imshow(logL, origin='lower', cmap=plt.cm.binary, extent=(mu[0], mu[-1], gamma[0], gamma[-1]), aspect='auto') plt.colorbar().set_label(r'$\log(L)$') plt.clim(-5, 0) plt.contour(mu, gamma, convert_to_stdev(logL), levels=(0.683, 0.955, 0.997), colors='k') plt.text(0.5, 0.93, r'$L(\mu,\gamma)\ \mathrm{for}\ \bar{x}=0,\ \gamma=2,\ n=10$', bbox=dict(ec='k', fc='w', alpha=0.9), ha='center', va='center', transform=plt.gca().transAxes) plt.xlabel(r'$\mu$') plt.ylabel(r'$\gamma$') plt.show() astroML-0.3/book_figures/chapter5/fig_likelihood_gaussgauss.py0000644000076500000240000000601212252721253025464 0ustar jakevdpstaff00000000000000""" Gaussian Distribution with Gaussian Errors ------------------------------------------ Figure 5.7 The logarithm of the posterior probability density function for :math:`\mu` and :math:`\sigma`, :math:`L_p(\mu,\sigma)`, for a Gaussian distribution with heteroscedastic Gaussian measurement errors (sampled uniformly from the 0-3 interval), given by eq. 5.64. The input values are :math:`\mu = 1` and :math:`\sigma = 1`, and a randomly generated sample has 10 points. Note that the posterior pdf is not symmetric with respect to the :math:`\mu = 1` line, and that the outermost contour, which encloses the region that contains 0.997 of the cumulative (integrated) posterior probability, allows solutions with :math:`\sigma = 0`. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.plotting.mcmc import convert_to_stdev #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def gaussgauss_logL(xi, ei, mu, sigma): """Equation 5.63: gaussian likelihood with gaussian errors""" ndim = len(np.broadcast(sigma, mu).shape) xi = xi.reshape(xi.shape + tuple(ndim * [1])) ei = ei.reshape(ei.shape + tuple(ndim * [1])) s2_e2 = sigma ** 2 + ei ** 2 return -0.5 * np.sum(np.log(s2_e2) + (xi - mu) ** 2 / s2_e2, 0) #------------------------------------------------------------ # Define the grid and compute logL np.random.seed(5) mu_true = 1. sigma_true = 1. N = 10 ei = 3 * np.random.random(N) xi = np.random.normal(mu_true, np.sqrt(sigma_true ** 2 + ei ** 2)) sigma = np.linspace(0.01, 5, 70) mu = np.linspace(-3, 5, 70) logL = gaussgauss_logL(xi, ei, mu, sigma[:, np.newaxis]) logL -= logL.max() #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 3.75)) plt.imshow(logL, origin='lower', extent=(mu[0], mu[-1], sigma[0], sigma[-1]), cmap=plt.cm.binary, aspect='auto') plt.colorbar().set_label(r'$\log(L)$') plt.clim(-5, 0) plt.text(0.5, 0.93, (r'$L(\mu,\sigma)\ \mathrm{for}\ \bar{x}=1,\ ' r'\sigma_{\rm true}=1,\ n=10$'), bbox=dict(ec='k', fc='w', alpha=0.9), ha='center', va='center', transform=plt.gca().transAxes) plt.contour(mu, sigma, convert_to_stdev(logL), levels=(0.683, 0.955, 0.997), colors='k') plt.xlabel(r'$\mu$') plt.ylabel(r'$\sigma$') plt.show() astroML-0.3/book_figures/chapter5/fig_likelihood_gaussian.py0000644000076500000240000000510412252721253025112 0ustar jakevdpstaff00000000000000""" Log-likelihood for Gaussian Distribution ---------------------------------------- Figure5.4 An illustration of the logarithm of the posterior probability density function for :math:`\mu` and :math:`\sigma`, :math:`L_p(\mu,\sigma)` (see eq. 5.58) for data drawn from a Gaussian distribution and N = 10, x = 1, and V = 4. The maximum of :math:`L_p` is renormalized to 0, and color coded as shown in the legend. The maximum value of :math:`L_p` is at :math:`\mu_0 = 1.0` and :math:`\sigma_0 = 1.8`. The contours enclose the regions that contain 0.683, 0.955, and 0.997 of the cumulative (integrated) posterior probability. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.plotting.mcmc import convert_to_stdev #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def gauss_logL(xbar, V, n, sigma, mu): """Equation 5.57: gaussian likelihood""" return (-(n + 1) * np.log(sigma) - 0.5 * n * ((xbar - mu) ** 2 + V) / sigma ** 2) #------------------------------------------------------------ # Define the grid and compute logL sigma = np.linspace(1, 5, 70) mu = np.linspace(-3, 5, 70) xbar = 1 V = 4 n = 10 logL = gauss_logL(xbar, V, n, sigma[:, np.newaxis], mu) logL -= logL.max() #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) plt.imshow(logL, origin='lower', extent=(mu[0], mu[-1], sigma[0], sigma[-1]), cmap=plt.cm.binary, aspect='auto') plt.colorbar().set_label(r'$\log(L)$') plt.clim(-5, 0) plt.contour(mu, sigma, convert_to_stdev(logL), levels=(0.683, 0.955, 0.997), colors='k') plt.text(0.5, 0.93, r'$L(\mu,\sigma)\ \mathrm{for}\ \bar{x}=1,\ V=4,\ n=10$', bbox=dict(ec='k', fc='w', alpha=0.9), ha='center', va='center', transform=plt.gca().transAxes) plt.xlabel(r'$\mu$') plt.ylabel(r'$\sigma$') plt.show() astroML-0.3/book_figures/chapter5/fig_likelihood_gausslin.py0000644000076500000240000000755712252721253025143 0ustar jakevdpstaff00000000000000""" Log-likelihood for Gaussian plus linear background -------------------------------------------------- Figure 5.13 An illustration of the logarithm of the posterior probability density function :math:`L(\sigma,A)` (see eq. 5.85) for data generated using N = 200, :math:`\mu=5`, :math:`\sigma = 1`, and A = 0.5, with the background strength (1 - A)/W = 0.05 in the interval 0 < x < W, W = 10. The maximum of :math:`L(\sigma, A)` is renormalized to 0, and color coded on a scale -5 to 0, as shown in the legend. The contours enclose the regions that contain 0.683, 0.955, and 0.997 of the cumulative (integrated) posterior probability. Note the covariance between A and :math:`\sigma`. The histogram in the bottom panel shows the distribution of data values used to construct the posterior pdf in the top panel, and the probability density function from which the data were drawn as the solid line. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import truncnorm, uniform from astroML.plotting.mcmc import convert_to_stdev #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def gausslin_logL(xi, A=0.5, sigma=1.0, mu=5.0, L=10.0): """Equation 5.80: gaussian likelihood with uniform background""" xi = np.asarray(xi) shape = np.broadcast(sigma, A, mu, L).shape xi = xi.reshape(xi.shape + tuple([1 for s in shape])) return np.sum(np.log(A * np.exp(-0.5 * ((xi - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi)) + (1. - A) / L), 0) #------------------------------------------------------------ # Define the distribution np.random.seed(0) mu = 5.0 sigma = 1.0 L = 10.0 A = 0.5 N = 200 xi = np.random.random(N) NA = np.sum(xi < A) dist1 = truncnorm((0 - mu) / sigma, (L - mu) / sigma, mu, sigma) dist2 = uniform(0, 10) xi[:NA] = dist1.rvs(NA) xi[NA:] = dist2.rvs(N - NA) x = np.linspace(-1, 11, 1000) fracA = NA * 1. / N #------------------------------------------------------------ # define the (sigma, A) grid and compute logL sigma = np.linspace(0.5, 2, 70) A = np.linspace(0, 1, 70) logL = gausslin_logL(xi, A[:, np.newaxis], sigma) logL -= logL.max() #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 8)) fig.subplots_adjust(bottom=0.07, left=0.11, hspace=0.15, top=0.95) ax = fig.add_subplot(211) plt.imshow(logL, origin='lower', aspect='auto', extent=(sigma[0], sigma[-1], A[0], A[-1]), cmap=plt.cm.binary) plt.colorbar().set_label(r'$\log(L)$') plt.clim(-5, 0) ax.set_xlabel(r'$\sigma$') ax.set_ylabel(r'$A$') ax.text(0.5, 0.9, r'$L(\sigma,A)\ (\mathrm{Gauss + bkgd},\ n=200)$', bbox=dict(ec='k', fc='w', alpha=0.9), ha='center', va='center', transform=plt.gca().transAxes) ax.contour(sigma, A, convert_to_stdev(logL), levels=(0.683, 0.955, 0.997), colors='k') ax2 = plt.subplot(212) ax2.yaxis.set_major_locator(plt.MultipleLocator(0.1)) ax2.plot(x, fracA * dist1.pdf(x) + (1. - fracA) * dist2.pdf(x), '-k') ax2.hist(xi, 30, normed=True, histtype='stepfilled', fc='gray', alpha=0.5) ax2.set_ylim(0, 0.301) ax2.set_xlim(-1, 11) ax2.set_xlabel('$x$') ax2.set_ylabel('$p(x)$') plt.show() astroML-0.3/book_figures/chapter5/fig_likelihood_uniform.py0000644000076500000240000000632112420767763024776 0ustar jakevdpstaff00000000000000""" Log-likelihood for Uniform Distribution --------------------------------------- Figure 5.12 An illustration of the logarithm of the posterior probability distribution :math:`L(\mu, W)` (see eq. 5.77) for N = 100, :math:`\mu = 5`, and W = 10. The maximum of L is renormalized to 0, and color coded on a scale from -5 to 0, as shown in the legend. The bottom panel shows the marginal posterior for :math:`\mu` (see eq. 5.79), and the left panel shows the marginal posterior for W (see eq. 5.80). 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def uniform_logL(x, W, mu): """Equation 5.76:""" xmin = np.min(x) xmax = np.max(x) n = x.size res = np.zeros(mu.shape, dtype=float) - (n + 1) * np.log(W) res[(abs(xmin - mu) > 0.5 * W) | (abs(xmax - mu) > 0.5 * W)] = -np.inf return res #------------------------------------------------------------ # Define the grid and compute logL W = np.linspace(9.7, 10.7, 70) mu = np.linspace(4.5, 5.5, 70) np.random.seed(0) x = 10 * np.random.random(100) logL = uniform_logL(x, W[:, None], mu) logL -= logL.max() #------------------------------------------------------------ # Compute marginal likelihoods n = x.size p_mu = np.exp(logL).sum(0) Wmin = x.max() - x.min() p_W = (W - Wmin) / W ** (n + 1) p_W[W < Wmin] = 0 p_W /= p_W.sum() #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) # 2D likelihood plot ax = fig.add_axes([0.35, 0.35, 0.45, 0.6], xticks=[], yticks=[]) logL[logL < -10] = -10 # truncate for clean plotting plt.imshow(logL, origin='lower', extent=(mu[0], mu[-1], W[0], W[-1]), cmap=plt.cm.binary, aspect='auto') # colorbar cax = plt.axes([0.82, 0.35, 0.02, 0.6]) cb = plt.colorbar(cax=cax) cb.set_label(r'$\log L(\mu, W)$') plt.clim(-7, 0) ax.text(0.5, 0.93, r'$L(\mu,W)\ \mathrm{uniform,\ n=100}$', bbox=dict(ec='k', fc='w', alpha=0.9), ha='center', va='center', transform=ax.transAxes) ax.set_xlim(4.5, 5.5) ax.set_ylim(9.7, 10.7) ax1 = fig.add_axes([0.35, 0.1, 0.45, 0.23], yticks=[]) ax1.plot(mu, p_mu, '-k') ax1.set_xlabel(r'$\mu$') ax1.set_ylabel(r'$p(\mu)$') ax1.set_xlim(4.5, 5.5) ax2 = fig.add_axes([0.15, 0.35, 0.18, 0.6], xticks=[]) ax2.plot(p_W, W, '-k') ax2.set_xlabel(r'$p(W)$') ax2.set_ylabel(r'$W$') ax2.set_xlim(ax2.get_xlim()[::-1]) # reverse x axis ax2.set_ylim(9.7, 10.7) print("data extent:", min(x), max(x)) plt.show() astroML-0.3/book_figures/chapter5/fig_lutz_kelker.py0000644000076500000240000001070712420577220023435 0ustar jakevdpstaff00000000000000""" Eddington-Malmquist & Lutz-Kelker Biases ---------------------------------------- Figure 5.3 An illustration of the Eddington-Malmquist (left) and Lutz-Kelker (right) biases for mock data sets that simulate upcoming LSST and Gaia surveys (see text). The left panel shows a bias in photometric calibration when using pairs of measurements of the same stars with realistic photometric error distributions. Depending on the adopted faint limit (x-axis), the median difference between two measurements (dashed line) is biased when this limit is too close to the 5-sigma data limit (corresponding to errors of 0.2 mag); in this example the 5-sigma magnitude limit is set to 24. 
The solid line shows the assumed random measurement errors - if the number
of stars in the sample is large, the random error for magnitude difference
may become much smaller than the bias. The right panel shows the bias in
absolute magnitude for samples calibrated using trigonometric parallax
measurements with relative errors :math:`\sigma_\pi / \pi`, and two
hypothetical parallax distributions given by eq. 5.41 and p = 2, 4.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from astroML.stats import median_sigmaG

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def generate_magnitudes(N, k=0.6, m_min=20, m_max=25):
    """
    generate magnitudes from a distribution with p(m) ~ 10^(k m)
    """
    klog10 = k * np.log(10)
    Pmin = np.exp(klog10 * m_min)
    Pmax = np.exp(klog10 * m_max)
    return (1. / klog10) * np.log(Pmin + (Pmax - Pmin) * np.random.random(N))


def mag_errors(m_true, m5=24.0, fGamma=0.039):
    """
    compute magnitude errors based on the true magnitude and the
    5-sigma limiting magnitude, m5
    """
    x = 10 ** (0.4 * (m_true - m5))
    return np.sqrt((0.04 - fGamma) * x + fGamma * x ** 2)

#----------------------------------------------------------------------
# Compute the Eddington-Malmquist bias & scatter
np.random.seed(42)
mtrue = generate_magnitudes(int(1E6), m_min=20, m_max=25)
photomErr = mag_errors(mtrue)

m1 = mtrue + np.random.normal(0, photomErr)
m2 = mtrue + np.random.normal(0, photomErr)
dm = m1 - m2

mGrid = np.linspace(21, 24, 50)
medGrid = np.zeros(mGrid.size)
sigGrid = np.zeros(mGrid.size)

for i in range(mGrid.size):
    medGrid[i], sigGrid[i] = median_sigmaG(dm[m1 < mGrid[i]])

#----------------------------------------------------------------------
# Lutz-Kelker bias and scatter
mtrue = generate_magnitudes(int(1E6), m_min=17, m_max=20)
relErr = 0.3 * 10 ** (0.4 * (mtrue - 20))

pErrGrid = np.arange(0.02, 0.31, 0.01)

deltaM2 = 5 * np.log(1 + 2 * relErr ** 2)
deltaM4 = 5 * np.log(1 + 4 * relErr ** 2)

med2 = [np.median(deltaM2[relErr < e]) for e in pErrGrid]
med4 = [np.median(deltaM4[relErr < e]) for e in pErrGrid]

#----------------------------------------------------------------------
# plot results
fig = plt.figure(figsize=(5, 2.5))
fig.subplots_adjust(left=0.1, right=0.95, wspace=0.25,
                    bottom=0.17, top=0.95)

ax = fig.add_subplot(121)
ax.plot(mGrid, sigGrid, '-k', label='scatter')
ax.plot(mGrid, medGrid, '--k', label='bias')
ax.plot(mGrid, 0 * mGrid, ':k', lw=1)
ax.legend(loc=2)

ax.set_xlabel(r'$m_{\rm obs}$')
ax.set_ylabel('bias/scatter (mag)')
ax.set_ylim(-0.04, 0.21)

ax.xaxis.set_major_locator(plt.MultipleLocator(1.0))
ax.yaxis.set_major_locator(plt.MultipleLocator(0.1))
ax.yaxis.set_minor_locator(plt.MultipleLocator(0.01))
for l in ax.yaxis.get_minorticklines():
    l.set_markersize(3)

ax = fig.add_subplot(122)
ax.plot(pErrGrid, med2, '-k', label='$p=2$')
ax.plot(pErrGrid, med4, '--k', label='$p=4$')
ax.legend(loc=2) ax.set_xlabel(r'$\sigma_\pi / \pi$') ax.set_ylabel('absolute magnitude bias') ax.xaxis.set_major_locator(plt.MultipleLocator(0.1)) ax.set_xlim(0.02, 0.301) ax.set_ylim(0, 0.701) plt.show() astroML-0.3/book_figures/chapter5/fig_malmquist_bias.py0000644000076500000240000000760112252721253024113 0ustar jakevdpstaff00000000000000""" Malmquist Bias Example ---------------------- Figure 5.2 An illustration of the bias in a subsample selected using measurements with finite errors, when the population distribution is a steep function. The sample is drawn from the distribution :math:`p(x) \propto 10^{0.6x}`, shown by the solid line in the left panel, and convolved with heteroscedastic errors with widths in the range :math:`0.5 < \sigma < 1.5`. When a subsample is selected using "measured" values, as illustrated in the left panel, the distribution of differences between the "observed" and true values is biased, as shown by the histogram in the right panel. The distribution is biased because more objects with larger true x are scattered into the subsample from the right side, than from the left side where the true x are smaller. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.stats.random import trunc_exp #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
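# A quick homoscedastic cross-check of the bias illustrated here (a
# minimal sketch; names with a leading underscore are local to this
# check and the values are illustrative): for p(x) ~ exp(a x) and
# Gaussian errors of fixed width sigma, Bayes' rule gives
#   x_true | x_obs  ~  N(x_obs + a sigma^2, sigma^2),
# so any subsample selected on x_obs is offset on average by
# <x_obs - x_true> = -a sigma^2.
import numpy as np

_a, _sigma = 0.6 * np.log(10), 0.5
_rng = np.random.RandomState(0)
_u = _rng.rand(100000)
# inverse-transform sample of p(x) ~ exp(a x) on [0, 10]
_x = np.log(1 + _u * (np.exp(10 * _a) - 1)) / _a
_x_obs = _x + _sigma * _rng.randn(_x.size)
_bias = np.mean((_x_obs - _x)[_x_obs < 8])  # select well inside the range
# _bias comes out close to -_a * _sigma ** 2 ~ -0.35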
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Sample from a truncated exponential distribution N = 1E6 hmin = 4.3 hmax = 5.7 k = 0.6 * np.log(10) true_dist = trunc_exp(hmin - 1.4, hmax + 3.4, 0.6 * np.log(10)) # draw the true distributions and heteroscedastic noise np.random.seed(0) h_true = true_dist.rvs(N) dh = 0.5 * (1 + np.random.random(N)) h_obs = np.random.normal(h_true, dh) # create observational cuts cut = (h_obs < hmax) & (h_obs > hmin) # select a random (not observationally cut) subsample rand = np.arange(len(h_obs)) np.random.shuffle(rand) rand = rand[:cut.sum()] #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.12, right=0.95, wspace=0.3, bottom=0.15, top=0.9) # First axes: plot the true and observed distribution ax = fig.add_subplot(121) bins = np.linspace(0, 12, 100) x_pdf = np.linspace(0, 12, 1000) ax.plot(x_pdf, true_dist.pdf(x_pdf), '-k', label='true distribution') ax.hist(h_obs, bins, histtype='stepfilled', alpha=0.3, fc='b', normed=True, label='observed distribution') ax.legend(loc=2, handlelength=2) ax.add_patch(plt.Rectangle((hmin, 0), hmax - hmin, 1.2, fc='gray', ec='k', linestyle='dashed', alpha=0.3)) ax.text(5, 0.07, 'sampled region', rotation=45, ha='center', va='center', color='gray') ax.set_xlim(hmin - 1.3, hmax + 1.3) ax.set_ylim(0, 0.14001) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.set_xlabel(r'$x_{\rm obs}$') ax.set_ylabel(r'$p(x_{\rm obs})$') # Second axes: plot the histogram of (x_obs - x_true) ax = fig.add_subplot(122) bins = 30 ax.hist(h_obs[cut] - h_true[cut], bins, histtype='stepfilled', alpha=0.3, color='k', normed=True, label='observed\nsample') ax.hist(h_obs[rand] - h_true[rand], bins, histtype='step', color='k', linestyle='dashed', normed=True, label='random\nsample') ax.plot([0, 0], [0, 1], ':k') ax.legend(ncol=2, loc='upper center', frameon=False, handlelength=1) ax.set_xlim(-4, 4) ax.set_ylim(0, 0.65) ax.set_xlabel(r'$x_{\rm obs} - x_{\rm true}$') ax.set_ylabel(r'$p(x_{\rm obs} - x_{\rm true})$') plt.show() astroML-0.3/book_figures/chapter5/fig_model_comparison_hist.py0000644000076500000240000000461412252721253025463 0ustar jakevdpstaff00000000000000""" Histogram for Double-gaussian model test ---------------------------------------- Figure 5.23 A sample of 200 points drawn from a Gaussian mixture model used to illustrate model selection with MCMC. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm from astroML.density_estimation import GaussianMixture1D #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
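# The single-Gaussian MLE computed below can be sanity-checked against
# the analytic moments of the generating mixture (a minimal sketch with
# the same parameter values as below; underscored names are local):
#   mean = sum_i w_i mu_i,   var = sum_i w_i (sigma_i^2 + mu_i^2) - mean^2
import numpy as np

_w = np.array([1.5, 1.0]) / 2.5          # normalized ratio_in : 1
_mu = np.array([0.0, 1.0])
_sigma = np.array([0.3, 1.0])
_mean = np.dot(_w, _mu)                                          # 0.4
_std = np.sqrt(np.dot(_w, _sigma ** 2 + _mu ** 2) - _mean ** 2)  # ~0.833
# for large N, sample_mu and sample_std below approach these values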
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate the data mu1_in = 0 sigma1_in = 0.3 mu2_in = 1 sigma2_in = 1 ratio_in = 1.5 N = 200 np.random.seed(10) gm = GaussianMixture1D([mu1_in, mu2_in], [sigma1_in, sigma2_in], [ratio_in, 1]) x_sample = gm.sample(N) #------------------------------------------------------------ # Get the MLE fit for a single gaussian sample_mu = np.mean(x_sample) sample_std = np.std(x_sample, ddof=1) #------------------------------------------------------------ # Plot the sampled data fig, ax = plt.subplots(figsize=(5, 3.75)) ax.hist(x_sample, 20, histtype='stepfilled', normed=True, fc='#CCCCCC') x = np.linspace(-2.1, 4.1, 1000) factor1 = ratio_in / (1. + ratio_in) factor2 = 1. / (1. + ratio_in) ax.plot(x, gm.pdf(x), '-k', label='true distribution') ax.plot(x, gm.pdf_individual(x), ':k') ax.plot(x, norm.pdf(x, sample_mu, sample_std), '--k', label='best fit normal') ax.legend(loc=1) ax.set_xlim(-2.1, 4.1) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') ax.set_title('Input pdf and sampled data') ax.text(0.95, 0.80, ('$\mu_1 = 0;\ \sigma_1=0.3$\n' '$\mu_2=1;\ \sigma_2=1.0$\n' '$\mathrm{ratio}=1.5$'), transform=ax.transAxes, ha='right', va='top') plt.show() astroML-0.3/book_figures/chapter5/fig_model_comparison_mcmc.py0000644000076500000240000001764712252721253025445 0ustar jakevdpstaff00000000000000""" MCMC Model Comparison --------------------- Figure 5.24 The top-right panel shows the posterior pdf for mu and sigma for a single Gaussian fit to the data shown in figure 5.23. The remaining panels show the projections of the five-dimensional pdf for a Gaussian mixture model with two components. Contours are based on a 10,000 point MCMC chain. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.special import gamma from scipy.stats import norm from sklearn.neighbors import BallTree from astroML.density_estimation import GaussianMixture1D from astroML.plotting import plot_mcmc # hack to fix an import issue in older versions of pymc import scipy scipy.derivative = scipy.misc.derivative import pymc #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
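# estimate_bayes_factor() below rests on the identity
#   Z = L(theta) pi(theta) / p(theta | D)    for any theta,
# i.e. log Z = logp(theta) - log[posterior density at theta], with the
# posterior density read off from the local count of MCMC samples in a
# ball of radius r.  A minimal 1D sanity check, assuming a unit-Gaussian
# posterior with a known evidence Z = 7 (underscored names are local):
import numpy as np
from scipy.stats import norm

_rng = np.random.RandomState(0)
_theta = _rng.normal(0, 1, 20000)          # stand-in "MCMC" samples
_logp = norm.logpdf(_theta) + np.log(7.)   # unnormalized log-posterior
_r = 0.05
_count = np.array([np.sum(np.abs(_theta - t) < _r) for t in _theta[:100]])
_logZ = _logp[:100] + np.log(_theta.size) + np.log(2 * _r) - np.log(_count)
# np.median(np.exp(_logZ)) recovers a value close to 7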
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def get_logp(S, model): """compute log(p) given a pyMC model""" M = pymc.MAP(model) traces = np.array([S.trace(s)[:] for s in S.stochastics]) logp = np.zeros(traces.shape[1]) for i in range(len(logp)): logp[i] = -M.func(traces[:, i]) return logp def estimate_bayes_factor(traces, logp, r=0.05, return_list=False): """Estimate the bayes factor using the local density of points""" D, N = traces.shape # compute volume of a D-dimensional sphere of radius r Vr = np.pi ** (0.5 * D) / gamma(0.5 * D + 1) * (r ** D) # use neighbor count within r as a density estimator bt = BallTree(traces.T) count = bt.query_radius(traces.T, r=r, count_only=True) BF = logp + np.log(N) + np.log(Vr) - np.log(count) if return_list: return BF else: p25, p50, p75 = np.percentile(BF, [25, 50, 75]) return p50, 0.7413 * (p75 - p25) #------------------------------------------------------------ # Generate the data mu1_in = 0 sigma1_in = 0.3 mu2_in = 1 sigma2_in = 1 ratio_in = 1.5 N = 200 np.random.seed(10) gm = GaussianMixture1D([mu1_in, mu2_in], [sigma1_in, sigma2_in], [ratio_in, 1]) x_sample = gm.sample(N) #------------------------------------------------------------ # Set up pyMC model: single gaussian # 2 parameters: (mu, sigma) M1_mu = pymc.Uniform('M1_mu', -5, 5, value=0) M1_log_sigma = pymc.Uniform('M1_log_sigma', -10, 10, value=0) @pymc.deterministic def M1_sigma(M1_log_sigma=M1_log_sigma): return np.exp(M1_log_sigma) @pymc.deterministic def M1_tau(M1_sigma=M1_sigma): return 1. / M1_sigma ** 2 M1 = pymc.Normal('M1', M1_mu, M1_tau, observed=True, value=x_sample) model1 = dict(M1_mu=M1_mu, M1_log_sigma=M1_log_sigma, M1_sigma=M1_sigma, M1_tau=M1_tau, M1=M1) #------------------------------------------------------------ # Set up pyMC model: double gaussian # 5 parameters: (mu1, mu2, sigma1, sigma2, ratio) def doublegauss_like(x, mu1, mu2, sigma1, sigma2, ratio): """log-likelihood for double gaussian""" r1 = ratio / (1. + ratio) r2 = 1 - r1 L = r1 * norm(mu1, sigma1).pdf(x) + r2 * norm(mu2, sigma2).pdf(x) L[L == 0] = 1E-16 # prevent divide-by-zero error logL = np.log(L).sum() if np.isinf(logL): raise pymc.ZeroProbability else: return logL def rdoublegauss(mu1, mu2, sigma1, sigma2, ratio, size=None): """random variable from double gaussian""" r1 = ratio / (1. 
+ ratio)
    r2 = 1 - r1

    R = np.asarray(np.random.random(size))
    Rshape = R.shape
    R = np.atleast_1d(R)

    mask1 = (R < r1)
    mask2 = ~mask1
    N1 = mask1.sum()
    N2 = R.size - N1

    R[mask1] = norm(mu1, sigma1).rvs(N1)
    R[mask2] = norm(mu2, sigma2).rvs(N2)

    return R.reshape(Rshape)

DoubleGauss = pymc.stochastic_from_dist('doublegauss',
                                        logp=doublegauss_like,
                                        random=rdoublegauss,
                                        dtype=np.float, mv=True)

# set up our Stochastic variables, mu1, mu2, sigma1, sigma2, ratio
M2_mu1 = pymc.Uniform('M2_mu1', -5, 5, value=0)
M2_mu2 = pymc.Uniform('M2_mu2', -5, 5, value=1)
M2_log_sigma1 = pymc.Uniform('M2_log_sigma1', -10, 10, value=0)
M2_log_sigma2 = pymc.Uniform('M2_log_sigma2', -10, 10, value=0)


@pymc.deterministic
def M2_sigma1(M2_log_sigma1=M2_log_sigma1):
    return np.exp(M2_log_sigma1)


@pymc.deterministic
def M2_sigma2(M2_log_sigma2=M2_log_sigma2):
    return np.exp(M2_log_sigma2)

M2_ratio = pymc.Uniform('M2_ratio', 1E-3, 1E3, value=1)

M2 = DoubleGauss('M2', M2_mu1, M2_mu2,
                 M2_sigma1, M2_sigma2, M2_ratio,
                 observed=True, value=x_sample)

model2 = dict(M2_mu1=M2_mu1, M2_mu2=M2_mu2,
              M2_log_sigma1=M2_log_sigma1, M2_log_sigma2=M2_log_sigma2,
              M2_sigma1=M2_sigma1, M2_sigma2=M2_sigma2,
              M2_ratio=M2_ratio, M2=M2)


#------------------------------------------------------------
# Set up MCMC sampling
def compute_MCMC_models(Niter=10000, burn=1000, rseed=0):
    pymc.numpy.random.seed(rseed)

    S1 = pymc.MCMC(model1)
    S1.sample(iter=Niter, burn=burn)
    trace1 = np.vstack([S1.trace('M1_mu')[:],
                        S1.trace('M1_sigma')[:]])
    logp1 = get_logp(S1, model1)

    S2 = pymc.MCMC(model2)
    S2.sample(iter=Niter, burn=burn)
    trace2 = np.vstack([S2.trace('M2_mu1')[:],
                        S2.trace('M2_mu2')[:],
                        S2.trace('M2_sigma1')[:],
                        S2.trace('M2_sigma2')[:],
                        S2.trace('M2_ratio')[:]])
    logp2 = get_logp(S2, model2)

    return trace1, logp1, trace2, logp2

trace1, logp1, trace2, logp2 = compute_MCMC_models()

#------------------------------------------------------------
# Compute Odds ratio with density estimation technique
BF1, dBF1 = estimate_bayes_factor(trace1, logp1, r=0.02)
BF2, dBF2 = estimate_bayes_factor(trace2, logp2, r=0.05)

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
labels = [r'$\mu_1$', r'$\mu_2$', r'$\sigma_1$', r'$\sigma_2$',
          r'${\rm ratio}$']
true_values = [mu1_in, mu2_in, sigma1_in, sigma2_in, ratio_in]
limits = [(-0.24, 0.12), (0.55, 1.75), (0.15, 0.45),
          (0.55, 1.3), (0.25, 2.1)]

# we assume mu1 < mu2, but the results may be switched
# due to the symmetry of the problem.  If so, switch back
if np.median(trace2[0]) > np.median(trace2[1]):
    trace2 = trace2[[1, 0, 3, 2, 4], :]

# Plot the two-parameter single-Gaussian model
ax, = plot_mcmc(trace1, fig=fig, bounds=[0.6, 0.6, 0.95, 0.95],
                limits=[(0.3, 0.8), (0.75, 1.15)],
                labels=[r'$\mu$', r'$\sigma$'], colors='k')
ax.text(0.05, 0.95, "Single Gaussian fit", va='top', ha='left',
        transform=ax.transAxes)

# Plot the five-parameter double-Gaussian model
ax_list = plot_mcmc(trace2, limits=limits, labels=labels,
                    true_values=true_values, fig=fig,
                    bounds=(0.12, 0.12, 0.95, 0.95),
                    colors='k')
for ax in ax_list:
    for axis in [ax.xaxis, ax.yaxis]:
        axis.set_major_locator(plt.MaxNLocator(4))

plt.show()
astroML-0.3/book_figures/chapter5/fig_odds_ratio_cauchy.py0000644000076500000240000001151312420767763024576 0ustar jakevdpstaff00000000000000
"""
Odds Ratio for Cauchy vs Gaussian
---------------------------------
Figure 5.19

The Cauchy vs.
Gaussian model odds ratio for a data set drawn from a Cauchy distribution
(mu = 0, gamma = 2) as a function of the number of points used to perform
the calculation. Note the sharp increase in the odds ratio when points
falling far from the mean are added.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
from __future__ import print_function, division

import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import cauchy, norm
from scipy import integrate

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def logL_cauchy(xi, gamma, mu,
                mu_min=-10, mu_max=10, sigma_min=0.01, sigma_max=100):
    """Equation 5.74: cauchy likelihood"""
    xi = np.asarray(xi)
    n = xi.size
    shape = np.broadcast(gamma, mu).shape

    xi = xi.reshape(xi.shape + tuple([1 for s in shape]))

    prior_normalization = - (np.log(mu_max - mu_min)
                             + np.log(np.log(sigma_max / sigma_min)))

    return (prior_normalization - n * np.log(np.pi)
            + (n - 1) * np.log(gamma)
            - np.sum(np.log(gamma ** 2 + (xi - mu) ** 2), 0))


def logL_gaussian(xi, sigma, mu,
                  mu_min=-10, mu_max=10, sigma_min=0.01, sigma_max=100):
    """Equation 5.57: gaussian likelihood"""
    xi = np.asarray(xi)
    n = xi.size
    shape = np.broadcast(sigma, mu).shape

    xi = xi.reshape(xi.shape + tuple([1 for s in shape]))

    prior_normalization = - (np.log(mu_max - mu_min)
                             + np.log(np.log(sigma_max / sigma_min)))

    return (prior_normalization - 0.5 * n * np.log(2 * np.pi)
            - (n + 1) * np.log(sigma)
            - np.sum(0.5 * ((xi - mu) / sigma) ** 2, 0))


def calculate_odds_ratio(xi, epsrel=1E-8, epsabs=1E-15):
    """
    Compute the odds ratio by performing a double integral
    over the likelihood space.
    """
    # dblquad passes (inner, outer) arguments: the inner variable is the
    # scale (sigma or gamma, integrated over (0, inf)) and the outer one
    # is the location mu (integrated over (-inf, inf))
    gauss_Ifunc = lambda sigma, mu: np.exp(logL_gaussian(xi, sigma, mu))
    cauchy_Ifunc = lambda gamma, mu: np.exp(logL_cauchy(xi, gamma, mu))

    I_gauss, err_gauss = integrate.dblquad(gauss_Ifunc, -np.inf, np.inf,
                                           lambda x: 0, lambda x: np.inf,
                                           epsabs=epsabs, epsrel=epsrel)
    I_cauchy, err_cauchy = integrate.dblquad(cauchy_Ifunc, -np.inf, np.inf,
                                             lambda x: 0, lambda x: np.inf,
                                             epsabs=epsabs, epsrel=epsrel)

    if I_gauss == 0:
        O_CG = np.inf
        err_O_CG = np.inf
    else:
        O_CG = I_cauchy / I_gauss
        # propagate the quadrature errors of both integrals into the ratio
        err_O_CG = O_CG * np.sqrt((err_gauss / I_gauss) ** 2
                                  + (err_cauchy / I_cauchy) ** 2)

    return (I_gauss, err_gauss), (I_cauchy, err_cauchy), (O_CG, err_O_CG)

#------------------------------------------------------------
# Draw points from a Cauchy distribution
np.random.seed(44)
mu = 0
gamma = 2
xi = cauchy(mu, gamma).rvs(100)

#------------------------------------------------------------
# compute the odds ratio for the first 10 points
((I_gauss, err_gauss), (I_cauchy, err_cauchy),
 (O_CG, err_O_CG)) = calculate_odds_ratio(xi[:10])

print("Results for first 10 points:")
print(" L(M = Cauchy) = %.2e +/- %.2e" % (I_cauchy, err_cauchy))
print(" L(M = Gauss) = %.2e +/- %.2e" % (I_gauss, err_gauss))
print(" O_{CG} = %.3g +/- %.3g" % (O_CG, err_O_CG))

#------------------------------------------------------------
# calculate the results as a function of number of points
Nrange = np.arange(10, 101, 2)
Odds = np.zeros(Nrange.shape)
for i, N in enumerate(Nrange):
    res = calculate_odds_ratio(xi[:N])
    Odds[i] = res[2][0]

#------------------------------------------------------------
# plot the results
fig = plt.figure(figsize=(5, 3.75))
fig.subplots_adjust(hspace=0.1)

ax1 = fig.add_subplot(211, yscale='log')
ax1.plot(Nrange, Odds, '-k')
ax1.set_ylabel(r'$O_{CG}$ for $N$ points')
ax1.set_xlim(0, 100)
ax1.xaxis.set_major_formatter(plt.NullFormatter())
ax1.yaxis.set_major_locator(plt.LogLocator(base=10000.0))

ax2 = fig.add_subplot(212)
ax2.scatter(np.arange(1, len(xi) + 1), xi, lw=0, s=16, c='k')
ax2.set_xlim(0, 100)
ax2.set_xlabel('Sample Size $N$')
ax2.set_ylabel('Sample Value')

plt.show()
astroML-0.3/book_figures/chapter5/fig_odds_ratio_coin.py0000644000076500000240000000537112252721253024242 0ustar jakevdpstaff00000000000000
"""
Coin Toss Odds Ratio
--------------------
Figure 5.1

Odds ratio for two models, :math:`O_{21}`, describing coin tosses (eq. 5.26).
Out of N tosses (left: N = 10; right: N = 20), k tosses are heads. Model 2 is
a one-parameter model with the heads probability determined from data
(:math:`b^0 = k/N`), and model 1 claims an a priori known heads probability
equal to :math:`b_*`. The results are shown for two values of :math:`b_*`,
as indicated in the legend. Note that the odds ratio is minimized and below 1
(model 1 wins) when :math:`k = b_* N`.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from scipy import integrate
from matplotlib import pyplot as plt

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) @np.vectorize def odds_ratio(n, k, bstar): """Odds ratio between M_2, where the heads probability is unknown, and M_1, where the heads probability is known to be `bstar`, evaluated in the case of `k` heads observed in `n` tosses. Eqn. 5.25 in the text """ factor = 1. / (bstar ** k * (1 - bstar) ** (n - k)) f = lambda b: b ** k * (1 - b) ** (n - k) return factor * integrate.quad(f, 0, 1)[0] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.13, right=0.95, wspace=0.05, bottom=0.15) subplots = [121, 122] n_array = [10, 20] linestyles = ['-k', '--b'] bstar_array = [0.5, 0.1] for subplot, n in zip(subplots, n_array): ax = fig.add_subplot(subplot, yscale='log') k = np.arange(n + 1) # plot curves for two values of bstar for ls, bstar in zip(linestyles, bstar_array): ax.plot(k, odds_ratio(n, k, bstar), ls, label=r'$b^* = %.1f$' % bstar) if subplot == 121: ax.set_xlim(0, n - 0.01) ax.set_ylabel(r'$O_{21}$') ax.legend(loc=2) else: ax.set_xlim(0, n) ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.set_xlabel('$k$') ax.set_title('$n = %i$' % n) ax.set_ylim(8E-2, 1E3) ax.xaxis.set_major_locator(plt.MultipleLocator(n / 5)) ax.grid() plt.show() astroML-0.3/book_figures/chapter5/fig_outlier_distribution.py0000644000076500000240000000472112420767763025400 0ustar jakevdpstaff00000000000000""" Gaussian Distribution with Outliers ----------------------------------- This figure shows the distribution of points drawn from a narrow Gaussian distribution, with 20% "outliers" drawn from a wider Gaussian distribution. Over-plotted are the robust and non-robust estimators of the mean and standard deviation. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm, anderson from astroML.stats import mean_sigma, median_sigmaG #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
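# median_sigmaG (used below) is the rank-based scale estimate
#   sigma_G = 0.7413 (q75 - q25),
# where 0.7413 = 1 / (2 Phi^{-1}(0.75)) makes it unbiased for a Gaussian.
# A minimal re-implementation from percentiles alone (_sigmaG is a local
# illustration, not the astroML routine):
import numpy as np

def _sigmaG(x):
    q25, q75 = np.percentile(x, [25, 75])
    return 0.7413 * (q75 - q25)

# for pure N(0, 1) data _sigmaG -> 1, and unlike the sample standard
# deviation it is barely perturbed by the 20% wide component added below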
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Create distribution
Npts = int(1E6)
f_out = 0.2
N_out = int(f_out * Npts)

sigma1 = 1
sigma2 = 3

np.random.seed(1)
x = np.hstack((np.random.normal(0, sigma1, Npts - N_out),
               np.random.normal(0, sigma2, N_out)))

#------------------------------------------------------------
# Compute anderson-darling test
A2, sig, crit = anderson(x)
print("anderson-darling A^2 = {0:.1f}".format(A2))

#------------------------------------------------------------
# Compute non-robust and robust point statistics
mu_sample, sig_sample = mean_sigma(x)
med_sample, sigG_sample = median_sigmaG(x)

#------------------------------------------------------------
# Plot the results
fig, ax = plt.subplots(figsize=(5, 3.75))

# histogram of data
ax.hist(x, 100, histtype='stepfilled', alpha=0.2, color='k', normed=True)

# best-fit normal curves
x_sample = np.linspace(-15, 15, 1000)
ax.plot(x_sample, norm(mu_sample, sig_sample).pdf(x_sample), '-k',
        label='$\sigma$ fit')
ax.plot(x_sample, norm(med_sample, sigG_sample).pdf(x_sample), '--k',
        label='$\sigma_G$ fit')

ax.legend()

ax.set_xlim(-8, 8)
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')

plt.show()
astroML-0.3/book_figures/chapter5/fig_outlier_likelihood.py0000644000076500000240000000574312252721253024774 0ustar jakevdpstaff00000000000000
"""
Plot the posterior of mu vs g1 with outliers
--------------------------------------------
Figure 5.17

The marginal joint distribution between mu and g_i, as given by eq. 5.100.
The left panel shows a point identified as bad (:math:`\hat{g_i} = 0`), while
the right panel shows a point identified as good (:math:`\hat{g_i} = 1`).
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import norm
from astroML.plotting.mcmc import convert_to_stdev

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
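# Each point enters the likelihood below through the per-point mixture
#   p(x_i | mu, g_i) = g_i N(x_i; mu, sigma1) + (1 - g_i) N(x_i; mu, sigma2),
# so at fixed mu the evidence that a point is "good" reduces to the ratio
# of the two component densities (a minimal sketch; _good_vs_bad_odds is
# a local helper with the same sigma1 = 1, sigma2 = 10 used below):
import numpy as np
from scipy.stats import norm

def _good_vs_bad_odds(x, mu=0.0, sigma1=1.0, sigma2=10.0):
    """odds for g_i = 1 versus g_i = 0 at fixed mu, with equal priors"""
    return norm.pdf(x, mu, sigma1) / norm.pdf(x, mu, sigma2)

# e.g. _good_vs_bad_odds(0.5) is well above 1, while _good_vs_bad_odds(8.0)
# is essentially zero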
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def p(mu, g1, xi, sigma1, sigma2): """Equation 5.97: marginalized likelihood over outliers""" L = (g1 * norm.pdf(xi[0], mu, sigma1) + (1 - g1) * norm.pdf(xi[0], mu, sigma2)) mu = mu.reshape(mu.shape + (1,)) g1 = g1.reshape(g1.shape + (1,)) return L * np.prod(norm.pdf(xi[1:], mu, sigma1) + norm.pdf(xi[1:], mu, sigma2), -1) #------------------------------------------------------------ # Sample the points np.random.seed(138) N1 = 8 N2 = 2 sigma1 = 1 sigma2 = 3 sigmai = np.zeros(N1 + N2) sigmai[N2:] = sigma1 sigmai[:N2] = sigma2 xi = np.random.normal(0, sigmai) #------------------------------------------------------------ # Compute the marginalized posterior for the first and last point mu = np.linspace(-5, 5, 71) g1 = np.linspace(0, 1, 11) L1 = p(mu[:, None], g1, xi, 1, 10) L1 /= np.max(L1) L2 = p(mu[:, None], g1, xi[::-1], 1, 10) L2 /= np.max(L2) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.05, bottom=0.15, top=0.9) ax1 = fig.add_subplot(121) ax1.imshow(L1.T, origin='lower', aspect='auto', cmap=plt.cm.binary, extent=[mu[0], mu[-1], g1[0], g1[-1]]) ax1.contour(mu, g1, convert_to_stdev(np.log(L1).T), levels=(0.683, 0.955, 0.997), colors='k') ax1.set_xlabel(r'$\mu$') ax1.set_ylabel(r'$g_1$') ax2 = fig.add_subplot(122) ax2.imshow(L2.T, origin='lower', aspect='auto', cmap=plt.cm.binary, extent=[mu[0], mu[-1], g1[0], g1[-1]]) ax2.contour(mu, g1, convert_to_stdev(np.log(L2).T), levels=(0.683, 0.955, 0.997), colors='k') ax2.set_xlabel(r'$\mu$') ax2.yaxis.set_major_locator(plt.NullLocator()) plt.show() astroML-0.3/book_figures/chapter5/fig_outlier_marginalized.py0000644000076500000240000000635612252721253025320 0ustar jakevdpstaff00000000000000""" Plot the Outlier Probability ---------------------------- Figure 5.18 The marginal probability for g_i for the "good" and "bad" points shown in figure 5.17. The solid curves show the marginalized probability: that is, eq. 5.100 is integrated over mu. The dashed curves show the probability conditioned on :math:`\mu = \mu_0`, the MAP estimate of :math:`\mu` (eq. 5.102) """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
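# The two curves plotted below come from the same grid L(mu, g1): the
# solid one sums (marginalizes) over the mu axis, while the dashed one
# takes a single slice at the MAP value of mu.  The mechanics on any
# 2D grid (a minimal sketch with made-up numbers; underscored names
# are local):
import numpy as np

_L = np.random.RandomState(0).rand(71, 71)   # stand-in for L(mu, g1)
_dg = 1. / 70
_p_marg = _L.sum(0)                          # p(g1) ~ sum_mu L(mu, g1)
_p_cond = _L[np.unravel_index(_L.argmax(), _L.shape)[0]]  # p(g1 | mu_MAP)
_p_marg = _p_marg / (_p_marg.sum() * _dg)    # normalized as below
_p_cond = _p_cond / (_p_cond.sum() * _dg)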
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def p(mu, g1, xi, sigma1, sigma2): """Equation 5.97: marginalized likelihood over outliers""" L = (g1 * norm.pdf(xi[0], mu, sigma1) + (1 - g1) * norm.pdf(xi[0], mu, sigma2)) mu = mu.reshape(mu.shape + (1,)) g1 = g1.reshape(g1.shape + (1,)) return L * np.prod(norm.pdf(xi[1:], mu, sigma1) + norm.pdf(xi[1:], mu, sigma2), -1) #------------------------------------------------------------ # Sample the points np.random.seed(138) N1 = 8 N2 = 2 sigma1 = 1 sigma2 = 3 sigmai = np.zeros(N1 + N2) sigmai[N2:] = sigma1 sigmai[:N2] = sigma2 xi = np.random.normal(0, sigmai) #------------------------------------------------------------ # Compute the marginalized posterior for the first and last point mu = np.linspace(-5, 5, 71) g1 = np.linspace(0, 1, 71) L1 = p(mu[:, None], g1, xi, 1, 10) L1 /= np.max(L1) (i1, j1) = np.where(L1 == 1) mu0_1 = mu[i1] L2 = p(mu[:, None], g1, xi[::-1], 1, 10) L2 /= np.max(L2) (i2, j2) = np.where(L2 == 1) mu0_2 = mu[i2] p1 = L1.sum(0) p2 = L2.sum(0) p1 /= np.sum(p1) * (g1[1] - g1[0]) p2 /= np.sum(p2) * (g1[1] - g1[0]) p1a = L1[i1[0]] p2a = L2[i2[0]] p1a /= p1a.sum() * (g1[1] - g1[0]) p2a /= p2a.sum() * (g1[1] - g1[0]) #------------------------------------------------------------ # Plot the results fig, ax = plt.subplots(figsize=(5, 3.75)) l1, = ax.plot(g1, p1, '-k', lw=2) l2, = ax.plot(g1, p1a, '--k', lw=2) leg1 = ax.legend([l1, l2], [r'$p(g_1)$ (bad point)', r'$p(g_1|\mu_0)$ (bad point)'], loc=9) l3, = ax.plot(g1, p2, '-b', lw=1, label=r'$p(g_1)$ (good point)') l4, = ax.plot(g1, p2a, '--b', lw=1, label=r'$p(g_1|\mu_0)$ (good point)') leg2 = ax.legend([l3, l4], [r'$p(g_1)$ (good point)', r'$p(g_1|\mu_0)$ (good point)'], loc=8) # trick to display two legends: # when legend() is called the second time, the first one is removed from # the axes. We add it back in here: ax.add_artist(leg1) ax.set_xlabel('$g_1$') ax.set_ylabel('$p(g_1)$') ax.set_xlim(0, 1) ax.set_ylim(0.1, 1.8) plt.show() astroML-0.3/book_figures/chapter5/fig_poisson_comparison.py0000644000076500000240000000640212252721253025023 0ustar jakevdpstaff00000000000000""" Poisson Statistics with arbitrarily small bins ---------------------------------------------- Figure 5.16 The comparison of the continuous method (figure 5.14) and the binned method (figure 5.15) on the same data set. In the limit of a large number of bins, most bins register only zero or one count, and the binned Poisson statistic gives nearly the same marginalized distribution for a as the continuous statistic. For as few as two bins, the constraint on the slope is only slightly biased. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.stats.random import linear #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
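# Why the many-bin Poisson result approaches the continuous one: with
# mu_i = N f(x_i) Delta and, for small Delta, occupancies y_i in {0, 1},
#   sum_i [y_i log mu_i - mu_i] = sum_data log f(x_j) + N log(N Delta) - N,
# which differs from the unbinned sum_data log f(x_j) only by terms
# independent of the slope a.  A quick numerical check (a minimal
# sketch; underscored names are local):
import numpy as np

_rng = np.random.RandomState(0)
_x = 10 * _rng.rand(200)
_f = lambda x, a: a * (x - 5.0) + 0.1       # linear pdf, same form as below
_y, _edges = np.histogram(_x, bins=np.linspace(0, 10, 1001))
_xc = 0.5 * (_edges[1:] + _edges[:-1])
_mu = lambda a: 200 * _f(_xc, a) * 0.01     # mu_i = N f(x_i) Delta

def _dlogL(a1, a0=0.0):
    un = np.sum(np.log(_f(_x, a1))) - np.sum(np.log(_f(_x, a0)))
    bn = (np.sum(_y * np.log(_mu(a1))) - np.sum(_mu(a1))
          - np.sum(_y * np.log(_mu(a0))) + np.sum(_mu(a0)))
    return un, bn

# _dlogL(0.01) returns two nearly identical log-likelihood differences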
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def logL_continuous(x, a, xmin, xmax): """Continuous log-likelihood (Eq. 5.84)""" x = x.ravel() a = a.reshape(a.shape + (1,)) mu = 0.5 * (xmin + xmax) W = (xmax - xmin) return np.sum(np.log(a * (x - mu) + 1. / W), -1) def logL_poisson(xi, yi, a, b): """poisson log-likelihood (Eq. 5.88)""" xi = xi.ravel() yi = yi.ravel() a = a.reshape(a.shape + (1,)) b = b.reshape(b.shape + (1,)) yyi = a * xi + b return np.sum(yi * np.log(yyi) - yyi, -1) #---------------------------------------------------------------------- # draw the data np.random.seed(0) N = 200 a_true = 0.01 xmin = 0.0 xmax = 10.0 lin_dist = linear(xmin, xmax, a_true) x = lin_dist.rvs(N) a = np.linspace(0.0, 0.02, 101) b = np.linspace(0.00001, 0.15, 51) #------------------------------------------------------------ # Compute the log-likelihoods # continuous case logL = logL_continuous(x, a, xmin, xmax) L_c = np.exp(logL - logL.max()) L_c /= L_c.sum() * (a[1] - a[0]) # discrete case: compute for 2 and 1000 bins nbins = [1000, 2] L_p = [0, 0] for i, n in enumerate(nbins): yi, bins = np.histogram(x, bins=np.linspace(xmin, xmax, n + 1)) xi = 0.5 * (bins[:-1] + bins[1:]) factor = N * (xmax - xmin) * 1. / n logL = logL_poisson(xi, yi, factor * a, factor * b[:, None]) L_p[i] = np.exp(logL - np.max(logL)).sum(0) L_p[i] /= L_p[i].sum() * (a[1] - a[0]) #------------------------------------------------------------ # Plot the results fig, ax = plt.subplots(figsize=(5, 3.75)) ax.plot(a, L_c, '-k', label='continuous') for L, ls, n in zip(L_p, ['-', '--'], nbins): ax.plot(a, L, ls, color='gray', lw=1, label='discrete, %i bins' % n) # plot a vertical line: in newer matplotlib, use ax.vlines([a_true]) ylim = (0, 200) ax.plot([a_true, a_true], ylim, ':k', lw=1) ax.set_xlim(-0.001, 0.021) ax.set_ylim(ylim) ax.set_xlabel('$a$') ax.set_ylabel('$p(a)$') ax.legend(loc=2) plt.show() astroML-0.3/book_figures/chapter5/fig_poisson_continuous.py0000644000076500000240000000603612252721253025062 0ustar jakevdpstaff00000000000000""" Unbinned Poisson Data --------------------- Figure 5.14 Regression of unbinned data. The distribution of N = 500 data points is shown in the left panel; the true pdf is shown by the solid curve. Note that although the data are binned in the left panel for visualization purposes, the analysis is performed on the unbinned data. The right panel shows the likelihood for the slope a (eq. 5.88) for three different sample sizes. The input value is indicated by the vertical dotted line. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.stats.random import linear #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
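# The likelihood curves plotted below peak at the maximum-likelihood
# slope, which can also be located directly with a 1D optimizer (a
# minimal sketch using the same log-likelihood, assuming scipy >= 0.11
# for optimize.minimize_scalar; _mle_slope is a local helper):
import numpy as np
from scipy import optimize

def _mle_slope(data, xmin=0.0, xmax=10.0):
    mu, W = 0.5 * (xmin + xmax), xmax - xmin
    negL = lambda a: -np.sum(np.log(a * (data - mu) + 1. / W))
    # keep the pdf positive for every point in the sample
    amax = (1. / W) / abs(data - mu).max()
    return optimize.minimize_scalar(negL, bounds=(-0.99 * amax, 0.99 * amax),
                                    method='bounded').x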
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def linprob_logL(x, a, xmin, xmax): x = x.ravel() a = a.reshape(a.shape + (1,)) mu = 0.5 * (xmin + xmax) W = (xmax - xmin) return np.sum(np.log(a * (x - mu) + 1. / W), -1) #---------------------------------------------------------------------- # Draw the data from the linear distribution np.random.seed(0) N = 500 a_true = 0.01 xmin = 0.0 xmax = 10.0 lin_dist = linear(xmin, xmax, a_true) data = lin_dist.rvs(N) x = np.linspace(xmin - 1, xmax + 1, 1000) px = lin_dist.pdf(x) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.12, right=0.95, wspace=0.28, bottom=0.15, top=0.9) # left panel: plot the model and a histogram of the data ax1 = fig.add_subplot(121) ax1.hist(data, bins=np.linspace(0, 10, 11), normed=True, histtype='stepfilled', fc='gray', alpha=0.5) ax1.plot(x, px, '-k') ax1.set_xlim(-1, 11) ax1.set_ylim(0, 0.18) ax1.set_xlabel('$x$') ax1.set_ylabel('$p(x)$') # right panel: construct and plot the likelihood ax2 = fig.add_subplot(122) ax2.xaxis.set_major_locator(plt.MultipleLocator(0.01)) a = np.linspace(-0.01, 0.02, 1000) Npts = (500, 100, 20) styles = ('-k', '--b', '-.g') for n, s in zip(Npts, styles): logL = linprob_logL(data[:n], a, xmin, xmax) logL = np.exp(logL - logL.max()) logL /= logL.sum() * (a[1] - a[0]) ax2.plot(a, logL, s, label=r'$\rm %i\ pts$' % n) ax2.legend(loc=2, prop=dict(size=8)) ax2.set_xlim(-0.011, 0.02) ax2.set_xlabel('$a$') ax2.set_ylabel('$p(a)$') # vertical line: in newer matplotlib versions, use ax.vlines([a_true]) ylim = ax2.get_ylim() ax2.plot([a_true, a_true], ylim, ':k', lw=1) ax2.set_ylim(ylim) plt.show() astroML-0.3/book_figures/chapter5/fig_poisson_likelihood.py0000644000076500000240000001155212252721253024776 0ustar jakevdpstaff00000000000000""" Binned Regression: Poisson vs Gaussian -------------------------------------- Figure 5.15 The left panels show data sets with 50 points, binned in 5 bins (upper panels) and 40 bins (lower panels). The curves show the input distribution (solid), the Poisson solution (dashed), and the Gaussian solution (dotted). The right panels show 1-sigma, 2-sigma, and 3-sigma likelihood contours for eqs. 5.91 (dark lines) and 5.90 (light lines). With 5 bins (top row) there are enough counts in each bin so that the Gaussian and Poisson predictions are very similar. As the number of bins is increased, the counts decrease and the Gaussian approximation becomes biased. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats, interpolate from astroML.stats.random import linear from astroML.plotting.mcmc import convert_to_stdev #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
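# logL_gaussian below treats each bin count as N(y_i; mu_i, sqrt(mu_i)).
# How good that is depends entirely on the counts per bin; a quick
# pointwise comparison of the two distributions (a minimal sketch;
# underscored names are local):
import numpy as np
from scipy.stats import poisson, norm

for _m in (50.0, 2.0):
    _kk = np.arange(int(4 * _m) + 1)
    _gap = np.max(np.abs(poisson.pmf(_kk, _m)
                         - norm.pdf(_kk, _m, np.sqrt(_m))))
    # relative to the peak, _gap is tiny for _m = 50 but sizable for
    # _m = 2: this is why many nearly empty bins bias the Gaussian fit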
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) def logL_gaussian(xi, yi, a, b): """gaussian log-likelihood (Eq. 5.87)""" xi = xi.ravel() yi = yi.ravel() a = a.reshape(a.shape + (1,)) b = b.reshape(b.shape + (1,)) yyi = a * xi + b return -0.5 * np.sum(np.log(yyi) + (yi - yyi) ** 2 / yyi, -1) def logL_poisson(xi, yi, a, b): """poisson log-likelihood (Eq. 5.88)""" xi = xi.ravel() yi = yi.ravel() a = a.reshape(a.shape + (1,)) b = b.reshape(b.shape + (1,)) yyi = a * xi + b return np.sum(yi * np.log(yyi) - yyi, -1) #------------------------------------------------------------ # Draw points from distribution np.random.seed(0) N = 50 a_true = 0.01 xmin = 0.0 xmax = 10.0 b_true = 1. / (xmax - xmin) - 0.5 * a_true * (xmax + xmin) lin_dist = linear(xmin, xmax, a_true) data = lin_dist.rvs(N) #------------------------------------------------------------ # Compute and plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.3, bottom=0.1, top=0.95, hspace=0.2) a = np.linspace(0.00001, 0.04, 71) b = np.linspace(0.00001, 0.15, 71) for num, nbins in enumerate([5, 40]): # divide points into bins yi, bins = np.histogram(data, bins=np.linspace(xmin, xmax, nbins + 1)) xi = 0.5 * (bins[:-1] + bins[1:]) # compute likelihoods for Poisson and Gaussian models factor = N * (xmax - xmin) * 1. / nbins LP = logL_poisson(xi, yi, factor * a, factor * b[:, None]) LG = logL_gaussian(xi, yi, factor * a, factor * b[:, None]) LP -= np.max(LP) LG -= np.max(LG) # find maximum likelihood point i, j = np.where(LP == np.max(LP)) aP, bP = a[j[0]], b[i[0]] i, j = np.where(LG == np.max(LG)) aG, bG = a[j[0]], b[i[0]] # plot scatter and lines ax = fig.add_subplot(2, 2, 1 + 2 * num) plt.scatter(xi, yi, s=9, c='gray', lw=0) x = np.linspace(xmin - 1, xmax + 1, 1000) for (ai, bi, s) in [(a_true, b_true, '-k'), (aP, bP, '--k'), (aG, bG, '-.k')]: px = ai * x + bi px[x < xmin] = 0 px[x > xmax] = 0 ax.plot(x, factor * px, s) ax.set_xlim(xmin - 1, xmax + 1) ax.set_xlabel('$x$') ax.set_ylabel('$y_i$') ax.text(0.04, 0.96, r'$\rm %i\ points$' % N + '\n' + r'$\rm %i\ bins$' % nbins, ha='left', va='top', transform=ax.transAxes) # plot likelihood contours ax = fig.add_subplot(2, 2, 2 + 2 * num) ax.contour(a, b, convert_to_stdev(LP), levels=(0.683, 0.955, 0.997), colors='k', linewidths=2) ax.contour(a, b, convert_to_stdev(LG), levels=(0.683, 0.955, 0.997), colors='gray', linewidths=1, linestyle='dashed') # trick the legend command ax.plot([0], [0], '-k', lw=2, label='Poisson Likelihood') ax.plot([0], [0], '-', c='gray', lw=1, label='Gaussian Likelihood') ax.legend(loc=1) # plot horizontal and vertical lines # in newer matplotlib versions, use ax.vlines() and ax.hlines() ax.plot([a_true, a_true], [0, 0.2], ':k', lw=1) ax.plot([0, 0.06], [b_true, b_true], ':k', lw=1) ax.set_xlabel(r'$a^\ast$') ax.set_ylabel(r'$b^\ast$') ax.set_xlim(0, 0.04) ax.set_ylim(0.001, 0.15) ax.xaxis.set_major_locator(plt.MultipleLocator(0.02)) plt.show() astroML-0.3/book_figures/chapter5/fig_posterior_binomial.py0000644000076500000240000000453712252721253025006 0ustar jakevdpstaff00000000000000""" Binomial Posterior ------------------ Figure 5.9 The solid line in the left panel shows the posterior pdf p(b|k, N) described by eq. 5.71, for k = 4 and N = 10. The dashed line shows a Gaussian approximation described in Section 3.3.3. The right panel shows the corresponding cumulative distributions. 
A value of 0.1 is marginally likely according to the Gaussian approximation
(p_approx(< 0.1) ~ 0.03) but strongly rejected by the true distribution
(p_true(< 0.1) ~ 0.003).
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from scipy.stats import norm, binom
from matplotlib import pyplot as plt

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX. This may
# result in an error if LaTeX is not installed on your system. In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Plot posterior as a function of b
n = 10  # number of points
k = 4   # number of successes from n draws

b = np.linspace(0, 1, 100)
db = b[1] - b[0]

# compute the probability p(b) (eqn. 5.70)
p_b = b ** k * (1 - b) ** (n - k)
p_b /= p_b.sum()
p_b /= db
cuml_p_b = p_b.cumsum()
cuml_p_b /= cuml_p_b[-1]

# compute the gaussian approximation (eqn. 5.71)
p_g = norm(k * 1. / n, 0.16).pdf(b)
cuml_p_g = p_g.cumsum()
cuml_p_g /= cuml_p_g[-1]

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 2.5))
fig.subplots_adjust(left=0.11, right=0.95, wspace=0.35, bottom=0.18)

ax = fig.add_subplot(121)
ax.plot(b, p_b, '-b')
ax.plot(b, p_g, '--r')
ax.set_ylim(-0.05, 3)
ax.set_xlabel('$b$')
ax.set_ylabel('$p(b|x,I)$')

ax = fig.add_subplot(122, yscale='log')
ax.plot(b, cuml_p_b, '-b')
ax.plot(b, cuml_p_g, '--r')
ax.plot([0.1, 0.1], [1E-6, 2], ':k')
ax.set_xlabel('$b$')
ax.set_ylabel('$P(<b|x,I)$')
ax.text(1.0, 1.0, "(a) 2 point")
ax.text(3.5, 1.0, "(b) 3 point")
ax.text(6.0, 1.0, "(c) 4 point")
ax.text(1.4, 2.8, '$\mathbf{r_{12}}$')
ax.text(3.9, 3.7, '$\mathbf{r_{12}}$')
ax.text(3.0, 3.0, '$\mathbf{r_{23}}$')
ax.text(4.0, 2.3, '$\mathbf{r_{31}}$')
ax.text(6.4, 4.1, '$\mathbf{r_{12}}$')
ax.text(5.5, 3.3, '$\mathbf{r_{13}}$')
ax.text(7.2, 2.6, '$\mathbf{r_{24}}$')
ax.text(6.7, 2.8, '$\mathbf{r_{14}}$')
ax.text(6.0, 1.9, '$\mathbf{r_{23}}$')
ax.text(6.5, 1.8, '$\mathbf{r_{34}}$')
ax.set_xlim(0, 8)
ax.set_ylim(0.5, 5)
plt.show()
astroML-0.3/book_figures/chapter6/fig_correlation_function.py0000644000076500000240000001004012420767763025334 0ustar jakevdpstaff00000000000000
r"""
Angular Two-point Correlation Function
--------------------------------------
Figure 6.17

The two-point correlation function of SDSS spectroscopic galaxies in the
range 0.08 < z < 0.12, with m < 17.7. This is the same sample for which the
luminosity function is computed in figure 4.10. Errors are estimated using
ten bootstrap samples. Dotted lines are added to guide the eye and correspond
to a power law proportional to :math:`\theta^{-0.8}`. Note that the red
galaxies (left panel) are clustered more strongly than the blue galaxies
(right panel).
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from astroML.decorators import pickle_results from astroML.datasets import fetch_sdss_specgals from astroML.correlation import bootstrap_two_point_angular #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get data and do some quality cuts data = fetch_sdss_specgals() m_max = 17.7 # redshift and magnitude cuts data = data[data['z'] > 0.08] data = data[data['z'] < 0.12] data = data[data['petroMag_r'] < m_max] # RA/DEC cuts RAmin, RAmax = 140, 220 DECmin, DECmax = 5, 45 data = data[data['ra'] < RAmax] data = data[data['ra'] > RAmin] data = data[data['dec'] < DECmax] data = data[data['dec'] > DECmin] ur = data['modelMag_u'] - data['modelMag_r'] flag_red = (ur > 2.22) flag_blue = ~flag_red data_red = data[flag_red] data_blue = data[flag_blue] print("data size:") print(" red gals: ", len(data_red)) print(" blue gals:", len(data_blue)) #------------------------------------------------------------ # Set up correlation function computation # This calculation takes a long time with the bootstrap resampling, # so we'll save the results. @pickle_results("correlation_functions.pkl") def compute_results(Nbins=16, Nbootstraps=10, method='landy-szalay', rseed=0): np.random.seed(rseed) bins = 10 ** np.linspace(np.log10(1 / 60.), np.log10(6), 16) results = [bins] for D in [data_red, data_blue]: results += bootstrap_two_point_angular(D['ra'], D['dec'], bins=bins, method=method, Nbootstraps=Nbootstraps) return results (bins, r_corr, r_corr_err, r_bootstraps, b_corr, b_corr_err, b_bootstraps) = compute_results() bin_centers = 0.5 * (bins[1:] + bins[:-1]) #------------------------------------------------------------ # Plot the results corr = [r_corr, b_corr] corr_err = [r_corr_err, b_corr_err] bootstraps = [r_bootstraps, b_bootstraps] labels = ['$u-r > 2.22$\n$N=%i$' % len(data_red), '$u-r < 2.22$\n$N=%i$' % len(data_blue)] fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.2, top=0.9, left=0.13, right=0.95) for i in range(2): ax = fig.add_subplot(121 + i, xscale='log', yscale='log') ax.errorbar(bin_centers, corr[i], corr_err[i], fmt='.k', ecolor='gray', lw=1) t = np.array([0.01, 10]) ax.plot(t, 10 * (t / 0.01) ** -0.8, ':k', linewidth=1) ax.text(0.95, 0.95, labels[i], ha='right', va='top', transform=ax.transAxes) ax.set_xlabel(r'$\theta\ (deg)$') if i == 0: ax.set_ylabel(r'$\hat{w}(\theta)$') plt.show() astroML-0.3/book_figures/chapter6/fig_density_estimation.py0000644000076500000240000001046712252721253025021 0ustar jakevdpstaff00000000000000""" Comparison of 1D Density Estimators ----------------------------------- Figure 6.5 A comparison of different density estimation methods for two simulated one-dimensional data sets (cf. 
figure 5.21). The generating distribution is same in both cases and shown as the dotted line; the samples include 500 (top panel) and 5000 (bottom panel) data points (illustrated by vertical bars at the bottom of each panel). Density estimators are Bayesian blocks (Section 5.7.2), KDE (Section 6.1.1) and the nearest-neighbor method (eq. 6.15). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats from astroML.density_estimation import KNeighborsDensity from astroML.plotting import hist # Scikit-learn 0.14 added sklearn.neighbors.KernelDensity, which is a very # fast kernel density estimator based on a KD Tree. We'll use this if # available (and raise a warning if it isn't). try: from sklearn.neighbors import KernelDensity use_sklearn_KDE = True except: import warnings warnings.warn("KDE will be removed in astroML version 0.3. Please " "upgrade to scikit-learn 0.14+ and use " "sklearn.neighbors.KernelDensity.", DeprecationWarning) from astroML.density_estimation import KDE use_sklearn_KDE = False #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate our data: a mix of several Cauchy distributions # this is the same data used in the Bayesian Blocks figure np.random.seed(0) N = 10000 mu_gamma_f = [(5, 1.0, 0.1), (7, 0.5, 0.5), (9, 0.1, 0.1), (12, 0.5, 0.2), (14, 1.0, 0.1)] true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x) for (mu, gamma, f) in mu_gamma_f]) x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N)) for (mu, gamma, f) in mu_gamma_f]) np.random.shuffle(x) x = x[x > -10] x = x[x < 30] #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(bottom=0.08, top=0.95, right=0.95, hspace=0.1) N_values = (500, 5000) subplots = (211, 212) k_values = (10, 100) for N, k, subplot in zip(N_values, k_values, subplots): ax = fig.add_subplot(subplot) xN = x[:N] t = np.linspace(-10, 30, 1000) # Compute density with KDE if use_sklearn_KDE: kde = KernelDensity(0.1, kernel='gaussian') kde.fit(xN[:, None]) dens_kde = np.exp(kde.score_samples(t[:, None])) else: kde = KDE('gaussian', h=0.1).fit(xN[:, None]) dens_kde = kde.eval(t[:, None]) / N # Compute density with Bayesian nearest neighbors nbrs = KNeighborsDensity('bayesian', n_neighbors=k).fit(xN[:, None]) dens_nbrs = nbrs.eval(t[:, None]) / N # plot the results ax.plot(t, true_pdf(t), ':', color='black', zorder=3, label="Generating Distribution") ax.plot(xN, -0.005 * np.ones(len(xN)), '|k') hist(xN, bins='blocks', ax=ax, normed=True, zorder=1, histtype='stepfilled', color='k', alpha=0.2, label="Bayesian Blocks") ax.plot(t, dens_nbrs, '-', lw=1.5, color='gray', zorder=2, label="Nearest Neighbors (k=%i)" % k) ax.plot(t, dens_kde, '-', color='black', zorder=3, 
label="Kernel Density (h=0.1)") # label the plot ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top', transform=ax.transAxes) ax.set_ylabel('$p(x)$') ax.legend(loc='upper right') if subplot == 212: ax.set_xlabel('$x$') ax.set_xlim(0, 20) ax.set_ylim(-0.01, 0.4001) plt.show() astroML-0.3/book_figures/chapter6/fig_EM_metallicity.py0000644000076500000240000001140612420767763024016 0ustar jakevdpstaff00000000000000""" EM example: Gaussian Mixture Models ----------------------------------- Figure 6.6 A two-dimensional mixture of Gaussians for the stellar metallicity data. The left panel shows the number density of stars as a function of two measures of their chemical composition: metallicity ([Fe/H]) and alpha-element abundance ([alpha/Fe]). The right panel shows the density estimated using mixtures of Gaussians together with the positions and covariances (2-sigma levels) of those Gaussians. The center panel compares the information criteria AIC and BIC (see Sections 4.3.2 and 5.4.3). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm from sklearn.mixture import GMM from astroML.datasets import fetch_sdss_sspp from astroML.decorators import pickle_results from astroML.plotting.tools import draw_ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get the Segue Stellar Parameters Pipeline data data = fetch_sdss_sspp(cleaned=True) X = np.vstack([data['FeH'], data['alphFe']]).T # truncate dataset for speed X = X[::5] #------------------------------------------------------------ # Compute GMM models & AIC/BIC N = np.arange(1, 14) @pickle_results("GMM_metallicity.pkl") def compute_GMM(N, covariance_type='full', n_iter=1000): models = [None for n in N] for i in range(len(N)): print(N[i]) models[i] = GMM(n_components=N[i], n_iter=n_iter, covariance_type=covariance_type) models[i].fit(X) return models models = compute_GMM(N) AIC = [m.aic(X) for m in models] BIC = [m.bic(X) for m in models] i_best = np.argmin(BIC) gmm_best = models[i_best] print("best fit converged:", gmm_best.converged_) print("BIC: n_components = %i" % N[i_best]) #------------------------------------------------------------ # compute 2D density FeH_bins = 51 alphFe_bins = 51 H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], (FeH_bins, alphFe_bins)) Xgrid = np.array(list(map(np.ravel, np.meshgrid(0.5 * (FeH_bins[:-1] + FeH_bins[1:]), 0.5 * (alphFe_bins[:-1] + alphFe_bins[1:]))))).T log_dens = gmm_best.score(Xgrid).reshape((51, 51)) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 1.66)) fig.subplots_adjust(wspace=0.45, bottom=0.25, top=0.9, left=0.1, right=0.97) # plot density ax = fig.add_subplot(131) ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto', extent=[FeH_bins[0], FeH_bins[-1], alphFe_bins[0], alphFe_bins[-1]], cmap=plt.cm.binary) ax.set_xlabel(r'$\rm [Fe/H]$') ax.set_ylabel(r'$\rm [\alpha/Fe]$') ax.xaxis.set_major_locator(plt.MultipleLocator(0.3)) ax.set_xlim(-1.101, 0.101) ax.text(0.93, 0.93, "Input", va='top', ha='right', transform=ax.transAxes) # plot AIC/BIC ax = fig.add_subplot(132) ax.plot(N, AIC, '-k', label='AIC') ax.plot(N, BIC, ':k', label='BIC') ax.legend(loc=1) ax.set_xlabel('N components') plt.setp(ax.get_yticklabels(), fontsize=7) # plot best configurations for AIC and BIC ax = fig.add_subplot(133) ax.imshow(np.exp(log_dens), origin='lower', interpolation='nearest', aspect='auto', extent=[FeH_bins[0], FeH_bins[-1], alphFe_bins[0], alphFe_bins[-1]], cmap=plt.cm.binary) ax.scatter(gmm_best.means_[:, 0], gmm_best.means_[:, 1], c='w') for mu, C, w in zip(gmm_best.means_, gmm_best.covars_, gmm_best.weights_): draw_ellipse(mu, C, scales=[1.5], ax=ax, fc='none', ec='k') ax.text(0.93, 0.93, "Converged", va='top', ha='right', transform=ax.transAxes) ax.set_xlim(-1.101, 0.101) ax.set_ylim(alphFe_bins[0], alphFe_bins[-1]) ax.xaxis.set_major_locator(plt.MultipleLocator(0.3)) ax.set_xlabel(r'$\rm [Fe/H]$') ax.set_ylabel(r'$\rm [\alpha/Fe]$') plt.show() astroML-0.3/book_figures/chapter6/fig_GMM_clone.py0000644000076500000240000000563312252721253022705 0ustar jakevdpstaff00000000000000""" Cloning a Distribution with Gaussian Mixtures --------------------------------------------- Figure 6.10 Cloning a two-dimensional distribution. The left panel shows 1000 observed points. The center panel shows a ten-component Gaussian mixture model fit to the data (two components dominate over the other eight). The third panel shows 5000 points drawn from the model in the second panel.
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.mixture import GMM #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Create our data: two overlapping gaussian clumps, # in a uniform background np.random.seed(1) X = np.concatenate([np.random.normal(0, 1, (200, 2)), np.random.normal(1, 1, (200, 2)), np.random.normal(4, 1.5, (400, 2)), 9 - 12 * np.random.random((200, 2))]) #------------------------------------------------------------ # Use a GMM to model the density and clone the points gmm = GMM(5, 'full').fit(X) X_new = gmm.sample(5000) xmin = -3 xmax = 9 Xgrid = np.meshgrid(np.linspace(xmin, xmax, 50), np.linspace(xmin, xmax, 50)) Xgrid = np.array(Xgrid).reshape(2, -1).T dens = np.exp(gmm.score(Xgrid)).reshape((50, 50)) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.05, bottom=0.12, top=0.9) # first plot the input ax = fig.add_subplot(131, aspect='equal') ax.plot(X[:, 0], X[:, 1], '.k', ms=2) ax.set_title("Input Distribution") ax.set_ylabel('$y$') # next plot the gmm fit ax = fig.add_subplot(132, aspect='equal') ax.imshow(dens.T, origin='lower', extent=[xmin, xmax, xmin, xmax], cmap=plt.cm.binary) ax.set_title("Density Model") ax.yaxis.set_major_formatter(plt.NullFormatter()) # next plot the cloned distribution ax = fig.add_subplot(133, aspect='equal') ax.plot(X_new[:, 0], X_new[:, 1], '.k', alpha=0.3, ms=2) ax.set_title("Cloned Distribution") ax.yaxis.set_major_formatter(plt.NullFormatter()) for ax in fig.axes: ax.set_xlim(xmin, xmax) ax.set_ylim(xmin, xmax) ax.set_xlabel('$x$') plt.show() astroML-0.3/book_figures/chapter6/fig_GMM_density_estimation.py0000644000076500000240000001100412252721253025505 0ustar jakevdpstaff00000000000000""" Comparison of 1D Density Estimators ----------------------------------- Figure 6.8 A comparison of different density estimation methods for two simulated one-dimensional data sets (same as in figure 6.5). Density estimators are Bayesian blocks (Section 5.7.2), KDE (Section 6.1.1), and a Gaussian mixture model. In the latter, the optimal number of Gaussian components is chosen using the BIC (eq. 5.35). In the top panel, GMM solution has three components but one of the components has a very large width and effectively acts as a nearly flat background. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats from astroML.plotting import hist from sklearn.mixture import GMM # Scikit-learn 0.14 added sklearn.neighbors.KernelDensity, which is a very # fast kernel density estimator based on a KD Tree. We'll use this if # available (and raise a warning if it isn't). try: from sklearn.neighbors import KernelDensity use_sklearn_KDE = True except: import warnings warnings.warn("KDE will be removed in astroML version 0.3. Please " "upgrade to scikit-learn 0.14+ and use " "sklearn.neighbors.KernelDensity.", DeprecationWarning) from astroML.density_estimation import KDE use_sklearn_KDE = False #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate our data: a mix of several Cauchy distributions # this is the same data used in the Bayesian Blocks figure np.random.seed(0) N = 10000 mu_gamma_f = [(5, 1.0, 0.1), (7, 0.5, 0.5), (9, 0.1, 0.1), (12, 0.5, 0.2), (14, 1.0, 0.1)] true_pdf = lambda x: sum([f * stats.cauchy(mu, gamma).pdf(x) for (mu, gamma, f) in mu_gamma_f]) x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N)) for (mu, gamma, f) in mu_gamma_f]) np.random.shuffle(x) x = x[x > -10] x = x[x < 30] #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(bottom=0.08, top=0.95, right=0.95, hspace=0.1) N_values = (500, 5000) subplots = (211, 212) k_values = (10, 100) for N, k, subplot in zip(N_values, k_values, subplots): ax = fig.add_subplot(subplot) xN = x[:N] t = np.linspace(-10, 30, 1000) # Compute density with KDE if use_sklearn_KDE: kde = KernelDensity(0.1, kernel='gaussian') kde.fit(xN[:, None]) dens_kde = np.exp(kde.score_samples(t[:, None])) else: kde = KDE('gaussian', h=0.1).fit(xN[:, None]) dens_kde = kde.eval(t[:, None]) / N # Compute density via Gaussian Mixtures # we'll try several numbers of clusters n_components = np.arange(3, 16) gmms = [GMM(n_components=n).fit(xN) for n in n_components] BICs = [gmm.bic(xN) for gmm in gmms] i_min = np.argmin(BICs) t = np.linspace(-10, 30, 1000) logprob, responsibilities = gmms[i_min].eval(t) # plot the results ax.plot(t, true_pdf(t), ':', color='black', zorder=3, label="Generating Distribution") ax.plot(xN, -0.005 * np.ones(len(xN)), '|k', lw=1.5) hist(xN, bins='blocks', ax=ax, normed=True, zorder=1, histtype='stepfilled', lw=1.5, color='k', alpha=0.2, label="Bayesian Blocks") ax.plot(t, np.exp(logprob), '-', color='gray', label="Mixture Model\n(%i components)" % n_components[i_min]) ax.plot(t, dens_kde, '-', color='black', zorder=3, label="Kernel Density $(h=0.1)$") # label the plot ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top', transform=ax.transAxes) ax.set_ylabel('$p(x)$') ax.legend(loc='upper right') if subplot == 
212: ax.set_xlabel('$x$') ax.set_xlim(0, 20) ax.set_ylim(-0.01, 0.4001) plt.show() astroML-0.3/book_figures/chapter6/fig_GMM_nclusters.py0000644000076500000240000001062212420767763023636 0ustar jakevdpstaff00000000000000""" Number of Clusters for Gaussian Mixtures ---------------------------------------- Figure 6.9 The BIC-optimized number of components in a Gaussian mixture model as a function of the sample size. All three samples (with 100, 1000, and 10,000 points) are drawn from the same distribution: two narrow foreground Gaussians and two wide background Gaussians. The top-right panel shows the BIC as a function of the number of components in the mixture. The remaining panels show the distribution of points in the sample and the 1, 2, and 3 standard deviation contours of the best-fit mixture model. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm from sklearn.mixture import GMM from astroML.utils import convert_2D_cov from astroML.plotting.tools import draw_ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up the dataset # We'll use scikit-learn's Gaussian Mixture Model to sample # data from a mixture of Gaussians. The usual way of using # this involves fitting the mixture to data: we'll see that # below. Here we'll set the internal means, covariances, # and weights by-hand. 
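# For reference: convert_2D_cov (used below) builds a covariance matrix
# from the principal-axis widths (sigma1, sigma2) and position angle alpha;
# we assume it is equivalent to rotating a diagonal covariance,
#     C = R(alpha) diag(sigma1^2, sigma2^2) R(alpha)^T.
# A minimal sketch of that construction:
#
#     def rotated_cov(sigma1, sigma2, alpha):
#         R = np.array([[np.cos(alpha), -np.sin(alpha)],
#                       [np.sin(alpha), np.cos(alpha)]])
#         return np.dot(R, np.dot(np.diag([sigma1 ** 2, sigma2 ** 2]), R.T))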
# we'll define clusters as (mu, sigma1, sigma2, alpha, frac) clusters = [((50, 50), 20, 20, 0, 0.1), ((40, 40), 10, 10, np.pi / 6, 0.6), ((80, 80), 5, 5, np.pi / 3, 0.2), ((60, 60), 30, 30, 0, 0.1)] gmm_input = GMM(len(clusters), covariance_type='full') gmm_input.means_ = np.array([c[0] for c in clusters]) gmm_input.covars_ = np.array([convert_2D_cov(*c[1:4]) for c in clusters]) gmm_input.weights_ = np.array([c[4] for c in clusters]) gmm_input.weights_ /= gmm_input.weights_.sum() #------------------------------------------------------------ # Compute and plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.11, right=0.9, bottom=0.11, top=0.9, hspace=0, wspace=0) ax_list = [fig.add_subplot(s) for s in [221, 223, 224]] ax_list.append(fig.add_axes([0.62, 0.62, 0.28, 0.28])) linestyles = ['-', '--', ':'] grid = np.linspace(-5, 105, 70) Xgrid = np.array(np.meshgrid(grid, grid)) Xgrid = Xgrid.reshape(2, -1).T Nclusters = np.arange(1, 8) for Npts, ax, ls in zip([100, 1000, 10000], ax_list, linestyles): np.random.seed(1) X = gmm_input.sample(Npts) # find best number of clusters via BIC clfs = [GMM(N, n_iter=500).fit(X) for N in Nclusters] BICs = np.array([clf.bic(X) for clf in clfs]) print("{0} points convergence:".format(Npts), [clf.converged_ for clf in clfs]) # plot the BIC ax_list[3].plot(Nclusters, BICs / Npts, ls, c='k', label="N=%i" % Npts) clf = clfs[np.argmin(BICs)] log_dens = clf.score(Xgrid).reshape((70, 70)) # scatter the points ax.plot(X[:, 0], X[:, 1], ',k', alpha=0.3, zorder=1) # plot the components for i in range(clf.n_components): mean = clf.means_[i] cov = clf.covars_[i] if cov.ndim == 1: cov = np.diag(cov) draw_ellipse(mean, cov, ax=ax, fc='none', ec='k', zorder=2) # label the plot ax.text(0.05, 0.95, "N = %i points" % Npts, ha='left', va='top', transform=ax.transAxes, bbox=dict(fc='w', ec='k')) ax.set_xlim(-5, 105) ax.set_ylim(-5, 105) ax_list[0].xaxis.set_major_formatter(plt.NullFormatter()) ax_list[2].yaxis.set_major_formatter(plt.NullFormatter()) for i in (0, 1): ax_list[i].set_ylabel('$y$') for j in (1, 2): ax_list[j].set_xlabel('$x$') ax_list[-1].legend(loc=1) ax_list[-1].set_xlabel('n. clusters') ax_list[-1].set_ylabel('$BIC / N$') ax_list[-1].set_ylim(16, 18.5) plt.show() astroML-0.3/book_figures/chapter6/fig_great_wall.py0000644000076500000240000001027012252721253023217 0ustar jakevdpstaff00000000000000""" Great Wall Density ------------------ Figure 6.4 Density estimation for galaxies within the SDSS "Great Wall." The upper-left panel shows points that are galaxies, projected by their spatial locations onto the equatorial plane (declination ~ 0 degrees). The remaining panels show estimates of the density of these points using kernel density estimation (with a Gaussian kernel with width 5Mpc), a K-nearest-neighbor estimator (eq. 6.15) optimized for a small-scale structure (with K = 5), and a K-nearest-neighbor estimator optimized for a large-scale structure (with K = 40). 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.colors import LogNorm from scipy.spatial import cKDTree from astroML.datasets import fetch_great_wall from astroML.density_estimation import KDE, KNeighborsDensity #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the great wall data X = fetch_great_wall() #------------------------------------------------------------ # Create the grid on which to evaluate the results Nx = 50 Ny = 125 xmin, xmax = (-375, -175) ymin, ymax = (-300, 200) #------------------------------------------------------------ # Evaluate for several models Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx), np.linspace(ymin, ymax, Ny)))).T kde = KDE(metric='gaussian', h=5) dens_KDE = kde.fit(X).eval(Xgrid).reshape((Ny, Nx)) knn5 = KNeighborsDensity('bayesian', 5) dens_k5 = knn5.fit(X).eval(Xgrid).reshape((Ny, Nx)) knn40 = KNeighborsDensity('bayesian', 40) dens_k40 = knn40.fit(X).eval(Xgrid).reshape((Ny, Nx)) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.2)) fig.subplots_adjust(left=0.12, right=0.95, bottom=0.2, top=0.9, hspace=0.01, wspace=0.01) # First plot: scatter the points ax1 = plt.subplot(221, aspect='equal') ax1.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k') ax1.text(0.95, 0.9, "input", ha='right', va='top', transform=ax1.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) # Second plot: KDE ax2 = plt.subplot(222, aspect='equal') ax2.imshow(dens_KDE.T, origin='lower', norm=LogNorm(), extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary) ax2.text(0.95, 0.9, "KDE: Gaussian $(h=5)$", ha='right', va='top', transform=ax2.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) # Third plot: KNN, k=5 ax3 = plt.subplot(223, aspect='equal') ax3.imshow(dens_k5.T, origin='lower', norm=LogNorm(), extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary) ax3.text(0.95, 0.9, "$k$-neighbors $(k=5)$", ha='right', va='top', transform=ax3.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) # Fourth plot: KNN, k=40 ax4 = plt.subplot(224, aspect='equal') ax4.imshow(dens_k40.T, origin='lower', norm=LogNorm(), extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary) ax4.text(0.95, 0.9, "$k$-neighbors $(k=40)$", ha='right', va='top', transform=ax4.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) for ax in [ax1, ax2, ax3, ax4]: ax.set_xlim(ymin, ymax - 0.01) ax.set_ylim(xmin, xmax) for ax in [ax1, ax2]: ax.xaxis.set_major_formatter(plt.NullFormatter()) for ax in [ax3, ax4]: ax.set_xlabel('$y$ (Mpc)') for ax in [ax2, ax4]: ax.yaxis.set_major_formatter(plt.NullFormatter()) for ax in [ax1, ax3]: ax.set_ylabel('$x$ (Mpc)') plt.show() 
astroML-0.3/book_figures/chapter6/fig_great_wall_GMM.py0000644000076500000240000000631012420767763023734 0ustar jakevdpstaff00000000000000""" Mixture Model of SDSS Great Wall -------------------------------- Figure 6.7 A two-dimensional mixture of 100 Gaussians (bottom) used to estimate the number density distribution of galaxies within the SDSS Great Wall (top). Compare to figures 6.3 and 6.4, where the density for the same distribution is computed using both kernel density and nearest-neighbor-based estimates. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.mixture import GMM from astroML.datasets import fetch_great_wall from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # load great wall data X = fetch_great_wall() #------------------------------------------------------------ # Create a function which will save the results to a pickle file # for large number of clusters, computation will take a long time! @pickle_results('great_wall_GMM.pkl') def compute_GMM(n_clusters, n_iter=1000, min_covar=3, covariance_type='full'): clf = GMM(n_clusters, covariance_type=covariance_type, n_iter=n_iter, min_covar=min_covar, random_state=0) clf.fit(X) print("converged:", clf.converged_) return clf #------------------------------------------------------------ # Compute a grid on which to evaluate the result Nx = 100 Ny = 250 xmin, xmax = (-375, -175) ymin, ymax = (-300, 200) Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx), np.linspace(ymin, ymax, Ny)))).T #------------------------------------------------------------ # Compute the results # # we'll use 100 clusters. In practice, one should cross-validate # with AIC and BIC to settle on the correct number of clusters. clf = compute_GMM(n_clusters=100) log_dens = clf.score(Xgrid).reshape(Ny, Nx) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0, left=0.08, right=0.95, bottom=0.13, top=0.9) ax = fig.add_subplot(211, aspect='equal') ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k') ax.set_xlim(ymin, ymax) ax.set_ylim(xmin, xmax) ax.xaxis.set_major_formatter(plt.NullFormatter()) plt.ylabel(r'$x\ {\rm (Mpc)}$') ax = fig.add_subplot(212, aspect='equal') ax.imshow(np.exp(log_dens.T), origin='lower', cmap=plt.cm.binary, extent=[ymin, ymax, xmin, xmax]) ax.set_xlabel(r'$y\ {\rm (Mpc)}$') ax.set_ylabel(r'$x\ {\rm (Mpc)}$') plt.show() astroML-0.3/book_figures/chapter6/fig_great_wall_KDE.py0000644000076500000240000001245612252721253023712 0ustar jakevdpstaff00000000000000""" Great Wall KDE -------------- Figure 6.3 Kernel density estimation for galaxies within the SDSS "Great Wall." 
The top-left panel shows points that are galaxies, projected by their spatial locations (right ascension and distance determined from redshift measurement) onto the equatorial plane (declination ~ 0 degrees). The remaining panels show estimates of the density of these points using kernel density estimation with a Gaussian kernel (upper right), a top-hat kernel (lower left), and an exponential kernel (lower right). Compare also to figure 6.4. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.colors import LogNorm from scipy.spatial import cKDTree from scipy.stats import gaussian_kde from astroML.datasets import fetch_great_wall # Scikit-learn 0.14 added sklearn.neighbors.KernelDensity, which is a very # fast kernel density estimator based on a KD Tree. We'll use this if # available (and raise a warning if it isn't). try: from sklearn.neighbors import KernelDensity use_sklearn_KDE = True except: import warnings warnings.warn("KDE will be removed in astroML version 0.3. Please " "upgrade to scikit-learn 0.14+ and use " "sklearn.neighbors.KernelDensity.", DeprecationWarning) from astroML.density_estimation import KDE use_sklearn_KDE = False #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
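# Note: below, the bandwidth is passed to KernelDensity positionally,
# e.g. KernelDensity(5, kernel='gaussian'). Recent scikit-learn releases
# expect estimator parameters as keyword arguments, so the more portable
# spelling is:
#
#     kde1 = KernelDensity(bandwidth=5, kernel='gaussian')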
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch the great wall data X = fetch_great_wall() #------------------------------------------------------------ # Create the grid on which to evaluate the results Nx = 50 Ny = 125 xmin, xmax = (-375, -175) ymin, ymax = (-300, 200) #------------------------------------------------------------ # Evaluate for several models Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx), np.linspace(ymin, ymax, Ny)))).T kernels = ['gaussian', 'tophat', 'exponential'] dens = [] if use_sklearn_KDE: kde1 = KernelDensity(5, kernel='gaussian') log_dens1 = kde1.fit(X).score_samples(Xgrid) dens1 = X.shape[0] * np.exp(log_dens1).reshape((Ny, Nx)) kde2 = KernelDensity(5, kernel='tophat') log_dens2 = kde2.fit(X).score_samples(Xgrid) dens2 = X.shape[0] * np.exp(log_dens2).reshape((Ny, Nx)) kde3 = KernelDensity(5, kernel='exponential') log_dens3 = kde3.fit(X).score_samples(Xgrid) dens3 = X.shape[0] * np.exp(log_dens3).reshape((Ny, Nx)) else: kde1 = KDE(metric='gaussian', h=5) dens1 = kde1.fit(X).eval(Xgrid).reshape((Ny, Nx)) kde2 = KDE(metric='tophat', h=5) dens2 = kde2.fit(X).eval(Xgrid).reshape((Ny, Nx)) kde3 = KDE(metric='exponential', h=5) dens3 = kde3.fit(X).eval(Xgrid).reshape((Ny, Nx)) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.2)) fig.subplots_adjust(left=0.12, right=0.95, bottom=0.2, top=0.9, hspace=0.01, wspace=0.01) # First plot: scatter the points ax1 = plt.subplot(221, aspect='equal') ax1.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k') ax1.text(0.95, 0.9, "input", ha='right', va='top', transform=ax1.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) # Second plot: gaussian kernel ax2 = plt.subplot(222, aspect='equal') ax2.imshow(dens1.T, origin='lower', norm=LogNorm(), extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary) ax2.text(0.95, 0.9, "Gaussian $(h=5)$", ha='right', va='top', transform=ax2.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) # Third plot: top-hat kernel ax3 = plt.subplot(223, aspect='equal') ax3.imshow(dens2.T, origin='lower', norm=LogNorm(), extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary) ax3.text(0.95, 0.9, "top-hat $(h=5)$", ha='right', va='top', transform=ax3.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) ax3.images[0].set_clim(0.01, 0.8) # Fourth plot: exponential kernel ax4 = plt.subplot(224, aspect='equal') ax4.imshow(dens3.T, origin='lower', norm=LogNorm(), extent=(ymin, ymax, xmin, xmax), cmap=plt.cm.binary) ax4.text(0.95, 0.9, "exponential $(h=5)$", ha='right', va='top', transform=ax4.transAxes, bbox=dict(boxstyle='round', ec='k', fc='w')) for ax in [ax1, ax2, ax3, ax4]: ax.set_xlim(ymin, ymax - 0.01) ax.set_ylim(xmin, xmax) for ax in [ax1, ax2]: ax.xaxis.set_major_formatter(plt.NullFormatter()) for ax in [ax3, ax4]: ax.set_xlabel('$y$ (Mpc)') for ax in [ax2, ax4]: ax.yaxis.set_major_formatter(plt.NullFormatter()) for ax in [ax1, ax3]: ax.set_ylabel('$x$ (Mpc)') plt.show() astroML-0.3/book_figures/chapter6/fig_great_wall_MST.py0000644000076500000240000001214012420767763023755 0ustar jakevdpstaff00000000000000""" Euclidean Minimum Spanning Tree ------------------------------- Figure 6.15 An approximate Euclidean minimum spanning tree over the two-dimensional projection of the SDSS Great Wall. The upper panel shows the input points, and the middle panel shows the dendrogram connecting them. 
The lower panel shows clustering based on this dendrogram, created by removing the largest 10% of the graph edges, and keeping the remaining connected clusters with 30 or more members. Additional information ~~~~~~~~~~~~~~~~~~~~~~ This figure is based on the data presented in Figure 1 of Cowan & Ivezic (2008). A similar figure appears in the book "Statistics, Data Mining, and Machine Learning in Astronomy", by Ivezic, Connolly, Vanderplas, and Gray (2013). The three panels of this figure show a hierarchical clustering of a subset of galaxies from the Sloan Digital Sky Survey (SDSS). This region is known as the "SDSS Great Wall", and contains an extended cluster of several thousand galaxies approximately 300Mpc (about 1 billion light years) from earth. The top panel shows the positions of over 8,000 galaxies projected to a 2D plane with Earth at the point (0, 0). The middle panel shows a dendrogram representation of a Euclidean Minimum Spanning Tree (MST) over the galaxy locations. By eliminating edges of a MST which are greater than a given length, we can measure the amount of clustering at that scale: this is one version of a class of models known as Hierarchical Clustering. The bottom panel shows the results of this clustering approach for an edge cutoff of 3.5Mpc, along with a Gaussian Mixture Model fit to the distribution within each cluster. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from scipy import sparse from sklearn.mixture import GMM from astroML.clustering import HierarchicalClustering, get_graph_segments from astroML.datasets import fetch_great_wall #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
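# The clustering below keeps the shortest 90% of the approximate-MST edges
# (edge_cutoff=0.9) and then drops clusters smaller than min_cluster_size.
# The trained tree can be re-cut without refitting; a sketch, assuming the
# astroML HierarchicalClustering API used below (cutoff values here are
# illustrative only):
#
#     n_comp, labels, graph = model.compute_clusters(edge_cutoff=0.8,
#                                                    min_cluster_size=50)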
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # get data X = fetch_great_wall() xmin, xmax = (-375, -175) ymin, ymax = (-300, 200) #------------------------------------------------------------ # Compute the MST clustering model n_neighbors = 10 edge_cutoff = 0.9 cluster_cutoff = 10 model = HierarchicalClustering(n_neighbors=10, edge_cutoff=edge_cutoff, min_cluster_size=cluster_cutoff) model.fit(X) print(" scale: %2g Mpc" % np.percentile(model.full_tree_.data, 100 * edge_cutoff)) n_components = model.n_components_ labels = model.labels_ #------------------------------------------------------------ # Get the x, y coordinates of the beginning and end of each line segment T_x, T_y = get_graph_segments(model.X_train_, model.full_tree_) T_trunc_x, T_trunc_y = get_graph_segments(model.X_train_, model.cluster_graph_) #------------------------------------------------------------ # Fit a GMM to each individual cluster Nx = 100 Ny = 250 Xgrid = np.vstack(map(np.ravel, np.meshgrid(np.linspace(xmin, xmax, Nx), np.linspace(ymin, ymax, Ny)))).T density = np.zeros(Xgrid.shape[0]) for i in range(n_components): ind = (labels == i) Npts = ind.sum() Nclusters = min(12, Npts // 5) gmm = GMM(Nclusters, random_state=0).fit(X[ind]) dens = np.exp(gmm.score(Xgrid)) density += dens / dens.max() density = density.reshape((Ny, Nx)) #---------------------------------------------------------------------- # Plot the results fig = plt.figure(figsize=(5, 6)) fig.subplots_adjust(hspace=0, left=0.1, right=0.95, bottom=0.1, top=0.9) ax = fig.add_subplot(311, aspect='equal') ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k') ax.set_xlim(ymin, ymax) ax.set_ylim(xmin, xmax) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('(Mpc)') ax = fig.add_subplot(312, aspect='equal') ax.plot(T_y, T_x, c='k', lw=0.5) ax.set_xlim(ymin, ymax) ax.set_ylim(xmin, xmax) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('(Mpc)') ax = fig.add_subplot(313, aspect='equal') ax.plot(T_trunc_y, T_trunc_x, c='k', lw=0.5) ax.imshow(density.T, origin='lower', cmap=plt.cm.hot_r, extent=[ymin, ymax, xmin, xmax]) ax.set_xlim(ymin, ymax) ax.set_ylim(xmin, xmax) ax.set_xlabel('(Mpc)') ax.set_ylabel('(Mpc)') plt.show() astroML-0.3/book_figures/chapter6/fig_hist_to_kernel.py0000644000076500000240000001124612252721253024113 0ustar jakevdpstaff00000000000000""" Histogram vs Kernel Density Estimation -------------------------------------- Figure 6.1 Density estimation using histograms and kernels. The top panels show two histogram representations of the same data (shown by plus signs in the bottom of each panel) using the same bin width, but with the bin centers of the histograms offset by 0.25. The middle-left panel shows an adaptive histogram where each bin is centered on an individual point and these bins can overlap. This adaptive representation preserves the bimodality of the data. The remaining panels show kernel density estimation using Gaussian kernels with different bandwidths, increasing from the middle-right panel to the bottom-right, and with the largest bandwidth in the bottom-left panel. The trade-off of variance for bias becomes apparent as the bandwidth of the kernels increases. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy import stats #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Draw the random data np.random.seed(1) x = np.concatenate([np.random.normal(-0.5, 0.3, size=14), np.random.normal(1, 0.3, size=7)]) #------------------------------------------------------------ # First figure: silly histogram binning fig1 = plt.figure(figsize=(5, 3)) fig1.subplots_adjust(left=0.12, right=0.95, wspace=0.05, bottom=0.15, top=0.9, hspace=0.05) FC = '#6666FF' XLIM = (-2, 2.9) YLIM = (-0.09, 1.1) ax = fig1.add_subplot(121) bins = np.linspace(-1.8, 2.7, 13) ax.hist(x, bins=bins, normed=True, histtype='stepfilled', fc='k', alpha=0.3) ax.plot(XLIM, [0, 0], '-k', lw=1) ax.plot(x, 0 * x - 0.05, '+k') ax.set_xlim(XLIM) ax.set_ylim(YLIM) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') ax = fig1.add_subplot(122) ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.hist(x, bins=bins + 0.25, normed=True, histtype='stepfilled', fc='k', alpha=0.3) ax.plot(XLIM, [0, 0], '-k', lw=1) ax.plot(x, 0 * x - 0.05, '+k') ax.set_xlim(XLIM) ax.set_ylim(YLIM) ax.set_xlabel('$x$') #------------------------------------------------------------ # First figure: transition to KDE fig2 = plt.figure(figsize=(5, 5)) fig2.subplots_adjust(left=0.12, right=0.95, wspace=0.05, bottom=0.1, top=0.95, hspace=0.05) ax = fig2.add_subplot(221) ax.xaxis.set_major_formatter(plt.NullFormatter()) binwidth = bins[1] - bins[0] x_plot = np.linspace(-4, 4, 1000) y_plot = (abs(x_plot - x[:, None]) <= 0.5 * binwidth).astype(float) y_plot /= (binwidth * len(x)) ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3) ax.plot(x_plot, y_plot.T, '-k', lw=1) ax.plot(x, 0 * x - 0.05, '+k') ax.set_xlim(XLIM) ax.set_ylim(YLIM) ax.set_ylabel('$p(x)$') ax = fig2.add_subplot(222) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.yaxis.set_major_formatter(plt.NullFormatter()) binwidth = bins[1] - bins[0] x_plot = np.linspace(-4, 4, 1000) y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.1) y_plot /= (binwidth * len(x)) ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3) ax.plot(x_plot, y_plot.T, '-k', lw=1) ax.plot(x, 0 * x - 0.05, '+k') ax.set_xlim(XLIM) ax.set_ylim(YLIM) ax = fig2.add_subplot(223) binwidth = bins[1] - bins[0] x_plot = np.linspace(-4, 4, 1000) y_plot = binwidth * stats.norm.pdf(x_plot, x[:, None], 0.7) y_plot /= (binwidth * len(x)) ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3) ax.plot(x_plot, 4 * y_plot.T, '-k', lw=1) ax.plot(x, 0 * x - 0.05, '+k') ax.set_xlim(XLIM) ax.set_ylim(YLIM) ax.set_ylabel('$p(x)$') ax.set_xlabel('$x$') ax = fig2.add_subplot(224) ax.yaxis.set_major_formatter(plt.NullFormatter()) binwidth = bins[1] - bins[0] x_plot = np.linspace(-4, 4, 1000) y_plot = binwidth * 
stats.norm.pdf(x_plot, x[:, None], 0.2) y_plot /= (binwidth * len(x)) ax.fill(x_plot, y_plot.sum(0), ec='k', lw=1, fc='k', alpha=0.3) ax.plot(x_plot, y_plot.T, '-k', lw=1) ax.plot(x, 0 * x - 0.05, '+k') ax.set_xlim(XLIM) ax.set_ylim(YLIM) ax.set_xlabel('$x$') plt.show() astroML-0.3/book_figures/chapter6/fig_kernels.py0000644000076500000240000000335212252721253022544 0ustar jakevdpstaff00000000000000""" Example Kernels --------------- Figure 6.2 A comparison of the three kernels used for density estimation in figure 6.3: the Gaussian kernel (eq. 6.2), the top-hat kernel (eq. 6.3), and the exponential kernel (eq. 6.4). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Compute Kernels. x = np.linspace(-5, 5, 10000) dx = x[1] - x[0] gauss = (1. / np.sqrt(2 * np.pi)) * np.exp(-0.5 * x ** 2) exp = 0.5 * np.exp(-abs(x)) tophat = 0.5 * np.ones_like(x) tophat[abs(x) > 1] = 0 #------------------------------------------------------------ # Plot the kernels fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) ax.plot(x, gauss, '-', c='black', lw=3, label='Gaussian') ax.plot(x, exp, '-', c='#666666', lw=2, label='Exponential') ax.plot(x, tophat, '-', c='#999999', lw=1, label='Top-hat') ax.legend(loc=1) ax.set_xlabel('$u$') ax.set_ylabel('$K(u)$') ax.set_xlim(-5, 5) ax.set_ylim(0, 0.6001) plt.show() astroML-0.3/book_figures/chapter6/fig_kmeans_metallicity.py0000644000076500000240000000620012420577220024752 0ustar jakevdpstaff00000000000000""" EM example: K-means ------------------- Figure 6.13 The K-means analysis of the stellar metallicity data used in figure 6.6. Note how the background distribution "pulls" the cluster centers away from the locus where one would place them by eye. This is why more sophisticated models like GMM are often better in practice. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Ellipse from scipy.stats import norm from sklearn.cluster import KMeans from sklearn import preprocessing from astroML.datasets import fetch_sdss_sspp #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
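# K-means assigns points by Euclidean distance, so the two abundance axes
# are standardized to comparable scales before clustering; the fitted
# cluster centers are then mapped back to the original units. A minimal
# sketch of that round trip (n_clusters=4, as below):
#
#     scaler = preprocessing.StandardScaler().fit(X)
#     clf = KMeans(n_clusters=4).fit(scaler.transform(X))
#     centers = scaler.inverse_transform(clf.cluster_centers_)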
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get data data = fetch_sdss_sspp(cleaned=True) X = np.vstack([data['FeH'], data['alphFe']]).T # truncate dataset for speed X = X[::5] #------------------------------------------------------------ # Compute a 2D histogram of the input H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 50) #------------------------------------------------------------ # Compute the KMeans clustering n_clusters = 4 scaler = preprocessing.StandardScaler() clf = KMeans(n_clusters) clf.fit(scaler.fit_transform(X)) #------------------------------------------------------------ # Visualize the results fig = plt.figure(figsize=(5, 5)) # plot density ax = fig.add_subplot(111) ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto', extent=[FeH_bins[0], FeH_bins[-1], alphFe_bins[0], alphFe_bins[-1]], cmap=plt.cm.binary) # plot cluster centers cluster_centers = scaler.inverse_transform(clf.cluster_centers_) ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], s=40, c='w', edgecolors='k') # plot cluster boundaries FeH_centers = 0.5 * (FeH_bins[1:] + FeH_bins[:-1]) alphFe_centers = 0.5 * (alphFe_bins[1:] + alphFe_bins[:-1]) Xgrid = np.meshgrid(FeH_centers, alphFe_centers) Xgrid = np.array(Xgrid).reshape((2, 50 * 50)).T H = clf.predict(scaler.transform(Xgrid)).reshape((50, 50)) for i in range(n_clusters): Hcp = H.copy() flag = (Hcp == i) Hcp[flag] = 1 Hcp[~flag] = 0 ax.contour(FeH_centers, alphFe_centers, Hcp, [-0.5, 0.5], linewidths=1, colors='k') ax.xaxis.set_major_locator(plt.MultipleLocator(0.3)) ax.set_xlim(-1.101, 0.101) ax.set_ylim(alphFe_bins[0], alphFe_bins[-1]) ax.set_xlabel(r'$\rm [Fe/H]$') ax.set_ylabel(r'$\rm [\alpha/Fe]$') plt.show() astroML-0.3/book_figures/chapter6/fig_meanshift_metallicity.py0000644000076500000240000000671112420767763025466 0ustar jakevdpstaff00000000000000""" Mean Shift Example ------------------ Figure 6.14 Mean-shift clustering on the metallicity dataset used in figures 6.6 and 6.13. The method finds two clusters associated with local maxima of the distribution (interior of the circles). Points outside the circles have been determined to lie in the background. The mean shift does not attempt to model correlation in the clusters: that is, the resulting clusters are axis aligned. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Ellipse from scipy.stats import norm from sklearn.cluster import MeanShift, estimate_bandwidth from sklearn import preprocessing from astroML.datasets import fetch_sdss_sspp #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
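# The mean-shift bandwidth is hard-coded below (bandwidth = 0.4 in scaled
# units) to avoid the memory and CPU cost of estimate_bandwidth on the
# full sample. A sketch of the automatic estimate on a subsample (the
# quantile and n_samples values here are illustrative, not from the text):
#
#     bandwidth = estimate_bandwidth(X_scaled, quantile=0.2, n_samples=500)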
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get the data np.random.seed(0) data = fetch_sdss_sspp(cleaned=True) # cut out some additional strange outliers data = data[~((data['alphFe'] > 0.4) & (data['FeH'] > -0.3))] X = np.vstack([data['FeH'], data['alphFe']]).T #---------------------------------------------------------------------- # Compute clustering with MeanShift # # We'll work with the scaled data, because MeanShift finds circular clusters X_scaled = preprocessing.scale(X) # The following bandwidth can be automatically detected using # the routine estimate_bandwidth(). Because bandwidth estimation # is very expensive in memory and computation, we'll skip it here. #bandwidth = estimate_bandwidth(X) bandwidth = 0.4 ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False) ms.fit(X_scaled) labels_unique = np.unique(ms.labels_) n_clusters = len(labels_unique[labels_unique >= 0]) print(labels_unique) print(bandwidth) print("number of estimated clusters :", n_clusters) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) ax = fig.add_subplot(111) # plot density H, FeH_bins, alphFe_bins = np.histogram2d(data['FeH'], data['alphFe'], 51) ax.imshow(H.T, origin='lower', interpolation='nearest', aspect='auto', extent=[FeH_bins[0], FeH_bins[-1], alphFe_bins[0], alphFe_bins[-1]], cmap=plt.cm.binary) # plot clusters colors = ['b', 'g', 'r', 'k'] for i in range(n_clusters): Xi = X[ms.labels_ == i] H, b1, b2 = np.histogram2d(Xi[:, 0], Xi[:, 1], (FeH_bins, alphFe_bins)) bins = [0.1] ax.contour(0.5 * (FeH_bins[1:] + FeH_bins[:-1]), 0.5 * (alphFe_bins[1:] + alphFe_bins[:-1]), H.T, bins, colors='w') ax.xaxis.set_major_locator(plt.MultipleLocator(0.3)) ax.set_xlim(-1.101, 0.101) ax.set_ylim(alphFe_bins[0], alphFe_bins[-1]) ax.set_xlabel(r'$\rm [Fe/H]$') ax.set_ylabel(r'$\rm [\alpha/Fe]$') plt.show() astroML-0.3/book_figures/chapter6/fig_stellar_XD.py0000644000076500000240000002005112420767763023152 0ustar jakevdpstaff00000000000000""" Extreme Deconvolution of Stellar Data ------------------------------------- Figure 6.12 Extreme deconvolution applied to stellar data from SDSS Stripe 82. The top panels compare the color distributions for a high signal-to-noise sample of standard stars (left) with lower signal-to-noise, single epoch, data (right). The middle panels show the results of applying extreme deconvolution to the single epoch data. The bottom panel compares the distributions of a color measured perpendicularly to the locus (the so-called w color is defined following Ivezic et al 2004). The distribution of colors from the extreme deconvolution of the noisy data recovers the tight distribution of the high signal-to-noise data. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from astroML.density_estimation import XDGMM from astroML.crossmatch import crossmatch from astroML.datasets import fetch_sdss_S82standards, fetch_imaging_sample from astroML.plotting.tools import draw_ellipse from astroML.decorators import pickle_results from astroML.stats import sigmaG #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # define u-g-r-i-z extinction from Berry et al, arXiv 1111.4985 # multiply extinction by A_r extinction_vector = np.array([1.810, 1.400, 1.0, 0.759, 0.561]) #---------------------------------------------------------------------- # Fetch and process the noisy imaging data data_noisy = fetch_imaging_sample() # select only stars data_noisy = data_noisy[data_noisy['type'] == 6] # Get the extinction-corrected magnitudes for each band X = np.vstack([data_noisy[f + 'RawPSF'] for f in 'ugriz']).T Xerr = np.vstack([data_noisy[f + 'psfErr'] for f in 'ugriz']).T # extinction terms from Berry et al, arXiv 1111.4985 X -= (extinction_vector * data_noisy['rExtSFD'][:, None]) #---------------------------------------------------------------------- # Fetch and process the stacked imaging data data_stacked = fetch_sdss_S82standards() # cut to RA, DEC range of imaging sample RA = data_stacked['RA'] DEC = data_stacked['DEC'] data_stacked = data_stacked[(RA > 0) & (RA < 10) & (DEC > -1) & (DEC < 1)] # get stacked magnitudes for each band Y = np.vstack([data_stacked['mmu_' + f] for f in 'ugriz']).T Yerr = np.vstack([data_stacked['msig_' + f] for f in 'ugriz']).T # extinction terms from Berry et al, arXiv 1111.4985 Y -= (extinction_vector * data_stacked['A_r'][:, None]) # quality cuts g = Y[:, 1] mask = ((Yerr.max(1) < 0.05) & (g < 20)) data_stacked = data_stacked[mask] Y = Y[mask] Yerr = Yerr[mask] #---------------------------------------------------------------------- # cross-match # the imaging sample contains both standard and variable stars. We'll # perform a cross-match with the standard star catalog and choose objects # which are common to both. Xlocs = np.hstack((data_noisy['ra'][:, np.newaxis], data_noisy['dec'][:, np.newaxis])) Ylocs = np.hstack((data_stacked['RA'][:, np.newaxis], data_stacked['DEC'][:, np.newaxis])) print("number of noisy points: ", Xlocs.shape) print("number of stacked points:", Ylocs.shape) # find all points within 0.9 arcsec. This cutoff was selected # by plotting a histogram of the log(distances). 
dist, ind = crossmatch(Xlocs, Ylocs, max_distance=0.9 / 3600) noisy_mask = (~np.isinf(dist)) stacked_mask = ind[noisy_mask] # select the data data_noisy = data_noisy[noisy_mask] X = X[noisy_mask] Xerr = Xerr[noisy_mask] data_stacked = data_stacked[stacked_mask] Y = Y[stacked_mask] Yerr = Yerr[stacked_mask] # double-check that our cross-match succeeded assert X.shape == Y.shape print("size after crossmatch:", X.shape) #---------------------------------------------------------------------- # perform extreme deconvolution on the noisy sample # first define mixing matrix W W = np.array([[0, 1, 0, 0, 0], # g magnitude [1, -1, 0, 0, 0], # u-g color [0, 1, -1, 0, 0], # g-r color [0, 0, 1, -1, 0], # r-i color [0, 0, 0, 1, -1]]) # i-z color X = np.dot(X, W.T) Y = np.dot(Y, W.T) # compute error covariance from mixing matrix Xcov = np.zeros(Xerr.shape + Xerr.shape[-1:]) Xcov[:, range(Xerr.shape[1]), range(Xerr.shape[1])] = Xerr ** 2 # each covariance C = WCW^T # best way to do this is with a tensor dot-product Xcov = np.tensordot(np.dot(Xcov, W.T), W, (-2, -1)) #---------------------------------------------------------------------- # This is a long calculation: save results to file @pickle_results("XD_stellar.pkl") def compute_XD(n_clusters=12, rseed=0, n_iter=100, verbose=True): np.random.seed(rseed) clf = XDGMM(n_clusters, n_iter=n_iter, tol=1E-5, verbose=verbose) clf.fit(X, Xcov) return clf clf = compute_XD(12) #------------------------------------------------------------ # Fit and sample from the underlying distribution np.random.seed(42) X_sample = clf.sample(X.shape[0]) #------------------------------------------------------------ # plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(left=0.12, right=0.95, bottom=0.1, top=0.95, wspace=0.02, hspace=0.02) # only plot 1/10 of the stars for clarity ax1 = fig.add_subplot(221) ax1.scatter(Y[::10, 2], Y[::10, 3], s=9, lw=0, c='k') ax2 = fig.add_subplot(222) ax2.scatter(X[::10, 2], X[::10, 3], s=9, lw=0, c='k') ax3 = fig.add_subplot(223) ax3.scatter(X_sample[::10, 2], X_sample[::10, 3], s=9, lw=0, c='k') ax4 = fig.add_subplot(224) for i in range(clf.n_components): draw_ellipse(clf.mu[i, 2:4], clf.V[i, 2:4, 2:4], scales=[2], ec='k', fc='gray', alpha=0.2, ax=ax4) titles = ["Standard Stars", "Single Epoch", "Extreme Deconvolution\n resampling", "Extreme Deconvolution\n cluster locations"] ax = [ax1, ax2, ax3, ax4] for i in range(4): ax[i].set_xlim(-0.6, 1.8) ax[i].set_ylim(-0.6, 1.8) ax[i].xaxis.set_major_locator(plt.MultipleLocator(0.5)) ax[i].yaxis.set_major_locator(plt.MultipleLocator(0.5)) ax[i].text(0.05, 0.95, titles[i], ha='left', va='top', transform=ax[i].transAxes) if i in (0, 1): ax[i].xaxis.set_major_formatter(plt.NullFormatter()) else: ax[i].set_xlabel('$g-r$') if i in (1, 3): ax[i].yaxis.set_major_formatter(plt.NullFormatter()) else: ax[i].set_ylabel('$r-i$') #------------------------------------------------------------ # Second figure: the width of the locus fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) labels = ['single epoch', 'standard stars', 'XD resampled'] linestyles = ['solid', 'dashed', 'dotted'] for data, label, ls in zip((X, Y, X_sample), labels, linestyles): g = data[:, 0] gr = data[:, 2] ri = data[:, 3] r = g - gr i = r - ri mask = (gr > 0.3) & (gr < 1.0) g = g[mask] r = r[mask] i = i[mask] w = -0.227 * g + 0.792 * r - 0.567 * i + 0.05 sigma = sigmaG(w) ax.hist(w, bins=np.linspace(-0.08, 0.08, 100), linestyle=ls, histtype='step', label=label + '\n\t' + r'$\sigma_G=%.3f$' % sigma, normed=True) 
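# sigmaG above is the rank-based width estimate
#     sigma_G = 0.7413 * (q75 - q25),
# i.e. the interquartile range scaled to equal sigma for a Gaussian.
# A minimal numpy sketch of the same quantity:
#
#     q25, q75 = np.percentile(w, [25, 75])
#     sigma_G = 0.7413 * (q75 - q25)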
ax.legend(loc=2) ax.text(0.95, 0.95, '$w = -0.227g + 0.792r$\n$ - 0.567i + 0.05$', transform=ax.transAxes, ha='right', va='top') ax.set_xlim(-0.07, 0.07) ax.set_ylim(0, 55) ax.set_xlabel('$w$') ax.set_ylabel('$N(w)$') plt.show() astroML-0.3/book_figures/chapter6/fig_XD_example.py0000644000076500000240000000751612252721253023135 0ustar jakevdpstaff00000000000000""" Extreme Deconvolution example ----------------------------- Figure 6.11 An example of extreme deconvolution showing a simulated two-dimensional distribution of points, where the positions are subject to errors. The top two panels show the distributions with small (left) and large (right) errors. The bottom panels show the densities derived from the noisy sample (top-right panel) using extreme deconvolution; the resulting distribution closely matches that shown in the top-left panel. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.decorators import pickle_results from astroML.density_estimation import XDGMM from astroML.plotting.tools import draw_ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Sample the dataset N = 2000 np.random.seed(0) # generate the true data x_true = (1.4 + 2 * np.random.random(N)) ** 2 y_true = 0.1 * x_true ** 2 # add scatter to "true" distribution dx = 0.1 + 4. / x_true ** 2 dy = 0.1 + 10. 
/ x_true ** 2 x_true += np.random.normal(0, dx, N) y_true += np.random.normal(0, dy, N) # add noise to get the "observed" distribution dx = 0.2 + 0.5 * np.random.random(N) dy = 0.2 + 0.5 * np.random.random(N) x = x_true + np.random.normal(0, dx) y = y_true + np.random.normal(0, dy) # stack the results for computation X = np.vstack([x, y]).T Xerr = np.zeros(X.shape + X.shape[-1:]) diag = np.arange(X.shape[-1]) Xerr[:, diag, diag] = np.vstack([dx ** 2, dy ** 2]).T #------------------------------------------------------------ # compute and save results @pickle_results("XD_toy.pkl") def compute_XD_results(n_components=10, n_iter=500): clf = XDGMM(n_components, n_iter=n_iter) clf.fit(X, Xerr) return clf clf = compute_XD_results(10, 500) sample = clf.sample(N) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95, wspace=0.02, hspace=0.02) ax1 = fig.add_subplot(221) ax1.scatter(x_true, y_true, s=4, lw=0, c='k') ax2 = fig.add_subplot(222) ax2.scatter(x, y, s=4, lw=0, c='k') ax3 = fig.add_subplot(223) ax3.scatter(sample[:, 0], sample[:, 1], s=4, lw=0, c='k') ax4 = fig.add_subplot(224) for i in range(clf.n_components): draw_ellipse(clf.mu[i], clf.V[i], scales=[2], ax=ax4, ec='k', fc='gray', alpha=0.2) titles = ["True Distribution", "Noisy Distribution", "Extreme Deconvolution\n resampling", "Extreme Deconvolution\n cluster locations"] ax = [ax1, ax2, ax3, ax4] for i in range(4): ax[i].set_xlim(-1, 13) ax[i].set_ylim(-6, 16) ax[i].xaxis.set_major_locator(plt.MultipleLocator(4)) ax[i].yaxis.set_major_locator(plt.MultipleLocator(5)) ax[i].text(0.05, 0.95, titles[i], ha='left', va='top', transform=ax[i].transAxes) if i in (0, 1): ax[i].xaxis.set_major_formatter(plt.NullFormatter()) else: ax[i].set_xlabel('$x$') if i in (1, 3): ax[i].yaxis.set_major_formatter(plt.NullFormatter()) else: ax[i].set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter6/README.rst0000644000076500000240000000033612115147567021400 0ustar jakevdpstaff00000000000000Chapter 6: Searching for Structure in Point Data ------------------------------------------------ This chapter covers high-dimensional point statistics, including density estimation, clustering, and correlation functions. astroML-0.3/book_figures/chapter7/0000755000076500000240000000000012462244012017674 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter7/fig_eigenvalues.py0000644000076500000240000000474612252721253023421 0ustar jakevdpstaff00000000000000""" SDSS Eigenvalues ---------------- Figure 7.5 The eigenvalues for the PCA decomposition of the SDSS spectra described in Section 7.3.2. The top panel shows the decrease in eigenvalue as a function of the number of eigenvectors, with a break in the distribution at ten eigenvectors. The lower panel shows the cumulative sum of eigenvalues normalized to unity. 94% of the variance in the SDSS spectra can be captured using the first ten eigenvectors. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML import datasets #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # load data: data = datasets.fetch_sdss_corrected_spectra() spectra = datasets.sdss_corrected_spectra.reconstruct_spectra(data) # Eigenvalues can be computed using PCA as in the commented code below: #from sklearn.decomposition import PCA #pca = PCA() #pca.fit(spectra) #evals = pca.explained_variance_ratio_ #evals_cs = evals.cumsum() # because the spectra have been reconstructed from masked values, this # is not exactly correct in this case: we'll use the values computed # in the file compute_sdss_pca.py evals = data['evals'] ** 2 evals_cs = evals.cumsum() evals_cs /= evals_cs[-1] #------------------------------------------------------------ # plot the eigenvalues fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.05, bottom=0.12) ax = fig.add_subplot(211, xscale='log', yscale='log') ax.grid() ax.plot(evals, c='k') ax.set_ylabel('Normalized Eigenvalues') ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylim(5E-4, 100) ax = fig.add_subplot(212, xscale='log') ax.grid() ax.semilogx(evals_cs, color='k') ax.set_xlabel('Eigenvalue Number') ax.set_ylabel('Cumulative Eigenvalues') ax.set_ylim(0.65, 1.00) plt.show() astroML-0.3/book_figures/chapter7/fig_PCA_LLE.py0000644000076500000240000001114012420767763022210 0ustar jakevdpstaff00000000000000""" PCA Projection of SDSS Spectra ------------------------------ Figure 7.9 A comparison of the classification of quiescent galaxies and sources with strong line emission using LLE and PCA. The top panel shows the segregation of galaxy types as a function of the first three PCA components. The lower panel shows the segregation using the first three LLE dimensions. The preservation of locality in LLE enables nonlinear features within a spectrum (e.g., variation in the width of an emission line) to be captured with fewer components. This results in better segregation of spectral types with fewer dimensions. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn import manifold, neighbors from astroML.datasets import sdss_corrected_spectra from astroML.datasets import fetch_sdss_corrected_spectra from astroML.plotting.tools import discretize_cmap from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up color-map properties clim = (1.5, 6.5) cmap = discretize_cmap(plt.cm.jet, 5) cdict = ['unknown', 'star', 'absorption galaxy', 'galaxy', 'emission galaxy', 'narrow-line QSO', 'broad-line QSO'] cticks = [2, 3, 4, 5, 6] formatter = plt.FuncFormatter(lambda t, *args: cdict[int(np.round(t))]) #------------------------------------------------------------ # Fetch the data; PCA coefficients have been pre-computed data = fetch_sdss_corrected_spectra() coeffs_PCA = data['coeffs'] c_PCA = data['lineindex_cln'] spec = sdss_corrected_spectra.reconstruct_spectra(data) color = data['lineindex_cln'] #------------------------------------------------------------ # Compute the LLE projection; save the results @pickle_results("spec_LLE.pkl") def compute_spec_LLE(n_neighbors=10, out_dim=3): # Compute the LLE projection LLE = manifold.LocallyLinearEmbedding(n_neighbors, out_dim, method='modified', eigen_solver='dense') Y_LLE = LLE.fit_transform(spec) print(" - finished LLE projection") # remove outliers for the plot BT = neighbors.BallTree(Y_LLE) dist, ind = BT.query(Y_LLE, n_neighbors) dist_to_n = dist[:, -1] dist_to_n -= dist_to_n.mean() std = np.std(dist_to_n) flag = (dist_to_n > 0.25 * std) print(" - removing {0} outliers for plot".format(flag.sum())) return Y_LLE[~flag], color[~flag] coeffs_LLE, c_LLE = compute_spec_LLE(10, 3) #---------------------------------------------------------------------- # Plot the results: for (c, coeffs, xlim) in zip([c_PCA, c_LLE], [coeffs_PCA, coeffs_LLE], [(-1.2, 1.0), (-0.01, 0.014)]): fig = plt.figure(figsize=(5, 3.75)) fig.subplots_adjust(hspace=0.05, wspace=0.05) # axes for colorbar cax = plt.axes([0.525, 0.525, 0.02, 0.35]) # Create scatter-plots scatter_kwargs = dict(s=4, lw=0, edgecolors='none', c=c, cmap=cmap) ax1 = plt.subplot(221) im1 = ax1.scatter(coeffs[:, 0], coeffs[:, 1], **scatter_kwargs) im1.set_clim(clim) ax1.set_ylabel('$c_2$') ax2 = plt.subplot(223) im2 = ax2.scatter(coeffs[:, 0], coeffs[:, 2], **scatter_kwargs) im2.set_clim(clim) ax2.set_xlabel('$c_1$') ax2.set_ylabel('$c_3$') ax3 = plt.subplot(224) im3 = ax3.scatter(coeffs[:, 1], coeffs[:, 2], **scatter_kwargs) im3.set_clim(clim) ax3.set_xlabel('$c_2$') fig.colorbar(im3, ax=ax3, cax=cax, ticks=cticks, format=formatter) ax1.xaxis.set_major_formatter(plt.NullFormatter()) ax3.yaxis.set_major_formatter(plt.NullFormatter()) ax1.set_xlim(xlim) 
ax2.set_xlim(xlim) for ax in (ax1, ax2, ax3): ax.xaxis.set_major_locator(plt.MaxNLocator(5)) ax.yaxis.set_major_locator(plt.MaxNLocator(5)) plt.show() astroML-0.3/book_figures/chapter7/fig_PCA_reconstruction.py0000644000076500000240000000646212252721253024653 0ustar jakevdpstaff00000000000000""" SDSS Reconstruction from Eigenspectra ------------------------------------- Figure 7.7 The principal component vectors defined for the SDSS spectra can be used to interpolate across or reconstruct missing data. Examples of three masked spectral regions are shown comparing the reconstruction of the input spectrum (black line) using the mean and the first ten eigenspectra (blue line). The gray bands represent the masked region of the spectrum. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib import ticker from astroML.datasets import fetch_sdss_corrected_spectra from astroML.datasets import sdss_corrected_spectra #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get spectra and eigenvectors used to reconstruct them data = fetch_sdss_corrected_spectra() spec = sdss_corrected_spectra.reconstruct_spectra(data) lam = sdss_corrected_spectra.compute_wavelengths(data) evecs = data['evecs'] mu = data['mu'] norms = data['norms'] mask = data['mask'] #------------------------------------------------------------ # plot the results i_plot = ((lam > 5750) & (lam < 6350)) lam = lam[i_plot] specnums = [20, 8, 9] subplots = [311, 312, 313] fig = plt.figure(figsize=(5, 6.25)) fig.subplots_adjust(left=0.09, bottom=0.08, hspace=0, right=0.92, top=0.95) for subplot, i in zip(subplots, specnums): ax = fig.add_subplot(subplot) # compute eigen-coefficients spec_i_centered = spec[i] / norms[i] - mu coeffs = np.dot(spec_i_centered, evecs.T) # blank out masked regions spec_i = spec[i] mask_i = mask[i] spec_i[mask_i] = np.nan # plot the raw masked spectrum ax.plot(lam, spec_i[i_plot], '-', color='k', label='True spectrum', lw=1.5) # plot the reconstruction for each chosen number of eigenspectra for nev in [10]: if nev == 0: label = 'mean' else: label = 'reconstruction\n(nev=%i)' % nev spec_i_recons = norms[i] * (mu + np.dot(coeffs[:nev], evecs[:nev])) ax.plot(lam, spec_i_recons[i_plot], label=label, color='grey') # plot shaded background in masked region ylim = ax.get_ylim() mask_shade = ylim[0] + mask[i][i_plot].astype(float) * ylim[1] plt.fill(np.concatenate([lam[:1], lam, lam[-1:]]), np.concatenate([[ylim[0]], mask_shade, [ylim[0]]]), lw=0, fc='k', alpha=0.2) ax.set_xlim(lam[0], lam[-1]) ax.set_ylim(ylim) ax.yaxis.set_major_formatter(ticker.NullFormatter()) if subplot == 311: ax.legend(loc=1) ax.set_xlabel(r'$\lambda\ (\AA)$') ax.set_ylabel('normalized flux') plt.show() astroML-0.3/book_figures/chapter7/fig_PCA_rotation.py0000644000076500000240000000567012252721253023431
0ustar jakevdpstaff00000000000000""" Schematic Diagram of PCA ------------------------ Figure 7.2 A distribution of points drawn from a bivariate Gaussian and centered on the origin of x and y. PCA defines a rotation such that the new axes (x' and y') are aligned along the directions of maximal variance (the principal components) with zero covariance. This is equivalent to minimizing the square of the perpendicular distances between the points and the principal components. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set parameters and draw the random sample np.random.seed(42) r = 0.9 sigma1 = 0.25 sigma2 = 0.08 rotation = np.pi / 6 s = np.sin(rotation) c = np.cos(rotation) X = np.random.normal(0, [sigma1, sigma2], size=(100, 2)).T R = np.array([[c, -s], [s, c]]) X = np.dot(R, X) #------------------------------------------------------------ # Plot the diagram fig = plt.figure(figsize=(5, 5), facecolor='w') ax = plt.axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False) # draw axes ax.annotate(r'$x$', (-r, 0), (r, 0), ha='center', va='center', arrowprops=dict(arrowstyle='<->', color='k', lw=1)) ax.annotate(r'$y$', (0, -r), (0, r), ha='center', va='center', arrowprops=dict(arrowstyle='<->', color='k', lw=1)) # draw rotated axes ax.annotate(r'$x^\prime$', (-r * c, -r * s), (r * c, r * s), ha='center', va='center', arrowprops=dict(color='k', arrowstyle='<->', lw=1)) ax.annotate(r'$y^\prime$', (r * s, -r * c), (-r * s, r * c), ha='center', va='center', arrowprops=dict(color='k', arrowstyle='<->', lw=1)) # scatter points ax.scatter(X[0], X[1], s=25, lw=0, c='k', zorder=2) # draw lines vnorm = np.array([s, -c]) for v in (X.T): d = np.dot(v, vnorm) v1 = v - d * vnorm ax.plot([v[0], v1[0]], [v[1], v1[1]], '-k') # draw ellipses for sigma in (1, 2, 3): ax.add_patch(Ellipse((0, 0), 2 * sigma * sigma1, 2 * sigma * sigma2, rotation * 180. / np.pi, ec='k', fc='gray', alpha=0.2, zorder=1)) ax.set_xlim(-1, 1) ax.set_ylim(-1, 1) plt.show() astroML-0.3/book_figures/chapter7/fig_S_manifold_PCA.py0000644000076500000240000001031212420577220023632 0ustar jakevdpstaff00000000000000""" Comparison of PCA and Manifold Learning --------------------------------------- Figure 7.8 A comparison of PCA and manifold learning. The top-left panel shows an example S-shaped data set (a two-dimensional manifold in a three-dimensional space). PCA identifies three principal components within the data. Projection onto the first two PCA components results in a mixing of the colors along the manifold. Manifold learning (LLE and IsoMap) preserves the local structure when projecting the data, preventing the mixing of the colors.
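A minimal sketch of the three projections compared below, mirroring the calls in this script::

    from sklearn import datasets, decomposition, manifold
    X, color = datasets.samples_generator.make_s_curve(1000)
    Y_pca = decomposition.PCA(2).fit_transform(X)    # linear projection
    Y_lle = manifold.LocallyLinearEmbedding(
        10, 2, method='modified', eigen_solver='dense').fit_transform(X)
    Y_iso = manifold.Isomap(10, 2).fit_transform(X)  # geodesic-based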
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt import matplotlib from matplotlib import ticker from sklearn import manifold, datasets, decomposition #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # generate the S-curve dataset np.random.seed(0) n_points = 1100 n_neighbors = 10 out_dim = 2 X, color = datasets.samples_generator.make_s_curve(n_points) # change the proportions to emphasize the weakness of PCA X[:, 1] -= 1 X[:, 1] *= 1.5 X[:, 2] *= 0.5 #------------------------------------------------------------ # Compute the projections pca = decomposition.PCA(out_dim) Y_pca = pca.fit_transform(X) lle = manifold.LocallyLinearEmbedding(n_neighbors, out_dim, method='modified', random_state=0, eigen_solver='dense') Y_lle = lle.fit_transform(X) iso = manifold.Isomap(n_neighbors, out_dim) Y_iso = iso.fit_transform(X) #------------------------------------------------------------ # plot the 3D dataset fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.05, right=0.95, bottom=0.05, top=0.9) try: # matplotlib 1.0+ has a toolkit for generating 3D plots from mpl_toolkits.mplot3d import Axes3D ax1 = fig.add_subplot(221, projection='3d', xticks=[], yticks=[], zticks=[]) ax1.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.jet, s=9, lw=0) ax1.view_init(11, -73) except: # In older versions, we'll have to wing it with a 2D plot ax1 = fig.add_subplot(221) # Create a projection to mimic 3D scatter-plot X_proj = X / (X.max(0) - X.min(0)) X_proj -= X_proj.mean(0) R = np.array([[0.5, 0.0], [0.1, 0.1], [0.0, 0.5]]) R /= np.sqrt(np.sum(R ** 2, 0)) X_proj = np.dot(X_proj, R) # change line width with depth lw = X[:, 1].copy() lw -= lw.min() lw /= lw.max() lw = 1 - lw ax1.scatter(X_proj[:, 0], X_proj[:, 1], c=color, cmap=plt.cm.jet, s=9, lw=lw, zorder=10) # draw the shaded axes ax1.fill([-0.7, -0.3, -0.3, -0.7, -0.7], [-0.7, -0.3, 0.7, 0.3, -0.7], ec='k', fc='#DDDDDD', zorder=0) ax1.fill([-0.3, 0.7, 0.7, -0.3, -0.3], [-0.3, -0.3, 0.7, 0.7, -0.3], ec='k', fc='#DDDDDD', zorder=0) ax1.fill([-0.7, 0.3, 0.7, -0.3, -0.7], [-0.7, -0.7, -0.3, -0.3, -0.7], ec='k', fc='#DDDDDD', zorder=0) ax1.xaxis.set_major_locator(ticker.NullLocator()) ax1.yaxis.set_major_locator(ticker.NullLocator()) #------------------------------------------------------------ # Plot the projections subplots = [222, 223, 224] titles = ['PCA projection', 'LLE projection', 'IsoMap projection'] Yvals = [Y_pca, Y_lle, Y_iso] for (Y, title, subplot) in zip(Yvals, titles, subplots): ax = fig.add_subplot(subplot) ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.jet, s=9, lw=0) ax.set_title(title) ax.set_xticks([]) ax.set_yticks([]) plt.show() astroML-0.3/book_figures/chapter7/fig_spec_decompositions.py0000644000076500000240000001110112420767763025157 0ustar 
jakevdpstaff00000000000000""" SDSS spectra Decompositions --------------------------- Figure 7.4 A comparison of the decomposition of SDSS spectra using PCA (left panel - see Section 7.3.1), ICA (middle panel - see Section 7.6) and NMF (right panel - see Section 7.4). The rank of the component increases from top to bottom. For the ICA and PCA the first component is the mean spectrum (NMF does not require mean subtraction). All of these techniques isolate a common set of spectral features (identifying features associated with the continuum and line emission). The ordering of the spectral components is technique dependent. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.decomposition import NMF from sklearn.decomposition import FastICA from sklearn.decomposition import RandomizedPCA from astroML.datasets import sdss_corrected_spectra from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Download data data = sdss_corrected_spectra.fetch_sdss_corrected_spectra() spectra = sdss_corrected_spectra.reconstruct_spectra(data) wavelengths = sdss_corrected_spectra.compute_wavelengths(data) #---------------------------------------------------------------------- # Compute PCA, ICA, and NMF components # we'll save the results so that they can be re-used @pickle_results('spec_decompositions.pkl') def compute_PCA_ICA_NMF(n_components=5): spec_mean = spectra.mean(0) # PCA: use randomized PCA for speed pca = RandomizedPCA(n_components - 1, random_state=0) pca.fit(spectra) pca_comp = np.vstack([spec_mean, pca.components_]) # ICA treats sequential observations as related. 
Because of this, we need # to fit with the transpose of the spectra ica = FastICA(n_components - 1, random_state=0) ica.fit(spectra.T) ica_comp = np.vstack([spec_mean, ica.transform(spectra.T).T]) # NMF requires all elements of the input to be greater than zero spectra[spectra < 0] = 0 nmf = NMF(n_components, random_state=0) nmf.fit(spectra) nmf_comp = nmf.components_ return pca_comp, ica_comp, nmf_comp n_components = 5 decompositions = compute_PCA_ICA_NMF(n_components) #---------------------------------------------------------------------- # Plot the results fig = plt.figure(figsize=(5, 4)) fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05, bottom=0.1, top=0.95, hspace=0.05) titles = ['PCA components', 'ICA components', 'NMF components'] for i, comp in enumerate(decompositions): for j in range(n_components): ax = fig.add_subplot(n_components, 3, 3 * j + 1 + i) ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.xaxis.set_major_locator(plt.MultipleLocator(1000)) if j < n_components - 1: ax.xaxis.set_major_formatter(plt.NullFormatter()) else: ax.xaxis.set_major_locator( plt.FixedLocator(list(range(3000, 7999, 1000)))) ax.set_xlabel(r'wavelength ${\rm (\AA)}$') ax.plot(wavelengths, comp[j], '-k', lw=1) # plot zero line xlim = [3000, 8000] ax.plot(xlim, [0, 0], '-', c='gray', lw=1) if j == 0: ax.set_title(titles[i]) if titles[i].startswith('PCA') or titles[i].startswith('ICA'): if j == 0: label = 'mean' else: label = 'component %i' % j else: label = 'component %i' % (j + 1) ax.text(0.03, 0.94, label, transform=ax.transAxes, ha='left', va='top') for l in ax.get_xticklines() + ax.get_yticklines(): l.set_markersize(2) # adjust y limits ylim = plt.ylim() dy = 0.05 * (ylim[1] - ylim[0]) ax.set_ylim(ylim[0] - dy, ylim[1] + 4 * dy) ax.set_xlim(xlim) plt.show() astroML-0.3/book_figures/chapter7/fig_spec_examples.py0000644000076500000240000000524512252721253023735 0ustar jakevdpstaff00000000000000""" SDSS spectra Examples --------------------- Figure 7.1 A sample of 15 galaxy spectra selected from the SDSS spectroscopic data set (see Section 1.5.5). These spectra span a range of galaxy types, from star-forming to passive galaxies. Each spectrum has been shifted to its rest frame and covers the wavelength interval 3000-8000 Angstroms. The specific fluxes, :math:`F_\lambda(\lambda)`, on the ordinate axes have an arbitrary scaling. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.decomposition import RandomizedPCA from astroML.datasets import sdss_corrected_spectra #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
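# The fetch call below downloads the spectral sample on first use and caches
# it locally (by default under ~/astroML_data, configurable via the
# ASTROML_DATA environment variable); later runs load from disk.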
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # Use pre-computed PCA to reconstruct spectra data = sdss_corrected_spectra.fetch_sdss_corrected_spectra() spectra = sdss_corrected_spectra.reconstruct_spectra(data) lam = sdss_corrected_spectra.compute_wavelengths(data) #------------------------------------------------------------ # select random spectra np.random.seed(5) nrows = 5 ncols = 3 ind = np.random.randint(spectra.shape[0], size=nrows * ncols) spec_sample = spectra[ind] #---------------------------------------------------------------------- # Plot the results fig = plt.figure(figsize=(5, 4)) fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05, bottom=0.1, top=0.95, hspace=0.05) for i in range(ncols): for j in range(nrows): ax = fig.add_subplot(nrows, ncols, ncols * j + 1 + i) ax.plot(lam, spec_sample[ncols * j + i], '-k', lw=1) ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.xaxis.set_major_locator(plt.MultipleLocator(1000)) if j < nrows - 1: ax.xaxis.set_major_formatter(plt.NullFormatter()) else: plt.xlabel(r'wavelength $(\AA)$') ax.set_xlim(3000, 7999) ylim = ax.get_ylim() dy = 0.05 * (ylim[1] - ylim[0]) ax.set_ylim(ylim[0] - dy, ylim[1] + dy) plt.show() astroML-0.3/book_figures/chapter7/fig_spec_reconstruction.py0000644000076500000240000000652212252721253025177 0ustar jakevdpstaff00000000000000""" PCA Reconstruction of a spectrum -------------------------------- Figure 7.6 The reconstruction of a particular spectrum from its eigenvectors. The input spectrum is shown in gray, and the partial reconstruction for progressively more terms is shown in black. The top panel shows only the mean of the set of spectra. By the time 20 PCA components are added, the reconstruction is very close to the input, as indicated by the expected total variance of 94%. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.decomposition import PCA from astroML.datasets import sdss_corrected_spectra from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
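# The panels below implement the truncated PCA expansion
#
#   spec ~= mu + sum_i c_i * e_i,   with   c_i = dot(spec - mu, e_i),
#
# where mu is the mean spectrum and e_i are the eigenspectra; n = 0, 4, 8,
# and 20 terms are shown, each adding successively finer spectral detail.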
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Download data data = sdss_corrected_spectra.fetch_sdss_corrected_spectra() spectra = sdss_corrected_spectra.reconstruct_spectra(data) wavelengths = sdss_corrected_spectra.compute_wavelengths(data) #------------------------------------------------------------ # Compute PCA components # Eigenvalues can be computed using PCA as in the commented code below: #from sklearn.decomposition import PCA #pca = PCA() #pca.fit(spectra) #evals = pca.explained_variance_ratio_ #evals_cs = evals.cumsum() # because the spectra have been reconstructed from masked values, this # is not exactly correct in this case: we'll use the values computed # in the file compute_sdss_pca.py evals = data['evals'] ** 2 evals_cs = evals.cumsum() evals_cs /= evals_cs[-1] evecs = data['evecs'] spec_mean = spectra.mean(0) #------------------------------------------------------------ # Find the coefficients of a particular spectrum spec = spectra[1] coeff = np.dot(evecs, spec - spec_mean) #------------------------------------------------------------ # Plot the sequence of reconstructions fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(hspace=0, top=0.95, bottom=0.1, left=0.12, right=0.93) for i, n in enumerate([0, 4, 8, 20]): ax = fig.add_subplot(411 + i) ax.plot(wavelengths, spec, '-', c='gray') ax.plot(wavelengths, spec_mean + np.dot(coeff[:n], evecs[:n]), '-k') if i < 3: ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylim(-2, 21) ax.set_ylabel('flux') if n == 0: text = "mean" elif n == 1: text = "mean + 1 component\n" text += r"$(\sigma^2_{tot} = %.2f)$" % evals_cs[n - 1] else: text = "mean + %i components\n" % n text += r"$(\sigma^2_{tot} = %.2f)$" % evals_cs[n - 1] ax.text(0.02, 0.93, text, ha='left', va='top', transform=ax.transAxes) fig.axes[-1].set_xlabel(r'${\rm wavelength\ (\AA)}$') plt.show() astroML-0.3/book_figures/chapter7/fig_svd_visual.py0000644000076500000240000000750112252721253023261 0ustar jakevdpstaff00000000000000""" Plot a visual representation of an SVD -------------------------------------- Figure 7.3 Singular value decomposition (SVD) can factorize an N x K matrix into :math:`U \Sigma V^T`. There are different conventions for computing the SVD in the literature, and this figure illustrates the convention used in this text. The matrix of singular values :math:`\Sigma` is always a square matrix of size [R x R] where R = min(N, K). The shape of the resulting U and V matrices depends on whether N or K is larger. The columns of the matrix U are called the left-singular vectors, and the columns of the matrix V are called the right-singular vectors. The columns are orthonormal bases, and satisfy :math:`U^T U = V^T V = I`. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Rectangle #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. 
In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) # Define a function to create a rectangle def labeled_rect(ax, center, width, height, text, stripe='vert', N=7, color='#CCCCCC'): left = center[0] - 0.5 * width bottom = center[1] - 0.5 * height ax.add_patch(Rectangle((left, bottom), width, height, fill=True, color=color, ec='k')) ax.text(center[0], center[1], text, fontsize=14, ha='center', va='center', bbox=dict(ec=color, fc=color)) if stripe == 'vert': xlocs = np.linspace(center[0] - 0.5 * width, center[0] + 0.5 * width, N + 2)[1:-1] for x in xlocs: plt.plot([x, x], [center[1] - 0.5 * height, center[1] + 0.5 * height], '-k') elif stripe == 'horiz': ylocs = np.linspace(center[1] - 0.5 * height, center[1] + 0.5 * height, N + 2)[1:-1] for y in ylocs: plt.plot([center[0] - 0.5 * width, center[0] + 0.5 * width], [y, y], '-k') elif stripe == 'diag': plt.plot([center[0] - 0.5 * width, center[0] + 0.5 * width], [center[1] + 0.5 * height, center[1] - 0.5 * height], '-k') else: raise ValueError("unrecognized stripe type") #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0, bottom=0, right=1, top=1) ax = fig.add_subplot(111, xticks=[], yticks=[], frameon=False) labeled_rect(ax, (0.3, 0.75), 0.5, 0.25, '$X_1$', 'horiz') labeled_rect(ax, (0.975, 0.75), 0.25, 0.25, '$U_1$', 'vert') labeled_rect(ax, (1.275, 0.75), 0.25, 0.25, r'$\Sigma_1$', 'diag') labeled_rect(ax, (1.7, 0.75), 0.5, 0.25, r'$V_1^T$', 'horiz') labeled_rect(ax, (0.3, 0.3), 0.25, 0.5, r'$X_2$', 'horiz', N=15) labeled_rect(ax, (0.975, 0.3), 0.25, 0.5, r'$U_2$', 'vert') labeled_rect(ax, (1.275, 0.3), 0.25, 0.25, r'$\Sigma_2$', 'diag') labeled_rect(ax, (1.575, 0.3), 0.25, 0.25, r'$V_2^T$', 'horiz') ax.text(0.7, 0.75, '$=$', fontsize=14, ha='center', va='center') ax.text(0.7, 0.3, '$=$', fontsize=14, ha='center', va='center') ax.set_xlim(0, 2) ax.set_ylim(0, 1) plt.show() astroML-0.3/book_figures/chapter7/README.rst0000644000076500000240000000034212115147567021376 0ustar jakevdpstaff00000000000000Chapter 7: Dimensionality and its Reduction ------------------------------------------- This chapter covers dimensionality reduction, from linear methods such as PCA, NMF, and ICA, to nonlinear methods such as LLE and Isomap. astroML-0.3/book_figures/chapter8/0000755000076500000240000000000012462244012017675 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter8/fig_cross_val_A.py0000644000076500000240000000417112252721253023336 0ustar jakevdpstaff00000000000000""" Cross Validation Examples: Part 1 --------------------------------- Figure 8.12 Our toy data set described by eq. 8.75. Shown is the line of best fit, which quite clearly underfits the data. In other words, a linear model in this case has high bias. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib import ticker from matplotlib.patches import FancyArrow #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. 
# Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define our functional form def func(x, dy=0.1): return np.random.normal(np.sin(x) * x, dy) #------------------------------------------------------------ # select the (noisy) data np.random.seed(0) x = np.linspace(0, 3, 22)[1:-1] dy = 0.1 y = func(x, dy) #------------------------------------------------------------ # Select the cross-validation points np.random.seed(1) x_cv = 3 * np.random.random(20) y_cv = func(x_cv) x_fit = np.linspace(0, 3, 1000) #------------------------------------------------------------ # First figure: plot points with a linear fit fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) ax.scatter(x, y, marker='x', c='k', s=30) p = np.polyfit(x, y, 1) y_fit = np.polyval(p, x_fit) ax.text(0.03, 0.96, "d = 1", transform=plt.gca().transAxes, ha='left', va='top', bbox=dict(ec='k', fc='w', pad=10)) ax.plot(x_fit, y_fit, '-b') ax.set_xlabel('$x$') ax.set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter8/fig_cross_val_B.py0000644000076500000240000000512112252721253023333 0ustar jakevdpstaff00000000000000""" Cross Validation Examples: part 2 --------------------------------- Figure 8.13 Three models of increasing complexity applied to our toy dataset (eq. 8.75). The d = 2 model, like the linear model in figure 8.12, suffers from high bias, and underfits the data. The d = 19 model suffers from high variance, and overfits the data. The d = 3 model is a good compromise between these extremes. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib import ticker from matplotlib.patches import FancyArrow #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
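# Each panel below reduces to a least-squares polynomial fit; the pattern
# used throughout these cross-validation figures is simply:
#
#   p = np.polyfit(x, y, d)        # fit a degree-d polynomial to the data
#   y_fit = np.polyval(p, x_fit)   # evaluate it on a fine grid for plotting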
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define our functional form def func(x, dy=0.1): return np.random.normal(np.sin(x) * x, dy) #------------------------------------------------------------ # select the (noisy) data np.random.seed(0) x = np.linspace(0, 3, 22)[1:-1] dy = 0.1 y = func(x, dy) #------------------------------------------------------------ # Select the cross-validation points np.random.seed(1) x_cv = 3 * np.random.random(20) y_cv = func(x_cv) x_fit = np.linspace(0, 3, 1000) #------------------------------------------------------------ # Second figure: plot fit for several orders of polynomial fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(wspace=0.03, bottom=0.15, top=0.95, left=0.07, right=0.97) for i, d in enumerate([2, 3, 19]): ax = fig.add_subplot(131 + i) ax.scatter(x, y, marker='x', c='k', s=30) p = np.polyfit(x, y, d) y_fit = np.polyval(p, x_fit) ax.plot(x_fit, y_fit, '-b') ax.set_ylim(-0.1, 2.1) ax.set_xlim(-0.2, 3.2) if i in (1, 2): ax.yaxis.set_major_formatter(plt.NullFormatter()) else: ax.set_ylabel('$y$') ax.set_xlabel('$x$') ax.text(0.08, 0.94, "d = %i" % d, transform=ax.transAxes, ha='left', va='top', bbox=dict(ec='k', fc='w', pad=10)) plt.show() astroML-0.3/book_figures/chapter8/fig_cross_val_C.py0000644000076500000240000000623312252721253023341 0ustar jakevdpstaff00000000000000""" Cross Validation Examples ------------------------- Figure 8.14 The top panel shows the root-mean-square (rms) training error and validation error for our toy model (eq. 8.75) as a function of the polynomial degree d. The horizontal dotted line indicates the level of intrinsic scatter in the data. Models with polynomial degree from 3 to 5 minimize the validation rms error. The bottom panel shows the Bayesian information criterion (BIC) for the training and cross-validation subsamples. According to the BIC, a degree-3 polynomial gives the best fit to this data set. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib import ticker from matplotlib.patches import FancyArrow #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
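# The error curves below are root-mean-square residuals of a degree-d fit,
#
#   rms = sqrt(sum_i (p_d(x_i) - y_i)**2 / N),
#
# computed separately on the training points and on an independent
# cross-validation set; the BIC panel adds a d * log(N) complexity penalty
# to a goodness-of-fit term built from the same rms error (see the
# BIC_train and BIC_crossval lines below).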
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define our functional form def func(x, dy=0.1): return np.random.normal(np.sin(x) * x, dy) #------------------------------------------------------------ # select the (noisy) data np.random.seed(0) x = np.linspace(0, 3, 22)[1:-1] dy = 0.1 y = func(x, dy) #------------------------------------------------------------ # Select the cross-validation points np.random.seed(1) x_cv = 3 * np.random.random(20) y_cv = func(x_cv) x_fit = np.linspace(0, 3, 1000) #------------------------------------------------------------ # Third figure: plot errors as a function of polynomial degree d d = np.arange(0, 21) training_err = np.zeros(d.shape) crossval_err = np.zeros(d.shape) fig = plt.figure(figsize=(5, 5)) for i in range(len(d)): p = np.polyfit(x, y, d[i]) training_err[i] = np.sqrt(np.sum((np.polyval(p, x) - y) ** 2) / len(y)) crossval_err[i] = np.sqrt(np.sum((np.polyval(p, x_cv) - y_cv) ** 2) / len(y_cv)) BIC_train = np.sqrt(len(y)) * training_err / dy + d * np.log(len(y)) BIC_crossval = np.sqrt(len(y)) * crossval_err / dy + d * np.log(len(y)) ax = fig.add_subplot(211) ax.plot(d, crossval_err, '--k', label='cross-validation') ax.plot(d, training_err, '-k', label='training') ax.plot(d, 0.1 * np.ones(d.shape), ':k') ax.set_xlim(0, 14) ax.set_ylim(0, 0.8) ax.set_xlabel('polynomial degree') ax.set_ylabel('rms error') ax.legend(loc=2) ax = fig.add_subplot(212) ax.plot(d, BIC_crossval, '--k', label='cross-validation') ax.plot(d, BIC_train, '-k', label='training') ax.set_xlim(0, 14) ax.set_ylim(0, 100) ax.legend(loc=2) ax.set_xlabel('polynomial degree') ax.set_ylabel('BIC') plt.show() astroML-0.3/book_figures/chapter8/fig_cross_val_D.py0000644000076500000240000000612112252721253023336 0ustar jakevdpstaff00000000000000""" Cross Validation Examples: Part 4 --------------------------------- Figure 8.15 The learning curves for the data given by eq. 8.75, with d = 2 and d = 3. Both models have high variance for a few data points, visible in the spread between training and validation error. As the number of points increases, it is clear that d = 2 is a high-bias model which cannot be improved simply by adding training points. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib import ticker from matplotlib.patches import FancyArrow #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
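# A learning curve repeats the fit for a growing training-set size N,
# schematically:
#
#   for N in Nrange:
#       p = np.polyfit(x[:N], y[:N], d)
#       # rms of p on (x[:N], y[:N])  -> training error
#       # rms of p on (x_cv, y_cv)    -> cross-validation error
#
# two curves that converge to a plateau above the noise level indicate a
# high-bias model that more data alone cannot improve.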
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define our functional form def func(x, dy=0.1): return np.random.normal(np.sin(x) * x, dy) #------------------------------------------------------------ # select the (noisy) data np.random.seed(0) x = np.linspace(0, 3, 22)[1:-1] dy = 0.1 y = func(x, dy) #------------------------------------------------------------ # Select the cross-validation points np.random.seed(1) x_cv = 3 * np.random.random(20) y_cv = func(x_cv) x_fit = np.linspace(0, 3, 1000) #------------------------------------------------------------ # Fourth figure: plot errors as a function of training set size np.random.seed(0) x = 3 * np.random.random(100) y = func(x) np.random.seed(1) x_cv = 3 * np.random.random(100) y_cv = func(x_cv) Nrange = np.arange(10, 101, 2) fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.15, top=0.95) for subplot, d in zip([211, 212], [2, 3]): ax = fig.add_subplot(subplot) training_err = np.zeros(Nrange.shape) crossval_err = np.zeros(Nrange.shape) for j, N in enumerate(Nrange): p = np.polyfit(x[:N], y[:N], d) training_err[j] = np.sqrt(np.sum((np.polyval(p, x[:N]) - y[:N]) ** 2) / len(y)) crossval_err[j] = np.sqrt(np.sum((np.polyval(p, x_cv) - y_cv) ** 2) / len(y_cv)) ax.plot(Nrange, crossval_err, '--k', label='cross-validation') ax.plot(Nrange, training_err, '-k', label='training') ax.plot(Nrange, 0.1 * np.ones(Nrange.shape), ':k') ax.legend(loc=1) ax.text(0.03, 0.94, "d = %i" % d, transform=ax.transAxes, ha='left', va='top', bbox=dict(ec='k', fc='w', pad=10)) ax.set_ylim(0, 0.4) ax.set_xlabel('Number of training points') ax.set_ylabel('rms error') plt.show() astroML-0.3/book_figures/chapter8/fig_gp_example.py0000644000076500000240000001102412420767763023234 0ustar jakevdpstaff00000000000000""" Gaussian Process Example ------------------------ Figure 8.10 An example of Gaussian process regression. The upper-left panel shows three functions drawn from an unconstrained Gaussian process with squared-exponential covariance of bandwidth h = 1.0. The upper-right panel adds two constraints, and shows the 2-sigma contours of the constrained function space. The lower-left panel shows the function space constrained by the points with error bars. The lower-right panel shows the function space constrained by 20 noisy points drawn from f(x) = cos(x). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from sklearn.gaussian_process import GaussianProcess #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # define a squared exponential covariance function def squared_exponential(x1, x2, h): return np.exp(-0.5 * (x1 - x2) ** 2 / h ** 2) #------------------------------------------------------------ # draw samples from the unconstrained covariance np.random.seed(1) x = np.linspace(0, 10, 100) h = 1.0 mu = np.zeros(len(x)) C = squared_exponential(x, x[:, None], h) draws = np.random.multivariate_normal(mu, C, 3) #------------------------------------------------------------ # Constrain the mean and covariance with two points x1 = np.array([2.5, 7]) y1 = np.cos(x1) gp1 = GaussianProcess(corr='squared_exponential', theta0=0.5, random_state=0) gp1.fit(x1[:, None], y1) f1, MSE1 = gp1.predict(x[:, None], eval_MSE=True) f1_err = np.sqrt(MSE1) #------------------------------------------------------------ # Constrain the mean and covariance with two noisy points # scikit-learn gaussian process uses nomenclature from the geophysics # community, where a "nugget" can be specified. The diagonal of the # assumed covariance matrix is multiplied by the nugget. This is # how the error on inputs is incorporated into the calculation dy2 = 0.2 gp2 = GaussianProcess(corr='squared_exponential', theta0=0.5, nugget=(dy2 / y1) ** 2, random_state=0) gp2.fit(x1[:, None], y1) f2, MSE2 = gp2.predict(x[:, None], eval_MSE=True) f2_err = np.sqrt(MSE2) #------------------------------------------------------------ # Constrain the mean and covariance with many noisy points x3 = np.linspace(0, 10, 20) y3 = np.cos(x3) dy3 = 0.2 y3 = np.random.normal(y3, dy3) gp3 = GaussianProcess(corr='squared_exponential', theta0=0.5, thetaL=0.01, thetaU=10.0, nugget=(dy3 / y3) ** 2, random_state=0) gp3.fit(x3[:, None], y3) f3, MSE3 = gp3.predict(x[:, None], eval_MSE=True) f3_err = np.sqrt(MSE3) # we have fit for the `h` parameter: print the result here: print("best-fit theta =", gp3.theta_[0, 0]) #------------------------------------------------------------ # Plot the diagrams fig = plt.figure(figsize=(5, 5)) # first: plot a selection of unconstrained functions ax = fig.add_subplot(221) ax.plot(x, draws.T, '-k') ax.set_ylabel('$f(x)$') # second: plot a constrained function ax = fig.add_subplot(222) ax.plot(x, f1, '-', color='gray') ax.fill_between(x, f1 - 2 * f1_err, f1 + 2 * f1_err, color='gray', alpha=0.3) ax.plot(x1, y1, '.k', ms=6) # third: plot a constrained function with errors ax = fig.add_subplot(223) ax.plot(x, f2, '-', color='gray') ax.fill_between(x, f2 - 2 * f2_err, f2 + 2 * f2_err, color='gray', alpha=0.3) ax.errorbar(x1, y1, dy2, fmt='.k', ms=6) ax.set_xlabel('$x$') ax.set_ylabel('$f(x)$') # third: plot a more constrained function with errors ax = fig.add_subplot(224) ax.plot(x, f3, '-', color='gray') ax.fill_between(x, f3 - 2 * f3_err, f3 + 2 * f3_err, color='gray', alpha=0.3) ax.errorbar(x3, y3, dy3, fmt='.k', ms=6) ax.plot(x, np.cos(x), ':k') ax.set_xlabel('$x$') for ax in fig.axes: ax.set_xlim(0, 10) plt.show() astroML-0.3/book_figures/chapter8/fig_gp_mu_z.py0000644000076500000240000000535612420767763022566 0ustar jakevdpstaff00000000000000""" Cosmology Regression Example ---------------------------- Figure 8.11 A Gaussian process regression analysis of the simulated supernova sample used in figure 8.2. This uses a squared-exponential covariance model, with bandwidth learned through cross-validation. 
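The covariance model is the squared exponential used elsewhere in this chapter, Cov(x1, x2) = exp(-(x1 - x2)^2 / (2 h^2)); passing thetaL and thetaU to ``GaussianProcess`` below leaves the bandwidth free to be optimized during the fit rather than fixed in advance, while the measurement errors enter through the nugget term.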
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from sklearn.gaussian_process import GaussianProcess from astroML.cosmology import Cosmology from astroML.datasets import generate_mu_z #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate data z_sample, mu_sample, dmu = generate_mu_z(100, random_state=0) cosmo = Cosmology() z = np.linspace(0.01, 2, 1000) mu_true = np.asarray([cosmo.mu(zi) for zi in z]) #------------------------------------------------------------ # fit the data # Mesh the input space for evaluations of the real function, # the prediction and its MSE z_fit = np.linspace(0, 2, 1000) gp = GaussianProcess(corr='squared_exponential', theta0=1e-1, thetaL=1e-2, thetaU=1, normalize=False, nugget=(dmu / mu_sample) ** 2, random_start=1) gp.fit(z_sample[:, None], mu_sample) y_pred, MSE = gp.predict(z_fit[:, None], eval_MSE=True) sigma = np.sqrt(MSE) print("theta:", gp.theta_) #------------------------------------------------------------ # Plot the gaussian process # gaussian process allows computation of the error at each point # so we will show this as a shaded region fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95) ax = fig.add_subplot(111) ax.plot(z, mu_true, '--k') ax.errorbar(z_sample, mu_sample, dmu, fmt='.k', ecolor='gray', markersize=6) ax.plot(z_fit, y_pred, '-k') ax.fill_between(z_fit, y_pred - 1.96 * sigma, y_pred + 1.96 * sigma, alpha=0.2, color='b', label='95% confidence interval') ax.set_xlabel('$z$') ax.set_ylabel(r'$\mu$') ax.set_xlim(0, 2) ax.set_ylim(36, 48) plt.show() astroML-0.3/book_figures/chapter8/fig_huber_func.py0000644000076500000240000000317512252721253023226 0ustar jakevdpstaff00000000000000""" Huber Loss Function ------------------- Figure 8.7 The Huber loss function for various values of c. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the Huber loss def Phi(t, c): t = abs(t) flag = (t > c) return (~flag) * (0.5 * t ** 2) - (flag) * c * (0.5 * c - t) #------------------------------------------------------------ # Plot for several values of c fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) x = np.linspace(-10, 10, 100) for c in (1, 2, 3, 5, 1000): y = Phi(x, c) ax.plot(x, y, '-k') if c > 10: s = r'\infty' else: s = str(c) ax.text(x[6], y[6], '$c=%s$' % s, ha='center', va='center', bbox=dict(boxstyle='round', ec='k', fc='w')) ax.set_xlabel('$t$') ax.set_ylabel(r'$\Phi(t)$') plt.show() astroML-0.3/book_figures/chapter8/fig_huber_loss.py0000644000076500000240000000556512420767763023275 0ustar jakevdpstaff00000000000000""" Huber Loss Function ------------------- Figure 8.8 An example of fitting a simple linear model to data which includes outliers (data is from table 1 of Hogg et al 2010). A comparison of linear regression using the squared-loss function (equivalent to ordinary least-squares regression) and the Huber loss function, with c = 1 (i.e., beyond 1 standard deviation, the loss becomes linear). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from scipy import optimize from astroML.datasets import fetch_hogg2010test #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Get data: this includes outliers data = fetch_hogg2010test() x = data['x'] y = data['y'] dy = data['sigma_y'] # Define the standard squared-loss function def squared_loss(m, b, x, y, dy): y_fit = m * x + b return np.sum(((y - y_fit) / dy) ** 2, -1) # Define the log-likelihood via the Huber loss function def huber_loss(m, b, x, y, dy, c=2): y_fit = m * x + b t = abs((y - y_fit) / dy) flag = t > c return np.sum((~flag) * (0.5 * t ** 2) - (flag) * c * (0.5 * c - t), -1) f_squared = lambda beta: squared_loss(beta[0], beta[1], x=x, y=y, dy=dy) f_huber = lambda beta: huber_loss(beta[0], beta[1], x=x, y=y, dy=dy, c=1) #------------------------------------------------------------ # compute the maximum likelihood using the huber loss beta0 = (2, 30) beta_squared = optimize.fmin(f_squared, beta0) beta_huber = optimize.fmin(f_huber, beta0) print(beta_squared) print(beta_huber) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) ax = fig.add_subplot(111) x_fit = np.linspace(0, 350, 10) ax.plot(x_fit, beta_squared[0] * x_fit + beta_squared[1], '--k', label="squared loss:\n $y=%.2fx + %.1f$" % tuple(beta_squared)) ax.plot(x_fit, beta_huber[0] * x_fit + beta_huber[1], '-k', label="Huber loss:\n $y=%.2fx + %.1f$" % tuple(beta_huber)) ax.legend(loc=4) ax.errorbar(x, y, dy, fmt='.k', lw=1, ecolor='gray') ax.set_xlim(0, 350) ax.set_ylim(100, 700) ax.set_xlabel('$x$') ax.set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter8/fig_lasso_ridge.py0000644000076500000240000001031012252721253023366 0ustar jakevdpstaff00000000000000""" Ridge and Lasso: Geometric Interpretation ----------------------------------------- Figure 8.3 A geometric interpretation of regularization. The right panel shows L1 regularization (LASSO regression) and the left panel L2 regularization (ridge regularization). The ellipses indicate the posterior distribution for no prior or regularization. The solid lines show the constraints due to regularization (limiting theta^2 for ridge regression and abs(theta) for LASSO regression). The corners of the L1 regularization create more opportunities for the solution to have zeros for some of the weights. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Ellipse, Circle, RegularPolygon #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
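# The constraint geometry sketched in this figure corresponds to the
# penalized regressions available in scikit-learn; a minimal sketch, with
# illustrative names (X is a design matrix, y a target vector), where alpha
# sets the penalty strength (larger alpha means a smaller constraint
# radius r):
#
#   from sklearn.linear_model import Lasso, Ridge
#   ridge = Ridge(alpha=1.0).fit(X, y)   # L2: shrinks weights smoothly
#   lasso = Lasso(alpha=1.0).fit(X, y)   # L1: drives some weights to zero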
from astroML.plotting import setup_text_plots #------------------------------------------------------------ # Set up figure fig = plt.figure(figsize=(5, 2.5), facecolor='w') #------------------------------------------------------------ # plot ridge diagram ax = fig.add_axes([0, 0, 0.5, 1], frameon=False, xticks=[], yticks=[]) # plot the axes ax.arrow(-1, 0, 9, 0, head_width=0.1, fc='k') ax.arrow(0, -1, 0, 9, head_width=0.1, fc='k') # plot the ellipses and circles for i in range(3): ax.add_patch(Ellipse((3, 5), 3.5 * np.sqrt(2 * i + 1), 1.7 * np.sqrt(2 * i + 1), -15, fc='none')) ax.add_patch(Circle((0, 0), 3.815, fc='none')) # plot arrows ax.arrow(0, 0, 1.46, 3.52, head_width=0.2, fc='k', length_includes_head=True) ax.arrow(0, 0, 3, 5, head_width=0.2, fc='k', length_includes_head=True) ax.arrow(0, -0.2, 3.81, 0, head_width=0.1, fc='k', length_includes_head=True) ax.arrow(3.81, -0.2, -3.81, 0, head_width=0.1, fc='k', length_includes_head=True) # annotate with text ax.text(7.5, -0.1, r'$\theta_1$', va='top') ax.text(-0.1, 7.5, r'$\theta_2$', ha='right') ax.text(3, 5 + 0.2, r'$\rm \theta_{normal\ equation}$', ha='center', bbox=dict(boxstyle='round', ec='k', fc='w')) ax.text(1.46, 3.52 + 0.2, r'$\rm \theta_{ridge}$', ha='center', bbox=dict(boxstyle='round', ec='k', fc='w')) ax.text(1.9, -0.3, r'$r$', ha='center', va='top') ax.set_xlim(-2, 9) ax.set_ylim(-2, 9) #------------------------------------------------------------ # plot lasso diagram ax = fig.add_axes([0.5, 0, 0.5, 1], frameon=False, xticks=[], yticks=[]) # plot axes ax.arrow(-1, 0, 9, 0, head_width=0.1, fc='k') ax.arrow(0, -1, 0, 9, head_width=0.1, fc='k') # plot ellipses and circles for i in range(3): ax.add_patch(Ellipse((3, 5), 3.5 * np.sqrt(2 * i + 1), 1.7 * np.sqrt(2 * i + 1), -15, fc='none')) # this is producing some weird results on save #ax.add_patch(RegularPolygon((0, 0), 4, 4.4, np.pi, fc='none')) ax.plot([-4.4, 0, 4.4, 0, -4.4], [0, 4.4, 0, -4.4, 0], '-k') # plot arrows ax.arrow(0, 0, 0, 4.4, head_width=0.2, fc='k', length_includes_head=True) ax.arrow(0, 0, 3, 5, head_width=0.2, fc='k', length_includes_head=True) ax.arrow(0, -0.2, 4.2, 0, head_width=0.1, fc='k', length_includes_head=True) ax.arrow(4.2, -0.2, -4.2, 0, head_width=0.1, fc='k', length_includes_head=True) # annotate plot ax.text(7.5, -0.1, r'$\theta_1$', va='top') ax.text(-0.1, 7.5, r'$\theta_2$', ha='right') ax.text(3, 5 + 0.2, r'$\rm \theta_{normal\ equation}$', ha='center', bbox=dict(boxstyle='round', ec='k', fc='w')) ax.text(0, 4.4 + 0.2, r'$\rm \theta_{lasso}$', ha='center', bbox=dict(boxstyle='round', ec='k', fc='w')) ax.text(2, -0.3, r'$r$', ha='center', va='top') ax.set_xlim(-2, 9) ax.set_ylim(-2, 9) plt.show() astroML-0.3/book_figures/chapter8/fig_linreg_inline.py0000644000076500000240000001062512252721253023722 0ustar jakevdpstaff00000000000000""" Inline Bayesian Linear Regression --------------------------------- Figure 8.1 An example showing the online nature of Bayesian regression. The upper panel shows the four points used in regression, drawn from the line y = theta_1 x + theta_2 with theta_1 = 1 and theta_2 = 0. The lower panel shows the posterior pdf in the (theta_1, theta_2) plane as each point is added in sequence. For clarity, the implied dark regions for sigma > 3 have been removed. The fourth point is an upper-limit measurement of y, and the resulting posterior cuts off half the parameter space. 
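Because the measurement errors are Gaussian and independent, the sequential update amounts to summing per-point log-likelihoods over a grid of (theta_1, theta_2), with logL proportional to -sum_i ((theta_1 x_i + theta_2 - y_i) / sigma_i)^2; the limit at x_4 enters by setting the likelihood to zero wherever the model line lies above the measured bound.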
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.plotting.mcmc import convert_to_stdev #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up the data and errors np.random.seed(13) a = 1 b = 0 x = np.array([-1, 0.44, -0.16]) y = a * x + b dy = np.array([0.25, 0.22, 0.2]) y = np.random.normal(y, dy) # add a fourth point which is a lower bound x4 = 1.0 y4 = a * x4 + b + 0.2 #------------------------------------------------------------ # Compute the likelihoods for each point a_range = np.linspace(0, 2, 80) b_range = np.linspace(-1, 1, 80) logL = -((a_range[:, None, None] * x + b_range[None, :, None] - y) / dy) ** 2 sigma = [convert_to_stdev(logL[:, :, i]) for i in range(3)] # compute best-fit from first three points logL_together = logL.sum(-1) i, j = np.where(logL_together == np.max(logL_together)) amax = a_range[i[0]] bmax = b_range[j[0]] #------------------------------------------------------------ # Plot the first figure: the points and errorbars fig1 = plt.figure(figsize=(5, 3.75)) ax1 = fig1.add_subplot(111) # Draw the true and best-fit lines xfit = np.array([-1.5, 1.5]) ax1.plot(xfit, a * xfit + b, ':k', label='True fit') ax1.plot(xfit, amax * xfit + bmax, '--k', label='fit to $\{x_1, x_2, x_3\}$') ax1.legend(loc=2) ax1.errorbar(x, y, dy, fmt='ok') ax1.errorbar([x4], [y4], [[0.5], [0]], fmt='_k', lolims=True) for i in range(3): ax1.text(x[i] + 0.05, y[i] - 0.3, "$x_{%i}$" % (i + 1)) ax1.text(x4 + 0.05, y4 - 0.5, "$x_4$") ax1.set_xlabel('$x$') ax1.set_ylabel('$y$') ax1.set_xlim(-1.5, 1.5) ax1.set_ylim(-2, 2) #------------------------------------------------------------ # Plot the second figure: likelihoods for each point fig2 = plt.figure(figsize=(5, 5)) fig2.subplots_adjust(hspace=0.05, wspace=0.05) # plot likelihood contours for i in range(4): ax = fig2.add_subplot(221 + i) for j in range(min(i + 1, 3)): ax.contourf(a_range, b_range, sigma[j].T, levels=(0, 0.683, 0.955, 0.997), cmap=plt.cm.binary, alpha=0.5) # plot the excluded area from the fourth point axpb = a_range[:, None] * x4 + b_range[None, :] mask = y4 < axpb fig2.axes[3].fill_between(a_range, y4 - x4 * a_range, 2, color='k', alpha=0.5) # plot ellipses for i in range(1, 4): ax = fig2.axes[i] logL_together = logL[:, :, :i + 1].sum(-1) if i == 3: logL_together[mask] = -np.inf sigma_together = convert_to_stdev(logL_together) ax.contour(a_range, b_range, sigma_together.T, levels=(0.683, 0.955, 0.997), colors='k') # Label and adjust axes for i in range(4): ax = fig2.axes[i] ax.text(1.98, -0.98, "$x_{%i}$" % (i + 1), ha='right', va='bottom') ax.plot([0, 2], [0, 0], ':k', lw=1) ax.plot([1, 1], [-1, 1], ':k', lw=1) ax.set_xlim(0.001, 2) ax.set_ylim(-0.999, 1) if i in (1, 3): ax.yaxis.set_major_formatter(plt.NullFormatter()) if i in (0, 1): 
ax.xaxis.set_major_formatter(plt.NullFormatter()) if i in (0, 2): ax.set_ylabel(r'$\theta_2$') if i in (2, 3): ax.set_xlabel(r'$\theta_1$') plt.show() astroML-0.3/book_figures/chapter8/fig_nonlinear_mu_z.py0000644000076500000240000000742312420767763024142 0ustar jakevdpstaff00000000000000""" Nonlinear cosmology fit to mu vs z ---------------------------------- Figure 8.5 Cosmology fit to the standard cosmological integral. Errors in mu are a factor of ten smaller than for the sample used in figure 8.2. Contours are 1-sigma, 2-sigma, and 3-sigma for the posterior (uniform prior in :math:`\Omega_M` and :math:`\Omega_\Lambda`). The dashed line shows flat cosmology. The dotted lines show the input values. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function, division import numpy as np from matplotlib import pyplot as plt from astroML.datasets import generate_mu_z from astroML.cosmology import Cosmology from astroML.plotting.mcmc import convert_to_stdev from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate the data z_sample, mu_sample, dmu = generate_mu_z(100, z0=0.3, dmu_0=0.05, dmu_1=0.004, random_state=1) #------------------------------------------------------------ # define a log likelihood in terms of the parameters # beta = [omegaM, omegaL] def compute_logL(beta): cosmo = Cosmology(omegaM=beta[0], omegaL=beta[1]) mu_pred = np.array([cosmo.mu(z) for z in z_sample]) return - np.sum(0.5 * ((mu_sample - mu_pred) / dmu) ** 2) #------------------------------------------------------------ # Define a function to compute (and save to file) the log-likelihood @pickle_results('mu_z_nonlinear.pkl') def compute_mu_z_nonlinear(Nbins=50): omegaM = np.linspace(0.05, 0.75, Nbins) omegaL = np.linspace(0.4, 1.1, Nbins) logL = np.empty((Nbins, Nbins)) for i in range(len(omegaM)): for j in range(len(omegaL)): logL[i, j] = compute_logL([omegaM[i], omegaL[j]]) return omegaM, omegaL, logL omegaM, omegaL, res = compute_mu_z_nonlinear() res -= np.max(res) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.25, bottom=0.15, top=0.9) # left plot: the data and best-fit ax = fig.add_subplot(121) whr = np.where(res == np.max(res)) omegaM_best = omegaM[whr[0][0]] omegaL_best = omegaL[whr[1][0]] cosmo = Cosmology(omegaM=omegaM_best, omegaL=omegaL_best) z_fit = np.linspace(0.04, 2, 100) mu_fit = np.asarray([cosmo.mu(z) for z in z_fit]) ax.plot(z_fit, mu_fit, '-k') ax.errorbar(z_sample, mu_sample, dmu, fmt='.k', ecolor='gray') ax.set_xlim(0, 1.8) ax.set_ylim(36, 46) ax.set_xlabel('$z$') ax.set_ylabel(r'$\mu$') ax.text(0.04, 0.96, "%i observations" % len(z_sample), ha='left', va='top', transform=ax.transAxes) # 
right plot: the likelihood ax = fig.add_subplot(122) ax.contour(omegaM, omegaL, convert_to_stdev(res.T), levels=(0.683, 0.955, 0.997), colors='k') ax.plot([0, 1], [1, 0], '--k') ax.plot([0, 1], [0.73, 0.73], ':k') ax.plot([0.27, 0.27], [0, 2], ':k') ax.set_xlim(0.05, 0.75) ax.set_ylim(0.4, 1.1) ax.set_xlabel(r'$\Omega_M$') ax.set_ylabel(r'$\Omega_\Lambda$') plt.show() astroML-0.3/book_figures/chapter8/fig_outlier_rejection.py0000644000076500000240000002323512422073173024632 0ustar jakevdpstaff00000000000000""" Perform Outlier Rejection with MCMC ----------------------------------- Figure 8.9 Bayesian outlier detection for the same data as shown in figure 8.8. The top-left panel shows the data, with the fits from each model. The top-right panel shows the 1-sigma and 2-sigma contours for the slope and intercept with no outlier correction: the resulting fit (shown by the dotted line) is clearly highly affected by the presence of outliers. The bottom-left panel shows the marginalized 1-sigma and 2-sigma contours for a mixture model (eq. 8.67). The bottom-right panel shows the marginalized 1-sigma and 2-sigma contours for a model in which points are identified individually as "good" or "bad" (eq. 8.68). The points which are identified by this method as bad with a probability greater than 68% are circled in the first panel. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_hogg2010test from astroML.plotting.mcmc import convert_to_stdev # Hack to fix import issue in older versions of pymc import scipy import scipy.misc scipy.derivative = scipy.misc.derivative import pymc #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) np.random.seed(0) #------------------------------------------------------------ # Get data: this includes outliers data = fetch_hogg2010test() xi = data['x'] yi = data['y'] dyi = data['sigma_y'] #---------------------------------------------------------------------- # First model: no outlier correction # define priors on beta = (slope, intercept) @pymc.stochastic def beta_M0(value=np.array([2., 100.])): """Slope and intercept parameters for a straight line. The likelihood corresponds to the prior probability of the parameters.""" slope, intercept = value prob_intercept = 1 + 0 * intercept # uniform prior on theta = arctan(slope) # d[arctan(x)]/dx = 1 / (1 + x^2) prob_slope = np.log(1. / (1. 
+ slope ** 2)) return prob_intercept + prob_slope @pymc.deterministic def model_M0(xi=xi, beta=beta_M0): slope, intercept = beta return slope * xi + intercept y = pymc.Normal('y', mu=model_M0, tau=dyi ** -2, observed=True, value=yi) M0 = dict(beta_M0=beta_M0, model_M0=model_M0, y=y) #---------------------------------------------------------------------- # Second model: nuisance variables correcting for outliers # This is the mixture model given in equation 17 in Hogg et al # define priors on beta = (slope, intercept) @pymc.stochastic def beta_M1(value=np.array([2., 100.])): """Slope and intercept parameters for a straight line. The likelihood corresponds to the prior probability of the parameters.""" slope, intercept = value prob_intercept = 1 + 0 * intercept # uniform prior on theta = arctan(slope) # d[arctan(x)]/dx = 1 / (1 + x^2) prob_slope = np.log(1. / (1. + slope ** 2)) return prob_intercept + prob_slope @pymc.deterministic def model_M1(xi=xi, beta=beta_M1): slope, intercept = beta return slope * xi + intercept # uniform prior on Pb, the fraction of bad points Pb = pymc.Uniform('Pb', 0, 1.0, value=0.1) # uniform prior on Yb, the centroid of the outlier distribution Yb = pymc.Uniform('Yb', -10000, 10000, value=0) # uniform prior on log(sigmab), the spread of the outlier distribution log_sigmab = pymc.Uniform('log_sigmab', -10, 10, value=5) @pymc.deterministic def sigmab(log_sigmab=log_sigmab): return np.exp(log_sigmab) # set up the expression for likelihood def mixture_likelihood(yi, model, dyi, Pb, Yb, sigmab): """Equation 17 of Hogg 2010""" if (Pb < 0) or (Pb > 1): raise ValueError("Pb out of range. This is a bug in PyMC: try " "re-running the script (and see " "https://github.com/pymc-devs/pymc/issues/629)") Vi = dyi ** 2 Vb = sigmab ** 2 root2pi = np.sqrt(2 * np.pi) L_in = (1. / root2pi / dyi * np.exp(-0.5 * (yi - model) ** 2 / Vi)) L_out = (1. / root2pi / np.sqrt(Vi + Vb) * np.exp(-0.5 * (yi - Yb) ** 2 / (Vi + Vb))) return np.sum(np.log((1 - Pb) * L_in + Pb * L_out)) MixtureNormal = pymc.stochastic_from_dist('mixturenormal', logp=mixture_likelihood, dtype=np.float, mv=True) y_mixture = MixtureNormal('y_mixture', model=model_M1, dyi=dyi, Pb=Pb, Yb=Yb, sigmab=sigmab, observed=True, value=yi) M1 = dict(y_mixture=y_mixture, beta_M1=beta_M1, model_M1=model_M1, Pb=Pb, Yb=Yb, log_sigmab=log_sigmab, sigmab=sigmab) #---------------------------------------------------------------------- # Third model: marginalizes over the probability that each point is an outlier. # define priors on beta = (slope, intercept) @pymc.stochastic def beta_M2(value=np.array([2., 100.])): """Slope and intercept parameters for a straight line. The likelihood corresponds to the prior probability of the parameters.""" slope, intercept = value prob_intercept = 1 + 0 * intercept # uniform prior on theta = arctan(slope) # d[arctan(x)]/dx = 1 / (1 + x^2) prob_slope = np.log(1. / (1. 
+ slope ** 2)) return prob_intercept + prob_slope @pymc.deterministic def model_M2(xi=xi, beta=beta_M2): slope, intercept = beta return slope * xi + intercept # qi is bernoulli distributed # Note: this syntax requires pymc version 2.2 qi = pymc.Bernoulli('qi', p=1 - Pb, value=np.ones(len(xi))) def outlier_likelihood(yi, mu, dyi, qi, Yb, sigmab): """likelihood for full outlier posterior""" Vi = dyi ** 2 Vb = sigmab ** 2 root2pi = np.sqrt(2 * np.pi) logL_in = -0.5 * np.sum(qi * (np.log(2 * np.pi * Vi) + (yi - mu) ** 2 / Vi)) logL_out = -0.5 * np.sum((1 - qi) * (np.log(2 * np.pi * (Vi + Vb)) + (yi - Yb) ** 2 / (Vi + Vb))) return logL_out + logL_in OutlierNormal = pymc.stochastic_from_dist('outliernormal', logp=outlier_likelihood, dtype=np.float, mv=True) y_outlier = OutlierNormal('y_outlier', mu=model_M2, dyi=dyi, Yb=Yb, sigmab=sigmab, qi=qi, observed=True, value=yi) M2 = dict(y_outlier=y_outlier, beta_M2=beta_M2, model_M2=model_M2, qi=qi, Pb=Pb, Yb=Yb, log_sigmab=log_sigmab, sigmab=sigmab) #------------------------------------------------------------ # plot the data fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.25, bottom=0.1, top=0.95, hspace=0.2) # first axes: plot the data ax1 = fig.add_subplot(221) ax1.errorbar(xi, yi, dyi, fmt='.k', ecolor='gray', lw=1) ax1.set_xlabel('$x$') ax1.set_ylabel('$y$') #------------------------------------------------------------ # Go through models; compute and plot likelihoods models = [M0, M1, M2] linestyles = [':', '--', '-'] labels = ['no outlier correction\n(dotted fit)', 'mixture model\n(dashed fit)', 'outlier rejection\n(solid fit)'] x = np.linspace(0, 350, 10) bins = [(np.linspace(140, 300, 51), np.linspace(0.6, 1.6, 51)), (np.linspace(-40, 120, 51), np.linspace(1.8, 2.8, 51)), (np.linspace(-40, 120, 51), np.linspace(1.8, 2.8, 51))] for i, M in enumerate(models): S = pymc.MCMC(M) S.sample(iter=25000, burn=5000) trace = S.trace('beta_M%i' % i) H2D, bins1, bins2 = np.histogram2d(trace[:, 0], trace[:, 1], bins=50) w = np.where(H2D == H2D.max()) # choose the maximum posterior slope and intercept slope_best = bins1[w[0][0]] intercept_best = bins2[w[1][0]] # plot the best-fit line ax1.plot(x, intercept_best + slope_best * x, linestyles[i], c='k') # For the model which identifies bad points, # plot circles around points identified as outliers. 
if i == 2: qi = S.trace('qi')[:] Pi = qi.astype(float).mean(0) outlier_x = xi[Pi < 0.32] outlier_y = yi[Pi < 0.32] ax1.scatter(outlier_x, outlier_y, lw=1, s=400, alpha=0.5, facecolors='none', edgecolors='red') # plot the likelihood contours ax = plt.subplot(222 + i) H, xbins, ybins = np.histogram2d(trace[:, 1], trace[:, 0], bins=bins[i]) H[H == 0] = 1E-16 Nsigma = convert_to_stdev(np.log(H)) ax.contour(0.5 * (xbins[1:] + xbins[:-1]), 0.5 * (ybins[1:] + ybins[:-1]), Nsigma.T, levels=[0.683, 0.955], colors='black') ax.set_xlabel('intercept') ax.set_ylabel('slope') ax.grid(color='gray') ax.xaxis.set_major_locator(plt.MultipleLocator(40)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.text(0.98, 0.98, labels[i], ha='right', va='top', bbox=dict(fc='w', ec='none', alpha=0.5), transform=ax.transAxes) ax.set_xlim(bins[i][0][0], bins[i][0][-1]) ax.set_ylim(bins[i][1][0], bins[i][1][-1]) ax1.set_xlim(0, 350) ax1.set_ylim(100, 700) plt.show() astroML-0.3/book_figures/chapter8/fig_rbf_ridge_mu_z.py0000644000076500000240000001124612422073013024052 0ustar jakevdpstaff00000000000000""" Regularized Regression Example ------------------------------ Figure 8.4 Regularized regression for the same sample as Fig. 8.2. Here we use Gaussian basis function regression with a Gaussian of width sigma = 0.2 centered at 100 regular intervals between 0 < z < 2. The lower panels show the best-fit weights as a function of basis function position. The left column shows the results with no regularization: the basis function weights w are on the order of 10^8, and overfitting is evident. The middle column shows ridge regression (L2 regularization) with alpha = 0.005, and the right column shows LASSO regression (L1 regularization) with alpha = 0.005. All three methods are fit without the bias term (intercept). Changes from Published Version ++++++++++++++++++++++++++++++ Note that this figure has been changed slightly from its published version: the original version of the figure did not take into account data errors. The update (as of astroML version 0.3) correctly takes into account data errors. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import lognorm from astroML.linear_model import LinearRegression from astroML.cosmology import Cosmology from astroML.datasets import generate_mu_z #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # generate data np.random.seed(0) z_sample, mu_sample, dmu = generate_mu_z(100, random_state=0) cosmo = Cosmology() z = np.linspace(0.01, 2, 1000) mu = np.asarray([cosmo.mu(zi) for zi in z]) #------------------------------------------------------------ # Manually convert data to a gaussian basis # note that the data errors dmu are passed to each fit below.
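#------------------------------------------------------------
# A standalone sketch of why regularization is needed here: 100 strongly
# overlapping Gaussian basis functions give a nearly singular design
# matrix, so unregularized least-squares weights can grow enormous
# (of order 10^8, per the figure caption). Illustrative values only:
import numpy as np
rng_demo = np.random.RandomState(42)
z_demo = 2 * rng_demo.rand(100)
centers_demo = np.linspace(0, 1.8, 100)
X_demo = np.exp(-0.5 * ((z_demo[:, None] - centers_demo) / 0.2) ** 2)
print(np.linalg.cond(X_demo))  # a huge condition number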
def gaussian_basis(x, mu, sigma): return np.exp(-0.5 * ((x - mu) / sigma) ** 2) centers = np.linspace(0, 1.8, 100) widths = 0.2 X = gaussian_basis(z_sample[:, np.newaxis], centers, widths) #------------------------------------------------------------ # Set up the figure to plot the results fig = plt.figure(figsize=(5, 2.7)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95, hspace=0.15, wspace=0.2) regularization = ['none', 'l2', 'l1'] kwargs = [dict(), dict(alpha=0.005), dict(alpha=0.001)] labels = ['Linear Regression', 'Ridge Regression', 'Lasso Regression'] for i in range(3): clf = LinearRegression(regularization=regularization[i], fit_intercept=True, kwds=kwargs[i]) clf.fit(X, mu_sample, dmu) w = clf.coef_[1:] fit = clf.predict(gaussian_basis(z[:, None], centers, widths)) # plot fit ax = fig.add_subplot(231 + i) ax.xaxis.set_major_formatter(plt.NullFormatter()) # plot curves for regularized fits if i == 0: ax.set_ylabel('$\mu$') else: ax.yaxis.set_major_formatter(plt.NullFormatter()) curves = 37 + w * gaussian_basis(z[:, np.newaxis], centers, widths) curves = curves[:, abs(w) > 0.01] ax.plot(z, curves, c='gray', lw=1, alpha=0.5) ax.plot(z, fit, '-k') ax.plot(z, mu, '--', c='gray') ax.errorbar(z_sample, mu_sample, dmu, fmt='.k', ecolor='gray', lw=1, ms=4) ax.set_xlim(0.001, 1.8) ax.set_ylim(36, 52) ax.text(0.05, 0.93, labels[i], ha='left', va='top', bbox=dict(boxstyle='round', ec='k', fc='w'), transform=ax.transAxes) # plot weights ax = plt.subplot(234 + i) ax.xaxis.set_major_locator(plt.MultipleLocator(0.5)) ax.set_xlabel('$z$') if i == 0: ax.set_ylabel(r'$\theta$') w *= 1E-12 ax.text(0, 1.01, r'$\rm \times 10^{12}$', transform=ax.transAxes) ax.scatter(centers, w, s=9, lw=0, c='k') ax.set_xlim(-0.05, 1.8) if i == 1: ax.set_ylim(-2, 4) elif i == 2: ax.set_ylim(-0.5, 2) ax.text(0.05, 0.93, labels[i], ha='left', va='top', bbox=dict(boxstyle='round', ec='k', fc='w'), transform=ax.transAxes) plt.show() astroML-0.3/book_figures/chapter8/fig_regression_mu_z.py0000644000076500000240000000734512420767763024330 0ustar jakevdpstaff00000000000000""" Cosmology Regression Example ---------------------------- Figure 8.2 Various regression fits to the distance modulus vs. redshift relation for a simulated set of 100 supernovas, selected from a distribution :math:`p(z) \propto (z/z_0)^2 \exp[-(z/z_0)^{1.5}]` with :math:`z_0 = 0.3`. Gaussian basis functions have 15 Gaussians evenly spaced between z = 0 and 2, with widths of 0.14. Kernel regression uses a Gaussian kernel with width 0.1. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import lognorm from astroML.cosmology import Cosmology from astroML.datasets import generate_mu_z from astroML.linear_model import LinearRegression, PolynomialRegression,\ BasisFunctionRegression, NadarayaWatson #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False.
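#------------------------------------------------------------
# For reference, the L2 (ridge) fit in fig_rbf_ridge_mu_z.py above has the
# closed-form solution w = (X^T X + alpha I)^{-1} X^T y; increasing alpha
# can only shrink the norm of w. A minimal sketch on synthetic data
# (assumed values, for illustration only):
import numpy as np

def ridge_weights(X, y, alpha):
    """Closed-form ridge-regression weights."""
    A = np.dot(X.T, X) + alpha * np.eye(X.shape[1])
    return np.linalg.solve(A, np.dot(X.T, y))

rng_demo = np.random.RandomState(1)
X_demo = rng_demo.rand(50, 20)
y_demo = rng_demo.rand(50)
print(np.linalg.norm(ridge_weights(X_demo, y_demo, 0.0)),
      np.linalg.norm(ridge_weights(X_demo, y_demo, 0.005)))  # norm shrinks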
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Generate data z_sample, mu_sample, dmu = generate_mu_z(100, random_state=0) cosmo = Cosmology() z = np.linspace(0.01, 2, 1000) mu_true = np.asarray([cosmo.mu(zi) for zi in z]) #------------------------------------------------------------ # Define our classifiers basis_mu = np.linspace(0, 2, 15)[:, None] basis_sigma = 3 * (basis_mu[1] - basis_mu[0]) subplots = [221, 222, 223, 224] classifiers = [LinearRegression(), PolynomialRegression(4), BasisFunctionRegression('gaussian', mu=basis_mu, sigma=basis_sigma), NadarayaWatson('gaussian', h=0.1)] text = ['Straight-line Regression', '4th degree Polynomial\n Regression', 'Gaussian Basis Function\n Regression', 'Gaussian Kernel\n Regression'] # number of constraints of the model. Because # Nadaraya-watson is just a weighted mean, it has only one constraint n_constraints = [2, 5, len(basis_mu) + 1, 1] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 5)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.1, top=0.95, hspace=0.05, wspace=0.05) for i in range(4): ax = fig.add_subplot(subplots[i]) # fit the data clf = classifiers[i] clf.fit(z_sample[:, None], mu_sample, dmu) mu_sample_fit = clf.predict(z_sample[:, None]) mu_fit = clf.predict(z[:, None]) chi2_dof = (np.sum(((mu_sample_fit - mu_sample) / dmu) ** 2) / (len(mu_sample) - n_constraints[i])) ax.plot(z, mu_fit, '-k') ax.plot(z, mu_true, '--', c='gray') ax.errorbar(z_sample, mu_sample, dmu, fmt='.k', ecolor='gray', lw=1) ax.text(0.5, 0.05, r"$\chi^2_{\rm dof} = %.2f$" % chi2_dof, ha='center', va='bottom', transform=ax.transAxes) ax.set_xlim(0.01, 1.8) ax.set_ylim(36.01, 48) ax.text(0.05, 0.95, text[i], ha='left', va='top', transform=ax.transAxes) if i in (0, 2): ax.set_ylabel(r'$\mu$') else: ax.yaxis.set_major_formatter(plt.NullFormatter()) if i in (2, 3): ax.set_xlabel(r'$z$') else: ax.xaxis.set_major_formatter(plt.NullFormatter()) plt.show() astroML-0.3/book_figures/chapter8/fig_total_least_squares.py0000644000076500000240000001105512252721253025160 0ustar jakevdpstaff00000000000000""" Total Least Squares Figure -------------------------- Figure 8.6 A linear fit to data with correlated errors in x and y. In the literature, this is often referred to as total least squares or errors-in-variables fitting. The left panel shows the lines of best fit; the right panel shows the likelihood contours in slope/intercept space. The points are the same set used for the examples in Hogg, Bovy & Lang 2010. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from scipy import optimize from matplotlib import pyplot as plt from matplotlib.patches import Ellipse from astroML.linear_model import TLS_logL from astroML.plotting.mcmc import convert_to_stdev from astroML.datasets import fetch_hogg2010test #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. 
In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define some convenience functions # translate between typical slope-intercept representation, # and the normal vector representation def get_m_b(beta): b = np.dot(beta, beta) / beta[1] m = -beta[0] / beta[1] return m, b def get_beta(m, b): denom = (1 + m * m) return np.array([-b * m / denom, b / denom]) # compute the ellipse principal axes and rotation from covariance def get_principal(sigma_x, sigma_y, rho_xy): sigma_xy2 = rho_xy * sigma_x * sigma_y alpha = 0.5 * np.arctan2(2 * sigma_xy2, (sigma_x ** 2 - sigma_y ** 2)) tmp1 = 0.5 * (sigma_x ** 2 + sigma_y ** 2) tmp2 = np.sqrt(0.25 * (sigma_x ** 2 - sigma_y ** 2) ** 2 + sigma_xy2 ** 2) return np.sqrt(tmp1 + tmp2), np.sqrt(tmp1 - tmp2), alpha # plot ellipses def plot_ellipses(x, y, sigma_x, sigma_y, rho_xy, factor=2, ax=None): if ax is None: ax = plt.gca() sigma1, sigma2, alpha = get_principal(sigma_x, sigma_y, rho_xy) for i in range(len(x)): ax.add_patch(Ellipse((x[i], y[i]), factor * sigma1[i], factor * sigma2[i], alpha[i] * 180. / np.pi, fc='none', ec='k')) #------------------------------------------------------------ # We'll use the data from table 1 of Hogg et al. 2010 data = fetch_hogg2010test() data = data[5:] # no outliers x = data['x'] y = data['y'] sigma_x = data['sigma_x'] sigma_y = data['sigma_y'] rho_xy = data['rho_xy'] #------------------------------------------------------------ # Find best-fit parameters X = np.vstack((x, y)).T dX = np.zeros((len(x), 2, 2)) dX[:, 0, 0] = sigma_x ** 2 dX[:, 1, 1] = sigma_y ** 2 dX[:, 0, 1] = dX[:, 1, 0] = rho_xy * sigma_x * sigma_y min_func = lambda beta: -TLS_logL(beta, X, dX) beta_fit = optimize.fmin(min_func, x0=[-1, 1]) #------------------------------------------------------------ # Plot the data and fits fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.1, right=0.95, wspace=0.25, bottom=0.15, top=0.9) #------------------------------------------------------------ # first let's visualize the data ax = fig.add_subplot(121) ax.scatter(x, y, c='k', s=9) plot_ellipses(x, y, sigma_x, sigma_y, rho_xy, ax=ax) #------------------------------------------------------------ # plot the best-fit line m_fit, b_fit = get_m_b(beta_fit) x_fit = np.linspace(0, 300, 10) ax.plot(x_fit, m_fit * x_fit + b_fit, '-k') ax.set_xlim(40, 250) ax.set_ylim(100, 600) ax.set_xlabel('$x$') ax.set_ylabel('$y$') #------------------------------------------------------------ # plot the likelihood contour in m, b ax = fig.add_subplot(122) m = np.linspace(1.7, 2.8, 100) b = np.linspace(-60, 110, 100) logL = np.zeros((len(m), len(b))) for i in range(len(m)): for j in range(len(b)): logL[i, j] = TLS_logL(get_beta(m[i], b[j]), X, dX) ax.contour(m, b, convert_to_stdev(logL.T), levels=(0.683, 0.955, 0.997), colors='k') ax.set_xlabel('slope') ax.set_ylabel('intercept') ax.set_xlim(1.7, 2.8) ax.set_ylim(-60, 110) plt.show() astroML-0.3/book_figures/chapter8/README.rst0000644000076500000240000000035112115147567021377 0ustar jakevdpstaff00000000000000Chapter 8: Regression and Model Fitting --------------------------------------- This chapter covers linear and nonlinear regression and model fitting, including strategies which are robust to data error and the presence of outliers.
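#------------------------------------------------------------
# A quick standalone consistency check for the slope/intercept <->
# normal-vector conversions defined in fig_total_least_squares.py above:
# get_m_b and get_beta should be inverses of each other.
import numpy as np

def get_m_b_demo(beta):
    return -beta[0] / beta[1], np.dot(beta, beta) / beta[1]

def get_beta_demo(m, b):
    denom = 1 + m * m
    return np.array([-b * m / denom, b / denom])

print(get_m_b_demo(get_beta_demo(2.2, 30.0)))  # ~(2.2, 30.0), up to roundoff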
astroML-0.3/book_figures/chapter9/0000755000076500000240000000000012462244012017676 5ustar jakevdpstaff00000000000000astroML-0.3/book_figures/chapter9/fig_bayes_DB.py0000644000076500000240000000356312252721253022560 0ustar jakevdpstaff00000000000000""" Bayes Decision Boundary ----------------------- Figure 9.1 An illustration of a decision boundary between two Gaussian distributions. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from scipy.stats import norm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Compute the two PDFs x = np.linspace(-3, 7, 1000) pdf1 = norm(0, 1).pdf(x) pdf2 = norm(3, 1.5).pdf(x) x_bound = x[np.where(pdf1 < pdf2)][0] #------------------------------------------------------------ # Plot the pdfs and decision boundary fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) ax.plot(x, pdf1, '-k', lw=1) ax.fill_between(x, pdf1, color='gray', alpha=0.5) ax.plot(x, pdf2, '-k', lw=1) ax.fill_between(x, pdf2, color='gray', alpha=0.5) # plot decision boundary ax.plot([x_bound, x_bound], [0, 0.5], '--k') ax.text(x_bound + 0.2, 0.49, "decision boundary", ha='left', va='top', rotation=90) ax.text(0, 0.2, '$g_1(x)$', ha='center', va='center') ax.text(3, 0.1, '$g_2(x)$', ha='center', va='center') ax.set_xlim(-2, 7) ax.set_ylim(0, 0.5) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') plt.show() astroML-0.3/book_figures/chapter9/fig_bayes_DB_2d.py0000644000076500000240000000435312252721253023143 0ustar jakevdpstaff00000000000000""" 2D Bayes Decision Boundary -------------------------- Plot a schematic of a two-dimensional decision boundary """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib.patches import Ellipse #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
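#------------------------------------------------------------
# The grid search in fig_bayes_DB.py above locates the decision boundary
# only to the grid resolution; a root finder on g(x) = pdf1(x) - pdf2(x)
# refines it. A minimal sketch using the same two Gaussians as above:
import numpy as np
from scipy.optimize import brentq
from scipy.stats import norm

g_diff = lambda xv: norm(0, 1).pdf(xv) - norm(3, 1.5).pdf(xv)
x_bound_refined = brentq(g_diff, 0, 3)  # bracket between the two means
print(x_bound_refined)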
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Set up diagram mu1 = (0.25, 0.25) mu2 = (0.85, 0.7) sigma1 = (0.5, 0.5) sigma2 = (0.25, 0.5) y_boundary = np.linspace(-0.1, 1.1, 100) x_boundary = (0.5 + 0.4 * (y_boundary - 0.9) ** 2) #------------------------------------------------------------ # Set up plot fig = plt.figure(figsize=(5, 5), facecolor='w') ax = fig.add_axes([0, 0, 1, 1], frameon=False, xticks=[], yticks=[]) # draw axes plt.annotate(r'$x_1$', (-0.08, -0.02), (1.05, -0.02), ha='center', va='center', arrowprops=dict(arrowstyle='<-', color='k')) plt.annotate(r'$x_2$', (-0.02, -0.08), (-0.02, 1.05), ha='center', va='center', arrowprops=dict(arrowstyle='<-', color='k')) # draw ellipses, points, and boundaries ax.scatter(mu1[:1], mu1[1:], c='k') ax.scatter(mu2[:1], mu2[1:], c='k') ax.add_patch(Ellipse(mu1, sigma1[0], sigma1[1], fc='none', ec='k')) ax.add_patch(Ellipse(mu2, sigma2[0], sigma2[1], fc='none', ec='k')) ax.text(mu1[0] + 0.02, mu1[1] + 0.02, r'$\mu_1$') ax.text(mu2[0] + 0.02, mu2[1] + 0.02, r'$\mu_2$') ax.plot(x_boundary, y_boundary, '--k') ax.text(0.53, 0.28, "decision boundary", rotation=-70, ha='left', va='bottom') ax.set_xlim(-0.1, 1.1) ax.set_ylim(-0.1, 1.1) plt.show() astroML-0.3/book_figures/chapter9/fig_discriminant_function.py0000644000076500000240000000310112252721253025465 0ustar jakevdpstaff00000000000000""" Example of a Discriminant Function ---------------------------------- This plot shows a simple example of a discriminant function between two sets of points """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
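#------------------------------------------------------------
# For two Gaussian class densities with unequal covariances, the Bayes
# decision boundary is the level set where the log-densities are equal,
# which is a quadratic curve like the schematic above. A minimal sketch
# with assumed parameters (not the schematic's exact ellipse values):
import numpy as np
from scipy.stats import multivariate_normal

g1 = multivariate_normal(mean=[0.25, 0.25], cov=0.05 * np.eye(2))
g2 = multivariate_normal(mean=[0.85, 0.70], cov=[[0.02, 0.0], [0.0, 0.05]])
xg, yg = np.meshgrid(np.linspace(-0.1, 1.1, 200), np.linspace(-0.1, 1.1, 200))
log_ratio = g1.logpdf(np.dstack([xg, yg])) - g2.logpdf(np.dstack([xg, yg]))
# plt.contour(xg, yg, log_ratio, levels=[0]) would trace the boundary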
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # create some toy data np.random.seed(0) cluster_1 = np.random.normal([1, 0.5], 0.5, size=(10, 2)) cluster_2 = np.random.normal([-1, -0.5], 0.5, size=(10, 2)) #------------------------------------------------------------ # plot the data and boundary fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111, xticks=[], yticks=[]) ax.scatter(cluster_1[:, 0], cluster_1[:, 1], c='k', s=30) ax.scatter(cluster_2[:, 0], cluster_2[:, 1], c='w', s=30) ax.plot([0, 1], [1.5, -1.5], '-k', lw=2) ax.set_xlim(-2, 2.5) ax.set_ylim(-2, 2) plt.show() astroML-0.3/book_figures/chapter9/fig_photoz_bagging.py0000644000076500000240000000564012420767763024124 0ustar jakevdpstaff00000000000000""" Photometric Redshifts by Random Forests --------------------------------------- """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import itertools import numpy as np from matplotlib import pyplot as plt from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor from astroML.datasets import fetch_sdss_specgals #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
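#------------------------------------------------------------
# The random-forest idea used below, in minimal form: fit many trees on
# bootstrap resamples and average their predictions, trading a single
# overfit tree for a lower-variance ensemble. A self-contained sketch on
# synthetic data (illustrative, not the SDSS sample):
import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng_demo = np.random.RandomState(0)
X_demo = np.sort(5 * rng_demo.rand(200, 1), axis=0)
y_demo = np.sin(X_demo).ravel() + 0.3 * rng_demo.randn(200)

tree_preds = []
for i in range(10):
    idx = rng_demo.randint(0, len(X_demo), len(X_demo))  # bootstrap sample
    tree = DecisionTreeRegressor(max_depth=15, random_state=i)
    tree_preds.append(tree.fit(X_demo[idx], y_demo[idx]).predict(X_demo))
y_bagged = np.mean(tree_preds, 0)  # the averaged ("bagged") prediction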
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) data = fetch_sdss_specgals() # put magnitudes in a matrix mag = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T z = data['z'] # train on ~60,000 points mag_train = mag[::10] z_train = z[::10] # test on ~6,000 distinct points mag_test = mag[1::100] z_test = z[1::100] def plot_results(z, z_fit, plotlabel=None, xlabel=True, ylabel=True): plt.scatter(z[::1], z_fit[::1], s=1, lw=0, c='k') plt.plot([-0.1, 0.4], [-0.1, 0.4], ':k') plt.xlim(-0.02, 0.4001) plt.ylim(-0.02, 0.4001) plt.gca().xaxis.set_major_locator(plt.MultipleLocator(0.1)) plt.gca().yaxis.set_major_locator(plt.MultipleLocator(0.1)) if plotlabel: plt.text(0.04, 0.96, plotlabel, ha='left', va='top', transform=ax.transAxes) if xlabel: plt.xlabel(r'$\rm z_{true}$') else: plt.gca().xaxis.set_major_formatter(plt.NullFormatter()) if ylabel: plt.ylabel(r'$\rm z_{fit}$') else: plt.gca().yaxis.set_major_formatter(plt.NullFormatter()) plt.figure(figsize=(5, 2.5)) plt.subplots_adjust(wspace=0.1, left=0.1, right=0.95, bottom=0.15, top=0.9) ax = plt.subplot(121) z_fit = DecisionTreeRegressor(max_depth=10).fit(mag_train, z_train).predict(mag_test) print("one tree: rms =", np.sqrt(np.mean((z_test - z_fit) ** 2))) plot_results(z_test, z_fit, plotlabel="Decision Tree") ax = plt.subplot(122) z_fit = RandomForestRegressor(n_estimators=10, max_depth=15).fit(mag_train, z_train).predict(mag_test) print("ten trees: rms =", np.sqrt(np.mean((z_test - z_fit) ** 2))) plot_results(z_test, z_fit, plotlabel="Random Forest\nof 10 trees", ylabel=False) plt.show() astroML-0.3/book_figures/chapter9/fig_photoz_basic.py0000644000076500000240000001330012252721253023562 0ustar jakevdpstaff00000000000000""" Photometric Redshifts via Linear Regression ------------------------------------------- Linear Regression for photometric redshifts We could use sklearn.linear_model.LinearRegression, but to be more transparent, we'll do it by hand using linear algebra. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import itertools import numpy as np from matplotlib import pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.metrics.pairwise import euclidean_distances from astroML.datasets import fetch_sdss_specgals #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
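#------------------------------------------------------------
# The docstring above promises a "by hand" linear-algebra solution; for
# reference, with a constant column in the feature matrix (as constructed
# below), ordinary least squares solves theta = (X^T X)^{-1} X^T y.
# A minimal sketch (np.linalg.lstsq is the numerically stable route):
import numpy as np

X_demo = np.vstack([np.ones(5), np.arange(5.0)]).T  # intercept + 1 feature
y_demo = 0.5 + 2.0 * np.arange(5.0)
theta_demo = np.linalg.lstsq(X_demo, y_demo, rcond=None)[0]
print(theta_demo)  # ~[0.5, 2.0]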
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) np.random.seed(0) data = fetch_sdss_specgals() # put magnitudes in a matrix # with a constant (for the intercept) at position zero mag = np.vstack([np.ones(data.shape)] + [data['modelMag_%s' % f] for f in 'ugriz']).T z = data['z'] # train on ~60,000 points mag_train = mag[::10] z_train = z[::10] # test on ~6,000 distinct points mag_test = mag[1::100] z_test = z[1::100] def plot_results(z, z_fit, plotlabel=None, xlabel=True, ylabel=True): plt.scatter(z, z_fit, s=1, lw=0, c='k') plt.plot([-0.1, 0.4], [-0.1, 0.4], ':k') plt.xlim(-0.05, 0.4001) plt.ylim(-0.05, 0.4001) plt.gca().xaxis.set_major_locator(plt.MultipleLocator(0.1)) plt.gca().yaxis.set_major_locator(plt.MultipleLocator(0.1)) if plotlabel: plt.text(0.03, 0.97, plotlabel, ha='left', va='top', transform=ax.transAxes) if xlabel: plt.xlabel(r'$\rm z_{true}$') else: plt.gca().xaxis.set_major_formatter(plt.NullFormatter()) if ylabel: plt.ylabel(r'$\rm z_{fit}$') else: plt.gca().yaxis.set_major_formatter(plt.NullFormatter()) def combinations_with_replacement(iterable, r): pool = tuple(iterable) n = len(pool) for indices in itertools.product(range(n), repeat=r): if sorted(indices) == list(indices): yield tuple(pool[i] for i in indices) def poly_features(X, p): """Compute polynomial features Parameters ---------- X: array_like shape (n_samples, n_features) p: int degree of polynomial Returns ------- X_p: array polynomial feature matrix """ X = np.asarray(X) N, D = X.shape ind = list(combinations_with_replacement(range(D), p)) X_poly = np.empty((X.shape[0], len(ind))) for i in range(len(ind)): X_poly[:, i] = X[:, ind[i]].prod(1) return X_poly def gaussian_RBF_features(X, centers, widths): """Compute gaussian Radial Basis Function features Parameters ---------- X: array_like shape (n_samples, n_features) centers: array_like shape (n_centers, n_features) widths: array_like shape (n_centers, n_features) or (n_centers,) Returns ------- X_RBF: array RBF feature matrix, shape=(n_samples, n_centers) """ X, centers, widths = map(np.asarray, (X, centers, widths)) if widths.ndim == 1: widths = widths[:, np.newaxis] return np.exp(-0.5 * ((X[:, np.newaxis, :] - centers) / widths) ** 2).sum(-1) plt.figure(figsize=(5, 5)) plt.subplots_adjust(hspace=0.05, wspace=0.05, left=0.1, right=0.95, bottom=0.1, top=0.95) #---------------------------------------------------------------------- # first do a simple linear regression between the r-band and redshift, # ignoring uncertainties ax = plt.subplot(221) X_train = mag_train[:, [0, 3]] X_test = mag_test[:, [0, 3]] z_fit = LinearRegression().fit(X_train, z_train).predict(X_test) plot_results(z_test, z_fit, plotlabel='Linear Regression:\n r-band', xlabel=False) #---------------------------------------------------------------------- # next do a linear regression with all bands ax = plt.subplot(222) z_fit = LinearRegression().fit(mag_train, z_train).predict(mag_test) plot_results(z_test, z_fit, plotlabel="Linear Regression:\n ugriz bands", xlabel=False, ylabel=False) #---------------------------------------------------------------------- # next do a 3rd-order polynomial regression with all bands ax = plt.subplot(223) X_train = poly_features(mag_train, 3) X_test = poly_features(mag_test, 3) z_fit = LinearRegression().fit(X_train, z_train).predict(X_test) plot_results(z_test, z_fit, plotlabel="3rd order Polynomial\nRegression") #---------------------------------------------------------------------- # next do a radial basis function 
regression with all bands ax = plt.subplot(224) # remove bias term mag = mag[:, 1:] mag_train = mag_train[:, 1:] mag_test = mag_test[:, 1:] centers = mag[np.random.randint(mag.shape[0], size=100)] centers_dist = euclidean_distances(centers, centers, squared=True) widths = np.sqrt(centers_dist[:, :10].mean(1)) X_train = gaussian_RBF_features(mag_train, centers, widths) X_test = gaussian_RBF_features(mag_test, centers, widths) z_fit = LinearRegression().fit(X_train, z_train).predict(X_test) plot_results(z_test, z_fit, plotlabel="Gaussian Basis Function\nRegression", ylabel=False) plt.show() astroML-0.3/book_figures/chapter9/fig_photoz_boosting.py0000644000076500000240000001055612420767763024354 0ustar jakevdpstaff00000000000000""" Photometric Redshifts by Random Forests --------------------------------------- Figure 9.16 Photometric redshift estimation using gradient-boosted decision trees, with 100 boosting steps. As with random forests (figure 9.15), boosting allows for improved results over the single tree case (figure 9.14). Note, however, that the computational cost of boosted decision trees is such that it is computationally prohibitive to use very deep trees. By stringing together a large number of very naive estimators, boosted trees improve on the underfitting of each individual estimator. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.ensemble import GradientBoostingRegressor from astroML.datasets import fetch_sdss_specgals from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch and prepare the data data = fetch_sdss_specgals() # put magnitudes in a matrix mag = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T z = data['z'] # train on ~60,000 points mag_train = mag[::10] z_train = z[::10] # test on ~6,000 distinct points mag_test = mag[1::100] z_test = z[1::100] #------------------------------------------------------------ # Compute the results # This is a long computation, so we'll save the results to a pickle. 
@pickle_results('photoz_boosting.pkl') def compute_photoz_forest(N_boosts): rms_test = np.zeros(len(N_boosts)) rms_train = np.zeros(len(N_boosts)) i_best = 0 z_fit_best = None for i, Nb in enumerate(N_boosts): try: # older versions of scikit-learn clf = GradientBoostingRegressor(n_estimators=Nb, learn_rate=0.1, max_depth=3, random_state=0) except TypeError: clf = GradientBoostingRegressor(n_estimators=Nb, learning_rate=0.1, max_depth=3, random_state=0) clf.fit(mag_train, z_train) z_fit_train = clf.predict(mag_train) z_fit = clf.predict(mag_test) rms_train[i] = np.mean(np.sqrt((z_fit_train - z_train) ** 2)) rms_test[i] = np.mean(np.sqrt((z_fit - z_test) ** 2)) if rms_test[i] <= rms_test[i_best]: i_best = i z_fit_best = z_fit return rms_test, rms_train, i_best, z_fit_best N_boosts = (10, 100, 200, 300, 400, 500) rms_test, rms_train, i_best, z_fit_best = compute_photoz_forest(N_boosts) best_N = N_boosts[i_best] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(wspace=0.25, left=0.1, right=0.95, bottom=0.15, top=0.9) # left panel: plot cross-validation results ax = fig.add_subplot(121) ax.plot(N_boosts, rms_test, '-k', label='cross-validation') ax.plot(N_boosts, rms_train, '--k', label='training set') ax.legend(loc=1) ax.set_xlabel('number of boosts') ax.set_ylabel('rms error') ax.set_xlim(0, 510) ax.set_ylim(0.009, 0.032) ax.yaxis.set_major_locator(plt.MultipleLocator(0.01)) ax.text(0.03, 0.03, "Tree depth: 3", ha='left', va='bottom', transform=ax.transAxes) # right panel: plot best fit ax = fig.add_subplot(122) ax.scatter(z_test, z_fit_best, s=1, lw=0, c='k') ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k') ax.text(0.04, 0.96, "N = %i\nrms = %.3f" % (best_N, rms_test[i_best]), ha='left', va='top', transform=ax.transAxes) ax.set_xlabel(r'$z_{\rm true}$') ax.set_ylabel(r'$z_{\rm fit}$') ax.set_xlim(-0.02, 0.4001) ax.set_ylim(-0.02, 0.4001) ax.xaxis.set_major_locator(plt.MultipleLocator(0.1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.1)) plt.show() astroML-0.3/book_figures/chapter9/fig_photoz_forest.py0000644000076500000240000000743612252721253024020 0ustar jakevdpstaff00000000000000""" Photometric Redshifts by Random Forests --------------------------------------- Figure 9.15 Photometric redshift estimation using random forest regression, with ten random trees. Comparison to figure 9.14 shows that random forests correct for the overfitting evident in very deep decision trees. Here the optimal depth is 20 or above, and a much better cross-validation error is achieved. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.ensemble import RandomForestRegressor from astroML.datasets import fetch_sdss_specgals from astroML.decorators import pickle_results #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch and prepare the data data = fetch_sdss_specgals() # put magnitudes in a matrix mag = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T z = data['z'] # train on ~60,000 points mag_train = mag[::10] z_train = z[::10] # test on ~6,000 distinct points mag_test = mag[1::100] z_test = z[1::100] #------------------------------------------------------------ # Compute the results # This is a long computation, so we'll save the results to a pickle. @pickle_results('photoz_forest.pkl') def compute_photoz_forest(depth): rms_test = np.zeros(len(depth)) rms_train = np.zeros(len(depth)) i_best = 0 z_fit_best = None for i, d in enumerate(depth): clf = RandomForestRegressor(n_estimators=10, max_depth=d, random_state=0) clf.fit(mag_train, z_train) z_fit_train = clf.predict(mag_train) z_fit = clf.predict(mag_test) rms_train[i] = np.mean(np.sqrt((z_fit_train - z_train) ** 2)) rms_test[i] = np.mean(np.sqrt((z_fit - z_test) ** 2)) if rms_test[i] <= rms_test[i_best]: i_best = i z_fit_best = z_fit return rms_test, rms_train, i_best, z_fit_best depth = np.arange(1, 21) rms_test, rms_train, i_best, z_fit_best = compute_photoz_forest(depth) best_depth = depth[i_best] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(wspace=0.25, left=0.1, right=0.95, bottom=0.15, top=0.9) # left panel: plot cross-validation results ax = fig.add_subplot(121) ax.plot(depth, rms_test, '-k', label='cross-validation') ax.plot(depth, rms_train, '--k', label='training set') ax.legend(loc=1) ax.set_xlabel('depth of tree') ax.set_ylabel('rms error') ax.set_xlim(0, 21) ax.set_ylim(0.009, 0.04) ax.yaxis.set_major_locator(plt.MultipleLocator(0.01)) # right panel: plot best fit ax = fig.add_subplot(122) ax.scatter(z_test, z_fit_best, s=1, lw=0, c='k') ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k') ax.text(0.03, 0.97, "depth = %i\nrms = %.3f" % (best_depth, rms_test[i_best]), ha='left', va='top', transform=ax.transAxes) ax.set_xlabel(r'$z_{\rm true}$') ax.set_ylabel(r'$z_{\rm fit}$') ax.set_xlim(-0.02, 0.4001) ax.set_ylim(-0.02, 0.4001) ax.xaxis.set_major_locator(plt.MultipleLocator(0.1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.1)) plt.show() astroML-0.3/book_figures/chapter9/fig_photoz_tree.py0000644000076500000240000000706512252721253023453 0ustar jakevdpstaff00000000000000""" Photometric Redshifts by Decision Trees --------------------------------------- Figure 9.14 Photometric redshift estimation using decision-tree regression. The data is described in Section 1.5.5. The training set consists of u, g , r, i, z magnitudes of 60,000 galaxies from the SDSS spectroscopic sample. Cross-validation is performed on an additional 6000 galaxies. The left panel shows training error and cross-validation error as a function of the maximum depth of the tree. For a number of nodes N > 13, overfitting is evident. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn.tree import DecisionTreeRegressor from astroML.datasets import fetch_sdss_specgals #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Fetch data and prepare it for the computation data = fetch_sdss_specgals() # put magnitudes in a matrix mag = np.vstack([data['modelMag_%s' % f] for f in 'ugriz']).T z = data['z'] # train on ~60,000 points mag_train = mag[::10] z_train = z[::10] # test on ~6,000 separate points mag_test = mag[1::100] z_test = z[1::100] #------------------------------------------------------------ # Compute the cross-validation scores for several tree depths depth = np.arange(1, 21) rms_test = np.zeros(len(depth)) rms_train = np.zeros(len(depth)) i_best = 0 z_fit_best = None for i, d in enumerate(depth): clf = DecisionTreeRegressor(max_depth=d, random_state=0) clf.fit(mag_train, z_train) z_fit_train = clf.predict(mag_train) z_fit = clf.predict(mag_test) rms_train[i] = np.mean(np.sqrt((z_fit_train - z_train) ** 2)) rms_test[i] = np.mean(np.sqrt((z_fit - z_test) ** 2)) if rms_test[i] <= rms_test[i_best]: i_best = i z_fit_best = z_fit best_depth = depth[i_best] #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(wspace=0.25, left=0.1, right=0.95, bottom=0.15, top=0.9) # first panel: cross-validation ax = fig.add_subplot(121) ax.plot(depth, rms_test, '-k', label='cross-validation') ax.plot(depth, rms_train, '--k', label='training set') ax.set_xlabel('depth of tree') ax.set_ylabel('rms error') ax.yaxis.set_major_locator(plt.MultipleLocator(0.01)) ax.set_xlim(0, 21) ax.set_ylim(0.009, 0.04) ax.legend(loc=1) # second panel: best-fit results ax = fig.add_subplot(122) ax.scatter(z_test, z_fit_best, s=1, lw=0, c='k') ax.plot([-0.1, 0.4], [-0.1, 0.4], ':k') ax.text(0.04, 0.96, "depth = %i\nrms = %.3f" % (best_depth, rms_test[i_best]), ha='left', va='top', transform=ax.transAxes) ax.set_xlabel(r'$z_{\rm true}$') ax.set_ylabel(r'$z_{\rm fit}$') ax.set_xlim(-0.02, 0.4001) ax.set_ylim(-0.02, 0.4001) ax.xaxis.set_major_locator(plt.MultipleLocator(0.1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.1)) plt.show() astroML-0.3/book_figures/chapter9/fig_ROC_curve.py0000644000076500000240000001116012420767763022744 0ustar jakevdpstaff00000000000000""" RR Lyrae ROC Curves ------------------- Figure 9.17 ROC curves (left panel) and completeness-efficiency curves (left panel) for the four-color RR Lyrae data using several of the classifiers explored in this chapter: Gaussian naive Bayes (GNB), linear discriminant analysis (LDA), quadratic discriminant analysis (QDA), logistic regression (LR), K -nearest-neighbor classification (KNN), decision tree classification (DT), and GMM Bayes 
classification (GMMB). """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.naive_bayes import GaussianNB from sklearn.lda import LDA from sklearn.qda import QDA from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from astroML.classification import GMMBayes from sklearn.metrics import precision_recall_curve, roc_curve from astroML.utils import split_samples, completeness_contamination from astroML.datasets import fetch_rrlyrae_combined #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() y = y.astype(int) (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) #------------------------------------------------------------ # Fit all the models to the training data def compute_models(*args): names = [] probs = [] for classifier, kwargs in args: print(classifier.__name__) clf = classifier(**kwargs) clf.fit(X_train, y_train) y_probs = clf.predict_proba(X_test)[:, 1] names.append(classifier.__name__) probs.append(y_probs) return names, probs names, probs = compute_models((GaussianNB, {}), (LDA, {}), (QDA, {}), (LogisticRegression, dict(class_weight='auto')), (KNeighborsClassifier, dict(n_neighbors=10)), (DecisionTreeClassifier, dict(random_state=0, max_depth=12, criterion='entropy')), (GMMBayes, dict(n_components=3, min_covar=1E-5, covariance_type='full'))) #------------------------------------------------------------ # Plot ROC curves and completeness/efficiency fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15, top=0.9, wspace=0.25) # ax2 will show roc curves ax1 = plt.subplot(121) # ax1 will show completeness/efficiency ax2 = plt.subplot(122) labels = dict(GaussianNB='GNB', LDA='LDA', QDA='QDA', KNeighborsClassifier='KNN', DecisionTreeClassifier='DT', GMMBayes='GMMB', LogisticRegression='LR') thresholds = np.linspace(0, 1, 1001)[:-1] # iterate through and show results for name, y_prob in zip(names, probs): fpr, tpr, thresh = roc_curve(y_test, y_prob) # add (0, 0) as first point fpr = np.concatenate([[0], fpr]) tpr = np.concatenate([[0], tpr]) ax1.plot(fpr, tpr, label=labels[name]) comp = np.zeros_like(thresholds) cont = np.zeros_like(thresholds) for i, t in enumerate(thresholds): y_pred = (y_prob >= t) comp[i], cont[i] = completeness_contamination(y_pred, y_test) ax2.plot(1 - cont, comp, label=labels[name]) ax1.set_xlim(0, 0.04) ax1.set_ylim(0, 1.02) ax1.xaxis.set_major_locator(plt.MaxNLocator(5)) ax1.set_xlabel('false positive rate') ax1.set_ylabel('true positive rate') ax1.legend(loc=4) ax2.set_xlabel('efficiency') 
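#------------------------------------------------------------
# For reference, the quantities plotted in both panels reduce to simple
# counts at a fixed threshold: completeness = true-positive rate, and
# efficiency = 1 - contamination. A minimal sketch with made-up labels:
import numpy as np
yt = np.array([1, 1, 0, 0, 1, 0])  # true labels (illustrative)
yp = np.array([1, 0, 0, 1, 1, 0])  # predicted labels at some threshold
TP = np.sum((yp == 1) & (yt == 1))  # = 2
FP = np.sum((yp == 1) & (yt == 0))  # = 1
FN = np.sum((yp == 0) & (yt == 1))  # = 1
TN = np.sum((yp == 0) & (yt == 0))  # = 2
print(TP / float(TP + FN))       # completeness (TPR) = 2/3
print(FP / float(FP + TN))       # false positive rate = 1/3
print(1 - FP / float(TP + FP))   # efficiency = 1 - contamination = 2/3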
ax2.set_ylabel('completeness') ax2.set_xlim(0, 1.0) ax2.set_ylim(0.2, 1.02) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_decisiontree.py0000644000076500000240000001111512420767763025332 0ustar jakevdpstaff00000000000000""" Decision Tree Classification of photometry ------------------------------------------ Figure 9.13 Decision tree applied to the RR Lyrae data (see caption of figure 9.3 for details). This example uses tree depths of 7 and 12. With all four colors,this decision tree achieves a completeness of 0.569 and a contamination of 0.386. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.tree import DecisionTreeClassifier from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # Fit Decision tree Ncolors = np.arange(1, X.shape[1] + 1) classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) depths = [7, 12] for depth in depths: classifiers.append([]) predictions.append([]) for nc in Ncolors: clf = DecisionTreeClassifier(random_state=0, max_depth=depth, criterion='entropy') clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers[-1].append(clf) predictions[-1].append(y_pred) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # compute the decision boundary clf = classifiers[1][1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 101), np.linspace(ylim[0], ylim[1], 101)) Z = clf.predict(np.c_[yy.ravel(), xx.ravel()]) Z = Z.reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') 
ax.set_ylabel('$g-r$')

ax.text(0.02, 0.02, "depth = %i" % depths[1], transform=ax.transAxes)

# plot completeness vs Ncolors
ax = fig.add_subplot(222)
ax.plot(Ncolors, completeness[0], 'o-k', ms=6, label="depth=%i" % depths[0])
ax.plot(Ncolors, completeness[1], '^--k', ms=6, label="depth=%i" % depths[1])

ax.xaxis.set_major_locator(plt.MultipleLocator(1))
ax.yaxis.set_major_locator(plt.MultipleLocator(0.2))
ax.xaxis.set_major_formatter(plt.NullFormatter())

ax.set_ylabel('completeness')
ax.set_xlim(0.5, 4.5)
ax.set_ylim(-0.1, 1.1)
ax.grid(True)

# plot contamination vs Ncolors
ax = fig.add_subplot(224)
ax.plot(Ncolors, contamination[0], 'o-k', ms=6, label="depth=%i" % depths[0])
ax.plot(Ncolors, contamination[1], '^--k', ms=6, label="depth=%i" % depths[1])
ax.legend(loc='lower right', bbox_to_anchor=(1.0, 0.79))

ax.xaxis.set_major_locator(plt.MultipleLocator(1))
ax.yaxis.set_major_locator(plt.MultipleLocator(0.2))
ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i'))
ax.set_xlabel('N colors')
ax.set_ylabel('contamination')
ax.set_xlim(0.5, 4.5)
ax.set_ylim(-0.1, 1.1)
ax.grid(True)

plt.show()
astroML-0.3/book_figures/chapter9/fig_rrlyrae_forest.py0000644000076500000240000001464112420767763024164 0ustar jakevdpstaff00000000000000"""
Random Forest Classification of photometry
------------------------------------------
Random forest photometric classification of RR Lyrae stars. This uses
averaged photometry from the RR Lyrae catalog and Stripe 82 standards
catalogs.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
from __future__ import print_function, division

import numpy as np
from matplotlib import pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from astroML.datasets import fetch_rrlyrae_mags, fetch_sdss_S82standards

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)


def load_rrlyrae_data():
    """Load the RR Lyrae data.

    This will be used in several examples.
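
    Returns
    -------
    X : ndarray, shape (n_samples, 4)
        the (u-g, g-r, r-i, i-z) colors of the standard stars, followed by
        those of the RR Lyrae stars
    y : ndarray, shape (n_samples,)
        labels: 0 for standard (nonvariable) stars, 1 for RR Lyrae stars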
""" #---------------------------------------------------------------------- # Load data rrlyrae = fetch_rrlyrae_mags() standards = fetch_sdss_S82standards() # perform color cuts on standard stars # these come from eqns 1-4 of Sesar et al 2010, ApJ 708:717 u_g = standards['mmu_u'] - standards['mmu_g'] g_r = standards['mmu_g'] - standards['mmu_r'] r_i = standards['mmu_r'] - standards['mmu_i'] i_z = standards['mmu_i'] - standards['mmu_z'] standards = standards[(u_g > 0.7) & (u_g < 1.35) & (g_r > -0.15) & (g_r < 0.4) & (r_i > -0.15) & (r_i < 0.22) & (i_z > -0.21) & (i_z < 0.25)] #---------------------------------------------------------------------- # get magnitudes and colors; split into train and test sets mags_rr = np.vstack([rrlyrae[f + 'mag'] for f in 'ugriz']) colors_rr = mags_rr[:-1] - mags_rr[1:] mags_st = np.vstack([standards['mmu_' + f] for f in 'ugriz']) colors_st = mags_st[:-1] - mags_st[1:] # stack the two sets of colors together X = np.vstack((colors_st.T, colors_rr.T)) y = np.zeros(X.shape[0]) y[-colors_rr.shape[1]:] = 1 return X, y def split_samples(X, y, rseed=0, training_fraction=0.75): """split samples into training and test sets""" np.random.seed(0) indices = np.arange(len(y)) np.random.shuffle(indices) N_train = int(training_fraction * len(y)) X_train = X[indices[:N_train]] X_test = X[indices[N_train:]] y_train = y[indices[:N_train]] y_test = y[indices[N_train:]] return X_train, X_test, y_train, y_test #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = load_rrlyrae_data() # SVM takes several minutes to run, and is order[N^2] # truncating the dataset can be useful for experimentation. #X = X[::10] #y = y[::10] X_train, X_test, y_train, y_test = split_samples(X, y) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # Fit Decision tree Ncolors = np.arange(1, X.shape[1] + 1) classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) depths = [5, 20] for depth in depths: classifiers.append([]) predictions.append([]) for nc in Ncolors: clf = RandomForestClassifier(random_state=0, max_depth=depth, criterion='entropy') clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers[-1].append(clf) predictions[-1].append(y_pred) predictions = np.array(predictions) #---------------------------------------------------------------------- # compute completeness and contamination # matches = (predictions == y_test) tp = np.sum(matches & (y_test == 1), -1) tn = np.sum(matches & (y_test == 0), -1) fp = np.sum(~matches & (y_test == 0), -1) fn = np.sum(~matches & (y_test == 1), -1) completeness = tp * 1. / (tp + fn) contamination = fp * 1. 
/ (tp + fp) completeness[np.isnan(completeness)] = 0 contamination[np.isnan(contamination)] = 0 print("completeness", completeness) print("contamination", contamination) #---------------------------------------------------------------------- # plot the results plt.figure(figsize=(5, 2.5)) plt.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) ax = plt.subplot(121) plt.scatter(X[-N_plot:, 0], X[-N_plot:, 1], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) plt.clim(-0.5, 1) clf = classifiers[1][1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 101), np.linspace(ylim[0], ylim[1], 101)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) # smooth the boundary #from scipy.ndimage import gaussian_filter #Z = gaussian_filter(Z, 2) plt.contour(xx, yy, Z, [0.5], colors='k') plt.xlim(xlim) plt.ylim(ylim) plt.xlabel('$u-g$') plt.ylabel('$g-r$') plt.text(0.02, 0.02, "depth = %i" % depths[1], transform=ax.transAxes) ax = plt.subplot(222) plt.plot(Ncolors, completeness[0], 'o-k', ms=6, label="depth=%i" % depths[0]) plt.plot(Ncolors, completeness[1], '^--k', ms=6, label="depth=%i" % depths[1]) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) plt.ylabel('completeness') plt.xlim(0.5, 4.5) plt.ylim(-0.1, 1.1) plt.grid(True) ax = plt.subplot(224) plt.plot(Ncolors, contamination[0], 'o-k', ms=6, label="depth=%i" % depths[0]) plt.plot(Ncolors, contamination[1], '^--k', ms=6, label="depth=%i" % depths[1]) plt.legend(loc='lower right', bbox_to_anchor=(1.0, 0.79)) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) plt.xlabel('N colors') plt.ylabel('contamination') plt.xlim(0.5, 4.5) plt.ylim(-0.1, 1.1) plt.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_GMMbayes.py0000644000076500000240000001221012420767763024316 0ustar jakevdpstaff00000000000000""" Gausian Mixture Bayes Classification of photometry -------------------------------------------------- Figure 9.6 Gaussian mixture Bayes classifier for RR Lyrae stars (see caption of figure 9.3 for details). Here the left panel shows the decision boundary for the three-component model, and the right panel shows the completeness and contamination for both a one-component and three-component mixture model. With all four colors and a three-component model, GMM Bayes achieves a completeness of 0.686 and a contamination of 0.236. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from astroML.classification import GMMBayes from astroML.decorators import pickle_results from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. 
This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results # GMM-bayes takes several minutes to run, and is order[N^2] # truncating the dataset can be useful for experimentation. #X = X[::10] #y = y[::10] (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # perform GMM Bayes Ncolors = np.arange(1, X.shape[1] + 1) Ncomp = [1, 3] @pickle_results('GMMbayes_rrlyrae.pkl') def compute_GMMbayes(Ncolors, Ncomp): classifiers = [] predictions = [] for ncm in Ncomp: classifiers.append([]) predictions.append([]) for nc in Ncolors: clf = GMMBayes(ncm, min_covar=1E-5, covariance_type='full') clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers[-1].append(clf) predictions[-1].append(y_pred) return classifiers, predictions classifiers, predictions = compute_GMMbayes(Ncolors, Ncomp) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1][1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto', cmap=plt.cm.binary, zorder=1, extent=xlim + ylim) im.set_clim(0, 1.5) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness[0], '^--k', ms=6, label='N=%i' % Ncomp[0]) ax.plot(Ncolors, completeness[1], 'o-k', ms=6, label='N=%i' % Ncomp[1]) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) # plot contamination vs Ncolors ax = fig.add_subplot(224) ax.plot(Ncolors, contamination[0], '^--k', ms=6, label='N=%i' % Ncomp[0]) ax.plot(Ncolors, contamination[1], 'o-k', ms=6, label='N=%i' % Ncomp[1]) ax.legend(loc='lower right', bbox_to_anchor=(1.0, 0.78)) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() 
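#------------------------------------------------------------
# A quick cross-check (not part of the published figure): the completeness
# and contamination printed above can also be computed directly from
# confusion-matrix counts.  Here `y_pred_best` is a local name for the
# predictions of the three-component model trained on all four colors.
y_pred_best = np.asarray(predictions[1][-1])
tp = np.sum((y_pred_best == 1) & (y_test == 1))  # true positives
fp = np.sum((y_pred_best == 1) & (y_test == 0))  # false positives
fn = np.sum((y_pred_best == 0) & (y_test == 1))  # false negatives
print("completeness (by hand):", tp * 1. / (tp + fn))
print("contamination (by hand):", fp * 1. / (tp + fp))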
astroML-0.3/book_figures/chapter9/fig_rrlyrae_kernelsvm.py0000644000076500000240000001100712420767763024663 0ustar jakevdpstaff00000000000000""" Kernel SVM Classification of photometry --------------------------------------- Figure 9.11 Kernel SVM applied to the RR Lyrae data (see caption of figure 9.3 for details). This example uses a Gaussian kernel with gamma = 20. With all four colors, kernel SVM achieves a completeness of 1.0 and a contamination of 0.852. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.svm import SVC from sklearn import metrics from astroML.datasets import fetch_rrlyrae_mags from astroML.decorators import pickle_results from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # re-order the colors for better 1-color results # SVM takes several minutes to run, and is order[N^2] # truncating the dataset can be useful for experimentation. 
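# (A note on the kernel: with kernel='rbf' the decision function is built
#  from exp(-gamma * |x - x'|^2), so gamma acts as an inverse squared
#  length-scale in color space.  The large gamma=20 used below, taken from
#  the figure caption, therefore yields a very flexible boundary.)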
#X = X[::5] #y = y[::5] (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # Fit Kernel SVM Ncolors = np.arange(1, X.shape[1] + 1) @pickle_results('kernelSVM_rrlyrae.pkl') def compute_SVM(Ncolors): classifiers = [] predictions = [] for nc in Ncolors: # perform support vector classification clf = SVC(kernel='rbf', gamma=20.0, class_weight='auto') clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers.append(clf) predictions.append(y_pred) return classifiers, predictions classifiers, predictions = compute_SVM(Ncolors) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 101), np.linspace(ylim[0], ylim[1], 101)) Z = clf.predict(np.c_[yy.ravel(), xx.ravel()]) Z = Z.reshape(xx.shape) # smooth the boundary from scipy.ndimage import gaussian_filter Z = gaussian_filter(Z, 2) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) ax = fig.add_subplot(224) ax.plot(Ncolors, contamination, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_knn.py0000644000076500000240000001142012420767763023442 0ustar jakevdpstaff00000000000000""" K-neighbors Classification of photometry ---------------------------------------- Figure 9.7 K-nearest-neighbor classification for RR Lyrae stars (see caption of figure 9.3 for details). Here the left panel shows the decision boundary for the model based on K = 10 neighbors, and the right panel shows the completeness and contamination for both K = 1 and K = 10. With all four colors and K = 10, K-neighbors classification achieves a completeness of 0.533 and a contamination of 0.240. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from matplotlib import colors from sklearn.neighbors import KNeighborsClassifier from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # perform Classification classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) kvals = [1, 10] for k in kvals: classifiers.append([]) predictions.append([]) for nc in Ncolors: clf = KNeighborsClassifier(n_neighbors=k) clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers[-1].append(clf) predictions[-1].append(y_pred) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1][1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict(np.c_[yy.ravel(), xx.ravel()]) Z = Z.reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto', cmap=plt.cm.binary, zorder=1, extent=xlim + ylim) im.set_clim(0, 2) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') ax.text(0.02, 0.02, "k = %i" % kvals[1], transform=ax.transAxes) # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness[0], 'o-k', ms=6, label='k=%i' % kvals[0]) ax.plot(Ncolors, completeness[1], '^--k', ms=6, label='k=%i' % kvals[1]) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) 
ax.set_ylim(-0.1, 1.1) ax.grid(True) # plot contamination vs Ncolors ax = fig.add_subplot(224) ax.plot(Ncolors, contamination[0], 'o-k', ms=6, label='k=%i' % kvals[0]) ax.plot(Ncolors, contamination[1], '^--k', ms=6, label='k=%i' % kvals[1]) ax.legend(loc='lower right', bbox_to_anchor=(1.0, 0.79)) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_lda.py0000644000076500000240000001012212420767763023412 0ustar jakevdpstaff00000000000000""" LDA Classification of photometry -------------------------------- Figure 9.4 The linear discriminant boundary for RR Lyrae stars (see caption of figure 9.3 for details). With all four colors, LDA achieves a completeness of 0.672 and a contamination of 0.806. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from matplotlib import colors from sklearn.lda import LDA from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # perform LDA classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) for nc in Ncolors: clf = LDA() clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers.append(clf) predictions.append(y_pred) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto', cmap=plt.cm.binary, zorder=1, extent=xlim + ylim) im.set_clim(0, 1.5) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness, 'o-k', ms=6, label='unweighted') ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) # plot contamination vs Ncolors ax = fig.add_subplot(224) ax.plot(Ncolors, contamination, 'o-k', ms=6, label='unweighted') ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_logreg.py0000644000076500000240000001023412420767763024135 0ustar jakevdpstaff00000000000000""" Logistic Regression of photometry --------------------------------- Figure 9.8 Logistic regression for RR Lyrae stars (see caption of figure 9.3 for details). With all four colors, logistic regression achieves a completeness of 0.993 and a contamination of 0.838. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from matplotlib import colors from sklearn.linear_model import LogisticRegression from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # perform Classification classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) for nc in Ncolors: clf = LogisticRegression(class_weight='auto') clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers.append(clf) predictions.append(y_pred) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) print(clf.intercept_) print(clf.raw_coef_) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()])[:, 1] Z = Z.reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto', cmap=plt.cm.binary, zorder=1, extent=xlim + ylim) im.set_clim(0, 2) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) # plot contamination vs Ncolors ax = fig.add_subplot(224) ax.plot(Ncolors, contamination, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) 
ax.yaxis.set_major_locator(plt.MultipleLocator(0.2))
ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i'))
ax.set_xlabel('N colors')
ax.set_ylabel('contamination')
ax.set_xlim(0.5, 4.5)
ax.set_ylim(-0.1, 1.1)
ax.grid(True)

plt.show()
astroML-0.3/book_figures/chapter9/fig_rrlyrae_naivebayes.py0000644000076500000240000001126212420767763025006 0ustar jakevdpstaff00000000000000"""
Gaussian Naive Bayes Classification of photometry
-------------------------------------------------
Figure 9.3

Gaussian naive Bayes classification method used to separate variable
RR Lyrae stars from nonvariable main sequence stars. In the left panel, the
light gray points show nonvariable sources, while the dark points show
variable sources. The classification boundary is shown by the black line,
and the classification probability is shown by the shaded background. In the
right panel, we show the completeness and contamination as a function of the
number of features used in the fit. For the single feature, u - g is used.
For two features, u - g and g - r are used. For three features, u - g,
g - r, and r - i are used. It is evident that the g - r color is the best
discriminator. With all four colors, naive Bayes attains a completeness of
0.876 and a contamination of 0.790.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
from __future__ import print_function

import numpy as np
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB

from astroML.datasets import fetch_rrlyrae_combined
from astroML.utils import split_samples
from astroML.utils import completeness_contamination

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
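# (Recall: Gaussian naive Bayes models each feature within each class as an
#  independent one-dimensional Gaussian, and assigns the class maximizing
#  the posterior  p(class) * prod_j N(x_j | mu_{j,class}, sigma_{j,class}^2).)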
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # perform Naive Bayes classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) order = np.array([1, 0, 2, 3]) for nc in Ncolors: clf = GaussianNB() clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers.append(clf) predictions.append(y_pred) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 81), np.linspace(ylim[0], ylim[1], 71)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto', cmap=plt.cm.binary, zorder=1, extent=xlim + ylim) im.set_clim(0, 1.5) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # Plot completeness vs Ncolors ax = plt.subplot(222) ax.plot(Ncolors, completeness, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) # Plot contamination vs Ncolors ax = plt.subplot(224) ax.plot(Ncolors, contamination, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_qda.py0000644000076500000240000001021012420767763023415 0ustar jakevdpstaff00000000000000""" QDA Classification of photometry -------------------------------- Figure 9.5 The quadratic discriminant boundary for RR Lyrae stars (see caption of figure 9.3 for details). With all four colors, QDA achieves a completeness of 0.788 and a contamination of 0.757. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from matplotlib import colors from sklearn.qda import QDA from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # perform QDA classifiers = [] predictions = [] Ncolors = np.arange(1, X.shape[1] + 1) for nc in Ncolors: clf = QDA() clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers.append(clf) predictions.append(y_pred) predictions = np.array(predictions) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # Compute the decision boundary clf = classifiers[1] xlim = (0.7, 1.35) ylim = (-0.15, 0.4) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[yy.ravel(), xx.ravel()]) Z = Z[:, 1].reshape(xx.shape) #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) im = ax.imshow(Z, origin='lower', aspect='auto', cmap=plt.cm.binary, zorder=1, extent=xlim + ylim) im.set_clim(0, 1.5) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness, 'o-k', c='k', ms=6, label='unweighted') ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) # plot contamination vs Ncolors ax = fig.add_subplot(224) ax.plot(Ncolors, contamination, 'o-k', c='k', ms=6, label='unweighted') ax.xaxis.set_major_locator(plt.MultipleLocator(1)) 
ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_svm.py0000644000076500000240000001022012420767763023456 0ustar jakevdpstaff00000000000000""" SVM Classification of photometry -------------------------------- Figure 9.10 SVM applied to the RR Lyrae data (see caption of figure 9.3 for details). With all four colors, SVM achieves a completeness of 1.0 and a contamination of 0.854. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.svm import SVC from astroML.decorators import pickle_results from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples from astroML.utils import completeness_contamination #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[:, [1, 0, 2, 3]] # rearrange columns for better 1-color results # SVM takes several minutes to run, and is order[N^2] # truncating the dataset can be useful for experimentation. 
#X = X[::5] #y = y[::5] (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr #---------------------------------------------------------------------- # Fit SVM Ncolors = np.arange(1, X.shape[1] + 1) @pickle_results('SVM_rrlyrae.pkl') def compute_SVM(Ncolors): classifiers = [] predictions = [] for nc in Ncolors: # perform support vector classification clf = SVC(kernel='linear', class_weight='auto') clf.fit(X_train[:, :nc], y_train) y_pred = clf.predict(X_test[:, :nc]) classifiers.append(clf) predictions.append(y_pred) return classifiers, predictions classifiers, predictions = compute_SVM(Ncolors) completeness, contamination = completeness_contamination(predictions, y_test) print("completeness", completeness) print("contamination", contamination) #------------------------------------------------------------ # compute the decision boundary clf = classifiers[1] w = clf.coef_[0] a = -w[0] / w[1] yy = np.linspace(-0.1, 0.4) xx = a * yy - clf.intercept_[0] / w[1] #---------------------------------------------------------------------- # plot the results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0, left=0.1, right=0.95, wspace=0.2) # left plot: data and decision boundary ax = fig.add_subplot(121) ax.plot(xx, yy, '-k') im = ax.scatter(X[-N_plot:, 1], X[-N_plot:, 0], c=y[-N_plot:], s=4, lw=0, cmap=plt.cm.binary, zorder=2) im.set_clim(-0.5, 1) ax.set_xlim(0.7, 1.35) ax.set_ylim(-0.15, 0.4) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') # plot completeness vs Ncolors ax = fig.add_subplot(222) ax.plot(Ncolors, completeness, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('completeness') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) # plot contamination vs Ncolors ax = fig.add_subplot(224) ax.plot(Ncolors, contamination, 'o-k', ms=6) ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(0.2)) ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%i')) ax.set_xlabel('N colors') ax.set_ylabel('contamination') ax.set_xlim(0.5, 4.5) ax.set_ylim(-0.1, 1.1) ax.grid(True) plt.show() astroML-0.3/book_figures/chapter9/fig_rrlyrae_treevis.py0000644000076500000240000002110412420767763024335 0ustar jakevdpstaff00000000000000""" Decision Tree for RR Lyrae Classification ----------------------------------------- Figure 9.12 The decision tree for RR Lyrae classification. The numbers in each node are the statistics of the training sample of ~70,000 objects. The cross-validation statistics are shown in the bottom-left corner of the figure. See also figure 9.13. 
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from sklearn.tree import DecisionTreeClassifier from astroML.datasets import fetch_rrlyrae_combined from astroML.utils import split_samples #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) # % sign needs to be escaped if usetex is True import matplotlib if matplotlib.rcParams.get('text.usetex'): pct = r'\%' else: pct = '%' def get_x_position(level, n_levels, xmin=0.01, xmax=1): dx = (xmax - xmin) / (n_levels - 1.) return xmin + level * dx def get_y_position(level, j, ymin=0, ymax=1): n = 2 ** level dy = (ymax - ymin) * 1. / n return ymin + (j + 0.5) * dy def draw_connections(x_positions, y_positions, children, i, ax, linestyle='-k'): for (c, y) in zip(children, y_positions): if c == -1: continue ax.plot(x_positions[i - 1:i + 1], [y, y], linestyle, lw=1) for j in range(0, 2 ** i, 2): if children[j] == -1 or children[j + 1] == -1: continue ax.plot(2 * [x_positions[i - 1]], y_positions[j:j + 2], linestyle, lw=1) def visualize_tree(T, data, classes, labels=None, levels=5, ymin=0, ymax=1, xmin=0, xmax=1, ax=None): # to visualize the tree, we essentially need to re-build it: it doesn't # store the list of points at each node. 
if ax is None: ax = plt.gca() # get tree aspects try: # new versions of sklearn T_children = T.tree_.children except: # old versions of sklearn T_children = np.vstack([T.tree_.children_left, T.tree_.children_right]).T try: # version < 0.14 T_nsamples = T.tree_.n_samples except AttributeError: # version 0.14+ T_nsamples = T.tree_.n_node_samples T_feature = T.tree_.feature T_threshold = T.tree_.threshold x_positions = get_x_position(np.arange(levels + 1), levels) node_list = np.array([0]) new_data_masks = [np.ones(data.shape[0], dtype=bool)] for i in range(levels): y_positions = get_y_position(i, np.arange(2 ** i)) mask = (node_list != -1) # plot the positions of the nodes ax.plot(x_positions[i] * np.ones(2 ** i)[mask], y_positions[mask], 'ok') data_masks = new_data_masks new_data_masks = [] # list node info for j in range(2 ** i): if node_list[j] == -1: new_data_masks += [None, None] continue ind = node_list[j] # get masks of children split_mask = (data[:, T_feature[ind]] < T_threshold[ind]) new_data_masks.append(np.logical_and(data_masks[j], split_mask)) new_data_masks.append(np.logical_and(data_masks[j], ~split_mask)) n_neg = np.sum(classes[data_masks[j]] == 0) n_pos = np.sum(classes[data_masks[j]] == 1) text = "$%i\,/\,%i$" % (n_neg, n_pos) # assure that we're doing this correctly assert (n_neg + n_pos == T_nsamples[ind]) # check if node is a leaf if n_neg == 0: text += "\n" + r"${\rm(RR\,Lyrae)}$" elif n_pos == 0: text += "\n" + r"${\rm non}$-${\rm variable}$" else: text += "\n" + r"${\rm split\ on}$ %s" % labels[T_feature[ind]] if i < 4: fontsize = 8 else: fontsize = 7 ax.text(x_positions[i], y_positions[j], text, ha='center', va='center', fontsize=fontsize, bbox=dict(boxstyle='round', ec='k', fc='w')) # draw lines connecting nodes to parents if i > 0: draw_connections(x_positions, y_positions, node_list, i, ax, '-k') # get next set of nodes node_list = np.concatenate(list(T_children[node_list])) # draw dotted line for last level y_positions = get_y_position(levels, np.arange(2 ** levels)) draw_connections(x_positions, y_positions, node_list, levels, ax, ':k') # set suitable axes limits dx = 0.1 * (xmax - xmin) dy = 0.02 * (xmax - xmin) ax.set_xlim(xmin - dx, xmax + 2 * dx) ax.set_ylim(ymin - dy, ymax + dy) #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.25], random_state=0) N_tot = len(y) N_st = np.sum(y == 0) N_rr = N_tot - N_st N_train = len(y_train) N_test = len(y_test) N_plot = 5000 + N_rr clf = DecisionTreeClassifier(compute_importances=True, random_state=0, criterion='entropy') clf.fit(X_train, y_train) y_out = clf.predict(X_test) eq = (y_out == y_test) #-------------------------------------------------- # compute statistics of cross-validation tot_neg = np.sum(y_test == 0) tot_pos = np.sum(y_test == 1) fn = np.sum((y_test == 1) & (y_out == 0)) tn = np.sum((y_test == 0) & (y_out == 0)) fp = np.sum((y_test == 0) & (y_out == 1)) tp = np.sum((y_test == 1) & (y_out == 1)) print("----------------------------------------------------------------") print("partial training set: (%i non-variable, %i RR Lyrae)" % (np.sum(y_train == 0), np.sum(y_train == 1))) print("positive = RR Lyrae, negative = non-variable") print("false positives: %i (%.1f%%)" % (fp, fp * 100. / (fp + tp))) print("false negatives: %i (%.1f%%)" % (fn, fn * 100. 
/ (fn + tn))) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 7), facecolor='w') ax = fig.add_axes([0, 0, 1, 1], xticks=[], yticks=[], frameon=False) visualize_tree(clf, X_train, y_train, labels=(['$u-g$', '$g-r$', '$r-i$', '$i-z$'])) ax.text(0.12, 0.95, ("Numbers are count of\n" "non-variable / RR Lyrae\n" "in each node"), ha='center', va='center', bbox=dict(boxstyle='round', ec='k', fc='w')) ax.text(-0.08, 0.14, ("Training Set Size:\n" " %i objects" % len(y_train)), ha='left', va='bottom') ax.text(-0.08, 0.01, ("Cross-Validation, with\n" " %i RR Lyraes (positive)\n" " %i non-variables (negative)\n" " false positives: %i (%.1f%s)\n" " false negatives: %i (%.1f%s)" % (tot_pos, tot_neg, fp, fp * 100. / (tp + fp), pct, fn, fn * 100. / (tn + fn), pct)), ha='left', va='bottom') #-------------------------------------------------- # compute statistics for a larger training set clf = DecisionTreeClassifier(compute_importances=True, random_state=0, criterion='entropy') clf.fit(X_train, y_train) y_out = clf.predict(X_test) tot_neg = np.sum(y_test == 0) tot_pos = np.sum(y_test == 1) fn = np.sum((y_test == 1) & (y_out == 0)) tn = np.sum((y_test == 0) & (y_out == 0)) fp = np.sum((y_test == 0) & (y_out == 1)) tp = np.sum((y_test == 1) & (y_out == 1)) print("----------------------------------------------------------------") print("full training set: (%i non-variables, %i RR Lyraes)" % (np.sum(y_train == 0), np.sum(y_train == 1))) print("positive = RR Lyrae, negative = non-variables") print("false positives: %i (%.1f%%)" % (fp, fp * 100. / (fp + tp))) print("false negatives: %i (%.1f%%)" % (fn, fn * 100. / (fn + tn))) plt.show() astroML-0.3/book_figures/chapter9/fig_simple_naivebayes.py0000644000076500000240000000471512252721253024607 0ustar jakevdpstaff00000000000000""" Simple Gaussian Naive Bayes Classification ------------------------------------------ Figure 9.2 A decision boundary computed for a simple data set using Gaussian naive Bayes classification. The line shows the decision boundary, which corresponds to the curve where a new point has equal posterior probability of being part of each class. In such a simple case, it is possible to find a classification with perfect completeness and contamination. This is rarely the case in the real world. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from matplotlib import colors from sklearn.naive_bayes import GaussianNB #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Simulate some data np.random.seed(0) mu1 = [1, 1] cov1 = 0.3 * np.eye(2) mu2 = [5, 3] cov2 = np.eye(2) * np.array([0.4, 0.1]) X = np.concatenate([np.random.multivariate_normal(mu1, cov1, 100), np.random.multivariate_normal(mu2, cov2, 100)]) y = np.zeros(200) y[100:] = 1 #------------------------------------------------------------ # Fit the Naive Bayes classifier clf = GaussianNB() clf.fit(X, y) # predict the classification probabilities on a grid xlim = (-1, 8) ylim = (-1, 5) xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 71), np.linspace(ylim[0], ylim[1], 81)) Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()]) Z = Z[:, 1].reshape(xx.shape) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111) ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.binary, zorder=2) ax.contour(xx, yy, Z, [0.5], colors='k') ax.set_xlim(xlim) ax.set_ylim(ylim) ax.set_xlabel('$x$') ax.set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter9/fig_star_quasar_ROC.py0000644000076500000240000001205212420767763024146 0ustar jakevdpstaff00000000000000""" Star/Quasar Classification ROC Curves ------------------------------------- Figure 9.18 The left panel shows data used in color-based photometric classification of stars and quasars. Stars are indicated by gray points, while quasars are indicated by black points. The right panel shows ROC curves for quasar identification based on u - g , g - r , r - i , and i - z colors. Labels are the same as those in Figure 9.17. """ # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general from __future__ import print_function import numpy as np from matplotlib import pyplot as plt from astroML.utils import split_samples from sklearn.metrics import roc_curve from sklearn.naive_bayes import GaussianNB from sklearn.lda import LDA from sklearn.qda import QDA from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from astroML.classification import GMMBayes #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. 
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Fetch data and split into training and test samples
from astroML.datasets import fetch_dr7_quasar
from astroML.datasets import fetch_sdss_sspp

quasars = fetch_dr7_quasar()
stars = fetch_sdss_sspp()

# Truncate data for speed
quasars = quasars[::5]
stars = stars[::5]

# stack colors into matrix X
Nqso = len(quasars)
Nstars = len(stars)
X = np.empty((Nqso + Nstars, 4), dtype=float)

X[:Nqso, 0] = quasars['mag_u'] - quasars['mag_g']
X[:Nqso, 1] = quasars['mag_g'] - quasars['mag_r']
X[:Nqso, 2] = quasars['mag_r'] - quasars['mag_i']
X[:Nqso, 3] = quasars['mag_i'] - quasars['mag_z']

X[Nqso:, 0] = stars['upsf'] - stars['gpsf']
X[Nqso:, 1] = stars['gpsf'] - stars['rpsf']
X[Nqso:, 2] = stars['rpsf'] - stars['ipsf']
X[Nqso:, 3] = stars['ipsf'] - stars['zpsf']

y = np.zeros(Nqso + Nstars, dtype=int)
y[:Nqso] = 1

# split into training and test sets
(X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.9, 0.1],
                                                     random_state=0)


#------------------------------------------------------------
# Compute fits for all the classifiers
def compute_results(*args):
    names = []
    probs = []

    for classifier, kwargs in args:
        print(classifier.__name__)
        model = classifier(**kwargs)
        # fit on the training split only: X_test is held out for the
        # ROC curves, so fitting on the full (X, y) would leak test data
        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_test)

        names.append(classifier.__name__)
        probs.append(y_prob[:, 1])

    return names, probs

LRclass_weight = dict([(i, np.sum(y_train == i)) for i in (0, 1)])

names, probs = compute_results((GaussianNB, {}),
                               (LDA, {}),
                               (QDA, {}),
                               (LogisticRegression,
                                dict(class_weight=LRclass_weight)),
                               (KNeighborsClassifier,
                                dict(n_neighbors=10)),
                               (DecisionTreeClassifier,
                                dict(random_state=0, max_depth=12,
                                     criterion='entropy')),
                               (GMMBayes, dict(n_components=3, min_covar=1E-5,
                                               covariance_type='full')))

#------------------------------------------------------------
# Plot results
fig = plt.figure(figsize=(5, 2.5))
fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15, top=0.9, wspace=0.25)

# First axis shows the data
ax1 = fig.add_subplot(121)
im = ax1.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=4,
                 linewidths=0, edgecolors='none',
                 cmap=plt.cm.binary)
im.set_clim(-0.5, 1)
ax1.set_xlim(-0.5, 3.0)
ax1.set_ylim(-0.3, 1.4)
ax1.set_xlabel('$u - g$')
ax1.set_ylabel('$g - r$')

labels = dict(GaussianNB='GNB',
              LDA='LDA',
              QDA='QDA',
              KNeighborsClassifier='KNN',
              DecisionTreeClassifier='DT',
              GMMBayes='GMMB',
              LogisticRegression='LR')

# Second axis shows the ROC curves
ax2 = fig.add_subplot(122)
for name, y_prob in zip(names, probs):
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)

    fpr = np.concatenate([[0], fpr])
    tpr = np.concatenate([[0], tpr])

    ax2.plot(fpr, tpr, label=labels[name])

ax2.legend(loc=4)
ax2.set_xlabel('false positive rate')
ax2.set_ylabel('true positive rate')
ax2.set_xlim(0, 0.15)
ax2.set_ylim(0.6, 1.01)
ax2.xaxis.set_major_locator(plt.MaxNLocator(5))

plt.show()
astroML-0.3/book_figures/chapter9/fig_svm_diagram.py0000644000076500000240000000442112252721253023373 0ustar jakevdpstaff00000000000000"""
SVM Diagram
-----------
Figure 9.9

Illustration of SVM.  The region between the dashed lines is the margin, and
the points which the dashed lines touch are called the support vectors.
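For a linear SVM the decision boundary is the line :math:`w \cdot x + b = 0`;
the code below recovers its slope and intercept from the fitted
``clf.coef_`` and ``clf.intercept_``, and draws the margin edges as the
parallel dashed lines passing through the support vector on each side.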
""" # Author: Jake VanderPlas # License: BSD # The figure produced by this code is published in the textbook # "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com # To report a bug or issue, use the following forum: # https://groups.google.com/forum/#!forum/astroml-general import numpy as np from matplotlib import pyplot as plt from sklearn import svm #---------------------------------------------------------------------- # This function adjusts matplotlib settings for a uniform feel in the textbook. # Note that with usetex=True, fonts are rendered with LaTeX. This may # result in an error if LaTeX is not installed on your system. In that case, # you can set usetex to False. from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Create the data np.random.seed(1) N1 = 10 N2 = 10 mu1 = np.array([0, 0]) mu2 = np.array([2.0, 2.0]) Cov1 = np.array([[1, -0.5], [-0.5, 1]]) Cov2 = Cov1 X = np.vstack([np.random.multivariate_normal(mu1, Cov1, N1), np.random.multivariate_normal(mu2, Cov2, N2)]) y = np.hstack([np.zeros(N1), np.ones(N2)]) #------------------------------------------------------------ # Perform an SVM classification clf = svm.SVC(kernel='linear') clf.fit(X, y) xx = np.linspace(-5, 5) w = clf.coef_[0] m = -w[0] / w[1] b = - clf.intercept_[0] / w[1] yy = m * xx + b #------------------------------------------------------------ # find support vectors i1 = np.argmax(np.dot(X[:N1], w)) i2 = N1 + np.argmin(np.dot(X[N1:], w)) db1 = X[i1, 1] - (m * X[i1, 0] + b) db2 = X[i2, 1] - (m * X[i2, 0] + b) #------------------------------------------------------------ # Plot the results fig = plt.figure(figsize=(5, 3.75)) ax = fig.add_subplot(111, aspect='equal') ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=plt.cm.binary) ax.plot(xx, yy, '-k') ax.plot(xx, yy + db1, '--k') ax.plot(xx, yy + db2, '--k') ax.set_ylim(-1.5, 4) ax.set_xlim(-3, 4) ax.set_xlabel('$x$') ax.set_ylabel('$y$') plt.show() astroML-0.3/book_figures/chapter9/README.rst0000644000076500000240000000025012115147567021376 0ustar jakevdpstaff00000000000000Chapter 9: Classification ------------------------- This chapter covers supervised classification of data, using a number of generative and discriminative classifiers. astroML-0.3/book_figures/FOOTER.rst0000644000076500000240000000502212252721253017712 0ustar jakevdpstaff00000000000000 Getting Started/Frequently Asked Questions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There is so much here: where to begin? 1) Getting SDSS and other data, and quick analysis and plotting: - `How do I access SDSS imaging data and plot various color-color diagrams?` **Chapter 1**. - `How do I access an SDSS spectrum and plot it?` **Chapter 1**. - `How do plot data in pixelated sky projections?` **Chapter 1**. - `How can I visualize a four-dimensional data set and its intrinsic correlations?` **Chapter 1**. 2) Basic statistical tools: - `How do I use python to evaluate and plot various statistical distributions, such as Cauchy, Laplace, etc.` **Chapter 3**. - `How do I robustly estimate location and scale parameters of a one-dimensional data set?` **Chapter 3**. - `How do I robustly estimate parameters of a two-dimensional Gaussian?` **Chapter 3**. - `How do I account for selection effects (e.g. luminosity functions)?` **Chapter 4**. - `How do I generate a simulated sample drawn from an arbitrary distribution?` **Chapter 4**. 
- `How do I choose the optimal bin width for a histogram?  Do bins need to be the same size?` **Chapters 4 and 5**.
- `How do I fit y(x) when y has non-Gaussian uncertainties?` **Chapter 8**.
- `How do I fit y(x) when both x and y have non-negligible uncertainties?` **Chapter 8**.

3) Non-trivial data mining and other tools:

- `How do I run PCA on many SDSS spectra?` **Chapter 7**.
- `How do I fit a multi-component Gaussian (or any other function) to my histogram?` **Chapter 5**.
- `How do I decide if I have a "detection"?` **Chapters 4, 5, 8**.
- `How do I fit a multi-component Gaussian (while accounting for errors) to my multi-dimensional data?` **Chapter 6**.
- `How do I justify the use of, for example, a parabola instead of a straight line to fit my data?` **Chapter 5**.
- `How do I use Markov Chain Monte Carlo to fit a complex function to my multi-dimensional data?` **Chapter 5**.
- `How do I estimate the underlying density traced by a finite-size sample of points?` **Chapter 6**.
- `How do I find clusters (over-densities, classes, features) in my data set?` **Chapters 6 and 9**.
- `How do I estimate a light curve period (Lomb-Scargle)?` **Chapter 10**.
- `How do I analyze a non-periodic light curve?` **Chapter 10**.
- `How do I estimate the power spectrum for unevenly sampled data with large heteroscedastic uncertainties?` **Chapter 10**.
- `How do I use detection times for individual photons to estimate an exponential decay time?` **Chapter 10**.
astroML-0.3/book_figures/README.rst0000644000076500000240000000077012252721253017656 0ustar jakevdpstaff00000000000000Textbook Figures
----------------
This section makes available the source code used to generate every figure
in the book `Statistics, Data Mining, and Machine Learning in Astronomy`.
Many of the figures are fairly self-explanatory, though some will be less
so without the book as a reference.

The table of contents of the book can be seen
:download:`here(pdf) <../documents/DMbookTOC.pdf>`.

Figure Contents
~~~~~~~~~~~~~~~
Each chapter links to a page with thumbnails of the figures from the chapter.
astroML-0.3/CHANGES.rst0000644000076500000240000000134112462243562015313 0ustar jakevdpstaff000000000000000.3
===
- Add support for Python 3
- Add continuous integration via Travis
- Bug: correctly account for errors in Ridge/Lasso regression
- Add figure tests in ``compare_images.py``

0.2
===
- Documentation and example updates
- Moved from using ``pyfits`` to using ``astropy.io.fits``
- Fix the prior for the Bayesian Blocks algorithm

0.1.1
=====
*Bug fixes, January 2013*

- Fixed errors in dataset downloaders: they failed on some platforms
- Added citation information to the website
- Updated figures to reflect those submitted for publication
- Performance improvement in ``freedman_bin_width``
- Fix setup issue when sklearn is not installed
- Enhancements to ``devectorize_axes`` function

0.1
===
*Initial release, October 2012*
astroML-0.3/compare_images.py0000644000076500000240000000321412420767763017037 0ustar jakevdpstaff00000000000000"""
Compare Image Tests
-------------------
This script compares all the mis-matching images found when running

    $ nosetests astroML_fig_tests

The result of running this script is an html page comparing each output
file to the baseline result, showing only the ones with a mismatch
above the threshold specified in astroML_fig_tests.
"""
import os

TABLE = """
<html>
<body>
<table border="1">
{rows}
</table>
</body>
</html>
"""

ROW = """
<tr>
<td>{0}</td>
<td><a href="{2}">actual</a></td>
<td><a href="{3}">baseline</a></td>
<td><img src="{1}" width="{width}"></td>
</tr>
"""

baseline = "astroML_fig_tests/baseline/book_figures"
results = "astroML_fig_tests/results/book_figures"

figlist = []

for chapter in os.listdir(results):
    if not os.path.isdir(os.path.join(results, chapter)):
        continue
    for pyfile in os.listdir(os.path.join(results, chapter)):
        if pyfile.endswith('failed-diff.png'):
            root = pyfile.split('-failed-diff')[0]
            figlist.append((os.path.join("book_figures",
                                         chapter, root + ".py"),
                            os.path.join(results, chapter, pyfile),
                            os.path.join(results, chapter, root + '.png'),
                            os.path.join(baseline, chapter, root + '.png')))

outfile = "_compare_images.html"

with open(outfile, 'w') as f:
    f.write(TABLE.format(rows='\n'.join([ROW.format(*figs, width="90%")
                                         for figs in figlist])))

import webbrowser
webbrowser.open_new("file://localhost" + os.path.abspath(outfile))
astroML-0.3/examples/0000755000076500000240000000000012462244012015317 5ustar jakevdpstaff00000000000000astroML-0.3/examples/algorithms/0000755000076500000240000000000012462244012017470 5ustar jakevdpstaff00000000000000astroML-0.3/examples/algorithms/fig_volume_ratio.py0000644000076500000240000000347312420577220023407 0ustar jakevdpstaff00000000000000"""
Curse of Dimensionality: Volume Ratio
-------------------------------------
This figure shows the volume of a side-2 hypercube and of the unit
hypersphere inscribed within it.  The curse of dimensionality is
illustrated by the fact that the ratio of the two volumes approaches
zero as the number of dimensions approaches infinity.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure produced by this code is published in the textbook
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
#   To report a bug or issue, use the following forum:
#    https://groups.google.com/forum/#!forum/astroml-general
import numpy as np
from matplotlib import pyplot as plt
from scipy.special import gammaln

#----------------------------------------------------------------------
# This function adjusts matplotlib settings for a uniform feel in the textbook.
# Note that with usetex=True, fonts are rendered with LaTeX.  This may
# result in an error if LaTeX is not installed on your system.  In that case,
# you can set usetex to False.
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

# start at d = 1: the d = 0 term is degenerate and produces
# log-of-zero warnings
dims = np.arange(1, 51)

# log of volume of a sphere with r = 1
log_V_sphere = (np.log(2) + 0.5 * dims * np.log(np.pi)
                - np.log(dims) - gammaln(0.5 * dims))
log_V_cube = dims * np.log(2)

# compute the log of f_k to avoid overflow errors
log_f_k = log_V_sphere - log_V_cube

fig, ax = plt.subplots(figsize=(5, 3.75))
ax.semilogy(dims, np.exp(log_V_cube), '-k',
            label='side-2 hypercube')
ax.semilogy(dims, np.exp(log_V_sphere), '--k',
            label='inscribed unit hypersphere')
ax.set_xlim(0, 50)
ax.set_ylim(1E-13, 1E15)
ax.set_xlabel('Number of Dimensions')
ax.set_ylabel('Hyper-Volume')
ax.legend(loc=3)

plt.show()
astroML-0.3/examples/algorithms/plot_bayesian_blocks.py0000644000076500000240000000522212115147567024245 0ustar jakevdpstaff00000000000000"""
Bayesian Blocks for Histograms
------------------------------
.. currentmodule:: astroML

Bayesian Blocks is a dynamic histogramming method which optimizes one of
several possible fitness functions to determine an optimal binning for
data, where the bins are not necessarily uniform width.

The astroML implementation is based on [1]_.  For more discussion of this
technique, see the blog post at [2]_.
The code below uses a fitness function suitable for event data with possible repeats. More fitness functions are available: see :mod:`density_estimation` References ~~~~~~~~~~ .. [1] Scargle, J `et al.` (2012) http://adsabs.harvard.edu/abs/2012arXiv1207.5578S .. [2] http://jakevdp.github.com/blog/2012/09/12/dynamic-programming-in-python/ """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from scipy import stats from matplotlib import pyplot as plt from astroML.plotting import hist # draw a set of variables np.random.seed(0) t = np.concatenate([stats.cauchy(-5, 1.8).rvs(500), stats.cauchy(-4, 0.8).rvs(2000), stats.cauchy(-1, 0.3).rvs(500), stats.cauchy(2, 0.8).rvs(1000), stats.cauchy(4, 1.5).rvs(500)]) # truncate values to a reasonable range t = t[(t > -15) & (t < 15)] #------------------------------------------------------------ # First figure: show normal histogram binning fig = plt.figure(figsize=(10, 4)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15) ax1 = fig.add_subplot(121) ax1.hist(t, bins=15, histtype='stepfilled', alpha=0.2, normed=True) ax1.set_xlabel('t') ax1.set_ylabel('P(t)') ax2 = fig.add_subplot(122) ax2.hist(t, bins=200, histtype='stepfilled', alpha=0.2, normed=True) ax2.set_xlabel('t') ax2.set_ylabel('P(t)') #------------------------------------------------------------ # Second & Third figure: Knuth bins & Bayesian Blocks fig = plt.figure(figsize=(10, 4)) fig.subplots_adjust(left=0.1, right=0.95, bottom=0.15) for bins, title, subplot in zip(['knuth', 'blocks'], ["Knuth's rule", 'Bayesian blocks'], [121, 122]): ax = fig.add_subplot(subplot) # plot a standard histogram in the background, with alpha transparency hist(t, bins=200, histtype='stepfilled', alpha=0.2, normed=True, label='standard histogram') # plot an adaptive-width histogram on top hist(t, bins=bins, ax=ax, color='black', histtype='step', normed=True, label=title) ax.legend(prop=dict(size=12)) ax.set_xlabel('t') ax.set_ylabel('P(t)') plt.show() astroML-0.3/examples/algorithms/plot_crossmatch.py0000644000076500000240000000277012252721253023260 0ustar jakevdpstaff00000000000000""" Catalog cross-matching ---------------------- This plots the cross-matched samples between the SDSS imaging data and the SDSS Stripe 82 standard stars. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import os import sys from time import time import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_imaging_sample, fetch_sdss_S82standards from astroML.crossmatch import crossmatch_angular from astroML.plotting import hist # get imaging data image_data = fetch_imaging_sample() imX = np.empty((len(image_data), 2), dtype=np.float64) imX[:, 0] = image_data['ra'] imX[:, 1] = image_data['dec'] # get standard stars standards_data = fetch_sdss_S82standards() stX = np.empty((len(standards_data), 2), dtype=np.float64) stX[:, 0] = standards_data['RA'] stX[:, 1] = standards_data['DEC'] # crossmatch catalogs max_radius = 1. 
/ 3600 # 1 arcsec dist, ind = crossmatch_angular(imX, stX, max_radius) match = ~np.isinf(dist) dist_match = dist[match] dist_match *= 3600 ax = plt.axes() hist(dist_match, bins='knuth', ax=ax, histtype='stepfilled', ec='k', fc='#AAAAAA') ax.set_xlabel('radius of match (arcsec)') ax.set_ylabel('N(r, r+dr)') ax.text(0.95, 0.95, "Total objects: %i\nNumber with match: %i" % (imX.shape[0], np.sum(match)), ha='right', va='top', transform=ax.transAxes) ax.set_xlim(0, 0.2) plt.show() astroML-0.3/examples/algorithms/plot_spectrum_sum_of_norms.py0000644000076500000240000000250112115147567025542 0ustar jakevdpstaff00000000000000""" Linear Sum of Gaussians ----------------------- Fitting a spectrum with a linear sum of gaussians. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com from matplotlib import pyplot as plt from astroML.datasets import fetch_vega_spectrum from astroML.sum_of_norms import sum_of_norms, norm # Fetch the data x, y = fetch_vega_spectrum() # truncate the spectrum mask = (x >= 2000) & (x < 10000) x = x[mask] y = y[mask] for n_gaussians in (10, 50, 100): # compute the best-fit linear combination w_best, rms, locs, widths = sum_of_norms(x, y, n_gaussians, spacing='linear', full_output=True) norms = w_best * norm(x[:, None], locs, widths) # plot the results plt.figure() plt.plot(x, y, '-k', label='input spectrum') ylim = plt.ylim() plt.plot(x, norms, ls='-', c='#FFAAAA') plt.plot(x, norms.sum(1), '-r', label='sum of gaussians') plt.ylim(-0.1 * ylim[1], ylim[1]) plt.legend(loc=0) plt.text(0.97, 0.8, "rms error = %.2g" % rms, ha='right', va='top', transform=plt.gca().transAxes) plt.title("Fit to a Spectrum with a Sum of %i Gaussians" % n_gaussians) plt.show() astroML-0.3/examples/algorithms/README.rst0000644000076500000240000000040312115147567021170 0ustar jakevdpstaff00000000000000Data Processing Algorithms -------------------------- These figures and examples show some of the data processing and algorithmic tools enabled by astroML and other Python packages. For more examples, see the :ref:`figures ` from the textbook. astroML-0.3/examples/datasets/0000755000076500000240000000000012462244012017127 5ustar jakevdpstaff00000000000000astroML-0.3/examples/datasets/compute_sdss_pca.py0000644000076500000240000001100212420767763023047 0ustar jakevdpstaff00000000000000""" Example of downloading and processing SDSS spectra -------------------------------------------------- This is the code used to create the files fetched by the routine :func:`fetch_sdss_corrected_spectra`. Be aware that this routine downloads a large amount of data (~700MB for 4000 spectra) and takes a long time to run (~30 minutes for 4000 spectra). 
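The output is a single ``.npz`` archive.  A minimal sketch of inspecting it
(assuming the default ``spec4000.npz`` name used below)::

    import numpy as np
    data = np.load('spec4000.npz')
    print(data.files)   # spectra, mask, coeff0, coeff1, z, zerr, ...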
""" # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com from __future__ import print_function, division import sys from astroML.py3k_compat import HTTPError import numpy as np from astroML.datasets import fetch_sdss_spectrum from astroML.datasets.tools import query_plate_mjd_fiber, TARGET_GALAXY from astroML.dimensionality import iterative_pca def fetch_and_shift_spectra(n_spectra, outfile, primtarget=TARGET_GALAXY, zlim=(0, 0.7), loglam_start=3.5, loglam_end=3.9, Nlam=1000): """ This function queries CAS for matching spectra, and then downloads them and shifts them to a common redshift binning """ # First query for the list of spectra to download plate, mjd, fiber = query_plate_mjd_fiber(n_spectra, primtarget, zlim[0], zlim[1]) # Set up arrays to hold information gathered from the spectra spec_cln = np.zeros(n_spectra, dtype=np.int32) lineindex_cln = np.zeros(n_spectra, dtype=np.int32) log_NII_Ha = np.zeros(n_spectra, dtype=np.float32) log_OIII_Hb = np.zeros(n_spectra, dtype=np.float32) z = np.zeros(n_spectra, dtype=np.float32) zerr = np.zeros(n_spectra, dtype=np.float32) spectra = np.zeros((n_spectra, Nlam), dtype=np.float32) mask = np.zeros((n_spectra, Nlam), dtype=np.bool) # Calculate new wavelength coefficients new_coeff0 = loglam_start new_coeff1 = (loglam_end - loglam_start) / Nlam # Now download all the needed spectra, and resample to a common # wavelength bin. n_spectra = len(plate) num_skipped = 0 i = 0 while i < n_spectra: sys.stdout.write(' %i / %i spectra\r' % (i + 1, n_spectra)) sys.stdout.flush() try: spec = fetch_sdss_spectrum(plate[i], mjd[i], fiber[i]) except HTTPError: num_skipped += 1 print("%i, %i, %i not found" % (plate[i], mjd[i], fiber[i])) i += 1 continue spec_rebin = spec.restframe().rebin(new_coeff0, new_coeff1, Nlam) if np.all(spec_rebin.spectrum == 0): num_skipped += 1 print("%i, %i, %i is all zero" % (plate[i], mjd[i], fiber[i])) continue spec_cln[i] = spec.spec_cln lineindex_cln[i], (log_NII_Ha[i], log_OIII_Hb[i])\ = spec.lineratio_index() z[i] = spec.z zerr[i] = spec.zerr spectra[i] = spec_rebin.spectrum mask[i] = spec_rebin.compute_mask(0.5, 5) i += 1 sys.stdout.write('\n') N = i print(" %i spectra skipped" % num_skipped) print(" %i spectra processed" % N) print("saving to %s" % outfile) np.savez(outfile, spectra=spectra[:N], mask=mask[:N], coeff0=new_coeff0, coeff1=new_coeff1, spec_cln=spec_cln[:N], lineindex_cln=lineindex_cln[:N], log_NII_Ha=log_NII_Ha[:N], log_OIII_Hb=log_OIII_Hb[:N], z=z[:N], zerr=zerr[:N]) def spec_iterative_pca(outfile, n_ev=10, n_iter=20, norm='L2'): """ This function takes the file outputted above, performs an iterative PCA to fill in the gaps, and appends the results to the same file. """ data_in = np.load(outfile) spectra = data_in['spectra'] mask = data_in['mask'] res = iterative_pca(spectra, mask, n_ev=n_ev, n_iter=n_iter, norm=norm, full_output=True) input_dict = dict([(key, data_in[key]) for key in data_in.files]) # don't save the reconstructed spectrum: this can easily # be recomputed from the other parameters. 
input_dict['mu'] = res[1] input_dict['evecs'] = res[2] input_dict['evals'] = res[3] input_dict['norms'] = res[4] input_dict['coeffs'] = res[5] np.savez(outfile, **input_dict) if __name__ == '__main__': fetch_and_shift_spectra(4000, 'spec4000.npz') spec_iterative_pca('spec4000.npz') astroML-0.3/examples/datasets/plot_corrected_spectra.py0000644000076500000240000000224512115147567024251 0ustar jakevdpstaff00000000000000""" Corrected Spectra ----------------- The script examples/datasets/compute_sdss_pca.py uses an iterative PCA technique to reconstruct masked regions of SDSS spectra. Several of the resulting spectra are shown below. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np import matplotlib.pyplot as plt from astroML.datasets import sdss_corrected_spectra #------------------------------------------------------------ # Fetch the data data = sdss_corrected_spectra.fetch_sdss_corrected_spectra() spectra = sdss_corrected_spectra.reconstruct_spectra(data) lam = sdss_corrected_spectra.compute_wavelengths(data) #------------------------------------------------------------ # Plot several spectra fig = plt.figure(figsize=(8, 8)) fig.subplots_adjust(hspace=0) for i in range(5): ax = fig.add_subplot(511 + i) ax.plot(lam, spectra[i], '-k') if i < 4: ax.xaxis.set_major_formatter(plt.NullFormatter()) else: ax.set_xlabel('wavelength $(\AA)$') ax.yaxis.set_major_formatter(plt.NullFormatter()) ax.set_ylabel('flux') plt.show() astroML-0.3/examples/datasets/plot_dr7_quasar.py0000644000076500000240000000230012115147567022616 0ustar jakevdpstaff00000000000000""" SDSS Data Release 7 Quasar catalog ---------------------------------- This demonstrates how to fetch and visualize the colors from the SDSS DR7 quasar sample. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.plotting import MultiAxes from astroML.datasets import fetch_dr7_quasar data = fetch_dr7_quasar() colors = np.empty((len(data), 5)) colors[:, 0] = data['mag_u'] - data['mag_g'] colors[:, 1] = data['mag_g'] - data['mag_r'] colors[:, 2] = data['mag_r'] - data['mag_i'] colors[:, 3] = data['mag_i'] - data['mag_z'] colors[:, 4] = data['mag_z'] - data['mag_J'] labels = ['u-g', 'g-r', 'r-i', 'i-z', 'z-J'] bins = [np.linspace(-0.4, 1.0, 100), np.linspace(-0.4, 1.0, 100), np.linspace(-0.3, 0.6, 100), np.linspace(-0.4, 0.7, 100), np.linspace(0, 2.2, 100)] ax = MultiAxes(5, wspace=0.05, hspace=0.05, fig=plt.figure(figsize=(10, 10))) ax.density(colors, bins) ax.set_labels(labels) ax.set_locators(plt.MaxNLocator(5)) plt.suptitle('SDSS DR7 Quasar Colors', fontsize=18) plt.show() astroML-0.3/examples/datasets/plot_great_wall.py0000644000076500000240000000206412115147567022676 0ustar jakevdpstaff00000000000000""" SDSS "Great Wall" ----------------- Plotting the SDSS "great wall", a filament of galaxies visible by-eye in the projected locations of the SDSS spectroscopic galaxy sample. This follows a similar procedure to [1]_, References ---------- .. 
[1] http://adsabs.harvard.edu/abs/2008ApJ...674L..13C
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt

from astroML.datasets import fetch_great_wall

#------------------------------------------------------------
# Fetch the great wall data
X = fetch_great_wall()

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(8, 4))

# First plot: scatter the points
ax = plt.subplot(111, aspect='equal')
ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')

ax.set_xlim(-300, 200)
ax.set_ylim(-375, -175)

ax.set_xlabel('y (Mpc)')
ax.set_ylabel('x (Mpc)')

plt.show()
astroML-0.3/examples/datasets/plot_LIGO_spectrum.py0000644000076500000240000000472112115147567023227 0ustar jakevdpstaff00000000000000"""
Plot the power spectrum of LIGO
-------------------------------
This compares the power spectrum computed using the raw FFT, and using
Welch's method (i.e. overlapping window functions that reduce noise).

The top panel shows the raw signal, which is the measurements of the
change in baseline length.  The bottom panel shows the raw and smoothed
power spectrum, used by the LIGO team to characterize the noise of the
detector.  The particular data used here is the injected
`Big Dog `_ event.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt
from scipy import fftpack
from matplotlib import mlab

from astroML.datasets import fetch_LIGO_large

#------------------------------------------------------------
# Fetch the LIGO hanford data
data, dt = fetch_LIGO_large()

# subset of the data to plot
t0 = 646
T = 2
tplot = dt * np.arange(T * 4096)
dplot = data[4096 * t0: 4096 * (t0 + T)]

tplot = tplot[::10]
dplot = dplot[::10]

fmin = 40
fmax = 2060

#------------------------------------------------------------
# compute PSD using simple FFT
N = len(data)
df = 1. / (N * dt)
PSD = abs(dt * fftpack.fft(data)[:N / 2]) ** 2
f = df * np.arange(N / 2)

cutoff = ((f >= fmin) & (f <= fmax))
f = f[cutoff]
PSD = PSD[cutoff]
f = f[::100]
PSD = PSD[::100]

#------------------------------------------------------------
# compute PSD using Welch's method -- hanning window function
PSDW2, fW2 = mlab.psd(data, NFFT=4096, Fs=1.
/ dt, window=mlab.window_hanning, noverlap=2048) dfW2 = fW2[1] - fW2[0] cutoff = (fW2 >= fmin) & (fW2 <= fmax) fW2 = fW2[cutoff] PSDW2 = PSDW2[cutoff] #------------------------------------------------------------ # Plot the data fig = plt.figure() fig.subplots_adjust(bottom=0.1, top=0.9, hspace=0.3) # top panel: time series ax = fig.add_subplot(211) ax.plot(tplot, dplot, '-k') ax.set_xlabel('time (s)') ax.set_ylabel('$h(t)$') ax.set_ylim(-1.2E-18, 1.2E-18) # bottom panel: hanning window ax = fig.add_subplot(212) ax.loglog(f, PSD, '-', c='#AAAAAA') ax.loglog(fW2, PSDW2, '-k') ax.text(0.98, 0.95, "Hanning (cosine) window", ha='right', va='top', transform=ax.transAxes) ax.set_xlabel('frequency (Hz)') ax.set_ylabel(r'$PSD(f)$') ax.set_xlim(40, 2060) ax.set_ylim(1E-46, 1E-36) ax.yaxis.set_major_locator(plt.LogLocator(base=100)) plt.show() astroML-0.3/examples/datasets/plot_moving_objects.py0000644000076500000240000000705712115147567023574 0ustar jakevdpstaff00000000000000""" SDSS Stripe 82 Moving Object Catalog ------------------------------------ This plot demonstrates how to fetch data from the SDSS Moving object catalog, and plot using a multicolor plot similar to that used in figures 3-4 of [1]_ References ~~~~~~~~~~ .. [1] Parker `et al.` 2008 http://adsabs.harvard.edu/abs/2008Icar..198..138P """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_moving_objects from astroML.plotting.tools import devectorize_axes def black_bg_subplot(*args, **kwargs): """Create a subplot with black background""" kwargs['axisbg'] = 'k' ax = plt.subplot(*args, **kwargs) # set ticks and labels to white for spine in ax.spines.values(): spine.set_color('w') for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks(): for child in tick.get_children(): child.set_color('w') return ax def compute_color(mag_a, mag_i, mag_z, a_crit=-0.1): """ Compute the scatter-plot color using code adapted from TCL source used in Parker 2008. 
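    The returned array has shape (n_points, 3).  R is held constant,
    while G and B scale exponentially with the i - z color and the a*
    magnitude respectively, so that the plotted color encodes these two
    photometric quantities.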
""" # define the base color scalings R = np.ones_like(mag_i) G = 0.5 * 10 ** (-2 * (mag_i - mag_z - 0.01)) B = 1.5 * 10 ** (-8 * (mag_a + 0.0)) # enhance green beyond the a_crit cutoff i = np.where(mag_a < a_crit) G[i] += 10000 * (10 ** (-0.01 * (mag_a[i] - a_crit)) - 1) # normalize color of each point to its maximum component RGB = np.vstack([R, G, B]) RGB /= RGB.max(0) # return an array of RGB colors, which is shape (n_points, 3) return RGB.T #------------------------------------------------------------ # Fetch data and extract the desired quantities data = fetch_moving_objects(Parker2008_cuts=True) mag_a = data['mag_a'] mag_i = data['mag_i'] mag_z = data['mag_z'] a = data['aprime'] sini = data['sin_iprime'] # dither: magnitudes are recorded only to +/- 0.01 mag_a += -0.005 + 0.01 * np.random.random(size=mag_a.shape) mag_i += -0.005 + 0.01 * np.random.random(size=mag_i.shape) mag_z += -0.005 + 0.01 * np.random.random(size=mag_z.shape) # compute RGB color based on magnitudes color = compute_color(mag_a, mag_i, mag_z) #------------------------------------------------------------ # set up the plot # plot the color-magnitude plot fig = plt.figure(facecolor='k') ax = black_bg_subplot(111) ax.scatter(mag_a, mag_i - mag_z, c=color, s=1, lw=0) devectorize_axes(ax, dpi=400) ax.plot([0, 0], [-0.8, 0.6], '--w', lw=2) ax.plot([0, 0.4], [-0.15, -0.15], '--w', lw=2) ax.set_xlim(-0.3, 0.4) ax.set_ylim(-0.8, 0.6) ax.set_xlabel('a*', color='w') ax.set_ylabel('i-z', color='w') # plot the orbital parameters plot fig = plt.figure(facecolor='k') ax = black_bg_subplot(111) ax.scatter(a, sini, c=color, s=1, lw=0) devectorize_axes(ax, dpi=400) ax.plot([2.5, 2.5], [-0.02, 0.3], '--w') ax.plot([2.82, 2.82], [-0.02, 0.3], '--w') ax.set_xlim(2.0, 3.3) ax.set_ylim(-0.02, 0.3) ax.set_xlabel('a (AU)', color='w') ax.set_ylabel('sin(i)', color='w') # label the plot text_kwargs = dict(color='w', fontsize=14, transform=plt.gca().transAxes, ha='center', va='bottom') ax.text(0.25, 1.01, 'Inner', **text_kwargs) ax.text(0.53, 1.01, 'Mid', **text_kwargs) ax.text(0.83, 1.01, 'Outer', **text_kwargs) # Saving the black-background figure requires some extra arguments: #fig.savefig('moving_objects.png', # facecolor='black', # edgecolor='none') plt.show() astroML-0.3/examples/datasets/plot_nasa_atlas.py0000644000076500000240000000320412253706403022651 0ustar jakevdpstaff00000000000000""" NASA Sloan Atlas ---------------- This shows some visualizations of the data from the NASA SDSS Atlas """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_nasa_atlas data = fetch_nasa_atlas() #------------------------------------------------------------ # plot the RA/DEC in an area-preserving projection RA = data['RA'] DEC = data['DEC'] # convert coordinates to degrees RA -= 180 RA *= np.pi / 180 DEC *= np.pi / 180 ax = plt.axes(projection='mollweide') plt.scatter(RA, DEC, s=1, c=data['Z'], cmap=plt.cm.copper, edgecolors='none', linewidths=0) plt.grid(True) plt.title('NASA Atlas Galaxy Locations') cb = plt.colorbar(cax=plt.axes([0.05, 0.1, 0.9, 0.05]), orientation='horizontal', ticks=np.linspace(0, 0.05, 6)) cb.set_label('redshift') #------------------------------------------------------------ # plot the r vs u-r color-magnitude diagram absmag = data['ABSMAG'] u = absmag[:, 2] r = absmag[:, 4] plt.figure() ax = plt.axes() plt.scatter(u - r, r, s=1, lw=0, c=data['Z'], cmap=plt.cm.copper) 
plt.colorbar(ticks=np.linspace(0, 0.05, 6)).set_label('redshift') plt.xlim(0, 3.5) plt.ylim(-10, -24) plt.xlabel('u-r') plt.ylabel('r') #------------------------------------------------------------ # plot a histogram of the redshift from astroML.plotting import hist plt.figure() hist(data['Z'], bins='knuth', histtype='stepfilled', ec='k', fc='#F5CCB0') plt.xlabel('z') plt.ylabel('N(z)') plt.show() astroML-0.3/examples/datasets/plot_rrlyrae_mags.py0000644000076500000240000000206112115147567023241 0ustar jakevdpstaff00000000000000""" RR-Lyrae Magnitudes ------------------- This example downloads and plots the colors of RR Lyrae stars along with those of the non-variable stars. Several of the classification examples in the book figures use this dataset. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_rrlyrae_combined #---------------------------------------------------------------------- # get data and split into training & testing sets X, y = fetch_rrlyrae_combined() X = X[-5000:] y = y[-5000:] stars = (y == 0) rrlyrae = (y == 1) #------------------------------------------------------------ # plot the results ax = plt.axes() ax.plot(X[stars, 0], X[stars, 1], '.', ms=5, c='b', label='stars') ax.plot(X[rrlyrae, 0], X[rrlyrae, 1], '.', ms=5, c='r', label='RR-Lyrae') ax.legend(loc=3) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') ax.set_xlim(0.7, 1.4) ax.set_ylim(-0.2, 0.4) plt.show() astroML-0.3/examples/datasets/plot_sdss_filters.py0000644000076500000240000000252612115147567023264 0ustar jakevdpstaff00000000000000""" SDSS Filters ------------ Download and plot the five SDSS filter bands along with a Vega spectrum. This data is available on the SDSS website (filters) and on the STSci website (Vega). """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_filter, fetch_vega_spectrum #------------------------------------------------------------ # Set up figure and axes fig = plt.figure() ax = fig.add_subplot(111) #---------------------------------------------------------------------- # Fetch and plot the Vega spectrum spec = fetch_vega_spectrum() lam = spec[0] spectrum = spec[1] / 2.1 / spec[1].max() ax.plot(lam, spectrum, '-k', lw=2) #------------------------------------------------------------ # Fetch and plot the five filters text_kwargs = dict(fontsize=20, ha='center', va='center', alpha=0.5) for f, c, loc in zip('ugriz', 'bgrmk', [3500, 4600, 6100, 7500, 8800]): data = fetch_sdss_filter(f) ax.fill(data[0], data[1], ec=c, fc=c, alpha=0.4) ax.text(loc, 0.02, f, color=c, **text_kwargs) ax.set_xlim(3000, 11000) ax.set_title('SDSS Filters and Reference Spectrum') ax.set_xlabel('Wavelength (Angstroms)') ax.set_ylabel('normalized flux / filter transmission') plt.show() astroML-0.3/examples/datasets/plot_sdss_galaxy_colors.py0000644000076500000240000000242412272525540024452 0ustar jakevdpstaff00000000000000""" SDSS Galaxy Colors ------------------ The function :func:`fetch_sdss_galaxy_colors` used below actually queries the SDSS CASjobs server for the colors of the 50,000 galaxies. 
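As with the other dataset loaders, the result is cached locally (by
default under ``~/astroML_data``), so the query is executed only on the
first call.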
Below we extract the :math:`u - g` and :math:`g - r` colors for 5000 stars, and scatter-plot the results """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from sklearn.neighbors import KNeighborsRegressor from astroML.datasets import fetch_sdss_galaxy_colors #------------------------------------------------------------ # Download data data = fetch_sdss_galaxy_colors() data = data[::10] # truncate for plotting # Extract colors and spectral class ug = data['u'] - data['g'] gr = data['g'] - data['r'] spec_class = data['specClass'] stars = (spec_class == 2) qsos = (spec_class == 3) #------------------------------------------------------------ # Prepare plot fig = plt.figure() ax = fig.add_subplot(111) ax.set_xlim(-0.5, 2.5) ax.set_ylim(-0.5, 1.5) ax.plot(ug[stars], gr[stars], '.', ms=4, c='b', label='stars') ax.plot(ug[qsos], gr[qsos], '.', ms=4, c='r', label='qsos') ax.legend(loc=2) ax.set_xlabel('$u-g$') ax.set_ylabel('$g-r$') plt.show() astroML-0.3/examples/datasets/plot_sdss_imaging.py0000644000076500000240000000350312115147567023223 0ustar jakevdpstaff00000000000000""" SDSS Imaging ============ This example shows how to load the magnitude data from the SDSS imaging catalog, and plot colors and magnitudes of the stars and galaxies. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_imaging_sample #------------------------------------------------------------ # Get the star/galaxy data data = fetch_imaging_sample() objtype = data['type'] stars = data[objtype == 6][:5000] galaxies = data[objtype == 3][:5000] #------------------------------------------------------------ # Plot the stars and galaxies plot_kwargs = dict(color='k', linestyle='none', marker='.', markersize=1) fig = plt.figure() ax1 = fig.add_subplot(221) ax1.plot(galaxies['gRaw'] - galaxies['rRaw'], galaxies['rRaw'], **plot_kwargs) ax2 = fig.add_subplot(223, sharex=ax1) ax2.plot(galaxies['gRaw'] - galaxies['rRaw'], galaxies['rRaw'] - galaxies['iRaw'], **plot_kwargs) ax3 = fig.add_subplot(222, sharey=ax1) ax3.plot(stars['gRaw'] - stars['rRaw'], stars['rRaw'], **plot_kwargs) ax4 = fig.add_subplot(224, sharex=ax3, sharey=ax2) ax4.plot(stars['gRaw'] - stars['rRaw'], stars['rRaw'] - stars['iRaw'], **plot_kwargs) # set labels and titles ax1.set_ylabel('$r$') ax2.set_ylabel('$r-i$') ax2.set_xlabel('$g-r$') ax4.set_xlabel('$g-r$') ax1.set_title('Galaxies') ax3.set_title('Stars') # set axis limits ax2.set_xlim(-0.5, 3) ax3.set_ylim(22.5, 14) ax4.set_xlim(-0.5, 3) ax4.set_ylim(-1, 2) # adjust tick spacings on all axes for ax in (ax1, ax2, ax3, ax4): ax.xaxis.set_major_locator(plt.MultipleLocator(1)) ax.yaxis.set_major_locator(plt.MultipleLocator(1)) plt.show() astroML-0.3/examples/datasets/plot_sdss_line_ratios.py0000644000076500000240000000300012115147567024110 0ustar jakevdpstaff00000000000000""" SDSS Line-ratio Diagrams ------------------------ This shows how to plot line-ratio diagrams for the SDSS spectra. These diagrams are often called BPT plots [1]_, Osterbrock diagrams [2]_, or Kewley diagrams [3]_. The location of the dividing line is taken from from Kewley et al 2001. References ~~~~~~~~~~ .. [1] Baldwin, J. A.; Phillips, M. M.; Terlevich, R. (1981) http://adsabs.harvard.edu/abs/1981PASP...93....5B .. [2] Osterbrock, D. E.; De Robertis, M. M. 
(1985) http://adsabs.harvard.edu/abs/1985PASP...97.1129O .. [3] Kewley, L. J. `et al.` (2001) http://adsabs.harvard.edu/abs/2001ApJ...556..121K """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_corrected_spectra from astroML.datasets.tools.sdss_fits import log_OIII_Hb_NII data = fetch_sdss_corrected_spectra() i = np.where((data['lineindex_cln'] == 4) | (data['lineindex_cln'] == 5)) plt.scatter(data['log_NII_Ha'][i], data['log_OIII_Hb'][i], c=data['lineindex_cln'][i], s=9, lw=0) NII = np.linspace(-2.0, 0.35) plt.plot(NII, log_OIII_Hb_NII(NII), '-k') plt.plot(NII, log_OIII_Hb_NII(NII, 0.1), '--k') plt.plot(NII, log_OIII_Hb_NII(NII, -0.1), '--k') plt.xlim(-2.0, 1.0) plt.ylim(-1.2, 1.5) plt.xlabel(r'$\mathrm{log([NII]/H\alpha)}$', fontsize='large') plt.ylabel(r'$\mathrm{log([OIII]/H\beta)}$', fontsize='large') plt.show() astroML-0.3/examples/datasets/plot_sdss_S82standards.py0000644000076500000240000000431512115147567024072 0ustar jakevdpstaff00000000000000""" SDSS Standard Star catalog -------------------------- This demonstrates how to fetch and plot the colors of the SDSS Stripe 82 standard stars, both alone and with the cross-matched 2MASS colors. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_S82standards from astroML.plotting import MultiAxes #------------------------------------------------------------ # Plot SDSS data alone data = fetch_sdss_S82standards() colors = np.zeros((len(data), 4)) colors[:, 0] = data['mmu_u'] - data['mmu_g'] colors[:, 1] = data['mmu_g'] - data['mmu_r'] colors[:, 2] = data['mmu_r'] - data['mmu_i'] colors[:, 3] = data['mmu_i'] - data['mmu_z'] labels = ['u-g', 'g-r', 'r-i', 'i-z'] bins = [np.linspace(0.0, 3.5, 100), np.linspace(0, 2, 100), np.linspace(-0.2, 1.8, 100), np.linspace(-0.2, 1.0, 100)] fig = plt.figure(figsize=(10, 10)) ax = MultiAxes(4, hspace=0.05, wspace=0.05, fig=fig) ax.density(colors, bins=bins) ax.set_labels(labels) ax.set_locators(plt.MaxNLocator(5)) plt.suptitle('SDSS magnitudes') #------------------------------------------------------------ # Plot datacross-matched with 2MASS data = fetch_sdss_S82standards(crossmatch_2mass=True) colors = np.zeros((len(data), 7)) colors[:, 0] = data['mmu_u'] - data['mmu_g'] colors[:, 1] = data['mmu_g'] - data['mmu_r'] colors[:, 2] = data['mmu_r'] - data['mmu_i'] colors[:, 3] = data['mmu_i'] - data['mmu_z'] colors[:, 4] = data['mmu_z'] - data['J'] colors[:, 5] = data['J'] - data['H'] colors[:, 6] = data['H'] - data['K'] labels = ['u-g', 'g-r', 'r-i', 'i-z', 'z-J', 'J-H', 'H-K'] bins = [np.linspace(0.0, 3.5, 100), np.linspace(0, 2, 100), np.linspace(-0.2, 1.8, 100), np.linspace(-0.2, 1.0, 100), np.linspace(0.5, 2.0, 100), np.linspace(0.0, 1.0, 100), np.linspace(-0.4, 0.8, 100)] fig = plt.figure(figsize=(10, 10)) ax = MultiAxes(7, hspace=0.05, wspace=0.05, fig=fig) ax.density(colors, bins=bins) ax.set_labels(labels) ax.set_locators(plt.MaxNLocator(5)) fig.suptitle('SDSS+2MASS magnitudes') plt.show() astroML-0.3/examples/datasets/plot_sdss_specgals.py0000644000076500000240000000342412115147567023413 0ustar jakevdpstaff00000000000000""" SDSS Spectroscopic Galaxy Sample -------------------------------- This figure shows photometric colors of the SDSS spectroscopic galaxy sample. 
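Three views of the sample are shown: the positions on the sky in an
area-preserving Mollweide projection, the r vs. u-r color-magnitude
diagram, and a histogram of the redshift distribution.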
""" # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_specgals data = fetch_sdss_specgals() #------------------------------------------------------------ # plot the RA/DEC in an area-preserving projection RA = data['ra'] DEC = data['dec'] # convert coordinates to degrees RA -= 180 RA *= np.pi / 180 DEC *= np.pi / 180 ax = plt.axes(projection='mollweide') ax = plt.axes() ax.grid() plt.scatter(RA, DEC, s=1, lw=0, c=data['z'], cmap=plt.cm.copper, vmin=0, vmax=0.4) plt.title('SDSS DR8 Spectroscopic Galaxies') cb = plt.colorbar(cax=plt.axes([0.05, 0.1, 0.9, 0.05]), orientation='horizontal', ticks=np.linspace(0, 0.4, 9)) cb.set_label('redshift') #------------------------------------------------------------ # plot the r vs u-r color-magnitude diagram u = data['modelMag_u'] r = data['modelMag_r'] rPetro = data['petroMag_r'] plt.figure() ax = plt.axes() plt.scatter(u - r, rPetro, s=1, lw=0, c=data['z'], cmap=plt.cm.copper, vmin=0, vmax=0.4) plt.colorbar(ticks=np.linspace(0, 0.4, 9)).set_label('redshift') plt.xlim(0.5, 5.5) plt.ylim(18, 12.5) plt.xlabel('u-r') plt.ylabel('rPetrosian') #------------------------------------------------------------ # plot a histogram of the redshift from astroML.plotting import hist plt.figure() hist(data['z'], bins='knuth', histtype='stepfilled', ec='k', fc='#F5CCB0') plt.xlim(0, 0.4) plt.xlabel('z (redshift)') plt.ylabel('dN/dz(z)') plt.show() astroML-0.3/examples/datasets/plot_sdss_spectrum.py0000644000076500000240000000233712115147567023456 0ustar jakevdpstaff00000000000000""" SDSS Spectrum Example --------------------- This example shows how to fetch and plot a spectrum from the SDSS database using the plate, MJD, and fiber numbers. The code below sends a query to the SDSS server for the given plate, fiber, and mjd, downloads the spectrum, and plots the result. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com from matplotlib import pyplot as plt from astroML.datasets import fetch_sdss_spectrum #------------------------------------------------------------ # Fetch single spectrum plate = 1615 mjd = 53166 fiber = 513 spec = fetch_sdss_spectrum(plate, mjd, fiber) #------------------------------------------------------------ # Plot the resulting spectrum ax = plt.axes() ax.plot(spec.wavelength(), spec.spectrum, '-k', label='spectrum') ax.plot(spec.wavelength(), spec.error, '-', color='gray', label='error') ax.legend(loc=4) ax.set_title('Plate = %(plate)i, MJD = %(mjd)i, Fiber = %(fiber)i' % locals()) ax.text(0.05, 0.95, 'z = %.2f' % spec.z, size=16, ha='left', va='top', transform=ax.transAxes) ax.set_xlabel(r'$\lambda (\AA)$') ax.set_ylabel('Flux') ax.set_ylim(-10, 300) plt.show() astroML-0.3/examples/datasets/plot_SDSS_SSPP.py0000644000076500000240000000575112115147567022204 0ustar jakevdpstaff00000000000000""" Stellar Parameters Hess Diagram ------------------------------- This example shows how to create Hess diagrams of the Segue Stellar Parameters Pipeline (SSPP) data to show multiple features on a single plot. The left panel shows the density of the points on the plot. The right panel shows the average metallicity in each pixel, with contours reflecting the density shown in the left plot. 
""" # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt #------------------------------------------------------------ # Get SDSS SSPP data from astroML.datasets import fetch_sdss_sspp data = fetch_sdss_sspp() # do some reasonable magnitude cuts rpsf = data['rpsf'] data = data[(rpsf > 15) & (rpsf < 19)] # get the desired data logg = data['logg'] Teff = data['Teff'] FeH = data['FeH'] #------------------------------------------------------------ # Plot the results using the binned_statistic function from astroML.stats import binned_statistic_2d N, xedges, yedges = binned_statistic_2d(Teff, logg, FeH, 'count', bins=100) FeH_mean, xedges, yedges = binned_statistic_2d(Teff, logg, FeH, 'mean', bins=100) # Define custom colormaps: Set pixels with no sources to white cmap = plt.cm.jet cmap.set_bad('w', 1.) cmap_multicolor = plt.cm.jet cmap_multicolor.set_bad('w', 1.) # Create figure and subplots fig = plt.figure(figsize=(8, 4)) fig.subplots_adjust(wspace=0.25, left=0.1, right=0.95, bottom=0.07, top=0.95) #-------------------- # First axes: plt.subplot(121, xticks=[4000, 5000, 6000, 7000, 8000]) plt.imshow(np.log10(N.T), origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], aspect='auto', interpolation='nearest', cmap=cmap) plt.xlim(xedges[-1], xedges[0]) plt.ylim(yedges[-1], yedges[0]) plt.xlabel(r'$\mathrm{T_{eff}}$') plt.ylabel(r'$\mathrm{log(g)}$') cb = plt.colorbar(ticks=[0, 1, 2, 3], format=r'$10^{%i}$', orientation='horizontal') cb.set_label(r'$\mathrm{number\ in\ pixel}$') plt.clim(0, 3) #-------------------- # Third axes: plt.subplot(122, xticks=[4000, 5000, 6000, 7000, 8000]) plt.imshow(FeH_mean.T, origin='lower', extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], aspect='auto', interpolation='nearest', cmap=cmap_multicolor) plt.xlim(xedges[-1], xedges[0]) plt.ylim(yedges[-1], yedges[0]) plt.xlabel(r'$\mathrm{T_{eff}}$') plt.ylabel(r'$\mathrm{log(g)}$') cb = plt.colorbar(ticks=np.arange(-2.5, 1, 0.5), format=r'$%.1f$', orientation='horizontal') cb.set_label(r'$\mathrm{mean\ [Fe/H]\ in\ pixel}$') plt.clim(-2.5, 0.5) # Draw density contours over the colors levels = np.linspace(0, np.log10(N.max()), 7)[2:] plt.contour(np.log10(N.T), levels, colors='k', linewidths=1, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]]) plt.show() astroML-0.3/examples/datasets/plot_wmap_power_spectra.py0000644000076500000240000000405412115147567024457 0ustar jakevdpstaff00000000000000""" WMAP power spectrum analysis with HealPy ---------------------------------------- This demonstrates how to plot and take a power spectrum of the WMAP data using healpy, the python wrapper for healpix. Healpy is available for download at the `github site `_ """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt # warning: due to a bug in healpy, importing it before pylab can cause # a segmentation fault in some circumstances. 
import healpy as hp from astroML.datasets import fetch_wmap_temperatures #------------------------------------------------------------ # Fetch the data wmap_unmasked = fetch_wmap_temperatures(masked=False) wmap_masked = fetch_wmap_temperatures(masked=True) white_noise = np.ma.asarray(np.random.normal(0, 0.062, wmap_masked.shape)) #------------------------------------------------------------ # plot the unmasked map fig = plt.figure(1) hp.mollview(wmap_unmasked, min=-1, max=1, title='Unmasked map', fig=1, unit=r'$\Delta$T (mK)') #------------------------------------------------------------ # plot the masked map # filled() fills the masked regions with a null value. fig = plt.figure(2) hp.mollview(wmap_masked.filled(), title='Masked map', fig=2, unit=r'$\Delta$T (mK)') #------------------------------------------------------------ # compute and plot the power spectrum cl = hp.anafast(wmap_masked.filled(), lmax=1024) ell = np.arange(len(cl)) cl_white = hp.anafast(white_noise, lmax=1024) fig = plt.figure(3) ax = fig.add_subplot(111) ax.scatter(ell, ell * (ell + 1) * cl, s=4, c='black', lw=0, label='data') ax.scatter(ell, ell * (ell + 1) * cl_white, s=4, c='gray', lw=0, label='white noise') ax.set_xlabel(r'$\ell$') ax.set_ylabel(r'$\ell(\ell+1)C_\ell$') ax.set_title('Angular Power (not mask corrected)') ax.legend(loc='upper right') ax.grid() ax.set_xlim(0, 1100) plt.show() astroML-0.3/examples/datasets/plot_wmap_raw.py0000644000076500000240000000206312115147567022371 0ustar jakevdpstaff00000000000000""" WMAP plotting with HEALPix -------------------------- This example uses the :func:`astromL.datasets.fetch_wmap_temperatures` functionality to download and plot the raw WMAP 7-year data. The visualization requires the `healpy `_ package to be installed. """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import numpy as np from matplotlib import pyplot as plt # warning: due to a bug in healpy, importing it before pylab can cause # a segmentation fault in some circumstances. import healpy as hp from astroML.datasets import fetch_wmap_temperatures #------------------------------------------------------------ # Fetch the wmap data wmap_unmasked = fetch_wmap_temperatures(masked=False) #------------------------------------------------------------ # plot the unmasked map fig = plt.figure(1) hp.mollview(wmap_unmasked, min=-1, max=1, title='Raw WMAP data', fig=1, cmap=plt.cm.jet, unit=r'$\Delta$T (mK)') plt.show() astroML-0.3/examples/datasets/README.rst0000644000076500000240000000135112115147567020632 0ustar jakevdpstaff00000000000000Data set Examples ----------------- These plots show some of the data set loaders available in astroML, and some of the ways that astronomical data can be visualized and processed using open source python tools. The dataset loaders are in the submodule :mod:`astroML.datasets`, and start with the word ``fetch_``. The first time a dataset loader is called, it will attempt to download the dataset from the web and store it locally on disk. The default location is ``~/astroML_data``, but this location can be changed by specifying an alternative directory in the ``ASTROML_DATA`` environment variable. On subsequent calls, the cached version of the data is used. For more examples, see the :ref:`figures ` from the textbook. 
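For example, a minimal session using one of the loaders from this section
(the plate/MJD/fiber values match ``plot_sdss_spectrum.py`` above)::

    from astroML.datasets import fetch_sdss_spectrum

    spec = fetch_sdss_spectrum(1615, 53166, 513)  # downloaded once, then cached
    print(spec.z)  # redshift of the object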
astroML-0.3/examples/learning/0000755000076500000240000000000012462244012017116 5ustar jakevdpstaff00000000000000astroML-0.3/examples/learning/plot_neighbors_photoz.py0000644000076500000240000000410312420767763024130 0ustar jakevdpstaff00000000000000"""
K-Neighbors for Photometric Redshifts
-------------------------------------

Estimate redshifts from the colors of SDSS galaxies and quasars.
This uses colors from a sample of 50,000 objects with SDSS photometry
and ugriz magnitudes.  The example shows how far one can get with an
extremely simple machine learning approach to the photometric redshift
problem.

The function :func:`fetch_sdss_galaxy_colors` used below actually queries
the SDSS CASjobs server for the colors of the 50,000 galaxies.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
from __future__ import print_function, division

import numpy as np
from matplotlib import pyplot as plt

from sklearn.neighbors import KNeighborsRegressor

from astroML.datasets import fetch_sdss_galaxy_colors

n_neighbors = 1

data = fetch_sdss_galaxy_colors()

N = len(data)

# shuffle data
np.random.seed(0)
np.random.shuffle(data)

# put colors in a matrix
X = np.zeros((N, 4))
X[:, 0] = data['u'] - data['g']
X[:, 1] = data['g'] - data['r']
X[:, 2] = data['r'] - data['i']
X[:, 3] = data['i'] - data['z']
z = data['redshift']

# divide into training and testing data
Ntrain = N // 2
Xtrain = X[:Ntrain]
ztrain = z[:Ntrain]

Xtest = X[Ntrain:]
ztest = z[Ntrain:]

knn = KNeighborsRegressor(n_neighbors, weights='uniform')
zpred = knn.fit(Xtrain, ztrain).predict(Xtest)

axis_lim = np.array([-0.1, 2.5])

rms = np.sqrt(np.mean((ztest - zpred) ** 2))
print("RMS error = %.2g" % rms)

ax = plt.axes()
plt.scatter(ztest, zpred, c='k', lw=0, s=4)
plt.plot(axis_lim, axis_lim, '--k')
plt.plot(axis_lim, axis_lim + rms, ':k')
plt.plot(axis_lim, axis_lim - rms, ':k')
plt.xlim(axis_lim)
plt.ylim(axis_lim)

plt.text(0.99, 0.02, "RMS error = %.2g" % rms,
         ha='right', va='bottom', transform=ax.transAxes,
         bbox=dict(ec='w', fc='w'), fontsize=16)

plt.title('Photo-z: Nearest Neighbor Regression')
plt.xlabel(r'$\mathrm{z_{spec}}$', fontsize=14)
plt.ylabel(r'$\mathrm{z_{phot}}$', fontsize=14)
plt.show()
astroML-0.3/examples/learning/README.rst0000644000076500000240000000036212115147567020612 0ustar jakevdpstaff00000000000000Machine Learning and Data Modeling
----------------------------------
These scripts show some of the machine learning and data modeling tools
available in astroML.

For more examples, see the :ref:`figures ` from the textbook.
astroML-0.3/examples/README.rst0000644000076500000240000000036612252721253017017 0ustar jakevdpstaff00000000000000General astroML Examples
------------------------
This section contains several example plots which do not appear in the
textbook.  Currently there are only a few examples here: for more, please
see the :ref:`text book figures `.
astroML-0.3/LICENSE.rst0000644000076500000240000000242512252721253015324 0ustar jakevdpstaff00000000000000Copyright (c) 2012-2013, Jacob Vanderplas
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
astroML-0.3/paper_figures/CIDU2012/fig_great_wall_MST.py
"""
Euclidean Minimum Spanning Tree
-------------------------------
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt
from scipy.interpolate import interp1d

from sklearn.neighbors import kneighbors_graph

try:
    from scipy.sparse.csgraph import minimum_spanning_tree
except ImportError:
    raise ValueError("scipy v0.11 or greater required "
                     "for minimum spanning tree")

from astroML.datasets import fetch_great_wall
from astroML.cosmology import Cosmology

#------------------------------------------------------------
# get data
X = fetch_great_wall()

xmin, xmax = (-375, -175)
ymin, ymax = (-300, 200)

#------------------------------------------------------------
# generate a sparse graph using the k nearest neighbors of each point
G = kneighbors_graph(X, n_neighbors=10, mode='distance')

#------------------------------------------------------------
# Compute the minimum spanning tree of this graph
T = minimum_spanning_tree(G, overwrite=True)

#------------------------------------------------------------
# Get the x, y coordinates of the beginning and end of each line segment
T = T.tocoo()

dist = T.data
p1 = T.row
p2 = T.col

A = X[p1].T
B = X[p2].T
x_coords = np.vstack([A[0], B[0]])
y_coords = np.vstack([A[1], B[1]])

#----------------------------------------------------------------------
# Plot the results
fig = plt.figure()
fig.subplots_adjust(hspace=0, left=0.1, right=0.95, bottom=0.1, top=0.9)

ax = fig.add_subplot(211, aspect='equal')
ax.scatter(X[:, 1], X[:, 0], s=1, lw=0, c='k')
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.set_ylabel('x (Mpc)')

ax = fig.add_subplot(212, aspect='equal')
ax.plot(y_coords, x_coords, c='k', lw=1)
ax.set_xlim(ymin, ymax)
ax.set_ylim(xmin, xmax)
ax.set_xlabel('y (Mpc)')
ax.set_ylabel('x (Mpc)')
plt.show()
astroML-0.3/paper_figures/CIDU2012/fig_LS_sg_comparison.py
"""
Generalized vs Standard Lomb-Scargle
------------------------------------
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
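# The "standard" Lomb-Scargle periodogram assumes the data have been
# pre-centered about their true mean; the "generalized" form instead fits a
# floating mean together with the harmonic terms (Zechmeister & Kurster
# 2009).  The data below are generated with a large offset
# (y = 10 + sin(...)), which is the regime where the two estimates differ.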
import numpy as np
from matplotlib import pyplot as plt

from astroML.time_series import lomb_scargle

#------------------------------------------------------------
# Generate data where y is positive
np.random.seed(0)
N = 30
P = 0.3

t = P / 2 * np.random.random(N) + P * np.random.randint(100, size=N)
y = 10 + np.sin(2 * np.pi * t / P)
dy = 0.5 + 0.5 * np.random.random(N)
y_obs = y + np.random.normal(0, dy)

omega_0 = 2 * np.pi / P

#------------------------------------------------------------
# Compute the Lomb-Scargle Periodogram
sig = np.array([0.1, 0.01, 0.001])
omega = np.linspace(17, 22, 1000)
P_S = lomb_scargle(t, y_obs, dy, omega, generalized=False)
P_G, z = lomb_scargle(t, y_obs, dy, omega, generalized=True,
                      significance=sig)

#------------------------------------------------------------
# Plot the results
fig = plt.figure()

# First panel: input data
ax = fig.add_subplot(211)
ax.errorbar(t, y_obs, dy, fmt='.k', lw=1, ecolor='gray')
ax.plot([-2, 32], [10, 10], ':k', lw=1)
ax.set_xlim(-2, 32)
ax.set_xlabel('$t$')
ax.set_ylabel('$y(t)$')

# Second panel: periodogram
ax = fig.add_subplot(212)
ax.plot(omega, P_S, '--k', lw=1, label='standard')
ax.plot(omega, P_G, '-k', lw=1, label='generalized')
ax.legend(loc=2, prop=dict(size=14))

# plot the significance lines.
xlim = (omega[0], omega[-1])
for zi, pi in zip(z, sig):
    ax.plot(xlim, (zi, zi), ':k', lw=1)
    ax.text(xlim[-1] - 0.001, zi - 0.02, "$%.1g$" % pi,
            ha='right', va='top')

ax.set_xlabel(r'$\omega$')
ax.set_ylabel(r'$P_{LS}(\omega)$')
ax.set_ylim(0, 1.1)
plt.show()
astroML-0.3/paper_figures/CIDU2012/fig_rbf_ridge_mu_z.py
"""
Regularized Regression Example
------------------------------
This performs regularized regression on a gaussian basis function model.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import lognorm

from sklearn.linear_model import LinearRegression, Ridge, Lasso

from astroML.cosmology import Cosmology
from astroML.datasets import generate_mu_z
from astroML.density_estimation import FunctionDistribution

#----------------------------------------------------------------------
# generate data
np.random.seed(0)

z_sample, mu_sample, dmu = generate_mu_z(100, random_state=0)

cosmo = Cosmology()

z = np.linspace(0.01, 2, 1000)
mu = np.asarray(list(map(cosmo.mu, z)))

#------------------------------------------------------------
# Manually convert data to a gaussian basis
# note that we're ignoring errors here, for the sake of example.
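# Each basis function is a Gaussian bump centered at mu_j with width sigma:
#     phi_j(x) = exp(-0.5 * ((x - mu_j) / sigma) ** 2)
# so each regression below fits mu(z) as a weighted sum of 100 such bumps:
#     mu(z) ~ intercept + sum_j theta_j * phi_j(z)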
def gaussian_basis(x, mu, sigma):
    return np.exp(-0.5 * ((x - mu) / sigma) ** 2)

centers = np.linspace(0, 1.8, 100)
widths = 0.2
X = gaussian_basis(z_sample[:, np.newaxis], centers, widths)

#------------------------------------------------------------
# Set up the figure to plot the results
fig = plt.figure(figsize=(12, 7))
fig.subplots_adjust(left=0.07, right=0.95,
                    bottom=0.08, top=0.95,
                    hspace=0.1, wspace=0.15)

classifier = [LinearRegression, Ridge, Lasso]
kwargs = [dict(), dict(alpha=0.005), dict(alpha=0.001)]
labels = ['Linear Regression', 'Ridge Regression', 'Lasso Regression']

for i in range(3):
    clf = classifier[i](fit_intercept=True, **kwargs[i])
    clf.fit(X, mu_sample)
    w = clf.coef_
    fit = clf.predict(gaussian_basis(z[:, None], centers, widths))

    # plot fit
    ax = fig.add_subplot(231 + i)
    ax.xaxis.set_major_formatter(plt.NullFormatter())

    # plot curves for regularized fits
    if i == 0:
        ax.set_ylabel('$\mu$')
    else:
        ax.yaxis.set_major_formatter(plt.NullFormatter())
        curves = 37 + w * gaussian_basis(z[:, np.newaxis], centers, widths)
        curves = curves[:, abs(w) > 0.01]
        ax.plot(z, curves, c='gray', lw=1, alpha=0.5)

    ax.plot(z, fit, '-k')
    ax.plot(z, mu, '--', c='gray')
    ax.errorbar(z_sample, mu_sample, dmu, fmt='.k', ecolor='gray', lw=1)
    ax.set_xlim(0.001, 1.8)
    ax.set_ylim(36, 48)
    ax.text(0.05, 0.95, labels[i], ha='left', va='top',
            transform=ax.transAxes)

    # plot weights
    ax = plt.subplot(234 + i)
    ax.xaxis.set_major_locator(plt.MultipleLocator(0.5))
    ax.set_xlabel('z')
    if i == 0:
        ax.set_ylabel(r'$\theta$')
        w *= 1E-12
        ax.text(0, 1, r'$\rm \times 10^{12}$',
                transform=ax.transAxes, fontsize=16)
    ax.scatter(centers, w, s=9, lw=0, c='k')

    ax.set_xlim(-0.05, 1.8)
    if i == 1:
        ax.set_ylim(-2, 4)
    elif i == 2:
        ax.set_ylim(-0.5, 2)

    ax.text(0.05, 0.95, labels[i], ha='left', va='top',
            transform=ax.transAxes)

plt.show()
astroML-0.3/paper_figures/CIDU2012/fig_spec_decompositions.py
"""
SDSS spectra Decompositions
---------------------------
Comparison of PCA, ICA, and NMF decompositions of SDSS spectra
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import os

import numpy as np
from matplotlib import pyplot as plt

from sklearn.decomposition import NMF
from sklearn.decomposition import FastICA
from sklearn.decomposition import RandomizedPCA

from astroML.datasets import sdss_corrected_spectra
from astroML.decorators import pickle_results

#------------------------------------------------------------
# Download data
data = sdss_corrected_spectra.fetch_sdss_corrected_spectra()
spectra = sdss_corrected_spectra.reconstruct_spectra(data)
wavelengths = sdss_corrected_spectra.compute_wavelengths(data)

#----------------------------------------------------------------------
# Compute PCA, ICA, and NMF components
# we'll save the results so that they can be re-used
@pickle_results('spec_decompositions.pkl')
def compute_PCA_ICA_NMF(n_components=5):
    spec_mean = spectra.mean(0)

    # PCA: use randomized PCA for speed
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_comp = np.vstack([spec_mean, pca.components_])

    # ICA treats sequential observations as related; because of this, we need
    # to fit with the transpose of the spectra
    ica = FastICA(n_components - 1)
    ica.fit(spectra.T)
    ica_comp = np.vstack([spec_mean, ica.transform(spectra.T).T])

    # NMF requires all elements of the input to be greater than zero
    spectra[spectra < 0] = 0
    nmf = NMF(n_components)
    nmf.fit(spectra)
    nmf_comp = nmf.components_

    return pca_comp, ica_comp, nmf_comp

n_components = 5
decompositions = compute_PCA_ICA_NMF(n_components)

#----------------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(10, 8))
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05,
                    bottom=0.1, top=0.95, hspace=0.05)

titles = ['PCA components', 'ICA components', 'NMF components']

for i, comp in enumerate(decompositions):
    for j in range(n_components):
        ax = fig.add_subplot(n_components, 3, 3 * j + 1 + i)

        ax.yaxis.set_major_formatter(plt.NullFormatter())
        ax.xaxis.set_major_locator(plt.MultipleLocator(1000))
        if j < n_components - 1:
            ax.xaxis.set_major_formatter(plt.NullFormatter())
        else:
            ax.set_xlabel(r'wavelength $(\AA)$')

        ax.plot(wavelengths, comp[j], '-k', lw=1)

        # plot zero line
        xlim = [3000, 7999]
        ax.plot(xlim, [0, 0], '-', c='gray', lw=1)
        ax.set_xlim(xlim)

        if j == 0:
            ax.set_title(titles[i], fontsize='medium')

        if titles[i].startswith('PCA') or titles[i].startswith('ICA'):
            if j == 0:
                label = 'mean'
            else:
                label = 'component %i' % j
        else:
            label = 'component %i' % (j + 1)
        ax.text(0.02, 0.95, label, transform=ax.transAxes,
                ha='left', va='top', bbox=dict(ec='w', fc='w'),
                fontsize='small')

        # adjust y limits
        ylim = plt.ylim()
        dy = 0.05 * (ylim[1] - ylim[0])
        ax.set_ylim(ylim[0] - dy, ylim[1] + 4 * dy)

plt.show()
astroML-0.3/paper_figures/CIDU2012/fig_spec_examples.py
"""
SDSS spectra Examples
---------------------
Plot 15 random SDSS spectra from the sample
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import os

import numpy as np
from matplotlib import pyplot as plt

from sklearn.decomposition import RandomizedPCA

from astroML.datasets import sdss_corrected_spectra

#----------------------------------------------------------------------
# Use pre-computed PCA to reconstruct spectra
data = sdss_corrected_spectra.fetch_sdss_corrected_spectra()
spectra = sdss_corrected_spectra.reconstruct_spectra(data)
lam = sdss_corrected_spectra.compute_wavelengths(data)

#------------------------------------------------------------
# select random spectra
np.random.seed(5)
nrows = 5
ncols = 3
ind = np.random.randint(spectra.shape[0], size=nrows * ncols)
spec_sample = spectra[ind]

#----------------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(10, 8))
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05,
                    bottom=0.1, top=0.95, hspace=0.05)

for i in range(ncols):
    for j in range(nrows):
        ax = fig.add_subplot(nrows, ncols, ncols * j + 1 + i)
        ax.plot(lam, spec_sample[ncols * j + i], '-k', lw=1)

        ax.yaxis.set_major_formatter(plt.NullFormatter())
        ax.xaxis.set_major_locator(plt.MultipleLocator(1000))
        if j < nrows - 1:
            ax.xaxis.set_major_formatter(plt.NullFormatter())
        else:
            plt.xlabel(r'wavelength $(\AA)$')

        ax.set_xlim(3000, 7999)
        ylim = ax.get_ylim()
        dy = 0.05 * (ylim[1] - ylim[0])
        ax.set_ylim(ylim[0] - dy, ylim[1] + dy)

plt.show()
astroML-0.3/paper_figures/CIDU2012/fig_XD_example.py
"""
Extreme Deconvolution example
-----------------------------
This demonstrates extreme deconvolution on a toy dataset.
"""
# Author: Jake VanderPlas
# License: BSD
#   The figure is an example from astroML: see http://astroML.github.com
import os

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.patches import Ellipse

from astroML.decorators import pickle_results
from astroML.density_estimation import XDGMM
from astroML.plotting.tools import draw_ellipse

#------------------------------------------------------------
# Sample the dataset
N = 2000
np.random.seed(0)

# generate the true data
x_true = (1.4 + 2 * np.random.random(N)) ** 2
y_true = 0.1 * x_true ** 2

# add scatter to "true" distribution
dx = 0.1 + 4. / x_true ** 2
dy = 0.1 + 10. / x_true ** 2

x_true += np.random.normal(0, dx, N)
y_true += np.random.normal(0, dy, N)

# add noise to get the "observed" distribution
dx = 0.2 + 0.5 * np.random.random(N)
dy = 0.2 + 0.5 * np.random.random(N)

x = x_true + np.random.normal(0, dx)
y = y_true + np.random.normal(0, dy)

# stack the results for computation
X = np.vstack([x, y]).T
Xerr = np.zeros(X.shape + X.shape[-1:])
diag = np.arange(X.shape[-1])
Xerr[:, diag, diag] = np.vstack([dx ** 2, dy ** 2]).T

#------------------------------------------------------------
# compute and save results
@pickle_results("XD_toy.pkl")
def compute_XD_results(n_components=10, n_iter=500):
    clf = XDGMM(n_components, n_iter=n_iter)
    clf.fit(X, Xerr)
    return clf

clf = compute_XD_results(10, 500)
sample = clf.sample(N)

#------------------------------------------------------------
# Plot the results
fig = plt.figure()
fig.subplots_adjust(left=0.1, right=0.95,
                    bottom=0.1, top=0.95,
                    wspace=0.02, hspace=0.02)

ax1 = fig.add_subplot(221)
ax1.scatter(x_true, y_true, s=4, lw=0, c='k')

ax2 = fig.add_subplot(222)
ax2.scatter(x, y, s=4, lw=0, c='k')

ax3 = fig.add_subplot(223)
ax3.scatter(sample[:, 0], sample[:, 1], s=4, lw=0, c='k')

ax4 = fig.add_subplot(224)
for i in range(clf.n_components):
    draw_ellipse(clf.mu[i], clf.V[i], scales=[2], ax=ax4,
                 ec='k', fc='gray', alpha=0.2)

titles = ["True Distribution", "Noisy Distribution",
          "Extreme Deconvolution\n resampling",
          "Extreme Deconvolution\n cluster locations"]

ax = [ax1, ax2, ax3, ax4]

for i in range(4):
    ax[i].set_xlim(-1, 13)
    ax[i].set_ylim(-6, 16)

    ax[i].xaxis.set_major_locator(plt.MultipleLocator(4))
    ax[i].yaxis.set_major_locator(plt.MultipleLocator(5))

    ax[i].text(0.05, 0.95, titles[i],
               ha='left', va='top', transform=ax[i].transAxes)

    if i in (0, 1):
        ax[i].xaxis.set_major_formatter(plt.NullFormatter())
    else:
        ax[i].set_xlabel('x')

    if i in (1, 3):
        ax[i].yaxis.set_major_formatter(plt.NullFormatter())
    else:
        ax[i].set_ylabel('y')

plt.show()
astroML-0.3/paper_figures/CIDU2012/README.rst
CIDU 2012: Introduction to astroML
----------------------------------
These figures are from the introduction to the astroML package presented at
the Conference for Intelligent Data Understanding (CIDU) 2012.
astroML-0.3/paper_figures/README.rst
Paper Figures
-------------
In the spirit of `reproducible research `_, this page lists the source code
for figures from published papers which use datasets and routines available
in astroML.
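As a pointer for reusing these figure scripts, the extreme-deconvolution
example above reduces to a three-call pattern on the ``XDGMM`` estimator.
A minimal sketch on synthetic 1D data (the shapes and the two-component
choice here are illustrative assumptions, not part of the original
script)::

    import numpy as np
    from astroML.density_estimation import XDGMM

    np.random.seed(42)
    x_true = np.random.normal(0, 1, (500, 1))   # noise-free samples
    dx = 0.3 * np.ones_like(x_true)             # per-point errors
    x_obs = x_true + np.random.normal(0, dx)    # noisy observations

    Xerr = (dx ** 2)[:, :, np.newaxis]          # (N, 1, 1) covariances
    xd = XDGMM(n_components=2, n_iter=100)
    xd.fit(x_obs, Xerr)                         # deconvolve the errors
    resampled = xd.sample(500)                  # draw from the XD model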
astroML-0.3/PKG-INFO
Metadata-Version: 1.1
Name: astroML
Version: 0.3
Summary: tools for machine learning and data mining in Astronomy
Home-page: http://astroML.github.com
Author: Jake VanderPlas
Author-email: vanderplas@astro.washington.edu
License: BSD
Download-URL: http://github.com/astroML/astroML
Description: [identical to the full text of README.rst, reproduced below]
Platform: UNKNOWN
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: BSD License
Classifier: Natural Language :: English
Classifier: Programming Language :: Python :: 2.6
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3.3
Classifier: Programming Language :: Python :: 3.4
Classifier: Topic :: Scientific/Engineering :: Astronomy
astroML-0.3/README.rst
.. -*- mode: rst -*-

.. image:: https://travis-ci.org/astroML/astroML.png?branch=master
   :target: https://travis-ci.org/astroML/astroML/

.. image:: https://pypip.in/v/astroML/badge.png
   :target: https://pypi.python.org/pypi/astroML

.. image:: https://pypip.in/d/astroML/badge.png
   :target: https://pypi.python.org/pypi/astroML

============================================
AstroML: Machine Learning code for Astronomy
============================================

AstroML is a Python module for machine learning and data mining built on
numpy, scipy, scikit-learn, and matplotlib, and distributed under the
3-Clause BSD license.
It contains a growing library of statistical and machine learning routines
for analyzing astronomical data in python, loaders for several open
astronomical datasets, and a large suite of examples of analyzing and
visualizing astronomical datasets.

This project was started in 2012 by Jake VanderPlas to accompany the book
*Statistics, Data Mining, and Machine Learning in Astronomy* by
Zeljko Ivezic, Andrew Connolly, Jacob VanderPlas, and Alex Gray.

Core and Addons
===============
The project is split into two components.  The core ``astroML`` library is
written in python only, and is designed to be very easy to install for any
users, even those who don't have a working C or fortran compiler.
A companion library, ``astroML_addons``, can be optionally installed for
increased performance on certain algorithms.  Every algorithm in
``astroML_addons`` has a pure python counterpart in the core ``astroML``
implementation, but the ``astroML_addons`` library contains faster and more
efficient implementations in compiled code.  Furthermore, if
``astroML_addons`` is installed on your system, the core ``astroML``
library will import and use the faster routines by default.

The reason for this split is the ease of use for newcomers to Python.  If
the prerequisites are already installed on your system, the core
``astroML`` library can be installed and used on any system with little
trouble.  The ``astroML_addons`` library requires a C compiler, but is also
designed to be easy to install for more advanced users.

See further discussion in "Development", below.

Important Links
===============
- HTML documentation: http://www.astroML.org
- Core source-code repository: http://github.com/astroML/astroML
- Addons source-code repository: http://github.com/astroML/astroML_addons
- Issue Tracker: http://github.com/astroML/astroML/issues
- Mailing List: https://groups.google.com/forum/#!forum/astroml-general

Installation
============
This package uses distutils, which is the default way of installing python
modules.  **Before installation, make sure your system meets the
prerequisites listed in Dependencies, listed below.**

Core
----
To install the core ``astroML`` package in your home directory, use::

  pip install astroML

The core package is pure python, so installation should be straightforward
on most systems.  To install from source, use::

  python setup.py install

You can specify an arbitrary directory for installation using::

  python setup.py install --prefix='/some/path'

To install system-wide on Linux/Unix systems::

  python setup.py build
  sudo python setup.py install

Addons
------
The ``astroML_addons`` package requires a working C/C++ compiler for
installation.  It can be installed using::

  pip install astroML_addons

To install from source, refer to http://github.com/astroML/astroML_addons

Dependencies
============
There are three levels of dependencies in astroML.  *Core* dependencies are
required for the core ``astroML`` package.  *Add-on* dependencies are
required for the higher-performance ``astroML_addons`` package.  *Optional*
dependencies are required to run some (but not all) of the example scripts.
Individual example scripts will list their optional dependencies at the top
of the file.
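A quick way to confirm that an installation satisfies the version
requirements below (a minimal sketch; it assumes only that the core
packages are importable)::

  import numpy, scipy, sklearn, matplotlib
  for module in (numpy, scipy, sklearn, matplotlib):
      print("%s %s" % (module.__name__, module.__version__))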
Core Dependencies
-----------------
The core ``astroML`` package requires the following:

- Python_ version 2.6.x - 2.7.x (astroML does not yet support python 3.x)
- Numpy_ >= 1.4
- Scipy_ >= 0.7
- Scikit-learn_ >= 0.10
- Matplotlib_ >= 0.99
- AstroPy_ > 0.2.5.  AstroPy is required to read Flexible Image Transport
  System (FITS) files, which are used by several datasets.

This configuration matches the Ubuntu 10.04 LTS release from April 2010,
with the addition of scikit-learn.

To run unit tests, you will also need nose >= 0.10.

Add-on Dependencies
-------------------
The fast code in ``astroML_addons`` requires a working C/C++ compiler.

Optional Dependencies
---------------------
Several of the example scripts require specialized or upgraded packages.
These requirements are listed at the top of the particular scripts.

- Scipy_ version 0.11 added a sparse graph submodule.  The minimum spanning
  tree example requires scipy >= 0.11.
- PyMC_ provides a nice interface for Markov-Chain Monte Carlo.  Several
  astroML examples use pyMC for exploration of high-dimensional spaces.
  The examples were written with pymc version 2.2.
- HEALPy_ provides an interface to the HEALPix pixelization scheme, as well
  as fast spherical harmonic transforms.

Development
===========
This package is designed to be a repository for well-written astronomy
code, and submissions of new routines are encouraged.  After installing the
version-control system Git_, you can check out the latest sources from
GitHub_ using::

  git clone git://github.com/astroML/astroML.git

or if you have write privileges::

  git clone git@github.com:astroML/astroML.git

Contribution
------------
We strongly encourage contributions of useful astronomy-related code: for
`astroML` to be a relevant tool for the python/astronomy community, it will
need to grow with the field of research.  There are a few guidelines for
contribution:

General
~~~~~~~
Any contribution should be done through the github pull request system (for
more information, see the `help page `_).

Code submitted to ``astroML`` should conform to a BSD-style license, and
follow the `PEP8 style guide `_.

Documentation and Examples
~~~~~~~~~~~~~~~~~~~~~~~~~~
All submitted code should be documented following the
`Numpy Documentation Guide`_.  This is a unified documentation style used
by many packages in the scipy universe.

In addition, it is highly recommended to create example scripts that show
the usefulness of the method on an astronomical dataset (preferably making
use of the loaders in ``astroML.datasets``).  These example scripts are in
the ``examples`` subdirectory of the main source repository.

Add-on code
~~~~~~~~~~~
We made the decision early-on to separate the core routines from
high-performance compiled routines.  This is to make sure that installation
of the core package is as straightforward as possible (i.e. not requiring a
C compiler).

Contributions of efficient compiled code to ``astroML_addons`` are
encouraged: the availability of efficient implementations of common
algorithms in python is one of the strongest features of the python
universe.  The preferred method of wrapping compiled libraries is to use
`cython `_; other options (weave, SWIG, etc.) are harder to build and
maintain.

Currently, the policy is that any efficient algorithm included in
``astroML_addons`` should have a duplicate python-only implementation in
``astroML``, with code that selects the faster routine if it's available.
(For an example of how this works, see the definition of the
``lomb_scargle`` function in ``astroML/periodogram.py``.)
This policy exists for three reasons:

1. it allows novice users to have all the functionality of ``astroML``
   without requiring the headache of complicated installation steps.
2. it serves a didactic purpose: python-only implementations are often
   easier to read and understand than equivalent implementations in C or
   cython.
3. it enforces the good coding practice of avoiding premature optimization.
   First make sure the code works (i.e. write it in simple python).  Then
   create an optimized version in the addons.

If this policy proves especially burdensome in the future, it may be
revisited.

.. _Numpy Documentation Guide: https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt

Authors
=======
Package Author
--------------
* Jake Vanderplas http://jakevdp.github.com

Code Contribution
-----------------
* Morgan Fouesneau https://github.com/mfouesneau
* Julian Taylor http://github.com/juliantaylor

.. _Python: http://www.python.org
.. _Numpy: http://www.numpy.org
.. _Scipy: http://www.scipy.org
.. _Scikit-learn: http://scikit-learn.org
.. _Matplotlib: http://matplotlib.org
.. _AstroPy: http://www.astropy.org/
.. _PyMC: http://pymc-devs.github.com/pymc/
.. _HEALPy: https://github.com/healpy/healpy
.. _Git: http://git-scm.com/
.. _GitHub: http://www.github.com
astroML-0.3/setup.py
from distutils.core import setup

DESCRIPTION = "tools for machine learning and data mining in Astronomy"
LONG_DESCRIPTION = open('README.rst').read()
NAME = "astroML"
AUTHOR = "Jake VanderPlas"
AUTHOR_EMAIL = "vanderplas@astro.washington.edu"
MAINTAINER = "Jake VanderPlas"
MAINTAINER_EMAIL = "vanderplas@astro.washington.edu"
URL = 'http://astroML.github.com'
DOWNLOAD_URL = 'http://github.com/astroML/astroML'
LICENSE = 'BSD'

import astroML
VERSION = astroML.__version__

setup(name=NAME,
      version=VERSION,
      description=DESCRIPTION,
      long_description=LONG_DESCRIPTION,
      author=AUTHOR,
      author_email=AUTHOR_EMAIL,
      maintainer=MAINTAINER,
      maintainer_email=MAINTAINER_EMAIL,
      url=URL,
      download_url=DOWNLOAD_URL,
      license=LICENSE,
      packages=['astroML',
                'astroML.tests',
                'astroML.clustering',
                'astroML.clustering.tests',
                'astroML.classification',
                'astroML.linear_model',
                'astroML.datasets',
                'astroML.datasets.tools',
                'astroML.density_estimation',
                'astroML.density_estimation.tests',
                'astroML.time_series',
                'astroML.time_series.tests',
                'astroML.dimensionality',
                'astroML.dimensionality.tests',
                'astroML.plotting',
                'astroML.plotting.tests',
                'astroML.stats',
                'astroML.stats.tests',
                ],
      classifiers=['Development Status :: 4 - Beta',
                   'Environment :: Console',
                   'Intended Audience :: Science/Research',
                   'License :: OSI Approved :: BSD License',
                   'Natural Language :: English',
                   'Programming Language :: Python :: 2.6',
                   'Programming Language :: Python :: 2.7',
                   'Programming Language :: Python :: 3.3',
                   'Programming Language :: Python :: 3.4',
                   'Topic :: Scientific/Engineering :: Astronomy'],
      )
astroML-0.3/tmp.py
import numpy as np
import pymc

print("pymc version", pymc.__version__)

np.random.seed(0)
xi = np.random.random(10)
dyi = np.random.random(10)
yi = np.random.normal(2 * xi, dyi)

# We're doing a linear fit with a background.  Pb is the nuisance parameter
# which specifies the probability that points are drawn from a normal
# distribution N(Yb, sigmab^2).
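# Written out, each point contributes a two-component mixture likelihood:
#     L_i = (1 - Pb) * N(yi | slope * xi + intercept, dyi ** 2)
#           + Pb * N(yi | Yb, dyi ** 2 + sigmab ** 2)
# and the total log-likelihood is sum_i log(L_i); this is what the
# mixture_likelihood function defined below computes.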
# uniform prior on Pb, the fraction of bad points
Pb = pymc.Uniform('Pb', 0, 1.0, value=0.1)

# uniform prior on Yb, the centroid of the outlier distribution
Yb = pymc.Uniform('Yb', -10000, 10000, value=0)

# uniform prior on log(sigmab), the spread of the outlier distribution
log_sigmab = pymc.Uniform('log_sigmab', -10, 10, value=5)

# define priors on beta = (slope, intercept)
@pymc.stochastic
def beta_M1(value=np.array([2., 100.])):
    """Slope and intercept parameters for a straight line.
    The likelihood corresponds to the prior probability of the
    parameters."""
    slope, intercept = value
    prob_intercept = 1 + 0 * intercept

    # uniform prior on theta = arctan(slope)
    # d[arctan(x)]/dx = 1 / (1 + x^2)
    prob_slope = np.log(1. / (1. + slope ** 2))

    return prob_intercept + prob_slope


@pymc.deterministic
def model_M1(xi=xi, beta=beta_M1):
    slope, intercept = beta
    return slope * xi + intercept


@pymc.deterministic
def sigmab(log_sigmab=log_sigmab):
    return np.exp(log_sigmab)


# set up the expression for likelihood
def mixture_likelihood(yi, model, dyi, Pb, Yb, sigmab):
    if (Pb < 0) or (Pb > 1):
        print("error: Pb =", Pb)
        raise ValueError("Pb out of range")

    Vi = dyi ** 2
    Vb = sigmab ** 2

    root2pi = np.sqrt(2 * np.pi)

    L_in = (1. / root2pi / dyi
            * np.exp(-0.5 * (yi - model) ** 2 / Vi))

    L_out = (1. / root2pi / np.sqrt(Vi + Vb)
             * np.exp(-0.5 * (yi - Yb) ** 2 / (Vi + Vb)))

    return np.sum(np.log((1 - Pb) * L_in + Pb * L_out))

MixtureNormal = pymc.stochastic_from_dist('mixturenormal',
                                          logp=mixture_likelihood,
                                          dtype=np.float,
                                          mv=True)

y_mixture = MixtureNormal('y_mixture', model=model_M1, dyi=dyi,
                          Pb=Pb, Yb=Yb, sigmab=sigmab,
                          observed=True, value=yi)

M = dict(y_mixture=y_mixture, beta_M1=beta_M1, model_M1=model_M1,
         Pb=Pb, Yb=Yb, log_sigmab=log_sigmab, sigmab=sigmab)

S = pymc.MCMC(M)
S.sample(iter=25000, burn=5000)
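# A minimal sketch of inspecting the chain after sampling (pymc 2.x trace
# API; the percentile summary is an illustrative addition, not part of the
# original script):
beta_trace = S.trace('beta_M1')[:]  # shape (n_samples, 2): slope, intercept
Pb_trace = S.trace('Pb')[:]
print("median slope, intercept:", np.median(beta_trace, axis=0))
print("Pb 90%% interval: %.3f - %.3f"
      % (np.percentile(Pb_trace, 5), np.percentile(Pb_trace, 95)))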