Source code for astroML.resample

import numpy as np
import warnings
from sklearn.utils import check_random_state


[docs]def bootstrap(data, n_bootstraps, user_statistic, kwargs=None, pass_indices=False, random_state=None): """Compute bootstraped statistics of a dataset. Parameters ---------- data : array_like An n-dimensional data array of size n_samples by n_attributes n_bootstraps : integer the number of bootstrap samples to compute. Note that internally, two arrays of size (n_bootstraps, n_samples) will be allocated. For very large numbers of bootstraps, this can cause memory issues. user_statistic : function The statistic to be computed. This should take an array of data of size (n_bootstraps, n_samples) and return the row-wise statistics of the data. kwargs : dictionary (optional) A dictionary of keyword arguments to be passed to the user_statistic function. pass_indices : boolean (optional) if True, then the indices of the points rather than the points themselves are passed to `user_statistic` random_state: RandomState or an int seed (0 by default) A random number generator instance Returns ------- distribution : ndarray the bootstrapped distribution of statistics (length = n_bootstraps) """ # we don't set kwargs={} by default in the argument list, because using # a mutable type as a default argument can lead to strange results if kwargs is None: kwargs = {} rng = check_random_state(random_state) data = np.asarray(data) if data.ndim != 1: n_samples = data.shape[0] warnings.warn("bootstrap data are n-dimensional: " "assuming ordered n_samples by n_attributes") else: n_samples = data.size # Generate random indices with repetition ind = rng.randint(n_samples, size=(n_bootstraps, n_samples)) data = data[ind].reshape(-1, data[ind].shape[-1]) # Call the function if pass_indices: stat_bootstrap = user_statistic(ind, **kwargs) else: stat_bootstrap = user_statistic(data, **kwargs) # compute the statistic on the data return stat_bootstrap
[docs]def jackknife(data, user_statistic, kwargs=None, return_raw_distribution=False, pass_indices=False): """Compute first-order jackknife statistics of the data. Parameters ---------- data : array_like A 1-dimensional data array of size n_samples user_statistic : function The statistic to be computed. This should take an array of data of size (n_samples, n_samples - 1) and return an array of size n_samples or tuple of arrays of size n_samples, representing the row-wise statistics of the input. kwargs : dictionary (optional) A dictionary of keyword arguments to be passed to the user_statistic function. return_raw_distribution : boolean (optional) if True, return the raw jackknife distribution. Be aware that this distribution is not reflective of the true distribution: it is simply an intermediate step in the jackknife calculation pass_indices : boolean (optional) if True, then the indices of the points rather than the points themselves are passed to `user_statistic` Returns ------- mean, stdev : floats The mean and standard deviation of the jackknifed distribution raw_distribution : ndarray Returned only if `return_raw_distribution` is True The array containing the raw distribution (length n_samples) Be aware that this distribution is not reflective of the true distribution: it is simply an intermediate step in the jackknife calculation Notes ----- This implementation is a leave-one-out jackknife. Jackknife resampling is known to fail on rank-based statistics (e.g. median, quartiles, etc.) It works well on smooth statistics (e.g. mean, standard deviation, etc.) """ # we don't set kwargs={} by default in the argument list, because using # a mutable type as a default argument can lead to strange results if kwargs is None: kwargs = {} data = np.asarray(data) n_samples = data.size if data.ndim != 1: raise ValueError("bootstrap expects 1-dimensional data") # generate indices for the entire dataset, converting to row vector ind0 = np.arange(n_samples)[np.newaxis, :] # generate sets of indices where a single datapoint is left-out ind = np.arange(n_samples, dtype=int) ind = np.vstack([np.hstack((ind[:i], ind[i + 1:])) for i in ind]) # compute the statistic for the whole dataset if pass_indices: stat_data = user_statistic(ind0, **kwargs) stat_jackknife = user_statistic(ind, **kwargs) else: stat_data = user_statistic(data[ind0], **kwargs) stat_jackknife = user_statistic(data[ind], **kwargs) # handle multiple statistics: # if ndim=0, then the statistic is not operating on rows (error). # if ndim=1, then it's a single statistic returned # if ndim=2, then a tuple has been returned stat_data = np.asarray(stat_data) ndim = stat_data.ndim if ndim == 0: raise ValueError("user_statistic should return row-wise statistics") stat_data = np.atleast_2d(stat_data).T stat_jackknife = np.atleast_2d(stat_jackknife) # compute the jackknife correction formula delta_stat = (n_samples - 1) * (stat_data - stat_jackknife.mean(1)) stat_corrected = (stat_data + delta_stat)[0] sigma_stat = np.sqrt(1. / n_samples / (n_samples + 1) * np.sum((n_samples * stat_data - stat_corrected - (n_samples - 1) * stat_jackknife.T) ** 2, 0)) if return_raw_distribution: results = tuple(zip(stat_corrected, sigma_stat, stat_jackknife)) else: results = tuple(zip(stat_corrected, sigma_stat)) if ndim == 1: return results[0] else: return results