Source code for astroML.stats._point_statistics

import numpy as np
from scipy import stats

# from scipy.special import erfinv
# sigmaG_factor = 1. / (2 * np.sqrt(2) * erfinv(0.5))
sigmaG_factor = 0.74130110925280102


[docs]def mean_sigma(a, axis=None, dtype=None, ddof=0, keepdims=False):
    """Compute mean and standard deviation for an array

    Parameters
    ----------
    a : array_like
        Array containing numbers whose mean is desired. If `a` is not an
        array, a conversion is attempted.
    axis : int, optional
        Axis along which the means are computed. The default is to compute
        the mean of the flattened array.
    dtype : dtype, optional
        Type to use in computing the standard deviation. For arrays of
        integer type the default is float64, for arrays of float types it is
        the same as the array type.
    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original `arr`.

    Returns
    -------
    mu : ndarray, see dtype parameter above
        array containing the mean values

    sigma : ndarray, see dtype parameter above.
        array containing the standard deviation

    See Also
    --------
    median_sigmaG : robust rank-based version of this calculation.

    Notes
    -----
    This routine simply calls ``np.mean`` and ``np.std``, passing the
    keyword arguments to them.  It is provided for ease of comparison
    with the function median_sigmaG()
    """
    mu = np.mean(a, axis=axis, dtype=dtype)
    sigma = np.std(a, axis=axis, dtype=dtype, ddof=ddof)

    if keepdims:
        if axis is None:
            newshape = a.ndim * (1,)
        else:
            newshape = np.asarray(a.shape)
            newshape[axis] = 1

        mu = mu.reshape(newshape)
        sigma = sigma.reshape(newshape)

    return mu, sigma


[docs]def median_sigmaG(a, axis=None, overwrite_input=False, keepdims=False):
    """Compute median and rank-based estimate of the standard deviation

    Parameters
    ----------
    a : array_like
        Array containing numbers whose mean is desired. If `a` is not an
        array, a conversion is attempted.
    axis : int, optional
        Axis along which the means are computed. The default is to compute
        the mean of the flattened array.
    overwrite_input : bool, optional
       If True, then allow use of memory of input array `a` for
       calculations. The input array will be modified by the call to
       median. This will save memory when you do not need to preserve
       the contents of the input array. Treat the input as undefined,
       but it will probably be fully or partially sorted.
       Default is False. Note that, if `overwrite_input` is True and the
       input is not already an array, an error will be raised.
    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original `arr`.

    Returns
    -------
    median : ndarray, see dtype parameter above
        array containing the median values

    sigmaG : ndarray, see dtype parameter above.
        array containing the robust estimator of the standard deviation

    See Also
    --------
    mean_sigma : non-robust version of this calculation
    sigmaG : robust rank-based estimate of standard deviation

    Notes
    -----
    This routine uses a single call to ``np.percentile`` to find the
    quartiles along the given axis, and uses these to compute the
    median and sigmaG:

    median = q50
    sigmaG = (q75 - q25) * 0.7413

    where 0.7413 ~ 1 / (2 sqrt(2) erf^-1(0.5))
    """
    q25, median, q75 = np.percentile(a, [25, 50, 75],
                                     axis=axis,
                                     overwrite_input=overwrite_input)
    sigmaG = sigmaG_factor * (q75 - q25)

    if keepdims:
        if axis is None:
            newshape = a.ndim * (1,)
        else:
            newshape = np.asarray(a.shape)
            newshape[axis] = 1

        median = median.reshape(newshape)
        sigmaG = sigmaG.reshape(newshape)

    return median, sigmaG


[docs]def sigmaG(a, axis=None, overwrite_input=False, keepdims=False):
    """Compute the rank-based estimate of the standard deviation

    Parameters
    ----------
    a : array_like
        Array containing numbers whose mean is desired. If `a` is not an
        array, a conversion is attempted.
    axis : int, optional
        Axis along which the means are computed. The default is to compute
        the mean of the flattened array.
    overwrite_input : bool, optional
       If True, then allow use of memory of input array `a` for
       calculations. The input array will be modified by the call to
       median. This will save memory when you do not need to preserve
       the contents of the input array. Treat the input as undefined,
       but it will probably be fully or partially sorted.
       Default is False. Note that, if `overwrite_input` is True and the
       input is not already an array, an error will be raised.
    keepdims : bool, optional
        If this is set to True, the axes which are reduced are left
        in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original `arr`.

    Returns
    -------
    median : ndarray, see dtype parameter above
        array containing the median values

    sigmaG : ndarray, see dtype parameter above.
        array containing the robust estimator of the standard deviation

    See Also
    --------
    median_sigmaG : robust rank-based estimate of mean and standard deviation

    Notes
    -----
    This routine uses a single call to ``np.percentile`` to find the
    quartiles along the given axis, and uses these to compute the
    sigmaG, a robust estimate of the standard deviation sigma:

    sigmaG = 0.7413 * (q75 - q25)

    where 0.7413 ~ 1 / (2 sqrt(2) erf^-1(0.5))
    """
    q25, q75 = np.percentile(a, [25, 75],
                             axis=axis,
                             overwrite_input=overwrite_input)
    sigmaG = sigmaG_factor * (q75 - q25)

    if keepdims:
        if axis is None:
            newshape = a.ndim * (1,)
        else:
            newshape = np.asarray(a.shape)
            newshape[axis] = 1

        sigmaG = sigmaG.reshape(newshape)

    return sigmaG


[docs]def fit_bivariate_normal(x, y, robust=False):
    """Fit bivariate normal parameters to a 2D distribution of points

    Parameters
    ----------
    x, y : array_like
        The x, y coordinates of the points

    robust : boolean (optional, default=False)
        If True, then use rank-based statistics which are robust to outliers
        Otherwise, use mean/std statistics which are not robust

    Returns
    -------
    mu : tuple
        (x, y) location of the best-fit bivariate normal
    sigma_1, sigma_2 : float
        The best-fit gaussian widths in the uncorrelated frame
    alpha : float
        The rotation angle in radians of the uncorrelated frame
    """
    x = np.asarray(x)
    y = np.asarray(y)

    assert x.shape == y.shape

    if robust:
        # use quartiles to compute center and spread
        med_x, sigmaG_x = median_sigmaG(x)
        med_y, sigmaG_y = median_sigmaG(y)

        # define the principal variables from Shevlyakov & Smirnov (2011)
        sx = 2 * sigmaG_x
        sy = 2 * sigmaG_y

        u = (x / sx + y / sy) / np.sqrt(2)
        v = (x / sx - y / sy) / np.sqrt(2)

        med_u, sigmaG_u = median_sigmaG(u)
        med_v, sigmaG_v = median_sigmaG(v)

        r_xy = ((sigmaG_u ** 2 - sigmaG_v ** 2) /
                (sigmaG_u ** 2 + sigmaG_v ** 2))

        # rename estimators
        mu_x, mu_y = med_x, med_y
        sigma_x, sigma_y = sigmaG_x, sigmaG_y
    else:
        mu_x = np.mean(x)
        sigma_x = np.std(x)

        mu_y = np.mean(y)
        sigma_y = np.std(y)

        r_xy = stats.pearsonr(x, y)[0]

    # We need to use the full (-180, 180) version of arctan: this is
    # np.arctan2(x, y) = np.arctan(x / y), modulo 180 degrees
    sigma_xy = r_xy * sigma_x * sigma_y
    alpha = 0.5 * np.arctan2(2 * sigma_xy, sigma_x ** 2 - sigma_y ** 2)

    sigma1 = np.sqrt((0.5 * (sigma_x ** 2 + sigma_y ** 2)
                      + np.sqrt(0.25 * (sigma_x ** 2 - sigma_y ** 2) ** 2
                                + sigma_xy ** 2)))
    sigma2 = np.sqrt((0.5 * (sigma_x ** 2 + sigma_y ** 2)
                      - np.sqrt(0.25 * (sigma_x ** 2 - sigma_y ** 2) ** 2
                                + sigma_xy ** 2)))

    return [mu_x, mu_y], sigma1, sigma2, alpha