Searching for Structure in Point Data#

First we import data#

The data used in this chapter is a subset of the SDSS spectroscopic galaxy sample, centered on the SDSS "Great Wall".

%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

from sklearn.neighbors import KernelDensity

from astroML.density_estimation import KNeighborsDensity
from astropy.visualization import hist

The code below ensures the fonts in plots are rendered with LaTeX.
This function adjusts matplotlib settings for a uniform feel in the textbook.
Note that with usetex=True, fonts are rendered with LaTeX. This may result in an error if LaTeX is not installed on your system. In that case, you can set usetex to False.

# Import setup_text_plots only once per session, then apply the textbook's
# uniform plot styling. usetex=True requires a working LaTeX install.
if "setup_text_plots" not in globals():
    from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

Generate our data#

Generate our data: a mix of several Cauchy distributions. This is the same data used in the Bayesian Blocks figure.

# Generate our data: a mix of several Cauchy distributions.
# This is the same data used in the Bayesian Blocks figure.
np.random.seed(0)
N = 10000
# (mu, gamma, f): location, scale, and mixture weight of each Cauchy component
mu_gamma_f = [(5, 1.0, 0.1),
              (7, 0.5, 0.5),
              (9, 0.1, 0.1),
              (12, 0.5, 0.2),
              (14, 1.0, 0.1)]


def true_pdf(x):
    """Evaluate the generating pdf: the f-weighted mixture of Cauchy components."""
    return sum(f * stats.cauchy(mu, gamma).pdf(x)
               for (mu, gamma, f) in mu_gamma_f)


# Draw int(f * N) samples from each component, pool and shuffle them,
# then keep only the points inside the plotting window (-10, 30).
x = np.concatenate([stats.cauchy(mu, gamma).rvs(int(f * N))
                    for (mu, gamma, f) in mu_gamma_f])
np.random.shuffle(x)
x = x[(x > -10) & (x < 30)]

Generating Distribution#

Plot the generating distribution of the given dataset.

fig = plt.figure(figsize=(5, 2.5))
ax = fig.add_subplot(111)  # was missing: every ax.* call below needs an axes
t = np.linspace(-10, 30, 1000)


def plot_generating_data(x_values):
    """Plot the true generating pdf on the current global ``ax``.

    Parameters
    ----------
    x_values : array-like
        Grid of x values at which to evaluate true_pdf.
    """
    ax.plot(x_values, true_pdf(x_values), ':', color='black', zorder=3,
            label="Generating Distribution")
    # label the plot
    ax.set_ylabel('$p(x)$')
    # set axis limits (shared by every figure in this chapter)
    ax.set_xlim(0, 20)
    ax.set_ylim(-0.01, 0.4001)


plot_generating_data(t)
ax.legend(loc='upper right')
ax.set_xlabel('$x$')

plt.show()

Plot the results#

Kernel Density Estimation (KDE)#

We often use a Gaussian kernel in KDE. The function $K(u)$ represents the weight at a given point, normalized such that $\int K(u)\,du = 1$.
For a Gaussian kernel: $$K(u) = \frac{1}{(2\pi)^{D/2}} e^{-u^2/2}$$

# with functions

fig = plt.figure(figsize=(5, 5))
subplots = (211, 212)

# subset sizes drawn from the full sample
N_values = (500, 5000)


def plot_kde(x_values):
    """Fit a Gaussian KDE (h=0.1) to the global sample ``xN`` and plot the
    density evaluated at ``x_values`` on the current global ``ax``."""
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(xN[:, None])
    # score_samples returns log-density; exponentiate to get p(x).
    # Evaluate at x_values (the original hard-coded the global ``t`` here;
    # every call site passes t, so behavior is unchanged).
    dens_kde = np.exp(kde.score_samples(x_values[:, None]))
    ax.plot(x_values, dens_kde, '-', color='black', zorder=3,
            label="Kernel Density (h=0.1)")


for N, subplot in zip(N_values, subplots):
    xN = x[:N]
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # plot generating data in comparison with KDE
    plot_generating_data(t)
    plot_kde(t)
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')

    # make label and legend for the plot
    ax.legend(loc='upper right')
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

plt.show()
# without functions
fig = plt.figure(figsize=(5, 5))
N_values = (500, 5000)
subplots = (211, 212)
k_values = (10, 100)

for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    t = np.linspace(-10, 30, 1000)
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # Compute density with KDE (score_samples returns log-density)
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(xN[:, None])
    dens_kde = np.exp(kde.score_samples(t[:, None]))

    # plot the results
    ax.plot(t, true_pdf(t), ':', color='black', zorder=3,
            label="Generating Distribution")
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')

    ax.plot(t, dens_kde, '-', color='black', zorder=3,
            label="Kernel Density (h=0.1)")

    # label the plot
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)
    ax.set_ylabel('$p(x)$')
    ax.legend(loc='upper right')

    # set axis limits
    ax.set_xlim(0, 20)
    ax.set_ylim(-0.01, 0.4001)

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

plt.show()

Nearest-Neighbor Density Estimation#

The code below plots the generating distribution alongside the result of nearest-neighbor density estimation.

# with functions
fig = plt.figure(figsize=(5, 5))

# number of neighbors used for each subset size
k_values = (10, 100)


def plot_nearest_neighbor(x_values):
    """Compute a Bayesian nearest-neighbor density estimate from the global
    sample ``xN`` (using the global ``k``) and plot it at ``x_values`` on
    the current global ``ax``."""
    nbrs = KNeighborsDensity('bayesian', n_neighbors=k).fit(xN[:, None])
    # normalize by the sample size so the estimate is a pdf
    dens_nbrs = nbrs.eval(x_values[:, None]) / N
    ax.plot(x_values, dens_nbrs, '-', lw=1.5, color='gray', zorder=2,
            label="Nearest Neighbors (k=%i)" % k)


for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # plot generating data in comparison with nearest neighbor
    plot_generating_data(t)
    plot_nearest_neighbor(t)
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')

    # make label and legend for the plot
    ax.legend(loc='upper right')
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

plt.show()
# without functions
fig = plt.figure(figsize=(5, 5))
subplots = (211, 212)
N_values = (500, 5000)
k_values = (10, 100)

for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    t = np.linspace(-10, 30, 1000)
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # Compute density with Bayesian nearest neighbors, normalized by N
    nbrs = KNeighborsDensity('bayesian', n_neighbors=k).fit(xN[:, None])
    dens_nbrs = nbrs.eval(t[:, None]) / N

    # plot the results
    ax.plot(t, true_pdf(t), ':', color='black', zorder=3,
            label="Generating Distribution")
    ax.plot(t, dens_nbrs, '-', lw=1.5, color='gray', zorder=2,
            label="Nearest Neighbors (k=%i)" % k)
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')

    # label the plot
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)
    ax.set_ylabel('$p(x)$')
    ax.legend(loc='upper right')

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

    # set axis limits
    ax.set_xlim(0, 20)
    ax.set_ylim(-0.01, 0.4001)

plt.show()

Bayesian Blocks#

The code below plots the generating distribution and the Bayesian blocks analysis.

# with functions
fig = plt.figure(figsize=(5, 5))


def plot_bayesian_block(x_values):
    """Draw a Bayesian-blocks histogram of the sample ``x_values`` on the
    current global ``ax`` (astropy's hist with bins='blocks')."""
    hist(x_values, bins='blocks', ax=ax, density=True, zorder=1,
         histtype='stepfilled', color='k', alpha=0.2,
         label="Bayesian Blocks")


for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # plot generating data in comparison with Bayesian blocks
    plot_generating_data(t)
    plot_bayesian_block(xN)
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')

    # make label and legend for the plot
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)
    ax.legend(loc='upper right')

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

plt.show()
WARNING: p0 does not seem to accurately represent the false positive rate for event data. It is highly recommended that you run random trials on signal-free noise to calibrate ncp_prior to achieve a desired false positive rate. [astropy.stats.bayesian_blocks]
# without functions
fig = plt.figure(figsize=(5, 5))
N_values = (500, 5000)
subplots = (211, 212)
k_values = (10, 100)

for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    t = np.linspace(-10, 30, 1000)
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # plot the generating pdf, the sample ticks, and the Bayesian blocks
    ax.plot(t, true_pdf(t), ':', color='black', zorder=3,
            label="Generating Distribution")
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')
    hist(xN, bins='blocks', ax=ax, density=True, zorder=1,
         histtype='stepfilled', color='k', alpha=0.2,
         label="Bayesian Blocks")

    # label the plot
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)
    ax.set_ylabel('$p(x)$')
    ax.legend(loc='upper right')

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

    # set axis limits
    ax.set_xlim(0, 20)
    ax.set_ylim(-0.01, 0.4001)

plt.show()

A comparison of the Three Estimations#

The code below plots results from all three estimations in two subplots for reference.

# with functions
fig = plt.figure(figsize=(5, 5))

for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # plot the results from all three methods plus the generating pdf
    plot_generating_data(t)
    plot_bayesian_block(xN)
    plot_nearest_neighbor(t)
    plot_kde(t)
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')

    # label the plot
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)
    ax.legend(loc='upper right')

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

plt.show()
WARNING: p0 does not seem to accurately represent the false positive rate for event data. It is highly recommended that you run random trials on signal-free noise to calibrate ncp_prior to achieve a desired false positive rate. [astropy.stats.bayesian_blocks]
# without functions
fig = plt.figure(figsize=(5, 5))
N_values = (500, 5000)
subplots = (211, 212)
k_values = (10, 100)

for N, k, subplot in zip(N_values, k_values, subplots):
    xN = x[:N]
    t = np.linspace(-10, 30, 1000)
    ax = fig.add_subplot(subplot)  # was missing: create the axes for this panel

    # Compute density with KDE (score_samples returns log-density)
    kde = KernelDensity(bandwidth=0.1, kernel='gaussian')
    kde.fit(xN[:, None])
    dens_kde = np.exp(kde.score_samples(t[:, None]))

    # Compute density with Bayesian nearest neighbors, normalized by N
    nbrs = KNeighborsDensity('bayesian', n_neighbors=k).fit(xN[:, None])
    dens_nbrs = nbrs.eval(t[:, None]) / N

    # plot the results
    ax.plot(t, true_pdf(t), ':', color='black', zorder=3,
            label="Generating Distribution")
    ax.plot(xN, -0.005 * np.ones(len(xN)), '|k')
    hist(xN, bins='blocks', ax=ax, density=True, zorder=1,
         histtype='stepfilled', color='k', alpha=0.2,
         label="Bayesian Blocks")
    ax.plot(t, dens_nbrs, '-', lw=1.5, color='gray', zorder=2,
            label="Nearest Neighbors (k=%i)" % k)
    ax.plot(t, dens_kde, '-', color='black', zorder=3,
            label="Kernel Density (h=0.1)")

    # label the plot
    ax.text(0.02, 0.95, "%i points" % N, ha='left', va='top',
            transform=ax.transAxes)
    ax.set_ylabel('$p(x)$')
    ax.legend(loc='upper right')

    # only the bottom panel gets an x label
    if subplot == 212:
        ax.set_xlabel('$x$')

    # set axis limits
    ax.set_xlim(0, 20)
    ax.set_ylim(-0.01, 0.4001)

plt.show()
WARNING: p0 does not seem to accurately represent the false positive rate for event data. It is highly recommended that you run random trials on signal-free noise to calibrate ncp_prior to achieve a desired false positive rate. [astropy.stats.bayesian_blocks]
WARNING: p0 does not seem to accurately represent the false positive rate for event data. It is highly recommended that you run random trials on signal-free noise to calibrate ncp_prior to achieve a desired false positive rate. [astropy.stats.bayesian_blocks]