Pull-clipping

[1]:
%matplotlib notebook
import numpy as N
import scipy.stats as SS
import matplotlib.pyplot as P
[2]:
import inspec.combine as C
[3]:
N.random.seed(1234)

Abstract: We present the outlier detection procedure using pull-clipping, and its typical usage to compute inverse-variance weighted mean on small samples. We further test 2 different implementations, one based on a loop and the other fully vectorized; the later is ~20× faster.

Typical usage for inverse-variance weighted mean

Define a population of m = 531 samples of n = 4 measurements (e.g. n spectra of m pixels to be combined). Each point have a distribution \(\mathcal{N}(\mu=0, \sigma^2 \sim 1)\), the actual errors being distributed around 1 with a log-normal distribution \(\log-\mathcal{N}(\mu=0, \sigma^2=0.25^2)\).

[4]:
n = 4       # Nb of points per sample
m = 531     # Nb of samples
size = (m, n)
i = N.arange(m)
[5]:
shape = 0.25
dx_dist = SS.lognorm(shape)  # Log-normal distribution
[6]:
eps = 0.01
print("Probability of outliers: {:.1%}".format(eps))

outliers = N.random.rand(m, n) <= eps  # Outlier mask
Probability of outliers: 1.0%
[7]:
noutliers = outliers.sum(axis=-1)      # Nb of outliers in each sample
for k in range(n + 1):
    print(f"{k}/{n} outliers: {len(noutliers[noutliers == k])} px = {len(noutliers[noutliers == k])/m:.2%}, expected {SS.binom.pmf(k, n, eps):.2%}")
0/4 outliers: 506 px = 95.29%, expected 96.06%
1/4 outliers: 25 px = 4.71%, expected 3.88%
2/4 outliers: 0 px = 0.00%, expected 0.06%
3/4 outliers: 0 px = 0.00%, expected 0.00%
4/4 outliers: 0 px = 0.00%, expected 0.00%

Let assume outliers follow a log-normal distribution \(\ln\mathcal{N}(\mu=\ln(\mathtt{nsig}=10), \sigma^2=0.25^2)\):

[8]:
nsig = 10
outliers_dist = SS.lognorm(shape, scale=nsig)
[9]:
dx = dx_dist.rvs(size=size)
x = N.random.normal(loc=0, scale=dx, size=size)  # Pure gaussian errors
x += N.where(outliers, outliers_dist.rvs(size=size), 0)  # Add outliers
[10]:
fig, ax = P.subplots(1, 1)
for j, (xx, dxx) in enumerate(zip(x.T, dx.T), start=1):
    l, = ax.plot(i, xx, label=f"Sample #{j}")
    ax.fill_between(i, xx - dxx, xx + dxx, fc=l.get_color(), alpha=0.2)
ax.legend();
[11]:
fig, ax = P.subplots(1, 1)

Standard weighted mean:

[12]:
ave, dave = C.sample_weighted_mean(x, dx)

l, = ax.plot(i, ave, label="Weighted mean")
ax.fill_between(i, ave - dave, ave + dave, fc=l.get_color(), alpha=0.2);

Median:

[13]:
med, dmed = C.sample_median(x, dx)       # Median (unweighted)

l, = ax.plot(i, med, label="Median")
ax.fill_between(i, med - dmed, med + dmed, fc=l.get_color(), alpha=0.2);

\(\sigma\)-clipping (standard Grubbs’ test):

[14]:
side = +1  # Upper one-tailed outlier test
identified = C.outlier_clipping(x, dx, method='sigma', clip=3, side=side, verbose=True)

mx = N.ma.masked_array(x, mask=identified)
wm, dwm = C.sample_weighted_mean(mx, dx)

l, = ax.plot(i, wm, label="+3σ clipping")
ax.fill_between(i, wm - dwm, wm + dwm, fc=l.get_color(), alpha=0.2);
Outlier sigma-clipping: upper one-tailed, clip=3

Pull-clipping:

[15]:
side = +1  # Upper one-tailed outlier test
identified = C.outlier_clipping(x, dx, method='pull', clip=3, side=side, verbose=True)

mx = N.ma.masked_array(x, mask=identified)
pm, dpm = C.sample_weighted_mean(mx, dx)

l, = ax.plot(i, pm, label="+3 pull clipping")
ax.fill_between(i, pm - dpm, pm + dpm, fc=l.get_color(), alpha=0.2);
Outlier pull-clipping: upper one-tailed, clip=3
  Iter #1/3: 27 outliers (total: 27)
[16]:
ax.legend();

Implementation tests

Mimic loop-based implementation in SIR pipeline and compare against fully-vectorized implementation.

[17]:
flux_set = x.T       # (ndither, nlbda)
stdev_set = dx.T
quality_set = SS.uniform.rvs(size=flux_set.shape)              # Random quality factor in [0, 1]
quality_set[SS.uniform.rvs(size=quality_set.shape) < 0.1] = 0  # Randomly flagged-out px
[18]:
fig, ax = P.subplots(1, 1)
mflux_set = N.ma.masked_where(quality_set == 0, flux_set)
mstdev_set = N.ma.masked_where(quality_set == 0, stdev_set)
for f, df in zip(mflux_set, mstdev_set):
    ax.plot(i, f, c='0.8')
    ax.fill_between(i, f-df, f+df, fc='0.8', alpha=0.2)
[19]:
d_w, dd_w = C.sample_weighted_mean(flux_set.T, stdev_set.T)

l, = ax.plot(i, d_w, label='wmean')
ax.fill_between(i, d_w - dd_w, d_w + dd_w, fc=l.get_color(), alpha=0.2)
ax.legend();
[20]:
t_w = %timeit -o C.sample_weighted_mean(flux_set.T, stdev_set.T)
133 µs ± 13.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
[21]:
def SIR_combine(flux_set, stdev_set, quality_set):

    combined_data = N.zeros(flux_set.shape[-1])  # (nlbda,)
    combined_variance = N.zeros_like(combined_data)
    combined_quality = N.zeros_like(combined_data)

    for wave, _ in enumerate(combined_data):  # Loop over wavelength
        quality = quality_set[:, wave]
        good_quality = quality > 0
        quality = quality[good_quality]
        # pulling cannot be perfomed
        if len(quality) <= 2:
            # Use previously computed weighted average without clipping
            combined_data[wave], wsum = N.average(flux_set[:, wave][good_quality],
                                              weights=stdev_set[:, wave][good_quality]**(-2), returned=True)
            combined_variance[wave] = 1.0 / wsum
            # Improper use of quality! Just for check and keep track of number of spectra combined
            combined_quality[wave] += 10 * len(quality)
            continue

        stdev = stdev_set[:, wave][good_quality]
        data = flux_set[:, wave][good_quality]

        outliers = C.outlier_clipping(data, stdev, method='pull', clip=3, side=0)

        # Recompute combine at the current wave
        combined_data[wave], wsum = N.average(data[~outliers],
                                              weights=stdev[~outliers]**(-2), returned=True)
        combined_variance[wave] = 1.0 / wsum
        combined_quality[wave] = quality[~outliers].sum() / quality.size

       # Improper use of quality! Just for check and keep track of number of spectra combined
        combined_quality[wave] += 10 * quality[~outliers].size

    return combined_data, combined_variance, combined_quality
[22]:
d_sir, v_sir, q_sir = SIR_combine(flux_set, stdev_set, quality_set)
dd_sir = v_sir ** 0.5

l, = ax.plot(i, d_sir, label='SIR')
ax.fill_between(i, d_sir - dd_sir, d_sir + dd_sir, fc=l.get_color(), alpha=0.2)
ax.legend();
[23]:
t_sir = %timeit -o SIR_combine(flux_set, stdev_set, quality_set)
563 ms ± 66.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
[24]:
def opt_combine(flux_set, stdev_set, quality_set):

    mflux_set = N.ma.masked_where(quality_set == 0, flux_set)
    mstdev_set = N.ma.masked_where(quality_set == 0, stdev_set)

    # BEWARE: C.outlier_clipping modifies mask of input masked array!!!
    identified = C.outlier_clipping(mflux_set.T.copy(), mstdev_set.T, method='pull', clip=3, side=0)
    cflux_set = N.ma.masked_where(identified.T, mflux_set)
    combined_data, combined_stdev = C.sample_weighted_mean(cflux_set.T, mstdev_set.T)

    good_before = mflux_set.count(axis=0)  # Nb of points per sample before clipping
    good_after = cflux_set.count(axis=0)   # Nb of points per sample after clipping
    mquality_set = N.ma.masked_where(identified.T, quality_set)  # identified already includes quality mask
    combined_quality = N.where(good_before <= 2,
                               10 * good_before,
                               mquality_set.sum(axis=0) / good_before + 10 * good_after)

    return combined_data, combined_stdev**2, combined_quality
[25]:
d_opt, v_opt, q_opt = opt_combine(flux_set, stdev_set, quality_set)
dd_opt = v_opt ** 0.5

l, = ax.plot(i, d_opt, label='opt')
ax.fill_between(i, d_opt - dd_opt, d_opt + dd_opt, fc=l.get_color(), alpha=0.2)
ax.legend();
/home/ycopin/Softwares/VirtualEnvs/Python3.6/lib/python3.6/site-packages/inspec/combine.py:249: RuntimeWarning: divide by zero encountered in power
  dxi = wsum**(-0.5)                          # x.shape
[26]:
N.allclose(d_sir, d_opt) and N.allclose(dd_sir, dd_opt) and (q_sir == q_opt).all()
[26]:
True
[27]:
t_opt = %timeit -o opt_combine(flux_set, stdev_set, quality_set)
/home/ycopin/Softwares/VirtualEnvs/Python3.6/lib/python3.6/site-packages/inspec/combine.py:249: RuntimeWarning: divide by zero encountered in power
  dxi = wsum**(-0.5)                          # x.shape
6.23 ms ± 509 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
[28]:
print("Improvement factor opt/SIR:", t_sir.average / t_opt.average)
Improvement factor opt/SIR: 90.35582324368045

Memory profiling

[29]:
%load_ext memory_profiler
[30]:
%%memit

d_w, dd_w = C.sample_weighted_mean(flux_set.T, stdev_set.T)
peak memory: 128.92 MiB, increment: 0.24 MiB
[31]:
%%memit

d_sir, v_sir, q_sir = SIR_combine(flux_set, stdev_set, quality_set)
peak memory: 129.28 MiB, increment: 0.12 MiB
[32]:
%%memit

d_opt, v_opt, q_opt = opt_combine(flux_set, stdev_set, quality_set)
peak memory: 129.64 MiB, increment: 0.36 MiB