Correlation efficiency

What’s the fastest way to calculate r^2 (the squared Pearson correlation coefficient) for two series that contain NaN values?

import random
import numpy as np
import pandas as pd
from scipy import stats
import perfplot
import matplotlib.pyplot as plt
from _benchmarking import plot_results, print_environment


def make_dataset(N):
    """Build a reproducible pair of length-``N`` series with one NaN each.

    Parameters
    ----------
    N : int
        Number of elements in each series.

    Returns
    -------
    tuple of pandas.Series
        ``(x1, x2)``: ``x1`` holds ``0..N-1`` in order with a NaN at label 1;
        ``x2`` holds a seeded shuffle of ``0..N-1`` with a NaN at label 2.
        Both are float64.
    """
    # Fixed seed so every call with the same N yields the same shuffle.
    random.seed(0)
    # Create the series as float up front: assigning NaN into an int64
    # Series relies on silent upcasting, which newer pandas deprecates.
    # The resulting values are the same as with the int -> float upcast.
    x1 = pd.Series(range(N), dtype=float)
    x1[1] = np.nan
    x2 = pd.Series(range(N), dtype=float)
    random.shuffle(x2)
    # The NaNs sit at different positions so the pairwise filter has to
    # drop a row because of each input.
    x2[2] = np.nan
    return x1, x2


def rsq1(args):
    """Return r^2 computed with ``np.corrcoef`` on the pairwise non-NaN rows.

    ``args`` is a ``(x1, x2)`` pair of series.
    """
    first, second = args
    # Keep only the positions where both inputs are present.
    both_present = ~np.isnan(first) & ~np.isnan(second)
    corr_matrix = np.corrcoef(first[both_present], second[both_present])
    # The off-diagonal entry is the correlation between the two inputs.
    r = corr_matrix[0, 1]
    return r**2


def rsq2(args):
    """Return r^2 read from the correlation matrix of a two-column DataFrame.

    ``args`` is a ``(x1, x2)`` pair of series.
    """
    first, second = args
    frame = pd.DataFrame({'x1': first, 'x2': second})
    corr_matrix = frame.corr()
    r = corr_matrix.loc['x1', 'x2']
    return r**2


def rsq3(args):
    """Return r^2 computed with ``Series.corr``.

    ``args`` is a ``(x1, x2)`` pair of series.
    """
    first, second = args
    r = first.corr(second)
    return r**2


def rsq4(args):
    """Return r^2 from ``stats.linregress`` on the pairwise non-NaN rows.

    ``args`` is a ``(x1, x2)`` pair of series.
    """
    first, second = args
    # Keep only the positions where both inputs are present.
    both_present = ~np.isnan(first) & ~np.isnan(second)
    fit = stats.linregress(first[both_present], second[both_present])
    # Only the correlation coefficient of the fit is needed.
    return fit.rvalue**2


def rsq5(args):
    """Return r^2 from ``stats.pearsonr`` on the pairwise non-NaN rows.

    ``args`` is a ``(x1, x2)`` pair of series.
    """
    first, second = args
    # Keep only the positions where both inputs are present.
    both_present = ~np.isnan(first) & ~np.isnan(second)
    # pearsonr yields (r, p-value); only r is used.
    r = stats.pearsonr(first[both_present], second[both_present])[0]
    return r**2


def rsq6(args):
    """Return r^2 from the explicit Pearson formula on the non-NaN rows.

    ``args`` is a ``(x1, x2)`` pair of series. The correlation is the sum of
    products of deviations from the mean, divided by the product of the
    square roots of the summed squared deviations.
    """
    first, second = args
    # Keep only the positions where both inputs are present.
    both_present = ~np.isnan(first) & ~np.isnan(second)
    first = first[both_present]
    second = second[both_present]

    # Deviations of each series from its own mean.
    dev_first = first - first.mean()
    dev_second = second - second.mean()

    cross_sum = (dev_first * dev_second).sum()
    norm_first = ((dev_first**2).sum())**0.5
    norm_second = ((dev_second**2).sum())**0.5
    scale = norm_first * norm_second
    return (cross_sum / scale)**2


# Reference dataset and the r^2 value every implementation must reproduce.
x1, x2 = make_dataset(1000)
expected_rsq = 4.113435053656e-4

# Sanity check before timing: each kernel must agree with the reference
# value to a relative error below 1e-10 (pct_diff is a fraction, not a %).
for func in (rsq1, rsq2, rsq3, rsq4, rsq5, rsq6):
    actual_rsq = func((x1, x2))
    pct_diff = abs(actual_rsq - expected_rsq) / expected_rsq
    assert pct_diff < 1e-10, func.__name__

# Time every kernel on datasets of 10^2 .. 10^8 elements; make_dataset is
# passed as the setup step and its output is what each kernel receives.
results = perfplot.bench(
    setup=make_dataset,
    kernels=[rsq1, rsq2, rsq3, rsq4, rsq5, rsq6],
    labels=[
        "np.corrcoef",
        "df.corr",
        "x1.corr(x2)",
        "stats.linregress",
        "stats.pearsonr",
        "formula",
    ],
    n_range=np.logspace(2, 8, num=7).astype(int),
    show_progress=False,
)

plt.style.use('default')  # reset style changes made by perfplot
figure = plot_results(results, keep=0)
figure.show()
[Figure: correlation efficiency]
# Print the interpreter and package versions this benchmark ran with
# (helper imported from _benchmarking; its output is the listing below).
print_environment()

Out:

3.7.9 (default, Aug 31 2020, 12:42:55)
[GCC 7.3.0]
pandas 1.2.2
numpy 1.19.2
matplotlib 3.3.4
scipy 1.6.0
pvlib 0.8.1

Total running time of the script: ( 91 minutes 55.615 seconds)

Gallery generated by Sphinx-Gallery