Note
Click here to download the full example code
Correlation efficiency
What’s the fastest way to calculate r^2, the squared Pearson correlation coefficient, of two series that contain NaN values?
import random
import numpy as np
import pandas as pd
from scipy import stats
import perfplot
import matplotlib.pyplot as plt
from _benchmarking import plot_results, print_environment
def make_dataset(N):
    """Build two float series of length ``N``, each containing one NaN.

    Parameters
    ----------
    N : int
        Number of points in each series.

    Returns
    -------
    x1 : pandas.Series
        The values ``0 .. N-1`` in order, with label 1 set to NaN.
    x2 : pandas.Series
        A seeded pseudo-random permutation of ``0 .. N-1``, with label 2
        set to NaN.
    """
    # Fixed seed so every call (and every benchmark run) sees the same data.
    random.seed(0)

    # Build the data as float64 from the start: assigning NaN into an int64
    # Series relies on silent upcasting, which pandas has deprecated.
    x1 = pd.Series(np.arange(N, dtype=float))
    x1[1] = np.nan

    # Shuffle a plain NumPy array instead of the Series itself.  The stdlib
    # shuffle only uses len() and item get/set, so the permutation for a
    # given seed is unchanged (the hard-coded reference r^2 used by the
    # sanity check stays valid), but element swaps are far cheaper.
    shuffled = np.arange(N, dtype=float)
    random.shuffle(shuffled)
    x2 = pd.Series(shuffled)
    x2[2] = np.nan
    return x1, x2
def rsq1(args):
    """Return r^2 of two series using ``np.corrcoef``.

    ``args`` is an ``(x1, x2)`` pair.  Positions where either series is
    NaN are masked out before the correlation matrix is computed.
    """
    x1, x2 = args
    x1_valid = ~np.isnan(x1)
    x2_valid = ~np.isnan(x2)
    keep = x1_valid & x2_valid
    corr_matrix = np.corrcoef(x1[keep], x2[keep])
    # The off-diagonal entry is the correlation between the two inputs.
    return corr_matrix[0, 1] ** 2
def rsq2(args):
    """Return r^2 of two series using ``DataFrame.corr``.

    ``args`` is an ``(x1, x2)`` pair.  No explicit NaN mask is applied
    here; missing values are left for pandas to handle.
    """
    x1, x2 = args
    frame = pd.DataFrame({'x1': x1, 'x2': x2})
    corr_table = frame.corr()
    r_value = corr_table.loc['x1', 'x2']
    return r_value ** 2
def rsq3(args):
    """Return r^2 of two series using ``Series.corr``.

    ``args`` is an ``(x1, x2)`` pair.  No explicit NaN mask is applied
    here; missing values are left for pandas to handle.
    """
    first, second = args
    r_value = first.corr(second)
    return r_value ** 2
def rsq4(args):
    """Return r^2 of two series using ``scipy.stats.linregress``.

    ``args`` is an ``(x1, x2)`` pair.  Positions where either series is
    NaN are masked out before the regression is fitted; only the
    correlation coefficient of the fit is used.
    """
    x1, x2 = args
    x1_valid = ~np.isnan(x1)
    x2_valid = ~np.isnan(x2)
    keep = x1_valid & x2_valid
    fit = stats.linregress(x1[keep], x2[keep])
    return fit.rvalue ** 2
def rsq5(args):
    """Return r^2 of two series using ``scipy.stats.pearsonr``.

    ``args`` is an ``(x1, x2)`` pair.  Positions where either series is
    NaN are masked out first; the second element of the ``pearsonr``
    result is ignored.
    """
    x1, x2 = args
    x1_valid = ~np.isnan(x1)
    x2_valid = ~np.isnan(x2)
    keep = x1_valid & x2_valid
    outcome = stats.pearsonr(x1[keep], x2[keep])
    r_value = outcome[0]
    return r_value ** 2
def rsq6(args):
    """Return r^2 of two series from the explicit Pearson formula.

    ``args`` is an ``(x1, x2)`` pair.  Positions where either series is
    NaN are masked out, then r is computed as the sum of products of the
    mean-centred values divided by the product of the root sums of
    squares of each centred series.
    """
    x1, x2 = args
    x1_valid = ~np.isnan(x1)
    x2_valid = ~np.isnan(x2)
    keep = x1_valid & x2_valid
    a = x1[keep]
    b = x2[keep]

    # Deviations of each series from its own mean.
    dev_a = a - a.mean()
    dev_b = b - b.mean()

    cross_sum = (dev_a * dev_b).sum()
    root_ss_a = (dev_a ** 2).sum() ** 0.5
    root_ss_b = (dev_b ** 2).sum() ** 0.5
    r_value = cross_sum / (root_ss_a * root_ss_b)
    return r_value ** 2
# The implementations under comparison, in the order they are labelled below.
kernels = [rsq1, rsq2, rsq3, rsq4, rsq5, rsq6]

# Sanity check: before timing anything, confirm that every implementation
# reproduces the reference r^2 for a fixed 1000-point dataset.
x1, x2 = make_dataset(1000)
expected_rsq = 4.113435053656e-4
for func in kernels:
    actual_rsq = func((x1, x2))
    # Relative deviation from the reference value.
    pct_diff = abs(actual_rsq - expected_rsq) / expected_rsq
    assert pct_diff < 1e-10, func.__name__

# Time each implementation on datasets produced by make_dataset for the
# sizes given by n_range.
results = perfplot.bench(
    setup=make_dataset,
    kernels=kernels,
    labels=[
        "np.corrcoef",
        "df.corr",
        "x1.corr(x2)",
        "stats.linregress",
        "stats.pearsonr",
        "formula",
    ],
    n_range=np.logspace(2, 8, num=7).astype(int),
    show_progress=False,
)

plt.style.use('default')  # reset style changes made by perfplot
figure = plot_results(results, keep=0)
figure.show()

print_environment()
Out:
3.7.9 (default, Aug 31 2020, 12:42:55)
[GCC 7.3.0]
pandas 1.2.2
numpy 1.19.2
matplotlib 3.3.4
scipy 1.6.0
pvlib 0.8.1
Total running time of the script: (91 minutes 55.615 seconds)