luispedro/santa-clara.py

## santa-clara.py
import pymc3 as pm
from scipy import stats
import numpy as np


NR_TESTS = 3330
POSITIVES = 50
PRE_NEG = 401
PRE_NEG_POS = 2
PRE_PLUS = 37+75+85
PRE_PLUS_POS = 25 + 75 + 78

print("# Pre-Bayesian")

print("{}\t{}".format("Spec","p-value"))
for sp in np.linspace(.983, .995, 10):
    print("{}\t{}".format(sp, 1-stats.binom(NR_TESTS, 1 - sp).cdf(POSITIVES)))

print("\n# Semi-Bayesian")

sp_dist = stats.beta(PRE_NEG - PRE_NEG_POS + 1, PRE_NEG_POS + 1)

cs = []
for _ in range(10_000):
    sp = sp_dist.rvs()
    cs.append(
        stats.binom(NR_TESTS, 1 - sp).rvs())

cs = np.array(cs)
print("Empirical p-value for null: {}".format((cs >= 50).mean()))

print("\n# The full Bayesian")

with pm.Model() as model:

    # Beta(1,1) priors for all the fractions
    sp = pm.Beta('sp', 1,1)
    sn = pm.Beta('sn', 1,1)
    prev = pm.Beta('prev', 1, 1)

    # These were the test evaluations
    pm.Binomial('pre', PRE_NEG, 1 - sp, observed=PRE_NEG_POS)
    pm.Binomial('pre_sn', PRE_PLUS, sn, observed=PRE_PLUS_POS)

    # Total positives in the test set:
    pos = pm.Binomial('pos', NR_TESTS, prev)

    # Some are going to be true positives (at most POSITIVES):
    tp = pm.Bound(pm.Binomial, upper=POSITIVES)('tp', pos, sn)

    # We observe the sum of fp + tp, but it's not possible to directly express
    # it that way in pymc, but it is possible to express it in this equivalent
    # way:
    p = pm.Binomial('fp', NR_TESTS - pos, 1 - sp, observed=(POSITIVES - tp))

    t = pm.sample(10_000)

print("P(pos == 0): {}".format(np.mean(t.pos==0)))

t_prev = t.prev.copy()

for name, arr in [
            ('prevalence', t.prev),
            ('positives', t.pos)]:
    arr = arr.copy()
    arr.sort()
    print("Credible interval for {}: {} -- {}".format(
        name,
        arr[len(arr)//20],
        arr[-len(arr)//20]))


# OUTPUT

# # Pre-Bayesian
# Spec	p-value
# 0.983	0.7916561584607545
# 0.9843333333333333	0.5837802613783531
# 0.9856666666666667	0.3359586144295388
# 0.987	0.13589251160874583
# 0.9883333333333333	0.03419905470535767
# 0.9896666666666667	0.004607969061608097
# 0.991	0.000272379260707889
# 0.9923333333333333	5.3119956933134205e-06
# 0.9936666666666667	2.1987185627736494e-08
# 0.995	9.120371124993198e-12
#
# # Semi-Bayesian
# Empirical p-value for null: 0.0732
#
# # The full Bayesian
# P(pos == 0): 0.0
# Credible interval for prevalence: 0.004510854444494588 -- 0.016812474319518215
# Credible interval for positives: 15 -- 51
	import pymc3 as pm
	from scipy import stats
	import numpy as np


	NR_TESTS = 3330
	POSITIVES = 50
	PRE_NEG = 401
	PRE_NEG_POS = 2
	PRE_PLUS = 37+75+85
	PRE_PLUS_POS = 25 + 75 + 78

	print("# Pre-Bayesian")

	print("{}\t{}".format("Spec","p-value"))
	for sp in np.linspace(.983, .995, 10):
	print("{}\t{}".format(sp, 1-stats.binom(NR_TESTS, 1 - sp).cdf(POSITIVES)))

	print("\n# Semi-Bayesian")

	sp_dist = stats.beta(PRE_NEG - PRE_NEG_POS + 1, PRE_NEG_POS + 1)

	cs = []
	for _ in range(10_000):
	sp = sp_dist.rvs()
	cs.append(
	stats.binom(NR_TESTS, 1 - sp).rvs())

	cs = np.array(cs)
	print("Empirical p-value for null: {}".format((cs >= 50).mean()))

	print("\n# The full Bayesian")

	with pm.Model() as model:

	# Beta(1,1) priors for all the fractions
	sp = pm.Beta('sp', 1,1)
	sn = pm.Beta('sn', 1,1)
	prev = pm.Beta('prev', 1, 1)

	# These were the test evaluations
	pm.Binomial('pre', PRE_NEG, 1 - sp, observed=PRE_NEG_POS)
	pm.Binomial('pre_sn', PRE_PLUS, sn, observed=PRE_PLUS_POS)

	# Total positives in the test set:
	pos = pm.Binomial('pos', NR_TESTS, prev)

	# Some are going to be true positives (at most POSITIVES):
	tp = pm.Bound(pm.Binomial, upper=POSITIVES)('tp', pos, sn)

	# We observe the sum of fp + tp, but it's not possible to directly express
	# it that way in pymc, but it is possible to express it in this equivalent
	# way:
	p = pm.Binomial('fp', NR_TESTS - pos, 1 - sp, observed=(POSITIVES - tp))

	t = pm.sample(10_000)

	print("P(pos == 0): {}".format(np.mean(t.pos==0)))

	t_prev = t.prev.copy()

	for name, arr in [
	('prevalence', t.prev),
	('positives', t.pos)]:
	arr = arr.copy()
	arr.sort()
	print("Credible interval for {}: {} -- {}".format(
	name,
	arr[len(arr)//20],
	arr[-len(arr)//20]))


	# OUTPUT

	# # Pre-Bayesian
	# Spec p-value
	# 0.983 0.7916561584607545
	# 0.9843333333333333 0.5837802613783531
	# 0.9856666666666667 0.3359586144295388
	# 0.987 0.13589251160874583
	# 0.9883333333333333 0.03419905470535767
	# 0.9896666666666667 0.004607969061608097
	# 0.991 0.000272379260707889
	# 0.9923333333333333 5.3119956933134205e-06
	# 0.9936666666666667 2.1987185627736494e-08
	# 0.995 9.120371124993198e-12
	#
	# # Semi-Bayesian
	# Empirical p-value for null: 0.0732
	#
	# # The full Bayesian
	# P(pos == 0): 0.0
	# Credible interval for prevalence: 0.004510854444494588 -- 0.016812474319518215
	# Credible interval for positives: 15 -- 51