random sampling for cabinetry's yield_stdev, but using batches of parameters instead of a list comprehension looping over individual toys
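For contrast, a per-toy version (the pattern this gist replaces) evaluates the model once per sampled parameter point, roughly as in this sketch; `model` and `sampled_parameters` (a hypothetical (num_toys, num_parameters) array) are assumed to exist:

import numpy as np

# one model evaluation per toy: simple, but slow once there are many toys
yields = np.asarray(
    [model.expected_data(pars, include_auxdata=False) for pars in sampled_parameters]
)
yerr = np.std(yields, axis=0)

The batched version below instead hands whole blocks of parameter points to a pyhf model built with a batch_size, so the per-toy Python loop disappears.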
import numpy as np
import pyhf

import shapepolynomial  # custom modifier set specific to this example's model

def random_sampling(
    model: pyhf.pdf.Model,
    parameters: np.ndarray,
    uncertainty: np.ndarray,
    corr_mat: np.ndarray,
    *,
    # allow setting batch size and number of toys independently,
    # since performance drops if arrays are too large for available memory
    num_batches: int = 2,
    batch_size: int = 25_000,
    **model_kwargs,
):
    # other details to forward here?
    model_kwargs.setdefault('modifier_settings', model.config.modifier_settings)
    model_kwargs.setdefault('poi_name', model.config.poi_name)
    # required for my specific example
    model_kwargs.setdefault('validate', False)
    model_kwargs.setdefault('modifier_set', shapepolynomial.modifier_set())
    batched_model = pyhf.Model(model.spec, batch_size=batch_size, **model_kwargs)
    rng = np.random.default_rng(1)  # fixed seed, so repeated calls draw identical toys
    # compute variance iteratively to avoid large lists/arrays
    running_mean = 0.0
    running_variance = 0.0
    # parameter covariance; assumes `uncertainty` is the 1-D array of per-parameter
    # uncertainties, as in cabinetry's yield_stdev (the original expression
    # `uncertainty @ corr_mat @ uncertainty.T` collapses to a scalar for 1-D input)
    covariance = corr_mat * np.outer(uncertainty, uncertainty)
    for i_batch in range(num_batches):
        par_b = rng.multivariate_normal(parameters, covariance, size=batch_size)
        y_b = batched_model.expected_data(par_b, include_auxdata=False)
        mean = np.mean(y_b, axis=0)
        variance = np.var(y_b, axis=0, ddof=1)  # sample variance, as the pooled update below expects
        cumulated_size = i_batch * batch_size
        # pooled sample variance of all previous toys combined with the current batch;
        # every term carries the common 1 / (cumulated_size + batch_size - 1) factor
        running_variance = running_variance * (cumulated_size - 1) / (cumulated_size + batch_size - 1)
        running_variance += variance * (batch_size - 1) / (cumulated_size + batch_size - 1)
        running_variance += (
            (running_mean - mean) ** 2
            * cumulated_size
            * batch_size
            / (cumulated_size + batch_size)
            / (cumulated_size + batch_size - 1)
        )
        # update mean last, otherwise the variance is updated with a wrong mean correction
        running_mean = cumulated_size * running_mean + batch_size * mean
        running_mean /= cumulated_size + batch_size
    yerr_boot = np.sqrt(running_variance)
    # don't bother with the (unused) total uncertainty per channel for now
    return yerr_boot, None
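A possible way to call this, as a sketch: `workspace` is a hypothetical pyhf workspace, and cabinetry's fit results expose bestfit, uncertainty, and corr_mat. For the gist's own model, the custom shapepolynomial modifier set must also be importable.

import cabinetry

# hypothetical setup: build model and data from an existing workspace, then fit
model, data = cabinetry.model_utils.model_and_data(workspace)
fit_results = cabinetry.fit.fit(model, data)
yerr_boot, _ = random_sampling(
    model,
    fit_results.bestfit,
    fit_results.uncertainty,
    fit_results.corr_mat,
    num_batches=4,
    batch_size=10_000,
)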
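The batch-combination step is the standard pooled-variance update: for n_a toys seen so far (mean m_a, sample variance s_a^2) and a new batch of n_b toys (m_b, s_b^2), the combined sample variance is ((n_a - 1) s_a^2 + (n_b - 1) s_b^2 + (m_a - m_b)^2 n_a n_b / (n_a + n_b)) / (n_a + n_b - 1). A standalone check that this streaming update reproduces numpy's variance over the concatenated toys (only numpy assumed):

import numpy as np

rng = np.random.default_rng(0)
batches = [rng.normal(size=(1_000, 3)) for _ in range(5)]

running_mean = 0.0
running_variance = 0.0
seen = 0
for batch in batches:
    n = batch.shape[0]
    mean = batch.mean(axis=0)
    variance = batch.var(axis=0, ddof=1)
    # same update as in random_sampling above
    running_variance = running_variance * (seen - 1) / (seen + n - 1)
    running_variance += variance * (n - 1) / (seen + n - 1)
    running_variance += (running_mean - mean) ** 2 * seen * n / (seen + n) / (seen + n - 1)
    running_mean = (seen * running_mean + n * mean) / (seen + n)
    seen += n

everything = np.concatenate(batches, axis=0)
assert np.allclose(running_mean, everything.mean(axis=0))
assert np.allclose(running_variance, everything.var(axis=0, ddof=1))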