"""Bayesian Recurrent Neural Network Implementation

An implementation of the `Local reparameterization trick`
from Kingma, Salimans & Welling and
Bayesian RNN
from Fortunato, Blundell & Vinyals
"""
import os
import time
import copy
from os.path import join as pjoin
from six.moves import xrange
import numpy as np
import tensorflow as tf
from tensorflow.python.util import nest
from tensorflow.python.ops.rnn_cell import LSTMStateTuple
from loader import TextLoader, noise_batch
import logging
import sys
flags = tf.flags

# Settings

# Model size and unrolling.
flags.DEFINE_integer("hidden_dim", 512, "hidden dimension")
flags.DEFINE_integer("layers", 2, "number of hidden layers")
flags.DEFINE_integer("unroll", 35, "number of time steps to unroll for BPTT")
flags.DEFINE_integer("batch_size", 20, "batch size")

# Initialization and the Gaussian prior (pi, log_sigma1, log_sigma2 feed MixturePrior).
flags.DEFINE_float("init_scale", 0.1, "scale for random initialization")
flags.DEFINE_float("bayes_init_scale", 0.05, "scale for random initialization")
flags.DEFINE_float("pi", 0.25, "mixture parameter on the Gaussians")
flags.DEFINE_float("log_sigma1", -1.0, "log sigma for the first mixture comp")
flags.DEFINE_float("log_sigma2", -7.0, "log sigma for the second mixture comp")

# Learning-rate schedule.
flags.DEFINE_float("learning_rate", 1.0, "initial learning rate")
flags.DEFINE_float("learning_rate_decay", 0.9, "amount to decrease learning rate")
flags.DEFINE_float("decay_threshold", 0.0, "decrease learning rate if validation cost difference less than this value")
flags.DEFINE_integer("max_decays", 8, "stop decreasing learning rate after this many times")

# Regularization and input noising.
flags.DEFINE_float("drop_prob", 0.0, "probability of dropping units")
flags.DEFINE_float("gamma", 0.0, "probability of noising input data")
flags.DEFINE_float("norm_scale", 0.1, "hyperparameter on ixh")
flags.DEFINE_boolean("tied", False, "train with weight tying or not")
flags.DEFINE_integer("epoch", 0, "which epoch to load model from")
flags.DEFINE_boolean("absolute_discounting", False, "scale gamma by absolute discounting factor")

# Optimization.
flags.DEFINE_integer("max_epochs", 400, "maximum number of epochs to train")
flags.DEFINE_float("clip_norm", 5.0, "value at which to clip gradients")
flags.DEFINE_string("optimizer", "sgd", "optimizer")

# Experiment bookkeeping and data selection.
flags.DEFINE_string("run_dir", "sandbox", "directory to store experiment outputs")
flags.DEFINE_string("token_type", "word", "use word or character tokens")
flags.DEFINE_string("scheme", "blank", "use blank or ngram noising scheme")
flags.DEFINE_string("ngram_scheme", "unigram", "use {unigram, uniform, bgkn, mbgkn}")
flags.DEFINE_string("restore_checkpoint", None, "checkpoint file to restore model parameters from")
flags.DEFINE_integer("seed", 123, "random seed to use")
flags.DEFINE_integer("steps_per_summary", 10, "how many steps between writing summaries")
flags.DEFINE_boolean("final", False, "final evaluation (run on test after picked best model)")
flags.DEFINE_string("dataset", "ptb", "ptb or text8 or wikitext2")

# Parsed-flag accessor; main() reads every setting through this name.
FLAGS = flags.FLAGS
def get_optimizer(name):
    """Return the TensorFlow optimizer class selected by ``name``.

    Args:
        name: optimizer identifier, either "sgd" or "adam".

    Returns:
        The optimizer class itself (not an instance); the caller
        instantiates it.

    Raises:
        ValueError: if ``name`` is not one of the supported identifiers.
    """
    if name == "sgd":
        return tf.train.GradientDescentOptimizer
    if name == "adam":
        return tf.train.AdamOptimizer
    # Raise explicitly rather than `assert False`: assertions are stripped
    # under `python -O`, which would make an unknown name return None.
    raise ValueError("unknown optimizer %r (expected 'sgd' or 'adam')" % (name,))
# Getting stale file handle errors
def log_info(s):
    """Log ``s`` at INFO level without letting an IOError abort the run.

    Args:
        s: the message to log.

    Returns:
        None.
    """
    try:
        logging.info(s)
    except IOError:
        # Best effort: the log file may sit on a filesystem that hands back
        # stale handles. Keep the message visible on stderr rather than
        # crashing a long training job over a logging failure.
        sys.stderr.write("%s\n" % (s,))
class MixturePrior(object):
    """Zero-mean Gaussian prior with a single blended standard deviation.

    The two component scales ``exp(log_sigma1)`` and ``exp(log_sigma2)`` are
    combined with weights ``pi`` and ``1 - pi`` into one value, ``sigma_mix``,
    which is then used as the prior's standard deviation.
    """

    def __init__(self, pi, log_sigma1, log_sigma2):
        self.mean = 0
        first_component = pi * tf.exp(log_sigma1)
        second_component = (1 - pi) * tf.exp(log_sigma2)
        self.sigma_mix = first_component + second_component

    def get_kl_divergence(self, gaussian1):
        """Return the mean element-wise KL(gaussian1 || prior).

        Args:
            gaussian1: a ``(mean, sigma)`` pair. ``sigma`` is a standard
                deviation, not a log standard deviation; the log is taken
                here.

        Returns:
            A scalar tensor: the closed-form Gaussian KL divergence averaged
            over all elements.
        """
        posterior_mean, posterior_sigma = gaussian1
        prior_mean, prior_sigma = self.mean, self.sigma_mix

        log_scale_ratio = tf.log(prior_sigma) - tf.log(posterior_sigma)
        numerator = tf.square(posterior_sigma) + tf.square(posterior_mean - prior_mean)
        quadratic_term = numerator / (2 * tf.square(prior_sigma))

        per_element_kl = log_scale_ratio + quadratic_term - 0.5
        return tf.reduce_mean(per_element_kl)
# should only use inside RNN
def get_random_normal_variable(name, mean, prior, shape, dtype):
    """
    A wrapper around tf.get_variable which lets you get a "variable" which is
    explicitly a sample from a normal distribution.

    Two variables are created in the current variable scope,
    ``name + "_mean"`` and ``name + "_standard_deviation"``. The latter stores
    a pre-softplus value, so the effective standard deviation is
    ``softplus(raw) + 1e-5`` and is always positive.

    Args:
        name: prefix for the two created variables.
        mean: not used to build the graph here; the local of the same name is
            rebound to the mean variable.
        prior: object exposing ``sigma_mix``; it sets the range from which the
            raw standard deviation is initialised.
        shape: shape of the sampled weights.
        dtype: dtype of the variables and of the noise sample.

    Returns:
        A ``(weights, mean, standard_deviation)`` tuple, where
        ``weights = mean + standard_deviation * eps`` with ``eps ~ N(0, 1)``.
    """
    # Inverse of a softplus function, so that the value of the standard deviation
    # will be equal to what the user specifies, but we can still enforce positivity
    # by wrapping the standard deviation in the softplus function.
    # it's important to initialize variances with care, otherwise the model takes too long to converge
    rho_max_init = tf.log(tf.exp(prior.sigma_mix / 2.0) - 1.0)
    rho_min_init = tf.log(tf.exp(prior.sigma_mix / 4.0) - 1.0)
    std_init = tf.random_uniform_initializer(rho_min_init, rho_max_init)

    # this is constant, original paper/email is not constant
    # NOTE(review): no explicit initializer is given for the mean, so it takes
    # the enclosing variable scope's default -- confirm this is intended.
    mean = tf.get_variable(name + "_mean", shape,
                           dtype=dtype)
    standard_deviation = tf.get_variable(name + "_standard_deviation", shape,
                                         initializer=std_init, dtype=dtype)

    # Map the unconstrained variable to a strictly positive standard deviation.
    standard_deviation = tf.nn.softplus(standard_deviation) + 1e-5
    weights = mean + (standard_deviation * tf.random_normal(shape, 0.0, 1.0, dtype))
    return weights, mean, standard_deviation
class BayesianLSTMCell(tf.nn.rnn_cell.BasicLSTMCell):
    """LSTM cell whose gate weight matrix and bias are Gaussian samples.

    The sampled weights are created once (on first use) and cached on the
    instance, so the same sample is reused at every time step. When
    ``is_training`` is false the mean parameters are used instead of a sample.
    """

    def __init__(self, num_units, prior, is_training=True, forget_bias=1.0, input_size=None, state_is_tuple=True,
                 activation=tf.tanh):
        """Store the prior and sampling mode, then defer to BasicLSTMCell.

        Args:
            num_units: number of LSTM units.
            prior: object providing ``sigma_mix`` and ``get_kl_divergence``.
            is_training: use sampled weights if true, the means otherwise.
            forget_bias, input_size, state_is_tuple, activation: forwarded
                unchanged to ``BasicLSTMCell.__init__``.
        """
        # once generated they stay the same across time-steps
        # must construct different cell for each layer
        self.is_training = is_training
        self.prior = prior
        self.W, self.b = None, None
        self.W_mu, self.W_std = None, None
        self.b_mu, self.b_std = None, None
        super(BayesianLSTMCell, self).__init__(num_units, forget_bias, input_size, state_is_tuple, activation)

    # we'll see if this implementation is correct
    def get_W(self, total_arg_size, output_size, dtype):
        """Return the gate weight matrix, creating its variables on first call."""
        with tf.variable_scope("CellWeight"):
            if self.W is None:
                # can use its own init_scale
                self.W, self.W_mu, self.W_std = get_random_normal_variable("Matrix", 0.0, self.prior,
                                                                           [total_arg_size, output_size], dtype=dtype)
        if self.is_training:
            return self.W
        return self.W_mu

    def get_b(self, output_size, dtype):
        """Return the gate bias vector, creating its variables on first call."""
        with tf.variable_scope("CellBias"):
            if self.b is None:
                self.b, self.b_mu, self.b_std = get_random_normal_variable("Bias", 0.0, self.prior,
                                                                           [output_size], dtype=dtype)
        if self.is_training:
            return self.b
        return self.b_mu  # at evaluation time we only do MAP (on mean value)

    def get_kl(self):
        """Return the KL divergence of this cell's weight and bias posteriors.

        Raises:
            ValueError: if the cell has not been called yet, so its weight
                variables do not exist.
        """
        # compute KL divergence internally (more modular code)
        if self.W_mu is None or self.b_mu is None:
            raise ValueError("get_kl() called before the cell's weights were created; "
                             "run the cell on an input first")
        theta_kl = self.prior.get_kl_divergence((self.W_mu, self.W_std))
        theta_kl += self.prior.get_kl_divergence((self.b_mu, self.b_std))
        return theta_kl

    def stochastic_linear(self, args, output_size, bias, bias_start=0.0, scope=None):
        """Compute ``concat(args) . W (+ b)`` using this cell's W and b.

        Args:
            args: a 2D tensor or a non-empty sequence of 2D tensors.
            output_size: second dimension of the weight matrix.
            bias: whether to add the bias term.
            bias_start: accepted for signature compatibility; unused here.
            scope: optional variable scope name (defaults to "Linear").

        Returns:
            A 2D tensor with ``output_size`` columns.

        Raises:
            ValueError: if ``args`` is missing/empty, or any argument is not
                2D with a known second dimension.
        """
        # Local reparameterization trick
        if args is None or (nest.is_sequence(args) and not args):
            raise ValueError("`args` must be specified")
        if not nest.is_sequence(args):
            args = [args]

        # Calculate the total size of arguments on dimension 1.
        total_arg_size = 0
        shapes = [a.get_shape().as_list() for a in args]
        for shape in shapes:
            if len(shape) != 2:
                raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes))
            if not shape[1]:
                raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes))
            total_arg_size += shape[1]

        dtype = args[0].dtype

        # Now the computation.
        with tf.variable_scope(scope or "Linear"):
            matrix = self.get_W(total_arg_size, output_size, dtype=dtype)
            if len(args) == 1:
                res = tf.matmul(args[0], matrix)
            else:
                # Old-style tf.concat(axis, values) argument order, as used
                # throughout this file.
                res = tf.matmul(tf.concat(1, args), matrix)
            if not bias:
                return res
            bias_term = self.get_b(output_size, dtype=dtype)
        return res + bias_term

    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)."""
        with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
            # Parameters of gates are concatenated into one multiply for efficiency.
            if self._state_is_tuple:
                c, h = state
            else:
                c, h = tf.split(1, 2, state)
            concat = self.stochastic_linear([inputs, h], 4 * self._num_units, True)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = tf.split(1, 4, concat)

            new_c = (c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) *
                     self._activation(j))
            new_h = self._activation(new_c) * tf.sigmoid(o)

            if self._state_is_tuple:
                new_state = LSTMStateTuple(new_c, new_h)
            else:
                new_state = tf.concat(1, [new_c, new_h])
            return new_h, new_state
# NOTE(review): this function is incomplete as it appears in the file -- the
# per-time-step loop below has no body and nothing is returned. Its intended
# computation cannot be determined from here, and no caller is visible in
# this file; restore the body from version control or remove the function.
def compute_info_loss(outputs, cell, unroll):
# outputs: (batch_size, time, hidden_size)
for time_step in range(unroll):
class LanguageModel(object):
    """Graph for a multi-layer Bayesian LSTM language model.

    Construction builds the placeholders, the unrolled RNN, a Bayesian softmax
    layer, the NLL loss (``self.loss``) and the KL term (``self.kl_loss``).
    When ``is_training`` is true it also builds the learning-rate variable,
    the clipped-gradient train op (``self._train_op``) and scalar summaries.
    """

    def __init__(self, flags, vocab_size, is_training=True):
        """Build the model graph.

        Args:
            flags: settings object; reads batch_size, unroll, hidden_dim,
                layers, pi, log_sigma1, log_sigma2, clip_norm and optimizer.
            vocab_size: number of tokens in the vocabulary.
            is_training: if true, sample weights and build the training ops;
                otherwise use the weight means and stop after the loss terms.
        """
        batch_size = flags.batch_size
        unroll = flags.unroll

        self._x = tf.placeholder(tf.int32, [batch_size, unroll])
        self._y = tf.placeholder(tf.int32, [batch_size, unroll])
        self._len = tf.placeholder(tf.int32, [None, ])

        prior = MixturePrior(flags.pi, flags.log_sigma1, flags.log_sigma2)

        # use Bayesian LSTM Cell instead
        # under this, epsilon is sampled once in all time steps, and each layer is different
        multi_layer_cell = []
        for _ in range(flags.layers):
            lstm_cell = BayesianLSTMCell(flags.hidden_dim, prior, is_training=is_training,
                                         forget_bias=1.0, state_is_tuple=True)
            # Each layer needs its own cell instance (own weight sample).
            multi_layer_cell.append(lstm_cell)

        cell = tf.nn.rnn_cell.MultiRNNCell(multi_layer_cell, state_is_tuple=True)
        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            self.embeddings = tf.get_variable("embeddings", [vocab_size, flags.hidden_dim])
            inputs = tf.nn.embedding_lookup(self.embeddings, self._x)

        # These options (fixed unroll or dynamic_rnn) should give same results but
        # using fixed here since faster
        if True:
            outputs = []
            state = self._initial_state
            with tf.variable_scope("RNN"):
                for time_step in range(unroll):
                    if time_step > 0:
                        tf.get_variable_scope().reuse_variables()
                    (cell_output, state) = cell(inputs[:, time_step, :], state)
                    outputs.append(cell_output)
            outputs = tf.reshape(tf.concat(1, outputs), [-1, flags.hidden_dim])
        else:
            # NOTE(review): unused alternative path. The original call carried
            # at least one further keyword argument that is not recoverable
            # here -- confirm before enabling this branch.
            with tf.variable_scope("RNN"):
                outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=self._len,
                                                   initial_state=self._initial_state, dtype=tf.float32)
            outputs = tf.reshape(outputs, [-1, flags.hidden_dim])

        softmax_w, softmax_w_mu, softmax_w_std = get_random_normal_variable("softmax_w", 0., prior,
                                                                            [flags.hidden_dim, vocab_size], dtype=tf.float32)
        softmax_b, softmax_b_mu, softmax_b_std = get_random_normal_variable("softmax_b", 0., prior,
                                                                            [vocab_size], dtype=tf.float32)

        # Sampled softmax parameters while training, their means otherwise.
        if is_training:
            logits = tf.matmul(outputs, softmax_w) + softmax_b
        else:
            logits = tf.matmul(outputs, softmax_w_mu) + softmax_b_mu

        seq_loss = tf.nn.seq2seq.sequence_loss_by_example(
            [tf.reshape(logits, [-1, vocab_size])],
            [tf.reshape(self._y, [-1])],
            [tf.ones([batch_size * unroll])])

        # NLL loss
        self.loss = tf.reduce_sum(seq_loss) / batch_size

        # KL loss
        self.kl_loss = 0.0
        for i in range(flags.layers):
            self.kl_loss += multi_layer_cell[i].get_kl()
        self.kl_loss += prior.get_kl_divergence((softmax_w_mu, softmax_w_std))
        self.kl_loss += prior.get_kl_divergence((softmax_b_mu, softmax_b_std))

        # if these don't really work, we can remove input dropout
        # and make softmax projection bayesian as well
        self._final_state = state
        self.ixh = tf.global_norm([outputs])

        # Evaluation-only models stop here: no optimizer, no summaries.
        if not is_training:
            return

        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        shapes = [tvar.get_shape().as_list() for tvar in tvars]
        log_info("# params: %d" % np.sum([np.prod(s) for s in shapes]))

        # careful at this part
        grads = tf.gradients(self.loss + self.kl_loss, tvars)  # + FLAGS.norm_scale * self.ixh
        if flags.clip_norm is not None:
            grads, grads_norm = tf.clip_by_global_norm(grads, flags.clip_norm)
        else:
            grads_norm = tf.global_norm(grads)

        optimizer = get_optimizer(flags.optimizer)(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Summaries for TensorBoard, note this is only within training portion
        with tf.name_scope("summaries"):
            tf.scalar_summary("loss", self.loss / unroll)
            tf.scalar_summary("grads_norm", grads_norm)

    def set_lr(self, session, lr_value):
        """Assign ``lr_value`` to the learning-rate variable in ``session``.

        Only valid on a model built with ``is_training=True``; other models
        have no ``lr`` attribute.
        """
        session.run(tf.assign(self.lr, lr_value))
def run_epoch(epoch_ind, session, model, loader, split, update_op, flags,
              writer=None, summary_op=None, verbose=True):
    """Run an epoch of training/testing

    Args:
        epoch_ind: index of the current epoch; used to offset summary steps.
        session: TensorFlow session in which to run the model.
        model: model exposing ``loss``, ``ixh``, ``_x``, ``_y``, ``_len``,
            ``_initial_state`` and ``_final_state``.
        loader: data loader providing ``get_num_batches(split)`` and
            ``get_batch(split, k)``.
        split: name of the data split; "train" additionally noises batches.
        update_op: op run with every batch (train op, or a no-op for eval).
        flags: settings object; reads gamma, batch_size, unroll and
            steps_per_summary.
        writer: optional summary writer.
        summary_op: optional merged-summary op; summaries are fetched only
            when both ``writer`` and ``summary_op`` are given.
        verbose: whether to log progress during the epoch.

    Returns:
        The perplexity over the epoch, ``exp(total_cost / iters)``.
    """
    epoch_size = loader.get_num_batches(split)
    start_time = time.time()
    total_cost = 0.0
    total_ixh = 0.0
    # Start from the model's zero state; afterwards each batch is fed the
    # final state of the previous one.
    state = session.run(model._initial_state)
    iters = 0

    # Guard the progress interval: `epoch_size // 10` is 0 for epochs shorter
    # than 10 batches, which made the modulo below raise ZeroDivisionError
    # (even with verbose=False, since the modulo was evaluated first).
    report_every = max(epoch_size // 10, 1)

    for k in xrange(epoch_size):
        x, y = loader.get_batch(split, k)

        if split == "train":
            gamma = flags.gamma
            x, y = noise_batch(x, y, flags, loader, gamma=gamma)

        seq_len = [y.shape[1]] * flags.batch_size
        fetches = [model.loss, update_op, model.ixh, model._final_state]
        feed_dict = {model._x: x,
                     model._y: y,
                     model._len: seq_len,
                     model._initial_state: state}

        if summary_op is not None and writer is not None:
            fetches = [summary_op] + fetches
            summary, cost, _, ixh, state = session.run(fetches, feed_dict)
            if k % flags.steps_per_summary == 0:
                writer.add_summary(summary, epoch_size * epoch_ind + k)
        else:
            cost, _, ixh, state = session.run(fetches, feed_dict)

        total_cost += cost
        total_ixh += ixh
        iters += flags.unroll

        if verbose and k % report_every == 10:
            log_info("%.3f perplexity: %.3f ixh: %.3f speed: %.0f tps" %
                     (k * 1.0 / epoch_size, np.exp(total_cost / iters),
                      total_ixh / iters,
                      iters * flags.batch_size / (time.time() - start_time)))

    return np.exp(total_cost / iters)
# Entry point: builds the train/valid/test models, trains with validation-driven
# learning-rate decay, and finally evaluates the best checkpoint on the test split.
#
# NOTE(review): this function is damaged as it appears in the file. All
# indentation is missing and several statements are truncated or absent (they
# are pointed out inline below). The code is left untouched here; restore it
# from version control before relying on it.
def main(_):
# NOTE(review): the body of this `if` (presumably creating run_dir) is missing.
if not os.path.exists(FLAGS.run_dir):
# NOTE(review): file_handler is created but never attached to a logger in the
# visible code -- the logging setup lines appear to be missing.
file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.run_dir))
# NOTE(review): both dataset branches are empty and DATA_PATHS, used below, is
# not defined anywhere in the visible file -- presumably it was assigned here.
if FLAGS.dataset == "ptb":
elif FLAGS.dataset == "text8":
# NOTE(review): this call is truncated; the later TextLoader call passes
# FLAGS.token_type as the fourth argument.
data_loader = TextLoader(DATA_PATHS, FLAGS.batch_size, FLAGS.unroll,
vocab_size = len(data_loader.token_to_id)
log_info("Vocabulary size: %d" % vocab_size)
# The test model is evaluated one token at a time (batch_size = unroll = 1).
eval_flags = copy.deepcopy(FLAGS)
eval_flags.batch_size = 1
eval_flags.unroll = 1
with tf.Graph().as_default(), tf.Session() as session:
initializer = tf.random_uniform_initializer(-FLAGS.init_scale, FLAGS.init_scale, seed=FLAGS.seed)
# Create training, validation, and evaluation models
# The first scope creates the variables; the second reuses them (reuse=True),
# so all three models share parameters.
with tf.variable_scope("model", reuse=None, initializer=initializer):
mtrain = LanguageModel(FLAGS, vocab_size, is_training=True)
with tf.variable_scope("model", reuse=True, initializer=initializer):
mvalid = LanguageModel(FLAGS, vocab_size, is_training=False)
mtest = LanguageModel(eval_flags, vocab_size, is_training=False)
summary_op = tf.merge_all_summaries()
train_writer = tf.train.SummaryWriter(FLAGS.run_dir)
model_saver = tf.train.Saver(max_to_keep=FLAGS.max_epochs)
# NOTE(review): no variable-initialisation step is visible for the case where
# restore_checkpoint is None -- an `else:` branch appears to be missing.
if FLAGS.restore_checkpoint is not None:
model_saver.restore(session, FLAGS.restore_checkpoint)
lr = FLAGS.learning_rate
decay_count = 0
prev_valid_perplexity = None
valid_perplexities = list()
k = best_epoch = -1
# NOTE(review): the nesting of the statements below cannot be recovered from
# this file; the comments describe each statement, not the control flow.
for k in xrange(FLAGS.max_epochs):
mtrain.set_lr(session, lr)
log_info("Epoch %d, learning rate %f" % (k, lr))
train_perplexity = run_epoch(k, session, mtrain, data_loader, "train",
mtrain._train_op, FLAGS, writer=train_writer, summary_op=summary_op)
log_info("Epoch: %d Train Perplexity: %.3f" % (k, train_perplexity))
valid_perplexity = run_epoch(k, session, mvalid, data_loader, "valid",
tf.no_op(), FLAGS, verbose=False)
log_info("Epoch: %d Valid Perplexity: %.3f" % (k, valid_perplexity))
# Decay the learning rate when the best validation log-perplexity exceeds the
# current one by less than decay_threshold, then reload the best checkpoint.
if prev_valid_perplexity != None and \
np.log(best_valid_perplexity) - np.log(valid_perplexity) < FLAGS.decay_threshold:
lr = lr * FLAGS.learning_rate_decay
decay_count += 1
log_info("Loading epoch %d parameters, perplexity %f" % \
(best_epoch, best_valid_perplexity))
model_saver.restore(session, pjoin(FLAGS.run_dir, "model_epoch%d.ckpt" % best_epoch))
prev_valid_perplexity = valid_perplexity
# NOTE(review): valid_perplexities is never appended to in the visible code,
# so np.min below would see an empty list -- an append appears to be missing.
if valid_perplexity <= np.min(valid_perplexities):
best_epoch = k
best_valid_perplexity = valid_perplexities[best_epoch]
# NOTE(review): the statement below is truncated (the call before the comma is
# missing); presumably it saves the session through model_saver.
save_path =, pjoin(FLAGS.run_dir, "model_epoch%d.ckpt" % k))
log_info("Saved model to file: %s" % save_path)
# NOTE(review): no `break` is visible after this message, although it says
# training quits.
if decay_count > FLAGS.max_decays:
log_info("Reached maximum number of decays, quiting after epoch %d" % k)
# No epoch was recorded as best: fall back to the epoch given by --epoch.
if best_epoch == -1:
assert FLAGS.epoch != 0
best_epoch = k = FLAGS.epoch
best_valid_perplexity = 0
log_info("Loading epoch %d parameters, perplexity %f" % \
(best_epoch, best_valid_perplexity))
model_saver.restore(session, pjoin(FLAGS.run_dir, "model_epoch%d.ckpt" % best_epoch))
# Re-create the loader with the evaluation batch size / unroll for the test run.
data_loader = TextLoader(DATA_PATHS, eval_flags.batch_size, eval_flags.unroll, FLAGS.token_type)
test_perplexity = run_epoch(k, session, mtest, data_loader, "test",
tf.no_op(), eval_flags, verbose=False)
log_info("Test Perplexity: %.3f" % test_perplexity)
if __name__ == "__main__":
    # tf.app.run() parses the command-line flags defined above and then
    # calls main() with the remaining argv.
    tf.app.run()
