Created
October 2, 2014 07:28
-
-
Save syhw/fc66aafd2f0bd3b64cfd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A deep neural network with or w/o dropout in one file. | |
""" | |
import numpy | |
import theano | |
import sys | |
import math | |
from theano import tensor as T | |
from theano import shared | |
from theano.tensor.shared_randomstreams import RandomStreams | |
from collections import OrderedDict | |
BATCH_SIZE = 100 | |
SAG = False | |
def relu_f(vec): | |
""" Wrapper to quickly change the rectified linear unit function """ | |
return (vec + abs(vec)) / 2. | |
def softplus_f(v): | |
return T.log(1 + T.exp(v)) | |
def dropout(rng, x, p=0.5): | |
""" Zero-out random values in x with probability p using rng """ | |
if p > 0. and p < 1.: | |
seed = rng.randint(2 ** 30) | |
srng = theano.tensor.shared_randomstreams.RandomStreams(seed) | |
mask = srng.binomial(n=1, p=1.-p, size=x.shape, | |
dtype=theano.config.floatX) | |
return x * mask | |
return x | |
def fast_dropout(rng, x): | |
""" Multiply activations by N(1,1) """ | |
seed = rng.randint(2 ** 30) | |
srng = RandomStreams(seed) | |
mask = srng.normal(size=x.shape, avg=1., dtype=theano.config.floatX) | |
return x * mask | |
def build_shared_zeros(shape, name): | |
""" Builds a theano shared variable filled with a zeros numpy array """ | |
return shared(value=numpy.zeros(shape, dtype=theano.config.floatX), | |
name=name, borrow=True) | |
class Linear(object): | |
""" Basic linear transformation layer (W.X + b) """ | |
def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False): | |
if W is None: | |
W_values = numpy.asarray(rng.uniform( | |
low=-numpy.sqrt(6. / (n_in + n_out)), | |
high=numpy.sqrt(6. / (n_in + n_out)), | |
size=(n_in, n_out)), dtype=theano.config.floatX) | |
W_values *= 4 # This works for sigmoid activated networks! | |
W = theano.shared(value=W_values, name='W', borrow=True) | |
if b is None: | |
b = build_shared_zeros((n_out,), 'b') | |
self.input = input | |
self.W = W | |
self.b = b | |
self.params = [self.W, self.b] | |
self.output = T.dot(self.input, self.W) + self.b | |
if fdrop: | |
self.output = fast_dropout(rng, self.output) | |
def __repr__(self): | |
return "Linear" | |
class SigmoidLayer(Linear): | |
""" Sigmoid activation layer (sigmoid(W.X + b)) """ | |
def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False): | |
super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b) | |
self.pre_activation = self.output | |
if fdrop: | |
self.pre_activation = fast_dropout(rng, self.pre_activation) | |
self.output = T.nnet.sigmoid(self.pre_activation) | |
class ReLU(Linear): | |
""" Rectified Linear Unit activation layer (max(0, W.X + b)) """ | |
def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False): | |
if b is None: | |
b = build_shared_zeros((n_out,), 'b') | |
super(ReLU, self).__init__(rng, input, n_in, n_out, W, b) | |
self.pre_activation = self.output | |
if fdrop: | |
self.pre_activation = fast_dropout(rng, self.pre_activation) | |
self.output = relu_f(self.pre_activation) | |
class SoftPlus(Linear): | |
def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=0.): | |
if b is None: | |
b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) | |
b = theano.shared(value=b_values, name='b', borrow=True) | |
super(SoftPlus, self).__init__(rng, input, n_in, n_out, W, b) | |
self.pre_activation = self.output | |
if fdrop: | |
self.pre_activation = fast_dropout(rng, self.pre_activation, fdrop) | |
self.output = softplus_f(self.pre_activation) | |
class DatasetMiniBatchIterator(object): | |
""" Basic mini-batch iterator """ | |
def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False): | |
self.x = x | |
self.y = y | |
self.batch_size = batch_size | |
self.randomize = randomize | |
from sklearn.utils import check_random_state | |
self.rng = check_random_state(42) | |
def __iter__(self): | |
n_samples = self.x.shape[0] | |
if self.randomize: | |
for _ in xrange(n_samples / BATCH_SIZE): | |
if BATCH_SIZE > 1: | |
i = int(self.rng.rand(1) * ((n_samples+BATCH_SIZE-1) / BATCH_SIZE)) | |
else: | |
i = int(math.floor(self.rng.rand(1) * n_samples)) | |
yield (i, self.x[i*self.batch_size:(i+1)*self.batch_size], | |
self.y[i*self.batch_size:(i+1)*self.batch_size]) | |
else: | |
for i in xrange((n_samples + self.batch_size - 1) | |
/ self.batch_size): | |
yield (self.x[i*self.batch_size:(i+1)*self.batch_size], | |
self.y[i*self.batch_size:(i+1)*self.batch_size]) | |
class LogisticRegression: | |
"""Multi-class Logistic Regression | |
""" | |
def __init__(self, rng, input, n_in, n_out, W=None, b=None): | |
if W != None: | |
self.W = W | |
else: | |
self.W = build_shared_zeros((n_in, n_out), 'W') | |
if b != None: | |
self.b = b | |
else: | |
self.b = build_shared_zeros((n_out,), 'b') | |
# P(Y|X) = softmax(W.X + b) | |
self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) | |
self.y_pred = T.argmax(self.p_y_given_x, axis=1) | |
self.output = self.y_pred | |
self.params = [self.W, self.b] | |
def negative_log_likelihood(self, y): | |
return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) | |
def negative_log_likelihood_sum(self, y): | |
return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) | |
def training_cost(self, y): | |
""" Wrapper for standard name """ | |
return self.negative_log_likelihood_sum(y) | |
def errors(self, y): | |
if y.ndim != self.y_pred.ndim: | |
raise TypeError("y should have the same shape as self.y_pred", | |
("y", y.type, "y_pred", self.y_pred.type)) | |
if y.dtype.startswith('int'): | |
return T.mean(T.neq(self.y_pred, y)) | |
else: | |
print("!!! y should be of int type") | |
return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int'))) | |
class NeuralNet(object): | |
""" Neural network (not regularized, without dropout) """ | |
def __init__(self, numpy_rng, theano_rng=None, | |
n_ins=40*3, | |
layers_types=[Linear, ReLU, ReLU, ReLU, LogisticRegression], | |
layers_sizes=[1024, 1024, 1024, 1024], | |
n_outs=62 * 3, | |
rho=0.95, eps=1.E-6, | |
max_norm=0., | |
debugprint=False): | |
""" | |
TODO | |
""" | |
self.layers = [] | |
self.params = [] | |
self.pre_activations = [] # SAG specific | |
self.n_layers = len(layers_types) | |
self.layers_types = layers_types | |
assert self.n_layers > 0 | |
self.max_norm = max_norm | |
self._rho = rho # ``momentum'' for adadelta | |
self._eps = eps # epsilon for adadelta | |
self._accugrads = [] # for adadelta | |
self._accudeltas = [] # for adadelta | |
self._old_dxs = [] # for adadelta with Nesterov | |
if SAG: | |
self._sag_gradient_memory = [] # for SAG | |
if theano_rng == None: | |
theano_rng = RandomStreams(numpy_rng.randint(2 ** 30)) | |
self.x = T.fmatrix('x') | |
self.y = T.ivector('y') | |
self.layers_ins = [n_ins] + layers_sizes | |
self.layers_outs = layers_sizes + [n_outs] | |
layer_input = self.x | |
for layer_type, n_in, n_out in zip(layers_types, | |
self.layers_ins, self.layers_outs): | |
this_layer = layer_type(rng=numpy_rng, | |
input=layer_input, n_in=n_in, n_out=n_out) | |
assert hasattr(this_layer, 'output') | |
self.params.extend(this_layer.params) | |
#self.pre_activations.extend(this_layer.pre_activation)# SAG specific TODO | |
self._accugrads.extend([build_shared_zeros(t.shape.eval(), | |
'accugrad') for t in this_layer.params]) | |
self._accudeltas.extend([build_shared_zeros(t.shape.eval(), | |
'accudelta') for t in this_layer.params]) | |
self._old_dxs.extend([build_shared_zeros(t.shape.eval(), | |
'old_dxs') for t in this_layer.params]) | |
if SAG: | |
self._sag_gradient_memory.extend([build_shared_zeros(tuple([(x_train.shape[0]+BATCH_SIZE-1) / BATCH_SIZE] + list(t.shape.eval())), 'sag_gradient_memory') for t in this_layer.params]) | |
#self._sag_gradient_memory.extend([[build_shared_zeros(t.shape.eval(), 'sag_gradient_memory') for _ in xrange(x_train.shape[0] / BATCH_SIZE + 1)] for t in this_layer.params]) | |
self.layers.append(this_layer) | |
layer_input = this_layer.output | |
assert hasattr(self.layers[-1], 'training_cost') | |
assert hasattr(self.layers[-1], 'errors') | |
# TODO standardize cost | |
self.mean_cost = self.layers[-1].negative_log_likelihood(self.y) | |
self.cost = self.layers[-1].training_cost(self.y) | |
if debugprint: | |
theano.printing.debugprint(self.cost) | |
self.errors = self.layers[-1].errors(self.y) | |
def __repr__(self): | |
dimensions_layers_str = map(lambda x: "x".join(map(str, x)), | |
zip(self.layers_ins, self.layers_outs)) | |
return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])), | |
zip(self.layers_types, dimensions_layers_str))) | |
def get_SGD_trainer(self): | |
""" Returns a plain SGD minibatch trainer with learning rate as param. | |
""" | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
learning_rate = T.fscalar('lr') # learning rate to use | |
# compute the gradients with respect to the model parameters | |
# using mean_cost so that the learning rate is not too dependent | |
# on the batch size | |
gparams = T.grad(self.mean_cost, self.params) | |
# compute list of weights updates | |
updates = OrderedDict() | |
for param, gparam in zip(self.params, gparams): | |
if self.max_norm: | |
W = param - gparam * learning_rate | |
col_norms = W.norm(2, axis=0) | |
desired_norms = T.clip(col_norms, 0, self.max_norm) | |
updates[param] = W * (desired_norms / (1e-6 + col_norms)) | |
else: | |
updates[param] = param - gparam * learning_rate | |
train_fn = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y), | |
theano.Param(learning_rate)], | |
outputs=self.mean_cost, | |
updates=updates, | |
givens={self.x: batch_x, self.y: batch_y}) | |
return train_fn | |
def get_SAG_trainer(self, R=1., alpha=0., debug=False): # alpha for reg. TODO | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
ind_minibatch = T.iscalar('ind_minibatch') | |
n_seen = T.fscalar('n_seen') | |
# compute the gradients with respect to the model parameters | |
cost = self.mean_cost | |
gparams = T.grad(cost, self.params) | |
#sparams = T.grad(cost, self.pre_activations) # SAG specific | |
scaling = numpy.float32(1. / (R / 4. + alpha)) | |
updates = OrderedDict() | |
for accugrad, gradient_memory, param, gparam in zip( | |
self._accugrads, self._sag_gradient_memory, | |
#self._accugrads, self._sag_gradient_memory[ind_minibatch.eval()], | |
self.params, gparams): | |
new = gparam + alpha * param | |
agrad = accugrad + new - gradient_memory[ind_minibatch] | |
# updates[gradient_memory[ind_minibatch]] = new | |
updates[gradient_memory] = T.set_subtensor(gradient_memory[ind_minibatch], new) | |
updates[param] = param - (scaling / n_seen) * agrad | |
updates[accugrad] = agrad | |
train_fn = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y), theano.Param(ind_minibatch), | |
theano.Param(n_seen)], | |
outputs=cost, | |
updates=updates, | |
givens={self.x: batch_x, self.y: batch_y}) | |
return train_fn | |
def get_adagrad_trainer(self): | |
""" Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. | |
""" | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
learning_rate = T.fscalar('lr') # learning rate to use | |
# compute the gradients with respect to the model parameters | |
gparams = T.grad(self.mean_cost, self.params) | |
# compute list of weights updates | |
updates = OrderedDict() | |
for accugrad, param, gparam in zip(self._accugrads, self.params, gparams): | |
# c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) | |
agrad = accugrad + gparam * gparam | |
dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam | |
if self.max_norm: | |
W = param + dx | |
col_norms = W.norm(2, axis=0) | |
desired_norms = T.clip(col_norms, 0, self.max_norm) | |
updates[param] = W * (desired_norms / (1e-6 + col_norms)) | |
else: | |
updates[param] = param + dx | |
updates[accugrad] = agrad | |
train_fn = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y), | |
theano.Param(learning_rate)], | |
outputs=self.mean_cost, | |
updates=updates, | |
givens={self.x: batch_x, self.y: batch_y}) | |
return train_fn | |
def get_adadelta_trainer(self): | |
""" Returns an Adadelta (Zeiler 2012) trainer using self._rho and | |
self._eps params. | |
""" | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
# compute the gradients with respect to the model parameters | |
gparams = T.grad(self.mean_cost, self.params) | |
# compute list of weights updates | |
updates = OrderedDict() | |
for accugrad, accudelta, param, gparam in zip(self._accugrads, | |
self._accudeltas, self.params, gparams): | |
# c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) | |
agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam | |
dx = - T.sqrt((accudelta + self._eps) | |
/ (agrad + self._eps)) * gparam | |
updates[accudelta] = (self._rho * accudelta | |
+ (1 - self._rho) * dx * dx) | |
if self.max_norm: | |
W = param + dx | |
col_norms = W.norm(2, axis=0) | |
desired_norms = T.clip(col_norms, 0, self.max_norm) | |
updates[param] = W * (desired_norms / (1e-6 + col_norms)) | |
else: | |
updates[param] = param + dx | |
updates[accugrad] = agrad | |
train_fn = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y)], | |
outputs=self.mean_cost, | |
updates=updates, | |
givens={self.x: batch_x, self.y: batch_y}) | |
return train_fn | |
def nesterov_step(self): | |
beta = 0.5 | |
updates = OrderedDict() | |
for param, dx in zip(self.params, self._old_dxs): | |
updates[param] = param + beta * dx | |
nesterov_fn = theano.function(inputs=[], | |
outputs=[], | |
updates=updates, | |
givens={}) | |
nesterov_fn() | |
def get_adadelta_nesterov_trainer(self): | |
""" Returns an Adadelta (Zeiler 2012) trainer using self._rho and | |
self._eps params. | |
""" | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
# compute the gradients with respect to the model parameters | |
gparams = T.grad(self.mean_cost, self.params) | |
beta = 0.5 | |
# compute list of weights updates | |
updates = OrderedDict() | |
for accugrad, accudelta, old_dx, param, gparam in zip(self._accugrads, | |
self._accudeltas, self._old_dxs, self.params, gparams): | |
# c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) | |
agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam | |
dx = - T.sqrt((accudelta + self._eps) | |
/ (agrad + self._eps)) * gparam | |
updates[accudelta] = (self._rho * accudelta | |
+ (1 - self._rho) * dx * dx) | |
if self.max_norm: | |
W = param + dx - beta*old_dx | |
#W = param + dx | |
col_norms = W.norm(2, axis=0) | |
desired_norms = T.clip(col_norms, 0, self.max_norm) | |
updates[param] = W * (desired_norms / (1e-6 + col_norms)) | |
else: | |
updates[param] = param + dx - beta*old_dx | |
#updates[param] = param + dx | |
updates[old_dx] = dx | |
updates[accugrad] = agrad | |
train_fn = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y)], | |
outputs=self.mean_cost, | |
updates=updates, | |
givens={self.x: batch_x, self.y: batch_y}) | |
return train_fn | |
def get_adadelta_rprop_trainer(self): | |
""" TODO | |
""" | |
# TODO working on that | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
# compute the gradients with respect to the model parameters | |
gparams = T.grad(self.mean_cost, self.params) | |
# compute list of weights updates | |
updates = OrderedDict() | |
for accugrad, accudelta, param, gparam in zip(self._accugrads, | |
self._accudeltas, self.params, gparams): | |
# c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012) | |
agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam | |
dx = - T.sqrt((accudelta + self._eps) | |
/ (agrad + self._eps)) * T.switch(gparam < 0, -1., 1.) | |
updates[accudelta] = (self._rho * accudelta | |
+ (1 - self._rho) * dx * dx) | |
if self.max_norm: | |
W = param + dx | |
col_norms = W.norm(2, axis=0) | |
desired_norms = T.clip(col_norms, 0, self.max_norm) | |
updates[param] = W * (desired_norms / (1e-6 + col_norms)) | |
else: | |
updates[param] = param + dx | |
updates[accugrad] = agrad | |
train_fn = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y)], | |
outputs=self.mean_cost, | |
updates=updates, | |
givens={self.x: batch_x, self.y: batch_y}) | |
return train_fn | |
def score_classif(self, given_set): | |
""" Returns functions to get current classification errors. """ | |
batch_x = T.fmatrix('batch_x') | |
batch_y = T.ivector('batch_y') | |
score = theano.function(inputs=[theano.Param(batch_x), | |
theano.Param(batch_y)], | |
outputs=self.errors, | |
givens={self.x: batch_x, self.y: batch_y}) | |
def scoref(): | |
""" returned function that scans the entire set given as input """ | |
return [score(batch_x, batch_y) for batch_x, batch_y in given_set] | |
return scoref | |
class RegularizedNet(NeuralNet): | |
""" Neural net with L1 and L2 regularization """ | |
def __init__(self, numpy_rng, theano_rng=None, | |
n_ins=100, | |
layers_types=[ReLU, ReLU, ReLU, LogisticRegression], | |
layers_sizes=[1024, 1024, 1024], | |
n_outs=2, | |
rho=0.9, eps=1.E-6, | |
L1_reg=0., | |
L2_reg=0., | |
max_norm=0., | |
debugprint=False): | |
""" | |
TODO | |
""" | |
super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins, | |
layers_types, layers_sizes, n_outs, rho, eps, max_norm, | |
debugprint) | |
L1 = shared(0.) | |
for param in self.params: | |
L1 += T.sum(abs(param)) | |
if L1_reg > 0.: | |
self.cost = self.cost + L1_reg * L1 | |
L2 = shared(0.) | |
for param in self.params: | |
L2 += T.sum(param ** 2) | |
if L2_reg > 0.: | |
self.cost = self.cost + L2_reg * L2 | |
class DropoutNet(NeuralNet): | |
""" Neural net with dropout (see Hinton's et al. paper) """ | |
def __init__(self, numpy_rng, theano_rng=None, | |
n_ins=40*3, | |
layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression], | |
layers_sizes=[4000, 4000, 4000, 4000], | |
dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5], | |
n_outs=62 * 3, | |
rho=0.98, eps=1.E-6, | |
max_norm=0., | |
fast_drop=True, | |
debugprint=False): | |
""" | |
TODO | |
""" | |
super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins, | |
layers_types, layers_sizes, n_outs, rho, eps, max_norm, | |
debugprint) | |
self.dropout_rates = dropout_rates | |
if fast_drop: | |
if dropout_rates[0]: | |
dropout_layer_input = fast_dropout(numpy_rng, self.x) | |
else: | |
dropout_layer_input = self.x | |
else: | |
dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0]) | |
self.dropout_layers = [] | |
for layer, layer_type, n_in, n_out, dr in zip(self.layers, | |
layers_types, self.layers_ins, self.layers_outs, | |
dropout_rates[1:] + [0]): # !!! we do not dropout anything | |
# from the last layer !!! | |
if dr: | |
if fast_drop: | |
this_layer = layer_type(rng=numpy_rng, | |
input=dropout_layer_input, n_in=n_in, n_out=n_out, | |
W=layer.W, b=layer.b, fdrop=True) | |
else: | |
this_layer = layer_type(rng=numpy_rng, | |
input=dropout_layer_input, n_in=n_in, n_out=n_out, | |
W=layer.W * 1. / (1. - dr), | |
b=layer.b * 1. / (1. - dr)) | |
# N.B. dropout with dr==1 does not dropanything!! | |
this_layer.output = dropout(numpy_rng, this_layer.output, dr) | |
else: | |
this_layer = layer_type(rng=numpy_rng, | |
input=dropout_layer_input, n_in=n_in, n_out=n_out, | |
W=layer.W, b=layer.b) | |
assert hasattr(this_layer, 'output') | |
self.dropout_layers.append(this_layer) | |
dropout_layer_input = this_layer.output | |
assert hasattr(self.layers[-1], 'training_cost') | |
assert hasattr(self.layers[-1], 'errors') | |
# TODO standardize cost | |
# these are the dropout costs | |
self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y) | |
self.cost = self.dropout_layers[-1].training_cost(self.y) | |
# these is the non-dropout errors | |
self.errors = self.layers[-1].errors(self.y) | |
def __repr__(self): | |
return super(DropoutNet, self).__repr__() + "\n"\ | |
+ "dropout rates: " + str(self.dropout_rates) | |
def add_fit_and_score(class_to_chg): | |
""" Mutates a class to add the fit() and score() functions to a NeuralNet. | |
""" | |
from types import MethodType | |
def fit(self, x_train, y_train, x_dev=None, y_dev=None, | |
max_epochs=20, early_stopping=True, split_ratio=0.1, # TODO 100+ epochs | |
method='adadelta', verbose=False, plot=False): | |
""" | |
TODO | |
""" | |
import time, copy | |
if x_dev == None or y_dev == None: | |
from sklearn.cross_validation import train_test_split | |
x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, | |
test_size=split_ratio, random_state=42) | |
if method == 'sgd': | |
train_fn = self.get_SGD_trainer() | |
elif method == 'adagrad': | |
train_fn = self.get_adagrad_trainer() | |
elif method == 'adadelta': | |
train_fn = self.get_adadelta_trainer() | |
elif method == 'adadelta_nesterov': | |
train_fn = self.get_adadelta_nesterov_trainer() | |
elif method == 'adadelta_rprop': | |
train_fn = self.get_adadelta_rprop_trainer() | |
elif method == 'sag': | |
#train_fn = self.get_SAG_trainer(R=1+numpy.max(numpy.sum(x_train**2, axis=1))) | |
if BATCH_SIZE > 1: | |
line_sums = numpy.sum(x_train**2, axis=1) | |
train_fn = self.get_SAG_trainer(R=numpy.max(numpy.mean( | |
line_sums[:(line_sums.shape[0]/BATCH_SIZE)*BATCH_SIZE].reshape((line_sums.shape[0]/BATCH_SIZE, | |
BATCH_SIZE)), axis=1)), | |
alpha=1./x_train.shape[0]) | |
else: | |
train_fn = self.get_SAG_trainer(R=numpy.max(numpy.sum(x_train**2, | |
axis=1)), alpha=1./x_train.shape[0]) | |
train_set_iterator = DatasetMiniBatchIterator(x_train, y_train) | |
if method == 'sag': | |
sag_train_set_iterator = DatasetMiniBatchIterator(x_train, y_train, randomize=True) | |
dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev) | |
train_scoref = self.score_classif(train_set_iterator) | |
dev_scoref = self.score_classif(dev_set_iterator) | |
best_dev_loss = numpy.inf | |
epoch = 0 | |
# TODO early stopping (not just cross val, also stop training) | |
if plot: | |
verbose = True | |
self._costs = [] | |
self._train_errors = [] | |
self._dev_errors = [] | |
self._updates = [] | |
seen = numpy.zeros(((x_train.shape[0]+BATCH_SIZE-1) / BATCH_SIZE,), dtype=numpy.bool) | |
n_seen = 0 | |
while epoch < max_epochs: | |
if not verbose: | |
sys.stdout.write("\r%0.2f%%" % (epoch * 100./ max_epochs)) | |
sys.stdout.flush() | |
avg_costs = [] | |
timer = time.time() | |
if method == 'sag': | |
for ind_minibatch, x, y in sag_train_set_iterator: | |
if not seen[ind_minibatch]: | |
seen[ind_minibatch] = 1 | |
n_seen += 1 | |
if 'nesterov' in method: | |
self.nesterov_step() | |
avg_cost = train_fn(x, y, ind_minibatch, n_seen) | |
if type(avg_cost) == list: | |
avg_costs.append(avg_cost[0]) | |
else: | |
avg_costs.append(avg_cost) | |
else: | |
for x, y in train_set_iterator: | |
if method == 'sgd' or method == 'adagrad': | |
avg_cost = train_fn(x, y, lr=1.E-2) | |
elif 'adadelta' in method: | |
avg_cost = train_fn(x, y) | |
if type(avg_cost) == list: | |
avg_costs.append(avg_cost[0]) | |
else: | |
avg_costs.append(avg_cost) | |
if verbose: | |
mean_costs = numpy.mean(avg_costs) | |
mean_train_errors = numpy.mean(train_scoref()) | |
print(' epoch %i took %f seconds' % | |
(epoch, time.time() - timer)) | |
print(' epoch %i, avg costs %f' % | |
(epoch, mean_costs)) | |
print(' method %s, epoch %i, training error %f' % | |
(method, epoch, mean_train_errors)) | |
if plot: | |
self._costs.append(mean_costs) | |
self._train_errors.append(mean_train_errors) | |
dev_errors = numpy.mean(dev_scoref()) | |
if plot: | |
self._dev_errors.append(dev_errors) | |
if dev_errors < best_dev_loss: | |
best_dev_loss = dev_errors | |
best_params = copy.deepcopy(self.params) | |
if verbose: | |
print('!!! epoch %i, validation error of best model %f' % | |
(epoch, dev_errors)) | |
epoch += 1 | |
if not verbose: | |
print("") | |
for i, param in enumerate(best_params): | |
self.params[i] = param | |
def score(self, x, y): | |
""" error rates """ | |
iterator = DatasetMiniBatchIterator(x, y) | |
scoref = self.score_classif(iterator) | |
return numpy.mean(scoref()) | |
class_to_chg.fit = MethodType(fit, None, class_to_chg) | |
class_to_chg.score = MethodType(score, None, class_to_chg) | |
if __name__ == "__main__": | |
add_fit_and_score(DropoutNet) | |
add_fit_and_score(RegularizedNet) | |
def nudge_dataset(X, Y): | |
""" | |
This produces a dataset 5 times bigger than the original one, | |
by moving the 8x8 images in X around by 1px to left, right, down, up | |
""" | |
from scipy.ndimage import convolve | |
direction_vectors = [ | |
[[0, 1, 0], | |
[0, 0, 0], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[1, 0, 0], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[0, 0, 1], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[0, 0, 0], | |
[0, 1, 0]]] | |
shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', | |
weights=w).ravel() | |
X = numpy.concatenate([X] + | |
[numpy.apply_along_axis(shift, 1, X, vector) | |
for vector in direction_vectors]) | |
Y = numpy.concatenate([Y for _ in range(5)], axis=0) | |
return X, Y | |
from sklearn import datasets, svm, naive_bayes | |
from sklearn import cross_validation, preprocessing | |
MNIST = True | |
DIGITS = False | |
FACES = False | |
TWENTYNEWSGROUPS = False | |
VERBOSE = True | |
SCALE = True | |
PLOT = True | |
def train_models(x_train, y_train, x_test, y_test, n_features, n_outs, | |
use_dropout=True, n_epochs=100, numpy_rng=None, # TODO 200+ epochs | |
svms=False, nb=False, deepnn=True, name=''): | |
if svms: | |
print("Linear SVM") | |
classifier = svm.SVC(gamma=0.001) | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
print("RBF-kernel SVM") | |
classifier = svm.SVC(kernel='rbf', class_weight='auto') | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
if nb: | |
print("Multinomial Naive Bayes") | |
classifier = naive_bayes.MultinomialNB() | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
if deepnn: | |
import warnings | |
warnings.filterwarnings("ignore") # TODO remove | |
if use_dropout: | |
n_epochs *= 4 | |
pass | |
def new_dnn(dropout=False): | |
if dropout: | |
print("Dropout DNN") | |
return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features, | |
#layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression], | |
layers_types=[SoftPlus, SoftPlus, SoftPlus, SoftPlus, LogisticRegression], | |
layers_sizes=[2000, 2000, 2000, 2000], | |
dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5], | |
n_outs=n_outs, | |
max_norm=4., | |
fast_drop=False, | |
debugprint=0) | |
else: | |
print("Simple (regularized) DNN") | |
return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features, | |
#layers_types=[LogisticRegression], | |
#layers_sizes=[], | |
#layers_types=[ReLU, ReLU, ReLU, LogisticRegression], | |
layers_types=[SoftPlus, SoftPlus, SoftPlus, LogisticRegression], | |
layers_sizes=[1000, 1000, 1000], | |
#layers_types=[ReLU, LogisticRegression], | |
#layers_sizes=[200], | |
n_outs=n_outs, | |
#L1_reg=0.001/x_train.shape[0], | |
#L2_reg=0.001/x_train.shape[0], | |
L1_reg=0., | |
L2_reg=1./x_train.shape[0], | |
max_norm=0., | |
debugprint=0) | |
import matplotlib.pyplot as plt | |
plt.figure() | |
ax1 = plt.subplot(221) | |
ax2 = plt.subplot(222) | |
ax3 = plt.subplot(223) | |
ax4 = plt.subplot(224) # TODO updates of the weights | |
#methods = ['adadelta', 'adadelta_nesterov', 'sgd', 'adagrad'] # 'sag' | |
#methods = ['adadelta_rprop', 'adadelta', 'adadelta_nesterov'] | |
methods = ['adadelta_nesterov', 'adadelta'] | |
#methods = ['adadelta'] | |
#methods = ['sgd'] | |
#methods = ['adagrad'] | |
if SAG: | |
methods = ['sag', 'sgd', 'adagrad', 'adadelta'] | |
for method in methods: | |
dnn = new_dnn(use_dropout) | |
print dnn | |
dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method, verbose=VERBOSE, plot=PLOT) | |
test_error = dnn.score(x_test, y_test) | |
print("score: %f" % (1. - test_error)) | |
ax1.plot(numpy.log10(dnn._costs), label=method) | |
#ax2.plot(numpy.log10(dnn._train_errors), label=method) | |
#ax3.plot(numpy.log10(dnn._dev_errors), label=method) | |
ax2.plot(dnn._train_errors, label=method) | |
ax3.plot(dnn._dev_errors, label=method) | |
#ax4.plot(dnn._updates, label=method) TODO | |
ax4.plot([test_error for _ in range(10)], label=method) | |
ax1.set_xlabel('epoch') | |
ax1.set_ylabel('cost (log10)') | |
ax2.set_xlabel('epoch') | |
ax2.set_ylabel('train error') | |
ax3.set_xlabel('epoch') | |
ax3.set_ylabel('dev error') | |
ax4.set_ylabel('test error') | |
plt.legend() | |
plt.savefig('training' + name + '.png') | |
if MNIST: | |
from sklearn.datasets import fetch_mldata | |
mnist = fetch_mldata('MNIST original') | |
X = numpy.asarray(mnist.data, dtype='float32') | |
if SCALE: | |
#X = preprocessing.scale(X) | |
X /= 255. | |
y = numpy.asarray(mnist.target, dtype='int32') | |
#target_names = mnist.target_names | |
print("Total dataset size:") | |
print("n samples: %d" % X.shape[0]) | |
print("n features: %d" % X.shape[1]) | |
print("n classes: %d" % len(set(y))) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
X, y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, X.shape[1], | |
len(set(y)), numpy_rng=numpy.random.RandomState(123), | |
name='MNIST') | |
if DIGITS: | |
digits = datasets.load_digits() | |
data = numpy.asarray(digits.data, dtype='float32') | |
target = numpy.asarray(digits.target, dtype='int32') | |
nudged_x, nudged_y = nudge_dataset(data, target) | |
if SCALE: | |
nudged_x = preprocessing.scale(nudged_x) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
nudged_x, nudged_y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, nudged_x.shape[1], | |
len(set(target)), numpy_rng=numpy.random.RandomState(123), | |
name='digits') | |
if FACES: | |
import logging | |
logging.basicConfig(level=logging.INFO, | |
format='%(asctime)s %(message)s') | |
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, | |
resize=0.4) | |
X = numpy.asarray(lfw_people.data, dtype='float32') | |
if SCALE: | |
X = preprocessing.scale(X) | |
y = numpy.asarray(lfw_people.target, dtype='int32') | |
target_names = lfw_people.target_names | |
print("Total dataset size:") | |
print("n samples: %d" % X.shape[0]) | |
print("n features: %d" % X.shape[1]) | |
print("n classes: %d" % target_names.shape[0]) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
X, y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, X.shape[1], | |
len(set(y)), numpy_rng=numpy.random.RandomState(123), | |
name='faces') | |
if TWENTYNEWSGROUPS: | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
newsgroups_train = datasets.fetch_20newsgroups(subset='train') | |
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000) | |
#vectorizer = HashingVectorizer(encoding='latin-1') | |
x_train = vectorizer.fit_transform(newsgroups_train.data) | |
x_train = numpy.asarray(x_train.todense(), dtype='float32') | |
y_train = numpy.asarray(newsgroups_train.target, dtype='int32') | |
newsgroups_test = datasets.fetch_20newsgroups(subset='test') | |
x_test = vectorizer.transform(newsgroups_test.data) | |
x_test = numpy.asarray(x_test.todense(), dtype='float32') | |
y_test = numpy.asarray(newsgroups_test.target, dtype='int32') | |
train_models(x_train, y_train, x_test, y_test, x_train.shape[1], | |
len(set(y_train)), | |
numpy_rng=numpy.random.RandomState(123), | |
svms=False, nb=True, deepnn=True, | |
name='20newsgroups') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment