""" A deep neural network with or w/o dropout in one file. """ import numpy import theano import sys import math from theano import tensor as T from theano import shared from theano.tensor.shared_randomstreams import RandomStreams from collections import OrderedDict BATCH_SIZE = 100 SAG = False def relu_f(vec): """ Wrapper to quickly change the rectified linear unit function """ return (vec + abs(vec)) / 2. def softplus_f(v): return T.log(1 + T.exp(v)) def dropout(rng, x, p=0.5): """ Zero-out random values in x with probability p using rng """ if p > 0. and p < 1.: seed = rng.randint(2 ** 30) srng = theano.tensor.shared_randomstreams.RandomStreams(seed) mask = srng.binomial(n=1, p=1.-p, size=x.shape, dtype=theano.config.floatX) return x * mask return x def fast_dropout(rng, x): """ Multiply activations by N(1,1) """ seed = rng.randint(2 ** 30) srng = RandomStreams(seed) mask = srng.normal(size=x.shape, avg=1., dtype=theano.config.floatX) return x * mask def build_shared_zeros(shape, name): """ Builds a theano shared variable filled with a zeros numpy array """ return shared(value=numpy.zeros(shape, dtype=theano.config.floatX), name=name, borrow=True) class Linear(object): """ Basic linear transformation layer (W.X + b) """ def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False): if W is None: W_values = numpy.asarray(rng.uniform( low=-numpy.sqrt(6. / (n_in + n_out)), high=numpy.sqrt(6. / (n_in + n_out)), size=(n_in, n_out)), dtype=theano.config.floatX) W_values *= 4 # This works for sigmoid activated networks! W = theano.shared(value=W_values, name='W', borrow=True) if b is None: b = build_shared_zeros((n_out,), 'b') self.input = input self.W = W self.b = b self.params = [self.W, self.b] self.output = T.dot(self.input, self.W) + self.b if fdrop: self.output = fast_dropout(rng, self.output) def __repr__(self): return "Linear" class SigmoidLayer(Linear): """ Sigmoid activation layer (sigmoid(W.X + b)) """ def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False): super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b) self.pre_activation = self.output if fdrop: self.pre_activation = fast_dropout(rng, self.pre_activation) self.output = T.nnet.sigmoid(self.pre_activation) class ReLU(Linear): """ Rectified Linear Unit activation layer (max(0, W.X + b)) """ def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False): if b is None: b = build_shared_zeros((n_out,), 'b') super(ReLU, self).__init__(rng, input, n_in, n_out, W, b) self.pre_activation = self.output if fdrop: self.pre_activation = fast_dropout(rng, self.pre_activation) self.output = relu_f(self.pre_activation) class SoftPlus(Linear): def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=0.): if b is None: b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) b = theano.shared(value=b_values, name='b', borrow=True) super(SoftPlus, self).__init__(rng, input, n_in, n_out, W, b) self.pre_activation = self.output if fdrop: self.pre_activation = fast_dropout(rng, self.pre_activation, fdrop) self.output = softplus_f(self.pre_activation) class DatasetMiniBatchIterator(object): """ Basic mini-batch iterator """ def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False): self.x = x self.y = y self.batch_size = batch_size self.randomize = randomize from sklearn.utils import check_random_state self.rng = check_random_state(42) def __iter__(self): n_samples = self.x.shape[0] if self.randomize: for _ in xrange(n_samples / BATCH_SIZE): if BATCH_SIZE > 1: i = 


class DatasetMiniBatchIterator(object):
    """ Basic mini-batch iterator """
    def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.randomize = randomize
        from sklearn.utils import check_random_state
        self.rng = check_random_state(42)

    def __iter__(self):
        n_samples = self.x.shape[0]
        if self.randomize:
            for _ in xrange(n_samples / BATCH_SIZE):
                if BATCH_SIZE > 1:
                    i = int(self.rng.rand(1)
                            * ((n_samples + BATCH_SIZE - 1) / BATCH_SIZE))
                else:
                    i = int(math.floor(self.rng.rand(1) * n_samples))
                yield (i, self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
        else:
            for i in xrange((n_samples + self.batch_size - 1)
                            / self.batch_size):
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])


class LogisticRegression:
    """ Multi-class Logistic Regression """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if W is not None:
            self.W = W
        else:
            self.W = build_shared_zeros((n_in, n_out), 'W')
        if b is not None:
            self.b = b
        else:
            self.b = build_shared_zeros((n_out,), 'b')

        # P(Y|X) = softmax(W.X + b)
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.output = self.y_pred
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def negative_log_likelihood_sum(self, y):
        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def training_cost(self, y):
        """ Wrapper for standard name """
        return self.negative_log_likelihood_sum(y)

    def errors(self, y):
        if y.ndim != self.y_pred.ndim:
            raise TypeError("y should have the same shape as self.y_pred",
                            ("y", y.type, "y_pred", self.y_pred.type))
        if y.dtype.startswith('int'):
            return T.mean(T.neq(self.y_pred, y))
        else:
            print("!!! y should be of int type")
            return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))
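

# Illustrative sketch only: the negative log-likelihood used above, computed
# with plain numpy on a toy batch, to make the fancy indexing
# T.log(p_y_given_x)[T.arange(n), y] concrete. The helper name is hypothetical
# and the function is never called.
def _numpy_nll_example():
    p_y_given_x = numpy.array([[0.7, 0.2, 0.1],
                               [0.1, 0.8, 0.1]], dtype='float32')
    y = numpy.array([0, 1], dtype='int32')
    # pick the probability of the correct class for each row, then average
    nll = -numpy.mean(numpy.log(p_y_given_x[numpy.arange(y.shape[0]), y]))
    return nll  # == -(log 0.7 + log 0.8) / 2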


class NeuralNet(object):
    """ Neural network (not regularized, without dropout) """
    def __init__(self, numpy_rng, theano_rng=None, n_ins=40*3,
                 layers_types=[Linear, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024, 1024],
                 n_outs=62 * 3,
                 rho=0.95, eps=1.E-6,
                 max_norm=0.,
                 debugprint=False):
        """ TODO """
        self.layers = []
        self.params = []
        self.pre_activations = []  # SAG specific
        self.n_layers = len(layers_types)
        self.layers_types = layers_types
        assert self.n_layers > 0
        self.max_norm = max_norm
        self._rho = rho  # "momentum" for adadelta
        self._eps = eps  # epsilon for adadelta
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        self._old_dxs = []  # for adadelta with Nesterov
        if SAG:
            self._sag_gradient_memory = []  # for SAG
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.fmatrix('x')
        self.y = T.ivector('y')

        self.layers_ins = [n_ins] + layers_sizes
        self.layers_outs = layers_sizes + [n_outs]
        layer_input = self.x

        for layer_type, n_in, n_out in zip(layers_types,
                                           self.layers_ins, self.layers_outs):
            this_layer = layer_type(rng=numpy_rng,
                                    input=layer_input, n_in=n_in, n_out=n_out)
            assert hasattr(this_layer, 'output')
            self.params.extend(this_layer.params)
            #self.pre_activations.extend(this_layer.pre_activation)  # SAG specific TODO
            self._accugrads.extend([build_shared_zeros(t.shape.eval(),
                'accugrad') for t in this_layer.params])
            self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
                'accudelta') for t in this_layer.params])
            self._old_dxs.extend([build_shared_zeros(t.shape.eval(),
                'old_dxs') for t in this_layer.params])
            if SAG:
                # N.B. this experimental path assumes a global x_train is in
                # scope when the net is built (only exercised when SAG=True)
                self._sag_gradient_memory.extend([build_shared_zeros(
                    tuple([(x_train.shape[0]+BATCH_SIZE-1) / BATCH_SIZE]
                          + list(t.shape.eval())),
                    'sag_gradient_memory') for t in this_layer.params])
                #self._sag_gradient_memory.extend([[build_shared_zeros(t.shape.eval(), 'sag_gradient_memory') for _ in xrange(x_train.shape[0] / BATCH_SIZE + 1)] for t in this_layer.params])
            self.layers.append(this_layer)
            layer_input = this_layer.output

        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        # TODO standardize cost
        self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
        self.cost = self.layers[-1].training_cost(self.y)
        if debugprint:
            theano.printing.debugprint(self.cost)

        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
                                    zip(self.layers_ins, self.layers_outs))
        return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
                            zip(self.layers_types, dimensions_layers_str)))

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        # using mean_cost so that the learning rate is not too dependent
        # on the batch size
        gparams = T.grad(self.mean_cost, self.params)

        # compute list of weights updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            if self.max_norm:
                W = param - gparam * learning_rate
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_SAG_trainer(self, R=1., alpha=0., debug=False):  # alpha for reg. TODO
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        ind_minibatch = T.iscalar('ind_minibatch')
        n_seen = T.fscalar('n_seen')
        # compute the gradients with respect to the model parameters
        cost = self.mean_cost
        gparams = T.grad(cost, self.params)
        #sparams = T.grad(cost, self.pre_activations)  # SAG specific
        scaling = numpy.float32(1. / (R / 4. + alpha))

        updates = OrderedDict()
        for accugrad, gradient_memory, param, gparam in zip(
                self._accugrads, self._sag_gradient_memory,
                #self._accugrads, self._sag_gradient_memory[ind_minibatch.eval()],
                self.params, gparams):
            new = gparam + alpha * param
            agrad = accugrad + new - gradient_memory[ind_minibatch]
            #updates[gradient_memory[ind_minibatch]] = new
            updates[gradient_memory] = T.set_subtensor(
                gradient_memory[ind_minibatch], new)
            updates[param] = param - (scaling / n_seen) * agrad
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(ind_minibatch),
                                           theano.Param(n_seen)],
                                   outputs=cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn
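
    # Note on the trainers below: Adagrad accumulates squared gradients,
    #     acc_t = acc_{t-1} + g_t^2,      dx_t = -lr / sqrt(acc_t + eps) * g_t
    # while Adadelta (Zeiler 2012, Algorithm 1) keeps exponential moving
    # averages of both squared gradients and squared updates,
    #     E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    #     dx_t     = -sqrt((E[dx^2]_{t-1} + eps) / (E[g^2]_t + eps)) * g_t
    #     E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
    # so no global learning rate has to be tuned for Adadelta.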
    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)

        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params,
                                           gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = accugrad + gparam * gparam
            dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
        self._eps params. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)

        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                                                      self._accudeltas,
                                                      self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps)
                          / (agrad + self._eps)) * gparam
            updates[accudelta] = (self._rho * accudelta
                                  + (1 - self._rho) * dx * dx)
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def nesterov_step(self):
        beta = 0.5
        updates = OrderedDict()
        for param, dx in zip(self.params, self._old_dxs):
            updates[param] = param + beta * dx
        nesterov_fn = theano.function(inputs=[],
                                      outputs=[],
                                      updates=updates,
                                      givens={})
        nesterov_fn()
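
    # The Nesterov-style variant below is meant to be used together with
    # nesterov_step(): before each minibatch the parameters are pushed to the
    # look-ahead point param + beta * old_dx, the gradient is evaluated there,
    # and the update "param + dx - beta * old_dx" then removes the look-ahead
    # offset while applying the new Adadelta step.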
    def get_adadelta_nesterov_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer with a Nesterov-style
        look-ahead correction, using self._rho and self._eps params. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)
        beta = 0.5

        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, accudelta, old_dx, param, gparam in zip(
                self._accugrads, self._accudeltas, self._old_dxs,
                self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps)
                          / (agrad + self._eps)) * gparam
            updates[accudelta] = (self._rho * accudelta
                                  + (1 - self._rho) * dx * dx)
            if self.max_norm:
                W = param + dx - beta*old_dx
                #W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx - beta*old_dx
                #updates[param] = param + dx
            updates[old_dx] = dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adadelta_rprop_trainer(self):
        """ TODO """
        # TODO working on that
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)

        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                                                      self._accudeltas,
                                                      self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps)
                          / (agrad + self._eps)) * T.switch(gparam < 0,
                                                            -1., 1.)
            updates[accudelta] = (self._rho * accudelta
                                  + (1 - self._rho) * dx * dx)
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification errors. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x),
                                        theano.Param(batch_y)],
                                outputs=self.errors,
                                givens={self.x: batch_x, self.y: batch_y})

        def scoref():
            """ returned function that scans the entire set given as input """
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref


class RegularizedNet(NeuralNet):
    """ Neural net with L1 and L2 regularization """
    def __init__(self, numpy_rng, theano_rng=None, n_ins=100,
                 layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024],
                 n_outs=2,
                 rho=0.9, eps=1.E-6,
                 L1_reg=0.,
                 L2_reg=0.,
                 max_norm=0.,
                 debugprint=False):
        """ TODO """
        super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                                             layers_types, layers_sizes,
                                             n_outs, rho, eps, max_norm,
                                             debugprint)

        L1 = shared(0.)
        for param in self.params:
            L1 += T.sum(abs(param))
        if L1_reg > 0.:
            self.cost = self.cost + L1_reg * L1
        L2 = shared(0.)
        for param in self.params:
            L2 += T.sum(param ** 2)
        if L2_reg > 0.:
            self.cost = self.cost + L2_reg * L2
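

# Illustrative sketch only (not used above): what the L1 and L2 terms add to
# the training cost for a single weight matrix, written with plain numpy.
# The helper name is hypothetical and the function is never called.
def _numpy_regularization_example(L1_reg=0.001, L2_reg=0.001):
    W = numpy.array([[0.5, -1.0], [2.0, 0.0]], dtype='float32')
    l1 = numpy.sum(numpy.abs(W))      # 3.5
    l2 = numpy.sum(W ** 2)            # 5.25
    # this scalar is added to the (summed) negative log-likelihood cost
    return L1_reg * l1 + L2_reg * l2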
paper) """ def __init__(self, numpy_rng, theano_rng=None, n_ins=40*3, layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression], layers_sizes=[4000, 4000, 4000, 4000], dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5], n_outs=62 * 3, rho=0.98, eps=1.E-6, max_norm=0., fast_drop=True, debugprint=False): """ TODO """ super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins, layers_types, layers_sizes, n_outs, rho, eps, max_norm, debugprint) self.dropout_rates = dropout_rates if fast_drop: if dropout_rates[0]: dropout_layer_input = fast_dropout(numpy_rng, self.x) else: dropout_layer_input = self.x else: dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0]) self.dropout_layers = [] for layer, layer_type, n_in, n_out, dr in zip(self.layers, layers_types, self.layers_ins, self.layers_outs, dropout_rates[1:] + [0]): # !!! we do not dropout anything # from the last layer !!! if dr: if fast_drop: this_layer = layer_type(rng=numpy_rng, input=dropout_layer_input, n_in=n_in, n_out=n_out, W=layer.W, b=layer.b, fdrop=True) else: this_layer = layer_type(rng=numpy_rng, input=dropout_layer_input, n_in=n_in, n_out=n_out, W=layer.W * 1. / (1. - dr), b=layer.b * 1. / (1. - dr)) # N.B. dropout with dr==1 does not dropanything!! this_layer.output = dropout(numpy_rng, this_layer.output, dr) else: this_layer = layer_type(rng=numpy_rng, input=dropout_layer_input, n_in=n_in, n_out=n_out, W=layer.W, b=layer.b) assert hasattr(this_layer, 'output') self.dropout_layers.append(this_layer) dropout_layer_input = this_layer.output assert hasattr(self.layers[-1], 'training_cost') assert hasattr(self.layers[-1], 'errors') # TODO standardize cost # these are the dropout costs self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y) self.cost = self.dropout_layers[-1].training_cost(self.y) # these is the non-dropout errors self.errors = self.layers[-1].errors(self.y) def __repr__(self): return super(DropoutNet, self).__repr__() + "\n"\ + "dropout rates: " + str(self.dropout_rates) def add_fit_and_score(class_to_chg): """ Mutates a class to add the fit() and score() functions to a NeuralNet. 
""" from types import MethodType def fit(self, x_train, y_train, x_dev=None, y_dev=None, max_epochs=20, early_stopping=True, split_ratio=0.1, # TODO 100+ epochs method='adadelta', verbose=False, plot=False): """ TODO """ import time, copy if x_dev == None or y_dev == None: from sklearn.cross_validation import train_test_split x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=split_ratio, random_state=42) if method == 'sgd': train_fn = self.get_SGD_trainer() elif method == 'adagrad': train_fn = self.get_adagrad_trainer() elif method == 'adadelta': train_fn = self.get_adadelta_trainer() elif method == 'adadelta_nesterov': train_fn = self.get_adadelta_nesterov_trainer() elif method == 'adadelta_rprop': train_fn = self.get_adadelta_rprop_trainer() elif method == 'sag': #train_fn = self.get_SAG_trainer(R=1+numpy.max(numpy.sum(x_train**2, axis=1))) if BATCH_SIZE > 1: line_sums = numpy.sum(x_train**2, axis=1) train_fn = self.get_SAG_trainer(R=numpy.max(numpy.mean( line_sums[:(line_sums.shape[0]/BATCH_SIZE)*BATCH_SIZE].reshape((line_sums.shape[0]/BATCH_SIZE, BATCH_SIZE)), axis=1)), alpha=1./x_train.shape[0]) else: train_fn = self.get_SAG_trainer(R=numpy.max(numpy.sum(x_train**2, axis=1)), alpha=1./x_train.shape[0]) train_set_iterator = DatasetMiniBatchIterator(x_train, y_train) if method == 'sag': sag_train_set_iterator = DatasetMiniBatchIterator(x_train, y_train, randomize=True) dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev) train_scoref = self.score_classif(train_set_iterator) dev_scoref = self.score_classif(dev_set_iterator) best_dev_loss = numpy.inf epoch = 0 # TODO early stopping (not just cross val, also stop training) if plot: verbose = True self._costs = [] self._train_errors = [] self._dev_errors = [] self._updates = [] seen = numpy.zeros(((x_train.shape[0]+BATCH_SIZE-1) / BATCH_SIZE,), dtype=numpy.bool) n_seen = 0 while epoch < max_epochs: if not verbose: sys.stdout.write("\r%0.2f%%" % (epoch * 100./ max_epochs)) sys.stdout.flush() avg_costs = [] timer = time.time() if method == 'sag': for ind_minibatch, x, y in sag_train_set_iterator: if not seen[ind_minibatch]: seen[ind_minibatch] = 1 n_seen += 1 if 'nesterov' in method: self.nesterov_step() avg_cost = train_fn(x, y, ind_minibatch, n_seen) if type(avg_cost) == list: avg_costs.append(avg_cost[0]) else: avg_costs.append(avg_cost) else: for x, y in train_set_iterator: if method == 'sgd' or method == 'adagrad': avg_cost = train_fn(x, y, lr=1.E-2) elif 'adadelta' in method: avg_cost = train_fn(x, y) if type(avg_cost) == list: avg_costs.append(avg_cost[0]) else: avg_costs.append(avg_cost) if verbose: mean_costs = numpy.mean(avg_costs) mean_train_errors = numpy.mean(train_scoref()) print(' epoch %i took %f seconds' % (epoch, time.time() - timer)) print(' epoch %i, avg costs %f' % (epoch, mean_costs)) print(' method %s, epoch %i, training error %f' % (method, epoch, mean_train_errors)) if plot: self._costs.append(mean_costs) self._train_errors.append(mean_train_errors) dev_errors = numpy.mean(dev_scoref()) if plot: self._dev_errors.append(dev_errors) if dev_errors < best_dev_loss: best_dev_loss = dev_errors best_params = copy.deepcopy(self.params) if verbose: print('!!! 
                if verbose:
                    print('!!! epoch %i, validation error of best model %f'
                          % (epoch, dev_errors))
            epoch += 1
        if not verbose:
            print("")
        for i, param in enumerate(best_params):
            self.params[i] = param

    def score(self, x, y):
        """ error rates """
        iterator = DatasetMiniBatchIterator(x, y)
        scoref = self.score_classif(iterator)
        return numpy.mean(scoref())

    class_to_chg.fit = MethodType(fit, None, class_to_chg)
    class_to_chg.score = MethodType(score, None, class_to_chg)


if __name__ == "__main__":
    add_fit_and_score(DropoutNet)
    add_fit_and_score(RegularizedNet)

    def nudge_dataset(X, Y):
        """ This produces a dataset 5 times bigger than the original one,
        by moving the 8x8 images in X around by 1px to left, right, down, up
        """
        from scipy.ndimage import convolve
        direction_vectors = [
            [[0, 1, 0],
             [0, 0, 0],
             [0, 0, 0]],
            [[0, 0, 0],
             [1, 0, 0],
             [0, 0, 0]],
            [[0, 0, 0],
             [0, 0, 1],
             [0, 0, 0]],
            [[0, 0, 0],
             [0, 0, 0],
             [0, 1, 0]]]
        shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant',
                                      weights=w).ravel()
        X = numpy.concatenate([X] +
                              [numpy.apply_along_axis(shift, 1, X, vector)
                               for vector in direction_vectors])
        Y = numpy.concatenate([Y for _ in range(5)], axis=0)
        return X, Y

    from sklearn import datasets, svm, naive_bayes
    from sklearn import cross_validation, preprocessing
    MNIST = True
    DIGITS = False
    FACES = False
    TWENTYNEWSGROUPS = False
    VERBOSE = True
    SCALE = True
    PLOT = True

    def train_models(x_train, y_train, x_test, y_test, n_features, n_outs,
                     use_dropout=True, n_epochs=100, numpy_rng=None,  # TODO 200+ epochs
                     svms=False, nb=False, deepnn=True, name=''):
        if svms:
            print("SVM (SVC with gamma=0.001)")
            classifier = svm.SVC(gamma=0.001)
            print(classifier)
            classifier.fit(x_train, y_train)
            print("score: %f" % classifier.score(x_test, y_test))

            print("RBF-kernel SVM")
            classifier = svm.SVC(kernel='rbf', class_weight='auto')
            print(classifier)
            classifier.fit(x_train, y_train)
            print("score: %f" % classifier.score(x_test, y_test))

        if nb:
            print("Multinomial Naive Bayes")
            classifier = naive_bayes.MultinomialNB()
            print(classifier)
            classifier.fit(x_train, y_train)
            print("score: %f" % classifier.score(x_test, y_test))

        if deepnn:
            import warnings
            warnings.filterwarnings("ignore")  # TODO remove

            if use_dropout:
                n_epochs *= 4

            def new_dnn(dropout=False):
                if dropout:
                    print("Dropout DNN")
                    return DropoutNet(numpy_rng=numpy_rng,
                                      n_ins=n_features,
                                      #layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                                      layers_types=[SoftPlus, SoftPlus,
                                                    SoftPlus, SoftPlus,
                                                    LogisticRegression],
                                      layers_sizes=[2000, 2000, 2000, 2000],
                                      dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
                                      n_outs=n_outs,
                                      max_norm=4.,
                                      fast_drop=False,
                                      debugprint=0)
                else:
                    print("Simple (regularized) DNN")
                    return RegularizedNet(numpy_rng=numpy_rng,
                                          n_ins=n_features,
                                          #layers_types=[LogisticRegression],
                                          #layers_sizes=[],
                                          #layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                                          layers_types=[SoftPlus, SoftPlus,
                                                        SoftPlus,
                                                        LogisticRegression],
                                          layers_sizes=[1000, 1000, 1000],
                                          #layers_types=[ReLU, LogisticRegression],
                                          #layers_sizes=[200],
                                          n_outs=n_outs,
                                          #L1_reg=0.001/x_train.shape[0],
                                          #L2_reg=0.001/x_train.shape[0],
                                          L1_reg=0.,
                                          L2_reg=1./x_train.shape[0],
                                          max_norm=0.,
                                          debugprint=0)

            import matplotlib.pyplot as plt
            plt.figure()
            ax1 = plt.subplot(221)
            ax2 = plt.subplot(222)
            ax3 = plt.subplot(223)
            ax4 = plt.subplot(224)  # TODO updates of the weights
            #methods = ['adadelta', 'adadelta_nesterov', 'sgd', 'adagrad']  # 'sag'
            #methods = ['adadelta_rprop', 'adadelta', 'adadelta_nesterov']
            methods = ['adadelta_nesterov', 'adadelta']
            #methods = ['adadelta']
            #methods = ['sgd']
            #methods = ['adagrad']
            if SAG:
                methods = ['sag', 'sgd', 'adagrad', 'adadelta']
            for method in methods:
                dnn = new_dnn(use_dropout)
                print(dnn)
                dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method,
                        verbose=VERBOSE, plot=PLOT)
                test_error = dnn.score(x_test, y_test)
                print("score: %f" % (1. - test_error))
                ax1.plot(numpy.log10(dnn._costs), label=method)
                #ax2.plot(numpy.log10(dnn._train_errors), label=method)
                #ax3.plot(numpy.log10(dnn._dev_errors), label=method)
                ax2.plot(dnn._train_errors, label=method)
                ax3.plot(dnn._dev_errors, label=method)
                #ax4.plot(dnn._updates, label=method)  # TODO
                ax4.plot([test_error for _ in range(10)], label=method)
            ax1.set_xlabel('epoch')
            ax1.set_ylabel('cost (log10)')
            ax2.set_xlabel('epoch')
            ax2.set_ylabel('train error')
            ax3.set_xlabel('epoch')
            ax3.set_ylabel('dev error')
            ax4.set_ylabel('test error')
            plt.legend()
            plt.savefig('training' + name + '.png')

    if MNIST:
        from sklearn.datasets import fetch_mldata
        mnist = fetch_mldata('MNIST original')
        X = numpy.asarray(mnist.data, dtype='float32')
        if SCALE:
            #X = preprocessing.scale(X)
            X /= 255.
        y = numpy.asarray(mnist.target, dtype='int32')
        #target_names = mnist.target_names
        print("Total dataset size:")
        print("n samples: %d" % X.shape[0])
        print("n features: %d" % X.shape[1])
        print("n classes: %d" % len(set(y)))
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.2, random_state=42)
        train_models(x_train, y_train, x_test, y_test, X.shape[1],
                     len(set(y)), numpy_rng=numpy.random.RandomState(123),
                     name='MNIST')

    if DIGITS:
        digits = datasets.load_digits()
        data = numpy.asarray(digits.data, dtype='float32')
        target = numpy.asarray(digits.target, dtype='int32')
        nudged_x, nudged_y = nudge_dataset(data, target)
        if SCALE:
            nudged_x = preprocessing.scale(nudged_x)
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
            nudged_x, nudged_y, test_size=0.2, random_state=42)
        train_models(x_train, y_train, x_test, y_test, nudged_x.shape[1],
                     len(set(target)),
                     numpy_rng=numpy.random.RandomState(123),
                     name='digits')

    if FACES:
        import logging
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(message)s')
        lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70,
                                               resize=0.4)
        X = numpy.asarray(lfw_people.data, dtype='float32')
        if SCALE:
            X = preprocessing.scale(X)
        y = numpy.asarray(lfw_people.target, dtype='int32')
        target_names = lfw_people.target_names
        print("Total dataset size:")
        print("n samples: %d" % X.shape[0])
        print("n features: %d" % X.shape[1])
        print("n classes: %d" % target_names.shape[0])
        x_train, x_test, y_train, y_test = cross_validation.train_test_split(
            X, y, test_size=0.2, random_state=42)
        train_models(x_train, y_train, x_test, y_test, X.shape[1],
                     len(set(y)), numpy_rng=numpy.random.RandomState(123),
                     name='faces')

    if TWENTYNEWSGROUPS:
        from sklearn.feature_extraction.text import TfidfVectorizer
        newsgroups_train = datasets.fetch_20newsgroups(subset='train')
        vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000)
        #vectorizer = HashingVectorizer(encoding='latin-1')
        x_train = vectorizer.fit_transform(newsgroups_train.data)
        x_train = numpy.asarray(x_train.todense(), dtype='float32')
        y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
        newsgroups_test = datasets.fetch_20newsgroups(subset='test')
        x_test = vectorizer.transform(newsgroups_test.data)
        x_test = numpy.asarray(x_test.todense(), dtype='float32')
        y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
        train_models(x_train, y_train, x_test, y_test, x_train.shape[1],
                     len(set(y_train)),
                     numpy_rng=numpy.random.RandomState(123),
                     svms=False, nb=True, deepnn=True,
                     name='20newsgroups')
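

# Minimal usage sketch (hypothetical helper, never called): how the pieces
# above are typically wired together on a small sklearn dataset, assuming the
# same legacy sklearn/Theano versions as the rest of this file.
def _example_usage():
    from sklearn import datasets, cross_validation
    add_fit_and_score(RegularizedNet)  # adds fit() and score() to the class
    digits = datasets.load_digits()
    x = numpy.asarray(digits.data, dtype='float32') / 16.
    y = numpy.asarray(digits.target, dtype='int32')
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        x, y, test_size=0.2, random_state=42)
    dnn = RegularizedNet(numpy_rng=numpy.random.RandomState(123),
                         n_ins=x.shape[1],
                         layers_types=[ReLU, LogisticRegression],
                         layers_sizes=[200],
                         n_outs=len(set(y)),
                         L2_reg=1. / x_train.shape[0])
    dnn.fit(x_train, y_train, max_epochs=10, method='adadelta', verbose=True)
    return dnn.score(x_test, y_test)  # test error rate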