""" | |
A deep neural network with or w/o dropout in one file. | |
License: Do What The Fuck You Want to Public License http://www.wtfpl.net/ | |
""" | |
import numpy, theano, sys, math | |
from theano import tensor as T | |
from theano import shared | |
from theano.tensor.shared_randomstreams import RandomStreams | |
from collections import OrderedDict | |
BATCH_SIZE = 100 | |
def relu_f(vec): | |
""" Wrapper to quickly change the rectified linear unit function """ | |
return (vec + abs(vec)) / 2. | |
def dropout(rng, x, p=0.5): | |
""" Zero-out random values in x with probability p using rng """ | |
if p > 0. and p < 1.: | |
seed = rng.randint(2 ** 30) | |
srng = theano.tensor.shared_randomstreams.RandomStreams(seed) | |
mask = srng.binomial(n=1, p=1.-p, size=x.shape, | |
dtype=theano.config.floatX) | |
return x * mask | |
return x | |
def fast_dropout(rng, x): | |
""" Multiply activations by N(1,1) """ | |
seed = rng.randint(2 ** 30) | |
srng = RandomStreams(seed) | |
mask = srng.normal(size=x.shape, avg=1., dtype=theano.config.floatX) | |
return x * mask | |
def build_shared_zeros(shape, name): | |
""" Builds a theano shared variable filled with a zeros numpy array """ | |
return shared(value=numpy.zeros(shape, dtype=theano.config.floatX), | |
name=name, borrow=True) | |
class Linear(object):
    """ Basic linear transformation layer (W.X + b) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
        if W is None:
            W_values = numpy.asarray(rng.uniform(
                low=-numpy.sqrt(6. / (n_in + n_out)),
                high=numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
            W_values *= 4  # This works for sigmoid activated networks!
            W = theano.shared(value=W_values, name='W', borrow=True)
        if b is None:
            b = build_shared_zeros((n_out,), 'b')
        self.input = input
        self.W = W
        self.b = b
        self.params = [self.W, self.b]
        self.output = T.dot(self.input, self.W) + self.b
        if fdrop:
            self.output = fast_dropout(rng, self.output)

    def __repr__(self):
        return "Linear"


class SigmoidLayer(Linear):
    """ Sigmoid activation layer (sigmoid(W.X + b)) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
        super(SigmoidLayer, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        if fdrop:
            self.pre_activation = fast_dropout(rng, self.pre_activation)
        self.output = T.nnet.sigmoid(self.pre_activation)


class ReLU(Linear):
    """ Rectified Linear Unit activation layer (max(0, W.X + b)) """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None, fdrop=False):
        if b is None:
            b = build_shared_zeros((n_out,), 'b')
        super(ReLU, self).__init__(rng, input, n_in, n_out, W, b)
        self.pre_activation = self.output
        if fdrop:
            self.pre_activation = fast_dropout(rng, self.pre_activation)
        self.output = relu_f(self.pre_activation)
class DatasetMiniBatchIterator(object):
    """ Basic mini-batch iterator """
    def __init__(self, x, y, batch_size=BATCH_SIZE, randomize=False):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        self.randomize = randomize
        from sklearn.utils import check_random_state
        self.rng = check_random_state(42)

    def __iter__(self):
        n_samples = self.x.shape[0]
        if self.randomize:
            for _ in xrange(n_samples / self.batch_size):
                if self.batch_size > 1:
                    # pick a random batch index
                    i = int(self.rng.rand(1) * ((n_samples + self.batch_size - 1)
                                                / self.batch_size))
                else:
                    i = int(math.floor(self.rng.rand(1) * n_samples))
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
        else:
            for i in xrange((n_samples + self.batch_size - 1)
                            / self.batch_size):
                yield (self.x[i*self.batch_size:(i+1)*self.batch_size],
                       self.y[i*self.batch_size:(i+1)*self.batch_size])
class LogisticRegression:
    """ Multi-class Logistic Regression """
    def __init__(self, rng, input, n_in, n_out, W=None, b=None):
        if W is not None:
            self.W = W
        else:
            self.W = build_shared_zeros((n_in, n_out), 'W')
        if b is not None:
            self.b = b
        else:
            self.b = build_shared_zeros((n_out,), 'b')
        # P(Y|X) = softmax(W.X + b)
        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
        self.output = self.y_pred
        self.params = [self.W, self.b]

    def negative_log_likelihood(self, y):
        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def negative_log_likelihood_sum(self, y):
        return -T.sum(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])

    def training_cost(self, y):
        """ Wrapper for standard name """
        return self.negative_log_likelihood_sum(y)

    def errors(self, y):
        if y.ndim != self.y_pred.ndim:
            raise TypeError("y should have the same shape as self.y_pred",
                            ("y", y.type, "y_pred", self.y_pred.type))
        if y.dtype.startswith('int'):
            return T.mean(T.neq(self.y_pred, y))
        else:
            print("!!! y should be of int type")
            return T.mean(T.neq(self.y_pred, numpy.asarray(y, dtype='int')))
class NeuralNet(object):
    """ Neural network (not regularized, without dropout) """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=40*3,
                 layers_types=[Linear, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024, 1024],
                 n_outs=62 * 3,
                 rho=0.9,
                 eps=1.E-6,
                 max_norm=0.,
                 debugprint=False):
        """
        Basic feedforward neural network.
        """
        self.layers = []
        self.params = []
        self.n_layers = len(layers_types)
        self.layers_types = layers_types
        assert self.n_layers > 0
        self.max_norm = max_norm
        self._rho = rho  # "momentum" for adadelta
        self._eps = eps  # epsilon for adadelta
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        if theano_rng is None:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        self.x = T.fmatrix('x')
        self.y = T.ivector('y')
        self.layers_ins = [n_ins] + layers_sizes
        self.layers_outs = layers_sizes + [n_outs]
        layer_input = self.x

        for layer_type, n_in, n_out in zip(layers_types,
                                           self.layers_ins, self.layers_outs):
            this_layer = layer_type(rng=numpy_rng,
                                    input=layer_input, n_in=n_in, n_out=n_out)
            assert hasattr(this_layer, 'output')
            self.params.extend(this_layer.params)
            self._accugrads.extend([build_shared_zeros(t.shape.eval(),
                'accugrad') for t in this_layer.params])
            self._accudeltas.extend([build_shared_zeros(t.shape.eval(),
                'accudelta') for t in this_layer.params])
            self.layers.append(this_layer)
            layer_input = this_layer.output

        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        # TODO standardize cost
        self.mean_cost = self.layers[-1].negative_log_likelihood(self.y)
        self.cost = self.layers[-1].training_cost(self.y)
        if debugprint:
            theano.printing.debugprint(self.cost)
        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        dimensions_layers_str = map(lambda x: "x".join(map(str, x)),
                                    zip(self.layers_ins, self.layers_outs))
        return "_".join(map(lambda x: "_".join((x[0].__name__, x[1])),
                            zip(self.layers_types, dimensions_layers_str)))
    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        # using mean_cost so that the learning rate is not too dependent
        # on the batch size
        gparams = T.grad(self.mean_cost, self.params)
        # compute list of weights updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            if self.max_norm:
                W = param - gparam * learning_rate
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})
        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)
        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = accugrad + gparam * gparam
            dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y),
                                           theano.Param(learning_rate)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})
        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
        self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.mean_cost, self.params)
        # compute list of weights updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps)
                          / (agrad + self._eps)) * gparam
            updates[accudelta] = (self._rho * accudelta
                                  + (1 - self._rho) * dx * dx)
            if self.max_norm:
                W = param + dx
                col_norms = W.norm(2, axis=0)
                desired_norms = T.clip(col_norms, 0, self.max_norm)
                updates[param] = W * (desired_norms / (1e-6 + col_norms))
            else:
                updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x),
                                           theano.Param(batch_y)],
                                   outputs=self.mean_cost,
                                   updates=updates,
                                   givens={self.x: batch_x, self.y: batch_y})
        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification errors. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x),
                                        theano.Param(batch_y)],
                                outputs=self.errors,
                                givens={self.x: batch_x, self.y: batch_y})

        def scoref():
            """ returned function that scans the entire set given as input """
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref
class RegularizedNet(NeuralNet):
    """ Neural net with L1 and L2 regularization """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=100,
                 layers_types=[ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[1024, 1024, 1024],
                 n_outs=2,
                 rho=0.9,
                 eps=1.E-6,
                 L1_reg=0.,
                 L2_reg=0.,
                 max_norm=0.,
                 debugprint=False):
        """
        Feedforward neural network with added L1 and/or L2 regularization.
        """
        super(RegularizedNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                debugprint)

        L1 = shared(0.)
        for param in self.params:
            L1 += T.sum(abs(param))
        if L1_reg > 0.:
            self.cost = self.cost + L1_reg * L1
        L2 = shared(0.)
        for param in self.params:
            L2 += T.sum(param ** 2)
        if L2_reg > 0.:
            self.cost = self.cost + L2_reg * L2
class DropoutNet(NeuralNet):
    """ Neural net with dropout (see Hinton's et al. paper) """
    def __init__(self, numpy_rng, theano_rng=None,
                 n_ins=40*3,
                 layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
                 layers_sizes=[4000, 4000, 4000, 4000],
                 dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
                 n_outs=62 * 3,
                 rho=0.9,
                 eps=1.E-6,
                 max_norm=0.,
                 fast_drop=False,
                 debugprint=False):
        """
        Feedforward neural network with dropout regularization.
        """
        super(DropoutNet, self).__init__(numpy_rng, theano_rng, n_ins,
                layers_types, layers_sizes, n_outs, rho, eps, max_norm,
                debugprint)

        self.dropout_rates = dropout_rates
        if fast_drop:
            if dropout_rates[0]:
                dropout_layer_input = fast_dropout(numpy_rng, self.x)
            else:
                dropout_layer_input = self.x
        else:
            dropout_layer_input = dropout(numpy_rng, self.x, p=dropout_rates[0])
        self.dropout_layers = []

        for layer, layer_type, n_in, n_out, dr in zip(self.layers,
                layers_types, self.layers_ins, self.layers_outs,
                dropout_rates[1:] + [0]):  # !!! we do not dropout anything
                                           # from the last layer !!!
            if dr:
                if fast_drop:
                    this_layer = layer_type(rng=numpy_rng,
                            input=dropout_layer_input, n_in=n_in, n_out=n_out,
                            W=layer.W, b=layer.b, fdrop=True)
                else:
                    this_layer = layer_type(rng=numpy_rng,
                            input=dropout_layer_input, n_in=n_in, n_out=n_out,
                            W=layer.W * 1. / (1. - dr),
                            b=layer.b * 1. / (1. - dr))
                    # N.B. dropout with dr == 1 does not drop anything!!
                    this_layer.output = dropout(numpy_rng, this_layer.output, dr)
            else:
                this_layer = layer_type(rng=numpy_rng,
                        input=dropout_layer_input, n_in=n_in, n_out=n_out,
                        W=layer.W, b=layer.b)
            assert hasattr(this_layer, 'output')
            self.dropout_layers.append(this_layer)
            dropout_layer_input = this_layer.output

        assert hasattr(self.layers[-1], 'training_cost')
        assert hasattr(self.layers[-1], 'errors')
        # these are the dropout costs
        self.mean_cost = self.dropout_layers[-1].negative_log_likelihood(self.y)
        self.cost = self.dropout_layers[-1].training_cost(self.y)
        # these are the non-dropout errors
        self.errors = self.layers[-1].errors(self.y)

    def __repr__(self):
        return super(DropoutNet, self).__repr__() + "\n"\
               + "dropout rates: " + str(self.dropout_rates)
def add_fit_and_score(class_to_chg):
    """ Mutates a class to add the fit() and score() functions to a NeuralNet.
    """
    from types import MethodType

    def fit(self, x_train, y_train, x_dev=None, y_dev=None,
            max_epochs=100, early_stopping=True, split_ratio=0.1,
            method='adadelta', verbose=False, plot=False):
        """
        Fits the neural network to `x_train` and `y_train`.
        If `x_dev` or `y_dev` is not given, it will do a `split_ratio`
        cross-validation split on `x_train` and `y_train` (for early stopping).
        """
        import time, copy
        if x_dev is None or y_dev is None:
            from sklearn.cross_validation import train_test_split
            x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train,
                    test_size=split_ratio, random_state=42)
        if method == 'sgd':
            train_fn = self.get_SGD_trainer()
        elif method == 'adagrad':
            train_fn = self.get_adagrad_trainer()
        elif method == 'adadelta':
            train_fn = self.get_adadelta_trainer()
        train_set_iterator = DatasetMiniBatchIterator(x_train, y_train)
        dev_set_iterator = DatasetMiniBatchIterator(x_dev, y_dev)
        train_scoref = self.score_classif(train_set_iterator)
        dev_scoref = self.score_classif(dev_set_iterator)
        best_dev_loss = numpy.inf
        epoch = 0
        # TODO early stopping (not just cross val, also stop training)
        if plot:
            verbose = True
            self._costs = []
            self._train_errors = []
            self._dev_errors = []
            self._updates = []

        while epoch < max_epochs:
            if not verbose:
                sys.stdout.write("\r%0.2f%%" % (epoch * 100. / max_epochs))
                sys.stdout.flush()
            avg_costs = []
            timer = time.time()
            for x, y in train_set_iterator:
                if method == 'sgd' or method == 'adagrad':
                    avg_cost = train_fn(x, y, lr=1.E-2)  # TODO: you have to
                                                         # play with this
                                                         # learning rate
                                                         # (dataset dependent)
                elif method == 'adadelta':
                    avg_cost = train_fn(x, y)
                if type(avg_cost) == list:
                    avg_costs.append(avg_cost[0])
                else:
                    avg_costs.append(avg_cost)
            if verbose:
                mean_costs = numpy.mean(avg_costs)
                mean_train_errors = numpy.mean(train_scoref())
                print(' epoch %i took %f seconds' %
                      (epoch, time.time() - timer))
                print(' epoch %i, avg costs %f' %
                      (epoch, mean_costs))
                print(' epoch %i, training error %f' %
                      (epoch, mean_train_errors))
                if plot:
                    self._costs.append(mean_costs)
                    self._train_errors.append(mean_train_errors)
            dev_errors = numpy.mean(dev_scoref())
            if plot:
                self._dev_errors.append(dev_errors)
            if dev_errors < best_dev_loss:
                best_dev_loss = dev_errors
                best_params = copy.deepcopy(self.params)
                if verbose:
                    print('!!! epoch %i, validation error of best model %f' %
                          (epoch, dev_errors))
            epoch += 1
        if not verbose:
            print("")
        for i, param in enumerate(best_params):
            self.params[i] = param

    def score(self, x, y):
        """ error rates """
        iterator = DatasetMiniBatchIterator(x, y)
        scoref = self.score_classif(iterator)
        return numpy.mean(scoref())

    class_to_chg.fit = MethodType(fit, None, class_to_chg)
    class_to_chg.score = MethodType(score, None, class_to_chg)
if __name__ == "__main__": | |
add_fit_and_score(DropoutNet) | |
add_fit_and_score(RegularizedNet) | |
def nudge_dataset(X, Y): | |
""" | |
This produces a dataset 5 times bigger than the original one, | |
by moving the 8x8 images in X around by 1px to left, right, down, up | |
""" | |
from scipy.ndimage import convolve | |
direction_vectors = [ | |
[[0, 1, 0], | |
[0, 0, 0], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[1, 0, 0], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[0, 0, 1], | |
[0, 0, 0]], | |
[[0, 0, 0], | |
[0, 0, 0], | |
[0, 1, 0]]] | |
shift = lambda x, w: convolve(x.reshape((8, 8)), mode='constant', | |
weights=w).ravel() | |
X = numpy.concatenate([X] + | |
[numpy.apply_along_axis(shift, 1, X, vector) | |
for vector in direction_vectors]) | |
Y = numpy.concatenate([Y for _ in range(5)], axis=0) | |
return X, Y | |
from sklearn import datasets, svm, naive_bayes | |
from sklearn import cross_validation, preprocessing | |
MNIST = True # MNIST dataset | |
DIGITS = False # digits dataset | |
FACES = True # faces dataset | |
TWENTYNEWSGROUPS = False # 20 newgroups dataset | |
VERBOSE = True # prints evolution of the loss/accuracy during the fitting | |
SCALE = True # scale the dataset | |
PLOT = True # plot losses and accuracies | |
def train_models(x_train, y_train, x_test, y_test, n_features, n_outs, | |
use_dropout=True, n_epochs=100, numpy_rng=None, | |
svms=False, nb=False, deepnn=True, name=''): | |
if svms: | |
print("Linear SVM") | |
classifier = svm.SVC(gamma=0.001) | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
print("RBF-kernel SVM") | |
classifier = svm.SVC(kernel='rbf', class_weight='auto') | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
if nb: | |
print("Multinomial Naive Bayes") | |
classifier = naive_bayes.MultinomialNB() | |
print(classifier) | |
classifier.fit(x_train, y_train) | |
print("score: %f" % classifier.score(x_test, y_test)) | |
if deepnn: | |
import warnings | |
warnings.filterwarnings("ignore") # TODO remove | |
if use_dropout: | |
#n_epochs *= 4 TODO | |
pass | |
def new_dnn(dropout=False): | |
if dropout: | |
print("Dropout DNN") | |
return DropoutNet(numpy_rng=numpy_rng, n_ins=n_features, | |
layers_types=[ReLU, ReLU, LogisticRegression], | |
layers_sizes=[200, 200], | |
dropout_rates=[0.2, 0.5, 0.5], | |
# TODO if you have a big enough GPU, use these: | |
#layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression], | |
#layers_sizes=[2000, 2000, 2000, 2000], | |
#dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5], | |
n_outs=n_outs, | |
max_norm=4., | |
fast_drop=True, | |
debugprint=0) | |
else: | |
print("Simple (regularized) DNN") | |
return RegularizedNet(numpy_rng=numpy_rng, n_ins=n_features, | |
layers_types=[ReLU, ReLU, LogisticRegression], | |
layers_sizes=[200, 200], | |
n_outs=n_outs, | |
#L1_reg=0.001/x_train.shape[0], | |
#L2_reg=0.001/x_train.shape[0], | |
L1_reg=0., | |
L2_reg=1./x_train.shape[0], | |
debugprint=0) | |
import matplotlib.pyplot as plt | |
plt.figure() | |
ax1 = plt.subplot(221) | |
ax2 = plt.subplot(222) | |
ax3 = plt.subplot(223) | |
ax4 = plt.subplot(224) # TODO plot the updates of the weights | |
methods = ['sgd', 'adagrad', 'adadelta'] | |
#methods = ['adadelta'] TODO if you want "good" results asap | |
for method in methods: | |
dnn = new_dnn(use_dropout) | |
print dnn, "using", method | |
dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method, verbose=VERBOSE, plot=PLOT) | |
test_error = dnn.score(x_test, y_test) | |
print("score: %f" % (1. - test_error)) | |
ax1.plot(numpy.log10(dnn._costs), label=method) | |
ax2.plot(numpy.log10(dnn._train_errors), label=method) | |
ax3.plot(numpy.log10(dnn._dev_errors), label=method) | |
#ax2.plot(dnn._train_errors, label=method) | |
#ax3.plot(dnn._dev_errors, label=method) | |
ax4.plot([test_error for _ in range(10)], label=method) | |
ax1.set_xlabel('epoch') | |
ax1.set_ylabel('cost (log10)') | |
ax2.set_xlabel('epoch') | |
ax2.set_ylabel('train error') | |
ax3.set_xlabel('epoch') | |
ax3.set_ylabel('dev error') | |
ax4.set_ylabel('test error') | |
plt.legend() | |
plt.savefig('training_' + name + '.png') | |
if MNIST: | |
from sklearn.datasets import fetch_mldata | |
mnist = fetch_mldata('MNIST original') | |
X = numpy.asarray(mnist.data, dtype='float32') | |
if SCALE: | |
#X = preprocessing.scale(X) | |
X /= 255. | |
y = numpy.asarray(mnist.target, dtype='int32') | |
print("Total dataset size:") | |
print("n samples: %d" % X.shape[0]) | |
print("n features: %d" % X.shape[1]) | |
print("n classes: %d" % len(set(y))) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
X, y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, X.shape[1], | |
len(set(y)), numpy_rng=numpy.random.RandomState(123), | |
name='MNIST') | |
if DIGITS: | |
digits = datasets.load_digits() | |
data = numpy.asarray(digits.data, dtype='float32') | |
target = numpy.asarray(digits.target, dtype='int32') | |
nudged_x, nudged_y = nudge_dataset(data, target) | |
if SCALE: | |
nudged_x = preprocessing.scale(nudged_x) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
nudged_x, nudged_y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, nudged_x.shape[1], | |
len(set(target)), numpy_rng=numpy.random.RandomState(123), | |
name='digits') | |
if FACES: | |
import logging | |
logging.basicConfig(level=logging.INFO, | |
format='%(asctime)s %(message)s') | |
lfw_people = datasets.fetch_lfw_people(min_faces_per_person=70, | |
resize=0.4) | |
X = numpy.asarray(lfw_people.data, dtype='float32') | |
if SCALE: | |
X = preprocessing.scale(X) | |
y = numpy.asarray(lfw_people.target, dtype='int32') | |
target_names = lfw_people.target_names | |
print("Total dataset size:") | |
print("n samples: %d" % X.shape[0]) | |
print("n features: %d" % X.shape[1]) | |
print("n classes: %d" % target_names.shape[0]) | |
x_train, x_test, y_train, y_test = cross_validation.train_test_split( | |
X, y, test_size=0.2, random_state=42) | |
train_models(x_train, y_train, x_test, y_test, X.shape[1], | |
len(set(y)), numpy_rng=numpy.random.RandomState(123), | |
name='faces') | |
if TWENTYNEWSGROUPS: | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
newsgroups_train = datasets.fetch_20newsgroups(subset='train') | |
vectorizer = TfidfVectorizer(encoding='latin-1', max_features=10000) | |
#vectorizer = HashingVectorizer(encoding='latin-1') | |
x_train = vectorizer.fit_transform(newsgroups_train.data) | |
x_train = numpy.asarray(x_train.todense(), dtype='float32') | |
y_train = numpy.asarray(newsgroups_train.target, dtype='int32') | |
newsgroups_test = datasets.fetch_20newsgroups(subset='test') | |
x_test = vectorizer.transform(newsgroups_test.data) | |
x_test = numpy.asarray(x_test.todense(), dtype='float32') | |
y_test = numpy.asarray(newsgroups_test.target, dtype='int32') | |
train_models(x_train, y_train, x_test, y_test, x_train.shape[1], | |
len(set(y_train)), | |
numpy_rng=numpy.random.RandomState(123), | |
svms=False, nb=True, deepnn=True, | |
name='20newsgroups') |
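
For reference, a minimal usage sketch of this file as an importable module (assuming it is saved as dnn.py next to your script, with Theano and scikit-learn installed; the dataset, layer sizes, and epoch count below are only illustrative, mirroring what the __main__ block above does):

    # hypothetical usage sketch, not part of the original gist
    import numpy
    from sklearn import datasets, cross_validation, preprocessing
    from dnn import RegularizedNet, ReLU, LogisticRegression, add_fit_and_score

    add_fit_and_score(RegularizedNet)  # attach fit()/score() to the class

    digits = datasets.load_digits()
    X = preprocessing.scale(numpy.asarray(digits.data, dtype='float32'))
    y = numpy.asarray(digits.target, dtype='int32')
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.2, random_state=42)

    net = RegularizedNet(numpy_rng=numpy.random.RandomState(123),
                         n_ins=X.shape[1],
                         layers_types=[ReLU, ReLU, LogisticRegression],
                         layers_sizes=[200, 200],
                         n_outs=len(set(y)),
                         L2_reg=1. / x_train.shape[0])
    net.fit(x_train, y_train, max_epochs=20, method='adadelta', verbose=True)
    print("test error: %f" % net.score(x_test, y_test))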
On Windows 7, 64 bit, Python 3.4:

Traceback (most recent call last):
  File "E:\Anaconda3\lib\site-packages\theano\gof\lazylinker_c.py", line 59, in <module>
    raise ImportError()
ImportError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:\Anaconda3\lib\site-packages\theano\gof\lazylinker_c.py", line 76, in <module>
    raise ImportError()
ImportError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:\Dropbox\Dropbox\BioInformatics Lab\AdaHERF_ML-master\ModdedScripts\dnn.py", line 7, in <module>
    import numpy, theano, sys, math
  File "E:\Anaconda3\lib\site-packages\theano\__init__.py", line 55, in <module>
    from theano.compile import
  File "E:\Anaconda3\lib\site-packages\theano\compile\__init__.py", line 6, in <module>
    from theano.compile.function_module import *
  File "E:\Anaconda3\lib\site-packages\theano\compile\function_module.py", line 18, in <module>
    import theano.compile.mode
  File "E:\Anaconda3\lib\site-packages\theano\compile\mode.py", line 11, in <module>
    import theano.gof.vm
  File "E:\Anaconda3\lib\site-packages\theano\gof\vm.py", line 516, in <module>
    from . import lazylinker_c
  File "E:\Anaconda3\lib\site-packages\theano\gof\lazylinker_c.py", line 85, in <module>
    args = cmodule.GCC_compiler.compile_args()
  File "E:\Anaconda3\lib\site-packages\theano\gof\cmodule.py", line 1603, in compile_args
    native_lines = get_lines("g++ -march=native -E -v -")
  File "E:\Anaconda3\lib\site-packages\theano\gof\cmodule.py", line 1577, in get_lines
    (stdout, stderr) = p.communicate(input='')
  File "E:\Anaconda3\lib\subprocess.py", line 959, in communicate
    stdout, stderr = self._communicate(input, endtime, timeout)
  File "E:\Anaconda3\lib\subprocess.py", line 1195, in _communicate
    self.stdin.write(input)
TypeError: 'str' does not support the buffer interface

[Finished in 6.6s with exit code 1]
[shell_cmd: python -u "E:\Dropbox\Dropbox\BioInformatics Lab\AdaHERF_ML-master\ModdedScripts\dnn.py"]
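
For context, a minimal sketch of what triggers that TypeError (assuming Python 3 and g++ on the PATH): Theano 0.6's cmodule passes a str to a binary subprocess pipe, which Python 3 rejects. This only illustrates the str-vs-bytes mismatch; it is not a patch for Theano.

    import subprocess

    # same compiler probe as theano/gof/cmodule.py, with binary pipes
    p = subprocess.Popen("g++ -march=native -E -v -".split(),
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    # p.communicate(input='')              # Python 3: TypeError (str vs. bytes)
    stdout, stderr = p.communicate(input=b'')  # bytes (or universal_newlines=True) works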
In order to make the code work on OS X, I substituted
lr=1.E-2
with
lr=numpy.asarray(1.E-2, dtype='float32')
On OSX,
./dnn.py
Total dataset size:
n samples: 70000
n features: 784
n classes: 10
Dropout DNN
ReLU_784x200_ReLU_200x200_LogisticRegression_200x10
dropout rates: [0.0, 0.5, 0.5] using sgd
Traceback (most recent call last):
  File "./dnn.py", line 684, in <module>
    name='MNIST')
  File "./dnn.py", line 647, in train_models
    dnn.fit(x_train, y_train, max_epochs=n_epochs, method=method, verbose=VERBOSE, plot=PLOT)
  File "./dnn.py", line 490, in fit
    avg_cost = train_fn(x, y, lr=1.E-2) # TODO: you have to
  File "/Library/Python/2.7/site-packages/Theano-0.6.0-py2.7.egg/theano/compile/function_module.py", line 516, in __call__
    self[k] = arg
  File "/Library/Python/2.7/site-packages/Theano-0.6.0-py2.7.egg/theano/compile/function_module.py", line 452, in __setitem__
    self.value[item] = value
  File "/Library/Python/2.7/site-packages/Theano-0.6.0-py2.7.egg/theano/compile/function_module.py", line 415, in __setitem__
    s.value = value
  File "/Library/Python/2.7/site-packages/Theano-0.6.0-py2.7.egg/theano/gof/link.py", line 278, in __set__
    self.storage[0] = self.type.filter(value, **kwargs)
  File "/Library/Python/2.7/site-packages/Theano-0.6.0-py2.7.egg/theano/tensor/type.py", line 152, in filter
    raise TypeError(err_msg, data)
TypeError: ('TensorType(float32, scalar) cannot store accurately value 0.01, it would be represented as 0.00999999977648. If you do not mind this precision loss, you can: 1) explicitly convert your data to a numpy array of dtype float32, or 2) set "allow_input_downcast=True" when calling "function".', 0.01, 'Container name "lr"')
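
Both workarounds suggested by that error message map directly onto the gist: either pass the learning rate already cast to float32 (the substitution described above), or compile the learning-rate trainers with allow_input_downcast=True. A sketch (the second variant assumes you edit get_SGD_trainer() and get_adagrad_trainer()):

    # Option 1: in fit(), pass the learning rate as a float32 scalar
    avg_cost = train_fn(x, y, lr=numpy.asarray(1.E-2, dtype='float32'))

    # Option 2: in get_SGD_trainer() / get_adagrad_trainer(), let Theano downcast
    train_fn = theano.function(inputs=[theano.Param(batch_x),
                                       theano.Param(batch_y),
                                       theano.Param(learning_rate)],
                               outputs=self.mean_cost,
                               updates=updates,
                               givens={self.x: batch_x, self.y: batch_y},
                               allow_input_downcast=True)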