import numpy

import theano
import theano.tensor as T


def shared_dataset(data_xy):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on GPU).
    Since copying data into the GPU is slow, copying a minibatch every
    time it is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))
    # When storing data on the GPU it has to be stored as floats,
    # therefore we will store the labels as ``floatX`` as well
    # (``shared_y`` does exactly that). But during our computations
    # we need them as ints (we use labels as indices, and if they are
    # floats it doesn't make sense), therefore instead of returning
    # ``shared_y`` we will have to cast it to int. This little hack
    # lets us get around this issue.
    return shared_x, T.cast(shared_y, 'int32')
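As a usage sketch, the helper is typically called once per data split and the resulting shared variables are then sliced symbolically; the dummy arrays and the `batch_size` below are illustrative assumptions standing in for the unpickled dataset:

rng = numpy.random.RandomState(0)
# hypothetical stand-ins for the (inputs, targets) tuples of the real dataset
train_set = (rng.rand(1500, 784), rng.randint(0, 10, size=1500))
valid_set = (rng.rand(500, 784), rng.randint(0, 10, size=500))
test_set = (rng.rand(500, 784), rng.randint(0, 10, size=500))

test_set_x, test_set_y = shared_dataset(test_set)
valid_set_x, valid_set_y = shared_dataset(valid_set)
train_set_x, train_set_y = shared_dataset(train_set)

batch_size = 500    # size of the minibatch (illustrative)

# symbolically slicing out the third minibatch of the training set
data = train_set_x[2 * batch_size: 3 * batch_size]
label = train_set_y[2 * batch_size: 3 * batch_size]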
A Primer on Supervised Optimization for Deep Learning
Learning a Classifier
Zero-One Loss
# zero_one_loss is a Theano variable representing a symbolic
# expression of the zero-one loss; to get the actual value this
# symbolic expression has to be compiled into a Theano function (see
# the Theano tutorial for more details)
zero_one_loss = T.sum(T.neq(T.argmax(p_y_given_x, axis=1), y))
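For instance, compiling this expression into an ordinary Python callable might look like the sketch below; the softmax model, its parameters `W` and `b`, and the input dimensions are illustrative assumptions, not part of the code above:

import numpy
import theano
import theano.tensor as T

x = T.matrix('x')    # minibatch of inputs, one example per row
y = T.ivector('y')   # corresponding integer labels

# hypothetical softmax model: 784-dimensional inputs, 10 classes
W = theano.shared(numpy.zeros((784, 10), dtype=theano.config.floatX), name='W')
b = theano.shared(numpy.zeros(10, dtype=theano.config.floatX), name='b')
p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)

zero_one_loss = T.sum(T.neq(T.argmax(p_y_given_x, axis=1), y))

# compiling the symbolic expression yields a callable that returns the count of errors
errors = theano.function([x, y], zero_one_loss)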
# NLL is a symbolic variable; to get the actual value of NLL, this symbolic
# expression has to be compiled into a Theano function (see the Theano
# tutorial for more details)
NLL = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])

# note on syntax: T.arange(y.shape[0]) is a vector of integers [0, 1, 2, ..., len(y)-1].
# Indexing a matrix M by the two vectors [0, 1, ..., K], [a, b, ..., k] returns the
# elements M[0, a], M[1, b], ..., M[K, k] as a vector. Here, we use this
# syntax to retrieve the log-probability of the correct labels, y.
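A small NumPy illustration of this indexing pattern (Theano's symbolic advanced indexing follows the same semantics); the matrix and labels below are made up for the example:

import numpy

M = numpy.array([[0.1, 0.7, 0.2],
                 [0.3, 0.3, 0.4],
                 [0.8, 0.1, 0.1]])
y = numpy.array([1, 2, 0])

# picks out M[0, 1], M[1, 2], M[2, 0]: the entry for the correct label in each row
print(M[numpy.arange(y.shape[0]), y])    # prints [ 0.7  0.4  0.8]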
# STOCHASTIC GRADIENT DESCENT
for (x_i, y_i) in training_set:
    # imagine an infinite generator
    # that may repeat examples (if there is only a finite training set)
    loss = f(params, x_i, y_i)
    d_loss_wrt_params = ...  # compute gradient
    params -= learning_rate * d_loss_wrt_params
    if <stopping condition is met>:
        return params
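To make the skeleton concrete, here is a small, self-contained NumPy version for a linear least-squares model; the data stream, the squared-error loss and the fixed number of examples are illustrative stand-ins for `training_set`, `f` and the stopping condition:

import numpy

rng = numpy.random.RandomState(0)
true_w = numpy.array([2.0, -3.0])   # hypothetical target parameters
params = numpy.zeros(2)
learning_rate = 0.05

def training_stream(n_examples=2000):
    # hypothetical generator standing in for training_set
    for _ in xrange(n_examples):
        x_i = rng.randn(2)
        y_i = numpy.dot(true_w, x_i)
        yield x_i, y_i

for (x_i, y_i) in training_stream():
    # squared-error loss of a linear model plays the role of f(params, x_i, y_i)
    err = numpy.dot(params, x_i) - y_i
    d_loss_wrt_params = 2.0 * err * x_i          # gradient of the loss
    params -= learning_rate * d_loss_wrt_params  # one SGD step per example

print(params)   # after enough examples this approaches true_w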
for (x_batch, y_batch) in train_batches:
    # imagine an infinite generator
    # that may repeat examples
    loss = f(params, x_batch, y_batch)
    d_loss_wrt_params = ...  # compute gradient using theano
    params -= learning_rate * d_loss_wrt_params
    if <stopping condition is met>:
        return params
# Minibatch Stochastic Gradient Descent

# assume loss is a symbolic description of the loss function given
# the symbolic variables params (shared variable), x_batch, y_batch;

# compute gradient of loss with respect to params
d_loss_wrt_params = T.grad(loss, params)

# compile the MSGD step into a theano function
updates = [(params, params - learning_rate * d_loss_wrt_params)]
MSGD = theano.function([x_batch, y_batch], loss, updates=updates)

for (x_batch, y_batch) in train_batches:
    # here x_batch and y_batch are elements of train_batches and
    # therefore numpy arrays; function MSGD also updates the params
    print('Current loss is ', MSGD(x_batch, y_batch))
    if stopping_condition_is_met:
        return params
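Putting the pieces together, the following is a self-contained sketch of the snippet above for a hypothetical softmax classifier; the dataset, the dimensions and the learning rate are illustrative assumptions:

import numpy
import theano
import theano.tensor as T

learning_rate = 0.1          # illustrative value
n_in, n_out = 784, 10        # illustrative dimensions

x_batch = T.matrix('x_batch')    # symbolic minibatch of inputs
y_batch = T.ivector('y_batch')   # symbolic minibatch of integer labels

# a single shared weight matrix plays the role of `params`
params = theano.shared(numpy.zeros((n_in, n_out), dtype=theano.config.floatX),
                       name='params')

p_y_given_x = T.nnet.softmax(T.dot(x_batch, params))
loss = -T.mean(T.log(p_y_given_x)[T.arange(y_batch.shape[0]), y_batch])

d_loss_wrt_params = T.grad(loss, params)
updates = [(params, params - learning_rate * d_loss_wrt_params)]
MSGD = theano.function([x_batch, y_batch], loss, updates=updates)

# hypothetical stream of (inputs, labels) numpy minibatches
rng = numpy.random.RandomState(0)
train_batches = [(rng.rand(20, n_in).astype(theano.config.floatX),
                  rng.randint(0, n_out, size=20).astype('int32'))
                 for _ in xrange(5)]

for (xb, yb) in train_batches:
    print('Current loss is ', MSGD(xb, yb))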
# symbolic Theano variable that represents the L1 regularization term
L1 = T.sum(abs(param))
# symbolic Theano variable that represents the squared L2 term
L2_sqr = T.sum(param ** 2)

# the loss
loss = NLL + lambda_1 * L1 + lambda_2 * L2_sqr
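In a model with more than one weight matrix, the penalties are usually summed over all of them. Below is a sketch under the assumption of two hypothetical weight matrices `W1` and `W2`; the scalar `NLL` placeholder stands for the negative log-likelihood expression defined earlier:

import numpy
import theano
import theano.tensor as T

# hypothetical weight matrices of a two-layer model
W1 = theano.shared(numpy.zeros((784, 500), dtype=theano.config.floatX), name='W1')
W2 = theano.shared(numpy.zeros((500, 10), dtype=theano.config.floatX), name='W2')

# illustrative regularization strengths
lambda_1, lambda_2 = 0.001, 0.0001

# penalties summed over every weight matrix of the model
L1 = T.sum(abs(W1)) + T.sum(abs(W2))
L2_sqr = T.sum(W1 ** 2) + T.sum(W2 ** 2)

# placeholder for the negative log-likelihood defined earlier
NLL = T.scalar('NLL')

loss = NLL + lambda_1 * L1 + lambda_2 * L2_sqr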
# early-stopping parameters
patience = 5000                # look at this many examples regardless
patience_increase = 2          # wait this much longer when a new best is
                               # found
improvement_threshold = 0.995  # a relative improvement of this much is
                               # considered significant
validation_frequency = min(n_train_batches, patience / 2)
                               # go through this many
                               # minibatches before checking the network
                               # on the validation set; in this case we
                               # check every epoch

best_params = None
best_validation_loss = numpy.inf
test_score = 0.
start_time = time.clock()

done_looping = False
epoch = 0
while (epoch < n_epochs) and (not done_looping):
    # Report "1" for first epoch, "n_epochs" for last epoch
    epoch = epoch + 1
    for minibatch_index in xrange(n_train_batches):

        d_loss_wrt_params = ...  # compute gradient
        params -= learning_rate * d_loss_wrt_params  # gradient descent

        # iteration number. We want it to start at 0.
        iter = (epoch - 1) * n_train_batches + minibatch_index
        # note that if we do `iter % validation_frequency` it will be
        # true for iter = 0 which we do not want. We want it true for
        # iter = validation_frequency - 1.
        if (iter + 1) % validation_frequency == 0:

            this_validation_loss = ...  # compute zero-one loss on validation set

            if this_validation_loss < best_validation_loss:

                # improve patience if loss improvement is good enough
                if this_validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)

                best_params = copy.deepcopy(params)
                best_validation_loss = this_validation_loss

        if patience <= iter:
            done_looping = True
            break

# POSTCONDITION:
# best_params refers to the best out-of-sample parameters observed during the optimization
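One common way to fill in the elided validation step is to average a compiled zero-one-loss function over all validation minibatches; the `validate_model` stub below is a hypothetical stand-in for such a compiled Theano function:

import numpy

def validate_model(i):
    # hypothetical stand-in for a compiled Theano function that returns the
    # mean zero-one loss of the i-th validation minibatch
    return [0.12, 0.10, 0.11][i]

n_valid_batches = 3
validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
this_validation_loss = numpy.mean(validation_losses)
print(this_validation_loss)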
>>> import cPickle
>>> save_file = open('path', 'wb')  # this will overwrite current contents
>>> cPickle.dump(w.get_value(borrow=True), save_file, -1)  # the -1 is for HIGHEST_PROTOCOL
>>> cPickle.dump(v.get_value(borrow=True), save_file, -1)  # .. and it triggers much more efficient
>>> cPickle.dump(u.get_value(borrow=True), save_file, -1)  # .. storage than numpy's default
>>> save_file.close()
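Loading the values back later might look like the following sketch, assuming the rebuilt model exposes the same shared variables `w`, `v` and `u`:

>>> save_file = open('path', 'rb')
>>> w.set_value(cPickle.load(save_file), borrow=True)
>>> v.set_value(cPickle.load(save_file), borrow=True)
>>> u.set_value(cPickle.load(save_file), borrow=True)
>>> save_file.close()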