CHAPTER 1
(http://nnadl-ja.github.io/nnadl_site_ja/chap1.html)
In this chapter we'll write a computer program implementing a neural network that learns to recognize handwritten digits. The program is just 74 lines long and uses no special neural network libraries, yet it recognizes the digits with better than 96 percent accuracy, without human intervention. In later chapters we'll develop ideas which improve the accuracy to better than 99 percent.
Perceptrons were developed in the 1950s and 1960s by the scientist Frank Rosenblatt, inspired by earlier work by Warren McCulloch and Walter Pitts.
So how do perceptrons work? A perceptron takes several binary inputs x_1, x_2, ... and produces a single binary output. Rosenblatt proposed a simple rule to compute the output: the neuron has real-valued weights w_1, w_2, ..., expressing the importance of the respective inputs to the output, and the output, 0 or 1, is determined by whether the weighted sum \sum_j w_j x_j is less than or greater than some threshold value:

\[ \mbox{output} = \begin{cases} 0 & \mbox{if } \sum_j w_j x_j \leq \mbox{threshold} \\ 1 & \mbox{if } \sum_j w_j x_j > \mbox{threshold}. \end{cases} \tag{1} \]

That's all there is to how a perceptron works.
To make this concrete, suppose the weekend is coming up and you're deciding whether to go to a cheese festival. You might weigh three factors:

1. Is the weather good?
2. Does your partner want to accompany you?
3. Is the festival near public transit?

Represent these by binary variables x_1, x_2, x_3, each 1 for yes and 0 for no. If you really love cheese but hate bad weather, you might choose weight w_1 = 6 for the weather, and w_2 = 2, w_3 = 2 for the other factors, with a threshold of 5. With these choices the perceptron outputs 1 whenever the weather is good and 0 whenever it is bad, regardless of the other factors. Dropping the threshold to 3 instead would make the perceptron also go when the festival is near transit and your partner wants to join you, even if the weather is bad.
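To make the rule concrete, here is a minimal sketch (added for illustration, not part of the original program) of this decision rule in plain Python, using the weights and threshold from the example above:

def perceptron_output(x, weights, threshold):
    # Return 1 if the weighted sum of the inputs exceeds the threshold, else 0.
    weighted_sum = sum(w * xi for w, xi in zip(weights, x))
    return 1 if weighted_sum > threshold else 0

weights = [6, 2, 2]   # the weather carries the most weight
threshold = 5
print(perceptron_output([1, 0, 0], weights, threshold))  # good weather alone -> 1
print(perceptron_output([0, 1, 1], weights, threshold))  # bad weather -> 0

By varying the weights and the threshold, we get different models of decision-making.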
Let's simplify the way we describe perceptrons. The condition \sum_j w_j x_j > threshold is cumbersome, so we make two notational changes. First, write the sum as a dot product, w \cdot x \equiv \sum_j w_j x_j, where w and x are vectors whose components are the weights and inputs. Second, move the threshold to the other side of the inequality and replace it by the perceptron's bias, b \equiv -threshold. The perceptron rule then becomes

\[ \mbox{output} = \begin{cases} 0 & \mbox{if } w \cdot x + b \leq 0 \\ 1 & \mbox{if } w \cdot x + b > 0. \end{cases} \tag{2} \]
You can think of the bias as a measure of how easy it is to get the perceptron to output a 1: a large positive bias makes it easy, a very negative bias makes it hard. Perceptrons can also compute elementary logical functions such as AND, OR, and NAND. For example, take a perceptron with two inputs, each with weight -2, and an overall bias of 3. Input 00 gives output 1, since (-2)·0 + (-2)·0 + 3 = 3 is positive; inputs 01 and 10 also give 1; but input 11 gives 0, since (-2)·1 + (-2)·1 + 3 = -1 is negative. So our perceptron implements a NAND gate. Since the NAND gate is universal for computation, it follows that networks of perceptrons can compute any logical function; for instance, a small circuit of NAND gates adds two bits x_1 and x_2, computing the bitwise sum x_1 XOR x_2 and the carry bit x_1 AND x_2, and exactly the same circuit can be built from perceptrons with weight -2 on each input and bias 3.
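As a quick check (an added sketch, not from the original text), the following few lines verify the NAND behaviour of the perceptron with weights -2, -2 and bias 3:

def nand_perceptron(x1, x2):
    # Perceptron with w = (-2, -2) and b = 3.
    return 1 if (-2)*x1 + (-2)*x2 + 3 > 0 else 0

for x1 in (0, 1):
    for x2 in (0, 1):
        print(x1, x2, nand_perceptron(x1, x2))
# Prints 0 0 1, 0 1 1, 1 0 1, 1 1 0 -- the NAND truth table.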
-2 -4 -2 3-4
x1 x2
j wjxj 0 b > 01 b 00 (x1 ) x1, x2,
NAND NAND
NAND
NAND
9 8
9
01 999
x1, x2, 0
1010.638 (w1, w2, )(b)
01 (w x + b)*
Like a perceptron, the sigmoid neuron has inputs x_1, x_2, ..., weights w_1, w_2, ..., and an overall bias b, but the inputs and the output can take any value between 0 and 1. The output is \sigma(w \cdot x + b), where \sigma is the sigmoid function*, defined by

\[ \sigma(z) \equiv \frac{1}{1 + e^{-z}}. \tag{3} \]

*Incidentally, \sigma is sometimes called the logistic function, and this class of neurons called logistic neurons.

Written out more explicitly, the output of a sigmoid neuron with inputs x_1, x_2, ..., weights w_1, w_2, ..., and bias b is

\[ \frac{1}{1 + \exp\left( -\sum_j w_j x_j - b \right)}. \tag{4} \]
z w x + b ez 0(z) 1 z = w x + b1
z = w x + bez
(z) 0 z = w x + b w x + b
[Figure: the sigmoid function \sigma(z), a smooth S-shaped curve rising from 0 to 1 as z runs from -4 to 4.]

[Figure: for comparison, a step function, which jumps from 0 to 1 at z = 0.]
When w \cdot x + b is large and positive the sigmoid neuron's output is approximately 1, and when it is very negative the output is approximately 0; in between, the output varies smoothly. That smoothness is the crucial fact: it means that small changes \Delta w_j in the weights and \Delta b in the bias produce a small change \Delta output in the neuron's output. In fact, calculus tells us that \Delta output is well approximated by

\[ \Delta \mbox{output} \approx \sum_j \frac{\partial\, \mbox{output}}{\partial w_j} \Delta w_j + \frac{\partial\, \mbox{output}}{\partial b} \Delta b, \tag{5} \]

where the sum is over all the weights w_j, and \partial output/\partial w_j and \partial output/\partial b denote the partial derivatives of the output with respect to w_j and b. So \Delta output is a linear function of the changes \Delta w_j and \Delta b, and that makes it easy to choose small changes in the weights and biases to achieve any desired small change in the output.
wjb
output output
(3)? f()f(w x + b) (5)
How should we interpret the output of a sigmoid neuron? Unlike a perceptron, it doesn't output just 0 or 1; it can output any real number between 0 and 1, such as 0.173 or 0.689. This is sometimes useful in its own right, but when we want a definite decision we can adopt a convention: for example, interpret any output of at least 0.5 as indicating a "9", and any output less than 0.5 as indicating "not a 9".
Exercises

- Sigmoid neurons simulating perceptrons, part I: Suppose we take all the weights and biases in a network of perceptrons and multiply them by a positive constant c > 0. Show that the behaviour of the network doesn't change.

- Sigmoid neurons simulating perceptrons, part II: Suppose we have the same setup, a network of perceptrons, with the overall input x fixed, and suppose w \cdot x + b \neq 0 for every perceptron. Now replace all the perceptrons by sigmoid neurons and multiply all the weights and biases by a positive constant c > 0. Show that in the limit c \to \infty the behaviour of the sigmoid network is exactly the same as the network of perceptrons, and explain how this can fail when w \cdot x + b = 0 for one of the perceptrons.
(input layer)(input neurons)(output layer)(output
neurons)1(hidden
layer)""
""1 42
(multilayer perceptrons)MLPs
964 644, 096 = 64 64011
0.5"9"0.5" 9"
(feedforward neural
networks)
(recurrent neural networks)
2 1
6
(segmentation problem)
5
21
3
28 2828 28 = 784 0.01.0
nnn = 15
10( 1)01 09 66
10 (0, 1, 2, , 9)4 014 24 = 161010 10410 4-10-
10 0
4 0
4 0 0()00
10 4 4
4
3 3 3()0.99 0.01
We'll use the MNIST data set. The name comes from the fact that it is a modified subset of two data sets collected by NIST, the United States' National Institute of Standards and Technology.
beginning of this chapter
MNIST60,000250250Census Bureau282810,000
2828 250(Census
Bureau )
We'll use the notation x to denote a training input. It is convenient to regard each training input x as a 28 x 28 = 784-dimensional vector, each entry representing the gray value of a single pixel. The corresponding desired output is denoted y = y(x), a 10-dimensional vector. For example, if a particular training image x depicts a 6, then

y(x) = (0, 0, 0, 0, 0, 0, 1, 0, 0, 0)^T

is the desired output from the network, where T denotes the transpose operation, turning a row vector into a column vector.
x y(x) *:
*
\[ C(w, b) \equiv \frac{1}{2n} \sum_x \| y(x) - a \|^2. \tag{6} \]

Here, w denotes the collection of all weights in the network, b all the biases, n is the total number of training inputs, a is the vector of outputs from the network when x is input (and so depends on x, w, and b), and the sum is over all training inputs x.
MSE(mean squared
error)2 C(w, b) C(w, b) C(w, b) 0 y(x) C(w, b) 0 C(w, b) - y(x) C(w, b)
2 2 2
22
(6) 2
2(6)
2 C(w, b) MNIST
C(v)
C(v) v = v1 , v2 , w b v C(v) C v1 v2
C C
C C -10
( C "" C 2 )
() C ()
C
Suppose we change v_1 by a small amount \Delta v_1 and v_2 by a small amount \Delta v_2. Calculus tells us that C changes as follows:

\[ \Delta C \approx \frac{\partial C}{\partial v_1} \Delta v_1 + \frac{\partial C}{\partial v_2} \Delta v_2. \tag{7} \]

We'll define \Delta v to be the vector of changes, \Delta v \equiv (\Delta v_1, \Delta v_2)^T, where T is the transpose operation, and the gradient of C to be the vector of partial derivatives,

\[ \nabla C \equiv \left( \frac{\partial C}{\partial v_1}, \frac{\partial C}{\partial v_2} \right)^T. \tag{8} \]
With these definitions, the expression (7) for \Delta C can be rewritten as

\[ \Delta C \approx \nabla C \cdot \Delta v. \tag{9} \]

This suggests how to choose \Delta v so as to make \Delta C negative. In particular, suppose we choose

\[ \Delta v = -\eta \nabla C, \tag{10} \]

where \eta is a small positive parameter (the learning rate). Then (9) gives \Delta C \approx -\eta \nabla C \cdot \nabla C = -\eta \|\nabla C\|^2 \leq 0, so C is guaranteed to decrease. We use (10) over and over as the update rule for gradient descent:

\[ v \rightarrow v' = v - \eta \nabla C. \tag{11} \]
C - -
C ""
() v ""
(9) C > 0 v (9)
The same idea works when C is a function of many variables v_1, ..., v_m. A small change \Delta v = (\Delta v_1, ..., \Delta v_m)^T changes C by

\[ \Delta C \approx \nabla C \cdot \Delta v, \tag{12} \]

where the gradient \nabla C is the vector

\[ \nabla C \equiv \left( \frac{\partial C}{\partial v_1}, \ldots, \frac{\partial C}{\partial v_m} \right)^T. \tag{13} \]

Just as in the two-variable case, we choose

\[ \Delta v = -\eta \nabla C, \tag{14} \]

which guarantees that the approximate change (12) is negative. The update rule for gradient descent is then

\[ v \rightarrow v' = v - \eta \nabla C. \tag{15} \]
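To make the update rule (15) concrete, here is a small illustrative sketch (added, and not part of the book's code) applying it to the toy cost C(v) = v_1^2 + v_2^2, whose gradient (2 v_1, 2 v_2) we know analytically:

import numpy as np

def grad_C(v):
    # Gradient of C(v) = v1^2 + v2^2.
    return 2 * v

v = np.array([3.0, -4.0])     # arbitrary starting point
eta = 0.1                     # learning rate
for step in range(100):
    v = v - eta * grad_C(v)   # the rule v -> v' = v - eta * grad C
print(v)                      # very close to the minimum at (0, 0)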
v C -
C v C C v
v = > 0 C C v v v = C = /C v = C
:
C C
2 C/vjvk 100 1(1002)*
*1
2C/vjvk = 2C/vkvj
()
How do we apply gradient descent to a neural network? The "position" variables v_j are now the weights w_k and biases b_l, and the gradient \nabla C has components \partial C/\partial w_k and \partial C/\partial b_l. The gradient descent update rules become

\[ w_k \rightarrow w_k' = w_k - \eta \frac{\partial C}{\partial w_k} \tag{16} \]

\[ b_l \rightarrow b_l' = b_l - \eta \frac{\partial C}{\partial b_l}. \tag{17} \]
""
Notice that the cost function (6) has the form C = \frac{1}{n} \sum_x C_x, an average over per-example costs C_x \equiv \frac{\|y(x) - a\|^2}{2}. In practice, computing the gradient \nabla C requires computing \nabla C_x separately for every training input x and then averaging, which is slow when there are many inputs. Stochastic gradient descent speeds this up by estimating \nabla C from a small random sample. We pick out a randomly chosen mini-batch of m training inputs X_1, X_2, ..., X_m; provided the sample size m is large enough,

\[ \frac{\sum_{j=1}^{m} \nabla C_{X_j}}{m} \approx \frac{\sum_x \nabla C_x}{n} = \nabla C, \tag{18} \]

that is,

\[ \nabla C \approx \frac{1}{m} \sum_{j=1}^{m} \nabla C_{X_j}, \tag{19} \]

so the overall gradient can be estimated by computing gradients just for the randomly chosen mini-batch.
Applying this to neural network learning, stochastic gradient descent updates the weights and biases according to

\[ w_k \rightarrow w_k' = w_k - \frac{\eta}{m} \sum_j \frac{\partial C_{X_j}}{\partial w_k} \tag{20} \]

\[ b_l \rightarrow b_l' = b_l - \frac{\eta}{m} \sum_j \frac{\partial C_{X_j}}{\partial b_l}, \tag{21} \]

where the sums are over all the training examples X_j in the current mini-batch. Then we pick out another randomly chosen mini-batch and train with that, and so on, until we've exhausted the training inputs, which is said to complete an epoch of training; at that point we start over with a new training epoch.
(6)
1
n
1
n
() (20) (21) 1
m
To see why this speeds things up: for MNIST the training set has size n = 60,000, and choosing a mini-batch size of (say) m = 10 means we get a factor of 6,000 speedup in estimating the gradient, compared with summing over the full training set. The estimate isn't perfect, but it doesn't need to be perfect - all we really care about is moving in a direction that decreases C.
1 x
(k) wk w = wk Cx/wk
(l)bl b = bl Cx/bl ( ) 20
C - - "" " ()"""
C C ()
MNIST MNISTgit
git clone https://github.com/mnielsen/neural-networks-and-deep-learning.git
git
MNISTMNIST
60,000 10,000 60,00050,000 10,000 MNIST
MNIST MNIST60,000 50,000 *
MNISTNumpyPython Numpy
Network
Network
class Network():

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

*As noted earlier, the MNIST data is a modified form of data collected by NIST and prepared by Yann LeCun, Corinna Cortes, and Christopher J. C. Burges; the Python code used to load it is based on code from the LISA machine learning laboratory (link).
In this code, the list sizes contains the number of neurons in the respective layers. So, for example, to create a Network object with 2 neurons in the first layer, 3 neurons in the second layer, and 1 neuron in the final layer, we'd write

net = Network([2, 3, 1])

The biases and weights in the Network object are initialized randomly, using Numpy's np.random.randn function to generate Gaussian distributions with mean 0 and standard deviation 1. Note that net.weights[1] is the Numpy matrix storing the weights connecting the second and third layers of neurons (Python lists are indexed starting from 0). Denoting that matrix w, the entry w_jk is the weight for the connection between the k-th neuron in the second layer and the j-th neuron in the third layer. With this ordering of indices, the vector of activations of the third layer of neurons is

\[ a' = \sigma(w a + b), \tag{22} \]

where a is the vector of activations of the second layer, b is the vector of biases, and \sigma is applied elementwise (it is "vectorized"), just as in Equation (4).
Exercise

- Write out Equation (22) in component form, and verify that it gives the same result as the rule (4) for computing the output of a sigmoid neuron.
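As a small added sanity check (assuming the Network class above has been defined and numpy imported as np), we can inspect the shapes of the weight matrices and bias vectors for net = Network([2, 3, 1]) and apply one layer of Equation (22) by hand:

net = Network([2, 3, 1])
print([w.shape for w in net.weights])   # [(3, 2), (1, 3)]
print([b.shape for b in net.biases])    # [(3, 1), (1, 1)]

a = np.array([[0.5], [0.8]])            # activations of the 2-neuron first layer
z = np.dot(net.weights[0], a) + net.biases[0]
a_next = 1.0/(1.0 + np.exp(-z))         # a' = sigmoid(w a + b), Equation (22)
print(a_next.shape)                     # (3, 1)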
Network Numpy
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
sigmoid_vec = np.vectorize(sigmoid)
Next we add a feedforward method to the Network class which, given an input a, returns the corresponding output of the network by applying Equation (22) layer by layer:
def feedforward(self, a):
        """Return the output of the network if "a" is input."""
        for b, w in zip(self.biases, self.weights):
            a = sigmoid_vec(np.dot(w, a)+b)
        return a
Network (SGD)
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
"""Train the neural network using mini-batch stochastic gradient descent. The "training_data" is a list of tuples "(x, y)" representing the training inputs and the desired outputs. The other non-optional parameters are
self-explanatory. If "test_data" is provided then the network will be evaluated against the test data after each epoch, and partial progress printed out. This is useful for tracking progress, but slows things down substantially.""" if test_data: n_test = len(test_data)
n = len(training_data)
for j in xrange(epochs): random.shuffle(training_data) mini_batches = [
training_data[k:k+mini_batch_size]
for k in xrange(0, n, mini_batch_size)]
for mini_batch in mini_batches: self.update_mini_batch(mini_batch, eta)
if test_data:
print "Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test)
else:
print "Epoch {0} complete".format(j)
training_data(x, y)epochsmini_batch_size eta test_data
1
self.update_mini_batch(mini_batch, eta) update_mini_batch
def update_mini_batch(self, mini_batch, eta):
"""Update the network's weights and biases by applying gradient descent using backpropagation to a single mini batch. The "mini_batch" is a list of tuples "(x, y)", and "eta"
is the learning rate."""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)] self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
backpropagation
update_mini_batch self.weightsself.biases
self.backpropself.backprop x
self.backpropself.SGD
self.update_mini_batch self.backprop sigmoid_primesigmoid_prime_vecself.cost_derivative
74 GitHub
"""
network.py
~~~~~~~~~~
A module to implement the stochastic gradient descent learning algorithm for a feedforward neural network. Gradients are calculated using backpropagation. Note that I have focused on making the code simple, easily readable, and easily modifiable. It is not optimized, and omits many desirable features.
"""
#### Libraries
# Standard library
import random
# Third-party libraries
import numpy as np
class Network():

    def __init__(self, sizes):
        """The list ``sizes`` contains the number of neurons in the respective layers of the network. For example, if the list was [2, 3, 1] then it would be a three-layer network, with the first layer containing 2 neurons, the second layer 3 neurons, and the third layer 1 neuron. The biases and weights for the network are initialized randomly, using a Gaussian distribution with mean 0, and variance 1. Note that the first layer is assumed to be an input layer, and by convention we won't set any biases for those neurons, since biases are only ever used in computing the outputs from later layers."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]
def feedforward(self, a):
"""Return the output of the network if ``a`` is input.""" for b, w in zip(self.biases, self.weights):
a = sigmoid_vec(np.dot(w, a)+b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
"""Train the neural network using mini-batch stochastic gradient descent. The ``training_data`` is a list of tuples
``(x, y)`` representing the training inputs and the desired outputs. The other non-optional parameters are
self-explanatory. If ``test_data`` is provided then the network will be evaluated against the test data after each epoch, and partial progress printed out. This is useful for tracking progress, but slows things down substantially."""
        if test_data: n_test = len(test_data)
        n = len(training_data)
        for j in xrange(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in xrange(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
if test_data:
print "Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test)
else:
print "Epoch {0} complete".format(j)
def update_mini_batch(self, mini_batch, eta):
"""Update the network's weights and biases by applying gradient descent using backpropagation to a single mini batch. The ``mini_batch`` is a list of tuples ``(x, y)``, and ``eta`` is the learning rate."""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)] self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w)`` representing the gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] # feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b zs.append(z)
activation = sigmoid_vec(z) activations.append(activation)
# backward pass
delta = self.cost_derivative(activations[-1], y) * \ sigmoid_prime_vec(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# Note that the variable l in the loop below is used a little # differently to the notation in Chapter 2 of the book. Here, # l = 1 means the last layer of neurons, l = 2 is the
# second-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact # that Python can use negative indices in lists.
for l in xrange(2, self.num_layers): z = zs[-l]
spv = sigmoid_prime_vec(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * spv
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
"""Return the number of test inputs for which the neural network outputs the correct result. Note that the neural network's output is assumed to be the index of whichever neuron in the final layer has the highest activation.""" test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations.""" return (output_activations-y)
#### Miscellaneous functions
def sigmoid(z):
    """The sigmoid function."""
    return 1.0/(1.0+np.exp(-z))
sigmoid_vec = np.vectorize(sigmoid)
def sigmoid_prime(z):
"""Derivative of the sigmoid function.""" return sigmoid(z)*(1-sigmoid(z))
sigmoid_prime_vec = np.vectorize(sigmoid_prime)
MNIST mnist_loader.pypython
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
python python
MNIST30Network networkpython
>>> import network
>>> net = network.Network([784, 30, 10])
3010 = 3.0MNIST
training_data
>>> net.SGD(training_data, 30, 10, 3.0, test_data=test_data)
20141 python web Javascript 100009129
Epoch 0: 9129 / 10000
Epoch 1: 9295 / 10000
Epoch 2: 9348 / 10000
...
Epoch 27: 9528 / 10000
Epoch 28: 9542 / 10000
Epoch 29: 9534 / 10000
95%2895.42% 3
100
>>> net = network.Network([784, 100, 10])
>>> net.SGD(training_data, 30, 10, 3.0, test_data=test_data)
96.59%
**
= 0.001
>>> net = network.Network([784, 100, 10])
>>> net.SGD(training_data, 30, 10, 0.001, test_data=test_data)
Epoch 0: 1139 / 10000
Epoch 1: 1136 / 10000
Epoch 2: 1135 / 10000
...
Epoch 27: 2101 / 10000
Epoch 28: 2123 / 10000
Epoch 29: 2142 / 10000
= 0.01
= 1.0
30
= 100.0
>>> net = network.Network([784, 30, 10])
>>> net.SGD(training_data, 30, 10, 100.0, test_data=test_data)
Epoch 0: 1009 / 10000
Epoch 1: 1009 / 10000
Epoch 2: 1009 / 10000
Epoch 3: 1009 / 10000
...
Epoch 27: 982 / 10000
3
Epoch 28: 982 / 10000
Epoch 29: 982 / 10000
Exercise
2 78410
MNIST
MNIST Numpyndarrayndarray
"""
mnist_loader
~~~~~~~~~~~~
A library to load the MNIST image data. For details of the data structures that are returned, see the doc strings for ``load_data``
and ``load_data_wrapper``. In practice, ``load_data_wrapper`` is the function usually called by our neural network code.
"""
#### Libraries
# Standard library
import cPickle
import gzip

# Third-party libraries
import numpy as np
def load_data():
"""Return the MNIST data as a tuple containing the training data, the validation data, and the test data.
The ``training_data`` is returned as a tuple with two entries. The first entry contains the actual training images. This is a numpy ndarray with 50,000 entries. Each entry is, in turn, a numpy ndarray with 784 values, representing the 28 * 28 = 784 pixels in a single MNIST image.
The second entry in the ``training_data`` tuple is a numpy ndarray containing 50,000 entries. Those entries are just the digit values (0...9) for the corresponding images contained in the first entry of the tuple.
The ``validation_data`` and ``test_data`` are similar, except each contains only 10,000 images.
This is a nice data format, but for use in neural networks it's helpful to modify the format of the ``training_data`` a little. That's done in the wrapper function ``load_data_wrapper()``, see below.
"""
f = gzip.open('../data/mnist.pkl.gz', 'rb')
training_data, validation_data, test_data = cPickle.load(f) f.close()
return (training_data, validation_data, test_data)
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data, test_data)``. Based on ``load_data``, but the format is more convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray containing the input image. ``y`` is a 10-dimensional numpy.ndarray representing the unit vector corresponding to the correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000 2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional numpy.ndarry containing the input image, and ``y`` is the corresponding classification, i.e., the digit values (integers) corresponding to ``x``.
Obviously, this means we're using slightly different formats for the training data and the validation / test data. These formats turn out to be the most convenient for use in our neural network code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]] training_results = [vectorized_result(y) for y in tr_d[1]] training_data = zip(training_inputs, training_results) validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]] validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]] test_data = zip(test_inputs, te_d[1])
return (training_data, validation_data, test_data)
def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the jth position and zeroes elsewhere. This is used to convert a digit (0...9) into a corresponding desired output from the neural network."""
e = np.zeros((10, 1)) e[j] = 1.0
return e
10%
2
0, 1, 2, , 9 GitHubGitHub
repository10000222522.25%
2050% 50%
(SVM) SVMSVM LIBSVMSVMCpythonscikit-learn
scikit-learnSVM
100009435
SVM SVM
100009435scikit-learnSVM SVM Andreas
Mueller MuellerSVM98.5% SVM70
SVMMNIST 2014100009979 Li Wan Matthew Zeiler,Sixin
ZhangYann LeCun Rob Fergus MNIST
MNIST 1000021 MNIST Wan
+
Deep Learning
2030 (AI)
AIAI
Credits: 1. Ester Inbar. 2. Unknown. 3. NASA, ESA, G. Illingworth, D. Magee, and P. Oesch (University of California, Santa Cruz), R. Bouwens (Leiden University), and the HUDF09 Team. Click on the images for more details.
1
YESYESNO
19801990
2006 510
In academic work, please cite this book as: Michael A. Nielsen, "Neural Networks and Deep Learning", Determination Press, 2014
This work is licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported License. This means you're free to copy, share, and build on this book, but not to sell it. If you're interested in commercial use, please contact me.
Last update: Tue Sep 2 09:19:44 2014
CHAPTER 2
(http://nnadl-ja.github.io/nnadl_site_ja/chap2.html)
1970 David Rumelhart Geoffrey Hinton Ronald
Williams 1986
C wbC/w C/b
(w)l
jk
(l 1)kl
j 2432
jk jk
(j) (j) bl lj al lj
(j)ljal
(l 1) (4)
\[ a^l_j = \sigma\left( \sum_k w^l_{jk} a^{l-1}_k + b^l_j \right). \tag{23} \]
jjk kj
k
(l 1)k lwl wll
(jk) jkwl l
bl
(j) (j)bl l1 al al
To rewrite (23) in matrix form we use the idea of vectorizing a function: for a vector v, \sigma(v) denotes the vector whose components are \sigma(v)_j = \sigma(v_j). As an example, for the function f(x) = x^2,

\[ f\left( \begin{bmatrix} 2 \\ 3 \end{bmatrix} \right) = \begin{bmatrix} f(2) \\ f(3) \end{bmatrix} = \begin{bmatrix} 4 \\ 9 \end{bmatrix}, \tag{24} \]

that is, the vectorized f just squares each element of the vector.
\[ a^l = \sigma(w^l a^{l-1} + b^l). \tag{25} \]
(jk)*wl jk
(25)
When using Equation (25) to compute a^l, we compute the intermediate quantity z^l \equiv w^l a^{l-1} + b^l along the way. This quantity turns out to be useful enough to name: we call z^l the weighted input to the neurons in layer l, and Equation (25) is then sometimes written as a^l = \sigma(z^l). Note that z^l has components z^l_j = \sum_k w^l_{jk} a^{l-1}_k + b^l_j, the weighted input to the activation function of neuron j in layer l.
lj
2
wbC C/wC/b 2 2(6) 2
\[ C = \frac{1}{2n} \sum_x \| y(x) - a^L(x) \|^2, \tag{26} \]

where n is the total number of training examples, the sum is over individual training examples x, y = y(x) is the corresponding desired output, L is the number of layers in the network, and a^L = a^L(x) is the vector of activations output from the network when x is input.
C 1x
(n) (x)Cx C = 1 Cx 2
1
(2)Cx = 1 y aL2
Cx/wCx/b C/wC/bx1CxxC xx
2
21
x
\[ C = \frac{1}{2} \| y - a^L \|^2 = \frac{1}{2} \sum_j (y_j - a^L_j)^2. \tag{27} \]
(j)y y xy y
C aLy
We'll also need the Hadamard (elementwise) product. Suppose s and t are two vectors of the same dimension; then s \odot t denotes the vector whose components are (s \odot t)_j = s_j t_j. For example,

\[ \begin{bmatrix} 1 \\ 2 \end{bmatrix} \odot \begin{bmatrix} 3 \\ 4 \end{bmatrix} = \begin{bmatrix} 1 \cdot 3 \\ 2 \cdot 4 \end{bmatrix} = \begin{bmatrix} 3 \\ 8 \end{bmatrix}. \tag{28} \]
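In Numpy the Hadamard product is simply elementwise multiplication with the * operator; a one-line added example:

import numpy as np
print(np.array([1, 2]) * np.array([3, 4]))   # [3 8], i.e. the Hadamard product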
4
(jk) (j) C/wl C/bl
(j) l lj
l l C/wl
jjjk
(j)C/bl
lj zl (zl)
jj
(zl + zl)
jj
C zl
(j)z lj
zl C
(j)jz l
C zl
(j)z lj
C 0
(j)z l
We define the error \delta^l_j of neuron j in layer l by

\[ \delta^l_j \equiv \frac{\partial C}{\partial z^l_j}. \tag{29} \]

Backpropagation will give us a way of computing \delta^l for every layer, and of then relating those errors to the quantities of real interest, \partial C/\partial w^l_{jk} and \partial C/\partial b^l_j.
. l l
(jk)l C/wl
(j)C/bl
(j) (j)zl al C
jal
(j)l =
C *
(j)z l
*MNISTerror 96.0%
4 l 41
Python 4
L L
"error"4.0%
\[ \delta^L_j = \frac{\partial C}{\partial a^L_j} \, \sigma'(z^L_j). \tag{BP1} \]
(j)jaLj
(j) () 1C/aL j C j L
j
2(zL)zL
jj
(j) (j) (j) (BP1) zL(zL)2 1C/aL
(j)C/aL 2
For example, if we're using the quadratic cost, C = \frac{1}{2} \sum_j (y_j - a^L_j)^2, then \partial C/\partial a^L_j = (a^L_j - y_j), which is obviously easy to compute.
2jj
(BP1) L
\[ \delta^L = \nabla_a C \odot \sigma'(z^L). \tag{BP1a} \]
(j)aCC/aL aCC (BP1a) (BP1) (BP1) 2
aC = (aL y) (BP1)
\[ \delta^L = (a^L - y) \odot \sigma'(z^L). \tag{30} \]
Numpy
l l+1
\[ \delta^l = \left( (w^{l+1})^T \delta^{l+1} \right) \odot \sigma'(z^l), \tag{BP2} \]

where (w^{l+1})^T is the transpose of the weight matrix w^{l+1} for the (l+1)-th layer. Intuitively, applying the transpose weight matrix to the error \delta^{l+1} moves the error backward through the network, giving a measure of the error at the output of layer l; taking the Hadamard product with \sigma'(z^l) then moves the error backward through the activation function in layer l, giving the error \delta^l in the weighted input to layer l.
(BP2) (BP1) ll L (BP1) (BP2) L1 (BP2) L2
An equation for the rate of change of the cost with respect to any bias in the network:

\[ \frac{\partial C}{\partial b^l_j} = \delta^l_j. \tag{BP3} \]

That is, the error \delta^l_j is exactly equal to the rate of change \partial C/\partial b^l_j - good news, since (BP1) and (BP2) already tell us how to compute \delta^l_j.
(BP1) (BP2) (BP3)
In shorthand this can be written as \partial C/\partial b = \delta, where it is understood that \delta is evaluated at the same neuron as the bias b. (31)

An equation for the rate of change of the cost with respect to any weight in the network:

\[ \frac{\partial C}{\partial w^l_{jk}} = a^{l-1}_k \delta^l_j. \tag{BP4} \]

This can be rewritten in a less index-heavy notation as

\[ \frac{\partial C}{\partial w} = a_{\rm in} \, \delta_{\rm out}, \tag{32} \]

where a_in is the activation of the neuron input to the weight w, and \delta_out is the error of the neuron output from the weight. A nice consequence of (32) is that when the activation a_in is small, a_in \approx 0, the gradient term \partial C/\partial w will also tend to be small: the weight learns slowly, and we say that weights output from low-activation neurons learn slowly.
(j) (j) (j) (j)(BP1) - (BP4) (BP1) (zL) (zL)01 (zL) 0 ( 0)( 1)
(j) (BP2) (zl) l *
4
0 (BP1) -
(BP4) 4
*(wl+1)T l+1 (zl )
Problem (alternative presentation of (BP1) and (BP2)): (1) Show that (BP1) may be rewritten as

\[ \delta^L = \Sigma'(z^L) \nabla_a C, \tag{33} \]

where \Sigma'(z^L) is the square diagonal matrix whose diagonal entries are the values \sigma'(z^L_j). (2) Show that (BP2) may be rewritten as

\[ \delta^l = \Sigma'(z^l) (w^{l+1})^T \delta^{l+1}. \tag{34} \]

(3) By combining the two observations, show that

\[ \delta^l = \Sigma'(z^l) (w^{l+1})^T \cdots \Sigma'(z^{L-1}) (w^L)^T \Sigma'(z^L) \nabla_a C. \tag{35} \]
(BP2)
4
(BP1)-(BP4)
Let's begin with (BP1), which gives an expression for the output error \delta^L. By definition,

\[ \delta^L_j = \frac{\partial C}{\partial z^L_j}. \tag{36} \]

Applying the chain rule over the output activations,

\[ \delta^L_j = \sum_k \frac{\partial C}{\partial a^L_k} \frac{\partial a^L_k}{\partial z^L_j}, \tag{37} \]

where the sum is over all neurons k in the output layer. Of course, the output activation a^L_k of the k-th neuron depends only on the weighted input z^L_j for the j-th neuron when k = j, so \partial a^L_k/\partial z^L_j vanishes when k \neq j, and the expression simplifies to

\[ \delta^L_j = \frac{\partial C}{\partial a^L_j} \frac{\partial a^L_j}{\partial z^L_j}. \tag{38} \]

Since a^L_j = \sigma(z^L_j), the second factor on the right is \sigma'(z^L_j), and so

\[ \delta^L_j = \frac{\partial C}{\partial a^L_j} \, \sigma'(z^L_j), \tag{39} \]

which is just (BP1) in component form.
Next we prove (BP2), which relates the error \delta^l to the error in the next layer, \delta^{l+1}. To do this, we rewrite \delta^l_j = \partial C/\partial z^l_j in terms of \delta^{l+1}_k = \partial C/\partial z^{l+1}_k using the chain rule:

\[ \delta^l_j = \frac{\partial C}{\partial z^l_j} = \sum_k \frac{\partial C}{\partial z^{l+1}_k} \frac{\partial z^{l+1}_k}{\partial z^l_j} = \sum_k \frac{\partial z^{l+1}_k}{\partial z^l_j} \, \delta^{l+1}_k. \tag{40--42} \]

To evaluate the remaining factor, note that

\[ z^{l+1}_k = \sum_j w^{l+1}_{kj} a^l_j + b^{l+1}_k = \sum_j w^{l+1}_{kj} \sigma(z^l_j) + b^{l+1}_k. \tag{43} \]

Differentiating,

\[ \frac{\partial z^{l+1}_k}{\partial z^l_j} = w^{l+1}_{kj} \, \sigma'(z^l_j). \tag{44} \]

Substituting back into (42) gives

\[ \delta^l_j = \sum_k w^{l+1}_{kj} \, \delta^{l+1}_k \, \sigma'(z^l_j), \tag{45} \]

which is just (BP2) written in component form. The proofs of (BP3) and (BP4) follow the same pattern and are left as an exercise.
4
1. Input x: Set the corresponding activation a^1 for the input layer.
2. Feedforward: For each l = 2, 3, ..., L compute z^l = w^l a^{l-1} + b^l and a^l = \sigma(z^l).
3. Output error \delta^L: Compute \delta^L = \nabla_a C \odot \sigma'(z^L).
4. Backpropagate the error: For each l = L-1, L-2, ..., 2 compute \delta^l = ((w^{l+1})^T \delta^{l+1}) \odot \sigma'(z^l).
5. Output: The gradient of the cost is given by \partial C/\partial w^l_{jk} = a^{l-1}_k \delta^l_j and \partial C/\partial b^l_j = \delta^l_j.
1 1
f(j wjxj + b)f
(z) = z
C = Cx
m
In practice backpropagation is combined with stochastic gradient descent. Given a mini-batch of m training examples:

1. Input a set of training examples.
2. For each training example x: set the corresponding input activation a^{x,1}, and perform the following steps:
   - Feedforward: for each l = 2, 3, ..., L compute z^{x,l} = w^l a^{x,l-1} + b^l and a^{x,l} = \sigma(z^{x,l}).
   - Output error: compute \delta^{x,L} = \nabla_a C_x \odot \sigma'(z^{x,L}).
   - Backpropagate the error: for each l = L-1, L-2, ..., 2 compute \delta^{x,l} = ((w^{l+1})^T \delta^{x,l+1}) \odot \sigma'(z^{x,l}).
3. Gradient descent: for each l = L, L-1, ..., 2 update the weights according to w^l \rightarrow w^l - \frac{\eta}{m} \sum_x \delta^{x,l} (a^{x,l-1})^T and the biases according to b^l \rightarrow b^l - \frac{\eta}{m} \sum_x \delta^{x,l}.
Networkupdate_minibatchbackprop
update_mini_batchmini_batchNetwork
class Network():
...
def update_mini_batch(self, mini_batch, eta):
"""1
"mini_batch""(x, y)""
eta""""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)] self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
backprop
Cx/bl Cx/wl
backprop
jjk
Pythonl[-3]l3 backprop
class Network():
...
def backprop(self, x, y):
""""(nabla_b, nabla_w)""self.biases" and "self.weights"
"nabla_b""nabla_w"numpy"""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights]
#
activation = x
activations = [x] #
zs = [] # zfor b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b zs.append(z)
...
activation = sigmoid_vec(z) activations.append(activation)
#
delta = self.cost_derivative(activations[-1], y) * \ sigmoid_prime_vec(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# l2
# l = 1l = 22
#
# Pythonfor l in xrange(2, self.num_layers):
z = zs[-l]
spv = sigmoid_prime_vec(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * spv nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
def cost_derivative(self, output_activations, y):
"""\partial C_x / \partial a """
return (output_activations-y)
def sigmoid(z):
""""""
return 1.0/(1.0+np.exp(-z))
sigmoid_vec = np.vectorize(sigmoid)
def sigmoid_prime(z): """""" return sigmoid(z)*(1-sigmoid(z))
sigmoid_prime_vec = np.vectorize(sigmoid_prime)
xX = [x1x2 xm] network.py MNIST
2
To appreciate why backpropagation is fast, consider the obvious alternative. Think of the cost as a function C = C(w) of the weights w_1, w_2, ... alone (the biases can be handled the same way), and suppose we want \partial C/\partial w_j. A natural approach is the finite-difference approximation

\[ \frac{\partial C}{\partial w_j} \approx \frac{C(w + \epsilon e_j) - C(w)}{\epsilon}, \tag{46} \]

where \epsilon > 0 is a small positive number and e_j is the unit vector in the j-th direction. The approach looks promising, but it is hopelessly expensive: with, say, a million weights, estimating every \partial C/\partial w_j this way requires a million separate evaluations of C(w + \epsilon e_j), plus one evaluation of C(w), i.e., roughly a million forward passes through the network per training example. Backpropagation, by contrast, computes all the partial derivatives simultaneously, with just one forward pass followed by one backward pass - far cheaper than formula (46).
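Although too slow for training, the finite-difference formula (46) is very handy for checking a backpropagation implementation. The following is a rough added sketch (assuming a Network instance net as in network.py and a single training example (x, y)) of how one weight's gradient can be checked against the per-example quadratic cost C = ||y - a||^2 / 2 used by that code:

import numpy as np

def numerical_dC_dw(net, x, y, layer, j, k, eps=1e-5):
    # Estimate dC/dw for a single weight using Equation (46).
    def cost():
        return 0.5 * np.sum((net.feedforward(x) - y)**2)
    original = net.weights[layer][j, k]
    net.weights[layer][j, k] = original + eps
    c_plus = cost()
    net.weights[layer][j, k] = original        # restore the weight
    return (c_plus - cost()) / eps

# Compare against backpropagation, e.g.:
# nabla_b, nabla_w = net.backprop(x, y)
# print(nabla_w[0][0, 0], numerical_dC_dw(net, x, y, 0, 0, 0))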
1986
1980
2 1 2
(w)
(w) l
jk
l jk
(jk)Cwl
(w) (.) ()C
Cl
(w)ljk
jk
(47)
(w) (l) C
jk
(w) l
jk
C
(jk)C/wl
(w) l
jk
(j)ljal
(j)al
al
(j) (w)l jk
l
(w)jk
(48)
(j) (q) (j)al l + 1 1al+1
al
(q)al+1
al+1
(q) (j)al
al .
(49)
(j) (48)
(q)al+1
al+1
(q) (j)al
al
(j) (w)l
(w)ljk
jk
(50)
(q) al+1
(w) l
jk
C 1
al , al+1, , aL1 , aL
jqn
CaL
m
aL1
al+1
al
(j)
C
mnq
wl
(51)
aL
aL1 aL2
al
wljk
mnp
jjk
(m)a/a C/aL C
(w)l
jk
1 C
CaL
aL1
al+1
al
C
mnq
j wl ,
(52)
aL
aL1 aL2
al
wljk
mnpqmnp
jjk
(47)
CC
aL
aL1
al+1
al
= mnq
j
(53)
wl
aL
aL1 aL2
al
wl
jkmnpqmnp
jjk
(53) C 2 1
al /wl
jjk
(jk) C/wl
1
(53)
1 *
(q) (q) (q)*1 (53) al+1zl+1 al+1
CHAPTER 3
(http://nnadl-ja.github.io/nnadl_site_ja/chap3.html)
4L1L2
1
1
[Interactive figure: a single sigmoid neuron trained by gradient descent with learning rate η = 0.15, input 1.0 and target output 0.0. Starting from weight 0.6 and bias 0.9 the output quickly drops toward 0.0; starting from weight 2.0 and bias 2.0, where the initial output is close to 1.0, learning starts out very slowly before eventually speeding up.]
C/wC/b (6)
For a single sigmoid neuron with one input, the quadratic cost (c.f. Equation (6)) is

\[ C = \frac{(y - a)^2}{2}, \tag{54} \]

where a = \sigma(z), z = wx + b, and we use the single training example x = 1, y = 0. The partial derivatives are

\[ \frac{\partial C}{\partial w} = (a - y)\sigma'(z) x = a\,\sigma'(z), \tag{55} \]
\[ \frac{\partial C}{\partial b} = (a - y)\sigma'(z) = a\,\sigma'(z), \tag{56} \]

where we have substituted x = 1 and y = 0. The factor \sigma'(z) is what causes the learning slowdown: recall the shape of the sigmoid function,
[Figure: the sigmoid function \sigma(z), which flattens out as its value approaches 0 or 1.]
When the neuron's output is close to 1 (or 0), the curve is very flat, so \sigma'(z) is very small, and by (55) and (56) the gradients \partial C/\partial w and \partial C/\partial b are correspondingly small. That is the origin of the learning slowdown.
Consider a neuron with several inputs x_1, x_2, ..., weights w_1, w_2, ..., and bias b, whose output is a = \sigma(z) with z = \sum_j w_j x_j + b. We define the cross-entropy cost function for this neuron by

\[ C = -\frac{1}{n} \sum_x \left[ y \ln a + (1 - y) \ln(1 - a) \right], \tag{57} \]

where n is the total number of training examples, the sum is over all training inputs x, and y is the corresponding desired output.
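For a single training example, the bracketed term in (57) is easy to compute with Numpy. The sketch below is added for illustration; it is essentially what the CrossEntropyCost.fn method in network2.py (shown later in this chapter) does:

import numpy as np

def cross_entropy(a, y):
    # Per-example cross-entropy, summed over output neurons.
    return np.sum(np.nan_to_num(-y*np.log(a) - (1-y)*np.log(1-a)))

print(cross_entropy(np.array([0.9]), np.array([1.0])))  # ~0.105: output close to target
print(cross_entropy(np.array([0.1]), np.array([1.0])))  # ~2.3: output far from target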
(57)
C > 0(57)(a) log01(b)
x y = y(x)
x
y(x) = 0a 0y(x) = 0(57)2 ln(1 a) 0y(x) = 1a 1
xy0
*y(x)0
1
To see why the cross-entropy avoids the learning slowdown, compute \partial C/\partial w_j. Substituting a = \sigma(z) into (57) and applying the chain rule twice,

\[ \frac{\partial C}{\partial w_j} = -\frac{1}{n} \sum_x \left( \frac{y}{\sigma(z)} - \frac{1 - y}{1 - \sigma(z)} \right) \frac{\partial \sigma}{\partial w_j} \tag{58} \]
\[ \qquad = -\frac{1}{n} \sum_x \left( \frac{y}{\sigma(z)} - \frac{1 - y}{1 - \sigma(z)} \right) \sigma'(z) x_j. \tag{59} \]

Putting everything over a common denominator and simplifying,

\[ \frac{\partial C}{\partial w_j} = \frac{1}{n} \sum_x \frac{\sigma'(z) x_j}{\sigma(z)(1 - \sigma(z))} \left( \sigma(z) - y \right). \tag{60} \]

Using \sigma'(z) = \sigma(z)(1 - \sigma(z)) for the sigmoid \sigma(z) = 1/(1 + e^{-z}), the \sigma'(z) and \sigma(z)(1 - \sigma(z)) factors cancel, leaving

\[ \frac{\partial C}{\partial w_j} = \frac{1}{n} \sum_x x_j \left( \sigma(z) - y \right). \tag{61} \]

This is a beautiful expression: the rate at which the weight learns is controlled by \sigma(z) - y, i.e., by the error in the output. The larger the error, the faster the neuron learns, and there is no \sigma'(z) factor to cause a slowdown, in contrast to (55). A similar calculation for the bias gives

\[ \frac{\partial C}{\partial b} = \frac{1}{n} \sum_x \left( \sigma(z) - y \right). \tag{62} \]

Again, no \sigma'(z) term appears, so the cross-entropy avoids the slowdown caused by (56).
Exercise: Verify that \sigma'(z) = \sigma(z)(1 - \sigma(z)) for the sigmoid function.
[Interactive figures: the same single-neuron examples as before (starting weight 0.6 and bias 0.9, and starting weight and bias 2.0; link, for comparison), retrained with the cross-entropy cost. Learning is no longer slow when the neuron starts out badly wrong.]
The cross-entropy is easily generalized to many-neuron, multi-layer networks. If y = y_1, y_2, ... are the desired values at the output neurons and a^L_1, a^L_2, ... are the actual output activations, then the cross-entropy is

\[ C = -\frac{1}{n} \sum_x \sum_j \left[ y_j \ln a^L_j + (1 - y_j) \ln(1 - a^L_j) \right], \tag{63} \]

which is the same as (57) except summed over all the output neurons j.
(57)
j (63)
01
ya[y ln a + (1 y) ln(1 a)]
[a ln y + (1 a) ln(1 y)]y = 0y = 1
(z) yy01y01(z) = y
\[ C = -\frac{1}{n} \sum_x \left[ y \ln y + (1 - y) \ln(1 - y) \right]. \tag{64} \]

The quantity -[y \ln y + (1 - y) \ln(1 - y)] is sometimes known as the binary entropy.
C= 1 aL1 (aL y )(zL).
(65)
(w) (n)Lkjjj jkx
(j)(zL) xL
L = aL y.(66)
C= 1 aL1 (aL y ).
(67)
(w) (n)Lkjj jkx
(j)(zL)
aL = zL
jj
xL
L = aL y.(68)
C= 1 aL1 (aL y )
(69)
(w) (n)Lkjj jkx
(n) (bLjj) C = 1 (aL y ).
(j) (x)
(70)
(j) (x)
MNIST
MNISTnetwork.py network2.py* MNIST
*GitHub
13010 = 0.5*30 network2.pynetwork.pynetwork2.py Python
help(network2.Network.SGD)
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
>>> import network2
>>> net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost)
>>> net.large_weight_initializer()
>>> net.SGD(training_data, 30, 10, 0.5, evaluation_data=test_data,
... monitor_evaluation_accuracy=True)
net.large_weight_initializer()1 95.49195.49
*1 = 3.0
= (1 )
(0) 1 d(1 ) = 1/6
66
100
96.8296.591 .413.1814 1
MNIST ? ?
? ?
? ? ?
?(55) and (56)(z) (z)xC = Cx
C
wj
C
b
= xj(a y)
= (a y).
(71)
(72)
C = C (z).
ba
(73)
(z) = (z)(1 (z)) = a(1 a)2
C
b
(72)
= C a(1 a).
a
(74)
C =
a
a y . a(1 a)
(75)
a
C = [y ln a + (1 y) ln(1 a)] + constant,(76)
x
(n )1
C = [y ln a + (1 y) ln(1 a)] + constant,
x
(77)
(71)(72)
? ?
x y = y(x)
x a = a(x)a
y11 ay0y
Cover and Thomas5Kraft
(61)xj xjwj xj
: 45
*
4
4 30MNIST24,000 1008
3023,860
MNIST50,0001,000 = 0.510400 network2
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
>>> import network2
>>> net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost)
>>> net.large_weight_initializer()
>>> net.SGD(training_data[:1000], 400, 10, 0.5, evaluation_data=test_data,
... monitor_evaluation_accuracy=True, monitor_training_cost=True)
* :
*4 overfitting.py
200 - 399
200 82280 280 280 280
15 1 15280 280
100 1,000 82.27
MNIST3
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
training_data test_data
validation_data validation_data
10, 000 MNIST
50, 000 10, 000 test_data
validation_data validation_data validation_data *
test_data validation_data
*
280
validation_data ()
test_data validation_data test_data validation_data test_data test_data test_data validation_data test_data test_data
(validation_data) (training_data) validation_data training_data ("hold out")
test_data
test_data
400
training_datavalidation_data test_data
1,000 50,000 30 0.5 10 50,00030
1,000 97.86 95.331.53
1,000 17.73
L2L2 L2
The regularized cross-entropy cost is

\[ C = -\frac{1}{n} \sum_{xj} \left[ y_j \ln a^L_j + (1 - y_j) \ln(1 - a^L_j) \right] + \frac{\lambda}{2n} \sum_w w^2. \tag{78} \]

The first term is the usual cross-entropy; the second is the sum of the squares of all the weights in the network, scaled by \lambda/2n, where \lambda > 0 is the regularization parameter and n is the size of the training set. (The biases are not included in the regularization term.) The quadratic cost can be regularized in the same way:

\[ C = \frac{1}{2n} \sum_x \| y - a^L \|^2 + \frac{\lambda}{2n} \sum_w w^2. \tag{79} \]

In both cases the regularized cost can be written

\[ C = C_0 + \frac{\lambda}{2n} \sum_w w^2, \tag{80} \]

where C_0 is the original, unregularized cost function.
Taking partial derivatives of (80) gives

\[ \frac{\partial C}{\partial w} = \frac{\partial C_0}{\partial w} + \frac{\lambda}{n} w, \tag{81} \]
\[ \frac{\partial C}{\partial b} = \frac{\partial C_0}{\partial b}, \tag{82} \]

where \partial C_0/\partial w and \partial C_0/\partial b can be computed by backpropagation exactly as before. The gradient descent rule for the biases is therefore unchanged,

\[ b \rightarrow b - \eta \frac{\partial C_0}{\partial b}, \tag{83} \]

while the rule for the weights becomes

\[ w \rightarrow w - \eta \frac{\partial C_0}{\partial w} - \frac{\eta \lambda}{n} w \tag{84} \]
\[ \qquad = \left( 1 - \frac{\eta \lambda}{n} \right) w - \eta \frac{\partial C_0}{\partial w}. \tag{85} \]

The factor (1 - \eta\lambda/n) rescales the weight toward zero before the usual gradient step; this rescaling is sometimes referred to as weight decay. For stochastic gradient descent over a mini-batch of m examples, the regularized rule becomes (c.f. (20))

\[ w \rightarrow \left( 1 - \frac{\eta \lambda}{n} \right) w - \frac{\eta}{m} \sum_x \frac{\partial C_x}{\partial w}, \tag{86} \]

and (c.f. (21))

\[ b \rightarrow b - \frac{\eta}{m} \sum_x \frac{\partial C_x}{\partial b}, \tag{87} \]

where the sums are over the training examples x in the mini-batch. Note that n in the weight-decay factor is still the size of the full training set, not the mini-batch size.
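A minimal added sketch of the regularized update (86) for a single weight matrix, assuming grad_w holds the sum of the per-example gradients over the mini-batch (the quantity computed by backpropagation):

def l2_update(w, grad_w, eta, lmbda, n, m):
    # One step of rule (86): rescale w toward zero, then take the usual gradient step.
    # n is the size of the full training set, m the mini-batch size.
    return (1 - eta*lmbda/n) * w - (eta/m) * grad_w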
30 10 0.5 = 0.1 lmbda Python lambda validation_data
test_data
validation_data test_data validation_data
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
>>> import network2
>>> net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost)
>>> net.large_weight_initializer()
>>> net.SGD(training_data[:1000], 400, 10, 0.5,
... evaluation_data=test_data, lmbda = 0.1,
... monitor_evaluation_cost=True, monitor_evaluation_accuracy=True,
... monitor_training_cost=True, monitor_training_accuracy=True)
*
*2
overfitting.py
test_data 400
82.27 87.1400
1,00050,000 50,000 30 0.5
(n)10 n = 1, 000 n = 50, 000 1 = 0.1 = 5.0
>>> net.large_weight_initializer()
>>> net.SGD(training_data, 30, 10, 0.5,
... evaluation_data=test_data, lmbda = 5.0,
... monitor_evaluation_accuracy=True, monitor_training_accuracy=True)
95.49 96.49 1
100 = 5.0 L22
>>> net = network2.Network([784, 100, 10], cost=network2.CrossEntropyCost)
>>> net.large_weight_initializer()
>>> net.SGD(training_data, 30, 10, 0.5, lmbda=5.0,
... evaluation_data=validation_data,
... monitor_evaluation_accuracy=True)
97.92 30 = 0.1 = 5.0 60 98 98.04 152
MNIST
[Figure: ten data points (x, y), with x running from 0 to 5 and y from 0 to 10, lying roughly along a line.]
Suppose we're given these (x, y) data and want to build a model of y as a function of x. One option is a 9th-order polynomial, y = a_0 x^9 + a_1 x^8 + ... + a_9, which has 10 free coefficients and so can pass exactly through all 10 data points:*
[Figure: the 9th-order polynomial fit, passing exactly through every data point.]

*The polynomial fit can be generated with Numpy's polyfit routine.
Another option is the straight line y = 2x:

[Figure: the straight line y = 2x, which fits the data approximately but not exactly.]
2
2(1) 9 (2) y = 2x
3 2 x y 9 x9
y = 2x + () y = a0 x9 + a1 x8 + y = 2x + () 9 9
2
1940
51 5 *
2 1859
1916
3 12 2 3
*
*
10080,000 50,00080,000 50,000
* "An Enquiry Concerning Human Understanding" (1748) ()
*
L2
L2 3L1 L2
L1
* Gradient-Based Learning Applied to Document Recognition, by Yann LeCun, Lon Bottou, Yoshua Bengio, and Patrick Haffner
(1998)
In L1 regularization we add the sum of the absolute values of the weights:

\[ C = C_0 + \frac{\lambda}{n} \sum_w |w|. \tag{88} \]

This is similar in spirit to L2 regularization, penalizing large weights, but the effect on learning is somewhat different. Differentiating (88),

\[ \frac{\partial C}{\partial w} = \frac{\partial C_0}{\partial w} + \frac{\lambda}{n} \, {\rm sgn}(w), \tag{89} \]

where sgn(w) is +1 if w is positive and -1 if w is negative. The update rule for an L1-regularized network is therefore

\[ w \rightarrow w' = w - \frac{\eta \lambda}{n} \, {\rm sgn}(w) - \eta \frac{\partial C_0}{\partial w}, \tag{90} \]

compared with the L2 rule (c.f. (86)),

\[ w \rightarrow w' = w \left( 1 - \frac{\eta \lambda}{n} \right) - \eta \frac{\partial C_0}{\partial w}. \tag{91} \]

In both cases regularization shrinks the weights, but in different ways: L1 shrinks each weight by a constant amount toward 0, while L2 shrinks in proportion to w. So when |w| is large, L1 shrinks the weight much less than L2 does, and when |w| is small, L1 shrinks it much more. The net result is that L1 tends to concentrate the network's weight in a relatively small number of high-importance connections, driving the remaining weights toward zero. (At w = 0, where |w| is not differentiable, we simply apply the unregularized rule, i.e., take sgn(0) = 0 in (90).)
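The different shrinkage behaviour is easy to see numerically; an added, purely illustrative comparison of the two per-step penalty terms for a large and a small weight:

import numpy as np

eta, lmbda, n = 0.5, 5.0, 50000.0
for w in (10.0, 0.01):
    l1_step = eta*lmbda/n * np.sign(w)   # constant-size shrinkage toward 0
    l2_step = eta*lmbda/n * w            # shrinkage proportional to w
    print(w, l1_step, l2_step)
# For w = 10.0 the L2 shrinkage is larger; for w = 0.01 the L1 shrinkage is larger.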
L1L2
x y x
x
2
5 3 "3" "3" 2
*ImageNet Classification with Deep Convolutional Neural Networks, by Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton (2012).
L1L2
* MNIST 98.4 L2
98.7
: 1,000 MNIST80 30 10
= 0.5 = 5.0
30
= 5.0 *
*Improving neural networks by preventing co- adaptation of feature detectors by Geoffrey Hinton, Nitish Srivastava, Alex Krizhevsky, Ilya Sutskever, and Ruslan Salakhutdinov (2012).
* more_data.py
10010
MNIST5
15
MNIST 1 MNIST
* MNIST 800 MNIST 98.4% 98.9%
99.3%
*Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis, by Patrice Simard, Dave Steinkraus, and John Platt (2003).
MNIST
Chapter 1 SVMSVM SVM scikit-learn SVMSVM
*
* more_data.py
SVM SVMscikit-learn 50,000SVM94.48 5,00093.24
A B 2 AB 2* AB
* Scaling to very very large corpora for natural language disambiguation, by Michele Banko and Eric Brill (2001).
X Y
()
Chapter 1 0 1
1, 000 1 1
x x 1 0 z = j wjxj + b xj 500 z 500 1 501 z 501 22.4
[Figure: the distribution of the weighted input z, a very broad Gaussian centered at 0 (standard deviation roughly 22.4), plotted for z from -30 to 30.]
|z| z 1 z 1 (z) 1 0 *
1 0 1
nin 0 1/nin 0 1 z = j wjxj + b 0 500 500 1 z 0 3/2 = 1.22
*Chapter 2
[Figure: with the improved initialization, the distribution of z is a much narrower Gaussian centered at 0 (standard deviation roughly 1.22), over the same range.]
Exercise
z = j wjxj + b 3/2 (a)
(b) 2
0 1 0
MNIST 30 10 = 5.0 = 0.5 0.1
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
>>> import network2
>>> net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost)
>>> net.large_weight_initializer()
>>> net.SGD(training_data, 30, 10, 0.1, lmbda = 5.0,
... evaluation_data=validation_data,
... monitor_evaluation_accuracy=True)
network2 net.large_weight_initializer()
>>> net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost)
>>> net.SGD(training_data, 30, 10, 0.1, lmbda = 5.0,
... evaluation_data=validation_data,
... monitor_evaluation_accuracy=True)
*
* weight_initialization.py
96 30 8793
100
2 2, 3 Chapter 4 1/nin
1/nin Yoshua Bengio2012*14, 15
L2 (1) (2) n exp(/m) (3) 1/n
*Practical Recommendations for Gradient-Based Training of Deep Architectures, by Yoshua Bengio (2012).
n
network2.py Chapter 1 network.py network.py 74
network.py network2.py Network Network sizes
cost
class Network():

    def __init__(self, sizes, cost=CrossEntropyCost):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost = cost
init 2 network.py 2
default_weight_initializer nin 0 1/nin 0 1
def default_weight_initializer(self):
self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]] self.weights = [np.random.randn(y, x)/np.sqrt(x)
for x, y in zip(self.sizes[:-1], self.sizes[1:])]
np Numpy Numpy
import 1 network.py
large_weight_initializer Chapter 1 0 1
default_weight_initializer
def large_weight_initializer(self):
self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]] self.weights = [np.random.randn(y, x)
for x, y in zip(self.sizes[:-1], self.sizes[1:])]
large_weight_initializer Chapter 1
Network init 2 cost *
class CrossEntropyCost:
@staticmethod
def fn(a, y):
return np.nan_to_num(np.sum(-y*np.log(a)-(1-y)*np.log(1-a)))
@staticmethod
def delta(z, a, y):
return (a-y)
Python 2 1 a y CrossEntropyCost.fn
CrossEntropyCost.fn np.nan_to_num Numpy Chapter 2 L (66)
*Python
@staticmethod fn
delta
@staticmethod fn delta1 self
L = aL y.(92)
CrossEntropyCost.delta 2 2
network2.py 2 Chapter 1
QuadraticCost.fn a y 2 QuadraticCost.delta Chapter 22 (30)
class QuadraticCost:
@staticmethod
def fn(a, y):
return 0.5*np.linalg.norm(a-y)**2
@staticmethod
def delta(z, a, y):
return (a-y) * sigmoid_prime_vec(z)
network2.py network.py L2 network2.py
"""network2.py
~~~~~~~~~~~~~~
An improved version of network.py, implementing the stochastic gradient descent learning algorithm for a feedforward neural network. Improvements include the addition of the cross-entropy cost function, regularization, and better initialization of network weights. Note that I have focused on making the code simple, easily readable, and easily modifiable. It is not optimized, and omits many desirable features.
"""
#### Libraries
# Standard library
import json
import random
import sys

# Third-party libraries
import numpy as np
#### Define the quadratic and cross-entropy cost functions class QuadraticCost:
@staticmethod
def fn(a, y):
"""Return the cost associated with an output ``a`` and desired output
``y``.
"""
return 0.5*np.linalg.norm(a-y)**2
@staticmethod
def delta(z, a, y):
"""Return the error delta from the output layer.""" return (a-y) * sigmoid_prime_vec(z)
class CrossEntropyCost: @staticmethod
def fn(a, y):
"""Return the cost associated with an output ``a`` and desired output
``y``. Note that np.nan_to_num is used to ensure numerical stability. In particular, if both ``a`` and ``y`` have a 1.0 in the same slot, then the expression (1-y)*np.log(1-a) returns nan. The np.nan_to_num ensures that that is converted to the correct value (0.0).
"""
return np.nan_to_num(np.sum(-y*np.log(a)-(1-y)*np.log(1-a)))
@staticmethod
def delta(z, a, y):
"""Return the error delta from the output layer. Note that the parameter ``z`` is not used by the method. It is included in the method's parameters in order to make the interface consistent with the delta method for other cost classes.
"""
return (a-y)
#### Main Network class
class Network():

    def __init__(self, sizes, cost=CrossEntropyCost):
"""The list ``sizes`` contains the number of neurons in the respective layers of the network. For example, if the list was [2, 3, 1]
then it would be a three-layer network, with the first layer containing 2 neurons, the second layer 3 neurons, and the third layer 1 neuron. The biases and weights for the network are initialized randomly, using
``self.default_weight_initializer`` (see docstring for that method).
"""
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost = cost
def default_weight_initializer(self):
"""Initialize each weight using a Gaussian distribution with mean 0 and standard deviation 1 over the square root of the number of weights connecting to the same neuron. Initialize the biases
using a Gaussian distribution with mean 0 and standard deviation 1.
Note that the first layer is assumed to be an input layer, and by convention we won't set any biases for those neurons, since biases are only ever used in computing the outputs from later layers.
"""
self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]] self.weights = [np.random.randn(y, x)/np.sqrt(x)
for x, y in zip(self.sizes[:-1], self.sizes[1:])]
def large_weight_initializer(self):
"""Initialize the weights using a Gaussian distribution with mean 0 and standard deviation 1. Initialize the biases using a
Gaussian distribution with mean 0 and standard deviation 1.
Note that the first layer is assumed to be an input layer, and by convention we won't set any biases for those neurons, since biases are only ever used in computing the outputs from later layers.
This weight and bias initializer uses the same approach as in Chapter 1, and is included for purposes of comparison. It will usually be better to use the default weight initializer instead.
"""
self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]] self.weights = [np.random.randn(y, x)
for x, y in zip(self.sizes[:-1], self.sizes[1:])]
def feedforward(self, a):
"""Return the output of the network if ``a`` is input.""" for b, w in zip(self.biases, self.weights):
a = sigmoid_vec(np.dot(w, a)+b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta, lmbda = 0.0,
evaluation_data=None, monitor_evaluation_cost=False, monitor_evaluation_accuracy=False, monitor_training_cost=False, monitor_training_accuracy=False):
"""Train the neural network using mini-batch stochastic gradient descent. The ``training_data`` is a list of tuples ``(x, y)`` representing the training inputs and the desired outputs. The other non-optional parameters are self-explanatory, as is the regularization parameter ``lmbda``. The method also accepts
``evaluation_data``, usually either the validation or test data. We can monitor the cost and accuracy on either the evaluation data or the training data, by setting the appropriate flags. The method returns a tuple containing four lists: the (per-epoch) costs on the evaluation data, the accuracies on the evaluation data, the costs on the training data, and the accuracies on the training data. All values are evaluated at the end of each training epoch. So, for example, if we train for 30 epochs, then the first element of the tuple will be a 30-element list containing the cost on the evaluation data at the end of each epoch. Note that the lists
are empty if the corresponding flag is not set.
"""
if evaluation_data: n_data = len(evaluation_data) n = len(training_data)
evaluation_cost, evaluation_accuracy = [], [] training_cost, training_accuracy = [], [] for j in xrange(epochs):
random.shuffle(training_data) mini_batches = [
training_data[k:k+mini_batch_size]
for k in xrange(0, n, mini_batch_size)]
for mini_batch in mini_batches: self.update_mini_batch(
mini_batch, eta, lmbda, len(training_data))
            print "Epoch %s training complete" % j
if monitor_training_cost:
cost = self.total_cost(training_data, lmbda) training_cost.append(cost)
print "Cost on training data: {}".format(cost)
if monitor_training_accuracy:
accuracy = self.accuracy(training_data, convert=True) training_accuracy.append(accuracy)
print "Accuracy on training data: {} / {}".format( accuracy, n)
if monitor_evaluation_cost:
cost = self.total_cost(evaluation_data, lmbda, convert=True) evaluation_cost.append(cost)
print "Cost on evaluation data: {}".format(cost)
if monitor_evaluation_accuracy:
accuracy = self.accuracy(evaluation_data) evaluation_accuracy.append(accuracy)
print "Accuracy on evaluation data: {} / {}".format( self.accuracy(evaluation_data), n_data)
return evaluation_cost, evaluation_accuracy, \ training_cost, training_accuracy
def update_mini_batch(self, mini_batch, eta, lmbda, n):
"""Update the network's weights and biases by applying gradient descent using backpropagation to a single mini batch. The
``mini_batch`` is a list of tuples ``(x, y)``, ``eta`` is the learning rate, ``lmbda`` is the regularization parameter, and
``n`` is the total size of the training data set.
"""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)] nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)] self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w)`` representing the gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases] nabla_w = [np.zeros(w.shape) for w in self.weights] # feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b zs.append(z)
activation = sigmoid_vec(z) activations.append(activation)
# backward pass
delta = (self.cost).delta(zs[-1], activations[-1], y) nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# Note that the variable l in the loop below is used a little # differently to the notation in Chapter 2 of the book. Here, # l = 1 means the last layer of neurons, l = 2 is the
# second-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact # that Python can use negative indices in lists.
for l in xrange(2, self.num_layers): z = zs[-l]
spv = sigmoid_prime_vec(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * spv nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
def accuracy(self, data, convert=False):
"""Return the number of inputs in ``data`` for which the neural network outputs the correct result. The neural network's output is assumed to be the index of whichever neuron in the final layer has the highest activation.
The flag ``convert`` should be set to False if the data set is validation or test data (the usual case), and to True if the data set is the training data. The need for this flag arises due to differences in the way the results ``y`` are represented in the different data sets. In particular, it flags whether we need to convert between the different representations. It may seem strange to use different representations for the different data sets. Why not use the same representation for all three data sets? It's done for efficiency reasons -- the program usually evaluates the cost on the training data and the accuracy on other data sets.
These are different types of computations, and using different representations speeds things up. More details on the representations can be found in mnist_loader.load_data_wrapper.
"""
if convert:
results = [(np.argmax(self.feedforward(x)), np.argmax(y))
for (x, y) in data]
else:
results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in data]
return sum(int(x == y) for (x, y) in results)
def total_cost(self, data, lmbda, convert=False):
"""Return the total cost for the data set ``data``. The flag
``convert`` should be set to False if the data set is the training data (the usual case), and to True if the data set is the validation or test data. See comments on the similar (but reversed) convention for the ``accuracy`` method, above.
"""
cost = 0.0
for x, y in data:
a = self.feedforward(x)
if convert: y = vectorized_result(y) cost += self.cost.fn(a, y)/len(data)
cost += 0.5*(lmbda/len(data))*sum( np.linalg.norm(w)**2 for w in self.weights)
return cost
def save(self, filename):
"""Save the neural network to the file ``filename``."""
        data = {"sizes": self.sizes,
                "weights": [w.tolist() for w in self.weights],
                "biases": [b.tolist() for b in self.biases],
                "cost": str(self.cost.__name__)}
        f = open(filename, "w")
        json.dump(data, f)
        f.close()
#### Loading a Network def load(filename):
"""Load a neural network from the file ``filename``. Returns an instance of Network.
"""
f = open(filename, "r") data = json.load(f) f.close()
    cost = getattr(sys.modules[__name__], data["cost"])
    net = Network(data["sizes"], cost=cost)
net.weights = [np.array(w) for w in data["weights"]] net.biases = [np.array(b) for b in data["biases"]] return net
#### Miscellaneous functions def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the j'th position and zeroes elsewhere. This is used to convert a digit (0...9)
into a corresponding desired output from the neural network.
"""
e = np.zeros((10, 1)) e[j] = 1.0
return e
def sigmoid(z):
"""The sigmoid function.""" return 1.0/(1.0+np.exp(-z))
sigmoid_vec = np.vectorize(sigmoid)
def sigmoid_prime(z):
"""Derivative of the sigmoid function.""" return sigmoid(z)*(1-sigmoid(z))
sigmoid_prime_vec = np.vectorize(sigmoid_prime)
L2 Network.SGD lmbda
1Network.update_mini_batch
4
Network.SGD training_data evaluation_data Network.SGD
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
>>> import network2
>>> net = network2.Network([784, 30, 10], cost=network2.CrossEntropyCost())
>>> net.SGD(training_data, 30, 10, 0.5,
... lmbda = 5.0,
... evaluation_data=validation_data,
... monitor_evaluation_accuracy=True,
... monitor_evaluation_cost=True,
... monitor_training_accuracy=True,
... monitor_training_cost=True)
evaluation_data validation_data test_data 4 evaluation_data training_data False True Network network2.py
Network.SGD 4
>>> evaluation_cost, evaluation_accuracy,
... training_cost, training_accuracy = net.SGD(training_data, 30, 10, 0.5,
... lmbda = 5.0,
... evaluation_data=validation_data,
... monitor_evaluation_accuracy=True,
... monitor_evaluation_cost=True,
... monitor_training_accuracy=True,
... monitor_training_cost=True)
evaluation_cost 30 evaluation_data
Network Network.save Network load Python pickle cPickle JSONJSON pickle cPickle JSON Network Network Network. init pickle load JSON Network
network2.py
network.py 74152
L1 30 L1
MNIST
network.py Network.cost_derivative 2
Network.cost_derivative
network2.py Network.cost_derivative
CrossEntropyCost.delta
?
MNIST 301030 = 10.0 = 1000.0
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
>>> import network2
>>> net = network2.Network([784, 30, 10])
>>> net.SGD(training_data, 30, 10, 10.0, lmbda = 1000.0,
... evaluation_data=validation_data, monitor_evaluation_accuracy=True) Epoch 0 training complete
Accuracy on evaluation data: 1030 / 10000
Epoch 1 training complete
Accuracy on evaluation data: 990 / 10000
Epoch 2 training complete
Accuracy on evaluation data: 1009 / 10000
...
Epoch 27 training complete
Accuracy on evaluation data: 1009 / 10000
Epoch 28 training complete
Accuracy on evaluation data: 983 / 10000
Epoch 29 training complete
Accuracy on evaluation data: 967 / 10000
30 100 300 2
1
1
MNIST
01 01 0110 80%5
[784, 10] MNIST [784, 30, 10]
network2.py 50,000 PC[784, 30, 10] 1 1 1,000 10,000100 network2.py MNIST1,000 (01
)
>>> net = network2.Network([784, 10])
>>> net.SGD(training_data[:1000], 30, 10, 10.0, lmbda = 1000.0, \
... evaluation_data=validation_data[:100], \
... monitor_evaluation_accuracy=True) Epoch 0 training complete
Accuracy on evaluation data: 10 / 100
Epoch 1 training complete
Accuracy on evaluation data: 10 / 100
Epoch 2 training complete
Accuracy on evaluation data: 10 / 100
...
1 1
= 1000.0 = 20.0
>>> net = network2.Network([784, 10])
>>> net.SGD(training_data[:1000], 30, 10, 10.0, lmbda = 20.0, \
... evaluation_data=validation_data[:100], \
... monitor_evaluation_accuracy=True) Epoch 0 training complete
Accuracy on evaluation data: 12 / 100
Epoch 1 training complete
Accuracy on evaluation data: 14 / 100
Epoch 2 training complete
Accuracy on evaluation data: 25 / 100
Epoch 3 training complete
Accuracy on evaluation data: 18 / 100
...
100.0
>>> net = network2.Network([784, 10])
>>> net.SGD(training_data[:1000], 30, 10, 100.0, lmbda = 20.0, \
... evaluation_data=validation_data[:100], \
... monitor_evaluation_accuracy=True) Epoch 0 training complete
Accuracy on evaluation data: 10 / 100
Epoch 1 training complete
Accuracy on evaluation data: 10 / 100
Epoch 2 training complete
Accuracy on evaluation data: 10 / 100
Epoch 3 training complete
Accuracy on evaluation data: 10 / 100
...
1.0
>>> net = network2.Network([784, 10])
>>> net.SGD(training_data[:1000], 30, 10, 1.0, lmbda = 20.0, \
... evaluation_data=validation_data[:100], \
... monitor_evaluation_accuracy=True) Epoch 0 training complete
Accuracy on evaluation data: 62 / 100
Epoch 1 training complete
Accuracy on evaluation data: 42 / 100
Epoch 2 training complete
Accuracy on evaluation data: 43 / 100
Epoch 3 training complete
Accuracy on evaluation data: 61 / 100
...
10 20
L2
3 = 0.025 = 0.25 = 2.5 MNIST 3010 = 5.0 50, 000 *
* multiple_eta.py
= 0.025 = 0.25 20 = 2.5
= 2.5 * = 0.25 = 0.025 30 = 0.25 20 = 0.025
= 0.01 = 0.1, 1.0,
= 0.01
= 0.001, 0.0001, = 0.5 = 0.2
*
MNIST 0.1 = 0.5 = 0.25 = 0.5 30
("held-out")
MNIST10
10MNIST 10 10 2050 MNIST10
MNIST network2.py
network2.py nn
n network2.py 310
101 1,02411,0001
1 *
Exercise
network2.py 10 1/128
*MNIST Deep, Big, Simple Neural Nets Excel on Handwritten Digit Recognition, by Dan Claudiu Cirean, Ueli Meier, Luca Maria Gambardella, and Jrgen Schmidhuber (2010)
= 0.0 = 1.0* 10101
2 2
1
* = 1.0
1020
100 100 1 10050
100
() (x)w w = w 1C ,
100 x
(93)
w w = w Cx(94)
50 50 100
w w = w Cx.
x
(95)
100 150
100 100
10
James BergstraYoshua Bengio2012* 2012 *
Yoshua Bengio2012* Benigo Yann LeCun, Lon Bottou, Genevieve Orr, Klaus-Robert Mller 1998* 2012
*Random search for hyper-parameter optimization, by James Bergstra and Yoshua Bengio (2012).
*Practical Bayesian optimization of machine learning algorithms, by Jasper Snoek, Hugo Larochelle, and Ryan Adams.
*Practical recommendations for gradient-based training of deep architectures, by Yoshua Bengio (2012).
*Efficient BackProp, by Yann LeCun, Lon Bottou, Genevieve Orr and Klaus-Robert Mller (1998)
*
SVM
*Neural Networks: Tricks of the Trade, edited by Grgoire Montavon, Genevive Orr, and Klaus- Robert Mller.
MNIST 2
Write the cost as a function C = C(w) of all the weights w = w_1, w_2, .... By Taylor's theorem, the cost near a point w can be expanded as

\[ C(w + \Delta w) = C(w) + \sum_j \frac{\partial C}{\partial w_j} \Delta w_j + \frac{1}{2} \sum_{jk} \Delta w_j \frac{\partial^2 C}{\partial w_j \partial w_k} \Delta w_k + \ldots \tag{96} \]

This can be rewritten more compactly as

\[ C(w + \Delta w) = C(w) + \nabla C \cdot \Delta w + \frac{1}{2} \Delta w^T H \Delta w + \ldots, \tag{97} \]

where \nabla C is the gradient and H is the Hessian matrix, whose jk-th entry is \partial^2 C/\partial w_j \partial w_k. Dropping the higher-order terms,

\[ C(w + \Delta w) \approx C(w) + \nabla C \cdot \Delta w + \frac{1}{2} \Delta w^T H \Delta w. \tag{98} \]

Using calculus, the right-hand side can be shown to be minimized* by choosing

\[ \Delta w = -H^{-1} \nabla C. \tag{99} \]

*Strictly speaking, this requires the Hessian to be positive definite, so the quadratic approximation is bowl-shaped.
* C
(98) w w + w = w H 1 C
w
w w = w H 1 C
C H w
w w = w H 1 C H C w
(98) w w = H 1 C
2
107 107 107 = 1014
2
2 2
The momentum technique introduces "velocity" variables v = v_1, v_2, ..., one for each weight w_j; the gradient modifies the velocity, and the velocity in turn modifies the weights. The usual rule w \rightarrow w' = w - \eta \nabla C is replaced by

\[ v \rightarrow v' = \mu v - \eta \nabla C \tag{100} \]
\[ w \rightarrow w' = w + v'. \tag{101} \]

Here \mu is a hyper-parameter controlling the amount of damping or friction (it is usually called the momentum coefficient, although it really plays the role of friction), with 0 \leq \mu \leq 1. When \mu = 1 there is no friction and the velocity simply accumulates successive gradients; when \mu = 0 there is maximal friction and (100)-(101) reduce to ordinary gradient descent. Values of \mu greater than 1 or less than 0 should be avoided.
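An added sketch of the momentum rule (100)-(101), applied to the same toy cost C(w) = w_1^2 + w_2^2 used earlier for plain gradient descent:

import numpy as np

def grad_C(w):
    return 2 * w                    # gradient of C(w) = w1^2 + w2^2

w = np.array([3.0, -4.0])
v = np.zeros_like(w)                # velocity
eta, mu = 0.1, 0.9                  # learning rate and momentum coefficient
for step in range(200):
    v = mu*v - eta*grad_C(w)        # rule (100)
    w = w + v                       # rule (101)
print(w)                            # close to the minimum at (0, 0)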
network2.py
* BFGS L-
BFGS BFGS *Nesterov
tanh
x w b tanh
*Efficient BackProp, by Yann LeCun, Lon Bottou, Genevieve Orr and Klaus-Robert Mller (1998).
* On the importance of initialization and momentum in deep learning, by Ilya Sutskever, James Martens, George Dahl, and Geoffrey Hinton (2012).
The tanh neuron with input x, weight vector w, and bias b has output

\[ \tanh(w \cdot x + b), \tag{102} \]

where tanh is the hyperbolic tangent function,

\[ \tanh(z) \equiv \frac{e^z - e^{-z}}{e^z + e^{-z}}. \tag{103} \]

A little algebra shows that the tanh neuron is just a rescaled version of the sigmoid neuron, since

\[ \sigma(z) = \frac{1 + \tanh(z/2)}{2}. \tag{104} \]
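A quick numerical check of the identity (104) (added for illustration):

import numpy as np
z = np.linspace(-5, 5, 11)
sigma = 1.0/(1.0 + np.exp(-z))
print(np.allclose(sigma, (1 + np.tanh(z/2))/2))   # True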
[Figure: the tanh function, an S-shaped curve running from -1 to 1 as z goes from -4 to 4.]
tanhtanh -11tanh
tanh -11* tanh
(104)
(w)tanh tanh * l + 1j l+1
jk
al l+1
*
* Efficient BackProp, by Yann LeCun, Lon Bottou, Genevieve Orr and Klaus-Robert Mller (1998), Understanding the difficulty of training deep feedforward networks, by Xavier Glorot and Yoshua Bengio (2010)
k j
(j) l+1
l+1 wl+1
jjk
l+1 wl+1
jjk
tanh tanh
tanh tanh
Another variation on the sigmoid neuron is the rectified linear neuron, or rectified linear unit (ReLU). With input x, weight vector w, and bias b, its output is

\[ \max(0, w \cdot x + b). \tag{105} \]
[Figure: the function max(0, z), which is zero for z up to 0 and rises linearly for z > 0, plotted for z from -4 to 5.]
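A minimal added sketch of a rectified linear unit's output, Equation (105):

import numpy as np

def relu_neuron(x, w, b):
    # Output of a rectified linear unit: max(0, w.x + b).
    return np.maximum(0.0, np.dot(w, x) + b)

w = np.array([0.5, -1.0])
print(relu_neuron(np.array([1.0, 2.0]), w, 0.3))   # 0.0  (w.x + b = -1.2)
print(relu_neuron(np.array([2.0, 0.0]), w, 0.3))   # 1.3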
tanh tanhReLU
tanhReLU * ReLU tanhReLU 0 1 tanh ReLU ReLU 2ReLU
* Kevin Jarrett, Koray Kavukcuoglu, Marc'Aurelio Ranzato, Yann
LeCun What is the Best Multi-Stage Architecture for Object Recognition? (2009),
Xavier Glorot, Antoine Bordes, Yoshua Bengio Deep Sparse Rectier Neural Networks
(2011), Alex Krizhevsky, Ilya Sutskever, Geoffrey Hinton ImageNet Classification
with Deep Convolutional Neural Networks
(2012) ReLU ReLU Vinod Nair Geoffrey Hinton Rectified Linear Units Improve Restricted Boltzmann Machines (2010)
- Question and answer, Yann LeCun
...
*
* Alex Krizhevsky, Ilya Sutskever, Geoffrey Hinton ImageNet Classification with Deep Convolutional Neural Networks
(2012)
CHAPTER 4
(http://nnadl-ja.github.io/nnadl_site_ja/chap4.html)
1 f(x)
x f(x)
f = f(x1, , xm) m = 3n = 2
1
*
1
* mp4 *
*Approximation by superpositions of a sigmoidal function, George Cybenko (1989). Cybenko Multilayer feedforward networks are universal approximators, Kurt Hornik, Maxwell Stinchcombe, Halbert White (1989)
1 1
2
2
f(x)3 3 5
f(x) > 0 g(x)|g(x) f(x) < |x
2
1 12 1
1
11
f 21
1
Consider a hidden neuron computing \sigma(wx + b), where \sigma(z) \equiv 1/(1 + e^{-z}). As the weight w is made larger and larger - say w = 100, or w = 999 - the output becomes an increasingly sharp, step-like function of the input x. In the limit of very large w the output is, to an excellent approximation, a step function, and the step occurs where wx + b = 0, i.e., at the point x = s \equiv -b/w. So a hidden neuron with a huge weight can be summarized by a single parameter, the step position s = -b/w; to realize a desired step position s we simply set b = -ws for some very large w.
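An added sketch showing how the output sharpens into a step at s = -b/w as the weight grows:

import numpy as np

def sigmoid(z):
    return 1.0/(1.0 + np.exp(-z))

s = 0.4                              # desired step position
for w in (10.0, 100.0, 1000.0):
    b = -w*s                         # so that w*x + b = 0 exactly at x = s
    x = np.array([0.35, 0.39, 0.41, 0.45])
    print(w, sigmoid(w*x + b))
# As w grows, the outputs approach 0 for x < 0.4 and 1 for x > 0.4.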
2s1 s2 w1 , w2
w1a1 + w2a2 a a *
* b
12
(activation)a
s1 s1
s2
s2
10
(w1a1 + w2a2 + b)
w1 0.8w2 0.8 s1 s2 0.8
h "s1 = ""w1 = "
h
if-then-
else
if input >= step point:
1else:
0
if-then-else
212
h h
N [0, 1]N N N = 5
5 (0, 1/5), (1/5, 2/5), , (4/5, 5/5) 5
h hh h
h
+hh
h
\[ f(x) = 0.2 + 0.4 x^2 + 0.3 \sin(15 x) + 0.05 \cos(50 x), \tag{106} \]
x01y01
j wjaj
(j wjaj + b) b
1 f(x) 1
f(x)*
2 1 0.40
1 2 h0.40
*0
f(x)
w = 1000
b = ws2s = 0.2b = 1000 0.2 = 200
hh
h = -1.2 12 -1.2 1.2
0
f(x) = 0.2 + 0.4x2 + 0.3 sin(15x) + 0.05 cos(50x) [0, 1][0, 1] 1
22
2
x, yw1 , w2 b w2 01w1 b
(Outputy=1x=1)
w2 = 0y x
w2 0w1 w1 = 100
3 sx b/w1
1
(Outputy=1x=1)
xw1 = 1000w2 = 0 xx
yw2 = 1000x0w1 = 0 y
(Outputy=1x=1)
yy xy yyx0
3x2 hh
h
(Weighted output from hidden layery=1x=1)
h h
0.30 0.70
x yy yx0
(Weighted output from hidden layery=1x=1)
y xy yx0
hxy2
(Weighted output from hidden layery=1x=1)
0 xy
h xy
(Tower functiony=1x=1)
(Many towersy=1x=1)
2hh
if-then-else
if input >= threshold:
1else:
0
1
if >= threshold:
1else:
0
threshold
3h/2
0
hb 2 (1) if-then-
elsehh (2) bif-then-else
(Outputy=1x=1)
h if-then-else b 3h/2
h = 10
h hb = 3h/2
22
2
2
(Weighted outputy=1x=1)
22
(Many towersy=1x=1)
21 ff
2
3x1, x2, x3 4
x1, x2, x3 s1, t1 s1, t1 , s2, 2+hhh5h/2
x1 s1 t1 x2 s2 t2 x3 s3 t3 3 10 10
3 m (m + 1/2)h
f(x1, , xm) Rn nf 1(x1, , xm), f 2(x1, , xm)
f 1 f 2
2 1 2 (a) xy (b)
(a) (c)
(c)
x1, x2, (j wjxj + b) wjb
s(z)
x1, x2, w1, w2, bs(j wjxj + b)
w = 100
s(z) s(z)z z well-defined 2 2 s(z)
Rectied Linear Unit Rectier Linear Unit
s(z) = z
1
f 1 f(x)
1 f(x)
1 f(x)/2
1
1 f(x)/2
1 f(x)/21 1 f(x) 2
M M 1 f(x)/M 1 M
NAND
2 1
1
1
. .
Jen DoddChris Olah Chris Mike
Bostock Amit Patel Bret Victor Steven
Wittens
CHAPTER 5
(http://nnadl-ja.github.io/nnadl_site_ja/chap5.html)
ANDOR 2
AND NAND
NANDAND
2
2 2 22
-
3 3
1984Furst
SaxeSipser*
* Parity, Circuits, and the Polynomial-Time Hierarchy, by Merrick Furst, James B. Saxe, and Michael Sipser (1984)
98
*
* On the number of response regions of deep feed forward networks with piece-wise linear activations, by Razvan Pascanu, Guido Montfar, and Yoshua Bengio
(2014). 2 Learning deep architectures for AI, by Yoshua Bengio (2009).
1 MNIST*
Python 2.7Numpy
git clone https://github.com/mnielsen/neural-networks-and-deep-learning.git
git
src
PythonMNIST
>>> import mnist_loader
>>> training_data, validation_data, test_data = \
... mnist_loader.load_data_wrapper()
*MNIST
>>> import network2
>>> net = network2.Network([784, 30, 10])
784 28 28 = 784 3010
MNIST ('0',
'1', '2',, '9') 10
10 = 0.1 = 5.0 30 validation data**
>>> net.SGD(training_data, 30, 10, 0.1, lmbda=5.0,
... evaluation_data=validation_data, monitor_evaluation_accuracy=True)
96.48%
30
>>> net = network2.Network([784, 30, 30, 10])
>>> net.SGD(training_data, 30, 10, 0.1, lmbda=5.0,
... evaluation_data=validation_data, monitor_evaluation_accuracy=True)
96.90% 30
>>> net = network2.Network([784, 30, 30, 30, 10])
>>> net.SGD(training_data, 30, 10, 0.1, lmbda=5.0,
... evaluation_data=validation_data, monitor_evaluation_accuracy=True)
96.57% 1
>>> net = network2.Network([784, 30, 30, 30, 30, 10])
>>> net.SGD(training_data, 30, 10, 0.1, lmbda=5.0,
... evaluation_data=validation_data, monitor_evaluation_accuracy=True)
gives 96.53 percent: another slight drop. Adding extra hidden layers is not helping.
* 2,3
*
To get some insight into what is going wrong, consider the [784, 30, 30, 10] network, with two hidden layers of 30 neurons each, and look at the gradient ∂C/∂b for every neuron in those two hidden layers at the start of training.* (The figures in the original text were produced with generate_gradient.py.) Define δˡⱼ ≡ ∂C/∂bˡⱼ, the gradient component for the j-th neuron in the l-th hidden layer; the vector δˡ then measures how quickly layer l learns.* Measuring the lengths of these vectors for the network above gives ‖δ¹‖ = 0.07 and ‖δ²‖ = 0.31: the neurons in the first hidden layer are learning roughly four times more slowly than those in the second hidden layer.
*A similar story holds for the gradient with respect to the weights, ∂C/∂w.
The pattern persists with more layers. With three hidden layers of 30 neurons the speeds of learning are 0.012, 0.060, and 0.283, respectively: again the earlier hidden layers learn much more slowly than the later ones. With four hidden layers the speeds are 0.003, 0.017, 0.070, and 0.285.
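A minimal sketch of this kind of measurement (my own illustration, in the spirit of the book's generate_gradient.py; it assumes network2.Network exposes a backprop(x, y) method returning (nabla_b, nabla_w) and a biases attribute, as in the book's code repository):

import numpy as np
import mnist_loader, network2

training_data, _, _ = mnist_loader.load_data_wrapper()
net = network2.Network([784, 30, 30, 10])

# Average the bias gradients over a batch of training examples.
batch = training_data[:1000]
nabla_b = [np.zeros(b.shape) for b in net.biases]
for x, y in batch:
    delta_nabla_b, _ = net.backprop(x, y)
    nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_b = [nb / len(batch) for nb in nabla_b]

# The last entry belongs to the output layer; the first two are the hidden layers.
for l, nb in enumerate(nabla_b[:-1], start=1):
    print("hidden layer %d speed %.4f" % (l, np.linalg.norm(nb)))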
So far we have only looked at the gradient at the very start of training. We can also track how the learning speed of each layer evolves over the course of training. To keep the experiments manageable they used 1,000 training images (rather than the full 50,000) and full-batch gradient descent over 500 epochs. The same qualitative picture emerges for networks with two hidden layers, with three hidden layers ([784, 30, 30, 30, 10]), and with four hidden layers ([784, 30, 30, 30, 30, 10]): the early hidden layers learn much more slowly than the later hidden layers, often by a factor of 100 or more.
This phenomenon is known as the vanishing gradient problem.*
*Gradient flow in recurrent nets: the difficulty of learning long-term dependencies, by Sepp Hochreiter, Yoshua Bengio, Paolo Frasconi, and Jürgen Schmidhuber (2001). The paper studies recurrent neural networks, but the essential phenomenon is the same as in the feedforward networks studied here.
1 1 f(x)
f (x)
MNIST
[784, 30, 30, 30, 10]
1 3
The problem was first studied in depth in Sepp Hochreiter's thesis, Untersuchungen zu dynamischen neuronalen Netzen (1991, in German).
What causes the vanishing gradient problem? To understand it, consider the simplest deep network: one with a single neuron in each layer. The weights are w1, w2, w3, w4, the biases are b1, b2, b3, b4, and C is the cost. The activation of the j-th neuron is aj = σ(zj), where zj = wj·a_{j−1} + bj is its weighted input, and the cost C is a function of the final activation a4.

We will study the gradient ∂C/∂b1 associated with the first hidden neuron. The expression we obtain will involve a factor σ'(zj) for each neuron, a factor wj for each weight, and a final ∂C/∂a4 term, and it will explain why the gradient vanishes.

Imagine making a small change Δb1 in the bias b1. That produces a change Δa1 in the first activation, which changes the weighted input z2 to the second neuron, which changes a2, and so on, all the way through to a change ΔC in the cost:

  ΔC ≈ (∂C/∂b1) Δb1.
  (114)

So by tracking how the change propagates we can work out ∂C/∂b1.

The change Δb1 causes a change Δa1 in the first activation a1 = σ(z1) = σ(w1·a0 + b1):

  Δa1 ≈ (∂σ(w1·a0 + b1)/∂b1) Δb1
  (115)
      = σ'(z1) Δb1.
  (116)

The σ'(z1) factor should look familiar: it is the first term in our expression for ∂C/∂b1. The change Δa1 in turn changes the weighted input z2 = w2·a1 + b2 to the second neuron:

  Δz2 ≈ (∂z2/∂a1) Δa1
  (117)
      = w2 Δa1.
  (118)

Combining this with the expression for Δa1,

  Δz2 ≈ σ'(z1) w2 Δb1.
  (119)

We can keep going in this way, picking up a factor σ'(zj) and a factor wj at each neuron, until we reach the cost C:

  ΔC ≈ σ'(z1) w2 σ'(z2) … σ'(z4) (∂C/∂a4) Δb1.
  (120)

Dividing by Δb1 gives the gradient we wanted:

  ∂C/∂b1 = σ'(z1) w2 σ'(z2) … σ'(z4) ∂C/∂a4.
  (121)

Written out in full,

  ∂C/∂b1 = σ'(z1) · w2 σ'(z2) · w3 σ'(z3) · w4 σ'(z4) · ∂C/∂a4.
  (122)

Apart from the final ∂C/∂a4 term, this is a product of factors of the form wj σ'(zj). To understand how those factors behave, look at a plot of the derivative of the sigmoid function:
[Figure: plot of the derivative of the sigmoid function, σ'(z), for z from −4 to 4; the maximum value is σ'(0) = 1/4.]
The derivative has its maximum at σ'(0) = 1/4. If the weights are initialized in the usual way, from a Gaussian with mean 0 and standard deviation 1, they typically satisfy |wj| < 1, and so the factors satisfy |wj σ'(zj)| < 1/4. A product of many such factors shrinks exponentially with the number of factors. Compare ∂C/∂b1 with the corresponding expression for ∂C/∂b3: the two share most of their terms, but ∂C/∂b1 contains two extra factors of the form wj σ'(zj), each smaller than 1/4, so ∂C/∂b1 will typically be at least a factor of 16 smaller than ∂C/∂b3. This is the essential origin of the vanishing gradient problem. The argument is informal rather than a rigorous proof: in particular, the weights wj may grow during training, and if they do, the factors wj σ'(zj) need no longer satisfy |wj σ'(zj)| < 1/4.
Indeed, if the factors become larger than 1 we get the opposite problem: an exploding gradient. As an example, make the weights large, say w1 = w2 = w3 = w4 = 100, and choose the biases so that every weighted input is zj = 0 (so that σ'(zj) = 1/4); for instance, pick b1 = −100·a0 so that z1 = w1·a0 + b1 = 0, and similarly for the later biases. Each factor is then wj σ'(zj) = 100 × 1/4 = 25, and the gradient grows exponentially as we move backward through the layers.
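A small numerical sketch (my own, under the simplified one-neuron-per-layer model above) showing how the product of wj σ'(zj) factors behaves for small versus large weights:

import numpy as np

def sigmoid_prime(z):
    s = 1.0 / (1.0 + np.exp(-z))
    return s * (1.0 - s)

def gradient_factor_product(weights, zs):
    # Product of the w_j * sigma'(z_j) factors appearing in dC/db_1.
    return np.prod([w * sigmoid_prime(z) for w, z in zip(weights, zs)])

# Vanishing: weights of typical size ~1, weighted inputs of typical size ~1.
print(gradient_factor_product([0.8, 0.9, 1.1, 0.7], [0.5, -1.0, 0.3, 1.2]))
# ~1e-3: each factor is below 1/4, so the product shrinks quickly.

# Exploding: weights of 100 with biases tuned so that every z_j = 0.
print(gradient_factor_product([100, 100, 100, 100], [0, 0, 0, 0]))
# 25^4 = 390625: the product grows quickly instead.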
In the analysis above we used the bound |σ'(z)| ≤ 1/4.

Problem: To avoid the vanishing gradient we would need |w σ'(z)| ≥ 1, that is |w σ'(wa + b)| ≥ 1, where a is the input activation to the neuron. Note that σ'(wa + b) itself depends on w, so simply making w large is not enough. (1) Argue that |w σ'(wa + b)| ≥ 1 can only occur if |w| ≥ 4. (2) Supposing |w| ≥ 4, consider the set of input activations a for which |w σ'(wa + b)| ≥ 1. Show that this set ranges over an interval of width no greater than

  (2/|w|) ln( |w|(1 + √(1 − 4/|w|))/2 − 1 ).
  (123)

(3) Show numerically that this bound is largest at |w| ≈ 6.9, where it takes a value of about 0.45. So even when everything lines up perfectly, only a fairly narrow range of input activations escapes the vanishing gradient.
Identity neuron: Consider a neuron with a single input x, weight w1, bias b, and output weight w2, so its contribution to the next layer is w2·σ(w1x + b). Show that the parameters can be chosen so that for x ∈ [0, 1] the output w2·σ(w1x + b) approximates x itself, i.e. the neuron acts as an identity neuron. (Hint: rewrite x = 1/2 + Δ, assume w1 is small, and expand in powers of w1Δ.)
What about deep networks with many neurons in each layer? For a network with L layers, the gradient associated with the l-th layer is

  δ^l = Σ'(z^l)(w^{l+1})^T Σ'(z^{l+1})(w^{l+2})^T … Σ'(z^L) ∇_a C,
  (124)

where Σ'(z^l) is a diagonal matrix whose diagonal entries are the σ'(z) values for the weighted inputs to the l-th layer, the w^l are the weight matrices of the network, and ∇_a C is the vector of partial derivatives of C with respect to the output activations. This is far more complicated than the single-neuron-per-layer case, but the essential structure is the same: a long product of pairs of the form (w^j)^T Σ'(z^j). As before, the diagonal entries of Σ'(z^j) are at most 1/4, and provided the weight matrices w^j are not too large, each extra factor (w^j)^T Σ'(z^l) tends to shrink the gradient vector, producing a vanishing gradient.
Unstable gradients are not the only obstacle to training deep networks. First, in 2010 Glorot and Bengio* found evidence that sigmoid activation functions can cause the activations in the final hidden layer to saturate near 0 early in training, substantially slowing learning. Second, in 2013 Sutskever, Martens, Dahl and Hinton* studied the effect on deep learning of both the random weight initialization and the momentum schedule in momentum-based stochastic gradient descent. In both cases these choices had a large effect on how well deep networks could be trained.
*Understanding the difficulty of training deep feedforward neural networks, by Xavier Glorot and Yoshua Bengio (2010). See also Efficient BackProp, by Yann LeCun, Léon Bottou, Genevieve Orr and Klaus-Robert Müller (1998).
*On the importance of initialization and momentum in deep learning, by Ilya Sutskever, James Martens, George Dahl and Geoffrey Hinton (2013).
CHAPTER 6
(2018/8/3) (Neural networks and deep learning)
(http://nnadl-ja.github.io/nnadl_site_ja/chap6.html) (10/53)
In this chapter we develop deep convolutional networks and apply them to MNIST. With the help of GPU training, our best network will classify 9,967 of the 10,000 test images correctly, missing only 33.
3 "8""8""9" "9" ""
Later in the chapter we also briefly survey other neural network models, such as recurrent neural networks (RNNs) and long short-term memory (LSTM) units.
1 25
Up to now we have attacked MNIST with fully-connected networks: each 28 × 28 image is flattened into 784 (= 28 × 28) input neurons, and the network is trained to classify the image as a '0', '1', '2', …, '8', or '9'. That approach works surprisingly well, reaching accuracies above 98 percent, but it throws away the spatial structure of the image: pixels far apart and pixels close together are treated on exactly the same footing. Convolutional neural networks instead exploit that spatial structure, using three basic ideas: local receptive fields, shared weights, and pooling. Rather than connecting every input pixel to every hidden neuron, each hidden neuron is connected only to a small, localized region of the 28 × 28 input image, for example a 5 × 5 window of 25 input pixels.
*Convolutional networks go back to the 1970s, but their modern form was established in the seminal 1998 paper "Gradient-based learning applied to document recognition" by Yann LeCun, Léon Bottou, Yoshua Bengio, and Patrick Haffner. LeCun has since remarked that the biological inspiration behind such networks is tenuous, which is why he prefers the term "convolutional nets" to "convolutional neural nets".
Each neuron in the first hidden layer is connected to a 5 × 5 local receptive field in the input image. We then slide the receptive field across the entire image, one pixel at a time (a stride length of 1), with one hidden neuron for each position.* Starting from a 28 × 28 input image with 5 × 5 local receptive fields, this gives a hidden layer of 24 × 24 neurons; the neuron at position j, k sees the 5 × 5 window whose top-left corner is at j, k.

*Different stride lengths (2 or more) are sometimes used. And although 5 × 5 receptive fields work well for the 28 × 28 MNIST images, larger receptive fields are often used for larger input images.
All 24 × 24 hidden neurons in a feature map share the same weights and bias. In other words, the output of the j, k-th hidden neuron is

  σ( b + Σ_{l=0}^{4} Σ_{m=0}^{4} w_{l,m} a_{j+l, k+m} ),
  (125)

where σ is the sigmoid, b is the shared bias, w_{l,m} is the shared 5 × 5 array of weights, and a_{x,y} denotes the input activation at position x, y.
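A minimal numpy sketch of equation (125) (my own illustration; the array names are assumptions, not taken from network3.py):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

image = np.random.rand(28, 28)      # input activations a_{x,y}
weights = np.random.randn(5, 5)     # shared weights w_{l,m}
bias = np.random.randn()            # shared bias b

# Feature map of 24 x 24 hidden activations, following equation (125).
feature_map = np.zeros((24, 24))
for j in range(24):
    for k in range(24):
        window = image[j:j+5, k:k+5]
        feature_map[j, k] = sigmoid(bias + np.sum(weights * window))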
*1 *
*
*MNIST MNIST
5 5
The map from the input layer to the hidden layer described above detects just one kind of localized feature; a complete convolutional layer consists of several different feature maps, each with its own shared weights and bias. The early LeNet-5 network used 6 feature maps, each with a 5 × 5 receptive field, to recognize MNIST digits; the networks later in this chapter use convolutional layers with 20 and 40 feature maps. It is worth glancing at the 20 feature maps, that is, 20 sets of 5 × 5 shared weights, learned by one of the trained networks:* many of them show clear spatial structure.
*
Visualizing and Understanding Convolutional Networks, by Matthew Zeiler and Rob Fergus (2013).
A big advantage of sharing weights and biases is that it greatly reduces the number of parameters. Each feature map needs 25 = 5 × 5 shared weights plus 1 shared bias, i.e. 26 parameters, so a convolutional layer with 20 feature maps has only 20 × 26 = 520 parameters. By comparison, a fully-connected first layer with 784 = 28 × 28 input neurons and a modest 30 hidden neurons has 784 × 30 weights plus 30 biases, a total of 23,550 parameters, roughly 40 times as many.
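As a quick check of the arithmetic (a sketch of my own, not from the original text):

conv_params = 20 * (5 * 5 + 1)            # 20 feature maps, 5x5 weights + 1 bias each = 520
dense_params = 784 * 30 + 30              # fully-connected 784 -> 30 layer = 23550
ratio = float(dense_params) / conv_params # ~45: the dense layer has ~45x more parameters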
40
2
Incidentally, the name convolutional comes from the fact that the operation in equation (125) is sometimes known as a convolution; the equation is occasionally written as a¹ = σ(b + w ∗ a⁰), where a¹ denotes the set of output activations from one feature map, a⁰ is the set of input activations, and ∗ denotes the convolution operation.
Convolutional networks also contain pooling layers, used immediately after the convolutional layers, which condense each feature map. For example, each unit in the pooling layer may summarize a 2 × 2 region of neurons in the previous layer. In max-pooling the pooling unit simply outputs the maximum activation in its 2 × 2 input region, so a 24 × 24 feature map is condensed to 12 × 12 neurons. Pooling is applied to each feature map separately; with 3 feature maps, the combined convolutional and max-pooling layers produce 3 condensed maps. Intuitively, max-pooling asks whether a given feature is found anywhere in a region of the image, and then throws away the exact positional information.*

Max-pooling is not the only pooling technique. Another common approach is L2 pooling, which takes the square root of the sum of the squares of the activations in the 2 × 2 region. The details differ, but it serves the same purpose of condensing the feature maps, and both techniques are widely used in practice.
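A minimal numpy sketch of 2 × 2 max-pooling over one feature map (my own illustration, not code from network3.py):

import numpy as np

feature_map = np.random.rand(24, 24)

# 2x2 max-pooling: reshape into 12x2x12x2 blocks and take the max in each block.
pooled = feature_map.reshape(12, 2, 12, 2).max(axis=(1, 3))
print(pooled.shape)   # (12, 12)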
We can now put these ideas together into a complete convolutional network for MNIST. The network begins with 28 × 28 input neurons, followed by a convolutional layer with 5 × 5 local receptive fields and 3 feature maps, giving 3 × 24 × 24 hidden feature neurons. Next comes a max-pooling layer applied to 2 × 2 regions across each of the 3 feature maps, leaving 3 × 12 × 12 hidden neurons. The final layer is fully connected: every neuron of the max-pooled layer connects to each of the 10 output neurons, one per digit class ('0', '1', '2', etc.). We train such networks with stochastic gradient descent and backpropagation, just as before.

Problem: How does backpropagation change in a convolutional network? Work out how the backpropagation equations (BP1)-(BP4) from the earlier chapter must be modified to handle convolutional and max-pooling layers.
Let us put these ideas into practice on MNIST using a new program, network3.py. It is similar in spirit to network.py and network2.py from the earlier chapters,* and it is available on GitHub alongside the rest of the code; but whereas network.py and network2.py were written in plain Python and Numpy, network3.py is built on Theano.
*network3.py is built on Theano; its convolutional layers draw on Theano's LeNet-5-style implementation, and the code also owes debts to Misha Denil and Chris Olah.
Using Theano* makes it easy to implement backpropagation for convolutional networks, since Theano automatically computes all the required gradient expressions. Theano also makes the code fast, and lets it run on either a CPU or a GPU; running on a GPU gives a substantial speedup. The code in this section was written for Theano 0.6,* and has been run on Mac OS X Yosemite without a GPU and on Ubuntu 14.04 with an NVIDIA GPU. network3.py contains a flag which should be set to True or False depending on whether a GPU is available; if you do not have a GPU handy, a web search will turn up setup instructions, and Amazon Web Services' EC2 G2 instances offer GPU machines for rent. Even with a GPU the training runs below take some time to complete, and on a CPU considerably longer.
As a baseline we start with a shallow architecture containing a single fully-connected hidden layer of 100 neurons, trained for 60 epochs with learning rate η = 0.1 and a mini-batch size of 10*:
>>> import network3
>>> from network3 import Network
>>> from network3 import ConvPoolLayer, FullyConnectedLayer, SoftmaxLayer
>>> training_data, validation_data, test_data = network3.load_data_shared()
>>> mini_batch_size = 10
>>> net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
>>> net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
*Theano is described in Theano: A CPU and GPU Math Expression Compiler in Python, by James Bergstra, Olivier Breuleux, Frédéric Bastien, Pascal Lamblin, Razvan Pascanu, Guillaume Desjardins, Joseph Turian, David Warde-Farley, and Yoshua Bengio (2010). Theano is also the basis for libraries such as Pylearn2 and Keras; other deep learning libraries in use at the time of writing include Caffe and Torch.
*As this chapter was released the current version of Theano was 0.7; the examples have in fact been run under both Theano 0.6 and 0.7, with very similar results.
This gives a best classification accuracy of 97.80 percent, measured on the test data at the epoch with the best validation accuracy.* (Results vary a little from run to run; this figure comes from the best of three training runs.) The 97.80 percent is close to the 98.04 percent obtained in Chapter 3 with a similar architecture and the same hyper-parameters: both networks used a single hidden layer of 100 neurons, 60 training epochs, a mini-batch size of 10, and η = 0.1.
2 13
2 3
Now insert a convolutional-pooling layer at the start of the network, using 5 × 5 local receptive fields, a stride length of 1, and 20 feature maps, followed by max-pooling over 2 × 2 regions:*
*3 3
*
>>> net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
                      poolsize=(2, 2)),
        FullyConnectedLayer(n_in=20*12*12, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
>>> net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
That takes us to 98.78 percent accuracy, reducing the error rate by better than a third compared with the earlier result. (A note on terminology: I treat the convolutional and pooling layers as a single combined layer, and network3.py is written the same way, with ConvPoolLayer implementing both; treat this as a naming convention rather than anything deep.) Can we do better still?* Try inserting a second convolutional-pooling layer between the existing convolutional-pooling layer and the fully-connected hidden layer, again with 5 × 5 local receptive fields and 2 × 2 pooling:
>>> net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2)),
ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12),
filter_shape=(40, 20, 5, 5),
                      poolsize=(2, 2)),
        FullyConnectedLayer(n_in=40*4*4, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
>>> net.SGD(training_data, 60, mini_batch_size, 0.1, validation_data, test_data)
Once again this improves matters: we now reach 99.06 percent classification accuracy. Two natural questions arise at this point. First, what does it even mean to apply a second convolutional-pooling layer? You can think of its input as consisting of 12 × 12 "images" whose "pixels" represent the presence (or absence) of particular localized features in the original image. Second, the output from the first layer involves 20 separate feature maps, so the second layer sees 20 × 12 × 12 inputs; in effect there are 20 separate "images", and each neuron in the second convolutional layer learns from all 20 of them within its local receptive field, i.e. from 20 × 5 × 5 input neurons.
*
Problem: Try using the tanh activation function in place of the sigmoid throughout the network.* Begin with the same hyper-parameters as above, training for 60 epochs. Do the tanh networks train faster, and do they reach a better final accuracy? In my experiments the tanh network trains slightly faster, but the final accuracies are nearly identical. Can you explain why, and can you get the sigmoid network to train at a similar speed, perhaps by adjusting the learning rate?

*You can do this by passing activation_fn=tanh as a parameter to the ConvPoolLayer and FullyConnectedLayer classes.

Using rectified linear units: the networks we have developed so far are variants of the architecture used in the seminal 1998 paper* that introduced the MNIST problem, the network known as LeNet-5. A promising further change is to replace the sigmoid activation function with a rectified linear unit (ReLU).
A rectified linear unit computes f(z) ≡ max(0, z). We train for 60 epochs with learning rate η = 0.03, and use L2 regularization with regularization parameter λ = 0.1:
>>> from network3 import ReLU
>>> net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12),
filter_shape=(40, 20, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
        FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
>>> net.SGD(training_data, 60, mini_batch_size, 0.03, validation_data, test_data, lmbda=0.1)
(Here activation_fn=ReLU is passed to both the ConvPoolLayer and FullyConnectedLayer classes.)
*Note that σ(z) = (1 + tanh(z/2))/2, so a tanh network is essentially a rescaled version of a sigmoid network.
*"Gradient-based learning applied to document recognition", by Yann LeCun, Léon Bottou, Yoshua Bengio, and Patrick Haffner (1998).
This gives a classification accuracy of 99.23 percent, a modest improvement over the 99.06 percent obtained with sigmoid activations. Across all my experiments, however, networks based on rectified linear units consistently outperformed networks based on sigmoid activations, so ReLUs are used for the remainder of the chapter. What makes the rectified linear unit better than the sigmoid or tanh? At present we have only a poor theoretical understanding; ReLUs have been adopted largely on empirical grounds.*
Expanding the training data: another way to improve results is to algorithmically expand the training data, for instance by displacing each training image by a single pixel up, down, left, or right. To do so, run the program expand_mnist.py:*
$ python expand_mnist.py
This takes the 50,000 MNIST training images and produces an expanded set of 250,000 images, which we then use to train the same ReLU-based network as before, again for 60 epochs:
>>> expanded_training_data, _, _ = network3.load_data_shared( "../data/mnist_expanded.pkl.gz")
>>> net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12),
* max(0, z) z ReLU 2
*expand_mnist.py
filter_shape=(40, 20, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
        FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
>>> net.SGD(expanded_training_data, 60, mini_batch_size, 0.03, validation_data, test_data, lmbda=0.1)
Using the expanded training data gives 99.37 percent accuracy; this almost trivial change yields a substantial improvement. Expanding the data is an old idea: in 2003 Simard, Steinkraus and Platt* improved their MNIST accuracy to 99.6 percent using a network otherwise very similar to ours, with two convolutional-pooling layers followed by a fully-connected hidden layer of 100 neurons. There were some differences of detail (for instance, they did not have the advantage of rectified linear units), but the key to their improved performance was expanding the training data, including an "elastic distortion" process emulating the random oscillations of hand muscles during writing.

*Best Practices for Convolutional Neural Networks Applied to Visual Document Analysis, by Patrice Simard, Dave Steinkraus, and John Platt (2003).

Inserting an extra fully-connected layer: Can we do better still? One possibility is simply to enlarge the fully-connected layer. I tried 300 and 1,000 neurons, obtaining 99.46 and 99.43 percent respectively, not a compelling improvement over the earlier 99.37 percent.
To see whether an extra fully-connected layer helps, insert a second 100-neuron fully-connected layer:
>>> net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12),
filter_shape=(40, 20, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
        FullyConnectedLayer(n_in=40*4*4, n_out=100, activation_fn=ReLU),
        FullyConnectedLayer(n_in=100, n_out=100, activation_fn=ReLU),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
>>> net.SGD(expanded_training_data, 60, mini_batch_size, 0.03, validation_data, test_data, lmbda=0.1)
Doing this gives 99.43 percent: again the extra layer is not helping much. Similar experiments with fully-connected layers of 300 and 1,000 neurons give 99.48 and 99.47 percent. Why are the extra fully-connected layers not helping more? One possibility is overfitting, and a natural response is dropout (introduced back in Chapter 3): we apply dropout to the final fully-connected layers, randomly removing individual activations during training so that the model becomes more robust:
>>> net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12),
filter_shape=(40, 20, 5, 5),
poolsize=(2, 2), activation_fn=ReLU),
FullyConnectedLayer(
            n_in=40*4*4, n_out=1000, activation_fn=ReLU, p_dropout=0.5),
        FullyConnectedLayer(
            n_in=1000, n_out=1000, activation_fn=ReLU, p_dropout=0.5),
        SoftmaxLayer(n_in=1000, n_out=10, p_dropout=0.5)], mini_batch_size)
>>> net.SGD(expanded_training_data, 40, mini_batch_size, 0.03, validation_data, test_data)
This gives an accuracy of 99.60 percent, a substantial improvement over the earlier results, especially the benchmark network with 100 fully-connected hidden neurons, which achieved 99.37 percent. Two changes are worth noting. First, the number of training epochs was reduced to 40: dropout reduces overfitting, so we do not need to train as long. Second, the fully-connected hidden layers have 1,000 neurons rather than the 100 used earlier; dropout effectively omits many neurons during training, so some expansion is to be expected. (I also tried 300 hidden neurons, and got slightly better validation performance with 1,000.)

Using an ensemble of networks: an easy way to squeeze out a little more performance is to train several networks and have them vote on each classification. I trained five networks of the kind just described, each reaching around 99.6 percent accuracy on its own; combined by majority vote, they reach 99.67 percent on the 10,000 test images. In other words, the ensemble misclassifies only 33 test images.
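A minimal sketch of the voting step (my own illustration; predictions is a hypothetical array of per-network predicted digits for each test image, not data from the book):

import numpy as np

# predictions[i, j]: digit predicted by network i for test image j.
predictions = np.array([
    [7, 2, 1],
    [7, 2, 7],
    [7, 8, 1],
    [7, 2, 1],
    [9, 2, 1],
])

# Majority vote over the 5 networks for each test image.
ensemble = [np.bincount(predictions[:, j], minlength=10).argmax()
            for j in range(predictions.shape[1])]
print(ensemble)   # [7, 2, 1]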
It is worth looking through the 33 test images the ensemble still gets wrong. Many are errors a human might plausibly make; for example, one image officially labelled a '6' genuinely looks more like a '5', and several other cases involve ambiguous digits, while in the remainder the network's answer is simply wrong. Still, classifying 9,967 of the 10,000 MNIST test images correctly is remarkable performance. (Rodrigo Benenson maintains an informative summary page comparing published MNIST results.)
How do these results compare with other approaches? In 2010 Cireșan, Meier, Gambardella, and Schmidhuber* obtained 99.65 percent on MNIST using a network with no convolutional layers at all: a purely fully-connected network with hidden layers of 2,500, 2,000, 1,500, 1,000 and 500 neurons. They expanded the training data in the spirit of Simard et al, but otherwise used what was essentially a 1980s-style network, trained with the help of a GPU.
103 106
You may be wondering how we managed to train deep networks at all, given the vanishing-gradient problems of the last chapter. We avoided the worst of them by: (1) using convolutional layers, which greatly reduce the number of parameters; (2) using stronger regularization (dropout, together with the convolutional layers themselves); (3) using rectified linear units instead of sigmoids, which empirically speeds up training by a factor of roughly 3-5; and (4) using a GPU and being willing to train for a long time, with the final experiments running for 40 epochs on a data set 5 times larger than the raw MNIST training data. Factors (3) and (4) in particular mean we effectively trained something like 30 times longer than in the earlier chapters.
*Deep, Big, Simple Neural Nets Excel on Handwritten Digit Recognition, by Dan Claudiu Cireșan, Ueli Meier, Luca Maria Gambardella, and Jürgen Schmidhuber (2010).
-1 4 4 1 2
2015 10 2000 1 2
3
The code for our convolutional networks: let us now take a brief look at network3.py. Structurally it resembles network2.py, although the details differ because it uses Theano. We start with the FullyConnectedLayer class, which corresponds to the kind of layer studied in the earlier chapters:*
class FullyConnectedLayer(object):

    def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize we