Upload
seiya-tokui
View
37.497
Download
0
Embed Size (px)
Citation preview
ChainerNLP 2015/09/03 @
, Preferred Networks
l (Seiya Tokui) @beam2d (Twitter, GitHub)
l PFI (2012-2014) -> PFN (2014-)
l
Deep Learning2012
l 4 Chainer
2
Chainer
: http://chainer.org
: http://docs.chainer.org
GitHub: https://github.com/pfnet/chainer
l Deep Learning
l Python
l v1.3.0 (2015/9/3)
3
l Chainer
Deep Learning
CUDA CUDA NVIDIA
l
Recurrent Net Chainer
4
l
l
l
6
l
u /
u
l
7
DAG =
l
l
l
8
l z = x ** 2 + 2 * x * y + y
9
x
y
_ ** 2
2 * _ _ * _ _ + _ z
_ + _
l (chain rule)
l
l
l
l
10
z
z = h(y),y = g(x),x = f(w)
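$\frac{\partial z}{\partial w} = \frac{\partial z}{\partial y}\,\frac{\partial y}{\partial x}\,\frac{\partial x}{\partial w} = Dh(y)\,Dg(x)\,Df(w)$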
w,x,y
z w
l
11
Recurrent Net
l
l t=T t=T-1 t=T
12
T
T-1
T
Recurrent Net
l DAG
l DAG Backprop Through Time
13
t=1
t=2
t=3
t=4
Truncated BPTT
l
l Truncated BPTT
14
t=1
t=2
t=3
t=4
Truncated
Chainer
Chainer
l
l Chainer Python
l Chainer
Python
16
Chainer
l Linux Ubuntu
l
Python CPython 2.7+ 3.4+
pip
pip install chainer
chainer import
l Python Anaconda
l Python pyenv OK
pyenv Anaconda
17
l Chainer Variable
l Variable
Function
Variable
Function chainer.functions (=: F)
x = Variable(...)
y = Variable(...)
z = x ** 2 + 2 * x * y + y
18
x
y
_**2
2*_ _*_ _+_ z
_+_
Variable
l
l NumPy CuPy
data
x = Variable(np.zeros((10, 20), dtype=np.float32))
x.data #=>
l Function
x 20 10
l Chainer float32 NumPy/CuPy float64
19
Function
l
l chainer.functions (=: F)
l Function
F.Linear, F.Convolution2D, F.EmbedID, ...
l
F.relu, F.max_pooling_2d, F.lstm, ...
Variable Python
20
FunctionSet
l Function
l FunctionSet
model = FunctionSet(embed=F.EmbedID(10000, 100),
layer1=F.Linear(100, 100),
layer2=F.Linear(100, 10000))
def forward(x):
h = F.relu(model.layer1(model.embed(x)))
return model.layer2(h)
21
l
F.softmax_cross_entropy, F.mean_squared_error
F.NegativeSampling, F.BinaryHierarchicalSoftmax
l Variable.backward()
def forward(x, t):
h = F.relu(model.layer1(model.embed(x)))
return F.softmax_cross_entropy(model.layer2(h), t)
loss = forward(x, t)
loss.backward()
22
Optimizer
l
l Optimizer
chainer.optimizers
SGD, MomentumSGD, AdaGrad, RMSprop, RMSpropGraves, AdaDelta, Adam
l setup
l FunctionSet
optimizer = optimizers.SGD()
optimizer.setup(model)
23
Optimizer
l Optimizer
zero_grads()
weight_decay(), clip_grads()
update()
optimizer.zero_grads()
loss = forward(x, t)
loss.backward()
optimizer.weight_decay(0.005)
optimizer.update()
24
Chainer
1. FunctionSet
2. Optimizer FunctionSet
3. forward
4.
5.
a.
b. forward backward
c. update
6.
a. forward
25
MNIST
# Model definition
model = FunctionSet(
l1=F.Linear(784, 100),
l2=F.Linear(100, 100),
l3=F.Linear(100, 10))
opt = optimizers.SGD()
opt.setup(model)
# Forward computation
def forward(x, t):
h1 = F.relu(model.l1(x))
h2 = F.relu(model.l2(h1))
y = model.l3(h2)
return F.softmax_cross_entropy(
y, t)
# Training loop
for epoch in xrange(n_epoch):
for i in xrange(0, N, batchsize):
x = Variable(...)
t = Variable(...)
opt.zero_grads()
loss = forward(x, t)
loss.backward()
opt.update()
Function
l Function Python
l forward(_cpu/_gpu) backward(_cpu/_gpu)
l
class SquaredDiff(Function):
def forward_cpu(self, inputs):
x, y = inputs
z = x - y
return z * z,
def backward_cpu(self, inputs, grad_outputs):
x, y = inputs
gz, = grad_outputs
gx = 2 * (x - y) * gz
return gx, -gx
27
Function
l Function
l (gradient check)
forward backward
chainer.gradient_check.numerical_grad
l tests/chainer_tests/function_tests
28
CUDA
l Chainer v1.3.0
l CuPy: CUDA
NumPy
u
u reshape
elementwise, reduction
29
CuPy
l CUDA GPU
l CUDA 6.5
Ubuntu deb
l
PATH LD_LIBRARY_PATH
/usr/local/cuda
u PATH=/usr/local/cuda/bin:$PATH
u LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
l Chainer cupy import
30
CuPy
l numpy cupy NumPy
l CPU/GPU
chainer.cuda.get_array_module() cupy.ndarray numpy / cupy
NumPy CuPy logsumexp
def logsumexp(x, axis=None):
xp = cuda.get_array_module(x)
x_max = x.max(axis=axis)
return x_max + xp.log(xp.exp(x - x_max).sum(axis=axis))
31
32
l ID
l F.EmbedID
Function
ID
int32 Variable float32 Variable
l 1 200
embedder = F.EmbedID(10000, 200)
word_arr = np.ndarray((10,), dtype=np.int32)
word = Variable(word_arr)
x = embedder(word) #=> 10 x 200
33
Recurrent Net
l 1 for
l
l DAG
def forward_one_step(x, h, t):
h = F.tanh(model.input(x) + model.lateral(h))
y = model.output(h)
return F.softmax_cross_entropy(y, t), h
h = Variable(np.zeros((1, 200), dtype=np.float32))
accum_loss = 0
for x, t in input_seq:
loss, h = forward_one_step(x, h, t)
accum_loss += loss
34
RNN
# Model definition
model = FunctionSet(
emb=F.EmbedID(1000, 50),
h2h=F.Linear( 50, 50),
h2y=F.Linear( 50, 1000))
opt = optimizers.SGD()
opt.setup(model)
# Forward computation of one step
def fwd1step(h, w, t):
x = model.emb(w)
h = F.tanh(x + model.h2h(h))
y = model.h2y(h)
return F.softmax_cross_entropy(
y, t), h
# Full RNN forward computation
def forward(seq):
h = Variable(...) # init state
accum_loss = 0
for curw, nextw in \
zip(seq, seq[1:]):
x = Variable(curw)
t = Variable(nextw)
loss, h = fwd1step(h, x, t)
accum_loss += loss
return accum_loss
Long Short-Term Memory
l LSTM
l F.lstm
l 2 2
4
4 2
u LSTM 4 OK
36
LSTM RNN
# Model definition
model = FunctionSet(
emb=F.EmbedID(1000, 50*4),
h2h=F.Linear( 50, 50*4),
h2y=F.Linear( 50, 1000))
opt = optimizers.SGD()
opt.setup(model)
# Forward computation of one step
def fwd1step(c, h, w, t):
x = model.emb(w)
c, h = F.lstm(
c, x + model.h2h(h))
y = model.h2y(h)
return F.softmax_cross_entropy(
y, t), c, h
# Full RNN forward computation
def forward(seq):
c = Variable(...) # init cell
h = Variable(...) # init state
accum_loss = 0
for curw, nextw in \
zip(seq, seq[1:]):
x = Variable(curw)
t = Variable(nextw)
loss, c, h = fwd1step(
c, h, x, t)
accum_loss += loss
return accum_loss
unchain_backward Truncated BPTT
l Truncated BPTT unchain_backward()
Variable
Python Variable
accum_loss = 0
for i, x in enumerate(batches):
loss, h = forward_one_step(*x) # forward
accum_loss += loss
if i % 30 == 0:
optimizer.zero_grads()
accum_loss.backward() # backward
accum_loss.unchain_backward() # truncate graph
optimizer.update()
accum_loss = 0
38
Examples
examples
l mnist: MNIST
l imagenet: ImageNet ConvNet
l modelzoo: Caffe
l ptb: Penn Treebank LSTM
Truncated BPTT
l word2vec: word2vec PTB
l sentiment: Recursive Net
39
l
l Chainer
l Chainer
l Chainer
40