Chainerの使い方と自然言語処理への応用

ChainerNLP 2015/09/03 @

, Preferred Networks

l (Seiya Tokui) @beam2d (Twitter, GitHub)

l PFI (2012-2014) -> PFN (2014-)

l

Deep Learning2012

l 4 Chainer

2

Chainer

: http://chainer.org

: http://docs.chainer.org

GitHub: https://github.com/pfnet/chainer

l Deep Learning

l Python

l v1.3.0 (2015/9/3)

3

l Chainer

Deep Learning

CUDA CUDA NVIDIA

l

Recurrent Net Chainer

4

l

l

l

6

l

u /

u

l

7

DAG =

l

l

l

8

l z = x ** 2 + 2 * x * y + y

9

x

y

_ ** 2

2 * _ _ * _ _ + _ z

_ + _

l (chain rule)

l

l

l

l

10

z

z = h(y),y = g(x),x = f(w)

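$$\frac{\partial z}{\partial w} = \frac{\partial z}{\partial y}\,\frac{\partial y}{\partial x}\,\frac{\partial x}{\partial w} = Dh(y)\,Dg(x)\,Df(w)$$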

w,x,y

z w

Recurrent Net

l

l t=T t=T-1 t=T

12

T

T-1

T

Recurrent Net

l DAG

l DAG Backprop Through Time

13

t=1

t=2

t=3

t=4

Truncated BPTT

l

l Truncated BPTT

14

t=1

t=2

t=3

t=4

Truncated

Chainer

Chainer

l

l Chainer Python

l Chainer

Python

16

Chainer

l Linux Ubuntu

l

Python CPython 2.7+ 3.4+

pip

pip install chainer

chainer import

l Python Anaconda

l Python pyenv OK

pyenv Anaconda

17

l Chainer Variable

l Variable

Function

Variable

Function chainer.functions (=: F)

x = Variable(...)

y = Variable(...)

z = x ** 2 + 2 * x * y + y

18

x

y

_**2

2*_ _*_ _+_ z

_+_

Variable

l

l NumPy CuPy

data

x = Variable(np.zeros((10, 20), dtype=np.float32))

x.data #=>

l Function

x 20 10

l Chainer float32 NumPy/CuPy float64

19

Function

l

l chainer.functions (=: F)

l Function

F.Linear, F.Convolution2D, F.EmbedID, ...

l

F.relu, F.max_pooling_2d, F.lstm, ...

Variable Python

20

FunctionSet

l Function

l FunctionSet

model = FunctionSet(embed=F.EmbedID(10000, 100),

layer1=F.Linear(100, 100),

layer2=F.Linear(100, 10000))

def forward(x):

h = F.relu(model.layer1(model.embed(x)))

return model.layer2(h)

21

l

F.softmax_cross_entropy, F.mean_squared_error

F.NegativeSampling, F.BinaryHierarchicalSoftmax

l Variable.backward()

def forward(x, t):

h = F.relu(model.layer1(model.embed(x)))

return F.softmax_cross_entropy(model.layer2(h), t)

loss = forward(x, t)

loss.backward()

22

Optimizer

l

l Optimizer

chainer.optimizers

SGD, MomentumSGD, AdaGrad, RMSprop, RMSpropGraves, AdaDelta, Adam

l setup

l FunctionSet

optimizer = optimizers.SGD()

optimizer.setup(model)

23

Optimizer

l Optimizer

zero_grads()

weight_decay(), clip_grads()

update()

optimizer.zero_grads()

loss = forward(x, t)

loss.backward()

optimizer.weight_decay(0.005)

optimizer.update()

24

Chainer

1. FunctionSet

2. Optimizer FunctionSet

3. forward

4.

5.

a.

b. forward backward

c. update

6.

a. forward

25

MNIST

# Model definition

model = FunctionSet(

l1=F.Linear(784, 100),

l2=F.Linear(100, 100),

l3=F.Linear(100, 10))

opt = optimizers.SGD()

opt.setup(model)

# Forward computation

def forward(x, t):

h1 = F.relu(model.l1(x))

h2 = F.relu(model.l2(h1))

y = model.l3(h2)

return F.softmax_cross_entropy(

y, t)

# Training loop

for epoch in xrange(n_epoch):

for i in xrange(0, N, batchsize):

x = Variable(...)

t = Variable(...)

opt.zero_grads()

loss = forward(x, t)

loss.backward()

opt.update()

Function

l Function Python

l forward(_cpu/_gpu) backward(_cpu/_gpu)

l

class SquaredDiff(Function):

def forward_cpu(self, inputs):

x, y = inputs

z = x - y

return z * z,

def backward_cpu(self, inputs, grad_outputs):

x, y = inputs

gz, = grad_outputs

gx = 2 * (x - y) * gz

return gx, -gx

27

Function

l Function

l (gradient check)

forward backward

chainer.gradient_check.numerical_grad

l tests/chainer_tests/function_tests

28

CUDA

l Chainer v1.3.0

l CuPy: CUDA

NumPy

u

u reshape

elementwise, reduction

29

CuPy

l CUDA GPU

l CUDA 6.5

Ubuntu deb

l

PATH LD_LIBRARY_PATH

/usr/local/cuda

u PATH=/usr/local/cuda/bin:$PATH

u LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

l Chainer cupy import

30

CuPy

l numpy cupy NumPy

l CPU/GPU

chainer.cuda.get_array_module() cupy.ndarray numpy / cupy

NumPy CuPy logsumexp

def logsumexp(x, axis=None):

xp = cuda.get_array_module(x)

x_max = x.max(axis=axis)

return x_max + xp.log(xp.exp(x - x_max).sum(axis=axis))

31

l ID

l F.EmbedID

Function

ID

int32 Variable float32 Variable

l 1 200

embedder = F.EmbedID(10000, 200)

word_arr = np.ndarray((10,), dtype=np.int32)

word = Variable(word_arr)

x = embedder(word) #=> 10 x 200

33

Recurrent Net

l 1 for

l

l DAG

def forward_one_step(x, h, t):

h = F.tanh(model.input(x) + model.lateral(h))

y = model.output(h)

return F.softmax_cross_entropy(y, t), h

h = Variable(np.zeros((1, 200), dtype=np.float32))

accum_loss = 0

for x, t in input_seq:

loss, h = forward_one_step(x, h, t)

accum_loss += loss

34

RNN

# Model definition


model = FunctionSet(

emb=F.EmbedID(1000, 50),

h2h=F.Linear( 50, 50),

h2y=F.Linear( 50, 1000))


opt = optimizers.SGD()

opt.setup(model)

# Forward computation of one step

def fwd1step(h, w, t):

x = model.emb(w)

h = F.tanh(x + model.h2h(h))

y = model.h2y(h)


return F.softmax_cross_entropy(

y, t), h

# Full RNN forward computation

def forward(seq):

h = Variable(...) # init state

accum_loss = 0

for curw, nextw in \

zip(seq, seq[1:]):

x = Variable(curw)

t = Variable(nextw)

loss, h = fwd1step(h, x, t)

accum_loss += loss

return accum_loss

Long Short-Term Memory

l LSTM

l F.lstm

l 2 2

4

4 2

u LSTM 4 OK

36

LSTM RNN

# Model definition


model = FunctionSet(

emb=F.EmbedID(1000, 50*4),

h2h=F.Linear( 50, 50*4),

h2y=F.Linear( 50, 1000))


opt = optimizers.SGD()

opt.setup(model)

# Forward computation of one step

def fwd1step(c, h, w, t):

x = model.emb(w)

c, h = F.lstm(

c, x + model.h2h(h))

y = model.h2y(h)


return F.softmax_cross_entropy(

y, t), c, h

# Full RNN forward computation

def forward(seq):

c = Variable(...) # init cell

h = Variable(...) # init state

accum_loss = 0

for curw, nextw in \

zip(seq, seq[1:]):

x = Variable(curw)

t = Variable(nextw)

loss, c, h = fwd1step(

c, h, x, t)

accum_loss += loss

return accum_loss

unchain_backward Truncated BPTT

l Truncated BPTT unchain_backward()

Variable

Python Variable

accum_loss = 0

for i, x in enumerate(batches):

loss, h = forward_one_step(*x) # forward

accum_loss += loss

if i % 30 == 0:

optimizer.zero_grads()

accum_loss.backward() # backward

accum_loss.unchain_backward() # truncate graph

optimizer.update()

accum_loss = 0

38

Examples

examples

l mnist: MNIST

l imagenet: ImageNet ConvNet

l modelzoo: Caffe

l ptb: Penn Treebank LSTM

Truncated BPTT

l word2vec: word2vec PTB

l sentiment: Recursive Net

39

l

l Chainer

l Chainer

l Chainer

40

Technology

Chainerの使い方と自然言語処理への応用