Upload
others
View
1
Download
0
Embed Size (px)
Citation preview
Hidden layerInput layer Output layerHidden layer Hidden layer
Vector
Vector
Vector
Vector
Vector
k − 1 x(k−1) kx(k)
k
$$x^{(k)} := \sigma\bigl(W^{(k)} x^{(k-1)} + b^{(k)}\bigr),$$
W (k) b(k)
σ : R → R
σ
$$\sigma_1(x) := \frac{1}{1 + e^{-x}}.$$
$$\sigma_2(x) := \begin{cases} x, & x \ge 0, \\ \alpha x, & x < 0, \end{cases}$$
α α = 1/10 α = 1/100
σ3(x) := max(x, 0).
σ4(x) := ln(1 + exp(x)).
σ5(x) := tanh(x).
σ2x < 0 σ3
α
σ1([0, 1/2], (1/2, 1]) tanh
([−1,−1/3], (−1/3, 1/3], (1/3, 1])
1
NNActivation f d
module NN
import PyPlot
"""
    Activation(f, d)

Pair of an activation function `f` and its derivative `d`.

Parametrizing on the concrete function types (rather than typing both
fields as the abstract `Function`) keeps field access type-stable; the
constructor `Activation(f, d)` is unchanged for callers.
"""
struct Activation{F<:Function,D<:Function}
    f::F  # activation function, applied element-wise to layer inputs
    d::D  # derivative of `f`, used during backpropagation
end
"""
    sigma(x::Real)

Logistic sigmoid `1 / (1 + e^(-x))`.

Generalized from the original `Float64`-only signature to any `Real`;
the result type follows from the input type, so calls with `Float64`
behave exactly as before.
"""
function sigma(x::Real)
    return 1 / (1 + exp(-x))
end
# Sigmoid activation with derivative σ'(x) = σ(x)(1 - σ(x)).
# Evaluate `sigma(x)` once and reuse it — the original called it twice
# per derivative evaluation.
const sigmoid = Activation(sigma, x -> (s = sigma(x); s * (1 - s)))
Cost
"""
    Cost(f, delta)

Cost function pair: `f(a, y)` evaluates the cost of network output `a`
against target `y`; `delta(z, a, y)` gives the output-layer error used
to seed backpropagation.

Parametric function fields keep access type-stable; the positional
constructor is unchanged.
"""
struct Cost{F<:Function,D<:Function}
    f::F
    delta::D
end
"""
    quadratic_cost(activation::Activation)

Build the quadratic cost ½‖a − y‖² together with its output-layer delta
(a − y) ⊙ σ'(z) for the given activation.
"""
function quadratic_cost(activation::Activation)
    quad = (a, y) -> 0.5 * norm(a - y)^2
    delta = (z, a, y) -> (a - y) .* activation.d.(z)
    return Cost(quad, delta)
end
# Cross-entropy cost C = Σ [-y ln(a) - (1 - y) ln(1 - a)]; its delta
# simplifies to a - y (the σ'(z) factor cancels), avoiding the
# vanishing-gradient slowdown of the quadratic cost.
# Scalar-vector operations are now explicitly broadcast (`1 .- y`);
# the original `1 - y` relied on deprecated auto-vectorization.
const cross_entropy_cost = Cost(
    (a, y) -> sum(-y .* log.(a) .- (1 .- y) .* log.(1 .- a)),
    (z, a, y) -> a - y)
Networkweights biases
sizes
Networknew
sqrt(j)
"""
    Network(sizes; activation = sigmoid, cost = quadratic_cost(activation),
            scale_weights = true)

Fully connected feed-forward network. `sizes` lists the neuron count per
layer, e.g. `Network([784, 30, 10])`. Weights and biases are drawn from a
standard normal distribution; with `scale_weights` each weight matrix is
additionally divided by √(fan-in) to keep initial activations from
saturating. The four history vectors collect per-epoch cost/accuracy
during training.
"""
mutable struct Network
    activation::Activation
    cost::Cost
    n_layers::Int
    sizes::Vector{Int}
    weights::Vector{Array{Float64, 2}}
    biases::Vector{Vector{Float64}}
    training_cost::Vector{Float64}
    evaluation_cost::Vector{Float64}
    training_accuracy::Vector{Float64}
    evaluation_accuracy::Vector{Float64}

    function Network(sizes; activation = sigmoid,
                     cost = quadratic_cost(activation),
                     scale_weights = true)
        # One weight matrix per layer transition: rows = fan-out, cols = fan-in.
        weights = [randn(rows, cols) / (scale_weights ? sqrt(cols) : 1)
                   for (rows, cols) in zip(sizes[2:end], sizes[1:end-1])]
        biases = [randn(rows) for rows in sizes[2:end]]
        new(activation, cost, length(sizes), sizes, weights, biases,
            [], [], [], [])
    end
end
Network Network([100, 10, 1])
a W*a+bnn.activation.f
"""
    feed_forward(nn::Network, input::Vector{Float64})

Propagate `input` through the network, applying `a ↦ σ.(W*a + b)` at
each layer, and return the output-layer activation.
"""
function feed_forward(nn::Network, input::Vector{Float64})
    activations = input
    for (W, b) in zip(nn.weights, nn.biases)
        activations = nn.activation.f.(W * activations + b)
    end
    return activations
end
φ
nf Rd
σ
KRd f ∈ C(K,R) ϵ ∈ R+
σ
n bi ∈ R vi ∈ Rwi ∈ Rd i ∈ {1, . . . , d}
maxx∈K
|φ(x)− f(fx)| < ϵ
φ
$$\varphi(x) := \sum_{i=1}^{n} v_i \, \sigma(w_i \cdot x + b_i),$$
φ C(K)
C(K,RD) D
g ∈ C([a, b],R)
ψj(x) :=
{cj , x ∈ [aj , bj),
0, .
2m
ψ(x) =m∑
j=1
ψj(x) ≈ σ−1 ◦ g(x)
[a, b] g ∈C([a, b]) σ m
f ∈C(Rd,R)
ψ1j((x1, x2)) :=
{c1j , x1 ∈ [a1j , b1j),
0, ,
ψ2k((x1, x2)) :=
{c2k, x2 ∈ [a2k, b2k),
0,
ψjk((x1, x2)) :=
{cjk, (x1, x2) ∈
([a1j , b1j), [a2k, b2k)
),
0, ,
ψjk
f ∈ C(R2,R)
ψ((x1, x2)) :=
mj∑
j=1
mk∑
k=1
ψjk((x1, x2)) ≈ σ−1 ◦ f((x1, x2))
σd
C(R,R)
Nf(x)/N x
n
⊓(
28 × 28
1 7
MNIST Pkg.add(”MNIST”)
MNIST_n_rows MNIST_n_cols
MNIST_read_images[0, 1]
# Paths to the four MNIST data files (training/test × images/labels), as
# laid out by the Julia MNIST package (Julia 0.6-era `Pkg.dir`).
global MNIST_file_training_images = Pkg.dir("MNIST", "data", "train-images.idx3-ubyte")
global MNIST_file_training_labels = Pkg.dir("MNIST", "data", "train-labels.idx1-ubyte")
global MNIST_file_test_images = Pkg.dir("MNIST", "data", "t10k-images.idx3-ubyte")
global MNIST_file_test_labels = Pkg.dir("MNIST", "data", "t10k-labels.idx1-ubyte")
# Image dimensions in pixels; assigned as a side effect of `MNIST_read_images`.
global MNIST_n_rows, MNIST_n_cols
"""
    MNIST_read_images(filename::String)

Read an MNIST idx3-ubyte image file and return one flattened
`Vector{Float64}` of length rows*cols per image, with pixel values
scaled into [0, 1]. Sets the globals `MNIST_n_rows`/`MNIST_n_cols` as a
side effect. Header fields are big-endian, hence `bswap`.
"""
function MNIST_read_images(filename::String)
    open(filename, "r") do s
        # NOTE(review): the magic number is read only to advance the
        # stream and is not validated — assumes trusted input files.
        local magic_number = bswap(read(s, UInt32))
        local n_items = Int(bswap(read(s, UInt32)))
        global MNIST_n_rows = Int(bswap(read(s, UInt32)))
        global MNIST_n_cols = Int(bswap(read(s, UInt32)))
        [Vector{Float64}(read(s, UInt8, MNIST_n_rows * MNIST_n_cols)) ./ typemax(UInt8)
         for i in 1:n_items]
    end
end
"""
    MNIST_read_labels(filename::String)

Read an MNIST idx1-ubyte label file and return the labels (digits 0-9)
as a `Vector{Int}`. Header fields are big-endian, hence `bswap`.
"""
function MNIST_read_labels(filename::String)
    open(filename, "r") do s
        local magic_number = bswap(read(s, UInt32))  # header field, unvalidated
        local n_items = Int(bswap(read(s, UInt32)))
        [Int(read(s, UInt8)) for i in 1:n_items]
    end
end
vectorize n10 n
"""
    vectorize(n::Integer)

One-hot encode the digit `n` ∈ 0:9 as a length-10 `Vector{Float64}`
(component `n+1` is 1, all others 0).

Throws `ArgumentError` for digits outside 0:9 — the original silently
raised an opaque `BoundsError` instead.
"""
function vectorize(n::Integer)
    0 <= n <= 9 || throw(ArgumentError("digit must be in 0:9, got $n"))
    result = zeros(10)
    result[n + 1] = 1
    return result
end
load_MNIST_data
"""
    load_MNIST_data()

Load all four MNIST files and return the 6-tuple
`(training_x, training_y, validation_x, validation_y, test_x, test_y)`:
the first 50 000 training images with one-hot labels, the remaining
10 000 as a validation set with integer labels, and the 10 000 test
images with integer labels.
"""
function load_MNIST_data()
    train_x = MNIST_read_images(MNIST_file_training_images)
    train_y = MNIST_read_labels(MNIST_file_training_labels)
    test_x = MNIST_read_images(MNIST_file_test_images)
    test_y = MNIST_read_labels(MNIST_file_test_labels)
    return (train_x[1:50_000], vectorize.(train_y[1:50_000]),
            train_x[50_001:60_000], train_y[50_001:60_000],
            test_x, test_y)
end
# Load the data once at module load time; `@time` reports the duration.
global (training_data_x, training_data_y,
        validation_data_x, validation_data_y,
        test_data_x, test_data_y) = @time load_MNIST_data()
PyPlot
"""
    plot_digit(n::Int, file = nothing)

Plot MNIST digit number `n` (1-50 000: training set, 50 001-60 000:
validation set, 60 001-70 000: test set). If `file` is a `String`, the
plot is also saved as `"<file><n>.pdf"`.

Throws `ArgumentError` when `n` is out of range — the original fell
through with `v` undefined, raising an opaque `UndefVarError`.
"""
function plot_digit(n::Int, file=nothing)
    local v
    if 1 <= n <= 50_000
        v = training_data_x[n]
    elseif 50_001 <= n <= 60_000
        v = validation_data_x[n - 50_000]
    elseif 60_001 <= n <= 70_000
        v = test_data_x[n - 60_000]
    else
        throw(ArgumentError("digit index must be in 1:70_000, got $n"))
    end
    # Transpose so the image is displayed in row-major orientation.
    PyPlot.matshow(reshape(v, (MNIST_n_rows, MNIST_n_cols))', cmap="Blues")
    PyPlot.axis("off")
    if isa(file, String)
        PyPlot.savefig(file * string(n) * ".pdf", bbox_inches="tight", pad_inches=0)
    end
end
x ∈ R784
28 · 28 = 784
y(x) ∈ R10
y : R784 → R10
a : R784 → R10
$$C_2(W, b) := \frac{1}{2n} \sum_{x \in T} \|y(x) - a(x)\|_2^2,$$
n := |T | xT
W b1/n
n1/2 2
2
$$C(W, b) := -\frac{1}{n} \sum_{x \in T} \bigl( y(x) \cdot \ln a(x) + (1 - y(x)) \cdot \ln(1 - a(x)) \bigr),$$
C
pC
∇C =
⎛
⎜⎝
∂C∂p1
∂C∂pn
⎞
⎟⎠
∂C
∂e(p)
C p e
∂C
∂e(p) = ∇C(p) · e
C pp
C(p) eC
∣∣∣∣∂C
∂e(p)
∣∣∣∣ = |∇C(p) · e| ≤ |∇C(p)|,
|e| = 1
e = ∇C(p)
∂C
∂e(p) = ∇C(p) ·∇C(p) = ∥∇C(p)∥22 ≥ 0,
Ce = −∇C(p)
∂C
∂e(p) = −∇C(p) ·∇C(p) = −∥∇C(p)∥22 ≤ 0,
CC ∆p
p∆p := −η∇C(p),
η ∈ R+
∆C
∆C ≈ ∇C(p) ·∆p,
∆C ≈ −η∇C(p) ·∇C(p) = −η∥∇C(p)∥22 ≤ 0
∆p
SGD
λ
"""
    SGD(nn, training_data_x, training_data_y, epochs, batch_size, eta,
        lambda = 0.0; evaluation_data_x = [], evaluation_data_y = [],
        monitor_training_cost = true, monitor_evaluation_cost = true,
        monitor_training_accuracy = true, monitor_evaluation_accuracy = true)

Train `nn` by mini-batch stochastic gradient descent for `epochs` epochs
with learning rate `eta` and ℓ² regularization strength `lambda`. Each
epoch shuffles the training set, walks it in batches of `batch_size`,
and (optionally) records cost and accuracy on the training and
evaluation sets into the network's history vectors. Returns `nn`.
"""
function SGD(nn::Network,
             training_data_x::Vector{Vector{Float64}},
             training_data_y::Vector{Vector{Float64}},
             epochs::Int, batch_size::Int, eta::Float64,
             lambda::Float64 = 0.0;
             evaluation_data_x::Vector{Vector{Float64}} = [],
             evaluation_data_y::Union{Vector{Int64}, Vector{Vector{Float64}}} = [],
             monitor_training_cost = true,
             monitor_evaluation_cost = true,
             monitor_training_accuracy = true,
             monitor_evaluation_accuracy = true)
    # Reset the per-epoch monitoring histories from any previous run.
    nn.training_cost = []
    nn.evaluation_cost = []
    nn.training_accuracy = []
    nn.evaluation_accuracy = []
    n = length(training_data_x)
    for epoch in 1:epochs
        # Shuffle the training set, then process it mini-batch by mini-batch.
        order = randperm(n)
        for start in 1:batch_size:n
            batch = order[start:min(start + batch_size - 1, end)]
            update!(nn, training_data_x[batch], training_data_y[batch],
                    eta, lambda, n)
        end
        info(@sprintf("Epoch %d done", epoch))
        if monitor_training_cost
            push!(nn.training_cost,
                  total_cost(nn, training_data_x, training_data_y, lambda))
            info(@sprintf("Cost on training data: %f", nn.training_cost[end]))
        end
        if monitor_evaluation_cost
            push!(nn.evaluation_cost,
                  total_cost(nn, evaluation_data_x, evaluation_data_y, lambda))
            info(@sprintf("Cost on evaluation data: %f", nn.evaluation_cost[end]))
        end
        if monitor_training_accuracy
            hits = accuracy(nn, training_data_x, training_data_y)
            ratio = hits / n
            info(@sprintf("Accuracy on training data: %5d / %5d = %5.1f%% correct",
                          hits, n, 100 * ratio))
            push!(nn.training_accuracy, ratio)
        end
        if monitor_evaluation_accuracy
            hits = accuracy(nn, evaluation_data_x, evaluation_data_y)
            total = length(evaluation_data_x)
            ratio = hits / total
            info(@sprintf("Accuracy on evaluation data: %5d / %5d = %5.1f%% correct",
                          hits, total, 100 * ratio))
            push!(nn.evaluation_accuracy, ratio)
        end
    end
    return nn
end
# Integer-label convenience method: one-hot encode the labels, then
# delegate to the vector-label `total_cost`.
function total_cost(nn::Network,
                    data_x::Vector{Vector{Float64}},
                    data_y::Vector{Int64}, lambda::Float64)
    return total_cost(nn, data_x, vectorize.(data_y), lambda)
end
"""
    total_cost(nn, data_x, data_y, lambda)

Mean cost of `nn` over the data set plus the ℓ² regularization term
(λ / 2n) Σ‖W‖², summed over all weight matrices.
"""
function total_cost(nn::Network,
                    data_x::Vector{Vector{Float64}},
                    data_y::Vector{Vector{Float64}}, lambda::Float64)
    n = length(data_x)
    data_term = sum(map((x, y) -> nn.cost.f(feed_forward(nn, x), y),
                        data_x, data_y)) / n
    reg_term = 0.5 * lambda * sum(vecnorm(W)^2 for W in nn.weights) / n
    return data_term + reg_term
end
# Number of inputs whose most activated output neuron (its index minus 1
# is the predicted digit) matches the integer label.
function accuracy(nn::Network,
                  data_x::Vector{Vector{Float64}},
                  data_y::Vector{Int64})
    return count(map((x, y) -> y == indmax(feed_forward(nn, x)) - 1,
                     data_x, data_y))
end
# One-hot-label convenience method: convert each target vector back to
# its digit, then delegate to the integer-label `accuracy`.
function accuracy(nn::Network,
                  data_x::Vector{Vector{Float64}},
                  data_y::Vector{Vector{Float64}})
    return accuracy(nn, data_x, [indmax(y) - 1 for y in data_y])
end
update!η
λ = 0propagate_back
"""
    update!(nn, batch_x, batch_y, eta, lambda, n)

Apply one stochastic-gradient-descent step to `nn` for the mini-batch
`(batch_x, batch_y)`, with learning rate `eta`, ℓ² regularization
strength `lambda`, and total training-set size `n` (which scales the
weight-decay factor 1 - ηλ/n). Returns the updated network.
"""
function update!(nn::Network,
                 batch_x::Vector{Vector{Float64}},
                 batch_y::Vector{Vector{Float64}},
                 eta::Float64, lambda::Float64, n::Int)
    # Accumulate gradients over the whole mini-batch.
    grad_W = [zeros(W) for W in nn.weights]
    grad_b = [zeros(b) for b in nn.biases]
    for (x, y) in zip(batch_x, batch_y)
        (dW, db) = propagate_back(nn, x, y)
        grad_W += dW
        grad_b += db
    end
    # Weight decay from ℓ² regularization, then the averaged gradient step.
    step = eta / length(batch_x)
    nn.weights = (1 - eta * lambda / n) * nn.weights - step * grad_W
    nn.biases -= step * grad_b
    return nn
end
propagate_back
l
(l) = σ(W (l) (l−1) + b(l)),
σl
(l) := W (l) (l−1) + b(l).
(l) = σ( (l))
∂C
∂W (l)ij
∂C
∂b(l)i
CW (l) b(l)
CC = (1/n)
∑x K(x)
x
C = C(a(l))
C z(l)i i l
δ(l)i :=∂C
∂z(l)i
.
i lL
δ(L)i =
∑
k
∂C
∂a(L)k
∂a(L)k
∂z(L)i
∀i,
k L
σ ∂a(L)k /∂z(L)
ik = i
δ(L)i =
∂C
∂a(L)i
σ′(z(L)i ) ∀i
$$\delta^{(L)} = \frac{\partial C}{\partial a^{(L)}} \odot \sigma'(z^{(L)}),$$
⊙σ′
δ(l) δ(l+1)
δ(l)i =∂C
∂z(l)i
=∑
k
∂C
∂z(l+1)k
∂z(l+1)k
∂z(l)i
=∑
k
∂z(l+1)k
∂z(l)i
δ(l+1)k .
(l+1) = W (l+1) (l) + b(l+1) = W (l+1)σ( (l)) + b(l+1)
(l+1)k =
∑
i
w(l+1)ki σ( (l)
i ) + b(l+1)k
∂z(l+1)k
∂z(l)i
= w(l+1)ki σ′( (l)
i )
δ(l)i =∑
k
w(l+1)ki σ′( (l)
i )δ(l+1)k
δ(l) =((W (l+1))⊤δ(l+1)
)⊙ σ′( (l)).
Cδ(l)
∂C
∂w(l)ij
=∑
k
∂C
∂z(l)k
∂z(l)k
∂w(l)ij
,
∂C
∂b(l)i
=∑
k
∂C
∂z(l)k
∂z(l)k
∂b(l)i
.
∂C/∂z(l)k = δ(l)k
(l)k =
∑
j
w(l)kj
(l−1)j + b(l)
k
∂z(l)k
∂w(l)ij
=
{(l−1)j , i = k,
0, i ̸= k,
∂z(l)k
∂b(l)i
=
{1, i = k,
0, i ̸= k.
$$\frac{\partial C}{\partial w^{(l)}_{ij}} = \delta^{(l)}_i \, a^{(l-1)}_j, \qquad \frac{\partial C}{\partial b^{(l)}_i} = \delta^{(l)}_i.$$
C δ(l) z(l)
a(l)
δ(l)
δ(L) Lδ(l)
δ(l+1) ∂C/∂w(l)ij ∂C/∂b(l)
i
propagate_backgrad_W grad_b
z aa[l] = a(l−1) a[1]
deltaδ(L)
delta Costgrad_W[end] grad_b[end] l = L
deltagrad_W[l] grad_b[l]
a[l] = a(l−1)
"""
    propagate_back(nn::Network, x, y)

Backpropagation for a single training example `(x, y)`: returns the
tuple `(grad_W, grad_b)` of cost gradients with respect to every weight
matrix and bias vector of `nn`.
"""
function propagate_back(nn::Network, x::Vector{Float64}, y::Vector{Float64})
    grad_W = [zeros(W) for W in nn.weights]
    grad_b = [zeros(b) for b in nn.biases]
    # Forward pass, storing weighted inputs z[i] and activations a[i];
    # a[i] holds a^(i-1), so a[1] is the network input.
    z = Vector(nn.n_layers - 1)
    a = Vector(nn.n_layers)
    a[1] = x
    for (i, (W, b)) in enumerate(zip(nn.weights, nn.biases))
        z[i] = W * a[i] + b
        a[i + 1] = nn.activation.f.(z[i])
    end
    # Output-layer error δ^(L), as supplied by the cost function.
    delta = nn.cost.delta(z[end], a[end], y)
    grad_W[end] = delta * a[end - 1]'
    grad_b[end] = delta
    # Walk backwards: δ^(l) = ((W^(l+1))ᵀ δ^(l+1)) ⊙ σ'(z^(l)).
    for l in nn.n_layers-2:-1:1
        delta = (nn.weights[l + 1]' * delta) .* nn.activation.d.(z[l])
        grad_W[l] = delta * a[l]'
        grad_b[l] = delta
    end
    return (grad_W, grad_b)
end
end # module
srand
# Reproducible demo run: seed the RNG, then train a 784-30-10 network
# for 100 epochs with mini-batches of 10 and learning rate η = 3.0,
# evaluating against the test set.
srand(0)
NN.SGD(NN.Network([NN.MNIST_n_rows * NN.MNIST_n_cols, 30, 10]),
       NN.training_data_x, NN.training_data_y,
       100, 10, 3.0,
       evaluation_data_x = NN.test_data_x,
       evaluation_data_y = NN.test_data_y)
NN
η
SGD
100
C0 w
W (l)ij ℓ2
$$C_{\ell_2}(W, b, \lambda) := C_0(W, b) + \frac{\lambda}{2n} \|w\|_2^2 = C_0(W, b) + \frac{\lambda}{2n} \sum_k w_k^2 = C_0(W, b) + \frac{\lambda}{2n} \sum_{l,i,j} \bigl(W^{(l)}_{ij}\bigr)^2,$$
λ ∈ R+0 n
1/n C0
ℓp
Cℓp(W,b,λ) := C0(W,b) +λ
pn∥w∥pp = C0(W,b) +
λ
pn
∑
k
|wk|p.
λC0
∂Cℓ2
∂w=∂C0
∂w+λ
nw,
∂Cℓ2
∂b=∂C0
∂b,
∆w := −η ∂C0
∂w− ηλ
nw,
∆b := −η ∂C0
∂b.
∆w w
(1− ηλ
n
)w − η
∂C0
∂w,
update!
λ = 1.0
# Same training run as the unregularized demo, but with ℓ² regularization
# strength λ = 1.0 passed as the seventh positional argument.
NN.SGD(NN.Network([NN.MNIST_n_rows * NN.MNIST_n_cols, 30, 10]),
       NN.training_data_x, NN.training_data_y,
       100, 10, 3.0, 1.0,
       evaluation_data_x = NN.test_data_x,
       evaluation_data_y = NN.test_data_y)
δ(l)
σ′(z(l)) σ′(z(l))
σ′(z(l))σ2 σ5
σ′(z(l)) δ(L)
C
∂C
∂b= a− y.
C b(L)i b(L)
L b a(L)i a yi
y
∂C
∂b=∂C
∂aσ′(z).
σ1 σ′1(z) = σ1(z)(1− σ1(z)) = a(1− a)
a− y =∂C
∂b=∂C
∂aa(1− a),
∂C
∂a=
a− y
a(1− a)= −y
a+
1− y
1− a
∂C/∂a
C = −y ln a− (1− y) ln(1− a) + const.
C
C (W,b) := − 1
n
∑
x∈T
(y(x) · lna(L)(x) + (1− y(x)) · ln(1− a(L)(x))
),
L
δ(L) =∂C
∂a(L)⊙ σ′(z(L))
= − 1
n
∑
x∈T
(y(x)⊘ a(L)(x)− (1− y(x))⊘ (1− a(L)(x))
)⊙ a(L)(x)⊙ (1− a(L)(x))
=1
n
∑
x∈T
a(L)(x)− y(x),
⊘
∂C
∂w(L)ij
= δ(L)i a(L−1)
j =1
n
∑
x∈T
(a(L)i (x)− yi(x))a
(L−1)j ,
∂C
∂b(L)= δ(L) =
1
n
∑
x∈T
a(L)(x)− y(x).
σ′(z(L)
C2
σ L a(L)j = z(L)
j
δ(L) = a(L) − y
∂C2
∂w(L)ij
=1
n
∑
x∈T
(a(L)i − z(L)
i )a(L−1)j ,
∂C2
∂b(L)=
1
n
∑
x∈T
(a(L) − z(L)),
L
σi i ∈ {1, 2, 3, 4, 5}limx→±∞ σi(x)
i ∈ {1, 2, 3, 4, 5}
σi i ∈ {1, 2, 3, 4, 5}
σi i ∈ {1, 2, 3, 4, 5} SGD
σi i ∈ {1, 2, 3, 4, 5}
∀u ∈ Rd : ∀v ∈ Rd : |⟨u,v⟩|2 ≤ ⟨u,u⟩⟨v,v⟩,
ηµ ∈ R+ µ ≈ 1 µλ λ λ/µ
η
forSGD
ℓ1
ℓp
ℓ1 ℓ2