Upload
others
View
1
Download
0
Embed Size (px)
Citation preview
Hidden layerInput layer Output layerHidden layer Hidden layer
Vector
Vector
Vector
Vector
Vector
k − 1 x(k−1) kx(k)
k
$$x^{(k)} := \sigma\bigl(W^{(k)} x^{(k-1)} + b^{(k)}\bigr),$$
W (k) b(k)
σ : R → R
σ
$$\sigma_1(x) := \frac{1}{1 + e^{-x}}.$$
$$\sigma_2(x) := \begin{cases} x, & x \ge 0, \\ \alpha x, & x < 0, \end{cases}$$
α α = 1/10 α = 1/100
σ3(x) := max(x, 0).
σ4(x) := ln(1 + exp(x)).
σ5(x) := tanh(x).
σ2x < 0 σ3
α
σ1([0, 1/2], (1/2, 1]) tanh
([−1,−1/3], (−1/3, 1/3], (1/3, 1])
1
NNActivation f d
module NN
import PyPlot
"""
    Activation(f, d)

Pair of an activation function `f` and its derivative `d`.

Parametrizing on the concrete function types (rather than typing both
fields as the abstract `Function`) keeps field access type-stable; the
constructor `Activation(f, d)` is unchanged for callers.
"""
struct Activation{F<:Function,D<:Function}
    f::F  # activation function, applied element-wise to layer inputs
    d::D  # derivative of `f`, used during backpropagation
end
"""
    sigma(x::Real)

Logistic sigmoid `1 / (1 + e^(-x))`.

Generalized from the original `Float64`-only signature to any `Real`;
the result type follows from the input type, so calls with `Float64`
behave exactly as before.
"""
function sigma(x::Real)
    return 1 / (1 + exp(-x))
end
# Sigmoid activation with derivative σ'(x) = σ(x)(1 - σ(x)).
# Evaluate `sigma(x)` once and reuse it — the original called it twice
# per derivative evaluation.
const sigmoid = Activation(sigma, x -> (s = sigma(x); s * (1 - s)))
Cost
"""
    Cost(f, delta)

Cost function pair: `f(a, y)` evaluates the cost of network output `a`
against target `y`; `delta(z, a, y)` gives the output-layer error used
to seed backpropagation.

Parametric function fields keep access type-stable; the positional
constructor is unchanged.
"""
struct Cost{F<:Function,D<:Function}
    f::F
    delta::D
end
"""
    quadratic_cost(activation::Activation)

Build the quadratic cost ½‖a − y‖² together with its output-layer delta
(a − y) ⊙ σ'(z) for the given activation.
"""
function quadratic_cost(activation::Activation)
    quad = (a, y) -> 0.5 * norm(a - y)^2
    delta = (z, a, y) -> (a - y) .* activation.d.(z)
    return Cost(quad, delta)
end
# Cross-entropy cost C = Σ [-y ln(a) - (1 - y) ln(1 - a)]; its delta
# simplifies to a - y (the σ'(z) factor cancels), avoiding the
# vanishing-gradient slowdown of the quadratic cost.
# Scalar-vector operations are now explicitly broadcast (`1 .- y`);
# the original `1 - y` relied on deprecated auto-vectorization.
const cross_entropy_cost = Cost(
    (a, y) -> sum(-y .* log.(a) .- (1 .- y) .* log.(1 .- a)),
    (z, a, y) -> a - y)
Networkweights biases
sizes
Networknew
sqrt(j)
"""
    Network(sizes; activation = sigmoid, cost = quadratic_cost(activation),
            scale_weights = true)

Fully connected feed-forward network. `sizes` lists the neuron count per
layer, e.g. `Network([784, 30, 10])`. Weights and biases are drawn from a
standard normal distribution; with `scale_weights` each weight matrix is
additionally divided by √(fan-in) to keep initial activations from
saturating. The four history vectors collect per-epoch cost/accuracy
during training.
"""
mutable struct Network
    activation::Activation
    cost::Cost
    n_layers::Int
    sizes::Vector{Int}
    weights::Vector{Array{Float64, 2}}
    biases::Vector{Vector{Float64}}
    training_cost::Vector{Float64}
    evaluation_cost::Vector{Float64}
    training_accuracy::Vector{Float64}
    evaluation_accuracy::Vector{Float64}

    function Network(sizes; activation = sigmoid,
                     cost = quadratic_cost(activation),
                     scale_weights = true)
        # One weight matrix per layer transition: rows = fan-out, cols = fan-in.
        weights = [randn(rows, cols) / (scale_weights ? sqrt(cols) : 1)
                   for (rows, cols) in zip(sizes[2:end], sizes[1:end-1])]
        biases = [randn(rows) for rows in sizes[2:end]]
        new(activation, cost, length(sizes), sizes, weights, biases,
            [], [], [], [])
    end
end
Network Network([100, 10, 1])
a W*a+bnn.activation.f
"""
    feed_forward(nn::Network, input::Vector{Float64})

Propagate `input` through the network, applying `a ↦ σ.(W*a + b)` at
each layer, and return the output-layer activation.
"""
function feed_forward(nn::Network, input::Vector{Float64})
    activations = input
    for (W, b) in zip(nn.weights, nn.biases)
        activations = nn.activation.f.(W * activations + b)
    end
    return activations
end
φ
nf Rd
σ
KRd f ∈ C(K,R) ϵ ∈ R+
σ
n bi ∈ R vi ∈ Rwi ∈ Rd i ∈ {1, . . . , d}
maxx∈K
|φ(x)− f(fx)| < ϵ
φ
$$\varphi(x) := \sum_{i=1}^{n} v_i \, \sigma(w_i \cdot x + b_i),$$
φ C(K)
C(K,RD) D
g ∈ C([a, b],R)
ψj(x) :=
{cj , x ∈ [aj , bj),
0, .
2m
ψ(x) =m∑
j=1
ψj(x) ≈ σ−1 ◦ g(x)
[a, b] g ∈C([a, b]) σ m
f ∈C(Rd,R)
ψ1j((x1, x2)) :=
{c1j , x1 ∈ [a1j , b1j),
0, ,
ψ2k((x1, x2)) :=
{c2k, x2 ∈ [a2k, b2k),
0,
ψjk((x1, x2)) :=
{cjk, (x1, x2) ∈
([a1j , b1j), [a2k, b2k)
),
0, ,
ψjk
f ∈ C(R2,R)
ψ((x1, x2)) :=
mj∑
j=1
mk∑
k=1
ψjk((x1, x2)) ≈ σ−1 ◦ f((x1, x2))
σd
C(R,R)
Nf(x)/N x
n
⊓(
28 × 28
1 7
MNIST Pkg.add(”MNIST”)
MNIST_n_rows MNIST_n_cols
MNIST_read_images[0, 1]
# Paths to the four MNIST data files (training/test × images/labels), as
# laid out by the Julia MNIST package (Julia 0.6-era `Pkg.dir`).
global MNIST_file_training_images = Pkg.dir("MNIST", "data", "train-images.idx3-ubyte")
global MNIST_file_training_labels = Pkg.dir("MNIST", "data", "train-labels.idx1-ubyte")
global MNIST_file_test_images = Pkg.dir("MNIST", "data", "t10k-images.idx3-ubyte")
global MNIST_file_test_labels = Pkg.dir("MNIST", "data", "t10k-labels.idx1-ubyte")
# Image dimensions in pixels; assigned as a side effect of `MNIST_read_images`.
global MNIST_n_rows, MNIST_n_cols
"""
    MNIST_read_images(filename::String)

Read an MNIST idx3-ubyte image file and return one flattened
`Vector{Float64}` of length rows*cols per image, with pixel values
scaled into [0, 1]. Sets the globals `MNIST_n_rows`/`MNIST_n_cols` as a
side effect. Header fields are big-endian, hence `bswap`.
"""
function MNIST_read_images(filename::String)
    open(filename, "r") do s
        # NOTE(review): the magic number is read only to advance the
        # stream and is not validated — assumes trusted input files.
        local magic_number = bswap(read(s, UInt32))
        local n_items = Int(bswap(read(s, UInt32)))
        global MNIST_n_rows = Int(bswap(read(s, UInt32)))
        global MNIST_n_cols = Int(bswap(read(s, UInt32)))
        [Vector{Float64}(read(s, UInt8, MNIST_n_rows * MNIST_n_cols)) ./ typemax(UInt8)
         for i in 1:n_items]
    end
end
"""
    MNIST_read_labels(filename::String)

Read an MNIST idx1-ubyte label file and return the labels (digits 0-9)
as a `Vector{Int}`. Header fields are big-endian, hence `bswap`.
"""
function MNIST_read_labels(filename::String)
    open(filename, "r") do s
        local magic_number = bswap(read(s, UInt32))  # header field, unvalidated
        local n_items = Int(bswap(read(s, UInt32)))
        [Int(read(s, UInt8)) for i in 1:n_items]
    end
end
vectorize n10 n
"""
    vectorize(n::Integer)

One-hot encode the digit `n` ∈ 0:9 as a length-10 `Vector{Float64}`
(component `n+1` is 1, all others 0).

Throws `ArgumentError` for digits outside 0:9 — the original silently
raised an opaque `BoundsError` instead.
"""
function vectorize(n::Integer)
    0 <= n <= 9 || throw(ArgumentError("digit must be in 0:9, got $n"))
    result = zeros(10)
    result[n + 1] = 1
    return result
end
load_MNIST_data
"""
    load_MNIST_data()

Load all four MNIST files and return the 6-tuple
`(training_x, training_y, validation_x, validation_y, test_x, test_y)`:
the first 50 000 training images with one-hot labels, the remaining
10 000 as a validation set with integer labels, and the 10 000 test
images with integer labels.
"""
function load_MNIST_data()
    train_x = MNIST_read_images(MNIST_file_training_images)
    train_y = MNIST_read_labels(MNIST_file_training_labels)
    test_x = MNIST_read_images(MNIST_file_test_images)
    test_y = MNIST_read_labels(MNIST_file_test_labels)
    return (train_x[1:50_000], vectorize.(train_y[1:50_000]),
            train_x[50_001:60_000], train_y[50_001:60_000],
            test_x, test_y)
end
# Load the data once at module load time; `@time` reports the duration.
global (training_data_x, training_data_y,
        validation_data_x, validation_data_y,
        test_data_x, test_data_y) = @time load_MNIST_data()
PyPlot
"""
    plot_digit(n::Int, file = nothing)

Plot MNIST digit number `n` (1-50 000: training set, 50 001-60 000:
validation set, 60 001-70 000: test set). If `file` is a `String`, the
plot is also saved as `"<file><n>.pdf"`.

Throws `ArgumentError` when `n` is out of range — the original fell
through with `v` undefined, raising an opaque `UndefVarError`.
"""
function plot_digit(n::Int, file=nothing)
    local v
    if 1 <= n <= 50_000
        v = training_data_x[n]
    elseif 50_001 <= n <= 60_000
        v = validation_data_x[n - 50_000]
    elseif 60_001 <= n <= 70_000
        v = test_data_x[n - 60_000]
    else
        throw(ArgumentError("digit index must be in 1:70_000, got $n"))
    end
    # Transpose so the image is displayed in row-major orientation.
    PyPlot.matshow(reshape(v, (MNIST_n_rows, MNIST_n_cols))', cmap="Blues")
    PyPlot.axis("off")
    if isa(file, String)
        PyPlot.savefig(file * string(n) * ".pdf", bbox_inches="tight", pad_inches=0)
    end
end
x ∈ R784
28 · 28 = 784
y(x) ∈ R10
y : R784 → R10
a : R784 → R10
$$C_2(W, b) := \frac{1}{2n} \sum_{x \in T} \|y(x) - a(x)\|_2^2,$$
n := |T | xT
W b1/n
n1/2 2
2
$$C(W, b) := -\frac{1}{n} \sum_{x \in T} \bigl( y(x) \cdot \ln a(x) + (1 - y(x)) \cdot \ln(1 - a(x)) \bigr),$$
C
pC
∇C =
⎛
⎜⎝
∂C∂p1
∂C∂pn
⎞
⎟⎠
∂C
∂e(p)
C p e
∂C
∂e(p) = ∇C(p) · e
C pp
C(p) eC
∣∣∣∣∂C
∂e(p)
∣∣∣∣ = |∇C(p) · e| ≤ |∇C(p)|,
|e| = 1
e = ∇C(p)
∂C
∂e(p) = ∇C(p) ·∇C(p) = ∥∇C(p)∥22 ≥ 0,
Ce = −∇C(p)
∂C
∂e(p) = −∇C(p) ·∇C(p) = −∥∇C(p)∥22 ≤ 0,
CC ∆p
p∆p := −η∇C(p),
η ∈ R+
∆C
∆C ≈ ∇C(p) ·∆p,
∆C ≈ −η∇C(p) ·∇C(p) = −η∥∇C(p)∥22 ≤ 0
∆p
SGD
λ
"""
    SGD(nn, training_data_x, training_data_y, epochs, batch_size, eta,
        lambda = 0.0; evaluation_data_x = [], evaluation_data_y = [],
        monitor_training_cost = true, monitor_evaluation_cost = true,
        monitor_training_accuracy = true, monitor_evaluation_accuracy = true)

Train `nn` by mini-batch stochastic gradient descent for `epochs` epochs
with learning rate `eta` and ℓ² regularization strength `lambda`. Each
epoch shuffles the training set, walks it in batches of `batch_size`,
and (optionally) records cost and accuracy on the training and
evaluation sets into the network's history vectors. Returns `nn`.
"""
function SGD(nn::Network,
             training_data_x::Vector{Vector{Float64}},
             training_data_y::Vector{Vector{Float64}},
             epochs::Int, batch_size::Int, eta::Float64,
             lambda::Float64 = 0.0;
             evaluation_data_x::Vector{Vector{Float64}} = [],
             evaluation_data_y::Union{Vector{Int64}, Vector{Vector{Float64}}} = [],
             monitor_training_cost = true,
             monitor_evaluation_cost = true,
             monitor_training_accuracy = true,
             monitor_evaluation_accuracy = true)
    # Reset the per-epoch monitoring histories from any previous run.
    nn.training_cost = []
    nn.evaluation_cost = []
    nn.training_accuracy = []
    nn.evaluation_accuracy = []
    n = length(training_data_x)
    for epoch in 1:epochs
        # Shuffle the training set, then process it mini-batch by mini-batch.
        order = randperm(n)
        for start in 1:batch_size:n
            batch = order[start:min(start + batch_size - 1, end)]
            update!(nn, training_data_x[batch], training_data_y[batch],
                    eta, lambda, n)
        end
        info(@sprintf("Epoch %d done", epoch))
        if monitor_training_cost
            push!(nn.training_cost,
                  total_cost(nn, training_data_x, training_data_y, lambda))
            info(@sprintf("Cost on training data: %f", nn.training_cost[end]))
        end
        if monitor_evaluation_cost
            push!(nn.evaluation_cost,
                  total_cost(nn, evaluation_data_x, evaluation_data_y, lambda))
            info(@sprintf("Cost on evaluation data: %f", nn.evaluation_cost[end]))
        end
        if monitor_training_accuracy
            hits = accuracy(nn, training_data_x, training_data_y)
            ratio = hits / n
            info(@sprintf("Accuracy on training data: %5d / %5d = %5.1f%% correct",
                          hits, n, 100 * ratio))
            push!(nn.training_accuracy, ratio)
        end
        if monitor_evaluation_accuracy
            hits = accuracy(nn, evaluation_data_x, evaluation_data_y)
            total = length(evaluation_data_x)
            ratio = hits / total
            info(@sprintf("Accuracy on evaluation data: %5d / %5d = %5.1f%% correct",
                          hits, total, 100 * ratio))
            push!(nn.evaluation_accuracy, ratio)
        end
    end
    return nn
end
# Integer-label convenience method: one-hot encode the labels, then
# delegate to the vector-label `total_cost`.
function total_cost(nn::Network,
                    data_x::Vector{Vector{Float64}},
                    data_y::Vector{Int64}, lambda::Float64)
    return total_cost(nn, data_x, vectorize.(data_y), lambda)
end
"""
    total_cost(nn, data_x, data_y, lambda)

Mean cost of `nn` over the data set plus the ℓ² regularization term
(λ / 2n) Σ‖W‖², summed over all weight matrices.
"""
function total_cost(nn::Network,
                    data_x::Vector{Vector{Float64}},
                    data_y::Vector{Vector{Float64}}, lambda::Float64)
    n = length(data_x)
    data_term = sum(map((x, y) -> nn.cost.f(feed_forward(nn, x), y),
                        data_x, data_y)) / n
    reg_term = 0.5 * lambda * sum(vecnorm(W)^2 for W in nn.weights) / n
    return data_term + reg_term
end
# Number of inputs whose most activated output neuron (its index minus 1
# is the predicted digit) matches the integer label.
function accuracy(nn::Network,
                  data_x::Vector{Vector{Float64}},
                  data_y::Vector{Int64})
    return count(map((x, y) -> y == indmax(feed_forward(nn, x)) - 1,
                     data_x, data_y))
end
# One-hot-label convenience method: convert each target vector back to
# its digit, then delegate to the integer-label `accuracy`.
function accuracy(nn::Network,
                  data_x::Vector{Vector{Float64}},
                  data_y::Vector{Vector{Float64}})
    return accuracy(nn, data_x, [indmax(y) - 1 for y in data_y])
end
update!η
λ = 0propagate_back
"""
    update!(nn, batch_x, batch_y, eta, lambda, n)

Apply one stochastic-gradient-descent step to `nn` for the mini-batch
`(batch_x, batch_y)`, with learning rate `eta`, ℓ² regularization
strength `lambda`, and total training-set size `n` (which scales the
weight-decay factor 1 - ηλ/n). Returns the updated network.
"""
function update!(nn::Network,
                 batch_x::Vector{Vector{Float64}},
                 batch_y::Vector{Vector{Float64}},
                 eta::Float64, lambda::Float64, n::Int)
    # Accumulate gradients over the whole mini-batch.
    grad_W = [zeros(W) for W in nn.weights]
    grad_b = [zeros(b) for b in nn.biases]
    for (x, y) in zip(batch_x, batch_y)
        (dW, db) = propagate_back(nn, x, y)
        grad_W += dW
        grad_b += db
    end
    # Weight decay from ℓ² regularization, then the averaged gradient step.
    step = eta / length(batch_x)
    nn.weights = (1 - eta * lambda / n) * nn.weights - step * grad_W
    nn.biases -= step * grad_b
    return nn
end
propagate_back
l
(l) = σ(W (l) (l−1) + b(l)),
σl
(l) := W (l) (l−1) + b(l).
(l) = σ( (l))
∂C
∂W (l)ij
∂C
∂b(l)i
CW (l) b(l)
CC = (1/n)
∑x K(x)
x
C = C(a(l))
C z(l)i i l
δ(l)i :=∂C
∂z(l)i
.
i lL
δ(L)i =
∑
k
∂C
∂a(L)k
∂a(L)k
∂z(L)i
∀i,
k L
σ ∂a(L)k /∂z(L)
ik = i
δ(L)i =
∂C
∂a(L)i
σ′(z(L)i ) ∀i
$$\delta^{(L)} = \frac{\partial C}{\partial a^{(L)}} \odot \sigma'(z^{(L)}),$$
⊙σ′
δ(l) δ(l+1)
δ(l)i =∂C
∂z(l)i
=∑
k
∂C
∂z(l+1)k
∂z(l+1)k
∂z(l)i
=∑
k
∂z(l+1)k
∂z(l)i
δ(l+1)k .
(l+1) = W (l+1) (l) + b(l+1) = W (l+1)σ( (l)) + b(l+1)
(l+1)k =
∑
i
w(l+1)ki σ( (l)
i ) + b(l+1)k
∂z(l+1)k
∂z(l)i
= w(l+1)ki σ′( (l)
i )
δ(l)i =∑
k
w(l+1)ki σ′( (l)
i )δ(l+1)k
δ(l) =((W (l+1))⊤δ(l+1)
)⊙ σ′( (l)).
Cδ(l)
∂C
∂w(l)ij
=∑
k
∂C
∂z(l)k
∂z(l)k
∂w(l)ij
,
∂C
∂b(l)i
=∑
k
∂C
∂z(l)k
∂z(l)k
∂b(l)i
.
∂C/∂z(l)k = δ(l)k
(l)k =
∑
j
w(l)kj
(l−1)j + b(l)
k
∂z(l)k
∂w(l)ij
=
{(l−1)j , i = k,
0, i ̸= k,
∂z(l)k
∂b(l)i
=
{1, i = k,
0, i ̸= k.
$$\frac{\partial C}{\partial w^{(l)}_{ij}} = \delta^{(l)}_i \, a^{(l-1)}_j, \qquad \frac{\partial C}{\partial b^{(l)}_i} = \delta^{(l)}_i.$$
C δ(l) z(l)
a(l)
δ(l)
δ(L) Lδ(l)
δ(l+1) ∂C/∂w(l)ij ∂C/∂b(l)
i
propagate_backgrad_W grad_b
z aa[l] = a(l−1) a[1]
deltaδ(L)
delta Costgrad_W[end] grad_b[end] l = L
deltagrad_W[l] grad_b[l]
a[l] = a(l−1)
"""
    propagate_back(nn::Network, x, y)

Backpropagation for a single training example `(x, y)`: returns the
tuple `(grad_W, grad_b)` of cost gradients with respect to every weight
matrix and bias vector of `nn`.
"""
function propagate_back(nn::Network, x::Vector{Float64}, y::Vector{Float64})
    grad_W = [zeros(W) for W in nn.weights]
    grad_b = [zeros(b) for b in nn.biases]
    # Forward pass, storing weighted inputs z[i] and activations a[i];
    # a[i] holds a^(i-1), so a[1] is the network input.
    z = Vector(nn.n_layers - 1)
    a = Vector(nn.n_layers)
    a[1] = x
    for (i, (W, b)) in enumerate(zip(nn.weights, nn.biases))
        z[i] = W * a[i] + b
        a[i + 1] = nn.activation.f.(z[i])
    end
    # Output-layer error δ^(L), as supplied by the cost function.
    delta = nn.cost.delta(z[end], a[end], y)
    grad_W[end] = delta * a[end - 1]'
    grad_b[end] = delta
    # Walk backwards: δ^(l) = ((W^(l+1))ᵀ δ^(l+1)) ⊙ σ'(z^(l)).
    for l in nn.n_layers-2:-1:1
        delta = (nn.weights[l + 1]' * delta) .* nn.activation.d.(z[l])
        grad_W[l] = delta * a[l]'
        grad_b[l] = delta
    end
    return (grad_W, grad_b)
end
end # module
srand
# Reproducible demo run: seed the RNG, then train a 784-30-10 network
# for 100 epochs with mini-batches of 10 and learning rate η = 3.0,
# evaluating against the test set.
srand(0)
NN.SGD(NN.Network([NN.MNIST_n_rows * NN.MNIST_n_cols, 30, 10]),
       NN.training_data_x, NN.training_data_y,
       100, 10, 3.0,
       evaluation_data_x = NN.test_data_x,
       evaluation_data_y = NN.test_data_y)
NN
η
SGD
100
C0 w
W (l)ij ℓ2
$$C_{\ell_2}(W, b, \lambda) := C_0(W, b) + \frac{\lambda}{2n} \|w\|_2^2 = C_0(W, b) + \frac{\lambda}{2n} \sum_k w_k^2 = C_0(W, b) + \frac{\lambda}{2n} \sum_{l,i,j} \bigl(W^{(l)}_{ij}\bigr)^2,$$
λ ∈ R+0 n
1/n C0
ℓp
Cℓp(W,b,λ) := C0(W,b) +λ
pn∥w∥pp = C0(W,b) +
λ
pn
∑
k
|wk|p.
λC0
∂Cℓ2
∂w=∂C0
∂w+λ
nw,
∂Cℓ2
∂b=∂C0
∂b,
∆w := −η ∂C0
∂w− ηλ
nw,
∆b := −η ∂C0
∂b.
∆w w
(1− ηλ
n
)w − η
∂C0
∂w,
update!
λ = 1.0
# Same training run as the unregularized demo, but with ℓ² regularization
# strength λ = 1.0 passed as the seventh positional argument.
NN.SGD(NN.Network([NN.MNIST_n_rows * NN.MNIST_n_cols, 30, 10]),
       NN.training_data_x, NN.training_data_y,
       100, 10, 3.0, 1.0,
       evaluation_data_x = NN.test_data_x,
       evaluation_data_y = NN.test_data_y)
δ(l)
σ′(z(l)) σ′(z(l))
σ′(z(l))σ2 σ5
σ′(z(l)) δ(L)
C
∂C
∂b= a− y.
C b(L)i b(L)
L b a(L)i a yi
y
∂C
∂b=∂C
∂aσ′(z).
σ1 σ′1(z) = σ1(z)(1− σ1(z)) = a(1− a)
a− y =∂C
∂b=∂C
∂aa(1− a),
∂C
∂a=
a− y
a(1− a)= −y
a+
1− y
1− a
∂C/∂a
C = −y ln a− (1− y) ln(1− a) + const.
C
C (W,b) := − 1
n
∑
x∈T
(y(x) · lna(L)(x) + (1− y(x)) · ln(1− a(L)(x))
),
L
δ(L) =∂C
∂a(L)⊙ σ′(z(L))
= − 1
n
∑
x∈T
(y(x)⊘ a(L)(x)− (1− y(x))⊘ (1− a(L)(x))
)⊙ a(L)(x)⊙ (1− a(L)(x))
=1
n
∑
x∈T
a(L)(x)− y(x),
⊘
∂C
∂w(L)ij
= δ(L)i a(L−1)
j =1
n
∑
x∈T
(a(L)i (x)− yi(x))a
(L−1)j ,
∂C
∂b(L)= δ(L) =
1
n
∑
x∈T
a(L)(x)− y(x).
σ′(z(L)
C2
σ L a(L)j = z(L)
j
δ(L) = a(L) − y
∂C2
∂w(L)ij
=1
n
∑
x∈T
(a(L)i − z(L)
i )a(L−1)j ,
∂C2
∂b(L)=
1
n
∑
x∈T
(a(L) − z(L)),
L
σi i ∈ {1, 2, 3, 4, 5}limx→±∞ σi(x)
i ∈ {1, 2, 3, 4, 5}
σi i ∈ {1, 2, 3, 4, 5}
σi i ∈ {1, 2, 3, 4, 5} SGD
σi i ∈ {1, 2, 3, 4, 5}
∀u ∈ Rd : ∀v ∈ Rd : |⟨u,v⟩|2 ≤ ⟨u,u⟩⟨v,v⟩,
ηµ ∈ R+ µ ≈ 1 µλ λ λ/µ
η
forSGD
ℓ1
ℓp
ℓ1 ℓ2