Chapter 5: Backpropagation

Preparation

library(tidyverse)
library(R6)
source("common/functions.R")
Implementing simple layers

Implementing the multiplication layer
<- R6Class("MulLayer", list(
MulLayer x = NULL,
y = NULL,
forward = function(x, y) {
$x <- x
self$y <- y
self* y
x
},backward = function(dout) {
<- dout * self$y
dx <- dout * self$x
dy c(dx, dy)
}))
Forward propagation
apple <- 100
apple_num <- 2
tax <- 1.1

mul_apple_layer <- MulLayer$new()
mul_tax_layer <- MulLayer$new()

apple_price <- mul_apple_layer$forward(apple, apple_num)
price <- mul_tax_layer$forward(apple_price, tax)
price
[1] 220
Backward propagation (derivatives with respect to each variable)
dprice <- 1
b1 <- mul_tax_layer$backward(dprice) %>%
  set_names(c("dapple_price", "dtax"))
b2 <- mul_apple_layer$backward(b1["dapple_price"]) %>%
  set_names(c("dapple", "dapple_num"))
b2
dapple dapple_num
2.2 110.0
b1
dapple_price dtax
1.1 200.0
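As a sanity check, these values match the analytic derivatives of price = apple * apple_num * tax (assuming apple, apple_num and tax from above are still in scope):

apple_num * tax    # dapple     = 2.2
apple * tax        # dapple_num = 110
apple * apple_num  # dtax       = 200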
Implementing the addition layer
<- R6Class("AddLayer", list(
AddLayer forward = function(x, y) {
+ y
x
},backward = function(dout) {
<- dout * 1
dx <- dout * 1
dy c(dx, dy)
}))
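A minimal check of the addition layer (the object name add_layer is just for illustration); the backward pass simply passes the upstream gradient through to both inputs:

add_layer <- AddLayer$new()
add_layer$forward(3, 5)  # 8
add_layer$backward(2)    # 2 2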
Buying 2 apples and 3 oranges
apple <- 100
apple_num <- 2
orange <- 150
orange_num <- 3
tax <- 1.1

# layer
mul_apple_layer <- MulLayer$new()
mul_orange_layer <- MulLayer$new()
add_apple_orange_layer <- AddLayer$new()
mul_tax_layer <- MulLayer$new()

# forward
apple_price <- mul_apple_layer$forward(apple, apple_num)
orange_price <- mul_orange_layer$forward(orange, orange_num)
all_price <- add_apple_orange_layer$forward(apple_price, orange_price)
price <- mul_tax_layer$forward(all_price, tax)

# backward
dprice <- 1
b1 <- mul_tax_layer$backward(dprice) %>%
  set_names(c("dall_price", "dtax"))
b2 <- add_apple_orange_layer$backward(b1["dall_price"]) %>%
  set_names(c("dapple_price", "dorange_price"))
b3 <- mul_orange_layer$backward(b2["dorange_price"]) %>%
  set_names(c("dorange", "dorange_num"))
b4 <- mul_apple_layer$backward(b2["dapple_price"]) %>%
  set_names(c("dapple", "dapple_num"))

price
[1] 715
b4
dapple dapple_num
2.2 110.0
b3
dorange dorange_num
3.3 165.0
b2
dapple_price dorange_price
1.1 1.1
b1
dall_price dtax
1.1 650.0
Implementing activation function layers

ReLU layer
<- R6Class("Relu", list(
Relu positive = NULL,
forward = function(x) {
$positive = x > 0
self* self$positive
x
},backward = function(dout) {
* self$positive
dout }))
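A quick check of the ReLU layer on a small vector (the object name relu is illustrative):

relu <- Relu$new()
relu$forward(c(-2, 0, 3))  # 0 0 3: non-positive inputs are blocked
relu$backward(c(1, 1, 1))  # 0 0 1: the gradient only flows where the input was positive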
Sigmoid layer
<- R6Class("Sigmoid", list(
Sigmoid out = NULL,
forward = function(x) {
$out = 1 / (1 + exp(-x))
self$out
self
},backward = function(dout) {
* (1 - self$out) * self$out
dout }))
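The backward pass uses the identity dy/dx = y(1 - y) for y = sigmoid(x). A small check against a central-difference approximation (the values are illustrative):

sigmoid <- Sigmoid$new()
sigmoid$forward(0.5)
sigmoid$backward(1)  # analytic derivative at x = 0.5
h <- 1e-4
(1 / (1 + exp(-(0.5 + h))) - 1 / (1 + exp(-(0.5 - h)))) / (2 * h)  # should agree closely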
Implementing the Affine/Softmax layers

Batch version of the Affine layer
<- R6Class("Affine", list(
Affine W = NULL,
b = NULL,
x = NULL,
dW = NULL,
db = NULL,
initialize = function(W, b) {
$W <- W
self$b <- b
self
},forward = function(x) {
$x <- x
self%*% self$W + rep_row(self$b, nrow(x))
x
},backward = function(dout) {
$dW <- t(self$x) %*% dout
self$db <- matrix(apply(dout, 2, sum), 1)
self%*% t(self$W)
dout }))
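A minimal shape check with a 2-sample batch (W, b, x and affine are illustrative values, and rep_row() is assumed to stack the bias row nrow(x) times):

W <- matrix(1:6, 2, 3)            # 2 inputs -> 3 outputs
b <- matrix(c(1, 2, 3), 1)
affine <- Affine$new(W, b)
x <- matrix(c(1, 2, 3, 4), 2, 2)  # batch of 2
affine$forward(x)                  # 2 x 3 output
affine$backward(matrix(1, 2, 3))   # dx: 2 x 2; affine$dW: 2 x 3; affine$db: 1 x 3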
Softmax-with-Loss layer
<- R6Class("SoftmaxWithLoss", list(
SoftmaxWithLoss loss = NULL,
y = NULL,
t = NULL,
forward = function(x, t) {
$t <- t
self$y <- base::t(apply(x, 1, softmax))
self$loss <- cross_entropy_error(self$y, self$t)
self$loss
self
},backward = function(dout) {
<- nrow(self$t)
batch_size $y - self$t) / batch_size
(self }))
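A small check with a 2-sample batch and one-hot targets (swl, scores and labels are local names for this example):

swl <- SoftmaxWithLoss$new()
scores <- rbind(c(0.3, 2.9, 4.0),
                c(0.1, 0.2, 0.7))  # 2 samples, 3 classes
labels <- rbind(c(0, 0, 1),
                c(1, 0, 0))        # one-hot targets
swl$forward(scores, labels)        # mean cross-entropy loss over the batch
swl$backward(1)                    # (y - t) / batch_size, same shape as scores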
Implementing backpropagation

Implementing a neural network that supports backpropagation
<- R6Class("TwoLayerNet", list(
TwoLayerNet params = NULL,
layers = NULL,
last_layer = NULL,
initialize = function(input_size, hidden_size, output_size,
weight_init_std = 0.01) {
$params <- list(
selfW1 = matrix(rnorm(input_size * hidden_size, sd = weight_init_std),
input_size, hidden_size),b1 = matrix(0, ncol = hidden_size),
W2 = matrix(rnorm(hidden_size * output_size, sd = weight_init_std),
hidden_size, output_size),b2 = matrix(0, ncol = output_size)
)
$update_layers()
self
},update_layers = function() {
$layers <- list(
selfAffine1 = Affine$new(self$params$W1, self$params$b1),
Relu1 = Relu$new(),
Affine2 = Affine$new(self$params$W2, self$params$b2)
)
$last_layer = SoftmaxWithLoss$new()
self
},predict = function(x) {
for (layer in self$layers) {
<- layer$forward(x)
x
}
x
},loss = function(x, t) {
<- self$predict(x)
y $last_layer$forward(y, t)
self
},loss_function = function(name, x, t) {
<- self$params[[name]]
w_orig function(w) {
$params[[name]] <- w
self$update_layers()
self<- self$loss(x, t)
loss $params[[name]] <- w_orig
self$update_layers()
self
loss
}
},accuracy = function(x, t) {
<- self$predict(x) %>%
y apply(1, which.max) - 1
if (is.matrix(t)) {
<- t %>%
t apply(1, which.max) - 1
}
mean(y == t)
},numerical_gradient = function(x, t) {
list(
W1 = numerical_gradient(self$loss_function("W1", x, t), self$params$W1),
b1 = numerical_gradient(self$loss_function("b1", x, t), self$params$b1),
W2 = numerical_gradient(self$loss_function("W2", x, t), self$params$W2),
b2 = numerical_gradient(self$loss_function("b2", x, t), self$params$b2)
)
},gradient = function(x, t) {
# forward
$loss(x, t)
self
# backward
<- 1
dout <- self$last_layer$backward(dout)
dout
for (layer in rev(self$layers)) {
<- layer$backward(dout)
dout
}
list(
W1 = self$layers$Affine1$dW,
b1 = self$layers$Affine1$db,
W2 = self$layers$Affine2$dW,
b2 = self$layers$Affine2$db
) }))
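A quick shape check on random data before moving to MNIST (the sizes and names here are arbitrary):

set.seed(0)
net <- TwoLayerNet$new(input_size = 4, hidden_size = 3, output_size = 2)
x_demo <- matrix(rnorm(8), 2, 4)                  # batch of 2
t_demo <- rbind(c(1, 0), c(0, 1))                 # one-hot labels
dim(net$predict(x_demo))                          # 2 x 2 class scores
net$loss(x_demo, t_demo)                          # roughly log(2) for untrained weights
str(net$gradient(x_demo, t_demo), max.level = 1)  # gradients with the same shapes as the parameters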
Gradient check for backpropagation

A function that converts labels to one-hot representation
to_one_hot <- function(labels) {
  matrix(labels, 1) %>%
    apply(2, function(x) {
      v <- rep(0, 10)
      v[x + 1] <- 1
      v
    }) %>%
    t
}
to_one_hot(c(0, 2, 9))
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1 0 0 0 0 0 0 0 0 0
[2,] 0 0 1 0 0 0 0 0 0 0
[3,] 0 0 0 0 0 0 0 0 0 1
Gradient check
mnist <- dslabs::read_mnist(path = "input/mnist/")
x_train <- mnist$train$images / 255
t_train <- to_one_hot(mnist$train$labels)

set.seed(1)
network <- TwoLayerNet$new(input_size = 784, hidden_size = 50, output_size = 10)

x_batch <- x_train %>% head(3)
t_batch <- t_train %>% head(3)

grad_numerical <- network$numerical_gradient(x_batch, t_batch)
grad_backprop <- network$gradient(x_batch, t_batch)

map2(grad_numerical, grad_backprop, ~ mean(abs(.x - .y)))
$W1
[1] 4.062289e-10
$b1
[1] 2.386305e-09
$W2
[1] 4.953931e-09
$b2
[1] 1.403384e-07
Training with backpropagation
x_test <- mnist$test$images / 255
t_test <- to_one_hot(mnist$test$labels)

set.seed(1)
network <- TwoLayerNet$new(input_size = 784, hidden_size = 50, output_size = 10)

iters_num <- 10000
train_size <- nrow(x_train)
batch_size <- 100
learning_rate <- 0.1

iter_per_epoch <- max(train_size %/% batch_size, 1)

acc_df <- map_dfr(1:iters_num, function(i) {
  batch_mask <- sample(train_size, batch_size)
  x_batch <- x_train[batch_mask, ]
  t_batch <- t_train[batch_mask, ]

  grad <- network$gradient(x_batch, t_batch)

  # SGD update, then rebuild the layers with the new parameters
  for (name in names(grad)) {
    network$params[[name]] <- network$params[[name]] - learning_rate * grad[[name]]
  }
  network$update_layers()

  loss <- network$loss(x_batch, t_batch)

  # record accuracy once per epoch
  train_acc <- test_acc <- NA
  if (i %% iter_per_epoch == 1) {
    train_acc <- network$accuracy(x_train, t_train)
    test_acc <- network$accuracy(x_test, t_test)
  }
  lst(loss, train_acc, test_acc)
})

acc_df %>% filter(!is.na(train_acc))
# A tibble: 17 × 3
loss train_acc test_acc
<dbl> <dbl> <dbl>
1 2.30 0.0872 0.0835
2 0.241 0.905 0.910
3 0.254 0.923 0.926
4 0.243 0.937 0.937
5 0.0995 0.944 0.941
6 0.114 0.950 0.949
7 0.126 0.956 0.953
8 0.0897 0.961 0.958
9 0.0906 0.965 0.964
10 0.0401 0.965 0.961
11 0.150 0.967 0.964
12 0.0623 0.971 0.966
13 0.0763 0.974 0.969
14 0.0678 0.974 0.970
15 0.0427 0.975 0.970
16 0.0848 0.978 0.971
17 0.0259 0.978 0.971
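To visualize the learning curve, the per-epoch accuracies can be plotted directly from acc_df (a sketch using ggplot2 and tidyr, both loaded via tidyverse):

acc_df %>%
  filter(!is.na(train_acc)) %>%
  mutate(epoch = row_number()) %>%
  pivot_longer(c(train_acc, test_acc), names_to = "dataset", values_to = "accuracy") %>%
  ggplot(aes(epoch, accuracy, colour = dataset)) +
  geom_line() +
  geom_point()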