Chapter 5: Backpropagation

Preparation

library(tidyverse)
library(R6)
source("common/functions.R")

Implementing simple layers

Implementing the multiplication layer
MulLayer <- R6Class("MulLayer", list(
  x = NULL,
  y = NULL,
  forward = function(x, y) {
    # Remember both inputs; backward() needs them.
    self$x <- x
    self$y <- y
    x * y
  },
  backward = function(dout) {
    dx <- dout * self$y
    dy <- dout * self$x
    c(dx, dy)
  }
))
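Since z = x * y gives dz/dx = y and dz/dy = x, backward() returns the upstream derivative multiplied by the other stored input. A quick sanity check of that swap:

m_check <- MulLayer$new()
m_check$forward(3, 4)  # 12
m_check$backward(1)    # c(4, 3): dz/dx = y = 4, dz/dy = x = 3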
Forward pass

apple <- 100
apple_num <- 2
tax <- 1.1
mul_apple_layer <- MulLayer$new()
mul_tax_layer <- MulLayer$new()
apple_price <- mul_apple_layer$forward(apple, apple_num)
price <- mul_tax_layer$forward(apple_price, tax)

price
[1] 220
Backward pass (derivatives with respect to each variable)
dprice <- 1
b1 <- mul_tax_layer$backward(dprice) %>%
  set_names(c("dapple_price", "dtax"))
b2 <- mul_apple_layer$backward(b1["dapple_price"]) %>%
  set_names(c("dapple", "dapple_num"))

b2
    dapple dapple_num
       2.2      110.0

b1
dapple_price         dtax
         1.1        200.0
Implementing the addition layer
AddLayer <- R6Class("AddLayer", list(
  forward = function(x, y) {
    x + y
  },
  backward = function(dout) {
    # The add gate passes the upstream derivative through unchanged.
    dx <- dout * 1
    dy <- dout * 1
    c(dx, dy)
  }
))
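Because d(x + y)/dx = d(x + y)/dy = 1, the add gate simply copies the upstream derivative to both inputs:

a_check <- AddLayer$new()
a_check$forward(3, 4)  # 7
a_check$backward(2)    # c(2, 2): dout passes through unchanged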
Buying two apples and three oranges

apple <- 100
apple_num <- 2
orange <- 150
orange_num <- 3
tax <- 1.1
# layer
mul_apple_layer <- MulLayer$new()
mul_orange_layer <- MulLayer$new()
add_apple_orange_layer <- AddLayer$new()
mul_tax_layer <- MulLayer$new()
# forward
apple_price <- mul_apple_layer$forward(apple, apple_num)
orange_price <- mul_orange_layer$forward(orange, orange_num)
all_price <- add_apple_orange_layer$forward(apple_price, orange_price)
price <- mul_tax_layer$forward(all_price, tax)
# backward
dprice <- 1
b1 <- mul_tax_layer$backward(dprice) %>%
  set_names(c("dall_price", "dtax"))
b2 <- add_apple_orange_layer$backward(b1["dall_price"]) %>%
  set_names(c("dapple_price", "dorange_price"))
b3 <- mul_orange_layer$backward(b2["dorange_price"]) %>%
  set_names(c("dorange", "dorange_num"))
b4 <- mul_apple_layer$backward(b2["dapple_price"]) %>%
  set_names(c("dapple", "dapple_num"))

price
[1] 715

b4
    dapple dapple_num
       2.2      110.0

b3
   dorange dorange_num
       3.3       165.0

b2
 dapple_price dorange_price
          1.1           1.1

b1
dall_price       dtax
       1.1      650.0
Implementing the activation function layers
The ReLU layer
Relu <- R6Class("Relu", list(
  positive = NULL,
  forward = function(x) {
    # Remember which elements were positive; backward() reuses the mask.
    self$positive <- x > 0
    x * self$positive
  },
  backward = function(dout) {
    dout * self$positive
  }
))
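The mask saved in forward() is all backward() needs: gradient flows only through elements that were positive on the way in. A quick check:

r_check <- Relu$new()
r_check$forward(c(-2, 0, 3))  # 0 0 3: non-positive inputs are clipped
r_check$backward(c(1, 1, 1))  # 0 0 1: gradient only where x > 0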
The Sigmoid layer

Sigmoid <- R6Class("Sigmoid", list(
  out = NULL,
  forward = function(x) {
    # Cache the output; the backward pass is written in terms of it.
    self$out <- 1 / (1 + exp(-x))
    self$out
  },
  backward = function(dout) {
    # dsigmoid/dx = out * (1 - out)
    dout * (1 - self$out) * self$out
  }
))
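The backward pass uses the identity dsigmoid/dx = out * (1 - out), written entirely in terms of the cached output. Comparing it against a central difference (plogis is base R's sigmoid):

s_check <- Sigmoid$new()
s_check$forward(0.5)
h <- 1e-4
analytic <- s_check$backward(1)
numeric <- (plogis(0.5 + h) - plogis(0.5 - h)) / (2 * h)
all.equal(analytic, numeric, tolerance = 1e-6)  # TRUE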
Implementing the Affine/Softmax layers

The batch version of the Affine layer
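The forward pass below adds the bias to every row of the batch with rep_row. That helper is assumed to come from common/functions.R; a minimal sketch of what it needs to do:

# Assumed helper: stack a 1-row matrix n times row-wise.
rep_row <- function(x, n) {
  matrix(rep(as.numeric(x), each = n), nrow = n)
}
rep_row(matrix(1:3, 1), 2)  # two identical rows: 1 2 3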
Affine <- R6Class("Affine", list(
  W = NULL,
  b = NULL,
  x = NULL,
  dW = NULL,
  db = NULL,
  initialize = function(W, b) {
    self$W <- W
    self$b <- b
  },
  forward = function(x) {
    self$x <- x
    # Add the bias to every row of the batch.
    x %*% self$W + rep_row(self$b, nrow(x))
  },
  backward = function(dout) {
    self$dW <- t(self$x) %*% dout
    # The bias gradient sums dout over the batch dimension.
    self$db <- matrix(apply(dout, 2, sum), 1)
    dout %*% t(self$W)
  }
))
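A shape check on a small batch (2 samples, 3 inputs, 4 outputs), assuming rep_row behaves as sketched above:

set.seed(0)
af <- Affine$new(W = matrix(rnorm(12), 3, 4), b = matrix(0, 1, 4))
x_small <- matrix(rnorm(6), 2, 3)
dim(af$forward(x_small))             # 2 4
dim(af$backward(matrix(1, 2, 4)))    # 2 3: dx matches the shape of x
dim(af$dW)                           # 3 4: dW matches the shape of W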
The Softmax-with-Loss layer

SoftmaxWithLoss <- R6Class("SoftmaxWithLoss", list(
  loss = NULL,
  y = NULL,
  t = NULL,
  forward = function(x, t) {
    self$t <- t
    # Apply softmax row by row, then measure the cross-entropy loss.
    self$y <- base::t(apply(x, 1, softmax))
    self$loss <- cross_entropy_error(self$y, self$t)
    self$loss
  },
  backward = function(dout) {
    # The combined softmax + cross-entropy gradient is simply (y - t),
    # averaged over the batch.
    batch_size <- nrow(self$t)
    (self$y - self$t) / batch_size
  }
))
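SoftmaxWithLoss relies on softmax and cross_entropy_error from common/functions.R. Minimal sketches consistent with how they are called above (softmax on one row vector at a time, cross-entropy over a batch of one-hot targets):

# Assumed helpers; the real definitions live in common/functions.R.
softmax <- function(a) {
  a <- a - max(a)  # shift for numerical stability
  exp(a) / sum(exp(a))
}
cross_entropy_error <- function(y, t) {
  # Mean cross-entropy over the batch; 1e-7 guards against log(0).
  -sum(t * log(y + 1e-7)) / nrow(y)
}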
Implementing backpropagation

Implementing a neural network that supports backpropagation
TwoLayerNet <- R6Class("TwoLayerNet", list(
  params = NULL,
  layers = NULL,
  last_layer = NULL,
  initialize = function(input_size, hidden_size, output_size,
                        weight_init_std = 0.01) {
    self$params <- list(
      W1 = matrix(rnorm(input_size * hidden_size, sd = weight_init_std),
                  input_size, hidden_size),
      b1 = matrix(0, ncol = hidden_size),
      W2 = matrix(rnorm(hidden_size * output_size, sd = weight_init_std),
                  hidden_size, output_size),
      b2 = matrix(0, ncol = output_size)
    )
    self$update_layers()
  },
  update_layers = function() {
    # Rebuild the layers so they point at the current parameters.
    self$layers <- list(
      Affine1 = Affine$new(self$params$W1, self$params$b1),
      Relu1 = Relu$new(),
      Affine2 = Affine$new(self$params$W2, self$params$b2)
    )
    self$last_layer <- SoftmaxWithLoss$new()
  },
  predict = function(x) {
    for (layer in self$layers) {
      x <- layer$forward(x)
    }
    x
  },
  loss = function(x, t) {
    y <- self$predict(x)
    self$last_layer$forward(y, t)
  },
  loss_function = function(name, x, t) {
    # Return a closure over one parameter, for numerical differentiation.
    w_orig <- self$params[[name]]
    function(w) {
      self$params[[name]] <- w
      self$update_layers()
      loss <- self$loss(x, t)
      self$params[[name]] <- w_orig
      self$update_layers()
      loss
    }
  },
  accuracy = function(x, t) {
    y <- (self$predict(x) %>%
      apply(1, which.max)) - 1
    if (is.matrix(t)) {
      t <- (t %>%
        apply(1, which.max)) - 1
    }
    mean(y == t)
  },
  numerical_gradient = function(x, t) {
    list(
      W1 = numerical_gradient(self$loss_function("W1", x, t), self$params$W1),
      b1 = numerical_gradient(self$loss_function("b1", x, t), self$params$b1),
      W2 = numerical_gradient(self$loss_function("W2", x, t), self$params$W2),
      b2 = numerical_gradient(self$loss_function("b2", x, t), self$params$b2)
    )
  },
  gradient = function(x, t) {
    # forward
    self$loss(x, t)
    # backward: start from the loss and walk the layers in reverse.
    dout <- 1
    dout <- self$last_layer$backward(dout)
    for (layer in rev(self$layers)) {
      dout <- layer$backward(dout)
    }
    # Each Affine layer stored its own dW and db during backward().
    list(
      W1 = self$layers$Affine1$dW,
      b1 = self$layers$Affine1$db,
      W2 = self$layers$Affine2$dW,
      b2 = self$layers$Affine2$db
    )
  }
))
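A quick smoke test on random data (4 samples, 6 features, 3 classes), assuming the helpers from common/functions.R are loaded:

set.seed(0)
net <- TwoLayerNet$new(input_size = 6, hidden_size = 5, output_size = 3)
x_small <- matrix(rnorm(24), 4, 6)
t_small <- diag(3)[sample(3, 4, replace = TRUE), ]  # random one-hot targets
net$loss(x_small, t_small)           # about log(3) = 1.1 before training
str(net$gradient(x_small, t_small))  # one gradient per parameter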
Gradient check for backpropagation

A function to convert labels to one-hot representation
to_one_hot <- function(labels) {
  matrix(labels, 1) %>%
    apply(2, function(x) {
      # One slot per class; MNIST has the 10 classes 0-9.
      v <- rep(0, 10)
      v[x + 1] <- 1
      v
    }) %>%
    t
}

to_one_hot(c(0, 2, 9))
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    0    0    0    0    0    0    0    0     0
[2,]    0    0    1    0    0    0    0    0    0     0
[3,]    0    0    0    0    0    0    0    0    0     1
Gradient check
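The check below relies on numerical_gradient from common/functions.R. A minimal central-difference sketch consistent with how it is called (a scalar-valued f and a matrix of parameters):

# Assumed helper: central-difference gradient of f at x, element by element.
numerical_gradient <- function(f, x) {
  h <- 1e-4
  grad <- x
  for (i in seq_along(x)) {
    orig <- x[i]
    x[i] <- orig + h
    fxh1 <- f(x)
    x[i] <- orig - h
    fxh2 <- f(x)
    grad[i] <- (fxh1 - fxh2) / (2 * h)
    x[i] <- orig
  }
  grad
}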
mnist <- dslabs::read_mnist(path = "input/mnist/")
x_train <- mnist$train$images / 255
t_train <- to_one_hot(mnist$train$labels)
set.seed(1)
network <- TwoLayerNet$new(input_size = 784, hidden_size = 50, output_size = 10)
x_batch <- x_train %>% head(3)
t_batch <- t_train %>% head(3)
grad_numerical <- network$numerical_gradient(x_batch, t_batch)
grad_backprop <- network$gradient(x_batch, t_batch)
map2(grad_numerical, grad_backprop, ~ mean(abs(.x - .y)))

$W1
[1] 4.062289e-10

$b1
[1] 2.386305e-09

$W2
[1] 4.953931e-09

$b2
[1] 1.403384e-07
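Every difference is on the order of 1e-7 or smaller, so the backpropagation gradients agree with the numerical ones.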
Training with backpropagation
x_test <- mnist$test$images / 255
t_test <- to_one_hot(mnist$test$labels)
set.seed(1)
network <- TwoLayerNet$new(input_size = 784, hidden_size = 50, output_size = 10)
iters_num <- 10000
train_size <- nrow(x_train)
batch_size <- 100
learning_rate <- 0.1
iter_per_epoch <- max(train_size %/% batch_size, 1)
acc_df <- map_dfr(1:iters_num, function(i) {
  batch_mask <- sample(train_size, batch_size)
  x_batch <- x_train[batch_mask, ]
  t_batch <- t_train[batch_mask, ]
  # Update each parameter by gradient descent, then rebuild the layers.
  grad <- network$gradient(x_batch, t_batch)
  for (name in names(grad)) {
    network$params[[name]] <- network$params[[name]] - learning_rate * grad[[name]]
  }
  network$update_layers()
  loss <- network$loss(x_batch, t_batch)
  # Record accuracy once per epoch.
  train_acc <- test_acc <- NA
  if (i %% iter_per_epoch == 1) {
    train_acc <- network$accuracy(x_train, t_train)
    test_acc <- network$accuracy(x_test, t_test)
  }
  lst(loss, train_acc, test_acc)
})

acc_df %>% filter(!is.na(train_acc))
# A tibble: 17 × 3
     loss train_acc test_acc
    <dbl>     <dbl>    <dbl>
 1 2.30      0.0872   0.0835
 2 0.241     0.905    0.910
 3 0.254     0.923    0.926
 4 0.243     0.937    0.937
 5 0.0995    0.944    0.941
 6 0.114     0.950    0.949
 7 0.126     0.956    0.953
 8 0.0897    0.961    0.958
 9 0.0906    0.965    0.964
10 0.0401    0.965    0.961
11 0.150     0.967    0.964
12 0.0623    0.971    0.966
13 0.0763    0.974    0.969
14 0.0678    0.974    0.970
15 0.0427    0.975    0.970
16 0.0848    0.978    0.971
17 0.0259    0.978    0.971
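Training accuracy reaches about 98% and test accuracy about 97%, with the two tracking each other closely throughout, and each of the 10000 iterations is far cheaper than a numerical-gradient step would be.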