library(tidyverse)
library(R6)
Chapter 4: Neural Network Training
Preparation
source("common/functions.R")
Loss functions
Cross-entropy error
cross_entropy_error <- function(y, t) {
  # Promote vector inputs to 1-row matrices so single and batch inputs share one code path
  if (is.vector(y)) {
    y <- matrix(y, 1)
    t <- matrix(t, 1)
  }
  # If t is one-hot (same shape as y), convert it to 0-based class labels
  if (identical(dim(t), dim(y))) {
    t <- apply(t, 1, which.max) - 1
  }
  batch_size <- nrow(y)
  # 1e-7 guards against log(0); labels are 0-based, hence the + 1 index shift
  -sum(log(map2_dbl(1:batch_size, t, ~ y[.x, .y + 1]) + 1e-7)) / batch_size
}
y <- c(0.1, 0.05, 0.6, 0.0, 0.05, 0.1, 0.0, 0.1, 0.0, 0.0)
One-hot representation
t <- c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0)
cross_entropy_error(y, t)
[1] 0.5108255
Label
Class labels are 0-based here: the label 2 refers to the third element of y, which is why the function indexes y[.x, .y + 1].
cross_entropy_error(y, 2)
[1] 0.5108255
Batch data
y2 <- matrix(seq(0.1, 0.6, 0.1), 2, byrow = TRUE)
t2 <- matrix(c(0, 1, 0, 0, 0, 1), 2, byrow = TRUE)
cross_entropy_error(y2, t2)
[1] 1.060131
cross_entropy_error(y2, c(1, 2))
[1] 1.060131
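Because the function divides the summed loss by batch_size, the batch result should equal the mean of the per-row losses. A quick sanity check (not part of the original text):
mean(c(cross_entropy_error(y2[1, ], t2[1, ]),
       cross_entropy_error(y2[2, ], t2[2, ])))
# expect the same value as above: 1.060131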
Gradient
Numerical gradient
numerical_gradient <- function(f, x) {
  h <- 1e-4
  if (is.vector(x)) x <- matrix(x, 1)
  grad <- matrix(0, nrow(x), ncol(x))
  for (i in 1:nrow(x)) {
    for (j in 1:ncol(x)) {
      # Central difference: perturb one element by +h and -h
      x1 <- identity(x)
      x1[i, j] <- x[i, j] + h
      x2 <- identity(x)
      x2[i, j] <- x[i, j] - h
      grad[i, j] <- (f(x1) - f(x2)) / (2 * h)
    }
  }
  grad
}
function_2 <- function(x) sum(x ^ 2)
numerical_gradient(function_2, c(3.0, 4.0))
[,1] [,2]
[1,] 6 8
numerical_gradient(function_2, c(0.0, 2.0))
[,1] [,2]
[1,] 0 4
numerical_gradient(function_2, c(3.0, 0.0))
[,1] [,2]
[1,] 6 0
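For function_2(x) = sum(x^2) the analytic gradient is 2 * x, so the numerical estimate can be verified directly; a minimal check (not in the original):
all.equal(numerical_gradient(function_2, c(3.0, 4.0)),
          matrix(c(6, 8), 1))
# expect TRUE: central differences are exact for quadratics, up to rounding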
Gradient method
Gradient descent
gradient_descent <- function(f, init_x, lr = 0.01, step_num = 100) {
  x <- init_x
  for (i in 1:step_num) {
    x <- x - lr * numerical_gradient(f, x)
  }
  x
}
Or, equivalently, with purrr::reduce:
gradient_descent <- function(f, init_x, lr = 0.01, step_num = 100) {
  reduce(1:step_num, ~ .x - lr * numerical_gradient(f, .x), .init = init_x)
}
init_x <- c(-3.0, 4.0)
gradient_descent(function_2, init_x, lr = 0.1, step_num = 100)
[,1] [,2]
[1,] -6.111108e-10 8.148144e-10
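The learning rate matters. As an illustration (the values below are assumptions, mirroring the usual too-large/too-small experiment), an overly large lr diverges while a tiny one barely leaves the starting point:
gradient_descent(function_2, init_x, lr = 10.0, step_num = 100)   # oscillates and blows up
gradient_descent(function_2, init_x, lr = 1e-10, step_num = 100)  # stays near (-3, 4)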
The update process of the gradient method
gradient_descent_history <- function(f, init_x, lr = 0.01, step_num = 100) {
  accumulate(1:step_num, ~ .x - lr * numerical_gradient(f, .x), .init = init_x)
}
x_history <- gradient_descent_history(function_2, matrix(init_x, 1), lr = 0.1, step_num = 20)
x_history_df <- x_history %>%
  reduce(rbind) %>%
  as.data.frame() %>%
  set_names("x0", "x1")
head(x_history_df)
x0 x1
1 -3.00000 4.00000
2 -2.40000 3.20000
3 -1.92000 2.56000
4 -1.53600 2.04800
5 -1.22880 1.63840
6 -0.98304 1.31072
plot(x_history_df)
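Since ggplot2 is attached via tidyverse, the trajectory can also be drawn with it; a minimal sketch:
ggplot(x_history_df, aes(x0, x1)) +
  geom_path(linetype = "dashed") +  # update path toward the minimum at (0, 0)
  geom_point() +
  coord_fixed()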
Gradients for a neural network
<- R6Class("SimpleNet", list(
SimpleNet W = NULL,
initialize = function(W) self$W <- W,
predict = function(x) x %*% self$W,
loss = function(x, t) {
<- self$predict(x)
z <- softmax(z)
y cross_entropy_error(y, t)
},loss_function = function(x, t) {
<- self$W
w_orig function(w) {
$W <- w
self<- self$loss(x, t)
loss $W <- w_orig
self
loss
} }))
W <- matrix(c(0.47355232, 0.9977393, 0.84668094,
              0.85557411, 0.03563661, 0.69422093),
            2, byrow = TRUE)
net <- SimpleNet$new(W)
net <- c(0.6, 0.9)
x <- net$predict(x)
p p
[,1] [,2] [,3]
[1,] 1.054148 0.6307165 1.132807
t <- matrix(c(0, 0, 1), 1)
net$loss(x, t)
[1] 0.9280683
Gradient
f <- net$loss_function(x, t)
numerical_gradient(f, net$W)
[,1] [,2] [,3]
[1,] 0.2192476 0.1435624 -0.362810
[2,] 0.3288714 0.2153436 -0.544215
f is a function that takes the weights and returns the loss.
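With that closure, one hypothetical gradient-descent step on the weights could look like this (the learning rate 0.1 is an assumption for illustration; note it overwrites net$W):
dW <- numerical_gradient(f, net$W)
net$W <- net$W - 0.1 * dW  # single update step
net$loss(x, t)             # expect a somewhat smaller loss than 0.9280683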
Implementing the training algorithm
A two-layer neural network class
<- R6Class("TwoLayerNet", list(
TwoLayerNet params = NULL,
initialize = function(input_size, hidden_size, output_size,
weight_init_std = 0.01) {
$params <- list()
self$params$W1 <- matrix(rnorm(input_size * hidden_size, sd = weight_init_std),
self
input_size, hidden_size)$params$b1 <- matrix(0, ncol = hidden_size)
self$params$W2 <- matrix(rnorm(hidden_size * output_size, sd = weight_init_std),
self
hidden_size, output_size)$params$b2 <- matrix(0, ncol = output_size)
self
},predict = function(x) {
<- nrow(x)
n if (is.null(n)) n <- 1
<- x %*% self$params$W1 + rep_row(self$params$b1, n)
a1 <- sigmoid(a1)
z1 <- z1 %*% self$params$W2 + rep_row(self$params$b2, n)
a2
softmax(a2)
},loss = function(x, t) {
<- self$predict(x)
y cross_entropy_error(y, t)
},loss_function = function(name, x, t) {
<- self$params[[name]]
w_orig function(w) {
$params[[name]] <- w
self<- self$loss(x, t)
loss $params[[name]] <- w_orig
self
loss
}
},numerical_gradient = function(x, t) {
list(
W1 = numerical_gradient(self$loss_function("W1", x, t), self$params$W1),
b1 = numerical_gradient(self$loss_function("b1", x, t), self$params$b1),
W2 = numerical_gradient(self$loss_function("W2", x, t), self$params$W2),
b2 = numerical_gradient(self$loss_function("b2", x, t), self$params$b2)
) }))
net <- TwoLayerNet$new(input_size = 784, hidden_size = 100, output_size = 10)
net$params %>% map(dim)
$W1
[1] 784 100
$b1
[1] 1 100
$W2
[1] 100 10
$b2
[1] 1 10
Dummy input data and target labels
x <- matrix(runif(100 * 784), 100)
t <- matrix(runif(100 * 10), 100)
Compute the gradients
grads <- net$numerical_gradient(x, t)
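A single hypothetical parameter update with these gradients could then look like the sketch below (lr is an assumed value; note that numerical differentiation at this parameter count is very slow):
lr <- 0.1  # assumed learning rate, for illustration only
for (name in names(grads)) {
  net$params[[name]] <- net$params[[name]] - lr * grads[[name]]
}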