library(tidyverse)
library(R6)
Chapter 6: Techniques for Training
Preparation
source("common/functions.R")
Updating Parameters
SGD
<- R6Class("SGD", list(
SGD lr = NULL,
initialize = function(lr = 0.01) {
$lr <- lr
self
},update = function(params, grads) {
for (name in names(grads)) {
<- params[[name]] - self$lr * grads[[name]]
params[[name]]
}
params }))
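As a quick sanity check (parameter and gradient values chosen only for illustration), a single SGD step should move the weight by lr * grad:

opt <- SGD$new(lr = 0.1)
opt$update(params = list(W = 1.0), grads = list(W = 0.5))
# expected: list(W = 0.95), i.e. 1.0 - 0.1 * 0.5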
Momentum
<- R6Class("Momentum", list(
Momentum lr = NULL,
momentum = NULL,
v = NULL,
initialize = function(lr = 0.01, momentum = 0.9) {
$lr <- lr
self$momentum <- momentum
self
},update = function(params, grads) {
if (is.null(self$v)) {
$v <- map(params, ~ 0)
self
}for (name in names(params)) {
$v[[name]] <- self$momentum * self$v[[name]] - self$lr * grads[[name]]
self<- params[[name]] + self$v[[name]]
params[[name]]
}
params }))
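Two consecutive steps with the same toy gradient illustrate how the velocity accumulates:

opt <- Momentum$new(lr = 0.1, momentum = 0.9)
p <- opt$update(list(W = 1.0), list(W = 0.5))  # v = -0.05,  W = 0.95
p <- opt$update(p, list(W = 0.5))              # v = -0.095, W = 0.855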
AdaGrad
<- R6Class("AdaGrad", list(
AdaGrad lr = NULL,
h = NULL,
initialize = function(lr = 0.01) {
$lr <- lr
self
},update = function(params, grads) {
if (is.null(self$h)) {
$h <- map(params, ~ 0)
self
}for (name in names(params)) {
$h[[name]] <- self$h[[name]] + grads[[name]] ^ 2
self<- params[[name]] - self$lr * grads[[name]] / (sqrt(self$h[[name]]) + 1e-7)
params[[name]]
}
params }))
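With toy values, repeated identical gradients show the step size shrinking as h grows:

opt <- AdaGrad$new(lr = 1.0)
p <- opt$update(list(W = 1.0), list(W = 0.5))  # h = 0.25, step ≈ 1.0,   W ≈ 0
p <- opt$update(p, list(W = 0.5))              # h = 0.50, step ≈ 0.707, W ≈ -0.707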
Adam
<- R6Class("Adam", list(
Adam lr = NULL,
beta1 = NULL,
beta2 = NULL,
iter = NULL,
m = NULL,
v = NULL,
initialize = function(lr = 0.001, beta1 = 0.9, beta2 = 0.999) {
$lr <- lr
self$beta1 <- beta1
self$beta2 <- beta2
self$iter <- 0
self
},update = function(params, grads) {
if (is.null(self$m)) {
$m <- map(params, ~ 0)
self$v <- map(params, ~ 0)
self
}$iter <- self$iter + 1
self<- self$lr * sqrt(1 - self$beta2 ^ self$iter) / (1 - self$beta1 ^ self$iter)
lr_t for (name in names(params)) {
$m[[name]] <- self$m[[name]] + (1 - self$beta1) * (grads[[name]] - self$m[[name]])
self$v[[name]] <- self$v[[name]] + (1 - self$beta2) * (grads[[name]]^2 - self$v[[name]])
self<- params[[name]] - lr_t * self$m[[name]] / (sqrt(self$v[[name]]) + 1e-7)
params[[name]]
}
params }))
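Note that the interpolation form m + (1 - beta1) * (g - m) is algebraically identical to the usual moving average beta1 * m + (1 - beta1) * g, and lr_t folds the bias corrections (1 - beta1^t, 1 - beta2^t) into the step size. One consequence, shown here with toy values, is that the first step has magnitude close to lr regardless of the raw gradient scale:

opt <- Adam$new(lr = 0.001)
opt$update(list(W = 1.0), list(W = 0.5))
# m = 0.05, v = 0.00025, lr_t ≈ 3.16e-4, step ≈ 0.001, so W ≈ 0.999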
Which update method should we use?
# benchmark function f(x, y) = x^2 / 20 + y^2 and its analytic gradient
f <- function(x, y) x^2 / 20 + y^2
df <- function(x, y) list(x / 10, 2 * y)

init_pos <- list(x = -7.0, y = 2.0)
params <- init_pos
optimizers <- list(
  SGD = SGD$new(lr = 0.95),
  Momentum = Momentum$new(lr = 0.1),
  AdaGrad = AdaGrad$new(lr = 1.5),
  Adam = Adam$new(lr = 0.3)
)
update <- function(optimizer, init_pos, n = 30) {
  params <- init_pos
  # each iteration returns the updated params; map_dfr row-binds them
  # into one trajectory, with the starting position prepended
  map_dfr(1:n, function(i) {
    grads <- df(params$x, params$y) %>%
      set_names(c("x", "y"))
    params <<- optimizer$update(params, grads)
  }) %>%
    rbind(init_pos, .)
}
results <- map(optimizers, update, init_pos)
results %>%
  imap_dfr(~ mutate(.x, optimizer = .y)) %>%
  mutate(optimizer = factor(optimizer, levels = names(optimizers))) %>%
  ggplot(aes(x, y)) +
  geom_line() +
  facet_wrap(vars(optimizer), nrow = 2)
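As a numeric complement to the plot, the last row of each trajectory shows how close each optimizer gets to the minimum of f at (0, 0) after 30 updates:

results %>% map(~ tail(.x, 1))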
Initial Weight Values
Activation Distributions in the Hidden Layers
node_num <- 100         # nodes per hidden layer
hidden_layer_size <- 5  # number of hidden layers
n <- 1000               # number of input samples

set.seed(1)
x1 <- matrix(rnorm(n * node_num), n, node_num)
accum_activations <- function(init, activation = sigmoid) {
  accumulate(
    1:hidden_layer_size,
    function(x, i) {
      # weights drawn from N(0, 1), scaled to standard deviation `init`
      w <- matrix(rnorm(node_num * node_num), node_num) * init
      a <- x %*% w
      activation(a)
    },
    .init = x1
  ) %>%
    tail(-1)  # drop the input, keeping one activation matrix per layer
}
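accum_activations defaults to the sigmoid() loaded from common/functions.R. If that file is not at hand, the standard logistic function is a drop-in replacement consistent with how it is used here:

sigmoid <- function(x) 1 / (1 + exp(-x))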
plot_activations <- function(activations) {
  activation_df <- activations %>%
    imap_dfr(~ tibble(layer = .y, a = as.vector(.x)))
  ggplot(activation_df, aes(x = a)) +
    geom_histogram(binwidth = 0.02) +
    coord_cartesian(xlim = c(0, 1)) +
    facet_wrap(vars(layer), nrow = 1)
}
With a weight standard deviation of 1, the activations are biased toward 0 and 1 (the saturated regions of the sigmoid).
activations <- accum_activations(1)
plot_activations(activations)
Set the standard deviation of the weights to 0.01.
activations <- accum_activations(0.01)
plot_activations(activations)
Use the Xavier initialization (standard deviation sqrt(1 / node_num)).
activations <- accum_activations(sqrt(1 / node_num))
plot_activations(activations)
Initial Weight Values for ReLU
relu <- function(x) ifelse(x > 0, x, 0)
When the initial weights are drawn from a Gaussian distribution with standard deviation 0.01:
activations <- accum_activations(0.01, relu)
plot_activations(activations)
With the Xavier initialization:
activations <- accum_activations(sqrt(1 / node_num), relu)
plot_activations(activations)
With the He initialization (standard deviation sqrt(2 / node_num)):
activations <- accum_activations(sqrt(2 / node_num), relu)
plot_activations(activations)
Batch Normalization
TODO
Regularization
TODO
Hyperparameter Validation
TODO