Chapter 5: Backpropagation

Setup

library(tidyverse)
library(R6)
source("common/functions.R")

Implementing simple layers

Implementing the multiplication layer

MulLayer <- R6Class("MulLayer", list(
  x = NULL,
  y = NULL,
  forward = function(x, y) {
    self$x <- x
    self$y <- y
    x * y
  },
  backward = function(dout) {
    dx <- dout * self$y
    dy <- dout * self$x
    c(dx, dy)
  }))

Forward propagation

apple <- 100
apple_num <- 2
tax <- 1.1

mul_apple_layer <- MulLayer$new()
mul_tax_layer <- MulLayer$new()

apple_price <- mul_apple_layer$forward(apple, apple_num)
price <- mul_tax_layer$forward(apple_price, tax)
price
[1] 220

Backward propagation (derivatives with respect to each variable)

dprice <- 1
b1 <- mul_tax_layer$backward(dprice) %>%
  set_names(c("dapple_price", "dtax"))
b2 <- mul_apple_layer$backward(b1["dapple_price"]) %>%
  set_names(c("dapple", "dapple_num"))
b2
    dapple dapple_num 
       2.2      110.0 
b1
dapple_price         dtax 
         1.1        200.0 
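
These values match the analytic derivatives of price = apple * apple_num * tax, which can be checked directly:

# analytic check: differentiate price = apple * apple_num * tax by hand
c(dapple     = apple_num * tax,    # 2 * 1.1   = 2.2
  dapple_num = apple * tax,        # 100 * 1.1 = 110
  dtax       = apple * apple_num)  # 100 * 2   = 200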

Implementing the addition layer

AddLayer <- R6Class("AddLayer", list(
  forward = function(x, y) {
    x + y
  },
  backward = function(dout) {
    dx <- dout * 1
    dy <- dout * 1
    c(dx, dy)
  }))

Buying 2 apples and 3 oranges

apple <- 100
apple_num <- 2
orange <- 150
orange_num <- 3
tax <- 1.1

# layer
mul_apple_layer <- MulLayer$new()
mul_orange_layer <- MulLayer$new()
add_apple_orange_layer <- AddLayer$new()
mul_tax_layer <- MulLayer$new()

# forward
apple_price <- mul_apple_layer$forward(apple, apple_num)
orange_price <- mul_orange_layer$forward(orange, orange_num)
all_price <- add_apple_orange_layer$forward(apple_price, orange_price)
price <- mul_tax_layer$forward(all_price, tax)

# backward
dprice <- 1
b1 <- mul_tax_layer$backward(dprice) %>%
  set_names(c("dall_price", "dtax"))
b2 <- add_apple_orange_layer$backward(b1["dall_price"]) %>%
  set_names(c("dapple_price", "dorange_price"))
b3 <- mul_orange_layer$backward(b2["dorange_price"]) %>%
  set_names(c("dorange", "dorange_num"))
b4 <- mul_apple_layer$backward(b2["dapple_price"]) %>%
  set_names(c("dapple", "dapple_num"))
price
[1] 715
b4
    dapple dapple_num 
       2.2      110.0 
b3
    dorange dorange_num 
        3.3       165.0 
b2
 dapple_price dorange_price 
          1.1           1.1 
b1
dall_price       dtax 
       1.1      650.0 
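
As before, the backpropagated values agree with the analytic derivatives of price = (apple * apple_num + orange * orange_num) * tax:

# analytic check for the combined purchase
c(dapple      = apple_num * tax,                          # 2.2
  dapple_num  = apple * tax,                              # 110
  dorange     = orange_num * tax,                         # 3.3
  dorange_num = orange * tax,                             # 165
  dtax        = apple * apple_num + orange * orange_num)  # 650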

Implementing activation function layers

ReLU layer

Relu <- R6Class("Relu", list(
  positive = NULL,
  forward = function(x) {
    self$positive = x > 0
    x * self$positive
  },
  backward = function(dout) {
    dout * self$positive
  }))
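
As a quick sanity check (relu and the input values below are just made-up examples): the forward pass zeroes non-positive elements, and the backward pass lets dout through only where the input was positive.

relu <- Relu$new()
relu$forward(c(-2, -0.5, 0, 3))  # 0 0 0 3
relu$backward(c(1, 1, 1, 1))     # 0 0 0 1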

Sigmoid layer

Sigmoid <- R6Class("Sigmoid", list(
  out = NULL,
  forward = function(x) {
    self$out = 1 / (1 + exp(-x))
    self$out
  },
  backward = function(dout) {
    dout * (1 - self$out) * self$out
  }))
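
The backward pass uses dy/dx = y(1 - y). A minimal sketch comparing it with a central-difference approximation (x_ex is an arbitrary example input):

sig <- Sigmoid$new()
x_ex <- c(-1, 0, 2)
sig$forward(x_ex)
grad_analytic <- sig$backward(rep(1, length(x_ex)))  # out * (1 - out)
grad_numeric  <- (sig$forward(x_ex + 1e-6) - sig$forward(x_ex - 1e-6)) / 2e-6
max(abs(grad_analytic - grad_numeric))               # ~0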

Implementing the Affine/Softmax layers

Batch version of the Affine layer

Affine <- R6Class("Affine", list(
  W = NULL,
  b = NULL,
  x = NULL,
  dW = NULL,
  db = NULL,
  initialize = function(W, b) {
    self$W <- W
    self$b <- b
  },
  forward = function(x) {
    self$x <- x
    x %*% self$W + rep_row(self$b, nrow(x))
  },
  backward = function(dout) {
    self$dW <- t(self$x) %*% dout
    self$db <- matrix(apply(dout, 2, sum), 1)
    dout %*% t(self$W)
  }))
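
A small shape check with made-up matrices; rep_row is assumed (as in common/functions.R) to stack the one-row matrix b nrow(x) times:

W_ex <- matrix(rnorm(3 * 4), 3, 4)
b_ex <- matrix(rnorm(4), 1)
x_ex <- matrix(rnorm(2 * 3), 2, 3)     # batch of 2, 3 features
affine <- Affine$new(W_ex, b_ex)
dim(affine$forward(x_ex))              # 2 4
dim(affine$backward(matrix(1, 2, 4)))  # 2 3  (dx has the shape of x)
dim(affine$dW)                         # 3 4  (same shape as W)
dim(affine$db)                         # 1 4  (same shape as b)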

Softmax-with-Loss layer

SoftmaxWithLoss <- R6Class("SoftmaxWithLoss", list(
  loss = NULL,
  y = NULL,
  t = NULL,
  forward = function(x, t) {
    self$t <- t
    self$y <- base::t(apply(x, 1, softmax))
    self$loss <- cross_entropy_error(self$y, self$t)
    self$loss
  },
  backward = function(dout) {
    batch_size <- nrow(self$t)
    (self$y - self$t) / batch_size
  }))
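
A tiny example with made-up scores and one-hot labels (softmax and cross_entropy_error come from common/functions.R): backward returns (y - t) / batch_size, so each row of the gradient sums to zero.

swl <- SoftmaxWithLoss$new()
x_ex <- matrix(c(0.3, 2.9, 4.0,
                 0.1, 0.1, 0.8), 2, 3, byrow = TRUE)
t_ex <- matrix(c(0, 0, 1,
                 0, 1, 0), 2, 3, byrow = TRUE)  # one-hot labels
swl$forward(x_ex, t_ex)  # scalar loss
swl$backward(1)          # (y - t) / 2; each row sums to 0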

Implementing backpropagation

Implementing a neural network that supports backpropagation

TwoLayerNet <- R6Class("TwoLayerNet", list(
  params = NULL,
  layers = NULL,
  last_layer = NULL,
  initialize = function(input_size, hidden_size, output_size,
                        weight_init_std = 0.01) {
    self$params <- list(
      W1 = matrix(rnorm(input_size * hidden_size, sd = weight_init_std),
                  input_size, hidden_size),
      b1 = matrix(0, ncol = hidden_size),
      W2 = matrix(rnorm(hidden_size * output_size, sd = weight_init_std),
                  hidden_size, output_size),
      b2 = matrix(0, ncol = output_size)
    )

    self$update_layers()
  },
  update_layers = function() {
    self$layers <- list(
      Affine1 = Affine$new(self$params$W1, self$params$b1),
      Relu1 = Relu$new(),
      Affine2 = Affine$new(self$params$W2, self$params$b2)
    )

    self$last_layer = SoftmaxWithLoss$new()
  },
  predict = function(x) {
    for (layer in self$layers) {
      x <- layer$forward(x)
    }
    x
  },
  loss = function(x, t) {
    y <- self$predict(x)
    self$last_layer$forward(y, t)
  },
  # loss as a function of a single parameter (used for numerical differentiation):
  # the parameter is swapped in temporarily, the loss is evaluated, and the
  # original value is then restored
  loss_function = function(name, x, t) {
    w_orig <- self$params[[name]]
    function(w) {
      self$params[[name]] <- w
      self$update_layers()
      loss <- self$loss(x, t)
      self$params[[name]] <- w_orig
      self$update_layers()
      loss
    }
  },
  accuracy = function(x, t) {
    y <- self$predict(x) %>%
      apply(1, which.max) - 1

    if (is.matrix(t)) {
      t <- t %>%
        apply(1, which.max) - 1
    }

    mean(y == t)
  },
  numerical_gradient = function(x, t) {
    list(
      W1 = numerical_gradient(self$loss_function("W1", x, t), self$params$W1),
      b1 = numerical_gradient(self$loss_function("b1", x, t), self$params$b1),
      W2 = numerical_gradient(self$loss_function("W2", x, t), self$params$W2),
      b2 = numerical_gradient(self$loss_function("b2", x, t), self$params$b2)
    )
  },
  gradient = function(x, t) {
    # forward
    self$loss(x, t)

    # backward
    dout <- 1
    dout <- self$last_layer$backward(dout)

    for (layer in rev(self$layers)) {
      dout <- layer$backward(dout)
    }

    list(
      W1 = self$layers$Affine1$dW,
      b1 = self$layers$Affine1$db,
      W2 = self$layers$Affine2$dW,
      b2 = self$layers$Affine2$db
    )
  }))
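
A quick smoke test with random made-up data (4 inputs, 3 classes, batch of 2), confirming that predict, loss, and gradient return values of the expected shapes:

set.seed(0)
net <- TwoLayerNet$new(input_size = 4, hidden_size = 5, output_size = 3)
x_ex <- matrix(rnorm(2 * 4), 2, 4)
t_ex <- matrix(c(1, 0, 0,
                 0, 0, 1), 2, 3, byrow = TRUE)
dim(net$predict(x_ex))              # 2 3
net$loss(x_ex, t_ex)                # scalar loss
map(net$gradient(x_ex, t_ex), dim)  # same shapes as W1, b1, W2, b2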

Gradient check for backpropagation

A function that converts labels to a one-hot representation

to_one_hot <- function(labels) {
  matrix(labels, 1) %>%
    apply(2, function(x) {
      v <- rep(0, 10)
      v[x + 1] <- 1
      v
    }) %>%
    t
}
to_one_hot(c(0, 2, 9))
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,]    1    0    0    0    0    0    0    0    0     0
[2,]    0    0    1    0    0    0    0    0    0     0
[3,]    0    0    0    0    0    0    0    0    0     1

Gradient check

Compare the gradients from backpropagation with numerically computed gradients; if the mean absolute differences are tiny, the backpropagation implementation can be considered correct.

mnist <- dslabs::read_mnist(path = "input/mnist/")
x_train <- mnist$train$images / 255
t_train <- to_one_hot(mnist$train$labels)

set.seed(1)
network <- TwoLayerNet$new(input_size = 784, hidden_size = 50, output_size = 10)

x_batch <- x_train %>% head(3)
t_batch <- t_train %>% head(3)

grad_numerical <- network$numerical_gradient(x_batch, t_batch)
grad_backprop <- network$gradient(x_batch, t_batch)

map2(grad_numerical, grad_backprop, ~ mean(abs(.x - .y)))
$W1
[1] 4.062289e-10

$b1
[1] 2.386305e-09

$W2
[1] 4.953931e-09

$b2
[1] 1.403384e-07

Training with backpropagation

Train the two-layer network on MNIST with mini-batch SGD, using gradients computed by backpropagation; the loss is recorded every iteration and the accuracy on the training and test sets once per epoch.

x_test <- mnist$test$images / 255
t_test <- to_one_hot(mnist$test$labels)

set.seed(1)
network <- TwoLayerNet$new(input_size = 784, hidden_size = 50, output_size = 10)

iters_num <- 10000
train_size <- nrow(x_train)
batch_size <- 100
learning_rate <- 0.1

iter_per_epoch <- max(train_size %/% batch_size, 1)

acc_df <- map_dfr(1:iters_num, function(i) {
  batch_mask <- sample(train_size, batch_size)
  x_batch <- x_train[batch_mask, ]
  t_batch <- t_train[batch_mask, ]

  grad <- network$gradient(x_batch, t_batch)

  for (name in names(grad)) {
    network$params[[name]] <- network$params[[name]] - learning_rate * grad[[name]]
  }
  network$update_layers()

  loss <- network$loss(x_batch, t_batch)

  train_acc <- test_acc <- NA
  if (i %% iter_per_epoch == 1) {
    train_acc <- network$accuracy(x_train, t_train)
    test_acc <- network$accuracy(x_test, t_test)
  }

  lst(loss, train_acc, test_acc)
})
acc_df %>% filter(!is.na(train_acc))
# A tibble: 17 × 3
     loss train_acc test_acc
    <dbl>     <dbl>    <dbl>
 1 2.30      0.0872   0.0835
 2 0.241     0.905    0.910 
 3 0.254     0.923    0.926 
 4 0.243     0.937    0.937 
 5 0.0995    0.944    0.941 
 6 0.114     0.950    0.949 
 7 0.126     0.956    0.953 
 8 0.0897    0.961    0.958 
 9 0.0906    0.965    0.964 
10 0.0401    0.965    0.961 
11 0.150     0.967    0.964 
12 0.0623    0.971    0.966 
13 0.0763    0.974    0.969 
14 0.0678    0.974    0.970 
15 0.0427    0.975    0.970 
16 0.0848    0.978    0.971 
17 0.0259    0.978    0.971
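
As a rough sketch, the per-epoch accuracies recorded in acc_df can be plotted like this:

acc_df %>%
  filter(!is.na(train_acc)) %>%
  mutate(epoch = row_number()) %>%
  pivot_longer(c(train_acc, test_acc), names_to = "data", values_to = "accuracy") %>%
  ggplot(aes(epoch, accuracy, colour = data)) +
  geom_line()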