강의영상

imports

import tensorflow as tf 
import tensorflow.experimental.numpy as tnp

import matplotlib.pyplot as plt

tnp.experimental_enable_numpy_behavior()

미분

tf.GradientTape() 사용방법

- 예제9: 카페예제로 돌아오자.

# Predictor values for the cafe example: 10 observations.
x=tnp.array([20.1, 22.2, 22.7, 23.3, 24.4, 25.1, 26.2, 27.3, 28.4, 30.4])
x

<tf.Tensor: shape=(10,), dtype=float64, numpy=array([20.1, 22.2, 22.7, 23.3, 24.4, 25.1, 26.2, 27.3, 28.4, 30.4])>

# Fix the random seed so the noise drawn below is reproducible.
tnp.random.seed(43052) 
# Simulated response: intercept 10.2, slope 2.2, plus one tnp.random.randn draw per observation.
y= 10.2+ x*2.2 + tnp.random.randn(10) 
y

<tf.Tensor: shape=(10,), dtype=float64, numpy=
array([54.98269924, 60.27348365, 61.27621687, 60.53495888, 62.9770905 ,
       66.32168996, 66.87781372, 71.0050025 , 72.63837337, 77.11143943])>

# Starting values for the intercept (beta0) and the slope (beta1).
beta0= tf.Variable(9.0) 
beta1= tf.Variable(2.0)

# persistent=True because tape.gradient is called twice below (once per variable).
with tf.GradientTape(persistent=True) as tape: 
    # Sum of squared residuals of the line beta0 + beta1*x.
    loss=sum((y-beta0-beta1*x)**2)

# Partial derivatives of the loss with respect to beta0 and beta1.
tape.gradient(loss,beta0),tape.gradient(loss,beta1)

(<tf.Tensor: shape=(), dtype=float32, numpy=-127.597534>,
 <tf.Tensor: shape=(), dtype=float32, numpy=-3214.2532>)

- 예제10: 카페예제의 매트릭스 버전

# Design matrix: the 20 values (ten 1s followed by the ten x values) are laid out
# as (2, 10) and transposed, giving a (10, 2) matrix whose first column is all ones.
X= tnp.array([1]*10+ [20.1, 22.2, 22.7, 23.3, 24.4, 25.1, 26.2, 27.3, 28.4, 30.4]).reshape(2,10).T
X

<tf.Tensor: shape=(10, 2), dtype=float64, numpy=
array([[ 1. , 20.1],
       [ 1. , 22.2],
       [ 1. , 22.7],
       [ 1. , 23.3],
       [ 1. , 24.4],
       [ 1. , 25.1],
       [ 1. , 26.2],
       [ 1. , 27.3],
       [ 1. , 28.4],
       [ 1. , 30.4]])>

# Coefficients used to generate y, as a (2, 1) column vector: intercept 10.2, slope 2.2.
beta_true = tnp.array([[10.2],[2.2]])
beta_true

<tf.Tensor: shape=(2, 1), dtype=float64, numpy=
array([[10.2],
       [ 2.2]])>

# Fix the random seed so the noise drawn below is reproducible.
tnp.random.seed(43052) 
# Responses in matrix form; the noise is reshaped to (10, 1) to match X@beta_true.
y= X@beta_true + tnp.random.randn(10).reshape(10,1)
y

<tf.Tensor: shape=(10, 1), dtype=float64, numpy=
array([[54.98269924],
       [60.27348365],
       [61.27621687],
       [60.53495888],
       [62.9770905 ],
       [66.32168996],
       [66.87781372],
       [71.0050025 ],
       [72.63837337],
       [77.11143943]])>

# Point at which the gradient is evaluated: intercept 9.0, slope 2.0.
# This is a plain tensor built with tnp.array, not a tf.Variable.
beta = tnp.array([[9.0],[2.0]])
beta

<tf.Tensor: shape=(2, 1), dtype=float64, numpy=
array([[9.],
       [2.]])>

with tf.GradientTape(persistent=True) as tape:
    # beta comes from tnp.array rather than tf.Variable, so it is watched explicitly.
    tape.watch(beta)
    yhat = X@beta 
    # Squared-error loss (y-yhat)'(y-yhat); the product has shape (1, 1).
    loss=(y-yhat).T @ (y-yhat)

tape.gradient(loss,beta) # gradient computed by TensorFlow

<tf.Tensor: shape=(2, 1), dtype=float64, numpy=
array([[ -127.59753624],
       [-3214.25306574]])>

- 해석적풀이

$$loss'(\beta)= -2X'y + 2X'X\beta$$

-2 * X.T @ y + 2* X.T @ X @ beta # theoretical value from the analytic formula

<tf.Tensor: shape=(2, 1), dtype=float64, numpy=
array([[ -127.59753624],
       [-3214.25306574]])>

- 예제11: 위의 예제에서 이론적인 $\boldsymbol{\beta}$의 최적값을 찾아보고 (즉 $\boldsymbol{\hat\beta}$을 찾고) 그 지점에서 loss의 미분값(=접선의 기울기)을 구하라. 결과가 ${\bf 0}$인지 확인하라. (단 ${\bf 0}$은 길이가 2이고 각 원소가 0인 벡터)

# Closed-form least-squares solution (X'X)^{-1} X'y.
betahat = tf.linalg.inv(X.T @ X) @ X.T @ y 
betahat

<tf.Tensor: shape=(2, 1), dtype=float64, numpy=
array([[12.10040012],
       [ 2.13112662]])>

with tf.GradientTape() as tape: 
    # betahat is a plain tensor (not a tf.Variable), so it is watched explicitly.
    tape.watch(betahat)
    yhat = X@betahat 
    # Squared-error loss (y-yhat)'(y-yhat) evaluated at betahat.
    loss=(y-yhat).T @ (y-yhat)

# Gradient of the loss at betahat.
tape.gradient(loss,betahat)

<tf.Tensor: shape=(2, 1), dtype=float64, numpy=
array([[-4.23483471e-12],
       [-1.06379688e-10]])>

경사하강법

최적화문제

- $loss=(\frac{1}{2}\beta-1)^2$를 최소화하는 $\beta$를 컴퓨터를 활용하여 구하는 문제를 생각해보자. (답은 이미 알고 있어요, $\beta=2$입니다.)

방법1: grid search

알고리즘

(1) beta = [-10, -9.80, -9.60, ... , 9.80, 10] 와 같이 [-10,10]을 같은 간격으로 나눈 수열을 만든다. (아래 구현코드에서는 100개의 점을 사용하므로 간격은 약 0.2이다.)

(2) 각 지점에서 (beta/2 -1)^2 을 계산한다.

(3) (2)의 결과를 가장 작게 만드는 값을 고른다.

구현코드

# 100 equally spaced candidate values on [-10, 10] (spacing 20/99, about 0.202).
beta = tnp.linspace(-10,10,100) 
# Loss (beta/2 - 1)^2 evaluated at every candidate at once.
loss = (beta/2 -1)**2

loss

<tf.Tensor: shape=(100,), dtype=float64, numpy=
array([3.60000000e+01, 3.47980818e+01, 3.36165697e+01, 3.24554637e+01,
       3.13147638e+01, 3.01944700e+01, 2.90945822e+01, 2.80151005e+01,
       2.69560249e+01, 2.59173554e+01, 2.48990919e+01, 2.39012346e+01,
       2.29237833e+01, 2.19667381e+01, 2.10300990e+01, 2.01138659e+01,
       1.92180390e+01, 1.83426181e+01, 1.74876033e+01, 1.66529946e+01,
       1.58387920e+01, 1.50449954e+01, 1.42716049e+01, 1.35186205e+01,
       1.27860422e+01, 1.20738700e+01, 1.13821039e+01, 1.07107438e+01,
       1.00597898e+01, 9.42924191e+00, 8.81910009e+00, 8.22936435e+00,
       7.66003469e+00, 7.11111111e+00, 6.58259361e+00, 6.07448220e+00,
       5.58677686e+00, 5.11947760e+00, 4.67258443e+00, 4.24609734e+00,
       3.84001632e+00, 3.45434139e+00, 3.08907254e+00, 2.74420977e+00,
       2.41975309e+00, 2.11570248e+00, 1.83205795e+00, 1.56881951e+00,
       1.32598714e+00, 1.10356086e+00, 9.01540659e-01, 7.19926538e-01,
       5.58718498e-01, 4.17916539e-01, 2.97520661e-01, 1.97530864e-01,
       1.17947148e-01, 5.87695133e-02, 1.99979594e-02, 1.63248648e-03,
       3.67309458e-03, 2.61197837e-02, 6.89725538e-02, 1.32231405e-01,
       2.15896337e-01, 3.19967350e-01, 4.44444444e-01, 5.89327620e-01,
       7.54616876e-01, 9.40312213e-01, 1.14641363e+00, 1.37292113e+00,
       1.61983471e+00, 1.88715437e+00, 2.17488011e+00, 2.48301194e+00,
       2.81154984e+00, 3.16049383e+00, 3.52984389e+00, 3.91960004e+00,
       4.32976227e+00, 4.76033058e+00, 5.21130497e+00, 5.68268544e+00,
       6.17447199e+00, 6.68666463e+00, 7.21926334e+00, 7.77226814e+00,
       8.34567901e+00, 8.93949597e+00, 9.55371901e+00, 1.01883481e+01,
       1.08433833e+01, 1.15188246e+01, 1.22146720e+01, 1.29309254e+01,
       1.36675849e+01, 1.44246505e+01, 1.52021222e+01, 1.60000000e+01])>

(예비학습)

tnp.argmin([1,2,3,-1,5]) # index of the smallest element: -1 sits at position 3

<tf.Tensor: shape=(), dtype=int64, numpy=3>

tnp.argmin([1,2,-1,3,5]) # index of the smallest element: -1 sits at position 2

<tf.Tensor: shape=(), dtype=int64, numpy=2>

예비학습 끝

tnp.argmin(loss) # index of the grid point with the smallest loss

<tf.Tensor: shape=(), dtype=int64, numpy=59>

beta[59] # grid point at index 59

<tf.Tensor: shape=(), dtype=float64, numpy=1.9191919191919187>

beta[60] # the next grid point, at index 60

<tf.Tensor: shape=(), dtype=float64, numpy=2.121212121212121>

loss[59],loss[60] # losses at the two neighbouring grid points, indices 59 and 60

(<tf.Tensor: shape=(), dtype=float64, numpy=0.0016324864809713507>,
 <tf.Tensor: shape=(), dtype=float64, numpy=0.0036730945821854847>)

그리드서치의 문제점

- 비판1: [-10,10]이외에 해가 존재하면?

이 예제의 경우는 운좋게 [-10,10] 안에 해가 존재했음
하지만 임의의 고정된 $x,y$에 대하여 $loss(\beta)=(x\beta-y)^2$ 형태의 손실함수를 최소화하는 해가 항상 [-10,10] 안에 존재한다는 보장은 없음
해결책: 더 넓은 범위를 탐색하자?

- 비판2: 효율적이지 않음

알고리즘을 요약하면 결국 -10부터 10까지 작은 간격으로 조금씩 이동하며 loss를 조사하는 것이 grid search의 아이디어
$\to$ 생각해보니까 $\beta=2$인 순간 $loss=(\frac{1}{2}\beta-1)^2=0$이 되어서 이것보다 작은 최소값은 존재하지 않는다(제곱은 항상 0 이상이므로)
$\to$ 따라서 $\beta=2$ 이후로는 탐색할 필요가 없다