(5주차) 4월4일
- 강의영상
- imports
- 최적화의 문제
- tf.keras.optimizers를 이용한 최적화방법
- 회귀분석 문제
- 이론적 풀이
- GradientTape를 이용
- GradientTape + opt.apply_gradients
- opt.minimize
- tf.keras.Sequential
#!conda install -c conda-forge python-graphviz -y
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.experimental.numpy as tnp
tnp.experimental_enable_numpy_behavior()
- $loss=(\frac{1}{2}\beta-1)^2$
- 기존에 했던 방법은 수식을 알고 있어야 한다는 단점이 있음
# Learning rate and starting point for minimizing loss = (beta/2 - 1)**2.
alpha = 0.01 / 6
beta = tf.Variable(-10.0)
# SGD optimizer built with that learning rate; it performs the parameter
# update in the iterations that follow.
opt = tf.keras.optimizers.SGD(learning_rate=alpha)
- iter1
# Iteration 1: record the loss on the tape, differentiate it, take one step.
with tf.GradientTape() as tape:
    tape.watch(beta)
    residual = beta / 2 - 1
    loss = residual ** 2
slope = tape.gradient(loss, beta)
# apply_gradients takes a list of (gradient, variable) pairs; this call is
# equivalent to beta.assign_sub(slope * alpha).
opt.apply_gradients([(slope, beta)])
beta  # notebook display of the updated value
- iter2
# Iteration 2: identical procedure, starting from the beta left by the
# previous step.
with tf.GradientTape() as tape:
    tape.watch(beta)
    half_beta_minus_one = beta / 2 - 1
    loss = half_beta_minus_one ** 2
slope = tape.gradient(loss, beta)
grads_and_vars = [(slope, beta)]
opt.apply_gradients(grads_and_vars)  # same as beta.assign_sub(slope * alpha)
beta  # notebook display of the updated value
- for문으로 정리
# Repeat the tape -> gradient -> apply_gradients step 10000 times, starting
# again from beta = -10.
alpha = 0.01 / 6
beta = tf.Variable(-10.0)
opt = tf.keras.optimizers.SGD(alpha)
for epoc in range(10000):
    with tf.GradientTape() as tape:
        tape.watch(beta)
        loss = (beta / 2 - 1) ** 2
    slope = tape.gradient(loss, beta)
    # Equivalent to beta.assign_sub(slope * alpha).
    opt.apply_gradients([(slope, beta)])
# Show the estimate once, after the loop; a bare `beta` inside the loop body
# has no effect and was dropped.
beta
- opt.apply_gradients()의 입력은 pair 의 list
alpha = 0.01 / 6
beta = tf.Variable(-10.0)
opt = tf.keras.optimizers.SGD(learning_rate=alpha)
# Zero-argument callable that evaluates the loss at the current value of
# beta; it is handed to opt.minimize below.
loss_fn = lambda: (beta / 2 - 1) ** 2
- `lambda x: x**2` <=> $\text{lambda}(x)=x^2$
- `lambda x,y: x+y` <=> $\text{lambda}(x,y)=x+y$
- `lambda: y` <=> $\text{lambda}()=y$, 입력이 없으며 출력은 항상 y인 함수
loss_fn() # a function with no input whose output is some computed value
- iter 1
# Iteration 1: minimize differentiates loss_fn with respect to beta and
# applies one update in a single call.
opt.minimize(loss_fn, var_list=[beta])
beta  # notebook display of the updated value
- iter2
# Iteration 2: one more differentiate-and-update step from the current beta.
opt.minimize(loss_fn, var_list=[beta])
beta  # notebook display of the updated value
- for문으로 정리하면
# Full run with opt.minimize: 10000 differentiate-and-update steps starting
# from beta = -10.
alpha = 0.01 / 6
beta = tf.Variable(-10.0)
opt = tf.keras.optimizers.SGD(learning_rate=alpha)
loss_fn = lambda: (beta / 2 - 1) ** 2
for _ in range(10000):
    opt.minimize(loss_fn, var_list=[beta])
beta  # notebook display of the final estimate
- ${\bf y} \approx 2.5 + 4.0 {\bf x}$
# Simulate N noisy observations around the true line y = 2.5 + 4x.
tnp.random.seed(43052)  # fixed seed before the noise is drawn
N = 200
x = tnp.linspace(0, 1, N)
y_true = 2.5 + 4 * x
epsilon = 0.5 * tnp.random.randn(N)
y = y_true + epsilon
# Observations as dots, true line as a red dashed line.
plt.plot(x, y, '.')
plt.plot(x, y_true, 'r--')
- 포인트
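- $S_{xx}=\sum_{i=1}^{N}(x_i-\bar{x})^2$, $S_{xy}=\sum_{i=1}^{N}(x_i-\bar{x})(y_i-\bar{y})$
- $\hat{\beta}_0=\bar{y}-\hat{\beta}_1\bar{x}$, $\hat{\beta}_1=\frac{S_{xy}}{S_{xx}}$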
- 풀이
# Simple-regression estimates from the sums of squares:
#   beta1_hat = Sxy / Sxx,  beta0_hat = mean(y) - mean(x) * beta1_hat.
# tf.reduce_sum reduces the whole tensor in one op, instead of Python's
# built-in sum adding the elements one at a time; the means are computed once.
x_bar = x.mean()
y_bar = y.mean()
Sxx = tf.reduce_sum((x - x_bar) ** 2)
Sxy = tf.reduce_sum((x - x_bar) * (y - y_bar))
beta1_hat = Sxy / Sxx
beta1_hat
beta0_hat = y_bar - x_bar * beta1_hat
beta0_hat
- 포인트
- $\hat{\beta}=(X'X)^{-1}X'y$
- 풀이
# Closed-form least squares in matrix form: beta_hat = (X'X)^{-1} X'y.
y = y.reshape(N, 1)
intercept_col = tf.ones(N, dtype=tf.float64)
X = tf.stack([intercept_col, x], axis=1)  # columns: ones, x
y.shape, X.shape
XtX_inv = tf.linalg.inv(X.T @ X)
XtX_inv @ X.T @ y
- 포인트
- $loss'(\beta)=-2X'y +2X'X\beta$
- $\beta_{new} = \beta_{old} - \alpha \times loss'(\beta_{old})$
- 풀이
# Gradient descent with the hand-derived gradient of the mean squared error:
#   slope = (-2 X'y + 2 X'X beta) / N,  beta_new = beta_old - alpha * slope.
y = y.reshape(N, 1)
y.shape, X.shape
# Float literals give beta_hat a floating dtype from the start, instead of
# relying on int -> float promotion inside the matrix products.
beta_hat = tnp.array([-5.0, 10.0]).reshape(2, 1)
beta_hat
# These two products do not depend on beta_hat, so compute them once rather
# than in every iteration (same association as -2*X.T @ y and 2*X.T @ X @ beta).
neg2_Xty = -2 * X.T @ y
two_XtX = 2 * X.T @ X
slope = (neg2_Xty + two_XtX @ beta_hat) / N
slope
alpha = 0.1
step = slope * alpha
step
for epoc in range(1000):
    slope = (neg2_Xty + two_XtX @ beta_hat) / N
    beta_hat = beta_hat - alpha * slope
beta_hat
- 포인트
## 포인트코드1: 그레디언트 테입
with tf.GradientTape() as tape:
loss =
## 포인트코드2: 미분
slope = tape.gradient(loss,beta_hat)
## 포인트코드3: update
beta_hat.assign_sub(slope*alpha)
- 풀이
# Matrix form, gradient obtained from GradientTape instead of by hand.
y = y.reshape(N, 1)
y.shape, X.shape
# tf.Variable so that assign_sub can update it in place.
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat
alpha = 0.1
for _ in range(1000):
    with tf.GradientTape() as tape:
        yhat = X @ beta_hat
        residual = y - yhat
        loss = residual.T @ residual / N
    slope = tape.gradient(loss, beta_hat)
    beta_hat.assign_sub(alpha * slope)
beta_hat
- 포인트
## 포인트코드: 미분
slope0,slope1 = tape.gradient(loss,[beta0_hat,beta1_hat])
- 풀이
# Scalar form: intercept and slope are separate variables, and both
# gradients are requested from the tape in one call.
y = y.reshape(-1)
y.shape, x.shape
beta0_hat = tf.Variable(-5.0)
beta1_hat = tf.Variable(10.0)
alpha = 0.1
for _ in range(1000):
    with tf.GradientTape() as tape:
        yhat = beta0_hat + x * beta1_hat
        squared_error = (y - yhat) ** 2
        # tf.reduce_sum is used here in place of Python's built-in sum.
        loss = tf.reduce_sum(squared_error) / N
    slope0, slope1 = tape.gradient(loss, [beta0_hat, beta1_hat])
    beta0_hat.assign_sub(alpha * slope0)
    beta1_hat.assign_sub(alpha * slope1)
beta0_hat, beta1_hat
- 포인트
## 포인트코드: 업데이트
opt.apply_gradients([(slope,beta_hat)]) ## pair의 list가 입력
- 풀이
# Matrix form: gradient from the tape, update delegated to the optimizer.
y = y.reshape(N, 1)
y.shape, X.shape
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat
alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    with tf.GradientTape() as tape:
        yhat = X @ beta_hat
        residual = y - yhat
        loss = residual.T @ residual / N
    slope = tape.gradient(loss, beta_hat)
    # Takes the place of the manual beta_hat.assign_sub(alpha * slope).
    opt.apply_gradients([(slope, beta_hat)])
beta_hat
- 포인트
## 포인트코드: 업데이트
opt.apply_gradients([(slope0,beta0_hat),(slope1,beta1_hat)]) ## pair의 list가 입력
- 풀이
# Scalar form: two variables updated through a single apply_gradients call.
y = y.reshape(-1)
y.shape, x.shape
beta0_hat = tf.Variable(-5.0)
beta1_hat = tf.Variable(10.0)
alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    with tf.GradientTape() as tape:
        # Scalar counterpart of X @ beta_hat.
        yhat = beta0_hat + beta1_hat * x
        loss = tf.reduce_sum((y - yhat) ** 2) / N
    slope0, slope1 = tape.gradient(loss, [beta0_hat, beta1_hat])
    # One (gradient, variable) pair per parameter.
    grads_and_vars = [(slope0, beta0_hat), (slope1, beta1_hat)]
    opt.apply_gradients(grads_and_vars)
beta0_hat, beta1_hat
- 풀이
# Matrix form with opt.minimize: the loss is supplied as a zero-argument
# lambda, and minimize does both the differentiation and the update.
y = y.reshape(N, 1)
y.shape, X.shape
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat
loss_fn = lambda: (y - X @ beta_hat).T @ (y - X @ beta_hat) / N
alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    opt.minimize(loss_fn, var_list=[beta_hat])
beta_hat
- 포인트
## 포인트코드: 미분 & 업데이트 = minimize
opt.minimize(loss_fn,[beta0_hat,beta1_hat])
- 풀이
# Scalar form with opt.minimize: both variables are listed so each one is
# differentiated and updated.
y = y.reshape(-1)
y.shape, x.shape
beta0_hat = tf.Variable(-5.0)
beta1_hat = tf.Variable(10.0)
loss_fn = lambda: tf.reduce_sum((y - beta0_hat - beta1_hat * x) ** 2) / N
alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
params = [beta0_hat, beta1_hat]
for _ in range(1000):
    opt.minimize(loss_fn, var_list=params)
beta0_hat, beta1_hat
- 포인트
## 포인트코드: 손실함수정의
def loss_fn():
return ??
- 풀이
y = y.reshape(N, 1)
y.shape, X.shape
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat


def loss_fn():
    """Return the mean squared error (y - X b)'(y - X b) / N at the current beta_hat."""
    residual = y - X @ beta_hat
    return residual.T @ residual / N


alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    opt.minimize(loss_fn, var_list=[beta_hat])
beta_hat
- 포인트
## 포인트코드: 손실함수정의
def loss_fn():
??
??
return ??
- 풀이
y = y.reshape(N, 1)
y.shape, X.shape
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat


def loss_fn():
    """Spell out the loss in two steps: prediction, then mean squared error."""
    # Formula 1 handed to the computer: the prediction.
    yhat = X @ beta_hat
    # Formula 2 handed to the computer: the quantity that gets differentiated,
    # i.e. the `loss` in tape.gradient(loss, beta_hat).
    residual = y - yhat
    loss = residual.T @ residual / N
    return loss


alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    opt.minimize(loss_fn, var_list=[beta_hat])
beta_hat
- 포인트
## 포인트코드: 미리구현되어있는 손실함수 이용
tf.losses.MSE(y,yhat)
- 풀이
y = y.reshape(N, 1)
y.shape, X.shape
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat


def loss_fn():
    """Return the loss computed by the ready-made tf.keras.losses.MSE function."""
    # Formula 1: the prediction.
    yhat = X @ beta_hat
    # Formula 2: both arguments are flattened to 1-D before being passed to MSE.
    y_flat = y.reshape(-1)
    yhat_flat = yhat.reshape(-1)
    return tf.keras.losses.MSE(y_flat, yhat_flat)


alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    opt.minimize(loss_fn, var_list=[beta_hat])
beta_hat
- 포인트
## 포인트코드: 클래스로부터 손실함수 오브젝트 생성 (함수를 찍어내는 클래스)
mse_fn = tf.losses.MeanSquaredError()
mse_fn(y,yhat)
- 풀이
# Instantiate the loss class; the resulting object is then called like a
# function on (y, yhat).
mseloss_fn = tf.losses.MeanSquaredError()
- `mseloss_fn`은 `tf.keras.losses.MSE`와 같은 역할을 하는 함수라고 보면 된다.
y = y.reshape(N, 1)
y.shape, X.shape
beta_hat = tf.Variable(tnp.array([-5.0, 10.0]).reshape(2, 1))
beta_hat


def loss_fn():
    """Return the loss computed by the mseloss_fn loss object."""
    # Formula 1: the prediction.
    yhat = X @ beta_hat
    # Formula 2: both arguments are flattened to 1-D before calling the loss object.
    y_flat = y.reshape(-1)
    yhat_flat = yhat.reshape(-1)
    return mseloss_fn(y_flat, yhat_flat)


alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
for _ in range(1000):
    opt.minimize(loss_fn, var_list=[beta_hat])
beta_hat
- $\hat{y}_i=\hat{\beta}_0+\hat{\beta}_1x_i$ 의 서로다른 표현
import graphviz
def gv(s):
    """Wrap the DOT statements in `s` in a left-to-right digraph and return a graphviz.Source."""
    dot_source = 'digraph G{ rankdir="LR"' + s + '; }'
    return graphviz.Source(dot_source)
# Three equivalent ways to draw yhat_i = beta0_hat + beta1_hat * x_i.
# The edge label on the last hop is corrected from "indentity" to "identity".

# (1) Inputs 1 and x, each with its own weight, no bias term.
gv('''
"1" -> "beta0_hat + x*beta1_hat, bias=False"[label="* beta0_hat"]
"x" -> "beta0_hat + x*beta1_hat, bias=False"[label="* beta1_hat"]
"beta0_hat + x*beta1_hat, bias=False" -> "yhat"[label="identity"]
''')
# (2) Single input x with a bias term.
gv('''
"x" -> "x*beta1_hat, bias=True"[label="*beta1_hat"] ;
"x*beta1_hat, bias=True" -> "yhat"[label="identity"] ''')
# (3) Matrix input X = [1 x], no bias term.
gv('''
"X=[1 x]" -> "X@beta_hat, bias=False"[label="@beta_hat"] ;
"X@beta_hat, bias=False" -> "yhat"[label="identity"] ''')
- 포인트
## 포인트코드1: 네트워크 생성
net = tf.keras.Sequential()
## 포인트코드2: 네트워크의 아키텍처 설계
net.add(tf.keras.layers.Dense(1,input_shape=(2,),use_bias=False))
## 포인트코드3: 네트워크 컴파일 = 아키텍처 + 손실함수 + 옵티마이저
net.compile(opt,loss=loss_fn2)
## 포인트코드4: 미분 & update
net.fit(X,y,epochs=1000,verbose=0,batch_size=N)
- 풀이
# Defining how yhat is computed = designing the architecture:
# one Dense layer with 2 inputs, 1 output and no bias.
dense_layer = tf.keras.layers.Dense(units=1, input_shape=(2,), use_bias=False)
net = tf.keras.Sequential()
net.add(dense_layer)
- units는 layer의 출력의 차원, 이 경우는 yhat의 차원, yhat은 (200,1) 이므로 1임.
- input_shape는 layer의 입력의 차원, 이 경우는 X의 차원, X는 (200,2) 이므로 2임.
def loss_fn2(y, yhat):
    """Return (y - yhat)'(y - yhat) / N, using the module-level N."""
    residual = y - yhat
    return residual.T @ residual / N
alpha = 0.1
opt = tf.optimizers.SGD(learning_rate=alpha)
# Start from the same initial values (-5, 10) as the earlier sections.
init_weights = [np.array([[-5.0], [10.0]], dtype=np.float32)]
init_weights
net.set_weights(init_weights)
net.weights
# Compile = merge architecture + loss function + optimizer into the network.
net.compile(optimizer=opt, loss=tf.losses.MSE)
# fit = differentiation + parameter update; batch_size is set to N.
net.fit(X, y, epochs=1000, batch_size=N, verbose=0)
net.weights