강의영상

imports

import numpy as np
import matplotlib.pyplot as plt 
import tensorflow as tf 
import tensorflow.experimental.numpy as tnp

tnp.experimental_enable_numpy_behavior()

import graphviz
def gv(s): return graphviz.Source('digraph G{ rankdir="LR"'+s + '; }')

piece-wise linear regression

model: $y_i=\begin{cases} x_i +0.3\epsilon_i & x\leq 0 \\ 3.5x_i +0.3\epsilon_i & x>0 \end{cases}$

np.random.seed(43052)
N=100
x = np.linspace(-1,1,N)
lamb = lambda x: x*1+np.random.normal()*0.3 if x<0 else x*3.5+np.random.normal()*0.3 
y= np.array(list(map(lamb,x)))
y

array([-0.88497385, -0.65454563, -0.61676249, -0.84702584, -0.84785569,
       -0.79220455, -1.3777105 , -1.27341781, -1.41643729, -1.26404671,
       -0.79590224, -0.78824395, -0.86064773, -0.52468679, -1.18247354,
       -0.29327295, -0.69373049, -0.90561768, -1.07554911, -0.7225404 ,
       -0.69867774, -0.34811037,  0.11188474, -1.05046296, -0.03840085,
       -0.38356861, -0.24299798, -0.58403161, -0.20344022, -0.13872303,
       -0.529586  , -0.27814478, -0.10852781, -0.38294596,  0.02669763,
       -0.23042603, -0.77720364, -0.34287396, -0.04512022, -0.30180793,
       -0.26711438, -0.51880349, -0.53939672, -0.32052379, -0.32080763,
        0.28917092,  0.18175206, -0.48988124, -0.08084459,  0.37706178,
        0.14478908,  0.07621827, -0.071864  ,  0.05143365,  0.33932009,
       -0.35071776,  0.87742867,  0.51370399,  0.34863976,  0.55855514,
        1.14196717,  0.86421076,  0.72957843,  0.57342304,  1.54803332,
        0.98840018,  1.11129366,  1.42410801,  1.44322465,  1.25926455,
        1.12940772,  1.46516829,  1.16365096,  1.45560853,  1.9530553 ,
        2.45940445,  1.52921129,  1.8606463 ,  1.86406718,  1.5866523 ,
        1.49033473,  2.35242686,  2.12246412,  2.41951931,  2.43615052,
        1.96024441,  2.65843789,  2.46854394,  2.76381882,  2.78547462,
        2.56568465,  3.15212157,  3.11482949,  3.17901774,  3.31268904,
        3.60977818,  3.40949166,  3.30306495,  3.74590922,  3.85610433])

plt.plot(x,y,'.')

[<matplotlib.lines.Line2D at 0x7f3b7f1187f0>]

풀이1: 단순회귀모형

x= x.reshape(N,1)
y= y.reshape(N,1)

net = tf.keras.Sequential()
net.add(tf.keras.layers.Dense(1)) 
net.compile(optimizer=tf.optimizers.SGD(0.1),loss='mse')
net.fit(x,y,batch_size=N,epochs=1000,verbose=0) # numpy로 해도 돌아감

2022-04-18 11:40:03.840482: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero

<keras.callbacks.History at 0x7f3b7d0bf670>

net.weights

[<tf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[2.2616348]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(1,) dtype=float32, numpy=array([0.6069048], dtype=float32)>]

yhat = x * 2.2616348 + 0.6069048
yhat = net.predict(x)

plt.plot(x,y,'.')
plt.plot(x,yhat,'--')

[<matplotlib.lines.Line2D at 0x7f3b7f163370>]

- 실패: 이 모형은 epoch을 10억번 돌려도 실패할 모형임

왜? 아키텍처 설계자체가 틀렸음
꺽인부분을 표현하기에는 아키텍처의 표현력이 너무 부족하다 -> under fit의 문제

풀이2: 비선형 활성화 함수의 도입

- 여기에서 비선형 활성화 함수는 relu

- 네트워크를 아래와 같이 수정하자.

(수정전) hat은 생략

gv('''
"x" -> "x*w,    bias=True"[label="*w"] ;
"x*w,    bias=True" -> "y"[label="indentity"] ''')

(수정후) hat은 생략

gv('''
"x" -> "x*w,    bias=True"[label="*w"] ;
"x*w,    bias=True" -> "y"[label="relu"] ''')

마지막에 $f(x)=x$ 라는 함수대신에 relu를 취하는 것으로 구조를 약간 변경
활성화함수(acitivation function)를 indentity에서 relu로 변경

- relu함수란?

_x = np.linspace(-1,1,100)
tf.nn.relu(_x)

<tf.Tensor: shape=(100,), dtype=float64, numpy=
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.01010101, 0.03030303, 0.05050505, 0.07070707, 0.09090909,
       0.11111111, 0.13131313, 0.15151515, 0.17171717, 0.19191919,
       0.21212121, 0.23232323, 0.25252525, 0.27272727, 0.29292929,
       0.31313131, 0.33333333, 0.35353535, 0.37373737, 0.39393939,
       0.41414141, 0.43434343, 0.45454545, 0.47474747, 0.49494949,
       0.51515152, 0.53535354, 0.55555556, 0.57575758, 0.5959596 ,
       0.61616162, 0.63636364, 0.65656566, 0.67676768, 0.6969697 ,
       0.71717172, 0.73737374, 0.75757576, 0.77777778, 0.7979798 ,
       0.81818182, 0.83838384, 0.85858586, 0.87878788, 0.8989899 ,
       0.91919192, 0.93939394, 0.95959596, 0.97979798, 1.        ])>

plt.plot(_x,_x)
plt.plot(_x,tf.nn.relu(_x))

[<matplotlib.lines.Line2D at 0x7f3b7c1ab610>]

파란색을 주황색으로 바꿔주는 것이 렐루함수임
$f(x)=\max(0,x)=\begin{cases} 0 & x\leq 0 \\ x & x>0 \end{cases}$

- 아키텍처: $\hat{y}_i=relu(\hat{w}_0+\hat{w}_1x_i)$, $relu(x)=\max(0,x)$

- 풀이시작

1단계

net2 = tf.keras.Sequential()

2단계

tf.random.set_seed(43053)
l1 = tf.keras.layers.Dense(1, input_shape=(1,)) 
a1 = tf.keras.layers.Activation(tf.nn.relu)

net2.add(l1)

net2.layers

[<keras.layers.core.dense.Dense at 0x7f3b5c6e74f0>]

net2.add(a1)

net2.layers

[<keras.layers.core.dense.Dense at 0x7f3b5c6e74f0>,
 <keras.layers.core.activation.Activation at 0x7f3aa99a0ee0>]

l1.get_weights()

[array([[0.41721308]], dtype=float32), array([0.], dtype=float32)]

net2.get_weights()

[array([[0.41721308]], dtype=float32), array([0.], dtype=float32)]

(네트워크 상황 확인)

u1= l1(x)
#u1= x@l1.weights[0] + l1.weights[1]

v1= a1(u1)
#v1= tf.nn.relu(u1)

plt.plot(x,x)
plt.plot(x,u1,'--r')
plt.plot(x,v1,'--b')

[<matplotlib.lines.Line2D at 0x7f3aa98243a0>]

3단계

net2.compile(optimizer=tf.optimizers.SGD(0.1),loss='mse')

4단계

net2.fit(x,y,epochs=1000,verbose=0,batch_size=N)

<keras.callbacks.History at 0x7f3aa9885990>

- result

yhat = tf.nn.relu(x@l1.weights[0] + l1.weights[1]) 
yhat = net2.predict(x)
yhat = net2(x)
yhat = a1(l1(x))
yhat = net2.layers[1](net2.layers[0](x))

plt.plot(x,y,'.')
plt.plot(x,yhat,'--')

[<matplotlib.lines.Line2D at 0x7f3aa636bee0>]

- discussion

이것 역시 수백억번 에폭을 반복해도 이 이상 적합이 힘들다 $\to$ 모형의 표현력이 떨어진다.
해결책: 주황색점선이 2개 있다면 어떨까?

풀이3: 노드수추가 + 레이어추가

목표: 2개의 주황색 점선을 만들자.

1단계

net3 = tf.keras.Sequential()

2단계

tf.random.set_seed(43053)
l1 = tf.keras.layers.Dense(2,input_shape=(1,))
a1 = tf.keras.layers.Activation(tf.nn.relu)

net3.add(l1)
net3.add(a1)

(네트워크 상황 확인)

l1(x).shape
# l1(x) : (100,1) -> (100,2)

TensorShape([100, 2])

plt.plot(x,x)
plt.plot(x,l1(x),'--')

[<matplotlib.lines.Line2D at 0x7f3a9c5f7730>,
 <matplotlib.lines.Line2D at 0x7f3a9c5f7970>]

plt.plot(x,x)
plt.plot(x,a1(l1(x)),'--')

[<matplotlib.lines.Line2D at 0x7f3a9c48cf70>,
 <matplotlib.lines.Line2D at 0x7f3a9c48d1b0>]

- 이 상태에서는 yhat이 안나온다. 왜?

차원이 안맞음. a1(l1(x))의 차원은 (N,2)인데 최종적인 yhat의 차원은 (N,1)이어야 함.
차원이 어찌저찌 맞다고 쳐도 relu를 통과하면 항상 yhat>0 임. 따라서 음수값을 가지는 y는 0으로 밖에 맞출 수 없음.

- 해결책: a1(l1(x))에 연속으로(Sequential하게!) 또 다른 레이어를 설계! (N,2) -> (N,1) 이 되도록!

yhat= bias + weight1 * a1(l1(x))[0] + weight2 * a1(l1(x))[1]

- 즉 a1(l1(x)) 를 새로운 입력으로 해석하고 출력을 만들어주는 선형모형을 다시태우면 된다.

입력차원: 2
출력차원: 1

net3.layers

[<keras.layers.core.dense.Dense at 0x7f3aa62bb3d0>,
 <keras.layers.core.activation.Activation at 0x7f3aa62baad0>]

tf.random.set_seed(43053) 
l2 = tf.keras.layers.Dense(1, input_shape=(2,))

net3.add(l2)

net3.layers

[<keras.layers.core.dense.Dense at 0x7f3aa62bb3d0>,
 <keras.layers.core.activation.Activation at 0x7f3aa62baad0>,
 <keras.layers.core.dense.Dense at 0x7f3aa61c3160>]

net3.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_10 (Dense)            (None, 2)                 4         
                                                                 
 activation_9 (Activation)   (None, 2)                 0         
                                                                 
 dense_11 (Dense)            (None, 1)                 3         
                                                                 
=================================================================
Total params: 7
Trainable params: 7
Non-trainable params: 0
_________________________________________________________________

- 추정해야할 파라메터수가 4,0,3으로 나온다.

- 수식표현: $X \to X@W^{(1)}+b^{(1)} \to relu(X@W^{(1)}+b^{(1)}) \to relu(X@W^{(1)}+b^{(1)})@W^{(2)}+b^{(2)}=yhat$

$X$: (N,1)
$W^{(1)}$: (1,2) ==> 파라메터 2개 추정
$b^{(1)}$: (2,) ==> 파라메터 2개가 추가 // 여기까지 추정할 파라메터는 4개
$W^{(2)}$: (2,1) ==> 파라메터 2개 추정
$b^{(2)}$: (1,) ==> 파라메터 1개가 추가 // 따라서 3개

- 참고: 추정할 파라메터수가 많다 = 복잡한 모형이다.

초거대AI: 추정할 파라메터수가 엄청 많은..

net3.weights

[<tf.Variable 'dense_10/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[ 0.34065306, -0.7533803 ]], dtype=float32)>,
 <tf.Variable 'dense_10/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>,
 <tf.Variable 'dense_11/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[ 0.34065306],
        [-0.7533803 ]], dtype=float32)>,
 <tf.Variable 'dense_11/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

l1.weights

[<tf.Variable 'dense_10/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[ 0.34065306, -0.7533803 ]], dtype=float32)>,
 <tf.Variable 'dense_10/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>]

l2.weights

[<tf.Variable 'dense_11/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[ 0.34065306],
        [-0.7533803 ]], dtype=float32)>,
 <tf.Variable 'dense_11/bias:0' shape=(1,) dtype=float32, numpy=array([0.], dtype=float32)>]

- 좀 더 간단한 수식표현: $X \to (u_1 \to v_1) \to (u_2 \to v_2) = yhat$

$u_1= X@W^{(1)}+b^{(1)}$
$v_1= relu(u_1)$
$u_2= v_1@W^{(2)}+b^{(2)}$
$v_2= indentity(u_2):=yhat$

gv('''
subgraph cluster_1{
    style=filled;
    color=lightgrey;
    "X" 
    label = "Layer 0"
}
subgraph cluster_2{
    style=filled;
    color=lightgrey;
    "X" -> "u1[:,0]"[label="*W1[0,0]"]
    "X" -> "u1[:,1]"[label="*W1[0,1]"]
    "u1[:,0]" -> "v1[:,0]"[label="relu"]
    "u1[:,1]" -> "v1[:,1]"[label="relu"]
    label = "Layer 1"
}
subgraph cluster_3{
    style=filled;
    color=lightgrey;
    "v1[:,0]" -> "yhat"[label="*W2[0,0]"]
    "v1[:,1]" -> "yhat"[label="*W2[1,0]"]
    label = "Layer 2"
}
''')

gv('''
subgraph cluster_1{
    style=filled;
    color=lightgrey;
    "X" 
    label = "Layer 0"
}
subgraph cluster_2{
    style=filled;
    color=lightgrey;
    "X" -> "node1"
    "X" -> "node2"
    label = "Layer 1: relu"
}
subgraph cluster_3{
    style=filled;
    color=lightgrey;
    "node1" -> "yhat"
    "node2" -> "yhat"
    label = "Layer 2"
}
''')

3단계

net3.compile(loss='mse',optimizer=tf.optimizers.SGD(0.1))

4단계

net3.fit(x,y,epochs=1000,verbose=0, batch_size=N)

<keras.callbacks.History at 0x7f3aa61ba560>

- 결과확인

net3.weights

[<tf.Variable 'dense_10/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[ 1.6352799 , -0.85507524]], dtype=float32)>,
 <tf.Variable 'dense_10/bias:0' shape=(2,) dtype=float32, numpy=array([-0.08284465,  0.85552216], dtype=float32)>,
 <tf.Variable 'dense_11/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[ 1.6328746],
        [-1.2001747]], dtype=float32)>,
 <tf.Variable 'dense_11/bias:0' shape=(1,) dtype=float32, numpy=array([1.0253307], dtype=float32)>]

plt.plot(x,y,'.') 
plt.plot(x,net3(x),'--')

[<matplotlib.lines.Line2D at 0x7f3a9c384040>]

- 분석

plt.plot(x,y,'.') 
plt.plot(x,l1(x),'--')

[<matplotlib.lines.Line2D at 0x7f3a9c2bdde0>,
 <matplotlib.lines.Line2D at 0x7f3a9c2be020>]

plt.plot(x,y,'.') 
plt.plot(x,a1(l1(x)),'--')

[<matplotlib.lines.Line2D at 0x7f3a9c131870>,
 <matplotlib.lines.Line2D at 0x7f3a9c131ab0>]

plt.plot(x,y,'.') 
plt.plot(x,l2(a1(l1(x))),'--')

[<matplotlib.lines.Line2D at 0x7f3a9c1ad480>]

- 마지막 2개의 그림을 분석

l2.weights

[<tf.Variable 'dense_11/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[ 1.6328746],
        [-1.2001747]], dtype=float32)>,
 <tf.Variable 'dense_11/bias:0' shape=(1,) dtype=float32, numpy=array([1.0253307], dtype=float32)>]

fig, (ax1,ax2,ax3) = plt.subplots(1,3) 
fig.set_figwidth(12) 
ax1.plot(x,y,'.')
ax1.plot(x,a1(l1(x))[:,0],'--r')
ax1.plot(x,a1(l1(x))[:,1],'--b')
ax2.plot(x,y,'.')
ax2.plot(x,a1(l1(x))[:,0]*1.6328746,'--r')
ax2.plot(x,a1(l1(x))[:,1]*(-1.2001747)+1.0253307,'--b')
ax3.plot(x,y,'.')
ax3.plot(x,a1(l1(x))[:,0]*1.6328746+a1(l1(x))[:,1]*(-1.2001747)+1.0253307,'--')

[<matplotlib.lines.Line2D at 0x7f3a71028c40>]

풀이3의 실패

tf.random.set_seed(43054) 
## 1단계
net3 = tf.keras.Sequential() 
## 2단계
net3.add(tf.keras.layers.Dense(2))
net3.add(tf.keras.layers.Activation('relu')) 
net3.add(tf.keras.layers.Dense(1))
## 3단계 
net3.compile(optimizer=tf.optimizers.SGD(0.1),loss='mse')
## 4단계 
net3.fit(x,y,epochs=1000,verbose=0,batch_size=N)

<keras.callbacks.History at 0x7f3a70c237c0>

plt.plot(x,y,'.')
plt.plot(x,net3(x),'--')

[<matplotlib.lines.Line2D at 0x7f3a7032f700>]

- 엥? 에폭이 부족한가?

net3.fit(x,y,epochs=10000,verbose=0,batch_size=N)
plt.plot(x,y,'.')
plt.plot(x,net3(x),'--')

[<matplotlib.lines.Line2D at 0x7f3a70278160>]

- 실패분석

l1,a1,l2 = net3.layers

l2.weights

[<tf.Variable 'dense_13/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[0.65121335],
        [1.8592643 ]], dtype=float32)>,
 <tf.Variable 'dense_13/bias:0' shape=(1,) dtype=float32, numpy=array([-0.60076195], dtype=float32)>]

fig, (ax1,ax2,ax3,ax4) = plt.subplots(1,4) 
fig.set_figwidth(16) 
ax1.plot(x,y,'.')
ax1.plot(x,l1(x)[:,0],'--r')
ax1.plot(x,l1(x)[:,1],'--b')
ax2.plot(x,y,'.')
ax2.plot(x,a1(l1(x))[:,0],'--r')
ax2.plot(x,a1(l1(x))[:,1],'--b')
ax3.plot(x,y,'.')
ax3.plot(x,a1(l1(x))[:,0]*0.65121335,'--r')
ax3.plot(x,a1(l1(x))[:,1]*(1.8592643)+(-0.60076195),'--b')
ax4.plot(x,y,'.')
ax4.plot(x,a1(l1(x))[:,0]*0.65121335+a1(l1(x))[:,1]*(1.8592643)+(-0.60076195),'--')

[<matplotlib.lines.Line2D at 0x7f3a7081c130>]

보니까 빨간색선이 하는 역할을 없음
그런데 생각해보니까 이 상황에서는 빨간색선이 할수 있는 일이 별로 없음
왜? 지금은 나름 파란색선에 의해서 최적화가 된 상태임 $\to$ 빨간선이 뭔가 하려고하면 최적화된 상태가 깨질 수 있음 (loss 증가)
즉 이 상황 자체가 나름 최적회된 상태이다. 이러한 현상을 "global minimum을 찾지 못하고 local minimum에 빠졌다"라고 표현한다.

확인:

net3.weights

[<tf.Variable 'dense_12/kernel:0' shape=(1, 2) dtype=float32, numpy=array([[-0.03077251,  1.8713338 ]], dtype=float32)>,
 <tf.Variable 'dense_12/bias:0' shape=(2,) dtype=float32, numpy=array([-0.04834982,  0.3259186 ], dtype=float32)>,
 <tf.Variable 'dense_13/kernel:0' shape=(2, 1) dtype=float32, numpy=
 array([[0.65121335],
        [1.8592643 ]], dtype=float32)>,
 <tf.Variable 'dense_13/bias:0' shape=(1,) dtype=float32, numpy=array([-0.60076195], dtype=float32)>]

W1= tf.Variable(tnp.array([[-0.03077251,  1.8713338 ]]))
b1= tf.Variable(tnp.array([-0.04834982,  0.3259186 ]))
W2= tf.Variable(tnp.array([[0.65121335],[1.8592643 ]]))
b2= tf.Variable(tnp.array([-0.60076195]))

with tf.GradientTape() as tape: 
    u = tf.constant(x) @ W1 + b1 
    v = tf.nn.relu(u) 
    yhat = v@W2 + b2 
    loss = tf.losses.mse(y,yhat)

tape.gradient(loss,[W1,b1,W2,b2])

[<tf.Tensor: shape=(1, 2), dtype=float64, numpy=array([[ 0.00000000e+00, -4.77330119e-05]])>,
 <tf.Tensor: shape=(2,), dtype=float64, numpy=array([0.0000000e+00, 3.1478608e-06])>,
 <tf.Tensor: shape=(2, 1), dtype=float64, numpy=
 array([[ 0.00000000e+00],
        [-4.74910706e-05]])>,
 <tf.Tensor: shape=(1,), dtype=float64, numpy=array([-2.43031263e-05])>]

예상대로 계수값이 거의 다 0이다.

풀이4: 노드수를 더 추가한다면?

- 노드수를 더 추가해보면 어떻게 될까? (주황색 점선이 더 여러개 있다면?)

gv('''
subgraph cluster_1{
    style=filled;
    color=lightgrey;
    "X" 
    label = "Layer 0"
}
subgraph cluster_2{
    style=filled;
    color=lightgrey;
    "X" -> "node1"
    "X" -> "node2"
    "X" -> "..."
    "X" -> "node512"
    label = "Layer 1: relu"
}
subgraph cluster_3{
    style=filled;
    color=lightgrey;
    "node1" -> "yhat"
    "node2" -> "yhat"
    "..." -> "yhat"
    "node512" -> "yhat"
    label = "Layer 2"
}
''')

tf.random.set_seed(43056)
net4= tf.keras.Sequential()
net4.add(tf.keras.layers.Dense(512,activation='relu')) # 이렇게 해도됩니다. 
net4.add(tf.keras.layers.Dense(1))         
net4.compile(loss='mse',optimizer=tf.optimizers.SGD(0.1)) 
net4.fit(x,y,epochs=1000,verbose=0,batch_size=N)

<keras.callbacks.History at 0x7f34ac815d20>

plt.plot(x,y,'.')
plt.plot(x,net4(x),'--')

[<matplotlib.lines.Line2D at 0x7f34ac6d26b0>]

잘된다..
한두개의 노드가 역할을 못해도 다른노드들이 잘 보완해주는듯!

- 노드수가 많으면 무조건 좋다? -> 대부분 나쁘지 않음. 그런데 종종 맞추지 말아야할것도 맞춤.. (overfit)

np.random.seed(43052)
N=100 
_x = np.linspace(0,1,N).reshape(N,1) 
_y = np.random.normal(loc=0,scale=0.001,size=(N,1))
plt.plot(_x,_y)

[<matplotlib.lines.Line2D at 0x7f34ac4202b0>]

tf.random.set_seed(43052) 
net4 = tf.keras.Sequential()
net4.add(tf.keras.layers.Dense(512,activation='relu'))
net4.add(tf.keras.layers.Dense(1))
net4.compile(loss='mse',optimizer=tf.optimizers.SGD(0.5))
net4.fit(_x,_y,epochs=1000,verbose=0,batch_size=N)

<keras.callbacks.History at 0x7f34ac2f3640>

plt.plot(_x,_y)
plt.plot(_x,net4(_x),'--')

[<matplotlib.lines.Line2D at 0x7f34ac07a1a0>]

이 예제는 추후 다시 공부할 예정

Logistic regression

motive

- 현실에서 이런 경우가 많음

$x$가 커질수록 (혹은 작아질수록) 성공확률이 올라간다.

- 이러한 모형은 아래와 같이 설계할 수 있음 <-- 외우세요!!

$y_i \sim Ber(\pi_i)$, where $\pi_i=\frac{\exp(w_0+w_1x_i)}{1+\exp(w_0+w_1x_i)}$
$\hat{y}_i =\frac{\exp(\hat{w}_0+\hat{w}_1x_i)}{1+\exp(\hat{w}_0+\hat{w}_1x_i)}=\frac{1}{1+exp(-\hat{w}_0-\hat{w}_1x_i)}$
$loss=-\frac{1}{n}\sum_{i=1}^{n}\big(y_i\log(\hat{y}_i)+(1-y_i)\log(1-\hat{y}_i)\big)$

- 위와 같은 손실함수를 BCEloss라고 부른다. (BCE는 Binary Cross Entropy의 약자)

예제

N = 2000

x = tnp.linspace(-1,1,N).reshape(N,1)
w0 = -1 
w1 = 5 
u = w0 + x*w1 
#v = tf.constant(np.exp(u)/(1+np.exp(u))) # v=πi 
v = tf.nn.sigmoid(u) 
y = tf.constant(np.random.binomial(1,v),dtype=tf.float64)

plt.plot(x,y,'.',alpha=0.02)
plt.plot(x,v,'--r')

[<matplotlib.lines.Line2D at 0x7f349e22fa60>]

- 이 아키텍처(yhat을 얻어내는 과정)를 다어어그램으로 나타내면 아래와 같다.

gv('''
subgraph cluster_1{
    style=filled;
    color=lightgrey;
    "x" 
    label = "Layer 0"
}
subgraph cluster_2{
    style=filled;
    color=lightgrey;
    "x" -> "x*w, bias=True"[label="*w"]
    "x*w, bias=True" -> "yhat"[label="sigmoid"]
    label = "Layer 1"
}
''')

- 또는 간단하게 아래와 같이 쓸 수 있다.

gv('''
subgraph cluster_1{
    style=filled;
    color=lightgrey;
    x
    label = "Layer 0"
}
subgraph cluster_2{
    style=filled;
    color=lightgrey;
    x -> "node1=yhat"
    label = "Layer 1: sigmoid"
}
''')

- 케라스를 이용하여 적합을 해보면

$loss=-\frac{1}{n}\sum_{i=1}^{n}\big(y_i\log(\hat{y}_i)+(1-y_i)\log(1-\hat{y}_i)\big)$

tf.random.set_seed(43052)
net = tf.keras.Sequential() 
net.add(tf.keras.layers.Dense(1,activation='sigmoid'))
bceloss_fn = lambda y,yhat: -tf.reduce_mean(y*tnp.log(yhat) + (1-y)*tnp.log(1-yhat))
net.compile(loss=bceloss_fn, optimizer=tf.optimizers.SGD(0.1))
net.fit(x,y,epochs=1000,verbose=0,batch_size=N)

<keras.callbacks.History at 0x7f349c638670>

net.weights

[<tf.Variable 'dense_28/kernel:0' shape=(1, 1) dtype=float32, numpy=array([[4.1423755]], dtype=float32)>,
 <tf.Variable 'dense_28/bias:0' shape=(1,) dtype=float32, numpy=array([-0.820938], dtype=float32)>]

plt.plot(x,y,'.',alpha=0.1)
plt.plot(x,v,'--r')
plt.plot(x,net(x),'--b')

[<matplotlib.lines.Line2D at 0x7f349df177c0>]

MSE loss?

- mse loss를 쓰면 왜 안되는지?

tf.random.set_seed(43052)
net = tf.keras.Sequential() 
net.add(tf.keras.layers.Dense(1,activation='sigmoid'))
mseloss_fn = lambda y,yhat: tf.reduce_mean((y-yhat)**2)
net.compile(loss=mseloss_fn, optimizer=tf.optimizers.SGD(0.1))
net.fit(x,y,epochs=1000,verbose=0,batch_size=N)

<keras.callbacks.History at 0x7f349df8cc10>

plt.plot(x,y,'.',alpha=0.1)
plt.plot(x,v,'--r')
plt.plot(x,net(x),'--b')

[<matplotlib.lines.Line2D at 0x7f349ce42230>]

일단 BCE loss와 비교해보니까 동일 초기값, 동일 epochs에서 적합이 별로임

MSE loss vs BCE loss

- MSEloss, BCEloss의 시각화

w0, w1 = np.meshgrid(np.arange(-10,3,0.2), np.arange(-1,10,0.2), indexing='ij')
w0, w1 = w0.reshape(-1), w1.reshape(-1)

def loss_fn1(w0,w1):
    u = w0+w1*x 
    yhat = np.exp(u)/(np.exp(u)+1)
    return mseloss_fn(y,yhat) 

def loss_fn2(w0,w1):
    u = w0+w1*x 
    yhat = np.exp(u)/(np.exp(u)+1)
    return bceloss_fn(y,yhat) 

loss1 = list(map(loss_fn1,w0,w1))
loss2 = list(map(loss_fn2,w0,w1))

fig = plt.figure()
fig.set_figwidth(9)
fig.set_figheight(9)
ax1=fig.add_subplot(1,2,1,projection='3d')
ax2=fig.add_subplot(1,2,2,projection='3d')
ax1.elev=15
ax2.elev=15
ax1.azim=75
ax2.azim=75
ax1.scatter(w0,w1,loss1,s=0.1)
ax2.scatter(w0,w1,loss2,s=0.1)

<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x7f34a457df00>

왼쪽곡면(MSEloss)보다 오른쪽곡면(BCEloss)이 좀더 예쁘게 생김 -> 오른쪽 곡면에서 더 학습이 잘될것 같음

학습과정 시각화예시1

- 파라메터학습과정 시각화 // 옵티마이저: SGD, 초기값: (w0,w1) = (-3.0,-1.0)

(1) 데이터정리

X = tf.concat([tf.ones(N,dtype=tf.float64).reshape(N,1),x],axis=1)
X

<tf.Tensor: shape=(2000, 2), dtype=float64, numpy=
array([[ 1.       , -1.       ],
       [ 1.       , -0.9989995],
       [ 1.       , -0.997999 ],
       ...,
       [ 1.       ,  0.997999 ],
       [ 1.       ,  0.9989995],
       [ 1.       ,  1.       ]])>

(2) 1ter돌려봄

net_mse = tf.keras.Sequential()
net_mse.add(tf.keras.layers.Dense(1,use_bias=False,activation='sigmoid')) 
net_mse.compile(optimizer=tf.optimizers.SGD(0.1),loss=mseloss_fn) 
net_mse.fit(X,y,epochs=1,batch_size=N)

1/1 [==============================] - 0s 66ms/step - loss: 0.1554

<keras.callbacks.History at 0x7f349dad3760>

net_bce = tf.keras.Sequential()
net_bce.add(tf.keras.layers.Dense(1,use_bias=False,activation='sigmoid')) 
net_bce.compile(optimizer=tf.optimizers.SGD(0.1),loss=bceloss_fn) 
net_bce.fit(X,y,epochs=1,batch_size=N)

1/1 [==============================] - 0s 76ms/step - loss: 0.9265

<keras.callbacks.History at 0x7f349da22fe0>

net_mse.get_weights(), net_bce.get_weights()

([array([[-0.5575908],
         [ 1.1560522]], dtype=float32)],
 [array([[-0.8477989 ],
         [-0.91781974]], dtype=float32)])

net_mse.set_weights([tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)])
net_bce.set_weights([tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)])

net_mse.get_weights(), net_bce.get_weights()

([array([[-3.],
         [-1.]], dtype=float32)],
 [array([[-3.],
         [-1.]], dtype=float32)])

(4) 학습과정기록: 15에폭마다 기록

What_mse = tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)
What_bce = tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)

for k in range(29): 
    net_mse.fit(X,y,epochs=15,batch_size=N,verbose=0)
    net_bce.fit(X,y,epochs=15,batch_size=N,verbose=0)
    What_mse = tf.concat([What_mse,net_mse.weights[0]],axis=1) 
    What_bce = tf.concat([What_bce,net_bce.weights[0]],axis=1)

(5) 시각화

from matplotlib import animation
plt.rcParams["animation.html"] = "jshtml"

fig = plt.figure()
fig.set_figwidth(6)
fig.set_figheight(6)
fig.suptitle("SGD, Winit=(-3,-1)")
ax1=fig.add_subplot(2,2,1,projection='3d')
ax2=fig.add_subplot(2,2,2,projection='3d')
ax1.elev=15;ax2.elev=15;ax1.azim=75;ax2.azim=75
ax3=fig.add_subplot(2,2,3)
ax4=fig.add_subplot(2,2,4)

ax1.scatter(w0,w1,loss1,s=0.1);ax1.scatter(-1,5,loss_fn1(-1,5),color='red',marker='*',s=200)
ax2.scatter(w0,w1,loss2,s=0.1);ax2.scatter(-1,5,loss_fn2(-1,5),color='red',marker='*',s=200)

ax3.plot(x,y,','); ax3.plot(x,v,'--r'); 
line3, = ax3.plot(x,1/(1+np.exp(-X@What_mse[:,0])),'--b')
ax4.plot(x,y,','); ax4.plot(x,v,'--r')
line4, = ax4.plot(x,1/(1+np.exp(-X@What_bce[:,0])),'--b')

def animate(i):
    _w0_mse,_w1_mse = What_mse[:,i]
    _w0_bce,_w1_bce = What_bce[:,i]
    ax1.scatter(_w0_mse, _w1_mse, loss_fn1(_w0_mse, _w1_mse),color='gray')
    ax2.scatter(_w0_bce, _w1_bce, loss_fn2(_w0_bce, _w1_bce),color='gray')
    line3.set_ydata(1/(1+np.exp(-X@What_mse[:,i])))
    line4.set_ydata(1/(1+np.exp(-X@What_bce[:,i])))

ani = animation.FuncAnimation(fig, animate, frames=30)
plt.close()
ani

학습과정 시각화예시2

- 파라메터학습과정 시각화 // 옵티마이저: Adam, 초기값: (w0,w1) = (-3.0,-1.0)

(1) 데이터정리

X = tf.concat([tf.ones(N,dtype=tf.float64).reshape(N,1),x],axis=1)
X

<tf.Tensor: shape=(2000, 2), dtype=float64, numpy=
array([[ 1.       , -1.       ],
       [ 1.       , -0.9989995],
       [ 1.       , -0.997999 ],
       ...,
       [ 1.       ,  0.997999 ],
       [ 1.       ,  0.9989995],
       [ 1.       ,  1.       ]])>

(2) 1ter돌려봄

net_mse = tf.keras.Sequential()
net_mse.add(tf.keras.layers.Dense(1,use_bias=False,activation='sigmoid')) 
net_mse.compile(optimizer=tf.optimizers.Adam(0.1),loss=mseloss_fn) 
net_mse.fit(X,y,epochs=1,batch_size=N)

1/1 [==============================] - 0s 79ms/step - loss: 0.2311

<keras.callbacks.History at 0x7f349d524070>

net_bce = tf.keras.Sequential()
net_bce.add(tf.keras.layers.Dense(1,use_bias=False,activation='sigmoid')) 
net_bce.compile(optimizer=tf.optimizers.Adam(0.1),loss=bceloss_fn) 
net_bce.fit(X,y,epochs=1,batch_size=N)

1/1 [==============================] - 0s 94ms/step - loss: 0.5606

<keras.callbacks.History at 0x7f349d526fb0>

net_mse.get_weights(), net_bce.get_weights()

([array([[0.07441761],
         [0.40206426]], dtype=float32)],
 [array([[-0.86062825],
         [ 0.9297301 ]], dtype=float32)])

net_mse.set_weights([tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)])
net_bce.set_weights([tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)])

net_mse.get_weights(), net_bce.get_weights()

([array([[-3.],
         [-1.]], dtype=float32)],
 [array([[-3.],
         [-1.]], dtype=float32)])

(4) 학습과정기록: 15에폭마다 기록

What_mse = tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)
What_bce = tnp.array([[-3.0 ],[ -1.0]],dtype=tf.float32)

for k in range(29): 
    net_mse.fit(X,y,epochs=15,batch_size=N,verbose=0)
    net_bce.fit(X,y,epochs=15,batch_size=N,verbose=0)
    What_mse = tf.concat([What_mse,net_mse.weights[0]],axis=1) 
    What_bce = tf.concat([What_bce,net_bce.weights[0]],axis=1)

(5) 시각화

from matplotlib import animation
plt.rcParams["animation.html"] = "jshtml"

fig = plt.figure()
fig.set_figwidth(6)
fig.set_figheight(6)
fig.suptitle("Adam, Winit=(-3,-1)")
ax1=fig.add_subplot(2,2,1,projection='3d')
ax2=fig.add_subplot(2,2,2,projection='3d')
ax1.elev=15;ax2.elev=15;ax1.azim=75;ax2.azim=75
ax3=fig.add_subplot(2,2,3)
ax4=fig.add_subplot(2,2,4)

ax1.scatter(w0,w1,loss1,s=0.1);ax1.scatter(-1,5,loss_fn1(-1,5),color='red',marker='*',s=200)
ax2.scatter(w0,w1,loss2,s=0.1);ax2.scatter(-1,5,loss_fn2(-1,5),color='red',marker='*',s=200)

ax3.plot(x,y,','); ax3.plot(x,v,'--r'); 
line3, = ax3.plot(x,1/(1+np.exp(-X@What_mse[:,0])),'--b')
ax4.plot(x,y,','); ax4.plot(x,v,'--r')
line4, = ax4.plot(x,1/(1+np.exp(-X@What_bce[:,0])),'--b')

def animate(i):
    _w0_mse,_w1_mse = What_mse[:,i]
    _w0_bce,_w1_bce = What_bce[:,i]
    ax1.scatter(_w0_mse, _w1_mse, loss_fn1(_w0_mse, _w1_mse),color='gray')
    ax2.scatter(_w0_bce, _w1_bce, loss_fn2(_w0_bce, _w1_bce),color='gray')
    line3.set_ydata(1/(1+np.exp(-X@What_mse[:,i])))
    line4.set_ydata(1/(1+np.exp(-X@What_bce[:,i])))

ani = animation.FuncAnimation(fig, animate, frames=30)
plt.close()
ani

학습과정 시각화예시3

- 파라메터학습과정 시각화 // 옵티마이저: Adam, 초기값: (w0,w1) = (-10.0,-1.0)

(1) 데이터정리

X = tf.concat([tf.ones(N,dtype=tf.float64).reshape(N,1),x],axis=1)
X

<tf.Tensor: shape=(2000, 2), dtype=float64, numpy=
array([[ 1.       , -1.       ],
       [ 1.       , -0.9989995],
       [ 1.       , -0.997999 ],
       ...,
       [ 1.       ,  0.997999 ],
       [ 1.       ,  0.9989995],
       [ 1.       ,  1.       ]])>

(2) 1ter돌려봄

net_mse = tf.keras.Sequential()
net_mse.add(tf.keras.layers.Dense(1,use_bias=False,activation='sigmoid')) 
net_mse.compile(optimizer=tf.optimizers.Adam(0.1),loss=mseloss_fn) 
net_mse.fit(X,y,epochs=1,batch_size=N)

1/1 [==============================] - 0s 75ms/step - loss: 0.2175

<keras.callbacks.History at 0x7f349c9d9e40>

net_bce = tf.keras.Sequential()
net_bce.add(tf.keras.layers.Dense(1,use_bias=False,activation='sigmoid')) 
net_bce.compile(optimizer=tf.optimizers.Adam(0.1),loss=bceloss_fn) 
net_bce.fit(X,y,epochs=1,batch_size=N)

1/1 [==============================] - 0s 82ms/step - loss: 0.5323

<keras.callbacks.History at 0x7f349caa17b0>

net_mse.get_weights(), net_bce.get_weights()

([array([[-0.02143217],
         [ 0.484821  ]], dtype=float32)],
 [array([[-0.8675074],
         [ 1.1268172]], dtype=float32)])

net_mse.set_weights([tnp.array([[-10.0 ],[ -1.0]],dtype=tf.float32)])
net_bce.set_weights([tnp.array([[-10.0 ],[ -1.0]],dtype=tf.float32)])

net_mse.get_weights(), net_bce.get_weights()

([array([[-10.],
         [ -1.]], dtype=float32)],
 [array([[-10.],
         [ -1.]], dtype=float32)])

(4) 학습과정기록: 15에폭마다 기록

What_mse = tnp.array([[-10.0 ],[ -1.0]],dtype=tf.float32)
What_bce = tnp.array([[-10.0 ],[ -1.0]],dtype=tf.float32)

for k in range(29): 
    net_mse.fit(X,y,epochs=15,batch_size=N,verbose=0)
    net_bce.fit(X,y,epochs=15,batch_size=N,verbose=0)
    What_mse = tf.concat([What_mse,net_mse.weights[0]],axis=1) 
    What_bce = tf.concat([What_bce,net_bce.weights[0]],axis=1)

(5) 시각화

from matplotlib import animation
plt.rcParams["animation.html"] = "jshtml"

fig = plt.figure()
fig.set_figwidth(6)
fig.set_figheight(6)
fig.suptitle("Adam, Winit=(-10,-1)")
ax1=fig.add_subplot(2,2,1,projection='3d')
ax2=fig.add_subplot(2,2,2,projection='3d')
ax1.elev=15;ax2.elev=15;ax1.azim=75;ax2.azim=75
ax3=fig.add_subplot(2,2,3)
ax4=fig.add_subplot(2,2,4)

ax1.scatter(w0,w1,loss1,s=0.1);ax1.scatter(-1,5,loss_fn1(-1,5),color='red',marker='*',s=200)
ax2.scatter(w0,w1,loss2,s=0.1);ax2.scatter(-1,5,loss_fn2(-1,5),color='red',marker='*',s=200)

ax3.plot(x,y,','); ax3.plot(x,v,'--r'); 
line3, = ax3.plot(x,1/(1+np.exp(-X@What_mse[:,0])),'--b')
ax4.plot(x,y,','); ax4.plot(x,v,'--r')
line4, = ax4.plot(x,1/(1+np.exp(-X@What_bce[:,0])),'--b')

def animate(i):
    _w0_mse,_w1_mse = What_mse[:,i]
    _w0_bce,_w1_bce = What_bce[:,i]
    ax1.scatter(_w0_mse, _w1_mse, loss_fn1(_w0_mse, _w1_mse),color='gray')
    ax2.scatter(_w0_bce, _w1_bce, loss_fn2(_w0_bce, _w1_bce),color='gray')
    line3.set_ydata(1/(1+np.exp(-X@What_mse[:,i])))
    line4.set_ydata(1/(1+np.exp(-X@What_bce[:,i])))

ani = animation.FuncAnimation(fig, animate, frames=30)
plt.close()
ani

아무리 아담이라고 해도 이건 힘듬

- discussion

mse_loss는 경우에 따라서 엄청 수렴속도가 느릴수도 있음.
근본적인 문제점: mse_loss일 경우 loss function의 곡면이 예쁘지 않음. (전문용어로 convex가 아니라고 말함)
좋은 옵티마지어를 이용하면 mse_loss일 경우에도 수렴속도를 올릴 수 있음 (학습과정 시각화예시2). 그렇지만 이는 근본적인 해결책은 아님. (학습과정 시각화예시3)

- 요약: 왜 logistic regression에서 mse loss를 쓰면 안되는가?

mse loss를 사용하면 손실함수가 convex하지 않으니까!
그리고 bce loss를 사용하면 손실함수가 convex하니까!