13wk-2: (Reinforcement Learning) – Bandit Environment Design and Solution, 4x4 Grid World Game Description, Environment Implementation, (Random) Agent Implementation
1. Lecture Video
2. Imports
import gymnasium as gym
#---#
import numpy as np
import collections
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython
3. Bandit Environment Design and Solution
A. Rough, Concept-Only Practice
action_space = [0,1]
actions_deque = collections.deque(maxlen=500)
rewards_deque = collections.deque(maxlen=500)
#---#
for _ in range(10):
    action = np.random.choice(action_space)
    if action == 1:
        reward = 10
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
actions_deque
deque([0, 1, 0, 1, 1, 1, 0, 1, 0, 1], maxlen=500)
rewards_deque
deque([1, 10, 1, 10, 10, 10, 1, 10, 1, 10], maxlen=500)
actions_numpy = np.array(actions_deque)
rewards_numpy = np.array(rewards_deque)
q0 = rewards_numpy[actions_numpy == 0].mean()  # average reward observed after action 0
q1 = rewards_numpy[actions_numpy == 1].mean()  # average reward observed after action 1
q_table = np.array([q0,q1])
q_table
array([ 1., 10.])
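Recomputing the averages from the whole deque each time is fine at this scale. An equivalent way, shown here only as an illustration (the names q_est, counts and update are mine, not part of the lecture code), is to maintain a running mean incrementally, \(Q \leftarrow Q + (r - Q)/n\):

q_est = np.zeros(2)    # running estimate of each action's value
counts = np.zeros(2)   # number of times each action has been tried

def update(action, reward):
    # incremental mean: Q_new = Q_old + (reward - Q_old) / n
    counts[action] += 1
    q_est[action] += (reward - q_est[action]) / counts[action]

for _ in range(10):
    a = np.random.choice([0, 1])
    r = 10 if a == 1 else 1
    update(a, r)

q_est   # approaches [1., 10.] for the arms that were actually sampled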
action = q_table.argmax()
for _ in range(5):
    action = q_table.argmax()
    if action == 1:
        reward = 10
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
    actions_numpy = np.array(actions_deque)
    rewards_numpy = np.array(rewards_deque)
    q0 = rewards_numpy[actions_numpy == 0].mean()
    q1 = rewards_numpy[actions_numpy == 1].mean()
    q_table = np.array([q0,q1])
actions_numpy
array([0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])
rewards_numpy
array([ 1, 10, 1, 10, 10, 10, 1, 10, 1, 10, 10, 10, 10, 10, 10])
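One caveat: `rewards_numpy[actions_numpy == 0].mean()` only works because both actions happen to appear among the first 10 random pulls. If an action had never been tried, the mean of an empty slice would be nan (with a runtime warning). A small guard, sketched here purely as an illustration (the helper name empirical_q is not part of the lecture code):

def empirical_q(actions, rewards, n_actions=2):
    # mean reward per action, defaulting to 0.0 for actions never tried
    actions = np.asarray(actions)
    rewards = np.asarray(rewards)
    q = np.zeros(n_actions)
    for a in range(n_actions):
        mask = (actions == a)
        if mask.any():
            q[a] = rewards[mask].mean()
    return q

empirical_q(actions_deque, rewards_deque)   # array([ 1., 10.]) for the data above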
B. Class-Based Implementation
class Bandit:
    def __init__(self):
        self.reward = None
    def step(self,action):
        if action == 0:
            self.reward = 1
        else:
            self.reward = 10
        return self.reward
class Agent:
    def __init__(self):
        pass
    def act(self):
        # if we have fewer than 20 experiences --> random action
        # if we have 20 or more experiences --> action = q_table.argmax()
        pass
    def save_experience(self):
        # store the data
        pass
    def learn(self):
        # update the q_table
        pass
class Agent:
    def __init__(self):
        self.action = None
        self.reward = None
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = [0,1]
        self.q_table = None
        self.n_experience = 0
    def act(self):
        if self.n_experience < 20:
            self.action = np.random.choice(self.action_space)
        else:
            self.action = self.q_table.argmax()
        print(f"Pressed button {self.action}!")
    def save_experience(self):
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience = self.n_experience + 1
    def learn(self):
        if self.n_experience < 20:
            pass
        else:
            # update the q_table
            actions = np.array(self.actions)
            rewards = np.array(self.rewards)
            q0 = rewards[actions == 0].mean()  # average reward when action 0 was taken
            q1 = rewards[actions == 1].mean()  # average reward when action 1 was taken
            self.q_table = np.array([q0,q1])
env = Bandit()
player = Agent()
for _ in range(100):
    # step1: the agent acts
    player.act()
    # step2: action --> state, reward
    player.reward = env.step(player.action)
    # step3: the agent accumulates data and learns
    player.save_experience()  # store the data
    player.learn()            # learn from the stored data
    #--- decide whether to stop training ---#
    if player.n_experience < 20:
        pass
    else:
        if np.array(player.rewards)[-20:].mean() > 9.5:
            print("---Game cleared---")
            break
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
---Game cleared---
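The agent above explores only during its first 20 pulls and is purely greedy afterwards, which is enough for this two-armed bandit. A common refinement is ε-greedy selection, where the agent keeps exploring with a small probability. The sketch below is an optional variation, not part of the lecture code (the class name EpsilonGreedyAgent and the eps parameter are mine):

class EpsilonGreedyAgent(Agent):
    def __init__(self, eps=0.1):
        super().__init__()
        self.eps = eps
    def act(self):
        # explore during warm-up, and afterwards with probability eps
        if (self.n_experience < 20) or (np.random.rand() < self.eps):
            self.action = np.random.choice(self.action_space)
        else:
            self.action = self.q_table.argmax()
        print(f"Pressed button {self.action}!")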
4. Preliminaries: gym.spaces
ref: https://gymnasium.farama.org/
- Example 1
action_space = gym.spaces.Discrete(4)
action_space
Discrete(4)
[action_space.sample() for _ in range(5)]
[0, 1, 3, 2, 3]
0 in action_space
True
4 in action_space
False
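`sample()` draws at random, so the values above will differ from run to run. If reproducibility matters, the space can be seeded; a small illustration (the seed value 42 is arbitrary):

action_space = gym.spaces.Discrete(4)
action_space.seed(42)                        # fix the RNG inside the space
[action_space.sample() for _ in range(5)]    # the same sequence on every run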
- Example 2
state_space = gym.spaces.MultiDiscrete([4,4])
state_space
MultiDiscrete([4 4])
[state_space.sample() for _ in range(5)]
[array([1, 3]), array([2, 0]), array([1, 2]), array([0, 2]), array([2, 0])]
np.array([0,1]) in state_space
True
np.array([3,3]) in state_space
True
np.array([3,4]) in state_space
False
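The `in` checks above are shorthand for the space's `contains()` method, and this is exactly the mechanism the GridWorld environment below uses to detect when the agent has walked off the grid. A quick illustration:

state_space = gym.spaces.MultiDiscrete([4,4])
state_space.contains(np.array([3, 3]))    # True : still on the 4x4 grid
state_space.contains(np.array([4, 0]))    # False: first index out of range
state_space.contains(np.array([-1, 2]))   # False: negative index is outside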
5. 4x4 Grid World Game Description
A. Game Description
- Problem description: a game in which an agent that moves up/down/left/right on a 4x4 grid world must reach the goal cell
- A demo is worth a thousand words: https://claude.ai/public/artifacts/76e13820-2b51-4e7e-a514-00190de17c45 (source: Claude)
- Key variables used in GridWorld (see the sketch after this list)
State: each grid cell is one state, and the agent occupies exactly one of these states at any time.
Action: from its current state, the agent takes one of the actions up, down, left, or right to move to the next state.
Reward: the reward obtained when the agent takes a particular action in its current state.
Terminated: a flag indicating that the current episode has ended.
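To make these variables concrete, one transition of the game can be written out by hand, using the state/action encoding of the environment implemented in Section 6 (the numbers below are an illustrative example, not computed output):

state      = np.array([0, 0])   # State: the cell the agent currently occupies (row, col)
action     = 2                  # Action: 0=right, 1=left, 2=down, 3=up
next_state = np.array([1, 0])   # the cell reached after moving down
reward     = -1                 # Reward: -1 for an ordinary move (+100 at the goal, -10 off the grid)
terminated = False              # Terminated: the episode is not over yet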
B. Visualization
def show(states):
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr', alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    state_space = gym.spaces.MultiDiscrete([4,4])
    def update(t):
        if states[t] in state_space:
            s1,s2 = states[t]
            states[t] = [s2,s1]          # swap (row, col) to (x, y) for plotting
            sc.set_offsets(states[t])
        else:
            s1,s2 = states[t]
            # pull out-of-range coordinates back by half a cell so the marker stays visible
            s1 = s1 + 0.5 if s1 < 0 else (s1 - 0.5 if s1 > 3 else s1)
            s2 = s2 + 0.5 if s2 < 0 else (s2 - 0.5 if s2 > 3 else s2)
            states[t] = [s2,s1]
            sc.set_offsets(states[t])
    ani = FuncAnimation(fig, update, frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))
show([[0,0],[1,0],[2,0],[3,0],[4,0]]) # how to use show
6. 4x4 Grid World Environment Implementation
class GridWorld:
    def __init__(self):
        self.a2d = {
            0: np.array([0,1]),   # →
            1: np.array([0,-1]),  # ←
            2: np.array([1,0]),   # ↓
            3: np.array([-1,0])   # ↑
        }
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
    def step(self,action):
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            self.reward = 100          # reached the goal cell
            self.terminated = True
        elif self.state in self.state_space:
            self.reward = -1           # ordinary move inside the grid
            self.terminated = False
        else:
            self.reward = -10          # walked off the grid
            self.terminated = True
        print(
            f"action = {action}\t"
            f"state = {self.state - self.a2d[action]} -> {self.state}\t"
            f"reward = {self.reward}\t"
            f"terminated = {self.terminated}"
        )
        return self.state, self.reward, self.terminated
    def reset(self):
        self.state = np.array([0,0])
        self.terminated = False
        return self.state
env = GridWorld()
action_space = gym.spaces.Discrete(4)
for _ in range(50):
    action = action_space.sample()
    env.step(action)
    if env.terminated == True:
        env.reset()
        break
action = 1 state = [0 0] -> [ 0 -1] reward = -10 terminated = True
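The same random test can be wrapped into a small helper that rolls out one full episode and collects the visited states, so that the trajectory can be handed to `show`. This is only a convenience sketch; the function name rollout_random_episode is mine, not from the lecture:

def rollout_random_episode(env, max_steps=50):
    # roll out one episode with uniformly random actions and return the visited states
    action_space = gym.spaces.Discrete(4)
    states = [env.reset().tolist()]
    for _ in range(max_steps):
        state, reward, terminated = env.step(action_space.sample())
        states.append(state.tolist())
        if terminated:
            break
    return states

# show(rollout_random_episode(GridWorld()))   # example usage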
7. Implementing the “Agent \(\Leftrightarrow\) Environment” Interaction
- Functionality we want to implement (see the sketch after this list)
.act(): decide the action --> here it is just a random action
.save_experience(): store the data --> this is what we focus on for now
.learn(): learn from the data --> skipped for now
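For `.save_experience()`, the thing being stored is a sequence of transitions. The RandomAgent below keeps five parallel deques; an equivalent way to picture the data is one record per step, e.g. as a namedtuple. This is purely an illustration of the layout, not the lecture's implementation:

from collections import namedtuple

# hypothetical record type: one entry per interaction step
Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "terminated"])
Transition(state=[0, 0], action=2, reward=-1, next_state=[1, 0], terminated=False)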
class RandomAgent:
    def __init__(self):
        self.state = None
        self.action = None
        self.reward = None
        self.next_state = None
        self.terminated = None
        #---#
        self.states = collections.deque(maxlen=500)
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.next_states = collections.deque(maxlen=500)
        self.terminations = collections.deque(maxlen=500)
        #---#
        self.action_space = gym.spaces.Discrete(4)
        self.n_experience = 0
    def act(self):
        self.action = self.action_space.sample()
    def save_experience(self):
        self.states.append(self.state)
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.next_states.append(self.next_state)
        self.terminations.append(self.terminated)
        self.n_experience = self.n_experience + 1
    def learn(self):
        pass
player = RandomAgent()
env = GridWorld()
for t in range(50):
    # step1 -- the agent takes an action
    player.act()
    # step2 -- the environment looks at the agent's action and returns next_state, reward, terminated
    player.next_state, player.reward, player.terminated = env.step(player.action)
    # step3 -- the agent saves & learns
    player.save_experience()
    player.learn()
    # step4 -- next iteration
    player.state = player.next_state
    if env.terminated:
        player.state = env.reset()
        break
action = 3 state = [0 0] -> [-1 0] reward = -10 terminated = True
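At this point the agent has stored every transition that occurred before the episode ended, and the five deques can be inspected side by side. A small sketch (its output depends on the random actions actually drawn):

for s, a, r, s_next, done in zip(player.states, player.actions, player.rewards,
                                 player.next_states, player.terminations):
    print(s, a, r, s_next, done)
player.n_experience   # number of stored transitions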
scores = []
score = 0
for e in range(1,100):
    #--- episode start ---#
    while True:
        # step1 -- the agent takes an action
        player.act()
        # step2 -- the environment looks at the agent's action and returns next_state, reward, terminated
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3 -- the agent saves & learns
        player.save_experience()
        player.learn()
        # step4 -- next iteration
        if env.terminated:
            score = score + player.reward
            scores.append(score)
            score = 0
            player.state = env.reset()
            print(f"---에피소드{e}종료---")  # "--- episode {e} finished ---"
            break
        else:
            score = score + player.reward
            player.state = player.next_state
    #--- episode end ---#
    if scores[-1] > 0:
        break
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 termiated = True
---에피소드1종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드2종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드3종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드4종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드5종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 3 state = [0 1] -> [-1 1] reward = -10 termiated = True
---에피소드6종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드7종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 termiated = True
---에피소드8종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 2 state = [0 1] -> [1 1] reward = -1 termiated = False
action = 3 state = [1 1] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 3 state = [0 2] -> [-1 2] reward = -10 termiated = True
---에피소드9종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 termiated = True
---에피소드10종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드11종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드12종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드13종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드14종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 2 state = [1 0] -> [2 0] reward = -1 termiated = False
action = 0 state = [2 0] -> [2 1] reward = -1 termiated = False
action = 2 state = [2 1] -> [3 1] reward = -1 termiated = False
action = 3 state = [3 1] -> [2 1] reward = -1 termiated = False
action = 3 state = [2 1] -> [1 1] reward = -1 termiated = False
action = 3 state = [1 1] -> [0 1] reward = -1 termiated = False
action = 2 state = [0 1] -> [1 1] reward = -1 termiated = False
action = 1 state = [1 1] -> [1 0] reward = -1 termiated = False
action = 2 state = [1 0] -> [2 0] reward = -1 termiated = False
action = 0 state = [2 0] -> [2 1] reward = -1 termiated = False
action = 2 state = [2 1] -> [3 1] reward = -1 termiated = False
action = 0 state = [3 1] -> [3 2] reward = -1 termiated = False
action = 3 state = [3 2] -> [2 2] reward = -1 termiated = False
action = 1 state = [2 2] -> [2 1] reward = -1 termiated = False
action = 2 state = [2 1] -> [3 1] reward = -1 termiated = False
action = 2 state = [3 1] -> [4 1] reward = -10 termiated = True
---에피소드15종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 2 state = [1 0] -> [2 0] reward = -1 termiated = False
action = 3 state = [2 0] -> [1 0] reward = -1 termiated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 termiated = True
---에피소드16종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드17종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드18종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 termiated = True
---에피소드19종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드20종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드21종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 2 state = [0 1] -> [1 1] reward = -1 termiated = False
action = 2 state = [1 1] -> [2 1] reward = -1 termiated = False
action = 2 state = [2 1] -> [3 1] reward = -1 termiated = False
action = 3 state = [3 1] -> [2 1] reward = -1 termiated = False
action = 0 state = [2 1] -> [2 2] reward = -1 termiated = False
action = 0 state = [2 2] -> [2 3] reward = -1 termiated = False
action = 1 state = [2 3] -> [2 2] reward = -1 termiated = False
action = 3 state = [2 2] -> [1 2] reward = -1 termiated = False
action = 3 state = [1 2] -> [0 2] reward = -1 termiated = False
action = 0 state = [0 2] -> [0 3] reward = -1 termiated = False
action = 2 state = [0 3] -> [1 3] reward = -1 termiated = False
action = 0 state = [1 3] -> [1 4] reward = -10 termiated = True
---에피소드22종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 3 state = [0 1] -> [-1 1] reward = -10 termiated = True
---에피소드23종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드24종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드25종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 0 state = [1 0] -> [1 1] reward = -1 termiated = False
action = 0 state = [1 1] -> [1 2] reward = -1 termiated = False
action = 0 state = [1 2] -> [1 3] reward = -1 termiated = False
action = 0 state = [1 3] -> [1 4] reward = -10 termiated = True
---에피소드26종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 3 state = [0 1] -> [-1 1] reward = -10 termiated = True
---에피소드27종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드28종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드29종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드30종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드31종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드32종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드33종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드34종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드35종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 2 state = [0 2] -> [1 2] reward = -1 termiated = False
action = 3 state = [1 2] -> [0 2] reward = -1 termiated = False
action = 0 state = [0 2] -> [0 3] reward = -1 termiated = False
action = 2 state = [0 3] -> [1 3] reward = -1 termiated = False
action = 1 state = [1 3] -> [1 2] reward = -1 termiated = False
action = 1 state = [1 2] -> [1 1] reward = -1 termiated = False
action = 3 state = [1 1] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드36종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 0 state = [0 2] -> [0 3] reward = -1 termiated = False
action = 3 state = [0 3] -> [-1 3] reward = -10 termiated = True
---에피소드37종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 2 state = [1 0] -> [2 0] reward = -1 termiated = False
action = 3 state = [2 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드38종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드39종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 0 state = [1 0] -> [1 1] reward = -1 termiated = False
action = 0 state = [1 1] -> [1 2] reward = -1 termiated = False
action = 0 state = [1 2] -> [1 3] reward = -1 termiated = False
action = 1 state = [1 3] -> [1 2] reward = -1 termiated = False
action = 3 state = [1 2] -> [0 2] reward = -1 termiated = False
action = 0 state = [0 2] -> [0 3] reward = -1 termiated = False
action = 1 state = [0 3] -> [0 2] reward = -1 termiated = False
action = 2 state = [0 2] -> [1 2] reward = -1 termiated = False
action = 2 state = [1 2] -> [2 2] reward = -1 termiated = False
action = 3 state = [2 2] -> [1 2] reward = -1 termiated = False
action = 0 state = [1 2] -> [1 3] reward = -1 termiated = False
action = 3 state = [1 3] -> [0 3] reward = -1 termiated = False
action = 2 state = [0 3] -> [1 3] reward = -1 termiated = False
action = 3 state = [1 3] -> [0 3] reward = -1 termiated = False
action = 2 state = [0 3] -> [1 3] reward = -1 termiated = False
action = 3 state = [1 3] -> [0 3] reward = -1 termiated = False
action = 0 state = [0 3] -> [0 4] reward = -10 termiated = True
---에피소드40종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 3 state = [1 0] -> [0 0] reward = -1 termiated = False
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 1 state = [0 2] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 1 state = [0 2] -> [0 1] reward = -1 termiated = False
action = 3 state = [0 1] -> [-1 1] reward = -10 termiated = True
---에피소드41종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드42종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드43종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 0 state = [1 0] -> [1 1] reward = -1 termiated = False
action = 3 state = [1 1] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 2 state = [0 1] -> [1 1] reward = -1 termiated = False
action = 0 state = [1 1] -> [1 2] reward = -1 termiated = False
action = 2 state = [1 2] -> [2 2] reward = -1 termiated = False
action = 2 state = [2 2] -> [3 2] reward = -1 termiated = False
action = 2 state = [3 2] -> [4 2] reward = -10 termiated = True
---에피소드44종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 3 state = [0 2] -> [-1 2] reward = -10 termiated = True
---에피소드45종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 3 state = [0 1] -> [-1 1] reward = -10 termiated = True
---에피소드46종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드47종료---
action = 2 state = [0 0] -> [1 0] reward = -1 termiated = False
action = 1 state = [1 0] -> [ 1 -1] reward = -10 termiated = True
---에피소드48종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드49종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드50종료---
action = 3 state = [0 0] -> [-1 0] reward = -10 termiated = True
---에피소드51종료---
action = 1 state = [0 0] -> [ 0 -1] reward = -10 termiated = True
---에피소드52종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 3 state = [0 2] -> [-1 2] reward = -10 termiated = True
---에피소드53종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 3 state = [0 2] -> [-1 2] reward = -10 termiated = True
---에피소드54종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 0 state = [0 1] -> [0 2] reward = -1 termiated = False
action = 3 state = [0 2] -> [-1 2] reward = -10 termiated = True
---에피소드55종료---
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 1 state = [0 1] -> [0 0] reward = -1 termiated = False
action = 0 state = [0 0] -> [0 1] reward = -1 termiated = False
action = 2 state = [0 1] -> [1 1] reward = -1 termiated = False
action = 0 state = [1 1] -> [1 2] reward = -1 termiated = False
action = 1 state = [1 2] -> [1 1] reward = -1 termiated = False
action = 2 state = [1 1] -> [2 1] reward = -1 termiated = False
action = 2 state = [2 1] -> [3 1] reward = -1 termiated = False
action = 0 state = [3 1] -> [3 2] reward = -1 termiated = False
action = 0 state = [3 2] -> [3 3] reward = 100 termiated = True
---에피소드56종료---
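After the loop, `scores` holds one total reward per finished episode: mostly negative values from stepping off the grid, and one large positive value for the final successful episode. A quick way to look at the random agent's (lack of) progress, added here only as an optional sketch:

plt.plot(scores, 'o-')
plt.xlabel('episode')
plt.ylabel('total reward per episode')
plt.show()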
paths = [np.array([0,0])] + list(player.next_states)[-10:]   # start state + the 10 states visited in the final (successful) episode
show(paths)