13wk-2: (Reinforcement Learning) – Designing and Solving a Bandit Environment, 4x4 Grid World: Game Description, Environment Implementation, (Random) Agent Implementation

Author

최규빈

Published

June 2, 2025

1. Lecture Video

2. Imports

import gymnasium as gym
#---#
import numpy as np
import collections
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython

3. Designing and Solving a Bandit Environment

A. A rough, concept-only walkthrough

action_space = [0,1] 
actions_deque = collections.deque(maxlen=500)
rewards_deque = collections.deque(maxlen=500)
#---#
for _ in range(10):
    action = np.random.choice(action_space)
    if action == 1:
        reward = 10 
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
actions_deque
deque([0, 1, 0, 1, 1, 1, 0, 1, 0, 1], maxlen=500)
rewards_deque
deque([1, 10, 1, 10, 10, 10, 1, 10, 1, 10], maxlen=500)
actions_numpy = np.array(actions_deque)
rewards_numpy = np.array(rewards_deque)
q0 = rewards_numpy[actions_numpy == 0].mean()
q1 = rewards_numpy[actions_numpy == 1].mean()
q_table = np.array([q0,q1])
q_table
array([ 1., 10.])
action = q_table.argmax()
for _ in range(5):
    action = q_table.argmax()
    if action == 1:
        reward = 10 
    else:
        reward = 1
    actions_deque.append(action)
    rewards_deque.append(reward)
    actions_numpy = np.array(actions_deque)
    rewards_numpy = np.array(rewards_deque)    
    q0 = rewards_numpy[actions_numpy == 0].mean()
    q1 = rewards_numpy[actions_numpy == 1].mean()
    q_table = np.array([q0,q1])
actions_numpy
array([0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])
rewards_numpy
array([ 1, 10,  1, 10, 10, 10,  1, 10,  1, 10, 10, 10, 10, 10, 10])
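
(Aside) The loop above re-computes each action's mean reward from the full history at every step. The same estimates can be maintained incrementally with the standard running-mean update \(Q \leftarrow Q + (r - Q)/n\), which needs no stored history. A minimal sketch of that idea; the names q_values and counts are illustrative only and are not used elsewhere in this notebook.

import numpy as np

action_space = [0, 1]
q_values = np.zeros(2)   # running mean reward per action
counts = np.zeros(2)     # number of times each action has been tried

for _ in range(15):
    action = np.random.choice(action_space)
    reward = 10 if action == 1 else 1
    counts[action] += 1
    q_values[action] += (reward - q_values[action]) / counts[action]

q_values  # equals array([ 1., 10.]) once both actions have been tried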

B. Implementation using classes

class Bandit:
    def __init__(self):
        self.reward = None 
    def step(self,action):
        if action == 0:
            self.reward = 1
        else: 
            self.reward = 10 
        return self.reward 
class Agent:
    def __init__(self):
        pass 
    def act(self):
        # if the number of experiences is less than 20 --> random action
        # otherwise --> action = q_table.argmax()
        pass 
    def save_experience(self):
        # store the data
        pass 
    def learn(self):
        # update the q_table
        pass

class Agent:
    def __init__(self):
        self.action = None 
        self.reward = None 
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = [0,1] 
        self.q_table = None 
        self.n_experience = 0
    def act(self):
        if self.n_experience < 20:
            self.action = np.random.choice(self.action_space)
        else: 
            self.action = self.q_table.argmax()
        print(f"버튼{self.action}누름!")
    def save_experience(self):
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience = self.n_experience + 1
    def learn(self):
        if self.n_experience < 20:
            pass
        else:
            # update the q_table
            actions = np.array(self.actions)
            rewards = np.array(self.rewards)
            q0 = rewards[actions == 0].mean()  # mean reward obtained when taking action 0
            q1 = rewards[actions == 1].mean()  # mean reward obtained when taking action 1
            self.q_table = np.array([q0,q1])
env = Bandit()
player = Agent()
for _ in range(100):
    # step1: agent action 
    player.act()
    # step2: action --> state, reward
    player.reward = env.step(player.action)
    # step3: agent stores the data and learns
    player.save_experience() # store the data
    player.learn() # learn from the stored data
    #--- decide whether to end the training loop ---#
    if player.n_experience < 20:
        pass 
    else: 
        if np.array(player.rewards)[-20:].mean() > 9.5:
            print("---게임클리어---")
            break
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 0!
Pressed button 0!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
Pressed button 1!
---Game cleared---
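
The agent above explores purely at random for its first 20 experiences and is purely greedy afterwards. A common refinement is \(\varepsilon\)-greedy selection, which keeps exploring with a small probability even after learning has started. A minimal sketch of such a rule, assuming the same q_table / action_space conventions as the Agent class above; eps_greedy is an illustrative helper, not part of the class.

import numpy as np

def eps_greedy(q_table, action_space, eps=0.1):
    # explore with probability eps (or whenever no q_table has been learned yet),
    # otherwise exploit the current estimates
    if (q_table is None) or (np.random.rand() < eps):
        return np.random.choice(action_space)
    return int(q_table.argmax())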

4. Preliminaries: gym.spaces

ref: https://gymnasium.farama.org/

- Example 1

action_space = gym.spaces.Discrete(4) 
action_space 
Discrete(4)
[action_space.sample() for _ in range(5)]
[0, 1, 3, 2, 3]
0 in action_space
True
4 in action_space
False
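
Two small extras that are easy to miss: a Discrete space exposes the number of actions as .n, and .seed() makes .sample() reproducible. A quick check (the sampled values themselves depend on the chosen seed):

action_space.n          # 4 -- number of possible actions
action_space.seed(42)   # fix the RNG used by .sample()
[action_space.sample() for _ in range(3)]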

- Example 2

state_space = gym.spaces.MultiDiscrete([4,4])
state_space
MultiDiscrete([4 4])
[state_space.sample() for _ in range(5)]
[array([1, 3]), array([2, 0]), array([1, 2]), array([0, 2]), array([2, 0])]
np.array([0,1]) in state_space
True
np.array([3,3]) in state_space
True
np.array([3,4]) in state_space
False
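
The `in` test used above is shorthand for the space's contains() method, and nvec stores the size of each coordinate. For example:

state_space.nvec                         # array([4, 4])
state_space.contains(np.array([2, 2]))   # True  -- same check as `in`
state_space.contains(np.array([4, 0]))   # False -- first coordinate out of range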

5. 4x4 Grid World: Game Description

A. Game description

- Problem description: a game in which an agent that moves up, down, left, and right on a 4x4 grid world must reach the goal cell.

- Key variables used in GridWorld

  1. State: each grid cell is one state, and the agent occupies exactly one of these states at any time.
  2. Action: to move from the current state to the next, the agent takes one of four actions: up, down, left, or right.
  3. Reward: the reward obtained when the agent takes a particular action in the current state (the reward rule is sketched right after this list).
  4. Terminated: a flag indicating that the current episode has ended.
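
The reward scheme that the environment in section 6 implements can be summarized as a small pure function: reaching the goal cell (3, 3) gives +100 and ends the episode, stepping off the 4x4 grid gives -10 and ends the episode, and every other move costs -1. A sketch of that rule; reward_of is an illustrative name only, and the actual logic lives inside GridWorld.step below.

def reward_of(next_state):
    # next_state = (row, col) after the move
    row, col = next_state
    if (row, col) == (3, 3):
        return 100, True    # goal reached: big reward, episode ends
    if 0 <= row <= 3 and 0 <= col <= 3:
        return -1, False    # ordinary move: small step penalty
    return -10, True        # fell off the grid: penalty, episode ends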

B. Visualization

def show(states):
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr',alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)  
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    state_space = gym.spaces.MultiDiscrete([4,4])
    def update(t):
        if states[t] in state_space:
            s1,s2 = states[t]
            states[t] = [s2,s1]  # reorder (row, col) -> (x, y) for the scatter offsets
            sc.set_offsets(states[t])
        else:
            # off-grid state: clamp it by half a cell so the marker is drawn just past the boundary
            s1,s2 = states[t]
            s1 = s1 + 0.5 if s1 < 0 else (s1 - 0.5 if s1 > 3 else s1)
            s2 = s2 + 0.5 if s2 < 0 else (s2 - 0.5 if s2 > 3 else s2)
            states[t] = [s2,s1]
            sc.set_offsets(states[t])
    ani = FuncAnimation(fig,update,frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))
show([[0,0],[1,0],[2,0],[3,0],[4,0]]) # how to use show
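
If the JavaScript animation is too heavy, the same information can also be drawn as a static figure. A minimal sketch using the imports from section 2; plot_path is a hypothetical helper and is not used elsewhere in this notebook.

def plot_path(states):
    # draw the visited cells as a line on the same 4x4 grid used by show()
    fig, ax = plt.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr', alpha=0.0)
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    rows = [s[0] for s in states]
    cols = [s[1] for s in states]
    ax.plot(cols, rows, 'o-', color='red')   # x = column, y = row, same convention as show()
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')

plot_path([[0,0],[1,0],[2,0],[3,0]])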

6. 4x4 Grid World Environment Implementation

class GridWorld:
    def __init__(self):
        self.a2d = {
            0: np.array([0,1]),   # → move right (col + 1)
            1: np.array([0,-1]),  # ← move left  (col - 1)
            2: np.array([1,0]),   # ↓ move down  (row + 1)
            3: np.array([-1,0])   # ↑ move up    (row - 1)
        }
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
    def step(self,action):
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            self.reward = 100 
            self.terminated = True
        elif self.state in self.state_space:
            self.reward = -1 
            self.terminated = False
        else:
            self.reward = -10
            self.terminated = True
        print(
            f"action = {action}\t"
            f"state = {self.state - self.a2d[action]} -> {self.state}\t"
            f"reward = {self.reward}\t"
            f"termiated = {self.terminated}"
        )            
        return self.state, self.reward, self.terminated
    def reset(self):
        self.state = np.array([0,0])
        self.terminated = False
        return self.state
env = GridWorld()
action_space = gym.spaces.Discrete(4)
for _ in range(50):
    action = action_space.sample()
    env.step(action)
    if env.terminated:
        env.reset()
        break
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
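
Before wiring an agent to this environment it is worth sanity-checking the reward structure with a fixed action sequence: three moves right followed by three moves down should walk from [0,0] to [3,3]. A quick check; the sequence follows directly from the a2d table above.

env = GridWorld()
env.reset()
for action in [0, 0, 0, 2, 2, 2]:   # → → → ↓ ↓ ↓
    state, reward, terminated = env.step(action)
# the final step should report reward = 100 and terminated = True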

7. Implementing the “Agent \(\Leftrightarrow\) Environment” Interaction

- Functionality we want to implement

  • .act(): decide an action –> here, just a random action
  • .save_experience(): store the data –> this is what we focus on for now
  • .learn(): learn from the data –> skipped for now
class RandomAgent:
    def __init__(self):
        self.state = None 
        self.action = None 
        self.reward = None 
        self.next_state = None
        self.terminated = None
        #---#
        self.states = collections.deque(maxlen=500)
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.next_states = collections.deque(maxlen=500)
        self.terminations = collections.deque(maxlen=500)
        #---#
        self.action_space = gym.spaces.Discrete(4)
        self.n_experience = 0
    def act(self):
        self.action = self.action_space.sample()
    def save_experience(self):
        self.states.append(self.state)
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.next_states.append(self.next_state)
        self.terminations.append(self.terminated)
        self.n_experience = self.n_experience + 1
    def learn(self):
        pass 
player = RandomAgent()
env = GridWorld()
for t in range(50):
    # step1 -- the agent takes an action
    player.act()
    # step2 -- the environment looks at the action and returns next_state, reward, terminated
    player.next_state, player.reward, player.terminated = env.step(player.action)
    # step3 -- the agent saves & learns
    player.save_experience()
    player.learn()
    # step4 -- next iteration 
    player.state = player.next_state
    if env.terminated:
        player.state = env.reset()
        break
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
scores = [] 
score = 0 
for e in range(1,100):
    #--- episode start ---#
    while True:
        # step1 -- the agent takes an action
        player.act()
        # step2 -- the environment looks at the action and returns next_state, reward, terminated
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3 -- the agent saves & learns
        player.save_experience()
        player.learn()
        # step4 -- next iteration 
        if env.terminated:
            score = score + player.reward
            scores.append(score)
            score = 0 
            player.state = env.reset() 
            print(f"---에피소드{e}종료---")
            break
        else: 
            score = score + player.reward
            player.state = player.next_state
    #--- episode end ---#
    if scores[-1] > 0:
        break
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    termiated = True
---에피소드1종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드2종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드3종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드4종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드5종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 3  state = [0 1] -> [-1  1]    reward = -10    termiated = True
---에피소드6종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드7종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    termiated = True
---에피소드8종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 termiated = False
action = 3  state = [1 1] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 3  state = [0 2] -> [-1  2]    reward = -10    termiated = True
---에피소드9종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    termiated = True
---에피소드10종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드11종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드12종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드13종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드14종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 2  state = [1 0] -> [2 0]  reward = -1 termiated = False
action = 0  state = [2 0] -> [2 1]  reward = -1 termiated = False
action = 2  state = [2 1] -> [3 1]  reward = -1 termiated = False
action = 3  state = [3 1] -> [2 1]  reward = -1 termiated = False
action = 3  state = [2 1] -> [1 1]  reward = -1 termiated = False
action = 3  state = [1 1] -> [0 1]  reward = -1 termiated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 termiated = False
action = 1  state = [1 1] -> [1 0]  reward = -1 termiated = False
action = 2  state = [1 0] -> [2 0]  reward = -1 termiated = False
action = 0  state = [2 0] -> [2 1]  reward = -1 termiated = False
action = 2  state = [2 1] -> [3 1]  reward = -1 termiated = False
action = 0  state = [3 1] -> [3 2]  reward = -1 termiated = False
action = 3  state = [3 2] -> [2 2]  reward = -1 termiated = False
action = 1  state = [2 2] -> [2 1]  reward = -1 termiated = False
action = 2  state = [2 1] -> [3 1]  reward = -1 termiated = False
action = 2  state = [3 1] -> [4 1]  reward = -10    termiated = True
---에피소드15종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 2  state = [1 0] -> [2 0]  reward = -1 termiated = False
action = 3  state = [2 0] -> [1 0]  reward = -1 termiated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    termiated = True
---에피소드16종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드17종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드18종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    termiated = True
---에피소드19종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드20종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드21종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 termiated = False
action = 2  state = [1 1] -> [2 1]  reward = -1 termiated = False
action = 2  state = [2 1] -> [3 1]  reward = -1 termiated = False
action = 3  state = [3 1] -> [2 1]  reward = -1 termiated = False
action = 0  state = [2 1] -> [2 2]  reward = -1 termiated = False
action = 0  state = [2 2] -> [2 3]  reward = -1 termiated = False
action = 1  state = [2 3] -> [2 2]  reward = -1 termiated = False
action = 3  state = [2 2] -> [1 2]  reward = -1 termiated = False
action = 3  state = [1 2] -> [0 2]  reward = -1 termiated = False
action = 0  state = [0 2] -> [0 3]  reward = -1 termiated = False
action = 2  state = [0 3] -> [1 3]  reward = -1 termiated = False
action = 0  state = [1 3] -> [1 4]  reward = -10    termiated = True
---에피소드22종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 3  state = [0 1] -> [-1  1]    reward = -10    termiated = True
---에피소드23종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드24종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드25종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 0  state = [1 0] -> [1 1]  reward = -1 termiated = False
action = 0  state = [1 1] -> [1 2]  reward = -1 termiated = False
action = 0  state = [1 2] -> [1 3]  reward = -1 termiated = False
action = 0  state = [1 3] -> [1 4]  reward = -10    termiated = True
---에피소드26종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 3  state = [0 1] -> [-1  1]    reward = -10    termiated = True
---에피소드27종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드28종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드29종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드30종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드31종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드32종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드33종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드34종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드35종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 2  state = [0 2] -> [1 2]  reward = -1 termiated = False
action = 3  state = [1 2] -> [0 2]  reward = -1 termiated = False
action = 0  state = [0 2] -> [0 3]  reward = -1 termiated = False
action = 2  state = [0 3] -> [1 3]  reward = -1 termiated = False
action = 1  state = [1 3] -> [1 2]  reward = -1 termiated = False
action = 1  state = [1 2] -> [1 1]  reward = -1 termiated = False
action = 3  state = [1 1] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드36종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 0  state = [0 2] -> [0 3]  reward = -1 termiated = False
action = 3  state = [0 3] -> [-1  3]    reward = -10    termiated = True
---에피소드37종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 2  state = [1 0] -> [2 0]  reward = -1 termiated = False
action = 3  state = [2 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드38종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드39종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 0  state = [1 0] -> [1 1]  reward = -1 termiated = False
action = 0  state = [1 1] -> [1 2]  reward = -1 termiated = False
action = 0  state = [1 2] -> [1 3]  reward = -1 termiated = False
action = 1  state = [1 3] -> [1 2]  reward = -1 termiated = False
action = 3  state = [1 2] -> [0 2]  reward = -1 termiated = False
action = 0  state = [0 2] -> [0 3]  reward = -1 termiated = False
action = 1  state = [0 3] -> [0 2]  reward = -1 termiated = False
action = 2  state = [0 2] -> [1 2]  reward = -1 termiated = False
action = 2  state = [1 2] -> [2 2]  reward = -1 termiated = False
action = 3  state = [2 2] -> [1 2]  reward = -1 termiated = False
action = 0  state = [1 2] -> [1 3]  reward = -1 termiated = False
action = 3  state = [1 3] -> [0 3]  reward = -1 termiated = False
action = 2  state = [0 3] -> [1 3]  reward = -1 termiated = False
action = 3  state = [1 3] -> [0 3]  reward = -1 termiated = False
action = 2  state = [0 3] -> [1 3]  reward = -1 termiated = False
action = 3  state = [1 3] -> [0 3]  reward = -1 termiated = False
action = 0  state = [0 3] -> [0 4]  reward = -10    termiated = True
---에피소드40종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 3  state = [1 0] -> [0 0]  reward = -1 termiated = False
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 1  state = [0 2] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 1  state = [0 2] -> [0 1]  reward = -1 termiated = False
action = 3  state = [0 1] -> [-1  1]    reward = -10    termiated = True
---에피소드41종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드42종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드43종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 0  state = [1 0] -> [1 1]  reward = -1 termiated = False
action = 3  state = [1 1] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 termiated = False
action = 0  state = [1 1] -> [1 2]  reward = -1 termiated = False
action = 2  state = [1 2] -> [2 2]  reward = -1 termiated = False
action = 2  state = [2 2] -> [3 2]  reward = -1 termiated = False
action = 2  state = [3 2] -> [4 2]  reward = -10    termiated = True
---에피소드44종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 3  state = [0 2] -> [-1  2]    reward = -10    termiated = True
---에피소드45종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 3  state = [0 1] -> [-1  1]    reward = -10    termiated = True
---에피소드46종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드47종료---
action = 2  state = [0 0] -> [1 0]  reward = -1 termiated = False
action = 1  state = [1 0] -> [ 1 -1]    reward = -10    termiated = True
---에피소드48종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드49종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드50종료---
action = 3  state = [0 0] -> [-1  0]    reward = -10    termiated = True
---에피소드51종료---
action = 1  state = [0 0] -> [ 0 -1]    reward = -10    termiated = True
---에피소드52종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 3  state = [0 2] -> [-1  2]    reward = -10    termiated = True
---에피소드53종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 3  state = [0 2] -> [-1  2]    reward = -10    termiated = True
---에피소드54종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 0  state = [0 1] -> [0 2]  reward = -1 termiated = False
action = 3  state = [0 2] -> [-1  2]    reward = -10    termiated = True
---에피소드55종료---
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 1  state = [0 1] -> [0 0]  reward = -1 termiated = False
action = 0  state = [0 0] -> [0 1]  reward = -1 termiated = False
action = 2  state = [0 1] -> [1 1]  reward = -1 termiated = False
action = 0  state = [1 1] -> [1 2]  reward = -1 termiated = False
action = 1  state = [1 2] -> [1 1]  reward = -1 termiated = False
action = 2  state = [1 1] -> [2 1]  reward = -1 termiated = False
action = 2  state = [2 1] -> [3 1]  reward = -1 termiated = False
action = 0  state = [3 1] -> [3 2]  reward = -1 termiated = False
action = 0  state = [3 2] -> [3 3]  reward = 100    termiated = True
---에피소드56종료---
paths = [np.array([0,0])] + list(player.next_states)[-10:]  # start cell + the last 10 visited states
show(paths)
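
The experience the random agent collected is just five parallel deques, one (state, action, reward, next_state, terminated) tuple per time step. A quick way to peek at the last few transitions it stored:

for s, a, r, s_next, done in list(zip(player.states, player.actions, player.rewards,
                                      player.next_states, player.terminations))[-3:]:
    print(f"state={s}  action={a}  reward={r}  next_state={s_next}  terminated={done}")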