1. Set Up a Mario Environment

!pip install gym_super_mario_bros --quiet
!pip install nes_py --quiet
import gym_super_mario_bros

# Import the Joypad wrapper
from nes_py.wrappers import JoypadSpace

# Import the Simplified Controls
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
SIMPLE_MOVEMENT
[['NOOP'],
 ['right'],
 ['right', 'A'],
 ['right', 'B'],
 ['right', 'A', 'B'],
 ['A'],
 ['left']]
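The JoypadSpace wrapper maps each of these seven button combinations to a single discrete action index, so the agent only ever picks a number from 0 to 6. A quick sanity check of that mapping, using a throwaway environment just for illustration:

# Wrapping with JoypadSpace shrinks the action space to Discrete(7)
check_env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), SIMPLE_MOVEMENT)
print(check_env.action_space)   # Discrete(7)
print(SIMPLE_MOVEMENT[1])       # action index 1 presses 'right'
check_env.close()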
!pip install pyvirtualdisplay --quiet
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

display = Display(visible=0, size=(1400, 900))
display.start()
<pyvirtualdisplay.display.Display at 0x7f0950cbfe50>
def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env

# Play the recorded video
def show_video():
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'rb').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")
env = wrap_env(gym_super_mario_bros.make('SuperMarioBros-v0'))

# Reset the Mario Environment
observation = env.reset()

# Take a random action 1000 times
for i in range(1000):

    # Rendering
    env.render()

    # Sample a random action
    action = env.action_space.sample()

    # Perform the action
    observation, reward, done, info = env.step(action)

    # Start a new episode once the current one ends
    if done:
        observation = env.reset()

# Close the Mario Environment
env.close()
show_video()
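Besides the frame, env.step also returns a reward and an info dict exposing game state such as Mario's x-position, remaining time, and whether the flag was reached; the key names below (e.g. 'x_pos', 'time', 'flag_get') follow the gym_super_mario_bros README and may vary by version. A small peek with a fresh, un-wrapped environment:

# Inspect the reward and game-state info returned by a single step
info_env = gym_super_mario_bros.make('SuperMarioBros-v0')
info_env.reset()
_, reward, done, info = info_env.step(info_env.action_space.sample())
print(reward)   # reward is shaped from horizontal progress, time, and death penalties
print(info)     # expected keys include 'x_pos', 'time', 'flag_get'
info_env.close()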

2. Preprocess the Mario Environment

!pip install stable-baselines3[extra] --quiet
# Import Grayscale Observation Wrapper
from gym.wrappers import GrayScaleObservation

# Import Vectorization Wrapper
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# Import Matplotlib
from matplotlib import pyplot as plt
env = gym_super_mario_bros.make('SuperMarioBros-v0')

# Simplify the controls
env = JoypadSpace(env, SIMPLE_MOVEMENT)

# Make it as grayscale
env = GrayScaleObservation(env, keep_dim=True)

# Wrap inside the Dummy Environment
env = DummyVecEnv([lambda: env])

# Stack the Frames
env = VecFrameStack(env, 4, channels_order='last')
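After grayscaling, vectorizing, and frame stacking, each observation should be a 240x256 grayscale image with 4 frames stacked on the last axis (plus a leading batch dimension of 1 when returned by the vectorized environment). A quick check:

# Sanity check the preprocessed observation space; expected to be (240, 256, 4) per environment
print(env.observation_space.shape)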
observation = env.reset()

# Show the most recent frame in the stack (the other three slots are still zeros right after reset)
plt.imshow(observation[0][:,:,3])
<matplotlib.image.AxesImage at 0x7f0619a8e0d0>
# Take one random action, then repeat the same action a few times to fill the frame stack
observation, _, _, _ = env.step([env.action_space.sample()])
for i in range(4):
  observation, _, _, _ = env.step([5])

# Plot the four stacked frames side by side
plt.figure(figsize=(20,16))
for i in range(observation.shape[3]):
  plt.subplot(1, 4, i+1)
  plt.imshow(observation[0][:,:,i])
plt.show()

3. Train an RL Model

from stable_baselines3 import PPO

# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback

# Import os for file path
import os
class TrainAndLoggingCallback(BaseCallback):
    
    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
    
    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)
    
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
        
        return True
CHECKPOINT_DIR = './train/'
LOG_DIR = './logs/'
callback = TrainAndLoggingCallback(check_freq=1000, save_path=CHECKPOINT_DIR)
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=0.0001, n_steps=512)
Using cpu device
Wrapping the env in a VecTransposeImage.
model.learn(total_timesteps=2000, callback=callback)
Logging to ./logs/PPO_2
----------------------------
| time/              |     |
|    fps             | 48  |
|    iterations      | 1   |
|    time_elapsed    | 10  |
|    total_timesteps | 512 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 8           |
|    iterations           | 2           |
|    time_elapsed         | 127         |
|    total_timesteps      | 1024        |
| train/                  |             |
|    approx_kl            | 0.009011421 |
|    clip_fraction        | 0.0859      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.94       |
|    explained_variance   | -0.00126    |
|    learning_rate        | 0.0001      |
|    loss                 | 3.32        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00303    |
|    value_loss           | 65.5        |
-----------------------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 6         |
|    iterations           | 3         |
|    time_elapsed         | 240       |
|    total_timesteps      | 1536      |
| train/                  |           |
|    approx_kl            | 0.010482  |
|    clip_fraction        | 0.202     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.93     |
|    explained_variance   | 0.271     |
|    learning_rate        | 0.0001    |
|    loss                 | 1.97      |
|    n_updates            | 20        |
|    policy_gradient_loss | -0.000691 |
|    value_loss           | 22        |
---------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 5           |
|    iterations           | 4           |
|    time_elapsed         | 354         |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.043274574 |
|    clip_fraction        | 0.25        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.91       |
|    explained_variance   | 0.575       |
|    learning_rate        | 0.0001      |
|    loss                 | 0.794       |
|    n_updates            | 30          |
|    policy_gradient_loss | 0.00524     |
|    value_loss           | 4.34        |
-----------------------------------------
<stable_baselines3.ppo.ppo.PPO at 0x7f05ae857650>
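The same metrics printed above are also written to LOG_DIR as TensorBoard event files, so the training curves can be inspected interactively inside the notebook (assuming the TensorBoard notebook extension is available, as it is on Colab):

# View the PPO training curves logged to ./logs/
%load_ext tensorboard
%tensorboard --logdir ./logs/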

4. Test the Trained Model

model = PPO.load('./train/best_model_2000')
observation = env.reset()

# Take the model's predicted action 2000 times
for i in range(2000):

  # Predict the best action
  action, _ = model.predict(observation)

  # Perform the action
  observation, reward, done, info = env.step(action)

  # Rendering
  env.render()

# Close the Mario Environment
env.close()
show_video()
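Two thousand timesteps is nowhere near enough for Mario to make real progress. To keep training from the saved checkpoint rather than starting over, rebuild the preprocessed environment from Section 2 (the one above was closed) and reload the model with it attached; a sketch:

# Sketch: resume training from the last checkpoint (assumes `env` has been
# rebuilt with the Section 2 wrappers, since the previous one was closed)
model = PPO.load('./train/best_model_2000', env=env)
model.learn(total_timesteps=100000, callback=callback, reset_num_timesteps=False)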