PPO算法的基本结构

PPO算法有两种主要形式：PPO-Penalty和PPO-Clip（PPO2）。在这里，我们讨论PPO-Clip（OpenAI使用的主要形式）。PPO的主要特点如下：

PPO属于on-policy算法

PPO同时适用于离散和连续的动作空间

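损失函数：PPO-Clip算法最精华的地方就是加入了一项比例用以描述新旧策略的差异，并通过超参数ϵ限制策略的更新步长：

$$L(s,a,\theta_k,\theta)=\min\left(\frac{\pi_\theta(a|s)}{\pi_{\theta_k}(a|s)}A^{\pi_{\theta_k}}(s,a),\ \operatorname{clip}\left(\frac{\pi_\theta(a|s)}{\pi_{\theta_k}(a|s)},\,1-\epsilon,\,1+\epsilon\right)A^{\pi_{\theta_k}}(s,a)\right)$$

更新策略：

$$\theta_{k+1}=\arg\max_{\theta}\ \mathbb{E}_{s,a\sim\pi_{\theta_k}}\left[L(s,a,\theta_k,\theta)\right]$$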

探索策略：PPO采用随机探索策略。

优势函数表示在状态s下采取动作a，相较于其他动作有多少优势：如果大于0，则当前动作比平均动作好；反之，则比平均动作差。

PPO论文

1. 程序初始化

第1步：安装基础依赖

!pip install -U pip
!pip install gym==0.19.0
!pip install tqdm==4.48.0
!pip install nes-py==8.1.0
!pip install gym-super-mario-bros==7.3.2

import os
import shutil
import subprocess as sp
from collections import deque
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as _mp
from torch.distributions import Categorical
import torch.multiprocessing as mp
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym.spaces import Box
from gym import Wrapper
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import cv2
import matplotlib.pyplot as plt
from IPython import display
import moxing as mox

2. 训练参数初始化

该部分参数可以自行调整，以训练出更好的效果。

# Training and inference settings. One key per line so that the inline
# comments cannot swallow the entries that follow them.
opt = {
    "world": 1,  # world to play: 1, 2, 3, 4, 5, 6, 7, 8
    "stage": 1,  # stage within the world: 1, 2, 3, 4
    "action_type": "simple",  # action set: "simple", "right_only", "complex"
    'lr': 1e-4,  # suggested learning rates: 1e-3, 1e-4, 1e-5, 7e-5
    'gamma': 0.9,  # reward discount factor
    'tau': 1.0,  # GAE parameter
    'beta': 0.01,  # entropy coefficient
    'epsilon': 0.2,  # PPO clip coefficient
    'batch_size': 16,  # batch size used when replaying experience
    'max_episode': 10,  # maximum number of training episodes
    'num_epochs': 10,  # number of times each batch of experience is replayed
    "num_local_steps": 512,  # maximum number of steps per episode
    "num_processes": 8,  # number of training processes, usually the machine's core count
    "save_interval": 5,  # save the model every N episodes
    "log_path": "./log",  # directory for logs
    "saved_path": "./model",  # directory for saved models
    # Whether to load the pretrained model. So far only level 1-1 has a
    # pretrained model; other levels must be trained from scratch.
    "pretrain_model": True,
    "episode": 5,  # episode number in the checkpoint file name loaded by infer()
}

3. 创建环境

结束标志：

成功：mario到达本关终点
失败：mario遭到敌人的伤害、坠入悬崖或时间用完

奖励函数：

得分：搜集金币、踩扁敌人、结束时夺旗
扣分：遭到敌人伤害、掉落悬崖、结束时未夺旗

def create_train_env(world, stage, actions, output_path=None):
    """Create a wrapped Super Mario Bros environment for one level.

    Args:
        world: World number, formatted into the environment id.
        stage: Stage number, formatted into the environment id.
        actions: Action list passed to ``JoypadSpace``.
        output_path: Accepted for interface compatibility; this function
            does not use it.

    Returns:
        The environment wrapped, in order, by ``JoypadSpace``,
        ``CustomReward`` and ``CustomSkipFrame``.
    """
    # Create the base environment.
    env = gym_super_mario_bros.make("SuperMarioBros-{}-{}-v0".format(world, stage))
    env = JoypadSpace(env, actions)
    # Customize the environment: reward shaping, then frame skipping/stacking.
    env = CustomReward(env, world, stage, monitor=None)
    env = CustomSkipFrame(env)
    return env
# Modify the raw environment to get better training behaviour.
class CustomReward(Wrapper):
    """Environment wrapper that preprocesses frames and reshapes the reward.

    Each step adds the score gained since the previous step (divided by 40)
    to the reward, adds +50 / -50 at episode end depending on
    ``info["flag_get"]``, applies position-based early termination for
    levels 7-4 and 4-4, and finally returns the reward divided by 10.
    """

    def __init__(self, env=None, world=None, stage=None, monitor=None):
        """Wrap ``env`` and remember which level is being played.

        Args:
            env: Environment to wrap.
            world: World number; enables the 7-4 / 4-4 special cases.
            stage: Stage number; enables the 7-4 / 4-4 special cases.
            monitor: Optional object with a ``record(state)`` method that is
                called with every raw frame. Falsy values disable recording.
        """
        super(CustomReward, self).__init__(env)
        self.observation_space = Box(low=0, high=255, shape=(1, 84, 84))
        self.curr_score = 0
        self.current_x = 40
        self.world = world
        self.stage = stage
        self.monitor = monitor if monitor else None

    def step(self, action):
        """Step the wrapped environment and return the shaped transition.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the output of
            ``process_frame`` and ``reward`` is the shaped reward divided by 10.
        """
        state, reward, done, info = self.env.step(action)
        if self.monitor:
            self.monitor.record(state)
        state = process_frame(state)
        # Reward the score gained during this step.
        reward += (info["score"] - self.curr_score) / 40.
        self.curr_score = info["score"]
        if done:
            if info["flag_get"]:
                reward += 50
            else:
                reward -= 50
        if self.world == 7 and self.stage == 4:
            # End the episode with a penalty when Mario is inside one of the
            # listed (x, y) regions, or when x dropped by more than 500
            # compared with the previous step.
            if (506 <= info["x_pos"] <= 832 and info["y_pos"] > 127) or (
                    832 < info["x_pos"] <= 1064 and info["y_pos"] < 80) or (
                    1113 < info["x_pos"] <= 1464 and info["y_pos"] < 191) or (
                    1579 < info["x_pos"] <= 1943 and info["y_pos"] < 191) or (
                    1946 < info["x_pos"] <= 1964 and info["y_pos"] >= 191) or (
                    1984 < info["x_pos"] <= 2060 and (info["y_pos"] >= 191 or info["y_pos"] < 127)) or (
                    2114 < info["x_pos"] < 2440 and info["y_pos"] < 191) or info["x_pos"] < self.current_x - 500:
                reward -= 50
                done = True
        if self.world == 4 and self.stage == 4:
            if (info["x_pos"] <= 1500 and info["y_pos"] < 127) or (
                    1588 <= info["x_pos"] < 2380 and info["y_pos"] >= 127):
                # Here the reward is overwritten (not decremented) with -50.
                reward = -50
                done = True
        self.current_x = info["x_pos"]
        return state, reward / 10., done, info

    def reset(self):
        """Reset the score/position trackers and return the processed first frame."""
        self.curr_score = 0
        self.current_x = 40
        return process_frame(self.env.reset())
class MultipleEnvironments:
    """Run several training environments, each served by its own process.

    For every environment a pipe pair is created; the agent side is kept in
    ``agent_conns`` and the environment side in ``env_conns``. A worker
    process per environment executes ``run`` and answers "step" / "reset"
    requests received on its environment-side connection.
    """

    def __init__(self, world, stage, action_type, num_envs, output_path=None):
        """Create ``num_envs`` environments and start their worker processes.

        Args:
            world: World number passed to ``create_train_env``.
            stage: Stage number passed to ``create_train_env``.
            action_type: "right_only" or "simple"; any other value selects
                ``COMPLEX_MOVEMENT``.
            num_envs: Number of environments / worker processes.
            output_path: Forwarded to ``create_train_env``.
        """
        self.agent_conns, self.env_conns = zip(*[mp.Pipe() for _ in range(num_envs)])
        if action_type == "right_only":
            actions = RIGHT_ONLY
        elif action_type == "simple":
            actions = SIMPLE_MOVEMENT
        else:
            actions = COMPLEX_MOVEMENT
        self.envs = [create_train_env(world, stage, actions, output_path=output_path) for _ in range(num_envs)]
        self.num_states = self.envs[0].observation_space.shape[0]
        self.num_actions = len(actions)
        for index in range(num_envs):
            process = mp.Process(target=self.run, args=(index,))
            process.start()
            # The parent only talks through the agent side of the pipe.
            self.env_conns[index].close()

    def run(self, index):
        """Worker loop: serve requests for environment ``index`` forever.

        Each message is a ``(request, action)`` pair. "step" replies with the
        result of ``env.step(action.item())`` (so ``action`` must provide
        ``.item()``); "reset" replies with ``env.reset()``.

        Raises:
            NotImplementedError: If ``request`` is neither "step" nor "reset".
        """
        # The worker only talks through the environment side of the pipe.
        self.agent_conns[index].close()
        while True:
            request, action = self.env_conns[index].recv()
            if request == "step":
                self.env_conns[index].send(self.envs[index].step(action.item()))
            elif request == "reset":
                self.env_conns[index].send(self.envs[index].reset())
            else:
                raise NotImplementedError
def process_frame(frame):
    """Turn a raw frame into an 84x84 single-channel observation.

    Args:
        frame: Image accepted by ``cv2.cvtColor(..., cv2.COLOR_RGB2GRAY)``,
            or ``None``.

    Returns:
        Array of shape ``(1, 84, 84)``: the grayscale frame resized to 84x84
        and divided by 255, or all zeros when ``frame`` is ``None``.
    """
    if frame is None:
        return np.zeros((1, 84, 84))
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # [None, :, :] adds the leading channel axis.
    frame = cv2.resize(frame, (84, 84))[None, :, :] / 255.
    return frame
class CustomSkipFrame(Wrapper):
    """Repeat each action for ``skip`` frames and keep a stack of ``skip`` observations.

    ``self.states`` holds the stacked observations with shape
    ``(skip, 84, 84)``; ``step`` and ``reset`` return it with an extra
    leading axis, i.e. ``(1, skip, 84, 84)``, as ``float32``.
    """

    def __init__(self, env, skip=4):
        """Wrap ``env``; ``skip`` is both the repeat count and the stack depth."""
        super(CustomSkipFrame, self).__init__(env)
        self.observation_space = Box(low=0, high=255, shape=(skip, 84, 84))
        self.skip = skip
        self.states = np.zeros((skip, 84, 84), dtype=np.float32)

    def step(self, action):
        """Apply ``action`` for ``skip`` frames and return the stacked state.

        Rewards of the repeated frames are summed. If the episode ends
        during the repeat, the wrapper resets itself and returns the freshly
        reset stack together with the accumulated reward and ``done``.
        """
        total_reward = 0
        last_states = []
        for i in range(self.skip):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            # Only frames from the second half of the repeat window are kept.
            if i >= self.skip / 2:
                last_states.append(state)
            if done:
                self.reset()
                return self.states[None, :, :, :].astype(np.float32), total_reward, done, info
        # Element-wise max over the kept frames becomes the newest stack entry.
        max_state = np.max(np.concatenate(last_states, 0), 0)
        self.states[:-1] = self.states[1:]
        self.states[-1] = max_state
        return self.states[None, :, :, :].astype(np.float32), total_reward, done, info

    def reset(self):
        """Reset the wrapped env and fill the stack with copies of the first state."""
        state = self.env.reset()
        self.states = np.concatenate([state for _ in range(self.skip)], 0)
        return self.states[None, :, :, :].astype(np.float32)

4. 定义内涵神经网络

神经网络结构包括4层卷积网络和一层全连接网络，提取的特征输入critic层和actor层，分别输出value值和动作概率分布。

class Net(nn.Module):
    """Actor-critic network: four conv layers, one linear layer, two heads.

    The linear layer expects ``32 * 6 * 6`` features, which is what the four
    stride-2 convolutions produce for 84x84 inputs.
    """

    def __init__(self, num_inputs, num_actions):
        """Build the layers.

        Args:
            num_inputs: Number of input channels of the first convolution.
            num_actions: Output size of the actor head.
        """
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.linear = nn.Linear(32 * 6 * 6, 512)
        self.critic_linear = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, num_actions)
        self._initialize_weights()

    def _initialize_weights(self):
        """Orthogonally initialize conv/linear weights (ReLU gain) and zero the biases."""
        for module in self.modules():
            if isinstance(module, (nn.Conv2d, nn.Linear)):
                nn.init.orthogonal_(module.weight, nn.init.calculate_gain('relu'))
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return ``(actor_output, critic_output)`` for a batch ``x``.

        The actor output has ``num_actions`` columns (the caller applies
        softmax); the critic output has one column.
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        # Flatten everything except the batch axis before the linear layer.
        x = self.linear(x.view(x.size(0), -1))
        return self.actor_linear(x), self.critic_linear(x)

6. 训练模型

训练10 Episode，耗时约5分钟

# NOTE(review): train() is not defined in the visible part of this document
# (the section numbering jumps from 4 to 6) -- confirm it is defined before
# running this cell.
train(opt)

加载预训练模型
Episode: 1. Total loss: 1.1230244636535645
Episode: 2. Total loss: 2.553663730621338
Episode: 3. Total loss: 1.768389344215393
Episode: 4. Total loss: 1.6962862014770508
Episode: 5. Total loss: 1.0912611484527588
Episode: 6. Total loss: 1.6626232862472534
Episode: 7. Total loss: 1.9952025413513184
Episode: 8. Total loss: 1.2410558462142944
Episode: 9. Total loss: 1.3711413145065308

Episode: 10. Total loss: 1.2155205011367798

7. 使用模型推理游戏

定义推理函数

def infer(opt):
    """Play one episode with a saved model and render it inline.

    Loads the checkpoint
    ``{saved_path}/ppo_super_mario_bros_{world}_{stage}_{episode}`` and, at
    every step, takes the action with the highest policy probability. The
    loop stops when the flag is reached or the episode ends.

    Args:
        opt: Settings dict; reads ``action_type``, ``world``, ``stage``,
            ``saved_path`` and ``episode``.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    # "right_only" is the spelling used by MultipleEnvironments and by the
    # opt comments; the former "right" is still accepted so existing
    # settings keep working.
    if opt['action_type'] in ("right_only", "right"):
        actions = RIGHT_ONLY
    elif opt['action_type'] == "simple":
        actions = SIMPLE_MOVEMENT
    else:
        actions = COMPLEX_MOVEMENT
    env = create_train_env(opt['world'], opt['stage'], actions)
    model = Net(env.observation_space.shape[0], len(actions))
    model_path = "{}/ppo_super_mario_bros_{}_{}_{}".format(
        opt['saved_path'], opt['world'], opt['stage'], opt['episode'])
    if use_cuda:
        model.load_state_dict(torch.load(model_path))
        model.cuda()
    else:
        # Remap storages to the CPU when no GPU is available.
        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    model.eval()
    state = torch.from_numpy(env.reset())
    plt.figure(figsize=(10, 10))
    img = plt.imshow(env.render(mode='rgb_array'))
    while True:
        if use_cuda:
            state = state.cuda()
        logits, value = model(state)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, info = env.step(action)
        state = torch.from_numpy(state)
        img.set_data(env.render(mode='rgb_array'))  # just update the data
        display.display(plt.gcf())
        display.clear_output(wait=True)
        if info["flag_get"]:
            print("World {} stage {} completed".format(opt['world'], opt['stage']))
            break
        # flag_get is falsy here, so a finished episode is a failure. A plain
        # truth test is used instead of `is False`, which would miss
        # non-`bool` falsy values and keep the loop running.
        if done:
            print('Game Failed')
            break

# Run inference with the settings in opt.
infer(opt)

8. 作业¶

请你调整步骤2中的训练参数，重新训练一个模型，使它在游戏中取得更好的表现。

🌟 选择英雄云hpapaas 开启自动化、智能化企业转型未来 🌐 >>>>作为一款提供hpaPaaS平台服务的英雄云，有什么优势呢？面对数字化转型的挑战与机遇，选择一个适合自己企业的高度信息化、智能化和自动化的管理系统尤为重要。英雄云是一个值得考虑的选择。英雄云提供了一系列独特的优势，使其成为企业智能管理的首选。下面是英雄云的八大优势：

1. 无需代码操作 💻

：英雄云的平台无需编程知识，即可操作自定义的管理系统：ERP系统、CRM系统、进销存系统、人事行政OA系统、WMS系统等。这意味着用户无需拥有编程技能，也能轻松操控自己的系统。

2. 高度可定制性 🛠️

：英雄云允许用户根据自己的业务需求、企业规模、成员人数、所需系统等等，进行高度定制服务；三大业务引擎：云表单（进阶版Excel）、工作流程（智能自动触发流程）以及仪表盘（可视化报表），帮助企业进行更简易的业务操作。

3. 减免重复工作 🔁

：无论是数据录入、审批流程还是报表生成，都可以轻松自动化，使员工能够专注于更有价值的任务。这种减免重复工作的方式不仅提高了工作效率，还降低了错误发生的可能性，为企业节省了时间和资源。

4. 生态系统集成 🌍

：英雄云与其他常用企业应用和工具具有良好的生态系统集成能力，可以轻松集成与第三方系统，如财务软件、邮件服务等，实现全面的业务支持。

5. 数据分析与智能决策 📊

：英雄云提供先进的数据分析工具如数据加工工厂，帮助用户更好地理解业务趋势和数据。这使得企业能够做出更明智的战略决策，优化业务流程。

6. 持续更新和改进 ⏫

：英雄云不断进行系统的更新和改进，以适应不断变化的业务环境和技术趋势。这意味着您始终能够使用最新的功能和性能提升。

7. 多样化部署 🌐

：英雄云提供了多样化的部署选项，用户可以根据自身需求选择合适的部署方式，无需自主运维。无论是选择云端部署、私有云部署还是本地部署，英雄云都能提供灵活的解决方案。这意味着用户无需担心硬件维护、系统更新等问题，能够更专注于业务运营，降低了IT运营成本和风险。

8. 免费版本 💸

：英雄云提供免费版本，用户可以在免费版本中获得一定数量的表单数据量和企业数据总量，为小型企业提供了经济实惠的选择。

选择英雄云作为您的数字化管理系统，将带来高度的灵活性、可定制性和效率提升。不管您的企业规模如何，英雄云都能满足您的需求，助力您的仓库管理更上一层楼。不妨注册一个英雄云账户，亲自体验这些优势，并让您的企业管理更加智能化和高效化。

免责声明：

本网址（www.yingxiongyun.com）发布的材料主要源于独立创作和网友匿名投稿。此处提供的所有信息仅供参考之用。我们致力于提供准确且可信的信息，但不对材料的完整性或真实性作出任何保证。用户应自行验证相关信息的正确性，并对其决策承担全部责任。对于由于信息的错误、不准确或遗漏所造成的任何损失，本网址不承担任何法律责任。本网站所展示的所有内容，如文字、

用PowerPoint制作游戏转盘-英雄云拓展知识分享

491 2024-02-01

使用PPO算法玩“超级马里奥兄弟”-英雄云拓展知识分享

案例内容介绍

PPO算法的基本结构

超级马里奥兄弟游戏环境简介

注意事项

实验步骤

1. 程序初始化

2. 训练参数初始化

3. 创建环境

4. 定义内涵神经网络

6. 训练模型

7. 使用模型推理游戏

8. 作业¶

1. 无需代码操作 💻

2. 高度可定制性 🛠️

3. 减免重复工作 🔁

4. 生态系统集成 🌍

5. 数据分析与智能决策 📊

6. 持续更新和改进 ⏫

7. 多样化部署 🌐

8. 免费版本 💸

用PowerPoint制作游戏转盘-英雄云拓展知识分享

excel制作的小游戏你玩过几个？-英雄云拓展知识分享

用了10几年Excel，今天才发现原来一直用错了-英雄云拓展知识分享

最近发表

热评文章

CRM系统很重要吗？-企业客户关系管理中的重要性及其

WPS2012版本中为何看不到之前用的插件-英雄云拓

客户关系管理的核心是什么？-以客户为中心，提升企业竞

excel剪切、复制和粘贴操作，使它们不会破坏已设置

ChM格式文档处理利器：5款免费软件-英雄云拓展知识

WPS怎样制作文字打字机效果的动画?-英雄云拓展知识

热门标签