How can I make action sampling stay within the range specified by my environment when using onpolicy_trainer? #1142

Open · lidaken opened this issue May 9, 2024 · 6 comments
Labels: question (Further information is requested)

Comments

@lidaken commented May 9, 2024

Hi, I am new to tianshou and RL. I created an environment and trained it with PPO in tianshou, but I found that the sampled actions are outside the range defined by my action space. I searched and found `map_action`, but it does not seem to be used in the trainer. How can I solve this problem? Thanks a lot.
```python
# continuous actions:
orn_low = np.array([-30, -30, -30]) * np.pi / 180
orn_high = np.array([30, 30, 30]) * np.pi / 180
v_low = np.array([0.001])
v_high = np.array([0.1])
distance_low = np.array([0.01])
distance_high = np.array([0.5])
act_low = np.concatenate((orn_low, v_low, distance_low))
act_high = np.concatenate((orn_high, v_high, distance_high))
self.action_space = spaces.Box(low=act_low, high=act_high, dtype=np.float64)
# note: dtype must be the space's dtype, not the space object itself
self.action = np.zeros(self.action_space.shape, dtype=self.action_space.dtype)
```
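For context on what `map_action` does, here is a rough sketch (not the exact tianshou source) under the assumption that `action_scaling=True` and `action_bound_method="clip"` are used: the raw network output, which lives roughly in [-1, 1], is clipped and then rescaled linearly into `[low, high]` of the Box above before it is handed to `env.step`.

```python
import numpy as np

# Bounds copied from the action space above.
orn_low = np.array([-30, -30, -30]) * np.pi / 180
orn_high = np.array([30, 30, 30]) * np.pi / 180
act_low = np.concatenate((orn_low, [0.001], [0.01]))
act_high = np.concatenate((orn_high, [0.1], [0.5]))

def map_action_sketch(act: np.ndarray) -> np.ndarray:
    """Sketch of tianshou-style action scaling for a Box space:
    clip the raw policy output to [-1, 1] (action_bound_method="clip"),
    then rescale it linearly into [low, high] (action_scaling=True)."""
    act = np.clip(act, -1.0, 1.0)
    return act_low + (act + 1.0) * 0.5 * (act_high - act_low)

# A raw output of zeros maps to the midpoint of each dimension.
print(map_action_sketch(np.zeros(5)))
```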
```python
# model
net_a = Net(
    args.state_shape,
    hidden_sizes=args.hidden_sizes,
    activation=nn.Tanh,
    device=args.device,
)
actor = ActorProb(
    net_a,
    args.action_shape,
    unbounded=True,
    device=args.device,
).to(args.device)
net_c = Net(
    args.state_shape,
    hidden_sizes=args.hidden_sizes,
    activation=nn.Tanh,
    device=args.device,
)
critic = Critic(net_c, device=args.device).to(args.device)
actor_critic = ActorCritic(actor, critic)

torch.nn.init.constant_(actor.sigma_param, -0.5)
for m in actor_critic.modules():
    if isinstance(m, torch.nn.Linear):
        # orthogonal initialization
        torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
        torch.nn.init.zeros_(m.bias)
# do last policy layer scaling, this will make initial actions have (close to)
# 0 mean and std, and will help boost performances,
# see https://arxiv.org/abs/2006.05990, Fig.24 for details
for m in actor.mu.modules():
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.zeros_(m.bias)
        m.weight.data.copy_(0.01 * m.weight.data)

optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)

lr_scheduler = None
if args.lr_decay:
    # decay learning rate to 0 linearly
    max_update_num = np.ceil(
        args.step_per_epoch / args.step_per_collect
    ) * args.epoch

    lr_scheduler = LambdaLR(
        optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num
    )

def dist(*logits):
    return Independent(Normal(*logits), 1)

policy = PPOPolicy(
    actor,
    critic,
    optim,
    dist,
    discount_factor=args.gamma,
    gae_lambda=args.gae_lambda,
    max_grad_norm=args.max_grad_norm,
    vf_coef=args.vf_coef,
    ent_coef=args.ent_coef,
    reward_normalization=args.rew_norm,
    action_scaling=True,
    action_bound_method=args.bound_action_method,
    lr_scheduler=lr_scheduler,
    action_space=env.action_space,
    eps_clip=args.eps_clip,
    value_clip=args.value_clip,
    dual_clip=args.dual_clip,
    advantage_normalization=args.norm_adv,
    recompute_advantage=args.recompute_adv,
)

if not args.watch:
    # trainer
    #train_envs.render(args.watch)

    result = onpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.repeat_per_collect,
        args.test_num,
        args.batch_size,
        step_per_collect=args.step_per_collect,
        save_best_fn=save_best_fn,
        logger=logger,
        test_in_train=False,
    )
```
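Note that this mapping is not applied by the trainer itself: it happens in `Collector.collect`, which calls `policy.map_action(...)` on the raw network output right before `env.step`. Assuming the `policy` and `env` objects built above, a quick sanity check could look like this (illustrative only):

```python
import numpy as np

# `policy` and `env` are assumed to be the objects constructed in the script above.
raw_act = np.zeros(env.action_space.shape)   # pretend network output in [-1, 1]
mapped_act = policy.map_action(raw_act)      # what the Collector passes to env.step
print("low:   ", env.action_space.low)
print("high:  ", env.action_space.high)
print("mapped:", mapped_act)                 # should lie inside [low, high]
```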
@lidaken (Author) commented May 9, 2024

```python
# model
net_a = Net(
    args.state_shape,
    hidden_sizes=args.hidden_sizes,
    activation=nn.Tanh,
    device=args.device,
)
actor = ActorProb(
    net_a,
    args.action_shape,
    unbounded=True,
    device=args.device,
).to(args.device)
net_c = Net(
    args.state_shape,
    hidden_sizes=args.hidden_sizes,
    activation=nn.Tanh,
    device=args.device,
)
critic = Critic(net_c, device=args.device).to(args.device)
actor_critic = ActorCritic(actor, critic)

torch.nn.init.constant_(actor.sigma_param, -0.5)
for m in actor_critic.modules():
    if isinstance(m, torch.nn.Linear):
        # orthogonal initialization
        torch.nn.init.orthogonal_(m.weight, gain=np.sqrt(2))
        torch.nn.init.zeros_(m.bias)
# do last policy layer scaling, this will make initial actions have (close to)
# 0 mean and std, and will help boost performances,
# see https://arxiv.org/abs/2006.05990, Fig.24 for details
for m in actor.mu.modules():
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.zeros_(m.bias)
        m.weight.data.copy_(0.01 * m.weight.data)

optim = torch.optim.Adam(actor_critic.parameters(), lr=args.lr)

lr_scheduler = None
if args.lr_decay:
    # decay learning rate to 0 linearly
    max_update_num = np.ceil(
        args.step_per_epoch / args.step_per_collect
    ) * args.epoch

    lr_scheduler = LambdaLR(
        optim, lr_lambda=lambda epoch: 1 - epoch / max_update_num
    )

def dist(*logits):
    return Independent(Normal(*logits), 1)

policy = PPOPolicy(
    actor,
    critic,
    optim,
    dist,
    discount_factor=args.gamma,
    gae_lambda=args.gae_lambda,
    max_grad_norm=args.max_grad_norm,
    vf_coef=args.vf_coef,
    ent_coef=args.ent_coef,
    reward_normalization=args.rew_norm,
    action_scaling=True,
    action_bound_method=args.bound_action_method,
    lr_scheduler=lr_scheduler,
    action_space=env.action_space,
    eps_clip=args.eps_clip,
    value_clip=args.value_clip,
    dual_clip=args.dual_clip,
    advantage_normalization=args.norm_adv,
    recompute_advantage=args.recompute_adv,
)

# # load a previous policy
# if args.resume_path:
#     ckpt = torch.load(args.resume_path, map_location=args.device)
#     policy.load_state_dict(ckpt["model"])
#     train_envs.set_obs_rms(ckpt["obs_rms"])
#     test_envs.set_obs_rms(ckpt["obs_rms"])
#     print("Loaded agent from: ", args.resume_path)

# collector
# if args.training_num > 1:
#     buffer = VectorReplayBuffer(args.buffer_size, len(train_envs))
# else:
buffer = VectorReplayBuffer(args.buffer_size, buffer_num=len(train_envs))
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)


# Collector will call env.reset(), which leads to a pybullet error
test_collector = Collector(policy, test_envs)

# log
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
args.algo_name = "ppo"
log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
log_path = os.path.join(args.logdir, log_name)

# logger
if args.logger == "wandb":
    logger = WandbLogger(
        save_interval=1,
        name=log_name.replace(os.path.sep, "__"),
        run_id=args.resume_id,
        config=args,
        project=args.wandb_project,
    )
writer = SummaryWriter(log_path)
writer.add_text("args", str(args))
if args.logger == "tensorboard":
    logger = TensorboardLogger(writer)
else:  # wandb
    logger.load(writer)

def save_best_fn(policy):
    state = {"model": policy.state_dict(), "obs_rms": train_envs.get_obs_rms()}
    torch.save(state, os.path.join(log_path, "policy.pth"))

if not args.watch:
    # trainer
    #train_envs.render(args.watch)

    result = onpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.repeat_per_collect,
        args.test_num,
        args.batch_size,
        step_per_collect=args.step_per_collect,
        save_best_fn=save_best_fn,
        logger=logger,
        test_in_train=False,
    )

```
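A complementary way to verify that the actions actually reaching the environment are in range is to wrap the env with a small assertion wrapper before handing it to the Collector. The wrapper below is only a sketch; the name `AssertActionInRange` is made up for this example:

```python
import gymnasium as gym
import numpy as np

class AssertActionInRange(gym.ActionWrapper):
    """Illustrative wrapper: fail loudly if an out-of-range action reaches env.step."""

    def action(self, act):
        low, high = self.action_space.low, self.action_space.high
        assert np.all(act >= low - 1e-8) and np.all(act <= high + 1e-8), (
            f"out-of-range action {act}, expected within [{low}, {high}]"
        )
        return act

# usage sketch: wrap each env before building train_envs / test_envs
# env = AssertActionInRange(YourEnv())
```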

@lidaken (Author) commented May 9, 2024

```python
# continuous actions:
orn_low = np.array([-30, -30, -30]) * np.pi / 180
orn_high = np.array([30, 30, 30]) * np.pi / 180
v_low = np.array([0.001])
v_high = np.array([0.1])
distance_low = np.array([0.01])
distance_high = np.array([0.5])
act_low = np.concatenate((orn_low, v_low, distance_low))
act_high = np.concatenate((orn_high, v_high, distance_high))
self.action_space = spaces.Box(low=act_low, high=act_high, dtype=np.float64)
# note: dtype must be the space's dtype, not the space object itself
self.action = np.zeros(self.action_space.shape, dtype=self.action_space.dtype)
```

@lidaken (Author) commented May 9, 2024

Sorry, I don't know what's wrong with the code T_T

@lidaken (Author) commented May 9, 2024

While debugging, I found that `map_action` is called in collector.py, but the branch
`if isinstance(self.action_space, gym.spaces.Box) and isinstance(act, np.ndarray):`
is never entered. I added a check and found that my action_space is a `gym.spaces.box.Box`, which does not match the `gym.spaces.Box` used in the isinstance check. Can someone tell me how to solve this?
[two screenshots attached]
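This symptom is consistent with the environment being built on the legacy `gym` package while the collector's check is effectively against `gymnasium.spaces.Box` (tianshou imports gymnasium internally). A minimal demonstration of the mismatch, assuming both packages happen to be installed:

```python
import gym          # legacy package
import gymnasium
import numpy as np

legacy_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(5,), dtype=np.float64)

# A space created with the legacy gym package is not an instance of gymnasium's Box,
# so an isinstance check against gymnasium.spaces.Box fails:
print(isinstance(legacy_space, gymnasium.spaces.Box))   # False
print(isinstance(legacy_space, gym.spaces.Box))         # True
```

If that is what is happening here, rebuilding the environment and its spaces on gymnasium should make the `Box` branch of `map_action` trigger.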

@MischaPanch (Collaborator) commented
I can take a look soon. Could you please:

  1. Format your posts above to make them a bit more readable.
  2. Give some info on how you defined your environment. Tianshou only supports gymnasium envs, not gym - maybe that is the problem? Is the environment code available?
  3. Check whether you have the right versions installed. How did you install tianshou, from master or from PyPI? As mentioned above, you should not build your env from gym, so gym should not be installed, and gymnasium should be the version that automatically comes with tianshou.

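Regarding point 2: a minimal gymnasium-based skeleton with the action bounds from this issue could look as follows. The class name `MyRobotEnv`, the observation space, and the reward logic are placeholders, not part of the original code:

```python
import gymnasium as gym
from gymnasium import spaces
import numpy as np

class MyRobotEnv(gym.Env):
    """Illustrative skeleton only; observation and reward logic are placeholders."""

    def __init__(self):
        orn_low = np.array([-30, -30, -30]) * np.pi / 180
        orn_high = np.array([30, 30, 30]) * np.pi / 180
        act_low = np.concatenate((orn_low, [0.001], [0.01]))
        act_high = np.concatenate((orn_high, [0.1], [0.5]))
        # spaces come from gymnasium, not the legacy gym package
        self.action_space = spaces.Box(low=act_low, high=act_high, dtype=np.float64)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(8,), dtype=np.float64)

    def reset(self, *, seed=None, options=None):
        super().reset(seed=seed)
        obs = np.zeros(self.observation_space.shape)
        return obs, {}

    def step(self, action):
        obs = np.zeros(self.observation_space.shape)
        reward, terminated, truncated, info = 0.0, False, False, {}
        return obs, reward, terminated, truncated, info
```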
@MischaPanch added the question (Further information is requested) label on May 9, 2024
@lidaken (Author) commented May 10, 2024

> I can take a look soon. Could you please:
>
> 1. Format your posts above to make them a bit more readable.
> 2. Give some info on how you defined your environment. Tianshou only supports gymnasium envs, not gym - maybe that is the problem? Is the environment code available?
> 3. Check whether you have the right versions installed. How did you install tianshou, from master or from PyPI? As mentioned above, you should not build your env from gym, so gym should not be installed, and gymnasium should be the version that automatically comes with tianshou.

OK, thanks, I will re-upload the code.
