From 7713030154ffb595e3b87d4e3df3847f1f7712a4 Mon Sep 17 00:00:00 2001
From: spencerfolk
Date: Tue, 2 Jan 2024 14:51:30 -0500
Subject: [PATCH] Removed first PPO script

---
 examples/ppo_hover.py | 87 ------------------------------------------
 1 file changed, 87 deletions(-)
 delete mode 100644 examples/ppo_hover.py

diff --git a/examples/ppo_hover.py b/examples/ppo_hover.py
deleted file mode 100644
index 34a9c53..0000000
--- a/examples/ppo_hover.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import gymnasium as gym
-import numpy as np
-import matplotlib.pyplot as plt
-import os
-
-from rotorpy.vehicles.crazyflie_params import quad_params # Import quad params for the quadrotor environment.
-
-# Import the QuadrotorEnv gymnasium environment using the following command.
-from rotorpy.learning.quadrotor_environments import QuadrotorEnv
-
-# Reward functions can be specified by the user, or we can import from existing reward functions.
-from rotorpy.learning.quadrotor_reward_functions import hover_reward
-
-"""
-In this script, we demonstrate how to train a hovering control policy in RotorPy using Proximal Policy Optimization.
-We use our custom quadrotor environment for Gymnasium along with stable baselines for the PPO implementation.
-
-The task is for the quadrotor to stabilize to hover at the origin when starting at a random position nearby.
-
-"""
-
-# First we'll set up some directories for saving the policy and logs.
-
-models_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "rotorpy", "learning", "policies")
-log_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "rotorpy", "learning", "logs")
-if not os.path.exists(models_dir):
-    os.makedirs(models_dir)
-if not os.path.exists(log_dir):
-    os.makedirs(log_dir)
-
-# Next import Stable Baselines.
-try:
-    import stable_baselines3
-except:
-    raise ImportError('You must have Stable Baselines installed via pip install stable_baselines3')
-
-from stable_baselines3 import PPO # We'll use PPO for training.
-from stable_baselines3.ppo.policies import MlpPolicy # The policy will be represented by an MLP
-from stable_baselines3.common.evaluation import evaluate_policy # For evaluation
-
-# Make the environment. For this demo we'll train a policy in cmd_vel. Higher abstractions lead to easier tasks.
-env = gym.make("Quadrotor-v0",
-               control_mode ='cmd_ctbr',
-               reward_fn = hover_reward,
-               quad_params = quad_params,
-               max_time = 5,
-               world = None,
-               sim_rate = 100,
-               render_mode='3D')
-
-# from stable_baselines3.common.env_checker import check_env
-# check_env(env, warn=True) # you can check the environment using built in tools
-
-# Reset the environment
-observation, info = env.reset(initial_state='random')
-
-# Load the policy.
-model_path = os.path.join(models_dir, "PPOMlpPolicy.zip")
-# If model is already trained, load it and continue training
-if os.path.exists(model_path):
-    model = PPO.load(model_path, env=env, tensorboard_log=log_dir)
-    print("Loading existing model.")
-else:
-    model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
-
-# Training...
-num_timesteps = 10_000
-num_episodes = 100
-
-best_mean_reward = -100_000_000 # Really large number so that in the first iteration it is overwritten
-
-for i in range(num_episodes):
-
-    # This line will run num_timesteps for training and log the results every so often.
-    model.learn(total_timesteps=num_timesteps, reset_num_timesteps=False, tb_log_name="PPO_{}".format(i))
-
-    if i % 10 == 0: # Evaluate the policy every 10 episodes
-        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
-
-        # Save the model if it is better than the previous one
-        if mean_reward >= best_mean_reward:
-            best_mean_reward = mean_reward
-            # Example for saving best model
-            print("Saving new best model")
-            model.save(model_path)
-        else:
-            print("Not saving model, mean reward was {:.2f}, but best reward is {:.2f}".format(mean_reward, best_mean_reward))