import os
import random
import time
from random import randint

import numpy as np
import gym
import gym_vgdl  # imported for its side effect of registering the VGDL envs with gym

import config as cf
from lib import plotting, py_asp, helper, induction, abduction


def run_experiment(env, i_episode, stats_test, width, time_range):
    """Testing phase: execute the current Clingo plan and record the rewards."""
    _ = env.reset()
    t = 0
    agent_position = env.unwrapped.observer.get_observation()["position"]
    abduction.update_agent_position(agent_position, t)
    abduction.update_time_range(agent_position, t)
    answer_sets = abduction.run_clingo(cf.CLINGOFILE)
    states_plan, actions_array = abduction.sort_planning(answer_sets)
    while t < time_range:
        is_done = False
        print("testing phase....")
        for _, action in enumerate(actions_array):
            env.render()
            # time.sleep(0.1)
            action_int = helper.get_action(action[1])
            _, reward, done, _ = env.step(action_int)
            # Reward shaping: +10 on reaching the goal, -1 per step otherwise
            if done:
                reward = reward + 10
            else:
                reward = reward - 1
            print("reward here is ", reward)
            print("i_episode here is ", i_episode)
            # Update stats
            stats_test.episode_rewards[i_episode] += reward
            stats_test.episode_lengths[i_episode] = t
            t += 1
            if done:
                is_done = True
                break
        if is_done:
            break
        # If Clingo does not give a valid path, just accumulate the -1 punishment
        action_int = 4
        _, reward, done, _ = env.step(action_int)
        if done:
            reward = reward + 10
        else:
            reward = reward - 1
        stats_test.episode_rewards[i_episode] += reward
        stats_test.episode_lengths[i_episode] = t
        t += 1
        if done:
            break
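
# The epsilon-greedy rule used in the planning phase of k_learning below,
# factored out as a standalone illustration. This sketch is not called by
# the pipeline; `plan_action` stands for the integer action proposed by the
# current Clingo plan at a given step.
def epsilon_greedy_choice(plan_action, epsilon):
    """With probability epsilon explore randomly, otherwise follow the plan."""
    if random.uniform(0, 1) < epsilon:
        return randint(0, 3)  # explore with one of the action indices 0-3
    return plan_action  # exploit: keep following the plan
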

def k_learning(env, num_episodes, epsilon=0.1, record_prefix=None, is_link=False):
    # Get the cell range for the game
    height = env.unwrapped.game.height
    width = env.unwrapped.game.width
    cell_range = "\ncell((0..{}, 0..{})).\n".format(width - 1, height - 1)

    # Log everything and keep the record here
    log_dir = None
    if record_prefix:
        log_dir = os.path.join(cf.BASE_DIR, "log")
        log_dir = helper.gen_log_dir(log_dir, record_prefix)

    # This becomes True once the agent reaches the goal (and ILASP kicks in)
    reached_goal = False
    # The first abduction needs lots of basic information
    first_abduction = False
    keep_link = None
    goal_state = None  # set once the agent first reaches the goal

    # Clean up all the files first
    helper.silentremove(cf.BASE_DIR, cf.GROUNDING)
    helper.silentremove(cf.BASE_DIR, cf.LASFILE)
    helper.silentremove(cf.BASE_DIR, cf.CLINGOFILE)
    helper.silentremove(cf.BASE_DIR, cf.LAS_CACHE, cf.LAS_CACHE_PATH)
    helper.create_file(cf.BASE_DIR, cf.LAS_CACHE, cf.LAS_CACHE_PATH)
    cf.ALREADY_LINK = False

    # Add the mode bias and adjacency definition for ILASP
    induction.copy_las_base(height, width, cf.LASFILE, is_link)

    # Record the current hypothesis
    hypothesis = ""
    abduction.make_lp_base(cell_range)
    wall_list = induction.get_all_walls(env)

    # Logging containers
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_runtime=np.zeros(num_episodes))
    stats_ilasp = plotting.TimeStats(
        ILASP_runtime=np.zeros((num_episodes, cf.TIME_RANGE)))
    stats_test = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
        episode_runtime=np.zeros(num_episodes))

    for i_episode in range(num_episodes):
        print("==============NEW EPISODE======================")
        start_total_runtime = time.time()
        previous_state = env.reset()
        agent_position = env.unwrapped.observer.get_observation()["position"]
        previous_state_at = py_asp.state_at(previous_state[0], previous_state[1], 0)
        t = 0

        # Once the agent has reached the goal, the planning algorithm kicks in
        if reached_goal:
            new_epsilon = epsilon
            done = False  # guard against an empty first plan
            while t < cf.TIME_RANGE:
                if not first_abduction:
                    # Convert the syntax of H for the ASP solver
                    hypothesis_asp = py_asp.convert_las_asp(hypothesis)
                    abduction.add_hypothesis(hypothesis_asp)
                    abduction.add_start_state(agent_position)
                    abduction.add_goal_state(goal_state)
                    first_abduction = True

                # Update the starting position for Clingo
                agent_position = env.unwrapped.observer.get_observation()["position"]
                abduction.update_agent_position(agent_position, t)
                abduction.update_time_range(agent_position, t)

                # Run Clingo to get a plan
                answer_sets = abduction.run_clingo(cf.CLINGOFILE)
                states_plan, actions_array = abduction.sort_planning(answer_sets)

                # Record the Clingo run
                if record_prefix:
                    inputfile = os.path.join(cf.BASE_DIR, cf.CLINGOFILE)
                    helper.log_asp(inputfile, answer_sets, log_dir, i_episode, t)

                # Execute the plan
                for action_index, action in enumerate(actions_array):
                    print("---------Planning phase---------------------")
                    # Flip a coin. If threshold < epsilon, explore randomly
                    threshold = random.uniform(0, 1)
                    if threshold < new_epsilon:
                        action_int = randint(0, 3)
                        if cf.IS_PRINT:
                            print("Taking a pure random action...",
                                  helper.convert_action(action_int))
                    else:
                        # Follow the plan
                        action_int = helper.get_action(action[1])
                        if cf.IS_PRINT:
                            print("Following the plan...",
                                  helper.convert_action(action_int))
                    action_string = helper.convert_action(action_int)
                    next_state, reward, done, _ = env.step(action_int)
                    next_state_at = py_asp.state_at(next_state[0], next_state[1], t + 1)
                    # Reward shaping: +10 on reaching the goal, -1 per step
                    if done:
                        reward = reward + 10
                    else:
                        reward = reward - 1

                    # Meanwhile, accumulate all background knowledge
                    abduction.add_new_walls(previous_state, wall_list, cf.CLINGOFILE)

                    # Make the ASP syntax of the state transition
                    pos1, pos2, link = induction.generate_pos(
                        hypothesis, previous_state, next_state,
                        action_string, wall_list, cell_range)
                    if link is not None:
                        keep_link = link

                    # Update H if it no longer covers the examples
                    if (not induction.check_ILASP_cover(hypothesis, pos1, height, width, keep_link)
                            or not induction.check_ILASP_cover(hypothesis, pos2, height, width, keep_link)):
                        start_time = time.time()
                        hypothesis = induction.run_ILASP(cf.LASFILE, cf.CACHE_DIR)
                        stats_ilasp.ILASP_runtime[i_episode, t] += time.time() - start_time
                        if hypothesis == "UNSATISFIABLE\n":
                            # Halt for inspection if ILASP cannot find a hypothesis
                            import ipdb; ipdb.set_trace()
                        # Convert the syntax of H for the ASP solver
                        hypothesis_asp = py_asp.convert_las_asp(hypothesis)
                        abduction.update_h(hypothesis_asp)

                    if record_prefix:
                        inputfile = os.path.join(cf.BASE_DIR, cf.LASFILE)
                        helper.log_las(inputfile, hypothesis, log_dir, i_episode, t)

                    previous_state = next_state
                    previous_state_at = next_state_at

                    # Update stats
                    stats.episode_rewards[i_episode] += reward
                    stats.episode_lengths[i_episode] = action_index
                    env.render()
                    # time.sleep(0.1)
                    t += 1
                    if done or (threshold < new_epsilon):
                        break

                # Avoid an infinite loop when Clingo returns no plan
                if not actions_array:
                    t += 1
                if done:
                    break

        # Take random actions until ILASP kicks in
        else:
            for t in range(cf.TIME_RANGE):
                env.render()
                # Uncomment time.sleep() to watch the agent move more slowly
                # time.sleep(0.1)

                # Take a step
                action = randint(0, 3)
                next_state, reward, done, _ = env.step(action)
                action_string = helper.convert_action(action)
                if done:
                    reward = reward + 10
                    goal_state = next_state
                    reached_goal = True
                else:
                    reward = reward - 1

                # Meanwhile, accumulate all background knowledge
                abduction.add_new_walls(previous_state, wall_list, cf.CLINGOFILE)

                # Make the ASP syntax of the state transition and send it to LASFILE
                pos1, pos2, link = induction.generate_pos(
                    hypothesis, previous_state, next_state,
                    action_string, wall_list, cell_range)
                if link is not None:
                    keep_link = link

                # Update H if it no longer covers the examples (or is still empty)
                if (not induction.check_ILASP_cover(hypothesis, pos1, height, width, keep_link)
                        or not induction.check_ILASP_cover(hypothesis, pos2, height, width, keep_link)
                        or hypothesis == ''):
                    start_time = time.time()
                    hypothesis = induction.run_ILASP(cf.LASFILE, cf.CACHE_DIR)
                    stats_ilasp.ILASP_runtime[i_episode, t] += time.time() - start_time
                    if hypothesis == "UNSATISFIABLE\n":
                        # Halt for inspection if ILASP cannot find a hypothesis
                        import ipdb; ipdb.set_trace()
                    if record_prefix:
                        inputfile = os.path.join(cf.BASE_DIR, cf.LASFILE)
                        helper.log_las(inputfile, hypothesis, log_dir, i_episode, t)

                previous_state = next_state
                # Update stats
                stats.episode_rewards[i_episode] += reward
                stats.episode_lengths[i_episode] = t
                if done:
                    break

        stats.episode_runtime[i_episode] += time.time() - start_total_runtime
        run_experiment(env, i_episode, stats_test, width, cf.TIME_RANGE)

    return stats, stats_test, stats_ilasp
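
# A quick illustration of the cell_range fact that k_learning hands to the ASP
# programs. For a hypothetical 5x4 grid (width=5, height=4) the format string
# above produces a Clingo interval fact, which grounds to one cell/1 atom per
# (x, y) coordinate pair:
#
# >>> "\ncell((0..{}, 0..{})).\n".format(5 - 1, 4 - 1)
# '\ncell((0..4, 0..3)).\n'
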

# env = gym.make('vgdl_experiment4_after-v0')
env = gym.make('vgdl_experiment1-v0')
# env = gym.make('vgdl_experiment3_after-v0')

temp_dir = os.path.join(cf.BASE_DIR, "result_pkl/experiment4_after_TL")
for i in range(30):
    stats, stats_test, stats_ilasp = k_learning(
        env, 100, epsilon=0.1, record_prefix="exp1_again", is_link=False)
    plotting.store_stats(stats, temp_dir, "exp4_v{}".format(i))
    plotting.store_stats(stats_test, temp_dir, "exp4_test_v{}".format(i))
    plotting.store_stats(stats_ilasp, temp_dir, "exp4_ilasp_v{}".format(i))

# stats, stats_test, stats_ilasp = k_learning(env, 100, epsilon=0.1, record_prefix=None, is_link=None)
# plotting.plot_episode_stats_simple(stats, smoothing_window=1)
# plotting.plot_episode_stats_simple(stats_test, smoothing_window=1)
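
# A minimal helper for inspecting a run in-process. Illustrative sketch only;
# it relies on nothing beyond the EpisodeStats fields created above.
def summarize_stats(stats, label="run"):
    """Print simple aggregates of an EpisodeStats record."""
    print("[{}] mean reward per episode: {:.2f}".format(label, np.mean(stats.episode_rewards)))
    print("[{}] mean episode length:     {:.2f}".format(label, np.mean(stats.episode_lengths)))
    print("[{}] total runtime (s):       {:.2f}".format(label, np.sum(stats.episode_runtime)))

# summarize_stats(stats, "train")
# summarize_stats(stats_test, "test")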