Source code for minestudio.simulator.callbacks.reward_gate

import numpy as np
from minestudio.simulator.callbacks.callback import MinecraftCallback

class GateRewardsCallback(MinecraftCallback):
    """
    A callback for calculating rewards based on the formation of a Nether portal.

    The score of a candidate portal counts obsidian blocks lying on the border
    of a vertical 4-wide by 5-tall rectangle (14 cells, corners included),
    subtracts obsidian blocks placed inside the 2x3 opening, and subtracts a
    small penalty for every obsidian block present. The reward handed to the
    agent at each step is the change in the best score since the previous step.
    """

    # (horizontal, vertical) offsets, relative to the frame's minimum corner,
    # of the 14 border cells of the 4-wide by 5-tall rectangle.
    _FRAME_OFFSETS = tuple(
        (w, h)
        for w in range(4)
        for h in range(5)
        if w in (0, 3) or h in (0, 4)
    )
    # Offsets of the 6 cells inside the frame, which must stay free of obsidian.
    _INTERIOR_OFFSETS = tuple((w, h) for w in range(1, 3) for h in range(1, 4))
    # Every frame block placed beyond this count earns one extra point.
    _BONUS_THRESHOLD = 12
    # Penalty applied per distinct obsidian block present in the voxel data.
    _PER_BLOCK_PENALTY = 0.1

    def __init__(self):
        """
        Initializes the GateRewardsCallback.
        """
        super().__init__()
        self.prev_info = {}
        self.reward_memory = {}
        self.current_step = 0
        # Also set in after_reset; initialised here so that after_step cannot
        # raise AttributeError when it is invoked before the first reset.
        self.prev_reward = 0

    @classmethod
    def _orientation_reward(cls, origin, axis_step, obsidian):
        """
        Scores one portal orientation anchored at ``origin``.

        :param origin: The (x, y, z) coordinates of the frame's minimum corner.
        :param axis_step: The (dx, dz) unit step along which the frame extends
            horizontally: (0, 1) keeps X fixed, (1, 0) keeps Z fixed.
        :param obsidian: A set of (x, y, z) coordinates of all obsidian blocks.
        :return: The reward for this orientation.
        """
        x, y, z = origin
        dx, dz = axis_step
        frame = {(x + w * dx, y + h, z + w * dz) for w, h in cls._FRAME_OFFSETS}
        interior = {
            (x + w * dx, y + h, z + w * dz) for w, h in cls._INTERIOR_OFFSETS
        }
        frame_num = len(frame & obsidian)
        extra_bonus = max(0, frame_num - cls._BONUS_THRESHOLD)
        return (
            frame_num
            + extra_bonus
            - len(interior & obsidian)
            - cls._PER_BLOCK_PENALTY * len(obsidian)
        )

    def reward_as_smlest_pos(self, obsidian_position, obsidian_positions):
        """
        Calculates the reward for a potential portal frame based on a starting
        obsidian block, treated as the frame's minimum corner.

        It checks for both X-fixed and Z-fixed portal orientations.

        :param obsidian_position: The (x, y, z) coordinates of a starting obsidian block.
        :param obsidian_positions: An iterable of (x, y, z) coordinates of all
            obsidian blocks. A set or frozenset is used as-is; anything else is
            converted to a set once.
        :return: The calculated reward for the best portal frame found from this starting block.
        """
        if isinstance(obsidian_positions, (set, frozenset)):
            obsidian = obsidian_positions
        else:
            obsidian = set(obsidian_positions)
        fix_x_reward = self._orientation_reward(obsidian_position, (0, 1), obsidian)
        fix_z_reward = self._orientation_reward(obsidian_position, (1, 0), obsidian)
        return max(fix_x_reward, fix_z_reward)

    def gate_reward(self, info, obs=None):
        """
        Calculates the gate reward based on the current voxel information.

        It iterates through all obsidian blocks and finds the maximum possible
        portal reward. The result is never below 0.

        :param info: The info dictionary containing voxel data under ``"voxels"``.
        :param obs: The observation dictionary (optional, currently unused).
        :return: The maximum gate reward, or 0 when ``info`` has no voxel data.
        """
        if "voxels" not in info:
            return 0
        # Substring match: every block type whose name contains "obsidian"
        # is counted. Built as a set once so each candidate check is cheap.
        obsidian_positions = {
            (voxel["x"], voxel["y"], voxel["z"])
            for voxel in info["voxels"]
            if "obsidian" in voxel["type"]
        }
        max_reward = 0
        for obsidian_position in obsidian_positions:
            reward = self.reward_as_smlest_pos(obsidian_position, obsidian_positions)
            max_reward = max(max_reward, reward)
        return max_reward

    def after_reset(self, sim, obs, info):
        """
        Resets the current step count and previous reward.

        :param sim: The Minecraft simulator.
        :param obs: The observation from the simulator.
        :param info: Additional information from the simulator.
        :return: The observation and info, unchanged.
        """
        self.current_step = 0
        self.prev_reward = 0
        return obs, info

    def after_step(self, sim, obs, reward, terminated, truncated, info):
        """
        Calculates the gate reward for the current step.

        The reward is the difference between the current gate reward and the
        previous gate reward (delta reward). The simulator's own reward is
        discarded.

        :param sim: The Minecraft simulator.
        :param obs: The observation from the simulator.
        :param reward: The original reward from the simulator.
        :param terminated: Whether the episode has terminated.
        :param truncated: Whether the episode has been truncated.
        :param info: Additional information from the simulator.
        :return: The observation, overridden reward, terminated, truncated, and info.
        """
        cur_reward = self.gate_reward(info, obs)
        override_reward = cur_reward - self.prev_reward
        self.prev_reward = cur_reward
        self.current_step += 1
        return obs, override_reward, terminated, truncated, info