
Agent with multiple reward parameters not improving

Discussion in 'ML-Agents' started by NayanMLAI, Aug 13, 2022.

  1. NayanMLAI

    NayanMLAI

    Joined:
    Jun 22, 2022
    Posts:
    1
    I have an agent that tries to move towards a target. It can move along the x, y, and z axes and has momentum. It can also rotate, and the goal is for the agent to reach the goal object with its forward vector facing the goal and with minimum velocity. It also uses a fuel system instead of a max agent step, since I want the episode length to correspond to the magnitude of the agent's movement rather than a fixed max agent step (using continuous actions).
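
    The fuel mechanic, roughly, in illustrative (Python-style) pseudocode (the drain rate here is a made-up placeholder, not my actual value):
    Code (Python):
    def consume_fuel(fuel, continuous_actions):
        # fuel drains in proportion to the magnitude of the continuous actions,
        # so the episode length depends on how much the agent actually moves
        fuel -= 0.01 * sum(abs(a) for a in continuous_actions)  # 0.01 = placeholder rate
        episode_over = fuel <= 0.0
        return fuel, episode_over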

    The agent observes its own position, the goal position, the difference in direction between the agent's forward vector and the direction vector between the goal and the agent, its velocity, its remaining fuel, and the distance between the goal and the agent.
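
    Assuming the "difference in direction" is the angle between the two vectors, the observations work out to something like this in illustrative (Python-style) pseudocode (not my actual Agent script):
    Code (Python):
    import numpy as np

    def build_observations(agent_pos, goal_pos, agent_forward, velocity, fuel):
        to_goal = goal_pos - agent_pos
        distance = np.linalg.norm(to_goal)
        # angle between the (normalised) forward vector and the direction to the goal
        cos_angle = np.dot(agent_forward, to_goal) / (distance + 1e-8)
        angle = np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0)))
        return np.concatenate([
            agent_pos,    # 3 values
            goal_pos,     # 3 values
            [angle],      # 1 value
            velocity,     # 3 values
            [fuel],       # 1 value
            [distance],   # 1 value
        ])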

    The reward function is as follows:
    For reaching the goal: +1 - (difference in direction between the agent's forward vector and the direction to the goal)/10 - (velocity)/25 - (fuel used)/1000
    For not reaching the goal: (distance between the agent and the goal)/1000
    For going out of bounds: -1
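
    In illustrative (Python-style) pseudocode, mirroring the description above (the names are placeholders, not my actual Agent code):
    Code (Python):
    def compute_reward(reached_goal, out_of_bounds,
                       angle_to_goal, speed, fuel_used, distance_to_goal):
        if out_of_bounds:
            return -1.0                 # went out of bounds
        if reached_goal:
            # terminal reward, reduced by facing error, speed and fuel spent
            return 1.0 - angle_to_goal / 10.0 - speed / 25.0 - fuel_used / 1000.0
        # goal not reached
        return distance_to_goal / 1000.0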

    I am trying to penalise it for not facing the goal and for going too fast. The training graphs look like this (attached): Screenshot from 2022-08-13 04-49-02.png


    I am using curriculum learning, slowly increasing the distance from the target and widening the starting rotation so the agent is not necessarily facing the goal at the start of each episode.

    The agent learns to face the right way but doesn't learn to decrease its velocity.
    The completion threshold is 0.9 for all lessons (of which there are five).

    The config file looks like:
    Code (YAML):
    default_settings: null
    behaviors:
      Lander:
        trainer_type: ppo
        hyperparameters:
          batch_size: 2048
          buffer_size: 20480
          learning_rate: 0.0003
          beta: 0.005
          epsilon: 0.2
          lambd: 0.95
          num_epoch: 3
          learning_rate_schedule: linear
          beta_schedule: linear
          epsilon_schedule: linear
        network_settings:
          normalize: true
          hidden_units: 512
          num_layers: 3
          vis_encode_type: simple
          memory: null
          goal_conditioning_type: hyper
          deterministic: false
        reward_signals:
          extrinsic:
            gamma: 0.99
            strength: 1.0
            network_settings:
              normalize: false
              hidden_units: 128
              num_layers: 2
              vis_encode_type: simple
              memory: null
              goal_conditioning_type: hyper
              deterministic: false
        init_path: null
        keep_checkpoints: 5
        checkpoint_interval: 500000
        max_steps: 50000000
        time_horizon: 1000
        summary_freq: 25000
        threaded: false
        self_play: null
    env_settings:
      env_path: null
      env_args: null
      base_port: 5005
      num_envs: 1
      num_areas: 1
      seed: -1
      max_lifetime_restarts: 10
      restarts_rate_limit_n: 1
      restarts_rate_limit_period_s: 60
    engine_settings:
      width: 84
      height: 84
      quality_level: 5
      time_scale: 20.0
      target_frame_rate: -1
      capture_frame_rate: 60
      no_graphics: false
    environment_parameters:
      start_position:
        curriculum:
          - name: L0
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: -60.0
                max_value: -55.0
          - name: L1
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: -70.0
                max_value: -60.0
          - name: L2
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: -80.0
                max_value: -70.0
          - name: L3
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: -90.0
                max_value: -80.0
          - name: L4
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: -100.0
                max_value: -90.0
      start_rotation:
        curriculum:
          - name: L0
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: 175.0
                max_value: 185.0
          - name: L1
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: 165.0
                max_value: 195.0
          - name: L2
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: 155.0
                max_value: 215.0
          - name: L3
            completion_criteria:
              measure: reward
              behavior: Lander
              signal_smoothing: true
              min_lesson_length: 500
              threshold: 0.9
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: 145.0
                max_value: 225.0
          - name: L4
            value:
              sampler_type: uniform
              sampler_parameters:
                min_value: 135.0
                max_value: 235.0
    checkpoint_settings:
      run_id: ppo
      initialize_from: null
      load_model: false
      resume: false
      force: true
      train_model: false
      inference: false
      results_dir: results
    torch_settings:
      device: null
    debug: false
     
    Last edited: Aug 13, 2022