
Resolved: I can't figure it out with the SelfPlay feature

Discussion in 'ML-Agents' started by WatchizZX, Oct 30, 2023.

  1. WatchizZX

    Joined: Jan 10, 2023
    Posts: 1
    Hi all, I am trying to create a racing game with ML-Agents. The agents receive observations in the form of raycasts (x20) and a gridSensor (36*1*36). The raycasts scan the environment for the walls that run along the road, and the gridSensor scans for other players. As output, the agents produce throttle (-1, 1) and steering (-1, 1). I'm training two agents on one map at a time, because I want to teach them to push an opponent from behind in order to overtake him, and also to avoid an opponent who is driving right behind them. Each agent has a different team ID: one has 0, the other 1. I also set their MaxStep to 0, since I limit the number of steps with a separate script (which I wrote based on the examples). The problem is that only one agent seems to be working, and the second just copies its actions: the steering and throttle are identical no matter where the two agents are relative to each other. The script controlling the environment and the ML-Agents configuration are attached below, followed by a simplified sketch of how the agents themselves are set up.

    Code (CSharp):
    using System.Collections;
    using System.Collections.Generic;
    using System.Linq;
    using System.Xml.Schema;

    using Unity.MLAgents;

    using UnityEngine;

    public class EnviromentController : MonoBehaviour
    {
        private SimpleMultiAgentGroup firstGroup;
        private SimpleMultiAgentGroup secondGroup;

        private int m_ResetTimer;

        [Header("Agent reward settings")]
        [Range(10, 300)]
        [SerializeField] public float maxSpeed = 60f;
        [Range(0.001f, 1f)]
        [SerializeField] private float speedReward = 0.00005f;

        [Space(1f)]
        [SerializeField] private bool addRewardBySpeed = true;
        [SerializeField] private bool addStepPenalty = true;
        [Space(2f)]

        [Header("Agent observation settings")]
        [SerializeField] public bool addCheckpointObservation = true;
        [Space(2f)]

        [Header("Agent other settings")]
        [SerializeField] public float agentMinSpeed = 2f;
        [SerializeField] public bool agentControlHandBrake = false;
        [SerializeField] public bool agentCanReverse = false;

        [Header("Checkpoint options")]
        [SerializeField] public CheckpointManager checkpointManager;
        [Range(0f, 5f)]
        [SerializeField] private float newLapReward = 1f;
        [Range(0f, -1f)]
        [SerializeField] private float incorrectCheckpointPenalty = -0.01f;
        [Space(2f)]

        [Header("Extra options")]
        [SerializeField] private float timeOut = 15;
        [SerializeField] private int collisionThreshold = 10;

        [Range(0f, 10f)]
        [SerializeField] private float xSpawnOffset = 0f;
        [Range(0f, 10f)]
        [SerializeField] private float ySpawnOffset = 0f;
        [Range(0f, 10f)]
        [SerializeField] private float zSpawnOffset = 0f;

        [Range(-0.001f, -5f)]
        [SerializeField] private float collisionPenalty = -0.3f;
        [Range(-0.001f, -5f)]
        [SerializeField] private float collisionStayPenalty = -0.08f;
        [SerializeField] private float hitReward = 0.1f;
        [SerializeField] private float hitPenaly = -0.1f;

        [Tooltip("Max Environment Steps")] public int MaxEnvironmentSteps = 25000;

        [SerializeField]
        public List<RoverAgent> roverAgents = new List<RoverAgent>();

        private float correctCheckpointReward;
        private float stepPenalty;

        float firstGroupReward;
        float secondGroupReward;

        private RoverAgent firstAgent;
        private RoverAgent secondAgent;

        private void InitializeVariables()
        {
            correctCheckpointReward = 1f / checkpointManager.CheckpointsCount;
            stepPenalty = -1f / MaxEnvironmentSteps;

            firstGroupReward = 0;
            secondGroupReward = 0;
        }

        private void Start()
        {
            InitializeVariables();

            firstGroup = new SimpleMultiAgentGroup();
            secondGroup = new SimpleMultiAgentGroup();

            foreach (var agent in roverAgents)
            {
                if (agent.TeamID == 0)
                {
                    firstGroup.RegisterAgent(agent);
                    firstAgent = agent;
                }
                else if (agent.TeamID == 1)
                {
                    secondGroup.RegisterAgent(agent);
                    secondAgent = agent;
                }
            }

            ResetScene();
        }

        public void OnCrash(RoverAgent agent, GameObject other, bool stayInCollision)
        {
            if (other.tag == "Vehicle" && stayInCollision == false)
            {
                float selfAngle = Vector3.Angle(agent.transform.forward, other.transform.position);
                float otherAngle = Vector3.Angle(other.transform.forward, agent.transform.position);

                if (selfAngle < otherAngle)
                {
                    if (agent.TeamID == 0)
                    {
                        firstGroup.AddGroupReward(hitReward);
                        secondGroup.AddGroupReward(hitPenaly);
                    }
                    else if (agent.TeamID == 1)
                    {
                        firstGroup.AddGroupReward(hitPenaly);
                        secondGroup.AddGroupReward(hitReward);
                    }
                }
            }
            if (other.tag == "Wall")
            {
                if (stayInCollision)
                {
                    switch (agent.TeamID)
                    {
                        case 0:
                            firstGroup.AddGroupReward(collisionStayPenalty);
                            break;
                        case 1:
                            secondGroup.AddGroupReward(collisionStayPenalty);
                            break;
                    }
                }
                else
                {
                    switch (agent.TeamID)
                    {
                        case 0:
                            firstGroup.AddGroupReward(collisionPenalty);
                            break;
                        case 1:
                            secondGroup.AddGroupReward(collisionPenalty);
                            break;
                    }
                }
            }
        }

        void FixedUpdate()
        {
            m_ResetTimer += 1;

            if (firstGroupReward < -1f && secondGroupReward < -1f)
            {
                firstGroup.EndGroupEpisode();
                secondGroup.EndGroupEpisode();
            }
            else if (firstGroupReward > -1f)
            {
                firstGroup.EndGroupEpisode();
                secondGroup.GroupEpisodeInterrupted();
            }
            else if (secondGroupReward > -1f)
            {
                secondGroup.EndGroupEpisode();
                firstGroup.GroupEpisodeInterrupted();
            }

            if (addStepPenalty)
            {
                firstGroupReward += stepPenalty;
                secondGroupReward += stepPenalty;

                firstGroup.AddGroupReward(stepPenalty);
                secondGroup.AddGroupReward(stepPenalty);
            }

            if (addRewardBySpeed)
            {
                firstGroupReward += firstAgent.Speed * speedReward;
                secondGroupReward += secondAgent.Speed * speedReward;

                firstGroup.AddGroupReward(firstAgent.Speed * speedReward);
                secondGroup.AddGroupReward(secondAgent.Speed * speedReward);
            }

            if (m_ResetTimer >= MaxEnvironmentSteps && MaxEnvironmentSteps > 0)
            {
                firstGroup.GroupEpisodeInterrupted();
                secondGroup.GroupEpisodeInterrupted();
                ResetScene();
            }
        }

        public void OnEndLap(RoverAgent agent)
        {
            switch (agent.TeamID)
            {
                case 0:
                    firstGroup.AddGroupReward(newLapReward);
                    break;
                case 1:
                    secondGroup.AddGroupReward(newLapReward);
                    break;
            }

            firstGroup.EndGroupEpisode();
            secondGroup.EndGroupEpisode();

            ResetScene();
        }

        public void ResetScene()
        {
            m_ResetTimer = 0;

            firstGroupReward = 0;
            secondGroupReward = 0;

            //Reset Agents
            foreach (var rover in roverAgents)
            {
                rover.ResetVehicle();
            }
        }
    }

    default_settings: null
    behaviors:
      RoverAgent:
        trainer_type: poca
        hyperparameters:
          batch_size: 4096
          buffer_size: 81920
          learning_rate: 0.00004
          beta: 0.005
          epsilon: 0.12
          lambd: 0.95
          num_epoch: 5
          learning_rate_schedule: constant
          beta_schedule: constant
          epsilon_schedule: constant
        network_settings:
          normalize: true
          hidden_units: 512
          num_layers: 3
          vis_encode_type: nature_cnn
          memory: null
          goal_conditioning_type: hyper
          deterministic: false
        reward_signals:
          extrinsic:
            gamma: 0.99
            strength: 0.99
            network_settings:
              normalize: true
              hidden_units: 512
              num_layers: 3
              vis_encode_type: nature_cnn
              memory: null
              goal_conditioning_type: hyper
              deterministic: false
        behavioral_cloning:
          strength: 0.2
          demo_path: Demos/City2.demo
        init_path: null
        keep_checkpoints: 10
        checkpoint_interval: 100000
        max_steps: 50000000
        time_horizon: 1000
        summary_freq: 100000
        threaded: false
        self_play:
          save_steps: 100000
          team_change: 400000
          swap_steps: 50000
          window: 10
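
    And for completeness, here is a simplified sketch of the agent side. The real RoverAgent class has more in it (sensors, vehicle physics, checkpoint handling), but these are the parts relevant to self-play: the team ID is read from the BehaviorParameters component (0 on one agent, 1 on the other), MaxStep is 0, and the two continuous actions are throttle and steering. The property and method names other than the ML-Agents overrides are just mine.

    Code (CSharp):
    using Unity.MLAgents;
    using Unity.MLAgents.Actuators;
    using Unity.MLAgents.Policies;
    using UnityEngine;

    // Simplified sketch of the agent class referenced by EnviromentController.
    public class RoverAgent : Agent
    {
        // Team ID comes from the BehaviorParameters component; one agent is
        // set to 0 and the other to 1 in the Inspector.
        public int TeamID => GetComponent<BehaviorParameters>().TeamId;

        // Current speed, used by the controller for the speed reward.
        public float Speed { get; private set; }

        public override void Initialize()
        {
            // MaxStep = 0: episode length is handled by EnviromentController.
            MaxStep = 0;
        }

        public override void OnActionReceived(ActionBuffers actions)
        {
            float throttle = actions.ContinuousActions[0]; // (-1, 1)
            float steering = actions.ContinuousActions[1]; // (-1, 1)
            // ... apply throttle and steering to the vehicle, update Speed ...
        }

        public void ResetVehicle()
        {
            // ... move the vehicle back to its spawn point and zero its velocity ...
        }
    }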