
Resolved Optimizations for HashSet<>

Discussion in 'Scripting' started by Zalosath, Apr 20, 2022.

  1. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    Hi,

    I've got a Dictionary that maps word length to a HashSet of DatabaseItems (each holding the word and some other values). The dictionary holds about 87,000 entries in total, split across roughly 15 HashSets.

    Here's my class:
    Code (CSharp):
    public class Database
    {
        public Dictionary<int, HashSet<DatabaseItem>> databases;

        private HashSet<DatabaseItem> usedItems = new HashSet<DatabaseItem>();
        private Dictionary<(int, int), HashSet<DatabaseItem>> cachedDatabases = new Dictionary<(int, int), HashSet<DatabaseItem>>();

        public void InitializeDatabase()
        {
            databases = new Dictionary<int, HashSet<DatabaseItem>>();

            Object wordFile = Resources.Load("wordoutput");
            foreach (string line in wordFile.ToString().Split("\n"))
            {
                string[] split = line.Split("~");
                if (split.Length < 2)
                    continue; // skip blank/malformed lines (e.g. a trailing newline)
                string word = split[0];
                string clue = split[1];

                if (!databases.TryGetValue(word.Length, out HashSet<DatabaseItem> set))
                {
                    set = new HashSet<DatabaseItem>();
                    databases.Add(word.Length, set);
                }
                set.Add(new DatabaseItem(clue, word));
            }
        }

        public DatabaseItem GetItem(string answer)
        {
            if (!databases.TryGetValue(answer.Length, out HashSet<DatabaseItem> set))
                return null;

            foreach (DatabaseItem item in set)
            {
                if (item.answer == answer)
                {
                    return item;
                }
            }
            return null;
        }

        public DatabaseItem GetRandomItem(int maxLength, int minLength = 0)
        {
            HashSet<DatabaseItem> newList = GetHashSetBetween(minLength, maxLength);

            if (newList.Count == 0)
                return null;

            return GetRandomDatabaseItem(newList);
        }

        public HashSet<DatabaseItem> GetShuffledItemsWithSharedCharacter(string c, int maxLength, int minLength = 0)
        {
            // copy, so RemoveWhere doesn't mutate the cached set
            HashSet<DatabaseItem> newList = new HashSet<DatabaseItem>(GetHashSetBetween(minLength, maxLength));

            newList.RemoveWhere(e => !e.answer.Contains(c) || usedItems.Contains(e));

            return newList.OrderBy(x => Random.Range(0f, 1f)).ToHashSet();
        }

        public HashSet<DatabaseItem> GetHashSetBetween(int minLength, int maxLength)
        {
            if (cachedDatabases.TryGetValue((minLength, maxLength), out HashSet<DatabaseItem> cached))
            {
                return cached;
            }

            HashSet<DatabaseItem> newList = new HashSet<DatabaseItem>();
            for (int i = minLength; i <= maxLength; i++)
            {
                // not every length between min and max necessarily has words
                if (databases.TryGetValue(i, out HashSet<DatabaseItem> database))
                {
                    newList.UnionWith(database);
                }
            }
            cachedDatabases.Add((minLength, maxLength), newList);
            return newList;
        }

        public DatabaseItem GetRandomDatabaseItem(HashSet<DatabaseItem> list)
        {
            int weightTotal = 0;
            foreach (DatabaseItem item in list)
            {
                weightTotal += item.weight;
            }

            int total = 0;
            float randVal = Random.Range(0f, 1f) * weightTotal;
            foreach (DatabaseItem item in list)
            {
                total += item.weight;
                if (total > randVal)
                    return item;
            }
            return list.ElementAt(list.Count - 1);
        }

        public void AddUsedItem(DatabaseItem item)
        {
            usedItems.Add(item);
        }
    }
    Loading in is fairly speedy, taking no longer than ~5 seconds, and it's a one-time cost, so that's not so bad.
    The other functions, though, can take considerably longer.

    Before the Linq police come and tell me not to use Linq: I don't notice a considerable difference in execution time with or without the Linq statements, just as an FYI. Though if you do have any thoughts on the Linq-vs-write-it-yourself stance, feel free to leave a comment about it.

    The issue is that my overall start-up takes about 15 minutes (a long time!) and memory is usually maxed out (fluctuating between 22 and 24 GB of usage).

    The reason for the long start-up is that I'm calling the various functions many, many times (easily 100+) on the initial frame of the game. I could spread this execution across multiple frames, but I'm not confident it'd help; everything would still take ~15 minutes in total.

    In the above class I've implemented a sort of cache that stores ranges of data that have been previously accessed. That is almost certainly why the memory is maxed out, but without it the execution time rises to ~25 minutes, so it's definitely an improvement.

    Any ideas on how to 1. cut the memory usage, and 2. speed up the execution?

    If you need more info just ask.

    Thanks.

    EDIT: As a sort of update, I managed to get the execution time down to ~20 seconds, but at the cost of loading only 10% of the words. I'm making a crossword game, so this isn't really ideal, but it works. Still looking for advice on this, though.
     
    Last edited: Apr 20, 2022
  2. DragonCoder

    Joined:
    Jul 3, 2015
    Posts:
    1,459
    1. Can't you live with pregenerating a couple thousand crosswords, saving only some IDs, and delivering those to the player?

    2. Why exactly are you using HashSets and not Lists? The only thing a HashSet does is ensure that there are no duplicates; it doesn't help with anything else.
    Are you using them solely to detect which words you've already tried? Then why not put everything in a single list and hold another list of "use_indices_in_main_list" instead of the HashSet?

    3. If you can squeeze this into native data structures and thus use them in Burst-compiled jobs, you'll probably gain some speed as well. Though I believe your algorithm itself can probably be vastly improved; unfortunately, it's hard to understand it exactly from such uncommented code.

    I'd suggest applying the rubber-duck method: explain your algorithm in detail to a rubber duck, or write it down in words.
    That often helps you realize wasted potential or spot flaws.
     
  3. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    1. Yes, of course, if this were going to be deployed somewhere, but it's mostly just a fun side project; that's not really the point of it.
    2. HashSets appear to be considerably faster than Lists for this use case.
    3. Probably, haven't looked into those much.

    The rubber duck isn't much help here since everything that's happening needs to happen. I tried anyway, but not much came out of it.
    I'm mostly looking for optimizations to the existing code, or a way to restructure it so it optimizes better.

    The problem is mostly gone when using only ~10-20% of the list; start-up is down to about 20 seconds.
     
  4. nijnstein

    Joined:
    Feb 6, 2021
    Posts:
    78
    well, to start: replace the body of GetRandomDatabaseItem with return list.ElementAt(Random.Range(0, list.Count)); (the int overload of Random.Range excludes the max), as the rest doesn't seem to do much useful besides shifting the random distribution a bit, which you can do a lot more cheaply: Random.Range(0f, 1f) * 0.7f + Random.Range(0f, 1f) * 0.3f will shift your distribution


    considering memory use/other performance measures:

    1> drop all the lists; load one array of strings, sorted on length and, within the same length, sorted on character
    2> create one list of indices: store the index of the item where the length changes, to quickly get the borders of a range of words with the same length
    3> replace the functions you use with array iterators that work within those borders and copy nothing

    this should not take anything measurable in multiple seconds, even if the sorting is not done before loading the dataset
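    roughly like this (untested sketch; DatabaseItem is your existing type, and the array is assumed pre-sorted by answer length, ascending):
    Code (CSharp):
    using System.Collections.Generic;

    public class FlatDatabase
    {
        public DatabaseItem[] items; // 1: one flat array, sorted by answer length
        private Dictionary<int, int> firstIndexOfLength = new Dictionary<int, int>(); // 2: borders

        public void BuildBorders()
        {
            firstIndexOfLength.Clear();
            for (int i = 0; i < items.Length; i++)
            {
                int len = items[i].answer.Length;
                if (!firstIndexOfLength.ContainsKey(len))
                    firstIndexOfLength.Add(len, i); // first index where this length starts
            }
        }

        // 3: iterate a length range in place, copying nothing
        public void ForEachBetween(int minLength, int maxLength, System.Action<DatabaseItem> action)
        {
            int start = firstIndexOfLength.TryGetValue(minLength, out int s) ? s : 0;
            int end = firstIndexOfLength.TryGetValue(maxLength + 1, out int e) ? e : items.Length;
            for (int i = start; i < end; i++)
                action(items[i]);
        }
    }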
     
    Last edited: Apr 20, 2022
  5. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    The point of the distribution is to prioritise longer words; your distribution suggestion won't achieve that unless the list is sorted and contains an equal amount of each word length.
     
  6. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    As per your update, this could work quite well. I'll give this a go now, thanks.
     
  7. Gladyon

    Joined:
    Sep 10, 2015
    Posts:
    389
    Have you tried profiling?
    You can even use Visual Studio profiler by extracting your algorithm out of Unity, as it's not using any Unity-related stuff as far as I can see.

    By profiling, I guess it may turn out that using foreach to go over all the elements of a HashSet is a bit unoptimized; Lists are a lot faster for that kind of iteration. Best of all is to avoid the full pass altogether by remembering the total weight and updating it only when necessary (add/remove/modify), as sketched below.
    As for the Linq instructions, they are slower, but that's usually not the worst part; their biggest problem is the amount of garbage they generate, which increases GC pressure and the frequency of GC runs (which may be less relevant now that Unity uses an incremental GC...).
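    Something like this (an untested sketch reusing your DatabaseItem type) is what I mean by remembering the total weight:
    Code (CSharp):
    using System.Collections.Generic;

    public class WeightedSet
    {
        private List<DatabaseItem> items = new List<DatabaseItem>();
        private int totalWeight; // updated on add/remove only, never recomputed per draw

        public void Add(DatabaseItem item)
        {
            items.Add(item);
            totalWeight += item.weight;
        }

        public bool Remove(DatabaseItem item)
        {
            if (!items.Remove(item))
                return false;
            totalWeight -= item.weight;
            return true;
        }

        public DatabaseItem GetRandom()
        {
            float randVal = UnityEngine.Random.Range(0f, 1f) * totalWeight;
            int running = 0;
            foreach (DatabaseItem item in items) // List iteration, no HashSet overhead
            {
                running += item.weight;
                if (running > randVal)
                    return item;
            }
            return items.Count > 0 ? items[items.Count - 1] : null;
        }
    }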

    Could you provide a full working project with data?
    That way we could run the profiler and all have the same basis for providing more help; currently all we can do is make suppositions.
    For example, I have no idea what processing you're doing on the data, nor what kind of data it is (are the texts only words, or are they full books?).


    A few months ago I had to work with a list of texts (in-game English texts and their translations in the currently chosen language).
    I tested it with half a million texts, meaning I had to load half a million English texts (the originals), half a million translated texts, and half a million English texts the translations were based on (to compare against the originals and know whether a translation had to be updated).
    I had to do a bunch of things on all these texts, such as finding all the HTML codes and some specific things like {0} etc., and checking that they are consistent between the English and translated texts.

    All in all, it takes about 5 s to load or save all the data, and about 150-200 ms to apply a new filter (a few booleans and a regex over all texts).
    So I don't understand how it can take 15 minutes to process only 87,000 entries.
    Your loading time is about what I'd expect (I optimized mine by modifying the C# loading code and using multi-threading; all in all I divided the loading time by ten).
    But I would have expected no more than 50 ms for any processing you might have to do on the texts, and if your data is only words (say about 5-8 characters long), I wouldn't have expected more than 5 ms.

    That's why I'd like to see the whole project, including the data: I need to find out what kind of processing you're doing, because any guess on my part would probably be badly wrong.
     
    DragonCoder likes this.
  8. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    Thanks for the detailed response.

    I booted up the profiler late yesterday and, funnily enough, Debug.Log was taking up about half of the run time.

    I also switched to using arrays as per the previous suggestion, and that seems to have cut the time down to 3 minutes. Removing Debug.Log gives a final time of ~1 minute.

    I have a few more array optimisations I'd like to make, so I'll implement those and get back shortly.
    ToArray calls seem to take up a hefty portion of the time too, so I'm looking into that.

    If I can't get it any lower then I'll come back and post the project files.
     
  9. DragonCoder

    Joined:
    Jul 3, 2015
    Posts:
    1,459
    ToArray does a full (shallow) copy of all the data, because the array points to an independent data structure afterwards. So ideally it should be avoided where possible.
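    In other words (hypothetical example, GetCandidates is just a stand-in for wherever your list comes from): prefer iterating the existing collection over materializing a copy.
    Code (CSharp):
    List<DatabaseItem> list = GetCandidates(); // hypothetical source of candidates

    // Copies every element reference into a brand-new array first:
    DatabaseItem[] copy = list.ToArray();

    // Reads the same elements with no copy at all:
    foreach (DatabaseItem item in list)
    {
        // use item
    }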
     
  10. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    Down to 4 seconds total for everything :)
    New class is posted below, new and improved with comments ;)
    Code (CSharp):
    public class Database
    {
        #region Variables
        // Holds all DatabaseItems
        public DatabaseItem[] database;
        // Holds the boundaries between the word lengths, e.g. [5, 158] means that words of length 5 start at index 158
        private Dictionary<int, int> lengthBoundaries = new Dictionary<int, int>();
        // Holds used indexes; since crosswords contain 50 words tops, it's okay to use a list here
        private List<int> usedIndexes = new List<int>();
        #endregion

        public void InitializeDatabase()
        {
            // Open word file; note that this file IS SORTED in ascending order of word length
            // and is assumed to contain only well-formed "word~clue" lines
            Object wordFile = Resources.Load("wordoutputShort");
            // Split up line by line
            string[] lines = wordFile.ToString().Split("\n");
            // Initialize the database
            database = new DatabaseItem[lines.Length];
            // Keep track of current word length
            int currentWordLength = 0;
            // Loop the entries of lines
            for (int i = 0; i < lines.Length; i++)
            {
                // Decode
                string[] split = lines[i].Split("~");
                string word = split[0];
                string clue = split[1];
                // Update currentWordLength if the current length doesn't match
                if (word.Length != currentWordLength)
                {
                    // Add new length to word boundaries
                    lengthBoundaries.Add(word.Length, i);
                    currentWordLength = word.Length;
                }
                // Add the new item to the database
                database[i] = new DatabaseItem(clue, word);
            }
        }

        public DatabaseItem GetItem(string answer)
        {
            // Retrieve the index of an item by name
            int index = GetIndexOfItem(answer);
            // If it's not found then return null
            if (index == -1)
                return null;
            // Return the DatabaseItem at the given index
            return database[index];
        }

        private int GetIndexOfItem(string answer)
        {
            // If there are no words of this length at all, bail out early
            if (!lengthBoundaries.TryGetValue(answer.Length, out int start))
                return -1;
            // Loop the database from the length boundary of the answer
            // Since this loop returns as soon as it finds the word, it's safe to loop to database.Length
            // But just in case the word can't be found, we cancel the loop once the current
            // word is longer than the word we're looking for
            for (int i = start; i < database.Length; i++)
            {
                DatabaseItem item = database[i];
                if (item.answer.Length > answer.Length)
                    break;
                if (item.answer == answer)
                {
                    return i;
                }
            }
            return -1;
        }

        public DatabaseItem GetRandomItem(int maxLength, int minLength = 0)
        {
            // Get a list of indexes between minLength and maxLength
            List<int> indexes = GetIndexesBetween(minLength, maxLength);
            // If the list is empty we can return null straight away
            if (indexes.Count == 0)
                return null;
            // Return a random item from the candidates
            return GetRandomDatabaseItem(indexes);
        }

        public List<int> GetShuffledItemsWithSharedCharacter(string c, int maxLength, int minLength = 0)
        {
            // Get a list of indexes between minLength and maxLength
            List<int> indexes = GetIndexesBetween(minLength, maxLength);
            // Filter the list to words that contain the search character
            List<int> filteredIndexes = indexes.Where(i => database[i].answer.Contains(c)).ToList();
            // Shuffle the index list
            filteredIndexes = filteredIndexes.OrderBy(e => Random.Range(0f, 1f)).ToList();
            // Return the shuffled indexes
            return filteredIndexes;
        }

        public List<int> GetIndexesBetween(int minLength, int maxLength)
        {
            // Filter any bad data, just in case: swap min with max if min > max
            if (minLength > maxLength)
            {
                (minLength, maxLength) = (maxLength, minLength);
            }
            // Establish min and max boundary based on the lengthBoundaries
            int minBoundary = 0;
            if (lengthBoundaries.ContainsKey(minLength))
            {
                minBoundary = lengthBoundaries[minLength];
            }
            int maxBoundary = database.Length;
            if (lengthBoundaries.ContainsKey(maxLength + 1))
            {
                maxBoundary = lengthBoundaries[maxLength + 1];
            }
            // Add all indexes between min and max boundary and remove used indexes
            List<int> indexes = new List<int>();
            indexes.AddRange(Enumerable.Range(minBoundary, maxBoundary - minBoundary));
            indexes.RemoveAll(e => usedIndexes.Contains(e));
            // Return the new list
            return indexes;
        }

        public DatabaseItem GetRandomDatabaseItem(List<int> indexes)
        {
            // Calculate totalWeight; could be optimized since weights are constant per word length
            // Could be easier to simply pass in some boundaries and calculate differences between them etc.
            // This works fine for now though, almost no issues caused by this
            int weightTotal = 0;
            foreach (int i in indexes)
            {
                weightTotal += database[i].weight;
            }
            // Pick a random value based on the given weights
            int total = 0;
            float randVal = Random.Range(0f, 1f) * weightTotal;
            foreach (int i in indexes)
            {
                DatabaseItem item = database[i];
                total += item.weight;
                if (total > randVal)
                    return item;
            }
            // Fallback for floating-point edge cases: return the last candidate
            return database[indexes[indexes.Count - 1]];
        }

        public void AddUsedItem(DatabaseItem item)
        {
            // Get the index of the given item
            int i = GetIndexOfItem(item.answer);
            // Set it as used if found and not already marked
            if (i != -1 && !usedIndexes.Contains(i))
                usedIndexes.Add(i);
        }
    }
    It's absolutely worth noting that the file I read from is pre-sorted by word length in ascending order, which the boundary lookups rely on. That removes an additional sort, which is good!

    Yes, you're absolutely right. I tried to replace as many as I could; there are still a few left, but they don't appear to make a considerable difference in the runtime.
     
    Last edited: Apr 21, 2022
    DragonCoder likes this.
  11. DragonCoder

    Joined:
    Jul 3, 2015
    Posts:
    1,459
    What a success story :D
    Congrats!

    And a good idea to add comments. You know what they say about uncommented code: when you write it, only you and God know its purpose; in six months, only God knows :p
     
    Zalosath likes this.
  12. Gladyon

    Joined:
    Sep 10, 2015
    Posts:
    389
    Nice work!

    If you use the profiler on 'InitializeDatabase()', I think it will show you that the String.Split() method is very slow.
    If you can modify the data file, you can have the first line contain a word, the second the corresponding clue, the third the next word, the fourth its clue, and so on.
    That way you won't have to split. I can't be sure without actually testing and profiling, but my guess is that it would improve things.

    That said, don't spend time on it if the profiler says String.Split() isn't a problem.
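    For illustration, loading that format could look like this (untested; DatabaseItem is your existing type):
    Code (CSharp):
    using System.Collections.Generic;
    using UnityEngine;

    public static class WordLoader
    {
        // Two lines per entry: the word, then its clue. No per-entry Split("~") needed.
        public static List<DatabaseItem> LoadPairs(string resourceName)
        {
            TextAsset wordFile = Resources.Load<TextAsset>(resourceName);
            string[] lines = wordFile.text.Split('\n');
            var items = new List<DatabaseItem>(lines.Length / 2);
            for (int i = 0; i + 1 < lines.Length; i += 2)
            {
                string word = lines[i].TrimEnd('\r');
                string clue = lines[i + 1].TrimEnd('\r');
                items.Add(new DatabaseItem(clue, word));
            }
            return items;
        }
    }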


    As for Debug.Log(), yeah, that thing is a nightmare.
    One of the first things I did in Unity was to code a new logging system, one that doesn't generate any garbage and is a lot faster.
    Now I can log 20 times more entries per second without impacting the game.
    So: avoid Debug.Log() at all costs, only add it temporarily, or put #if clauses around the calls so you can remove them easily.
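    For example, a minimal wrapper (untested; ENABLE_LOGS is whatever define name you choose) whose calls the compiler strips entirely, arguments included, when the symbol isn't defined:
    Code (CSharp):
    public static class Log
    {
        // When ENABLE_LOGS is not defined, every call to Log.Info (including its
        // argument expression) is removed by the compiler at the call site.
        [System.Diagnostics.Conditional("ENABLE_LOGS")]
        public static void Info(string message)
        {
            UnityEngine.Debug.Log(message);
        }
    }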
     
  13. DragonCoder

    Joined:
    Jul 3, 2015
    Posts:
    1,459
    Huh, have you shared that somewhere?
    In C# even the StringBuilder generates garbage.
     
  14. Gladyon

    Joined:
    Sep 10, 2015
    Posts:
    389
    No, it's part of the tools I use; it's currently about 30k lines of code and I'm still actively working on it.
    Honestly, the tools are completely tied to what I'm doing and the way I think. I made them for my own use, so there are many limitations which don't bother me but would make them unusable for anybody else (not counting that there's no user manual...).

    That's true, but you can front-load most of that garbage at startup by pre-allocating StringBuilders with a given capacity.
    The biggest problem is the StringBuilder.ToString() method.
    It used to be possible to make it garbage-free, back when StringBuilder was internally based on a string.
    Some years ago Unity changed its Mono version, and it's not possible anymore.
    So you have to use nasty tricks to avoid StringBuilder.ToString() calls at all costs.

    That said, the thing generating most of the garbage in Debug.Log() is the float-to-string conversion.
    Most of what my logging system does is offer garbage-free conversion of standard types to strings.
    That, and lock-free multi-threading.
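    The pre-allocation part looks roughly like this (untested sketch; not thread-safe as written, and the MessageBuilder/Build names are just for illustration):
    Code (CSharp):
    using System.Text;

    public static class MessageBuilder
    {
        // One pre-sized builder, reused for every message instead of allocating anew.
        private static readonly StringBuilder sb = new StringBuilder(1024);

        public static string Build(string label, int value)
        {
            sb.Clear(); // keeps the existing internal buffer
            sb.Append(label).Append(' ').Append(value);
            return sb.ToString(); // this final string is the allocation that's hard to avoid
        }
    }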
     
  15. Peeling

    Joined:
    Nov 10, 2013
    Posts:
    401
    Sounds like you're down to a reasonable timescale, but I have an alternative suggestion that might speed things up significantly if you feel like trying it:
    • Create an array of 26 empty List<int>
    • As you read in your dictionary, scan the letters of each answer and add the index of that entry to the end of the list corresponding to each character (A=0, Z=25)
    • Avoid entering the same index multiple times by checking that the last entry of the list isn't 'i' before adding 'i'
    Now, whenever you want a list of words that share a character, you already have one. And because you added them to the lists in the order they were read from the file, those lists of indices are already pre-sorted by length.

    So to get a list between min/max length, you can go straight to the list for the shared character and construct a sub-list from the indices that fall between the boundaries for those lengths.

    EDIT: Also, instead of adding items to a 'used' list, try allocating an array of [maxIndex] ints (or bytes, or bits if you can be bothered) and setting each one when the corresponding index is consumed.
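    Roughly like this (untested; assumes upper-case A-Z answers and your length-sorted database array):
    Code (CSharp):
    using System.Collections.Generic;

    public class LetterIndex
    {
        private List<int>[] byLetter = new List<int>[26];

        public void Build(DatabaseItem[] database)
        {
            for (int l = 0; l < 26; l++)
                byLetter[l] = new List<int>();

            for (int i = 0; i < database.Length; i++)
            {
                foreach (char ch in database[i].answer)
                {
                    List<int> list = byLetter[ch - 'A'];
                    // Skip duplicates: a word with repeated letters would otherwise
                    // be added to the same list more than once.
                    if (list.Count == 0 || list[list.Count - 1] != i)
                        list.Add(i);
                }
            }
        }

        // Already pre-sorted by word length, because entries were added in file order.
        public List<int> WordsContaining(char c)
        {
            return byLetter[c - 'A'];
        }
    }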
     
    Last edited: Apr 21, 2022
  16. Zalosath

    Joined:
    Sep 13, 2014
    Posts:
    671
    Cool idea. I guess there are a million and one ways to do this nicely, and they're all bound to have pros and cons. I'll stick with my current implementation as I'm pretty happy with how it turned out; thanks for your suggestion though :)