Search Unity

  1. Unity 6 Preview is now available. To find out what's new, have a look at our Unity 6 Preview blog post.
    Dismiss Notice
  2. Unity is excited to announce that we will be collaborating with TheXPlace for a summer game jam from June 13 - June 19. Learn more.
    Dismiss Notice

Question Bitonic Sort - Compute shaders

Discussion in 'General Graphics' started by sewy, Jan 30, 2024.

  1. sewy

    sewy

    Joined:
    Oct 11, 2015
    Posts:
    150
    Hello,

    I am trying to implement Bitonic Sort for compute shaders based on camera distance. The reason is I am drawing quads rotated towards camera to hold by transparent gpu particles and I need to sort them before sending to DrawProceduralIndirect.

    I've found multiple examples on the internet and have it implemented, but I am battleling with weird problem that returns 0 for some values.

    This are the basic test files:
    BitonicSort.cs creates points on the Z axis, dispatch sorter, (on demand =>) gets back the data and then prints the sorted Z value of the points.
    BitonicSort.compute sorts the points based on distance to the _CameraPos (which is the BitonicSort.cs transform for testing purposes).
    But here comes the problem - it seems like this shared_data[GI | j] and this shared_data[GI ^ j] always returns 0 and I dont know why

    upload_2024-1-30_10-56-48.png

    Blue are not sorted, rest is sorted.

    Code (CSharp):
    1. using UnityEngine;
    2.  
    3. public class BitonicSort : MonoBehaviour
    4. {
    5.     public ComputeShader BitonicSortShader;
    6.  
    7.     public int BUFFER_SIZE = 10;
    8.  
    9.     const uint BITONIC_BLOCK_SIZE = 512;
    10.     const uint TRANSPOSE_BLOCK_SIZE = 16;
    11.  
    12.     private int bitonicSortKernel;
    13.     private int matrixTransposeKernel;
    14.  
    15.     private ComputeBuffer _inBuffer;
    16.     private ComputeBuffer _tempBuffer;
    17.  
    18.     public int show = 0;
    19.     private Vector3[] data;
    20.  
    21.     public bool everyFrame = false;
    22.  
    23.     void Start()
    24.     {
    25.         Debug.Log("<color=lime>1 key :Sort, 2 key : Reset</color>");
    26.  
    27.         Init();
    28.     }
    29.  
    30.     void Update()
    31.     {
    32.         if (everyFrame)
    33.         {
    34.             GPUSort(_inBuffer, _tempBuffer);
    35.             //ShowValuesOnConsole(_inBuffer, "sorted : ");
    36.         }
    37.  
    38.         if (Input.GetKeyUp("1"))
    39.         {
    40.             // Sort
    41.             GPUSort(_inBuffer, _tempBuffer);
    42.             ShowValuesOnConsole(_inBuffer, "sorted : ");
    43.         }
    44.  
    45.         if (Input.GetKeyUp("2"))
    46.         {
    47.             // Reset
    48.             Reset();
    49.         }
    50.  
    51.         if (Input.GetKeyUp("0"))
    52.             show = Mathf.Clamp(show + 1, 0, data.Length - 1);
    53.  
    54.         if (Input.GetKeyUp("9"))
    55.             show = Mathf.Clamp(show - 1, 0, data.Length - 1);
    56.     }
    57.  
    58.     void Init()
    59.     {
    60.         _inBuffer = new ComputeBuffer(BUFFER_SIZE, sizeof(float) * 3);
    61.         _tempBuffer = new ComputeBuffer(BUFFER_SIZE, sizeof(float) * 3);
    62.  
    63.         bitonicSortKernel = BitonicSortShader.FindKernel("BitonicSort");
    64.         matrixTransposeKernel = BitonicSortShader.FindKernel("MatrixTranspose");
    65.  
    66.         Reset();
    67.     }
    68.  
    69.     void Reset()
    70.     {
    71.         Vector3[] data = new Vector3[BUFFER_SIZE];
    72.         for (var i = 0; i < data.Length; i++)
    73.         {
    74.             //data[i] = Random.insideUnitSphere * BUFFER_SIZE;
    75.             data[i] = new Vector3(0, 0, Random.Range(0, BUFFER_SIZE));
    76.         }
    77.  
    78.         _inBuffer.SetData(data);
    79.         _tempBuffer.SetData(data);
    80.  
    81.         ShowValuesOnConsole(_inBuffer, "No sort : ");
    82.     }
    83.  
    84.     private void OnDrawGizmos()
    85.     {
    86.         if (data != null && data.Length > 0)
    87.             Gizmos.DrawSphere(data[show], 1);
    88.     }
    89.  
    90.     void GPUSort(ComputeBuffer inBuffer, ComputeBuffer tempBuffer)
    91.     {
    92.         // Determine parameters.
    93.         uint NUM_ELEMENTS = (uint)BUFFER_SIZE;
    94.         uint MATRIX_WIDTH = BITONIC_BLOCK_SIZE;
    95.         uint MATRIX_HEIGHT = (uint)NUM_ELEMENTS / BITONIC_BLOCK_SIZE;
    96.  
    97.         BitonicSortShader.SetInt("_Width", (int)MATRIX_WIDTH);
    98.         BitonicSortShader.SetInt("_Height", (int)MATRIX_HEIGHT);
    99.         BitonicSortShader.SetVector("_CameraPos", this.transform.position);
    100.  
    101.         // Sort the data
    102.         // First sort the rows for the levels <= to the block size
    103.         for (uint level = 2; level <= BITONIC_BLOCK_SIZE; level = level * 2)
    104.         {
    105.             SetGPUSortConstants(BitonicSortShader, level, level);
    106.  
    107.             // Sort the row data
    108.             BitonicSortShader.SetBuffer(bitonicSortKernel, "Data", inBuffer);
    109.             BitonicSortShader.Dispatch(bitonicSortKernel, Mathf.Max(1, (int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE)), 1, 1);
    110.         }
    111.      
    112.         // Then sort the rows and columns for the levels > than the block size
    113.         // Transpose. Sort the Columns. Transpose. Sort the Rows.
    114.         for (uint level = BITONIC_BLOCK_SIZE * 2; level <= NUM_ELEMENTS; level = level * 2)
    115.         {
    116.             // Transpose the data from buffer 1 into buffer 2
    117.             SetGPUSortConstants(BitonicSortShader, (level / BITONIC_BLOCK_SIZE), (level & ~NUM_ELEMENTS) / BITONIC_BLOCK_SIZE);
    118.             BitonicSortShader.SetBuffer(matrixTransposeKernel, "Input", inBuffer);
    119.             BitonicSortShader.SetBuffer(matrixTransposeKernel, "Data", tempBuffer);
    120.             int threadGroupX = (int)(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE);
    121.             if (threadGroupX > 0)
    122.                 BitonicSortShader.Dispatch(matrixTransposeKernel, threadGroupX, Mathf.Max(1, (int)(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE)), 1);
    123.  
    124.             // Sort the transposed column data
    125.             BitonicSortShader.SetBuffer(bitonicSortKernel, "Data", tempBuffer);
    126.             BitonicSortShader.Dispatch(bitonicSortKernel, (int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);
    127.  
    128.             // Transpose the data from buffer 2 back into buffer 1
    129.             SetGPUSortConstants(BitonicSortShader, BITONIC_BLOCK_SIZE, level);
    130.             BitonicSortShader.SetBuffer(matrixTransposeKernel, "Input", tempBuffer);
    131.             BitonicSortShader.SetBuffer(matrixTransposeKernel, "Data", inBuffer);
    132.             threadGroupX = (int)(MATRIX_HEIGHT / TRANSPOSE_BLOCK_SIZE);
    133.             if (threadGroupX > 0)
    134.                 BitonicSortShader.Dispatch(matrixTransposeKernel, threadGroupX, Mathf.Max(1, (int)(MATRIX_WIDTH / TRANSPOSE_BLOCK_SIZE)), 1);
    135.  
    136.             // Sort the row data
    137.             BitonicSortShader.SetBuffer(bitonicSortKernel, "Data", inBuffer);
    138.             BitonicSortShader.Dispatch(bitonicSortKernel, (int)(NUM_ELEMENTS / BITONIC_BLOCK_SIZE), 1, 1);
    139.         }
    140.     }
    141.  
    142.     void SetGPUSortConstants(ComputeShader cs, uint level, uint levelMask)
    143.     {
    144.         cs.SetInt("_Level", (int)level);
    145.         cs.SetInt("_LevelMask", (int)levelMask);
    146.     }
    147.  
    148.     void ShowValuesOnConsole(ComputeBuffer buffer, string label)
    149.     {
    150.         if (buffer == null || buffer.count == 0) return;
    151.         var values = "";
    152.         data = new Vector3[buffer.count];
    153.         buffer.GetData(data);
    154.         for (var i = 0; i < data.Length; i++)
    155.         {
    156.             values += data[i].z + " ";
    157.         }
    158.         //Debug.Log(label + values);
    159.         Debug.Log(values);
    160.     }
    161.  
    162.     void OnDestroy()
    163.     {
    164.         if (_inBuffer != null)
    165.         {
    166.             _inBuffer.Release();
    167.         }
    168.         _inBuffer = null;
    169.  
    170.         if (_tempBuffer != null)
    171.         {
    172.             _tempBuffer.Release();
    173.         }
    174.         _tempBuffer = null;
    175.     }
    176. }

    Code (CSharp):
    1. // https://github.com/hiroakioishi/UnityGPUBitonicSort/blob/master/GPUBitonicSort/Assets/BitonicSortCS/BitonicSort.compute
    2. // https://github.com/ValentinKraft/UE4_SortingComputeShader/blob/master/ComputeShader/Shaders/Private/BitonicSortingKernelComputeShader.usf
    3.  
    4. #define BITONIC_BLOCK_SIZE 512
    5. #define TRANSPOSE_BLOCK_SIZE 16
    6. #define FLT_MAX 3.402823466e+38
    7.  
    8. cbuffer CB
    9. {
    10.     uint _Level;
    11.     uint _LevelMask;
    12.     uint _Width;
    13.     uint _Height;
    14.  
    15.     float3 _CameraPos;
    16. };
    17.  
    18. StructuredBuffer<float3> Input;
    19. RWStructuredBuffer<float3> Data;
    20.  
    21. // Bitonic Sort Compute Shader
    22. #pragma kernel BitonicSort
    23.  
    24. groupshared float3 shared_data[BITONIC_BLOCK_SIZE];
    25.  
    26. [numthreads(BITONIC_BLOCK_SIZE, 1, 1)]
    27. void BitonicSort(uint3 Gid  : SV_GroupID, // current group index (dispatched by c++)
    28.                  uint3 DTid : SV_DispatchThreadID, // "global" thread id
    29.                  uint3 GTid : SV_GroupThreadID, // current threadId in group / "local" threadId
    30.                  uint  GI   : SV_GroupIndex) // "flattened" index of a thread within a group
    31. {
    32.     // Load shared data
    33.     //shared_data[GI] = Data[DTid.y * BITONIC_BLOCK_SIZE + DTid.x];
    34.     shared_data[GI] = Data[DTid.x];
    35.     GroupMemoryBarrierWithGroupSync();
    36.  
    37.     // Sort the shared data - each thread must pick the min or max of the two elements it is comparing. The thread cannot compare and swap both elements because that would require random access writes.
    38.     for (uint j = _Level >> 1; j > 0; j >>= 1)
    39.     {
    40.         float3 pos1 = shared_data[GI & ~j];
    41.         float3 pos2 = shared_data[GI | j]; // this returns 0 => fail
    42.              
    43.         float dist1 = distance(pos1, _CameraPos);
    44.         float dist2 = distance(pos2, _CameraPos);
    45.  
    46.         // Ignore invalid (zero) values
    47.         //if (pos1.x == 0 && pos1.y == 0 && pos1.z == 0)
    48.         //    dist1 = -FLT_MAX;
    49.         //if (pos2.x == 0 && pos2.y == 0 && pos2.z == 0)
    50.         //    dist2 = -FLT_MAX;
    51.  
    52.         float3 result = ((dist1 >= dist2) == (bool) (_LevelMask & DTid.x)) ? shared_data[GI ^ j] : shared_data[GI]; // =>  this returns 0 => fail
    53.         //float3 result = shared_data[GI ^j]; // =>  this returns 0 => fail
    54.         //float3 result = shared_data[1]; // =>  this returns correct value
    55.         GroupMemoryBarrierWithGroupSync();
    56.  
    57.         shared_data[GI] = result;
    58.         GroupMemoryBarrierWithGroupSync();
    59.     }
    60.  
    61.     // Store shared data
    62.     Data[DTid.x] = shared_data[GI];
    63.     //Data[DTid.y * BITONIC_BLOCK_SIZE + DTid.x] = shared_data[GI];
    64. }
    65.  
    66. // Matrix Transpose Compute Shader
    67. #pragma kernel MatrixTranspose
    68.  
    69. groupshared float3 transpose_shared_data[TRANSPOSE_BLOCK_SIZE * TRANSPOSE_BLOCK_SIZE];
    70. [numthreads(TRANSPOSE_BLOCK_SIZE, TRANSPOSE_BLOCK_SIZE, 1)]
    71. void MatrixTranspose(uint3 Gid  : SV_GroupID,
    72.                      uint3 DTid : SV_DispatchThreadID,
    73.                      uint3 GTid : SV_GroupThreadID,
    74.                      uint  GI   : SV_GroupIndex)
    75. {
    76.     transpose_shared_data[GI] = Input[DTid.y * _Width + DTid.x];
    77.     GroupMemoryBarrierWithGroupSync();
    78.  
    79.     uint2 XY = DTid.yx - GTid.yx + GTid.xy;
    80.     Data[XY.y * _Height + XY.x] = transpose_shared_data[GTid.x * TRANSPOSE_BLOCK_SIZE + GTid.y];
    81. }
     
  2. FlaimBrad

    FlaimBrad

    Joined:
    May 6, 2023
    Posts:
    4
    I've also come across this issue, any chance you solved it?
     
  3. sewy

    sewy

    Joined:
    Oct 11, 2015
    Posts:
    150
  4. arkano22

    arkano22

    Joined:
    Sep 20, 2012
    Posts:
    1,967
    Bitonic sort only works on buffers whose length is a power of two.

    If you look at how the algorithm works it's quite obvious it will fail with buffers of any other size since it creates bitonic sequences starting at 2 numbers, then 4, then 8, etc. You're using BUFFER_SIZE = 10 which will break the algorithm.

    The usual workaround for this is to pad the buffers to the next power of two (eg. if you have 10 elements, use 16 as the buffer size, if you have 28 elements then create a buffer of length 32, and so on), then in the shader skip any comparisons with entries that are past the actual number of valid elements in the buffers.
     
    Last edited: Feb 3, 2024
    MaxEden, richardkettlewell and sewy like this.
  5. sewy

    sewy

    Joined:
    Oct 11, 2015
    Posts:
    150
    It is indeed, lol how I've missed that. Thank you!

    Wouldn't it cause problems when using GroupMemoryBarrierWithGroupSync()?

    If you know more about the algorythm, why is there in the sources
    float3 pos1 = shared_data[GI & ~j]; float3 pos2 = shared_data[GI | j];
    but then on assign there is
    ? shared_data[GI ^ j] : shared_data[GI]
    ? Shouldn't both (pos1 and one part of if, pos2 and second part of if) point to the same ID?