Search Unity

  1. Unity 2019.2 is now released.
    Dismiss Notice

Compute Shader is failing to work on anything but DX12!

Discussion in 'Shaders' started by Matt_De_Boss_Developer, Aug 10, 2019.

  1. Matt_De_Boss_Developer

    Matt_De_Boss_Developer

    Joined:
    Oct 17, 2014
    Posts:
    36
    So at the current moment I have 2 different Compute Shaders with 4 different kernels.

    The kernel execution order goes like this:


    GenerateDensityField -----> FindActiveEdges -----> prescan -----> prescan -----> addblocksums


    GenerateDensityField: Basically, populate a Compute Buffer of "voxels" with a "material ID" and "value"

    FindActiveEdges: Use this filled ComputeBuffer of "voxels", and then populate another ComputeBuffer that is 3x the size of the voxel buffer. It populates this larger ComputeBuffer with either a "0" or "1" based on whether that edge is active (edge voxel has 3 UNIQUE edges to it that connect it to other voxels)

    prescan: First step of the prefix sum scan (blelloch), also populate the block sums

    prescan: Second step of the prefix sum scan (blelloch), this finds the prefix sum scan of the block sums

    addblocksums: Final step of the prefix sum scan (blelloch), this adds the block sum values to each portion of the scan that was done in the first step

    So basically, a lot of compute shaders rely on the fact that the one before it finishes its work.

    Now to the issue

    1. The prefix sum scan ComputeShader doesn't work for Vulkan or DX11 platforms, only DX12!
    Is there something that I am missing here?
    Here is all the code below:

    HOST CODE:

    Code (CSharp):
    1. using System.Collections;
    2. using System.Collections.Generic;
    3. using UnityEngine;
    4.  
    5. public class DensityFieldTest : MonoBehaviour
    6. {
    7.     // Start is called before the first frame update
    8.     public ComputeShader densityFieldShader;
    9.     public ComputeShader ScanShader;
    10.     public ComputeShader SetToOne;
    11.  
    12.     ComputeBuffer densityBuffer;
    13.     ComputeBuffer activeEdgesBuffer;
    14.     ComputeBuffer activeEdgesIndices;
    15.  
    16.  
    17.  
    18.     ComputeBuffer sumsBuffer;
    19.     ComputeBuffer auxBuffer;
    20.     ComputeBuffer scanOutputBuffer;
    21.  
    22.     int DensityKernelIndex;
    23.     int ActiveEdgeKernelIndex;
    24.  
    25.     int blellochScanKernel;
    26.     int sumsKernel;
    27.  
    28.     public int SizeX;
    29.     public int SizeY;
    30.     public int SizeZ;
    31.  
    32.     struct Voxel
    33.     {
    34.         public int material;
    35.         public float value;
    36.     };
    37.  
    38.  
    39.     int[] testArray;
    40.     int[] testArray2;
    41.  
    42.     void Start()
    43.     {
    44.  
    45.         SetupComputeBuffers();
    46.         SetupDensityFieldShader();
    47.         SetupScanShader();
    48.  
    49.         testArray = new int[(SizeX * SizeY * SizeZ * 3)];
    50.         testArray2 = new int[(SizeX * SizeY * SizeZ * 3)];
    51.  
    52.  
    53.  
    54.         densityFieldShader.Dispatch(DensityKernelIndex, (SizeX + 1) / 1, (SizeY + 1) / 1, (SizeZ + 1) / 1);
    55.         densityFieldShader.Dispatch(ActiveEdgeKernelIndex, SizeX / 4, SizeY / 4, SizeZ / 4);
    56.  
    57.  
    58.  
    59.      
    60.  
    61.         ScanShader.Dispatch(blellochScanKernel, (SizeX * SizeY * SizeZ * 3) / 1024, 1, 1);
    62.  
    63.         ScanShader.SetBuffer(blellochScanKernel, "d_in", sumsBuffer);
    64.         ScanShader.SetBuffer(blellochScanKernel, "d_out", sumsBuffer);
    65.         ScanShader.SetBuffer(sumsKernel, "d_block_sums", auxBuffer);
    66.         ScanShader.SetInt("len", (SizeX * SizeY * SizeZ * 3)/1024);
    67.  
    68.         ScanShader.Dispatch(blellochScanKernel, 1, 1, 1);
    69.  
    70.  
    71.  
    72.         ScanShader.SetInt("len", (SizeX * SizeY * SizeZ * 3));
    73.  
    74.         ScanShader.SetBuffer(sumsKernel, "d_in", scanOutputBuffer);
    75.         ScanShader.SetBuffer(sumsKernel, "d_out", scanOutputBuffer);
    76.         ScanShader.SetBuffer(sumsKernel, "d_block_sums", sumsBuffer);
    77.  
    78.  
    79.  
    80.         ScanShader.Dispatch(sumsKernel, (SizeX * SizeY * SizeZ * 3) / 1024, 1, 1);
    81.  
    82.  
    83.         scanOutputBuffer.GetData(testArray);
    84.         activeEdgesBuffer.GetData(testArray2);
    85.  
    86.  
    87.  
    88.  
    89.  
    90.         int counter = 0;
    91.         for(int i = 0; i < testArray2.Length; i++)
    92.         {
    93.             if(testArray2[i] == 1)
    94.             {
    95.                 counter++;
    96.             }
    97.         }
    98.  
    99.  
    100.         Debug.Log(counter);
    101.         Debug.Log(testArray[(SizeX * SizeY * SizeZ * 3) - 1]);
    102.  
    103.  
    104.  
    105.  
    106.  
    107.     }
    108.  
    109.     // Update is called once per frame
    110.     void Update()
    111.     {
    112.  
    113.         /*densityFieldShader.Dispatch(DensityKernelIndex, (SizeX + 1) / 1, (SizeY + 1) / 1, (SizeZ + 1) / 1);
    114.         densityFieldShader.Dispatch(ActiveEdgeKernelIndex, SizeX / 4, SizeY / 4, SizeZ / 4);
    115.  
    116.  
    117.  
    118.  
    119.  
    120.         ScanShader.Dispatch(blellochScanKernel, (SizeX * SizeY * SizeZ * 3) / 1024, 1, 1);
    121.  
    122.         ScanShader.SetBuffer(blellochScanKernel, "d_in", sumsBuffer);
    123.         ScanShader.SetBuffer(blellochScanKernel, "d_out", sumsBuffer);
    124.         ScanShader.SetBuffer(sumsKernel, "d_block_sums", auxBuffer);
    125.         ScanShader.SetInt("len", (SizeX * SizeY * SizeZ * 3) / 1024);
    126.  
    127.         ScanShader.Dispatch(blellochScanKernel, 1, 1, 1);
    128.  
    129.  
    130.  
    131.         ScanShader.SetInt("len", (SizeX * SizeY * SizeZ * 3));
    132.  
    133.         ScanShader.SetBuffer(sumsKernel, "d_in", scanOutputBuffer);
    134.         ScanShader.SetBuffer(sumsKernel, "d_out", scanOutputBuffer);
    135.         ScanShader.SetBuffer(sumsKernel, "d_block_sums", sumsBuffer);
    136.  
    137.  
    138.  
    139.         ScanShader.Dispatch(sumsKernel, (SizeX * SizeY * SizeZ * 3) / 1024, 1, 1);*/
    140.  
    141.  
    142.     }
    143.  
    144.  
    145.  
    146.  
    147.  
    148.  
    149.  
    150.  
    151.  
    152.     void SetupComputeBuffers()
    153.     {
    154.  
    155.         densityBuffer = new ComputeBuffer((SizeX + 1) * (SizeY + 1) * (SizeZ + 1), 8);
    156.         activeEdgesBuffer = new ComputeBuffer(SizeX * SizeY * SizeZ * 3, sizeof(int));
    157.         activeEdgesIndices = new ComputeBuffer(SizeX * SizeY * SizeZ * 3, sizeof(int));
    158.  
    159.         sumsBuffer = new ComputeBuffer((SizeX * SizeY * SizeZ * 3)/1024, sizeof(int));
    160.         scanOutputBuffer = new ComputeBuffer(SizeX * SizeY * SizeZ * 3, sizeof(int));
    161.         auxBuffer = new ComputeBuffer((SizeX * SizeY * SizeZ * 3) / 1024, sizeof(int));
    162.  
    163.     }
    164.  
    165.  
    166.     void SetupDensityFieldShader()
    167.     {
    168.  
    169.  
    170.         DensityKernelIndex = densityFieldShader.FindKernel("GenerateDensityField");
    171.         ActiveEdgeKernelIndex = densityFieldShader.FindKernel("FindActiveEdges");
    172.  
    173.         densityFieldShader.SetInt("SizeX", SizeX + 1);
    174.         densityFieldShader.SetInt("SizeY", SizeY + 1);
    175.         densityFieldShader.SetBuffer(DensityKernelIndex, "DensityField", densityBuffer);
    176.  
    177.         densityFieldShader.SetBuffer(ActiveEdgeKernelIndex, "DensityField", densityBuffer);
    178.         densityFieldShader.SetBuffer(ActiveEdgeKernelIndex, "activeEdgeOccupancy", activeEdgesBuffer);
    179.         densityFieldShader.SetBuffer(ActiveEdgeKernelIndex, "activeEdgeIndices", activeEdgesIndices);
    180.     }
    181.  
    182.     void SetupScanShader()
    183.     {
    184.  
    185.         blellochScanKernel = ScanShader.FindKernel("prescan");
    186.         sumsKernel = ScanShader.FindKernel("addblocksums");
    187.  
    188.         ScanShader.SetInt("len", SizeX * SizeY * SizeZ * 3);
    189.         ScanShader.SetBuffer(blellochScanKernel, "d_in", activeEdgesBuffer);
    190.         ScanShader.SetBuffer(blellochScanKernel, "d_block_sums", sumsBuffer);
    191.         ScanShader.SetBuffer(blellochScanKernel, "d_out", auxBuffer);
    192.  
    193.         ScanShader.SetBuffer(sumsKernel, "d_block_sums", sumsBuffer);
    194.  
    195.     }
    196.  
    197.  
    198.  
    199.  
    200. }
    201.  

    THE DENSITY FIELD AND ACTIVE EDGES COMPUTE SHADER:

    Code (CSharp):
    1. #pragma kernel GenerateDensityField
    2. #pragma kernel FindActiveEdges
    3.  
    4.  
    5.  
    6. float3 mod289(float3 x)
    7. {
    8.     return x - floor(x / 289.0) * 289.0;
    9. }
    10.  
    11. float4 mod289(float4 x)
    12. {
    13.     return x - floor(x / 289.0) * 289.0;
    14. }
    15.  
    16. float4 permute(float4 x)
    17. {
    18.     return mod289((x * 34.0 + 1.0) * x);
    19. }
    20.  
    21. float4 taylorInvSqrt(float4 r)
    22. {
    23.     return 1.79284291400159 - r * 0.85373472095314;
    24. }
    25.  
    26. float snoise(float3 v)
    27. {
    28.     const float2 C = float2(1.0 / 6.0, 1.0 / 3.0);
    29.  
    30.     // First corner
    31.     float3 i = floor(v + dot(v, C.yyy));
    32.     float3 x0 = v - i + dot(i, C.xxx);
    33.  
    34.     // Other corners
    35.     float3 g = step(x0.yzx, x0.xyz);
    36.     float3 l = 1.0 - g;
    37.     float3 i1 = min(g.xyz, l.zxy);
    38.     float3 i2 = max(g.xyz, l.zxy);
    39.  
    40.     // x1 = x0 - i1  + 1.0 * C.xxx;
    41.     // x2 = x0 - i2  + 2.0 * C.xxx;
    42.     // x3 = x0 - 1.0 + 3.0 * C.xxx;
    43.     float3 x1 = x0 - i1 + C.xxx;
    44.     float3 x2 = x0 - i2 + C.yyy;
    45.     float3 x3 = x0 - 0.5;
    46.  
    47.     // Permutations
    48.     i = mod289(i); // Avoid truncation effects in permutation
    49.     float4 p =
    50.         permute(permute(permute(i.z + float4(0.0, i1.z, i2.z, 1.0))
    51.             + i.y + float4(0.0, i1.y, i2.y, 1.0))
    52.             + i.x + float4(0.0, i1.x, i2.x, 1.0));
    53.  
    54.     // Gradients: 7x7 points over a square, mapped onto an octahedron.
    55.     // The ring size 17*17 = 289 is close to a multiple of 49 (49*6 = 294)
    56.     float4 j = p - 49.0 * floor(p / 49.0);  // mod(p,7*7)
    57.  
    58.     float4 x_ = floor(j / 7.0);
    59.     float4 y_ = floor(j - 7.0 * x_);  // mod(j,N)
    60.  
    61.     float4 x = (x_ * 2.0 + 0.5) / 7.0 - 1.0;
    62.     float4 y = (y_ * 2.0 + 0.5) / 7.0 - 1.0;
    63.  
    64.     float4 h = 1.0 - abs(x) - abs(y);
    65.  
    66.     float4 b0 = float4(x.xy, y.xy);
    67.     float4 b1 = float4(x.zw, y.zw);
    68.  
    69.     //float4 s0 = float4(lessThan(b0, 0.0)) * 2.0 - 1.0;
    70.     //float4 s1 = float4(lessThan(b1, 0.0)) * 2.0 - 1.0;
    71.     float4 s0 = floor(b0) * 2.0 + 1.0;
    72.     float4 s1 = floor(b1) * 2.0 + 1.0;
    73.     float4 sh = -step(h, 0.0);
    74.  
    75.     float4 a0 = b0.xzyw + s0.xzyw * sh.xxyy;
    76.     float4 a1 = b1.xzyw + s1.xzyw * sh.zzww;
    77.  
    78.     float3 g0 = float3(a0.xy, h.x);
    79.     float3 g1 = float3(a0.zw, h.y);
    80.     float3 g2 = float3(a1.xy, h.z);
    81.     float3 g3 = float3(a1.zw, h.w);
    82.  
    83.     // Normalise gradients
    84.     float4 norm = taylorInvSqrt(float4(dot(g0, g0), dot(g1, g1), dot(g2, g2), dot(g3, g3)));
    85.     g0 *= norm.x;
    86.     g1 *= norm.y;
    87.     g2 *= norm.z;
    88.     g3 *= norm.w;
    89.  
    90.     // Mix final noise value
    91.     float4 m = max(0.6 - float4(dot(x0, x0), dot(x1, x1), dot(x2, x2), dot(x3, x3)), 0.0);
    92.     m = m * m;
    93.     m = m * m;
    94.  
    95.     float4 px = float4(dot(x0, g0), dot(x1, g1), dot(x2, g2), dot(x3, g3));
    96.     return 42.0 * dot(m, px);
    97. }
    98.  
    99. float4 snoise_grad(float3 v)
    100. {
    101.     const float2 C = float2(1.0 / 6.0, 1.0 / 3.0);
    102.  
    103.     // First corner
    104.     float3 i = floor(v + dot(v, C.yyy));
    105.     float3 x0 = v - i + dot(i, C.xxx);
    106.  
    107.     // Other corners
    108.     float3 g = step(x0.yzx, x0.xyz);
    109.     float3 l = 1.0 - g;
    110.     float3 i1 = min(g.xyz, l.zxy);
    111.     float3 i2 = max(g.xyz, l.zxy);
    112.  
    113.     // x1 = x0 - i1  + 1.0 * C.xxx;
    114.     // x2 = x0 - i2  + 2.0 * C.xxx;
    115.     // x3 = x0 - 1.0 + 3.0 * C.xxx;
    116.     float3 x1 = x0 - i1 + C.xxx;
    117.     float3 x2 = x0 - i2 + C.yyy;
    118.     float3 x3 = x0 - 0.5;
    119.  
    120.     // Permutations
    121.     i = mod289(i); // Avoid truncation effects in permutation
    122.     float4 p =
    123.         permute(permute(permute(i.z + float4(0.0, i1.z, i2.z, 1.0))
    124.             + i.y + float4(0.0, i1.y, i2.y, 1.0))
    125.             + i.x + float4(0.0, i1.x, i2.x, 1.0));
    126.  
    127.     // Gradients: 7x7 points over a square, mapped onto an octahedron.
    128.     // The ring size 17*17 = 289 is close to a multiple of 49 (49*6 = 294)
    129.     float4 j = p - 49.0 * floor(p / 49.0);  // mod(p,7*7)
    130.  
    131.     float4 x_ = floor(j / 7.0);
    132.     float4 y_ = floor(j - 7.0 * x_);  // mod(j,N)
    133.  
    134.     float4 x = (x_ * 2.0 + 0.5) / 7.0 - 1.0;
    135.     float4 y = (y_ * 2.0 + 0.5) / 7.0 - 1.0;
    136.  
    137.     float4 h = 1.0 - abs(x) - abs(y);
    138.  
    139.     float4 b0 = float4(x.xy, y.xy);
    140.     float4 b1 = float4(x.zw, y.zw);
    141.  
    142.     //float4 s0 = float4(lessThan(b0, 0.0)) * 2.0 - 1.0;
    143.     //float4 s1 = float4(lessThan(b1, 0.0)) * 2.0 - 1.0;
    144.     float4 s0 = floor(b0) * 2.0 + 1.0;
    145.     float4 s1 = floor(b1) * 2.0 + 1.0;
    146.     float4 sh = -step(h, 0.0);
    147.  
    148.     float4 a0 = b0.xzyw + s0.xzyw * sh.xxyy;
    149.     float4 a1 = b1.xzyw + s1.xzyw * sh.zzww;
    150.  
    151.     float3 g0 = float3(a0.xy, h.x);
    152.     float3 g1 = float3(a0.zw, h.y);
    153.     float3 g2 = float3(a1.xy, h.z);
    154.     float3 g3 = float3(a1.zw, h.w);
    155.  
    156.     // Normalise gradients
    157.     float4 norm = taylorInvSqrt(float4(dot(g0, g0), dot(g1, g1), dot(g2, g2), dot(g3, g3)));
    158.     g0 *= norm.x;
    159.     g1 *= norm.y;
    160.     g2 *= norm.z;
    161.     g3 *= norm.w;
    162.  
    163.     // Compute noise and gradient at P
    164.     float4 m = max(0.6 - float4(dot(x0, x0), dot(x1, x1), dot(x2, x2), dot(x3, x3)), 0.0);
    165.     float4 m2 = m * m;
    166.     float4 m3 = m2 * m;
    167.     float4 m4 = m2 * m2;
    168.     float3 grad =
    169.         -6.0 * m3.x * x0 * dot(x0, g0) + m4.x * g0 +
    170.         -6.0 * m3.y * x1 * dot(x1, g1) + m4.y * g1 +
    171.         -6.0 * m3.z * x2 * dot(x2, g2) + m4.z * g2 +
    172.         -6.0 * m3.w * x3 * dot(x3, g3) + m4.w * g3;
    173.     float4 px = float4(dot(x0, g0), dot(x1, g1), dot(x2, g2), dot(x3, g3));
    174.     return 42.0 * float4(grad, dot(m4, px));
    175. }
    176.  
    177.  
    178. struct Voxel {
    179.     int material;
    180.     float value;
    181. };
    182.  
    183.  
    184. RWStructuredBuffer<Voxel> DensityField;
    185. int SizeX;
    186. int SizeY;
    187.  
    188. [numthreads(1,1,1)]
    189. void GenerateDensityField(uint3 id : SV_DispatchThreadID)
    190. {
    191.  
    192.     float3 p = float3(id.x, id.y, id.z);
    193.     float density = p.x * p.x + p.y * p.y + p.z * p.z - 36;//snoise((p / 16) + float3(0.1f, 0.1f, 0.1f));
    194.     DensityField[id.x + id.y * SizeX + id.z * SizeX * SizeY].value = density;
    195.     DensityField[id.x + id.y * SizeX + id.z * SizeX * SizeY].material = density < 0.f ? 0 : 1;
    196.  
    197.  
    198. }
    199.  
    200.  
    201.  
    202. RWStructuredBuffer<int> activeEdgeOccupancy;
    203. RWStructuredBuffer<int> activeEdgeIndices;
    204.  
    205. [numthreads(4,4,4)]
    206. void FindActiveEdges(uint3 id : SV_DispatchThreadID) {
    207.  
    208.     // *
    209.     // |   *
    210.     // |  /
    211.     // | /
    212.     // |/
    213.     // *----------*
    214.     // ^
    215.     // |
    216.     // Compute Shader index
    217.     //
    218.     //3 edges per voxel
    219.  
    220.  
    221.     const int CORNER_MATERIALS[4] =
    222.     {
    223.         DensityField[id.x + id.y * SizeX + id.z * SizeX * SizeY].material,
    224.         DensityField[(id.x + 1) + id.y * SizeX + id.z * SizeX * SizeY].material,
    225.         DensityField[id.x + (id.y + 1) * SizeX + id.z * SizeX * SizeY].material,
    226.         DensityField[id.x + id.y * SizeX + (id.z + 1) * SizeX * SizeY].material
    227.     };
    228.  
    229.     const int hermiteIndex = id.x | (id.y << 7) | (id.z << (7 * 2));
    230.  
    231.  
    232.  
    233.     for (int i = 0; i < 3; i++) {
    234.  
    235.         const int e = 1 + i;
    236.         const int signChange =
    237.             ((CORNER_MATERIALS[0] != 0 && CORNER_MATERIALS[e] == 0) ||
    238.             (CORNER_MATERIALS[0] == 0 && CORNER_MATERIALS[e] != 0)) ? 1 : 0;
    239.  
    240.         activeEdgeOccupancy[(id.x + id.y * SizeX + id.z * SizeX * SizeY * 3) + i] = signChange;
    241.         activeEdgeIndices[(id.x + id.y * SizeX + id.z * SizeX * SizeY * 3) + i] = signChange ? ((hermiteIndex << 2) | i) : -1;
    242.  
    243.     }
    244.  
    245.  
    246.  
    247. }
    THE PREFIX SUM SCAN COMPUTE SHADER:

    Code (CSharp):
    1. #pragma kernel addblocksums
    2. #pragma kernel prescan
    3.  
    4.  
    5. #define MAX_BLOCK_SZ 1024
    6. #define NUM_BANKS 32
    7. #define LOG_NUM_BANKS 5
    8.  
    9.  
    10. groupshared uint s_out[MAX_BLOCK_SZ + ((MAX_BLOCK_SZ - 1) >> LOG_NUM_BANKS)];
    11.  
    12. uint len;
    13.  
    14. RWStructuredBuffer<uint> d_in;
    15. RWStructuredBuffer<uint> d_block_sums;
    16. RWStructuredBuffer<uint> d_out;
    17.  
    18.  
    19.  
    20.  
    21. [numthreads(512, 1, 1)]
    22. void addblocksums(uint3 threadIdx : SV_GroupThreadID, uint3 glbl_tid : SV_DispatchThreadID, uint3 blockIdx : SV_GroupID) {
    23.  
    24.  
    25.     unsigned int d_block_sum_val = d_block_sums[blockIdx.x];
    26.  
    27.  
    28.     unsigned int cpy_idx = 2 * blockIdx.x * 512 + threadIdx.x;
    29.     if (cpy_idx < len)
    30.     {
    31.         d_out[cpy_idx] = d_in[cpy_idx] + d_block_sum_val;
    32.         if (cpy_idx + 512 < len)
    33.             d_out[cpy_idx + 512] = d_in[cpy_idx + 512] + d_block_sum_val;
    34.     }
    35.  
    36. };
    37.  
    38.  
    39.  
    40.  
    41. int CONFLICT_FREE_OFFSET(int n) {
    42.     return ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS));
    43. }
    44.  
    45.  
    46. [numthreads(512, 1, 1)]
    47. void prescan(uint3 threadIdx : SV_GroupThreadID, uint3 glbl_tid : SV_DispatchThreadID, uint3 blockIdx : SV_GroupID) {
    48.  
    49.     int thid = threadIdx.x;
    50.     int ai = thid;
    51.     int bi = thid + 512;
    52.  
    53.     s_out[thid] = 0;
    54.     s_out[thid + 512] = 0;
    55.  
    56.     if (thid + MAX_BLOCK_SZ < MAX_BLOCK_SZ + ((MAX_BLOCK_SZ - 1) >> LOG_NUM_BANKS))
    57.         s_out[thid + MAX_BLOCK_SZ] = 0;
    58.  
    59.     AllMemoryBarrierWithGroupSync();
    60.  
    61.  
    62.     unsigned int cpy_idx = MAX_BLOCK_SZ * blockIdx.x + threadIdx.x;
    63.     if (cpy_idx < len)
    64.     {
    65.         s_out[ai + CONFLICT_FREE_OFFSET(ai)] = d_in[cpy_idx];
    66.         if (cpy_idx + 512 < len)
    67.             s_out[bi + CONFLICT_FREE_OFFSET(bi)] = d_in[cpy_idx + 512];
    68.     }
    69.  
    70.     //up
    71.     int offset = 1;
    72.     for (int d = MAX_BLOCK_SZ >> 1; d > 0; d >>= 1)
    73.     {
    74.         AllMemoryBarrierWithGroupSync();
    75.  
    76.         if (thid < d)
    77.         {
    78.             int ai = offset * ((thid << 1) + 1) - 1;
    79.             int bi = offset * ((thid << 1) + 2) - 1;
    80.             ai += CONFLICT_FREE_OFFSET(ai);
    81.             bi += CONFLICT_FREE_OFFSET(bi);
    82.  
    83.             s_out[bi] += s_out[ai];
    84.         }
    85.         offset <<= 1;
    86.     }
    87.  
    88.  
    89.     if (thid == 0)
    90.     {
    91.         d_block_sums[blockIdx.x] = s_out[MAX_BLOCK_SZ - 1
    92.             + CONFLICT_FREE_OFFSET(MAX_BLOCK_SZ - 1)];
    93.         s_out[MAX_BLOCK_SZ - 1
    94.             + CONFLICT_FREE_OFFSET(MAX_BLOCK_SZ - 1)] = 0;
    95.     }
    96.  
    97.     //down
    98.     for (int d = 1; d < MAX_BLOCK_SZ; d <<= 1)
    99.     {
    100.         offset >>= 1;
    101.         AllMemoryBarrierWithGroupSync();
    102.  
    103.         if (thid < d)
    104.         {
    105.             int ai = offset * ((thid << 1) + 1) - 1;
    106.             int bi = offset * ((thid << 1) + 2) - 1;
    107.             ai += CONFLICT_FREE_OFFSET(ai);
    108.             bi += CONFLICT_FREE_OFFSET(bi);
    109.  
    110.             unsigned int temp = s_out[ai];
    111.             s_out[ai] = s_out[bi];
    112.             s_out[bi] += temp;
    113.         }
    114.     }
    115.     AllMemoryBarrierWithGroupSync();
    116.  
    117.  
    118.     if (cpy_idx < len)
    119.     {
    120.         d_out[cpy_idx] = s_out[ai + CONFLICT_FREE_OFFSET(ai)];
    121.         if (cpy_idx + 512 < len)
    122.             d_out[cpy_idx + 512] = s_out[bi + CONFLICT_FREE_OFFSET(bi)];
    123.     }
    124.  
    125.  
    126. }
     
    Last edited: Aug 11, 2019