Search Unity

DrawMeshInstancedIndirect Z Sort

Discussion in 'General Graphics' started by GilCat, Sep 27, 2019.

  1. GilCat

    GilCat

    Joined:
    Sep 21, 2013
    Posts:
    676
    I'm having problems rendering objects in the correct Z order. I'm using DrawMeshInstancedIndirect and it seems like the order in which those quads are rendered depends on the order in which the data is supplied to the compute buffer.
    However, if I change ZWrite to On, the order is no longer the one from the ComputeBuffer and I lose my transparency.
    I was wondering what the best option is to sort the objects (based on distance to the camera) to obtain the correct behavior, or if there is something else that I can do regarding the shader ZWrite. It seems to me that doing that sort on the CPU can be too heavy and will not scale well, so is using ComputeBuffers the best choice?

    Here is a screen of what is happening:
    upload_2019-9-27_0-21-12.png

    Thanks in advance and sorry for some basic shader questions that might be here!

    Also here is my shader.

    Shader "Sprites/Instanced"
    {
    // Material-level values. They feed the vertex shader directly when USE_COMPUTE is
    // not defined; with USE_COMPUTE the per-instance StructuredBuffers replace
    // _BaseColor, _Cutoff, _TileOffset, _Scale and _Pivot.
    Properties
    {
    // Tint multiplied with the _MainTex sample.
    _BaseColor("Main Color", Color) = (1,1,1,1)
    // Sprite texture (or atlas) sampled in the fragment shader.
    _MainTex("Texture", 2D) = "white" {}
    // Mask texture: its blue and alpha channels gate the output alpha in frag().
    // NOTE(review): "transparent" does not look like one of Unity's documented built-in
    // default texture names - confirm which texture is actually used as the fallback.
    _TransitionTex("Transition Texture", 2D) = "transparent" {}
    // Threshold compared against the transition texture's blue channel.
    _Cutoff("Cutoff", Range(0, 1)) = 1
    // xy scales the mesh UVs, zw offsets them (selects a tile of _MainTex).
    _TileOffset("Tile Offset", Vector) = (1,1,0,0)
    // Per-axis scale applied to the mesh vertices; declared as float3 in the CG code.
    // NOTE(review): Vector properties are four-component, only three values are given
    // here and for _Pivot - confirm how the ShaderLab parser fills the missing one.
    _Scale("Mesh Scale", Vector) = (1,1,1)
    // Offset added to the vertex before scaling; only x and y are used (float2 in CG).
    _Pivot("Pivot", Vector) = (0,0,0)
    }
    SubShader
    {
    // Drawn in the Transparent queue and tagged as a transparent render type.
    Tags { "Queue" = "Transparent" "IgnoreProjector" = "True" "RenderType" = "Transparent" }
    LOD 100
    // Standard (non-premultiplied) alpha blending.
    Blend SrcAlpha OneMinusSrcAlpha
    // Depth is not written, so overlapping instances are blended in the order they are
    // drawn rather than being resolved per pixel by the depth buffer.
    ZWrite Off
    Pass
    {
    CGPROGRAM
    #pragma vertex vert
    #pragma fragment frag
    #pragma multi_compile_local __ USE_COMPUTE
    #pragma multi_compile_instancing
    #pragma instancing_options procedural:setup
    #include "UnityCG.cginc"

    // Hook named by "#pragma instancing_options procedural:setup". It is intentionally
    // empty: vert() reads the per-instance data from the StructuredBuffers itself.
    void setup() {}

    // Per-vertex input taken from the mesh.
    struct appdata
    {
    // Object-space vertex position.
    float4 vertex : POSITION;
    // First UV channel of the mesh.
    float2 uv : TEXCOORD0;
    };

    // Values passed from vert() to frag().
    struct v2f
    {
    // UV used to sample _MainTex (tile scale/offset already applied).
    float2 uv1 : TEXCOORD0;
    // UV used to sample _TransitionTex.
    // NOTE(review): fixed2 may be a low-precision type on some platforms - confirm it is
    // precise enough for texture coordinates on the targets you ship.
    fixed2 uv2 : TEXCOORD1;
    // Clip-space position.
    float4 vertex : SV_POSITION;
    // Per-instance (or material) tint.
    fixed4 color : COLOR0;
    // Per-instance (or material) cutoff threshold.
    // NOTE(review): CUTOFF is a custom semantic rather than a TEXCOORDn slot - confirm
    // every target graphics API / shader compiler accepts it.
    float cutoff : CUTOFF;
    };

    // Decodes a 16-bit half-precision value stored in the low 16 bits of x into a
    // 32-bit float. Bit 15 is the sign, bits 10-14 the exponent, bits 0-9 the mantissa;
    // anything above bit 15 is ignored.
    float f16tof32(uint x)
    {
        // The half exponent field (0x7c00) once it has been moved to float bit positions.
        const uint halfExpMask = (0x7c00 << 13);

        // Exponent and mantissa shifted into their float positions (sign handled last).
        uint bits = (x & 0x7fff) << 13;
        uint expField = bits & halfExpMask;

        // Re-bias the exponent from half (15) to float (127).
        bits += (127 - 15) << 23;

        if (expField == halfExpMask)
        {
            // Inf / NaN: push the exponent up to all ones.
            bits += (128u - 16u) << 23;
        }
        else if (expField == 0)
        {
            // Zero / subnormal: bump the exponent by one and subtract 2^-14 as a float
            // so the value is renormalised.
            bits = asuint(asfloat(bits + (1 << 23)) - 6.10351563e-05f);
        }

        // Move the sign bit from bit 15 to bit 31.
        bits |= (x & 0x8000) << 16;
        return asfloat(bits);
    }

    // Unpacks the two halfs stored in one 32-bit word:
    // low 16 bits -> x, high 16 bits -> y.
    float2 uintToFloat2(uint input) {
        uint lowHalf = input & 0x0000FFFF;
        uint highHalf = input >> 16;
        return float2(f16tof32(lowHalf), f16tof32(highHalf));
    }

    // Unpacks four halfs stored in two 32-bit words:
    // input.x holds (x, y) and input.y holds (z, w), low half first in each word.
    float4 uint2ToFloat4(uint2 input) {
        return float4(uintToFloat2(input.x), uintToFloat2(input.y));
    }

    // Unpacks a half4x4 stored as eight 32-bit words (two halfs per word) into a float4x4.
    // Each cN below is four decoded halfs; c0..c3 become the columns of the result.
    // NOTE(review): the word-to-column pairing (c2 from the .y of rows 0-1, c1 from the .x
    // of rows 2-3) is kept exactly as written - presumably it mirrors the CPU-side packing;
    // confirm against the code that fills localToWorldBuffer.
    float4x4 uint4x2ToFloat4x4(uint4x2 input) {
        // The words are raw bit patterns, so they must stay integers. Building a float2
        // here would convert each 32-bit word to float and back, and a float only keeps
        // 24 significant bits, which corrupts the low bits of the packed halfs.
        float4 c0 = uint2ToFloat4(uint2(input[0].x, input[1].x));
        float4 c2 = uint2ToFloat4(uint2(input[0].y, input[1].y));
        float4 c1 = uint2ToFloat4(uint2(input[2].x, input[3].x));
        float4 c3 = uint2ToFloat4(uint2(input[2].y, input[3].y));
        return float4x4(
            c0.x, c1.x, c2.x, c3.x,
            c0.y, c1.y, c2.y, c3.y,
            c0.z, c1.z, c2.z, c3.z,
            c0.w, c1.w, c2.w, c3.w);
    }

    // Textures and their tiling/offset vectors (the _ST values are consumed by TRANSFORM_TEX).
    sampler2D _MainTex;
    float4 _MainTex_ST;
    float4 _TransitionTex_ST;
    sampler2D _TransitionTex;

    // Material values used by vert() when USE_COMPUTE is not defined.
    float4 _TileOffset;
    float3 _Scale;
    float2 _Pivot;
    fixed4 _BaseColor;
    float _Cutoff;

    // Per-instance data used by vert() when USE_COMPUTE is defined, indexed by SV_InstanceID.
    // Each uint is 32 bits and holds packed 16-bit half values (two per word); the halfs
    // are decoded with f16tof32 / uintToFloat2 / uint2ToFloat4 / uint4x2ToFloat4x4.
    StructuredBuffer<uint4x2> localToWorldBuffer; // half4x4
    StructuredBuffer<uint2> tileOffsetBuffer; // half4
    StructuredBuffer<uint2> scaleBuffer; // half4
    StructuredBuffer<uint> pivotBuffer; // half2
    StructuredBuffer<uint2> colorBuffer; // half4
    StructuredBuffer<uint> cutoffBuffer; // half

    // Vertex stage.
    // With USE_COMPUTE the per-instance scale, pivot, tile/offset, transform, colour and
    // cutoff are read from the StructuredBuffers at index instanceID and decoded from
    // packed halfs; otherwise the material properties and an identity transform are used.
    // The vertex is offset by the pivot, scaled, transformed by localToWorld and then
    // passed through UnityObjectToClipPos.
    v2f vert(appdata v, uint instanceID : SV_InstanceID)
    {
        v2f o;
    #ifdef USE_COMPUTE
        // scaleBuffer stores a half4; only xyz is used.
        float3 scale = uint2ToFloat4(scaleBuffer[instanceID]).xyz;
        float2 pivot = uintToFloat2(pivotBuffer[instanceID]);
        float4 tileOffset = uint2ToFloat4(tileOffsetBuffer[instanceID]);
        float4x4 localToWorld = uint4x2ToFloat4x4(localToWorldBuffer[instanceID]);
        float4 color = uint2ToFloat4(colorBuffer[instanceID]);
        // cutoffBuffer holds a half bit pattern, so it has to be decoded like the other
        // buffers. Assigning the uint directly would convert it numerically (a stored
        // 1.0 = 0x3C00 would read as 15360.0) and the cutoff test in frag() would
        // always pass.
        float cutoff = f16tof32(cutoffBuffer[instanceID]);
    #else
        float3 scale = _Scale;
        float2 pivot = _Pivot;
        float4 tileOffset = _TileOffset;
        float4x4 localToWorld = float4x4(
            1, 0, 0, 0,
            0, 1, 0, 0,
            0, 0, 1, 0,
            0, 0, 0, 1);
        float4 color = _BaseColor;
        float cutoff = _Cutoff;
    #endif
        float4x4 scaleMatrix = float4x4(
            scale.x, 0, 0, 0,
            0, scale.y, 0, 0,
            0, 0, scale.z, 0,
            0, 0, 0, 1);
        // Scale both the vertex and the pivot offset; the pivot has w = 0 so it acts as
        // a pure displacement.
        float4 localVertexPos = mul(scaleMatrix, v.vertex) + mul(scaleMatrix, float4(pivot.x, pivot.y, 0, 0));
        float4 localTranslated = mul(localToWorld, localVertexPos);
        o.vertex = UnityObjectToClipPos(localTranslated);
        // Select the tile first (xy = scale, zw = offset), then apply _MainTex_ST.
        o.uv1 = TRANSFORM_TEX(v.uv * tileOffset.xy + tileOffset.zw, _MainTex);
        o.uv2 = TRANSFORM_TEX(v.uv, _TransitionTex);
        o.color = color;
        o.cutoff = cutoff;
        return o;
    }

    // Fragment stage: tinted sprite colour whose alpha is gated by the transition mask.
    // The alpha is kept when the mask's blue channel is <= the threshold and zeroed
    // otherwise; the threshold is the larger of the cutoff and 1-if-mask-alpha-is-below-1.
    fixed4 frag(v2f i) : SV_Target
    {
        fixed4 tinted = tex2D(_MainTex, i.uv1) * i.color;
        float4 mask = tex2D(_TransitionTex, i.uv2);

        // (mask.a < 1) contributes 1 or 0, so any non-opaque mask texel forces the
        // threshold to at least 1.
        fixed threshold = max(mask.a < 1, i.cutoff);

        tinted.a = step(mask.b, threshold) * tinted.a;
        return tinted;
    }
    ENDCG
    }
    }
    }
     
  2. richardkettlewell

    richardkettlewell

    Unity Technologies

    Joined:
    Sep 9, 2015
    Posts:
    2,285
    Yes that's correct, assuming you render sequentially from within that ComputeBuffer (i.e. your vertex shader for rendering pulls data out of the ComputeBuffer in linear order)

    That means it's down to you to sort the data yourself, which on the GPU is a complex topic. Depending on how many items you are wanting to sort, you could look into GPU Bitonic Sorting - there should be some stuff on Google that you can use as a starting point, maybe even written for Unity. If you only have a small(ish) number of items to sort, you might even be able to do the sorting in a single Compute dispatch.

    Alternatively, if you could "get away with" enabling Z write and Alpha Test the edges (aka Cutout render mode, using "clip" in the pixel shader) you would get hardware per-pixel sorting, but you would need MSAA to get any kind of soft edge on the Sprites.
     
    Last edited: Sep 27, 2019
    GilCat likes this.
  3. GilCat

    GilCat

    Joined:
    Sep 21, 2013
    Posts:
    676
    Thanks for your answer.

    Yes that came out in my research about this. I just wanted to know if i was in the right direction.

    How is it that the default renderer does it with sprites without the need for enabling Z write and Alpha Test? Is the sorting done somewhere deeper inside the rendering pipeline?

    Thanks
     
  4. richardkettlewell

    richardkettlewell

    Unity Technologies

    Joined:
    Sep 9, 2015
    Posts:
    2,285
    The default renderer knows all the object positions, and they are stored on the CPU, so we can perform a sort based on the Transparency Sort Mode defined on the Camera (usually distance to camera), and, once sorted, submit a draw call for each object in the sorted order. Sorting on the CPU is generally much easier than on the GPU :)

    But in your case, the object positions are in a ComputeBuffer, which is on the GPU. You're bypassing all the default Unity stuff by taking this path (presumably in order to get better performance).
     
    MINORLIFE and GilCat like this.
  5. BOBchasing

    BOBchasing

    Joined:
    Mar 3, 2020
    Posts:
    5
    I have the same problem. Did you find a solution later?
     
  6. sewy

    sewy

    Joined:
    Oct 11, 2015
    Posts:
    150
    I'd like to know as well @GilCat .