DrawMeshInstancedIndirect Z Sort

GilCat · Sep 27, 2019

I'm having problems rendering objects in the correct Z order. I'm using DrawMeshInstancedIndirect, and it seems that the order in which the quads are rendered depends on the order in which the data is supplied to the compute buffer.
However, if I change ZWrite to On, the render order no longer follows the ComputeBuffer order and I lose my transparency.
I was wondering what the best option is for sorting the objects (based on distance to the camera) to obtain the correct behavior, or whether there is something else I can do regarding the shader's ZWrite setting. It seems to me that doing the sort on the CPU would be too heavy and would not scale well, so is using ComputeBuffers the best choice?

Here is a screenshot of what is happening:

Thanks in advance and sorry for some basic shader questions that might be here!

Also here is my shader.

Shader "Sprites/Instanced"
{
// Material defaults. vert() reads _BaseColor, _Cutoff, _TileOffset, _Scale and _Pivot
// only when the USE_COMPUTE keyword is off; otherwise the per-instance buffers are used.
Properties
{
// Tint multiplied with the _MainTex sample in frag().
_BaseColor("Main Color", Color) = (1,1,1,1)
_MainTex("Texture", 2D) = "white" {}
// Mask texture: frag() reads its alpha and blue channels to decide which pixels stay visible.
// NOTE(review): "transparent" is not one of the default texture names listed in the Unity docs — confirm it resolves as intended.
_TransitionTex("Transition Texture", 2D) = "transparent" {}
_Cutoff("Cutoff", Range(0, 1)) = 1
// xy scales the mesh UVs, zw offsets them (see vert()).
_TileOffset("Tile Offset", Vector) = (1,1,0,0)
// Per-axis scale applied to the vertex position and the pivot.
// NOTE(review): Unity documents Vector defaults with four components; confirm these three-component defaults parse as expected.
_Scale("Mesh Scale", Vector) = (1,1,1)
// Only x and y are read by the shader (declared as float2 _Pivot).
_Pivot("Pivot", Vector) = (0,0,0)
}
SubShader
{
// Drawn in the Transparent queue with classic alpha blending.
Tags { "Queue" = "Transparent" "IgnoreProjector" = "True" "RenderType" = "Transparent" }
LOD 100
// Output = src.rgb * src.a + dst.rgb * (1 - src.a).
Blend SrcAlpha OneMinusSrcAlpha
// Depth writes are disabled, so overlapping instances are not resolved by the depth buffer;
// as discussed in this thread, the visible order then follows the order of the instance data.
ZWrite Off
Pass
{
CGPROGRAM
// Entry points.
#pragma vertex vert
#pragma fragment frag
// Two variants: one without keywords and one with USE_COMPUTE, which makes vert()
// read per-instance data from the structured buffers instead of material properties.
#pragma multi_compile_local __ USE_COMPUTE
#pragma multi_compile_instancing
// Names setup() below as the procedural instancing function.
#pragma instancing_options procedural:setup
#include "UnityCG.cginc"

// Procedural instancing hook named by the instancing_options pragma above.
// Intentionally empty: all per-instance data is fetched directly in vert().
void setup() {}

// Per-vertex mesh input consumed by vert().
struct appdata
{
// Object-space vertex position.
float4 vertex : POSITION;
// First UV channel; vert() remaps it with the tile/offset values.
float2 uv : TEXCOORD0;
};

// Interpolators passed from vert() to frag().
struct v2f
{
    // _MainTex coordinates (mesh UVs after tile/offset remapping).
    float2 uv1 : TEXCOORD0;
    // _TransitionTex coordinates. Kept at full precision: `fixed` has a very small
    // range and resolution on some GPUs, which is unsuitable for tiled UVs.
    float2 uv2 : TEXCOORD1;
    // Clip-space position.
    float4 vertex : SV_POSITION;
    // Per-instance (or material) tint.
    fixed4 color : COLOR0;
    // Per-instance (or material) cutoff. Uses a TEXCOORDn semantic instead of a
    // custom one so the interpolator is portable across graphics APIs.
    float cutoff : TEXCOORD2;
};

// Decodes an IEEE 754 half-precision value stored in the low 16 bits of `x` into a
// 32-bit float. Bits above bit 15 of `x` are ignored (they are masked out below).
// NOTE(review): this shares its name with the HLSL intrinsic f16tof32 — confirm which
// one the compiler resolves to on the targeted platforms.
float f16tof32(uint x)
{
// Half exponent mask (0x7c00) moved to where it sits after the 13-bit shift below.
const uint shifted_exp = (0x7c00 << 13);
// Drop the sign bit and align the half exponent/mantissa with the float layout.
uint uf = (x & 0x7fff) << 13;
// Isolated exponent bits, used to detect Inf/NaN (all ones) and zero/subnormal (all zeros).
uint e = uf & shifted_exp;
// Re-bias the exponent from half (15) to float (127).
uf += (127 - 15) << 23;
// Inf/NaN: add a further 112 to the exponent so it becomes 255 (all ones) in float.
// lerp with a bool weight acts as a select here.
uf += lerp(0, (128u - 16u) << 23, e == shifted_exp);
// Zero/subnormal: bump the exponent by one and subtract 2^-14 (6.10351563e-05) in
// float arithmetic, which renormalises the value; other inputs keep `uf` unchanged.
uf = lerp(uf, asuint(asfloat(uf + (1 << 23)) - 6.10351563e-05f), e == 0);
// Restore the sign bit (bit 15 of the half becomes bit 31 of the float).
uf |= (x & 0x8000) << 16;
return asfloat(uf);
}

// Unpacks two half floats from one 32-bit word:
// the low 16 bits become .x and the high 16 bits become .y.
float2 uintToFloat2(uint input)
{
    uint lowWord = input & 0x0000FFFF;
    uint highWord = input >> 16;
    return float2(f16tof32(lowWord), f16tof32(highWord));
}

// Unpacks four half floats from two 32-bit words:
// input.x supplies (x, y) and input.y supplies (z, w).
float4 uint2ToFloat4(uint2 input)
{
    return float4(uintToFloat2(input.x), uintToFloat2(input.y));
}

// Unpacks a 4x4 matrix of half floats stored in eight 32-bit words.
// Each pair of words is decoded into one float4 (c0..c3), and those four vectors
// become the columns of the returned matrix.
// The word-to-column mapping mirrors the original code; it must match however the
// CPU side packs the matrix, which is not visible in this shader.
float4x4 uint4x2ToFloat4x4(uint4x2 input) {
    // The words must stay integers: routing them through float2 would round each
    // 32-bit pattern to a 24-bit mantissa and corrupt the packed half values.
    float4 c0 = uint2ToFloat4(uint2(input[0].x, input[1].x));
    float4 c2 = uint2ToFloat4(uint2(input[0].y, input[1].y));
    float4 c1 = uint2ToFloat4(uint2(input[2].x, input[3].x));
    float4 c3 = uint2ToFloat4(uint2(input[2].y, input[3].y));
    return float4x4(
        c0.x, c1.x, c2.x, c3.x,
        c0.y, c1.y, c2.y, c3.y,
        c0.z, c1.z, c2.z, c3.z,
        c0.w, c1.w, c2.w, c3.w);
}

// Textures and their tiling/offset vectors (the *_ST names are what TRANSFORM_TEX expects).
sampler2D _MainTex;
float4 _MainTex_ST;
float4 _TransitionTex_ST;
sampler2D _TransitionTex;

// Material-level values, used by vert() only when USE_COMPUTE is not defined.
float4 _TileOffset;
float3 _Scale;
float2 _Pivot;
fixed4 _BaseColor;
float _Cutoff;

// Per-instance data, indexed by SV_InstanceID in vert() when USE_COMPUTE is defined.
// uint is 32 bit and is filled with half2 (2 * 16 bit)
StructuredBuffer<uint4x2> localToWorldBuffer; // half4x4
StructuredBuffer<uint2> tileOffsetBuffer; // half4
StructuredBuffer<uint2> scaleBuffer; // half4
StructuredBuffer<uint> pivotBuffer; // half2
StructuredBuffer<uint2> colorBuffer; // half4
StructuredBuffer<uint> cutoffBuffer; // half

// Vertex stage.
// Builds the instance's position from a scale, a pivot offset and a local-to-world
// matrix, remaps the UVs with the tile/offset vector, and forwards colour and cutoff.
// With USE_COMPUTE the per-instance values come from the structured buffers (packed
// half floats); otherwise they come from the material properties.
//   v          - mesh vertex (position + uv)
//   instanceID - index of the instance being drawn, used to address the buffers
v2f vert(appdata v, uint instanceID : SV_InstanceID)
{
    v2f o;
#ifdef USE_COMPUTE
    // scaleBuffer holds a half4; only xyz is used, so truncate explicitly.
    float3 scale = uint2ToFloat4(scaleBuffer[instanceID]).xyz;
    float2 pivot = uintToFloat2(pivotBuffer[instanceID]);
    float4 tileOffset = uint2ToFloat4(tileOffsetBuffer[instanceID]);
    float4x4 localToWorld = uint4x2ToFloat4x4(localToWorldBuffer[instanceID]);
    float4 color = uint2ToFloat4(colorBuffer[instanceID]);
    // cutoffBuffer is declared as holding a packed half, so decode its bits like
    // the other buffers instead of converting the raw integer to float.
    // NOTE(review): assumes the half sits in the low 16 bits of each element.
    float cutoff = f16tof32(cutoffBuffer[instanceID]);
#else
    float3 scale = _Scale;
    float2 pivot = _Pivot;
    float4 tileOffset = _TileOffset;
    float4x4 localToWorld = float4x4(
        1, 0, 0, 0,
        0, 1, 0, 0,
        0, 0, 1, 0,
        0, 0, 0, 1);
    float4 color = _BaseColor;
    float cutoff = _Cutoff;
#endif
    float4x4 scaleMatrix = float4x4(
        scale.x, 0, 0, 0,
        0, scale.y, 0, 0,
        0, 0, scale.z, 0,
        0, 0, 0, 1);
    // The pivot is added as a direction (w = 0) so the sum keeps the vertex's own w.
    float4 localVertexPos = mul(scaleMatrix, v.vertex) + mul(scaleMatrix, float4(pivot.x, pivot.y, 0, 0));
    float4 localTranslated = mul(localToWorld, localVertexPos);
    // NOTE(review): UnityObjectToClipPos also applies Unity's own object-to-world
    // matrix; setup() does not touch it — confirm it is identity for this draw.
    o.vertex = UnityObjectToClipPos(localTranslated);
    // Compute the remapped UV first: TRANSFORM_TEX is a macro that does not
    // parenthesise its argument, so passing the compound expression directly would
    // apply the _MainTex tiling to the offset term only.
    float2 atlasUv = v.uv * tileOffset.xy + tileOffset.zw;
    o.uv1 = TRANSFORM_TEX(atlasUv, _MainTex);
    o.uv2 = TRANSFORM_TEX(v.uv, _TransitionTex);
    o.color = color;
    o.cutoff = cutoff;
    return o;
}

// Fragment stage.
// Returns the tinted sprite colour; its alpha is zeroed wherever the transition
// mask's blue channel exceeds the threshold.
fixed4 frag(v2f i) : SV_Target
{
    fixed4 tinted = tex2D(_MainTex, i.uv1) * i.color;
    float4 mask = tex2D(_TransitionTex, i.uv2);

    // Threshold is 1 wherever the mask is not fully opaque, otherwise it is at
    // least the interpolated cutoff.
    fixed threshold = max(mask.a < 1, i.cutoff);

    // step() yields 1 when mask.b <= threshold and 0 otherwise.
    tinted.a *= step(mask.b, threshold);
    return tinted;
}
ENDCG
}
}
}

richardkettlewell · Sep 27, 2019

GilCat said: ↑

I'm using DrawMeshInstancedIndirect and it seems like the order that those quads are rendered depends on the order that the data is supplied to the compute buffer.
Click to expand...

Yes that's correct, assuming you render sequentially from within that ComputeBuffer (i.e. your vertex shader for rendering pulls data out of the ComputeBuffer in linear order)

That means it's down to you to sort the data yourself, which on the GPU is a complex topic. Depending on how many items you want to sort, you could look into GPU Bitonic Sorting - there should be some stuff on Google that you can use as a starting point, maybe even written for Unity. If you only have a small(ish) number of items to sort, you might even be able to do the sorting in a single Compute dispatch.

Alternatively, if you could "get away with" enabling Z write and Alpha Test the edges (aka Cutout render mode, using "clip" in the pixel shader) you would get hardware per-pixel sorting, but you would need MSAA to get any kind of soft edge on the Sprites.

GilCat · Sep 27, 2019

Thanks for your answer.

richardkettlewell said: ↑

you could look into GPU Bitonic Sorting - there should be some stuff on Google that you can use as a starting point, maybe even written for Unity.
Click to expand...

Yes, that came up in my research about this. I just wanted to know if I was heading in the right direction.

How is it that the default renderer does this with sprites without the need for enabling Z write and Alpha Test? Is it sorted somewhere deeper inside the rendering pipeline?

Thanks

richardkettlewell · Sep 27, 2019

GilCat said: ↑

How is it that the default renderer does this with sprites without the need for enabling Z write and Alpha Test? Is it sorted somewhere deeper inside the rendering pipeline?
Click to expand...

The default renderer knows all the object positions, and they are stored on the CPU, so we can perform a sort based on the Transparency Sort Mode defined on the Camera (usually distance to camera) and, once sorted, submit a draw call for each object in the sorted order. Sorting on the CPU is generally much easier than on the GPU.

But in your case, the object positions are in a ComputeBuffer, which is on the GPU. You're bypassing all the default Unity stuff by taking this path (presumably in order to get better performance).

BOBchasing · May 17, 2023

GilCat said: ↑

Thanks for your answer.

Yes, that came up in my research about this. I just wanted to know if I was heading in the right direction.

How is it that the default renderer does this with sprites without the need for enabling Z write and Alpha Test? Is it sorted somewhere deeper inside the rendering pipeline?

Thanks
Click to expand...

I have the same problem. Did you find a solution later?

sewy · Jan 2, 2024

I'd like to know as well @GilCat .

Search Unity

DrawMeshInstancedIndirect Z Sort

GilCat

richardkettlewell

Unity Technologies

GilCat

richardkettlewell

Unity Technologies

BOBchasing

sewy

Search Unity

Unity ID

Useful Searches

DrawMeshInstancedIndirect Z Sort

GilCat

richardkettlewell

Unity Technologies

GilCat

richardkettlewell

Unity Technologies

BOBchasing

sewy