Search Unity

Custom grass renderer performance question

Discussion in 'Shaders' started by customphase, Mar 15, 2019.

  1. customphase

    customphase

    Joined:
    Aug 19, 2012
    Posts:
    246
    I'm writing my own grass renderer using this approach:
    1) At the Start, generate a bunch of points where each grass blade would be, store them in a StructuredBuffer.
    2) Every frame run a compute shader, generating a quad for each grass point, writing 6 vertices per quad into another Structured buffer.
    3) Prepare a command buffer with a DrawProcedural call in it. pointsNum is the total number of grass origin points; it is multiplied by 6 since there are 6 vertices in each quad:
    Code (CSharp):
    1. drawBuffer.DrawProcedural(transform.localToWorldMatrix, grassRenderMat, 0, MeshTopology.Triangles, pointsNum * 6);
    4) In the shader just access that vertices buffer and assign the data to the current vertex.

    The problem is performance. The main bottleneck is the rendering part (the compute part takes less than 0.5 ms, so it's not a problem); on my 1050 Ti it takes about 8-9 ms for 100k grass blades. Although that is within a 60 fps target, in my view that's still really slow, considering there's this paper, for example (http://developer.amd.com/wordpress/media/2012/10/i3dGrassFINAL.pdf), where they use a similar approach and can render 4M grass blades in under 6 ms (for the actual rendering part), although they don't mention what kind of GPU they used.

    Am I wrong to assume it's slow? Or am I doing something wrong?

    I tried simplifying the frag part of the shader, but it yielded only a very insignificant performance increase.

    I also tried reducing the grass blades count by a factor of X, and rendering X instances instead, but it again did not help with performance whatsoever.

    Here's the relevant code:

    Code (CSharp):
    1. struct VertexData { // One element of _GrassGeometryBuffer; grassVert reads one of these per SV_VertexID.
    2.     float3 wpos; // Vertex position. NOTE(review): named as world-space, yet grassVert feeds it to UnityObjectToClipPos - confirm the intended space.
    3.     float3 wnorm; // Normal, copied unchanged into v2f.norm by grassVert.
    4.     float2 uv; // Texture coordinates, copied unchanged into v2f.uv by grassVert.
    5.     float dryness; // Lerp factor in grassVert: 0 keeps the tint at 1, 1 gives float3(0.55, 0.3, 0).
    6.     float grazing; // Forwarded to the fragment stage through v2f.grazing.
    7.     float2 padding; // Not read by the shader code shown here; presumably pads the struct to 12 floats - confirm against the compute shader layout.
    8. };
    9.  
    10. StructuredBuffer<VertexData> _GrassGeometryBuffer; // Vertex records, indexed by SV_VertexID in grassVert.
    11. float3 _MainCamForw; // Not referenced in the code shown here.
    12. float _DoCulling; // Not referenced in the code shown here.
    13. sampler2D _GrassDynamicsTex; // Texture sampled by SampleDynamics.
    14. float4x4 _GrassHeightViewProj; // Matrix SampleDynamics multiplies a position by before sampling _GrassDynamicsTex.
    15.  
    16. struct v2f { // Vertex-to-fragment interpolators; filled in by grassVert.
    17.     float4 pos : SV_POSITION; // Result of UnityObjectToClipPos on the vertex position.
    18.     float3 norm : NORMAL; // VertexData.wnorm, passed through.
    19.     float2 uv : TEXCOORD0; // VertexData.uv, passed through.
    20.     float3 wpos : TEXCOORD1; // VertexData.wpos, passed through.
    21.     float3 color : TEXCOORD2; // Tint computed from VertexData.dryness.
    22.     float grazing : TEXCOORD3; // VertexData.grazing, passed through.
    23. };
    24.  
    25. float4 SampleDynamics(float3 center) { // Samples _GrassDynamicsTex at the projection of center through _GrassHeightViewProj. Not called by the code shown here.
    26.     float4 pos = mul(_GrassHeightViewProj, float4(center, 1)); // Project the point with w = 1.
    27.     pos.xy /= pos.w; // Perspective divide.
    28.     pos.xy = pos.xy * 0.5 + 0.5; // Maps the [-1, 1] range onto [0, 1] for use as texture coordinates.
    29.     pos.x = 1 - pos.x; // Horizontal flip. NOTE(review): presumably matches how the dynamics texture is rendered - confirm.
    30.     return tex2Dlod(_GrassDynamicsTex, float4(pos.xy, 0, 0)); // Explicit mip level 0 (the w component).
    31. }
    32.  
    33. v2f grassVert(uint vid : SV_VertexID) { // Vertex shader with no mesh input: every attribute is fetched from _GrassGeometryBuffer[vid].
    34.  
    35.     v2f OUT;
    36.     OUT.wpos = _GrassGeometryBuffer[vid].wpos;
    37.     OUT.pos = UnityObjectToClipPos(float4(OUT.wpos, 1)); // NOTE(review): this applies the object-to-world matrix to wpos - confirm wpos is not already in world space.
    38.     OUT.color = lerp(1, float3(0.55, 0.3, 0), _GrassGeometryBuffer[vid].dryness); // Tint moves from 1 toward (0.55, 0.3, 0) as dryness goes from 0 to 1.
    39.     OUT.uv = _GrassGeometryBuffer[vid].uv;
    40.     OUT.norm = _GrassGeometryBuffer[vid].wnorm;
    41.     OUT.grazing = _GrassGeometryBuffer[vid].grazing;
    42.     return OUT;
    43. }

    Code (CSharp):
    1. // Upgrade NOTE: replaced 'mul(UNITY_MATRIX_MVP,*)' with 'UnityObjectToClipPos(*)'
    2.  
    3. Shader "Custom/GrassGeometryShader" {
    4.     Properties{
    5.         _Color("Color", Color) = (1,1,1,1)
    6.         _MainTex("Albedo (RGB)", 2D) = "white" {}
    7.         _ShadowTex("Shadow (RGB)", 2D) = "white" {}
    8.         _Smoothness("Smoothness", Range(0,1)) = 0.5
    9.         _GrassHeight("Grass Height", Float) = 0.25
    10.         _GrassWidth("Grass Width", Float) = 0.25
    11.         _WindStrength("Wind strength", Float) = 1
    12.         _WindSpeed("Wind speed", Float) = 1
    13.         _AOStrength("AO strength", Range(0,1)) = 1
    14.         _GrazeReduction("Graze reduction", Range(0,1)) = 0.5
    15.     }
    16.     SubShader{
    17.  
    18.         Pass{
    19.  
    20.             Tags { "LightMode" = "Deferred" }
    21.  
    22.             Stencil {
    23.                 Ref 128
    24.                 Comp always
    25.                 Pass replace
    26.             }
    27.  
    28.             LOD 200
    29.             CULL Off
    30.  
    31.             CGPROGRAM
    32.  
    33.             #define LIGHTPROBE_SH 1
    34.             #define UNITY_SAMPLE_FULL_SH_PER_PIXEL 1
    35.  
    36.             // NOTE: shader model 5.0 is targeted (see #pragma target 5.0 below); this pass has no geometry shader stage
    37.             #include "UnityCG.cginc"
    38.             #include "MyGrassCompute.cginc"
    39.             #include "UnityStandardUtils.cginc"
    40.             #include "UnityStandardBRDF.cginc"
    41.             #include "UnityPBSLighting.cginc"
    42.             #include "UnityGlobalIllumination.cginc"
    43.             #include "MyDeferredHelpers.cginc"
    44.  
    45.             #pragma target 5.0
    46.             #pragma vertex grassVert
    47.             #pragma fragment frag
    48.  
    49.             sampler2D _MainTex;
    50.             half _Smoothness;
    51.             half _AOStrength;
    52.             half _GrazeReduction;
    53.  
    54.             struct FragmentOutput { // The four render targets written by the frag function of this pass.
    55.                 float4 gBuffer0 : SV_Target0; // frag writes rgb = tinted texture color, a = 1.
    56.                 float4 gBuffer1 : SV_Target1; // frag writes rgb = constant 0.03, a = _Smoothness.
    57.                 float4 gBuffer2 : SV_Target2; // frag writes rgb = normal remapped with * 0.5 + 0.5, a = 1.
    58.                 float4 gBuffer3 : SV_Target3; // frag writes rgb = UNITY_BRDF_PBS result, a = 1.
    59.             };
    60.  
    61.             FragmentOutput frag(v2f IN) { // Fragment shader of the Deferred pass: alpha-tests the blade texture and fills the four render targets.
    62.                 FragmentOutput output;
    63.                 float4 c = tex2D(_MainTex, IN.uv);
    64.                 //c.rgb = lerp(c.rgb, c.rgb*float3(0.55,0.3,0), IN.dryness);
    65.                 c.rgb *= IN.color; // Apply the dryness tint computed in the vertex shader.
    66.                 c.rgb *= lerp(1, saturate(IN.uv.y+0.2), _AOStrength); // Darkens low uv.y values; _AOStrength blends between no darkening (0) and full (1).
    67.                 float graze = saturate(1-IN.grazing*2); // 1 when grazing is 0, falls to 0 once grazing reaches 0.5.
    68.                 clip(c.a - 0.3 - graze*0.7*_GrazeReduction); // Discard when alpha is below 0.3 plus the graze-dependent offset.
    69.                 output.gBuffer0.rgb = c.rgb;
    70.                 output.gBuffer0.a = 1;
    71.                 output.gBuffer1.rgb = float3(0.03,0.03,0.03);
    72.                 output.gBuffer1.a = _Smoothness;
    73.                 output.gBuffer2 = float4(IN.norm * 0.5 + 0.5, 1); // Remap the normal from [-1, 1] to [0, 1] for storage.
    74.  
    75.                 //======= AMBIENT LIGHT =========
    76.                 /**/
    77.                 float3 eyeVec = _WorldSpaceCameraPos - IN.wpos; // Unnormalized vector from the fragment position to the camera.
    78.                 FragmentCommonData s = FragmentSetup(c, saturate(0.6-abs(_WorldSpaceLightPos0.y)), -eyeVec, float4(IN.norm, 1), IN.wpos, 1); // NOTE(review): FragmentSetup is not defined in the code shown here (presumably in one of the includes) - confirm the argument order.
    79.  
    80.                 #if UNITY_ENABLE_REFLECTION_BUFFERS
    81.                     bool sampleReflectionsInDeferred = false;
    82.                 #else
    83.                     bool sampleReflectionsInDeferred = true;
    84.                 #endif
    85.  
    86.                 UnityLight dummyLight = DummyLight();
    87.                 half atten = 1;
    88.  
    89.                 UnityGI gi = FragmentGI(s, 1, float4(0, 0, 0, 1), atten, dummyLight, sampleReflectionsInDeferred);
    90.  
    91.                 half3 emissiveColor = UNITY_BRDF_PBS(s.diffColor, s.specColor, s.oneMinusReflectivity, s.smoothness, IN.norm, eyeVec, gi.light, gi.indirect).rgb; // NOTE(review): eyeVec is not normalized and has the opposite sign of the -eyeVec given to FragmentSetup - confirm this is intended.
    92.  
    93.                 output.gBuffer3 = float4(emissiveColor, 1);
    94.                 return output;
    95.             }
    96.  
    97.             ENDCG
    98.         }
    99.  
    100.         Pass
    101.         {
    102.             //Tags { "LightMode" = "ShadowCaster" }
    103.  
    104.             Stencil {
    105.                 Ref 128
    106.                 Comp always
    107.                 Pass replace
    108.             }
    109.  
    110.             LOD 200
    111.             CULL Off
    112.  
    113.             CGPROGRAM
    114.  
    115.             #include "UnityCG.cginc"
    116.             #include "MyGrassCompute.cginc"
    117.  
    118.             #pragma target 5.0
    119.             #pragma vertex grassVert
    120.             #pragma fragment frag
    121.             #pragma multi_compile GRASS_SHADOW _
    122.  
    123.             sampler2D _MainTex;
    124.  
    125.             float4 frag(v2f i) : COLOR // Fragment shader of the second pass: alpha-tests, then expands SHADOW_CASTER_FRAGMENT.
    126.             {
    127.                 float4 c = tex2D(_MainTex, i.uv);
    128.                 clip(c.a - 0.2); // Discard fragments whose texture alpha is below 0.2.
    129.                 SHADOW_CASTER_FRAGMENT(i) // NOTE(review): the LightMode = ShadowCaster tag of this pass is commented out, and v2f is not declared with V2F_SHADOW_CASTER - confirm the macro works with this input.
    130.             }
    131.  
    132.             ENDCG
    133.         }
    134.  
    135.     }
    136. }
     
    Last edited: Mar 15, 2019
  2. jvo3dc

    jvo3dc

    Joined:
    Oct 11, 2013
    Posts:
    1,520
    The paper you refer to doesn't render individual blades of grass, but patches with multiple blades instead. My experience is that that is the only way to go for grass (for now.) The full patch in the referenced paper contains 16.384 blades. (They dynamically skip a lot of them.)

    Also note that you only need 4 vertices for a quad, not 6.
     
  3. customphase

    customphase

    Joined:
    Aug 19, 2012
    Posts:
    246
    Thanks for the reply!

    That's why I said I tried generating only 1/X of the grass quads and rendering X instances of that instead (using DrawProceduralIndirect), which is what I assume they do in the paper, but it didn't change anything. The performance impact was literally the same.

    Yeah, I know. But for some reason I thought DrawProcedural didn't support line strips (which you need to render a quad with 4 vertices procedurally). Now that I've looked into it more, it does support them; I'll try switching to that and see if it helps.
     
  4. customphase

    customphase

    Joined:
    Aug 19, 2012
    Posts:
    246
    Actually, I was mistaken: I was thinking of triangle strips, which are indeed not supported. Line strips are not the same. And now that I think about it again, having triangle strips wouldn't help anyway.
    There's MeshTopology.Quads, but according to the documentation this seems to be just an emulation, and it has many problems according to these posts:
    https://forum.unity.com/threads/gra...shtopology-quads-working-as-triangles.491007/
    https://forum.unity.com/threads/drawproceduralindirect-for-quads-broken.616150/
    Anyway, I tried them and got some performance increase, but not nearly enough: about 4-6%.

    Sooo, we're back to square one.
     
    Last edited: Mar 15, 2019
  5. customphase

    customphase

    Joined:
    Aug 19, 2012
    Posts:
    246
    Still looking for an answer/advice/explanation here
     
  6. customphase

    customphase

    Joined:
    Aug 19, 2012
    Posts:
    246
    Solved the issue thanks to these articles:

    https://80.lv/articles/preparing-realistic-grass-in-ue4/
    https://80.lv/articles/creating-next-gen-grass-in-ue4/

    The main problem was overdraw. The solution was to NOT draw quads with a grass-clump texture, the way you usually would with billboarded grass, and instead to prepare a mesh of a clump of grass, with one or two quads per grass blade and 100-200 grass blades in total, and "instantiate" that mesh through the shader. Using this approach I can now render about 400k blades of grass in about 4 ms (+3 ms for shadows at a medium resolution; increasing the shadow resolution will increase this cost as well) on my 1050 Ti at 1080p:

    upload_2019-3-21_2-41-54.png
     
    DMorock, jvo3dc and bgolus like this.
  7. jvo3dc

    jvo3dc

    Joined:
    Oct 11, 2013
    Posts:
    1,520
    Right, so:
    I'm pretty surprised that that would reduce overdraw by much. It does reduce the amount of data and draw calls per grass blade. In any case, I'm glad you got it working. Looks good!
     
    customphase likes this.