Search Unity

Excessive Texture array samples slow on mobile

Discussion in 'Shaders' started by mrtkhosravi, Oct 25, 2017.

  1. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    I'm creating a terrain shader for mobile. It uses texture arrays. In fragment shader I do this to blend multiple textures.

    Code (CSharp):
    1. Shader "Test" {
    2.     Properties
    3.     {
    4.     _ctrlTx0("Control (RGBA)", 2D) = "red" {}
    5.     _ctrlTx1("Control (RGBA)", 2D) = "red" {}
    6.     _ctrlTxArr("Ctrl Array", 2DArray) = "black" {}
    7.     _albTxArr("Albedo Array", 2DArray) = "black" {}
    8.     _normTxArr("Normal Array", 2DArray) = "bump" {}
    9.  
    10.     _sp0("Layer 3 (A)", 2D) = "white" {}
    11.     _nm0("nm 3 (A)", 2D) = "bump" {}
    12.     _sp1("Layer 3 (A)", 2D) = "white" {}
    13.     _nm1("Layer 3 (A)", 2D) = "bump" {}
    14.     _sp2("Layer 3 (A)", 2D) = "white" {}
    15.     _nm2("Layer 3 (A)", 2D) = "bump" {}
    16.     _sp3("Layer 3 (A)", 2D) = "white" {}
    17.     _nm3("Layer 3 (A)", 2D) = "bump" {}
    18.     _sp4("Layer 3 (A)", 2D) = "white" {}
    19.     _nm4("Layer 3 (A)", 2D) = "bump" {}
    20.     _sp5("Layer 3 (A)", 2D) = "white" {}
    21.     _nm5("Layer 3 (A)", 2D) = "bump" {}
    22.     _sp6("Layer 3 (A)", 2D) = "white" {}
    23.     _nm6("Layer 3 (A)", 2D) = "bump" {}
    24.     _sp7("Layer 3 (A)", 2D) = "white" {}
    25.     _nm7("Layer 3 (A)", 2D) = "bump" {}
    26.  
    27.  
    28.     // Props
    29.     _props0("Props 0", Vector) = (1,0,0,0)
    30.     _props1("Props 1", Vector) = (1,0,0,0)
    31.     _props2("Props 2", Vector) = (1,0,0,0)
    32.     _props3("Props 3", Vector) = (1,0,0,0)
    33.     _props4("Props 4", Vector) = (1,0,0,0)
    34.     _props5("Props 5", Vector) = (1,0,0,0)
    35.     _props6("Props 6", Vector) = (1,0,0,0)
    36.     _props7("Props 7", Vector) = (1,0,0,0)
    37.  
    38.        
    39.         _terrainSize("Terrain Size", Float) = 10000
    40.         _terrainScale("Terrain Scale", Float) = 1
    41.         _Shininess("Shininess", Range(0.03, 1)) = 0.078125
    42.         _SpecColor("Specular Color", Color) = (0.5, 0.5, 0.5, 1)
    43.     }
    44.    
    45.     SubShader{
    46.         Tags{
    47.             "Queue" = "Geometry-99"
    48.             "RenderType" = "Opaque"
    49.         }
    50.         LOD 500
    51.         CGPROGRAM
    52.         #include "TestInclude.cginc"
    53.         #pragma target 3.5
    54.         #pragma debug
    55.         #pragma surface surf BlinnPhong vertex:vert noforwardadd noinstancing        
    56.        
    57.         void surf(Input IN, inout SurfaceOutput o)
    58.  
    59.         {
    60.             float2 uv = IN.uuv_ctrlTx0;
    61.             half3 alb;
    62.             half3 norm;
    63.             half smooth, metal;
    64.  
    65.             FastArray(uv, IN.uuv_sp0, IN.uuv_sp1, IN.uuv_sp2, IN.uuv_sp3, IN.uuv_sp4, IN.uuv_sp5, IN.uuv_sp6, alb, norm, metal, smooth);
    66.             o.Albedo = fixed3(alb);
    67.             o.Normal = fixed3(norm);
    68.             o.Gloss = fixed(smooth);
    69.             o.Specular = _Shininess;
    70.         }
    71.         ENDCG
    72.     }
    73. }
    The include file is:

    Code (CSharp):
    1. struct Input
    2. {
    3.     float2 uuv_sp0 : TEXCOORD0;
    4.     float2 uuv_sp1 : TEXCOORD1;
    5.     float2 uuv_sp2 : TEXCOORD2;
    6.     float2 uuv_sp3 : TEXCOORD3;
    7.     float2 uuv_sp4 : TEXCOORD4;
    8.     float2 uuv_sp5 : TEXCOORD5;
    9.     float2 uuv_sp6 : TEXCOORD6;
    10.     float2 uuv_ctrlTx0: TEXCOORD7;
    11. };
    12.  
    13. struct appdata {
    14.     float4 vertex : POSITION;
    15.     float4 tangent : TANGENT;
    16.     float3 normal : NORMAL;
    17.     float2 texcoord : TEXCOORD0;
    18.     float4 texcoord1 : TEXCOORD1;
    19.     float4 texcoord2 : TEXCOORD2;
    20. };
    21.  
    22. half _Shininess;
    23.  
    24. half4 _props0, _props1, _props2, _props3, _props4, _props5, _props6, _props7;
    25. sampler2D _sp0, _sp1, _sp2, _sp3, _sp4, _sp5, _nm0, _nm1, _nm2, _nm3, _nm4, _nm5;
    26. sampler2D _ctrlTx0, _ctrlTx1;
    27. float _terrainSize, _terrainScale;
    28. UNITY_DECLARE_TEX2DARRAY(_ctrlTxArr);
    29. UNITY_DECLARE_TEX2DARRAY(_albTxArr);
    30. UNITY_DECLARE_TEX2DARRAY(_normTxArr);
    31.  
    32. void FastArray(float2 uv, float2 uv0, float2 uv1, float2 uv2, float2 uv3, float2 uv4, float2 uv5, float2 uv6, out half3 alb, out half3 norm, out half metal, out half gloss) {
    33.     half4 ctrl0 = tex2D(_ctrlTx0, uv);
    34.     half4 ctrl1 = tex2D(_ctrlTx1, uv);
    35.  
    36.     //half4 ctrl0 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 0));
    37.     //half4 ctrl1 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 1));
    38.  
    39.     half4 sumNorm = 0;
    40.     alb = 0;
    41.  
    42.     sumNorm += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv0, 0));
    43.     alb += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv0, 0));
    44.  
    45.     sumNorm += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv1, 1));
    46.     alb += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv1, 1));
    47.  
    48.     sumNorm += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv2, 2));
    49.     alb += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv2, 2));
    50.  
    51.     sumNorm += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv3, 3));
    52.     alb += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv3, 3));
    53.  
    54.     sumNorm += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv4, 4));
    55.     alb += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv4, 4));
    56.  
    57.     sumNorm += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv5, 5));
    58.     alb += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv5, 5));
    59.  
    60.     sumNorm += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv6, 6));
    61.     alb += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv6, 6));
    62.  
    63.     sumNorm += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr, float3(uv6, 7));
    64.     alb += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr, float3(uv6, 7));
    65.  
    66.     gloss = 0.2;
    67.     norm.xyz = sumNorm.xyz * 2 - 1;
    68.     norm.z = sqrt(1 - saturate(dot(norm.xy, norm.xy)));
    69.     metal = 0.2;
    70. }
    71.  
    72. void vert(inout appdata v, out Input data)
    73. {
    74.     UNITY_INITIALIZE_OUTPUT(Input, data);
    75.     float2 uv = v.texcoord;
    76.     data.uuv_ctrlTx0 = uv;
    77.  
    78.     float2 ouv = (uv - 0.5) * _terrainSize;
    79.  
    80.     data.uuv_sp0 = ouv / _props0.x;
    81.     data.uuv_sp1 = ouv / _props1.x;
    82.     data.uuv_sp2 = ouv / _props2.x;
    83.     data.uuv_sp3 = ouv / _props3.x;
    84.     data.uuv_sp4 = ouv / _props4.x;
    85.     data.uuv_sp5 = ouv / _props5.x;
    86.     data.uuv_sp6 = ouv / _props6.x;
    87.  
    88.     v.tangent.xyz = cross(v.normal, float3(0, 0, 1));
    89.     v.tangent.w = -1;
    90. }
    91.  
    Each texture has an albedo and a normal map, so with 4 textures we sample 8 times. The strange thing is that performance drops rapidly if more than four texture pairs are sampled.
    8 samples =>24 fps
    12 samples =>14 fps
    16 samples =>7 fps

    Why does performance drop this much after 4 texture pairs (8 samples) from texture arrays?

    Ordinary textures do not have this problem; 12 samples will give 20 fps.
     
  2. bgolus

    bgolus

    Joined:
    Dec 7, 2012
    Posts:
    12,352
    I suspect you're running into the issue of sampler stalling.

    When you have 12 separate textures you generally have 12 unique samplers so you can sample all 12 textures without incurring a huge penalty (assuming memory bandwidth isn't completely saturated) as all 12 textures will be sampled in parallel. Basically sampling 12 textures at once takes only as long as the slowest texture sample.

    Sampling a texture array multiple times is going to use the same physical texture sampler unit in the hardware for all of the samples meaning they happen in serial.

    TLDR; Sampling 12 textures in a texture array with a single sampler can take 12 times longer than 12 individual textures.
     
    NathanJSmith, wcw and mrtkhosravi like this.
  3. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    OMG!! That explains a lot. I spent two straight days wondering about this. Yes, I can tell memory bandwidth is not the main issue here, because with ordinary textures it gets at least two times faster. I'll revise the shader and see if increasing the number of texture array samplers solves the issue.
     
  4. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    I tested the shader with 8 texture arrays each having only two textures. It did not get any better. FPS is 6 as before. Here is the sampling part:
    Code (CSharp):
    1.     half4 ctrl0 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 0));
    2.  
    3.     half4 sumNorm = 0;
    4.     alb = 0;
    5.  
    6.     alb += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr0, uv0);
    7.     alb += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr0, uv1);
    8.     alb += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr1, uv2);
    9.     alb += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr1, uv3);
    10.  
    11.  
    12.     sumNorm += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr0, uv0);
    13.     sumNorm += ctrl0.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr0, uv1);
    14.     sumNorm += ctrl0.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr1, uv2);
    15.     sumNorm += ctrl0.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr1, uv3);
    16.  
    17.     half4 ctrl1 = UNITY_SAMPLE_TEX2DARRAY(_ctrlTxArr, float3(uv, 1));
    18.  
    19.     alb += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr2, uv4);
    20.     alb += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_albTxArr2, uv5);
    21.     alb += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_albTxArr3, uv6);
    22.     alb += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_albTxArr3, float3(uv6.xy, 1));
    23.  
    24.     sumNorm += ctrl1.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr2, uv4);
    25.     sumNorm += ctrl1.y * UNITY_SAMPLE_TEX2DARRAY(_normTxArr2, uv5);
    26.     sumNorm += ctrl1.z * UNITY_SAMPLE_TEX2DARRAY(_normTxArr3, uv6);
    27.     sumNorm += ctrl1.w * UNITY_SAMPLE_TEX2DARRAY(_normTxArr3, float3(uv6.xy, 1));
    I tried mixing alb and sumNorm like this:

    Code (CSharp):
    1.     alb += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_albTxArr0, uv0);
    2.     sumNorm += ctrl0.x * UNITY_SAMPLE_TEX2DARRAY(_normTxArr0, uv0);
    3.  
    Still the same. I also rearranged the samples so that the samples from two slices of the same texture array are as far apart from each other as possible. No gain there either.
     
  5. bgolus

    bgolus

    Joined:
    Dec 7, 2012
    Posts:
    12,352
    You could try using UNITY_DECLARE_TEX2DARRAY_NOSAMPLER and defining inline samplers to use to enforce unique samplers per sample.
    https://docs.unity3d.com/Manual/SL-SamplerStates.html

    Or it could be texture arrays are just super slow on the hardware you're using. :/
     
    mrtkhosravi likes this.
  6. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    I don't get it. When we declare with UNITY_DECLARE_TEX2DARRAY, doesn't it already declare a sampler2DArray on gles3?

    Maybe this is the case. I will test on other devices too and report on what happens.
     
  7. bgolus

    bgolus

    Joined:
    Dec 7, 2012
    Posts:
    12,352
    Yes. The idea was to declare multiple samplers (like 12) and use those to see if it's still slower using the texture arrays. Basically confirm it's the texture arrays themselves that are the issue as that would get the shader as functionally close to the non-array version as possible. If it's still slow then you know. Though it's odd as everything I know about texture arrays wouldn't make me think there should be any difference. I haven't been working on mobile for a while though so it's not something I've kept up with as much. @slipster216 might have a better idea if you can convince him to weigh in.
     
    mrtkhosravi likes this.
  8. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    I tested the shader on another device and it ran without any problem. 16 samples on texture array resulted in 16 FPS. It seems HTC One's GPU has problems with texture arrays.

    Thank you so much for the help.
     
  9. JasonBooth

    JasonBooth

    Joined:
    Jan 27, 2014
    Posts:
    651
    A lot of mobile GPUs, especially on android, will skimp on various features in the spec, either at the driver or hardware level. While texture arrays have been around for a long time, they aren't used a whole lot, and I could imagine some particular vendor not paying attention to it.
     
    mrtkhosravi likes this.
  10. nat42

    nat42

    Joined:
    Jun 10, 2017
    Posts:
    353
    I'm not familiar with texture arrays, but the example uses I find when I Google them seem to suggest they may exist or be used to optimise a different use case - sampling from a subset of the textures bound. If that's at all the case (and there is every chance I am far off the mark), why would one expect them to be as fast as or faster than the regular texture sampling approach, where the driver might reasonably expect all textures to be sampled from?
     
  11. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    Thanks for the clear answer. That makes sense. Unfortunately in my case to support terrains with 8 albedo and 8 normal textures I have to use texture arrays.
     
  12. nat42

    nat42

    Joined:
    Jun 10, 2017
    Posts:
    353
    Why? They aren't supported on ES2, and they don't increase the number of textures you can sample from in ES3.x, do they?

    ES3.0 guarantees that GL_MAX_TEXTURE_IMAGE_UNITS is at least 16, as per https://www.khronos.org/registry/OpenGL-Refpages/es3.0/html/glGet.xhtml
     
  13. Ironmax

    Ironmax

    Joined:
    May 12, 2015
    Posts:
    890
    Try including "Cull Back" under LOD 500 and see if that improves things.
     
  14. JasonBooth

    JasonBooth

    Joined:
    Jan 27, 2014
    Posts:
    651
    On a Unity terrain, for 8 textures w/ normal you need 16 textures for the terrain types, 2 control textures, and then whatever samples the lighting pathway you're using needs as well (lightmaps, etc). So it's atlasing or texture arrays.

    On more modern APIs, you can share samplers as well..

    You might want to give MicroSplat's core module a spin then and see if you have similar results. It's free, and more optimal than the code you posted above, so it might perform better than your homegrown solution. It's also extensible via its module system, so if you don't like the modules I have available you could extend it yourself the same way I do.
     
    Martin_H likes this.
  15. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    Well, 16 is not enough. The most you can get with ordinary textures is 5 or 6 pairs of textures with normals. See slipster's answer.

    Tested and doesn't make any difference.

    Exactly. Atlas bleeding with repeated textures is dreadful, and the shader should work on mobile as well, so texture arrays are the only option.


    Yes, a lot of people say MicroSplat is the best, and judging by the videos it seems it really is. If I weren't developing my own terrain engine as an Asset Store product, I would definitely go for MicroSplat. My solution is an integrated and complex system, and it manages its own database of textures in a precompressed DXT format. I think it would be hard and would need work on both sides to integrate it with MicroSplat. Maybe if the product becomes successful we could do it in later versions.
     

    Attached Files:

  16. JasonBooth

    JasonBooth

    Joined:
    Jan 27, 2014
    Posts:
    651
    I actually have a shippable version of MicroSplat for including with other products on the Unity Asset Store. Depending on what you need to do, you might be able to write a module with your custom features and ship with MicroSplat in your product. Users are then prompted to upgrade to the free core module if they want to change the shader/material settings, and once they do, the demo version is disabled and replaced by the full system. They can then purchase additional features. Ideally, it's a win/win for everyone - other developers get a better looking demo, and we both get cross promotion between our products.
     
    mrtkhosravi and Martin_H like this.
  17. mrtkhosravi

    mrtkhosravi

    Joined:
    Nov 9, 2014
    Posts:
    198
    That's good and I agree it's a win for everybody. I'll get in touch when the project is ready.
     
    Last edited: Nov 1, 2017
  18. hungrybelome

    hungrybelome

    Joined:
    Dec 31, 2014
    Posts:
    336
    @bgolus Hi, I'm working with texture arrays and wondering about this. Does defining more samplers in the shader properties alleviate the stalling? As in if I had _TexArray1, TexArray2 inputs defined, but both have the same texture array as the input. Or does the number of shader texture inputs not map to the amount of physical samplers used? Can one texture/texture array only have one sampler no matter what, due to GPU hardware or something? Thanks!
     
  19. bgolus

    bgolus

    Joined:
    Dec 7, 2012
    Posts:
    12,352
    Defining multiple sampler properties, or just multiple inline sampler states, will generally mean more physical samplers get used, even when reusing the same texture asset.

    One thing I’ve learned since my previous post is some hardware doesn’t actually have that many physical sampling units dedicated to each shader execution thread. The actual number is sometimes difficult to determine. It seems like it’s relatively safe to assume 2 or 4 concurrent texture samples will happen roughly in parallel, but more than that might be reusing the physical hardware and become serial again.
     
    hungrybelome likes this.
  20. hungrybelome

    hungrybelome

    Joined:
    Dec 31, 2014
    Posts:
    336
    Thank you! This info helps a ton. I'm using 2x 30-count TextureArrays and sampling from each one twice, with each sample using its own defined sampler property, and I wasn't sure if this would actually mean that they are sampling from the same texture array input in parallel.

    If I have multiple materials/shaders that all use the same TextureArray inputs, is there some sort of performance gain when switching between those materials? I was reading this Draw Call Cost Analysis and they graph out the performance cost of reusing/changing textures, but I'm not sure if I am interpreting their data correctly.
     
  21. bgolus

    bgolus

    Joined:
    Dec 7, 2012
    Posts:
    12,352
    Swapping between shaders has a big cost. Swapping between materials has a smaller cost. Swapping textures seems to have a very minimal cost. So by the metrics being used in those tests, reusing a texture array across multiple materials / shaders isn't a huge improvement vs having unique textures per material.

    That said, it appears to be a fairly confined test where memory bandwidth isn't necessarily being exercised heavily, so real world use cases may see better performance with texture reuse than those tests make it seem.
     
    hungrybelome likes this.
  22. hungrybelome

    hungrybelome

    Joined:
    Dec 31, 2014
    Posts:
    336
    Thank you!
     
  23. zorrendor

    zorrendor

    Joined:
    May 27, 2017
    Posts:
    5
    I have the same problem with the huge cost of accessing texture arrays on mobile, using an iPhone 6. I found that if I use something like this, it runs at almost the same speed as accessing a regular texture, around 126 µs:
    Code (CSharp):
    1. half4 frag (v2f i) : SV_Target
    2. {
    3.         i.uv.z = 3.0;
    4.         return UNITY_SAMPLE_TEX2DARRAY(_MainTex, i.uv.xyz);
    5. }
    But if I pass the z slice coordinate in from the vertex shader, even with that same value, performance drops several times over: every texture access costs 2 ms, so if I blend 3 textures it's already 6 ms just for textures. Changing the xy coordinates at runtime doesn't really affect performance much, but it seems that when I touch the slice coordinate it does some kind of texture replacement internally.
    Also, I tried using a texture atlas, and it works nicely but has the common bleeding and mipmap problems.

    I'd be really grateful if somebody could explain how this works on mobile; I couldn't find anything about it on the official site.
     
    Last edited: Feb 9, 2020
  24. Neto_Kokku

    Neto_Kokku

    Joined:
    Feb 15, 2018
    Posts:
    1,751
    Yikes! It sounds like the hardware there has issues with its texture array implementation. My guess is that with a static index, the shader can start sampling the texture ahead of time, before your fragment shader even executes, but a dynamic index (even a vertex interpolated one) stalls the sampling in a similar way to an indirect sampling.

    It doesn't seem like something you can work around, so I guess you'll have to rely on an atlas solution, implementing the necessary measures to remove bleeding.