metal: SDL_RenderFillRects uses one draw call per 16k rectangles (within the given FillRects call), instead of one draw call per rectangle. Reduces CPU usage when drawing many rectangles.
author Alex Szpakowski <slime73@gmail.com>
Wed, 21 Nov 2018 23:37:23 -0400
changeset 12427 7508e848e7cf
parent 12426 779d711b6b5e
child 12428 f5d2abe5af7f
metal: SDL_RenderFillRects uses one draw call per 16k rectangles (within the given FillRects call), instead of one draw call per rectangle. Reduces CPU usage when drawing many rectangles.
src/render/metal/SDL_render_metal.m
     1.1 --- a/src/render/metal/SDL_render_metal.m	Sun Nov 04 21:08:40 2018 +0100
     1.2 +++ b/src/render/metal/SDL_render_metal.m	Wed Nov 21 23:37:23 2018 -0400
     1.3 @@ -117,6 +117,7 @@
     1.4      @property (nonatomic, retain) id<MTLSamplerState> mtlsamplernearest;
     1.5      @property (nonatomic, retain) id<MTLSamplerState> mtlsamplerlinear;
     1.6      @property (nonatomic, retain) id<MTLBuffer> mtlbufconstants;
     1.7 +    @property (nonatomic, retain) id<MTLBuffer> mtlbufquadindices;
     1.8      @property (nonatomic, retain) CAMetalLayer *mtllayer;
     1.9      @property (nonatomic, retain) MTLRenderPassDescriptor *mtlpassdesc;
    1.10      @property (nonatomic, assign) METAL_ShaderPipelines *activepipelines;
    1.11 @@ -137,6 +138,7 @@
    1.12      [_mtlsamplernearest release];
    1.13      [_mtlsamplerlinear release];
    1.14      [_mtlbufconstants release];
    1.15 +    [_mtlbufquadindices release];
    1.16      [_mtllayer release];
    1.17      [_mtlpassdesc release];
    1.18      [super dealloc];
    1.19 @@ -794,7 +796,6 @@
    1.20  static int
    1.21  METAL_QueueFillRects(SDL_Renderer * renderer, SDL_RenderCommand *cmd, const SDL_FRect * rects, int count)
    1.22  {
    1.23 -    // !!! FIXME: use an index buffer
    1.24      const size_t vertlen = (sizeof (float) * 8) * count;
    1.25      float *verts = (float *) SDL_AllocateRenderVertices(renderer, vertlen, 0, &cmd->data.draw.first);
    1.26      if (!verts) {
    1.27 @@ -803,6 +804,11 @@
    1.28  
    1.29      cmd->data.draw.count = count;
    1.30  
    1.31 +    /* Quads in the following vertex order (matches the quad index buffer):
    1.32 +     * 1---3
    1.33 +     * | \ |
    1.34 +     * 0---2
    1.35 +     */
    1.36      for (int i = 0; i < count; i++, rects++) {
    1.37          if ((rects->w <= 0.0f) || (rects->h <= 0.0f)) {
    1.38              cmd->data.draw.count--;
    1.39 @@ -829,9 +835,8 @@
    1.40  METAL_QueueCopy(SDL_Renderer * renderer, SDL_RenderCommand *cmd, SDL_Texture * texture,
    1.41                  const SDL_Rect * srcrect, const SDL_FRect * dstrect)
    1.42  {
    1.43 -    METAL_TextureData *texturedata = (__bridge METAL_TextureData *)texture->driverdata;
    1.44 -    const float texw = (float) texturedata.mtltexture.width;
    1.45 -    const float texh = (float) texturedata.mtltexture.height;
    1.46 +    const float texw = (float) texture->w;
    1.47 +    const float texh = (float) texture->h;
    1.48      // !!! FIXME: use an index buffer
    1.49      const size_t vertlen = (sizeof (float) * 16);
    1.50      float *verts = (float *) SDL_AllocateRenderVertices(renderer, vertlen, 0, &cmd->data.draw.first);
    1.51 @@ -867,9 +872,8 @@
    1.52                    const SDL_Rect * srcquad, const SDL_FRect * dstrect,
    1.53                    const double angle, const SDL_FPoint *center, const SDL_RendererFlip flip)
    1.54  {
    1.55 -    METAL_TextureData *texturedata = (__bridge METAL_TextureData *)texture->driverdata;
    1.56 -    const float texw = (float) texturedata.mtltexture.width;
    1.57 -    const float texh = (float) texturedata.mtltexture.height;
    1.58 +    const float texw = (float) texture->w;
    1.59 +    const float texh = (float) texture->h;
    1.60      const float rads = (float)(M_PI * (float) angle / 180.0f);
    1.61      const float c = cosf(rads), s = sinf(rads);
    1.62      float minu, maxu, minv, maxv;
    1.63 @@ -1159,10 +1163,19 @@
    1.64  
    1.65              case SDL_RENDERCMD_FILL_RECTS: {
    1.66                  const size_t count = cmd->data.draw.count;
    1.67 -                size_t start = 0;
    1.68 +                const size_t maxcount = UINT16_MAX / 4;
    1.69                  SetDrawState(renderer, cmd, SDL_METAL_FRAGMENT_SOLID, CONSTANTS_OFFSET_IDENTITY, mtlbufvertex, &statecache);
    1.70 -                for (size_t i = 0; i < count; i++, start += 4) {   // !!! FIXME: can we do all of these this with a single draw call, using MTLPrimitiveTypeTriangle and an index buffer?
    1.71 -                    [data.mtlcmdencoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:start vertexCount:4];
    1.72 +                /* The quad index buffer uses 16 bit indices and holds UINT16_MAX / 4
    1.73 +                 * quads, so we can only draw 65k vertices (16k rects) at a time. */
    1.74 +                for (size_t i = 0; i < count; i += maxcount) {
    1.75 +                    /* Set the vertex buffer offset for our current positions.
    1.76 +                     * The vertex buffer itself was bound in SetDrawState. */
    1.77 +                    [data.mtlcmdencoder setVertexBufferOffset:cmd->data.draw.first + i*sizeof(float)*8 atIndex:0];
    1.78 +                    [data.mtlcmdencoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
    1.79 +                                                   indexCount:SDL_min(maxcount, count - i) * 6
    1.80 +                                                    indexType:MTLIndexTypeUInt16
    1.81 +                                                  indexBuffer:data.mtlbufquadindices
    1.82 +                                            indexBufferOffset:0];
    1.83                  }
    1.84                  break;
    1.85              }
    1.86 @@ -1424,11 +1437,6 @@
    1.87      #if !__has_feature(objc_arc)
    1.88      [mtlbufconstantstaging autorelease];
    1.89      #endif
    1.90 -    mtlbufconstantstaging.label = @"SDL constant staging data";
    1.91 -
    1.92 -    id<MTLBuffer> mtlbufconstants = [data.mtldevice newBufferWithLength:CONSTANTS_LENGTH options:MTLResourceStorageModePrivate];
    1.93 -    data.mtlbufconstants = mtlbufconstants;
    1.94 -    data.mtlbufconstants.label = @"SDL constant data";
    1.95  
    1.96      char *constantdata = [mtlbufconstantstaging contents];
    1.97      SDL_memcpy(constantdata + CONSTANTS_OFFSET_IDENTITY, identitytransform, sizeof(identitytransform));
    1.98 @@ -1437,10 +1445,42 @@
    1.99      SDL_memcpy(constantdata + CONSTANTS_OFFSET_DECODE_BT601, decodetransformBT601, sizeof(decodetransformBT601));
   1.100      SDL_memcpy(constantdata + CONSTANTS_OFFSET_DECODE_BT709, decodetransformBT709, sizeof(decodetransformBT709));
   1.101  
   1.102 +    int quadcount = UINT16_MAX / 4;
   1.103 +    size_t indicessize = sizeof(UInt16) * quadcount * 6;
   1.104 +    id<MTLBuffer> mtlbufquadindicesstaging = [data.mtldevice newBufferWithLength:indicessize options:MTLResourceStorageModeShared];
   1.105 +#if !__has_feature(objc_arc)
   1.106 +    [mtlbufquadindicesstaging autorelease];
   1.107 +#endif
   1.108 +
   1.109 +    /* Quads in the following vertex order (matches the FillRects vertices):
   1.110 +     * 1---3
   1.111 +     * | \ |
   1.112 +     * 0---2
   1.113 +     */
   1.114 +    UInt16 *indexdata = [mtlbufquadindicesstaging contents];
   1.115 +    for (int i = 0; i < quadcount; i++) {
   1.116 +        indexdata[i * 6 + 0] = i * 4 + 0;
   1.117 +        indexdata[i * 6 + 1] = i * 4 + 1;
   1.118 +        indexdata[i * 6 + 2] = i * 4 + 2;
   1.119 +
   1.120 +        indexdata[i * 6 + 3] = i * 4 + 2;
   1.121 +        indexdata[i * 6 + 4] = i * 4 + 1;
   1.122 +        indexdata[i * 6 + 5] = i * 4 + 3;
   1.123 +    }
   1.124 +
   1.125 +    id<MTLBuffer> mtlbufconstants = [data.mtldevice newBufferWithLength:CONSTANTS_LENGTH options:MTLResourceStorageModePrivate];
   1.126 +    data.mtlbufconstants = mtlbufconstants;
   1.127 +    data.mtlbufconstants.label = @"SDL constant data";
   1.128 +
   1.129 +    id<MTLBuffer> mtlbufquadindices = [data.mtldevice newBufferWithLength:indicessize options:MTLResourceStorageModePrivate];
   1.130 +    data.mtlbufquadindices = mtlbufquadindices;
   1.131 +    data.mtlbufquadindices.label = @"SDL quad index buffer";
   1.132 +
   1.133      id<MTLCommandBuffer> cmdbuffer = [data.mtlcmdqueue commandBuffer];
   1.134      id<MTLBlitCommandEncoder> blitcmd = [cmdbuffer blitCommandEncoder];
   1.135  
   1.136 -    [blitcmd copyFromBuffer:mtlbufconstantstaging sourceOffset:0 toBuffer:data.mtlbufconstants destinationOffset:0 size:CONSTANTS_LENGTH];
   1.137 +    [blitcmd copyFromBuffer:mtlbufconstantstaging sourceOffset:0 toBuffer:mtlbufconstants destinationOffset:0 size:CONSTANTS_LENGTH];
   1.138 +    [blitcmd copyFromBuffer:mtlbufquadindicesstaging sourceOffset:0 toBuffer:mtlbufquadindices destinationOffset:0 size:indicessize];
   1.139  
   1.140      [blitcmd endEncoding];
   1.141      [cmdbuffer commit];
   1.142 @@ -1503,8 +1543,10 @@
   1.143  #endif
   1.144  #else
   1.145  #ifdef __IPHONE_11_0
   1.146 -    if ([mtldevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily4_v1]) {
   1.147 -        maxtexsize = 16384;
   1.148 +    if (@available(iOS 11.0, *)) {
   1.149 +        if ([mtldevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily4_v1]) {
   1.150 +            maxtexsize = 16384;
   1.151 +        }
   1.152      } else
   1.153  #endif
   1.154  #ifdef __IPHONE_10_0
   1.155 @@ -1529,6 +1571,7 @@
   1.156      [mtlsamplernearest release];
   1.157      [mtlsamplerlinear release];
   1.158      [mtlbufconstants release];
   1.159 +    [mtlbufquadindices release];
   1.160      [view release];
   1.161      [data release];
   1.162      [mtldevice release];