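metal: SDL_RenderFillRects now issues indexed draw calls that each cover a batch of rectangles (up to UINT16_MAX / 6, about 10.9k, per draw call within a given FillRects call; the shared quad index buffer holds about 16k quads), instead of one draw call per rectangle. This reduces CPU usage when drawing many rectangles.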
1.1 --- a/src/render/metal/SDL_render_metal.m Sun Nov 04 21:08:40 2018 +0100
1.2 +++ b/src/render/metal/SDL_render_metal.m Wed Nov 21 23:37:23 2018 -0400
1.3 @@ -117,6 +117,7 @@
1.4 @property (nonatomic, retain) id<MTLSamplerState> mtlsamplernearest;
1.5 @property (nonatomic, retain) id<MTLSamplerState> mtlsamplerlinear;
1.6 @property (nonatomic, retain) id<MTLBuffer> mtlbufconstants;
1.7 + @property (nonatomic, retain) id<MTLBuffer> mtlbufquadindices;
1.8 @property (nonatomic, retain) CAMetalLayer *mtllayer;
1.9 @property (nonatomic, retain) MTLRenderPassDescriptor *mtlpassdesc;
1.10 @property (nonatomic, assign) METAL_ShaderPipelines *activepipelines;
1.11 @@ -137,6 +138,7 @@
1.12 [_mtlsamplernearest release];
1.13 [_mtlsamplerlinear release];
1.14 [_mtlbufconstants release];
1.15 + [_mtlbufquadindices release];
1.16 [_mtllayer release];
1.17 [_mtlpassdesc release];
1.18 [super dealloc];
1.19 @@ -794,7 +796,6 @@
1.20 static int
1.21 METAL_QueueFillRects(SDL_Renderer * renderer, SDL_RenderCommand *cmd, const SDL_FRect * rects, int count)
1.22 {
1.23 - // !!! FIXME: use an index buffer
1.24 const size_t vertlen = (sizeof (float) * 8) * count;
1.25 float *verts = (float *) SDL_AllocateRenderVertices(renderer, vertlen, 0, &cmd->data.draw.first);
1.26 if (!verts) {
1.27 @@ -803,6 +804,11 @@
1.28
1.29 cmd->data.draw.count = count;
1.30
1.31 + /* Quads in the following vertex order (matches the quad index buffer):
1.32 + * 1---3
1.33 + * | \ |
1.34 + * 0---2
1.35 + */
1.36 for (int i = 0; i < count; i++, rects++) {
1.37 if ((rects->w <= 0.0f) || (rects->h <= 0.0f)) {
1.38 cmd->data.draw.count--;
1.39 @@ -829,9 +835,8 @@
1.40 METAL_QueueCopy(SDL_Renderer * renderer, SDL_RenderCommand *cmd, SDL_Texture * texture,
1.41 const SDL_Rect * srcrect, const SDL_FRect * dstrect)
1.42 {
1.43 - METAL_TextureData *texturedata = (__bridge METAL_TextureData *)texture->driverdata;
1.44 - const float texw = (float) texturedata.mtltexture.width;
1.45 - const float texh = (float) texturedata.mtltexture.height;
1.46 + const float texw = (float) texture->w;
1.47 + const float texh = (float) texture->h;
1.48 // !!! FIXME: use an index buffer
1.49 const size_t vertlen = (sizeof (float) * 16);
1.50 float *verts = (float *) SDL_AllocateRenderVertices(renderer, vertlen, 0, &cmd->data.draw.first);
1.51 @@ -867,9 +872,8 @@
1.52 const SDL_Rect * srcquad, const SDL_FRect * dstrect,
1.53 const double angle, const SDL_FPoint *center, const SDL_RendererFlip flip)
1.54 {
1.55 - METAL_TextureData *texturedata = (__bridge METAL_TextureData *)texture->driverdata;
1.56 - const float texw = (float) texturedata.mtltexture.width;
1.57 - const float texh = (float) texturedata.mtltexture.height;
1.58 + const float texw = (float) texture->w;
1.59 + const float texh = (float) texture->h;
1.60 const float rads = (float)(M_PI * (float) angle / 180.0f);
1.61 const float c = cosf(rads), s = sinf(rads);
1.62 float minu, maxu, minv, maxv;
1.63 @@ -1159,10 +1163,19 @@
1.64
1.65 case SDL_RENDERCMD_FILL_RECTS: {
1.66 const size_t count = cmd->data.draw.count;
1.67 - size_t start = 0;
1.68 + const size_t maxcount = UINT16_MAX / 6;
1.69 SetDrawState(renderer, cmd, SDL_METAL_FRAGMENT_SOLID, CONSTANTS_OFFSET_IDENTITY, mtlbufvertex, &statecache);
1.70 - for (size_t i = 0; i < count; i++, start += 4) { // !!! FIXME: can we do all of these this with a single draw call, using MTLPrimitiveTypeTriangle and an index buffer?
1.71 - [data.mtlcmdencoder drawPrimitives:MTLPrimitiveTypeTriangleStrip vertexStart:start vertexCount:4];
1.72 + /* Our index buffer has 16 bit indices, so it can address at most 65k vertices (16k
1.73 + * rects); each draw call here is capped at maxcount = UINT16_MAX / 6 (10922) rects. */
1.74 + for (size_t i = 0; i < count; i += maxcount) {
1.75 + /* Set the vertex buffer offset for our current positions.
1.76 + * The vertex buffer itself was bound in SetDrawState. */
1.77 + [data.mtlcmdencoder setVertexBufferOffset:cmd->data.draw.first + i*sizeof(float)*8 atIndex:0];
1.78 + [data.mtlcmdencoder drawIndexedPrimitives:MTLPrimitiveTypeTriangle
1.79 + indexCount:SDL_min(maxcount, count - i) * 6
1.80 + indexType:MTLIndexTypeUInt16
1.81 + indexBuffer:data.mtlbufquadindices
1.82 + indexBufferOffset:0];
1.83 }
1.84 break;
1.85 }
1.86 @@ -1424,11 +1437,6 @@
1.87 #if !__has_feature(objc_arc)
1.88 [mtlbufconstantstaging autorelease];
1.89 #endif
1.90 - mtlbufconstantstaging.label = @"SDL constant staging data";
1.91 -
1.92 - id<MTLBuffer> mtlbufconstants = [data.mtldevice newBufferWithLength:CONSTANTS_LENGTH options:MTLResourceStorageModePrivate];
1.93 - data.mtlbufconstants = mtlbufconstants;
1.94 - data.mtlbufconstants.label = @"SDL constant data";
1.95
1.96 char *constantdata = [mtlbufconstantstaging contents];
1.97 SDL_memcpy(constantdata + CONSTANTS_OFFSET_IDENTITY, identitytransform, sizeof(identitytransform));
1.98 @@ -1437,10 +1445,42 @@
1.99 SDL_memcpy(constantdata + CONSTANTS_OFFSET_DECODE_BT601, decodetransformBT601, sizeof(decodetransformBT601));
1.100 SDL_memcpy(constantdata + CONSTANTS_OFFSET_DECODE_BT709, decodetransformBT709, sizeof(decodetransformBT709));
1.101
1.102 + int quadcount = UINT16_MAX / 4;
1.103 + size_t indicessize = sizeof(UInt16) * quadcount * 6;
1.104 + id<MTLBuffer> mtlbufquadindicesstaging = [data.mtldevice newBufferWithLength:indicessize options:MTLResourceStorageModeShared];
1.105 +#if !__has_feature(objc_arc)
1.106 + [mtlbufquadindicesstaging autorelease];
1.107 +#endif
1.108 +
1.109 + /* Quads in the following vertex order (matches the FillRects vertices):
1.110 + * 1---3
1.111 + * | \ |
1.112 + * 0---2
1.113 + */
1.114 + UInt16 *indexdata = [mtlbufquadindicesstaging contents];
1.115 + for (int i = 0; i < quadcount; i++) {
1.116 + indexdata[i * 6 + 0] = i * 4 + 0;
1.117 + indexdata[i * 6 + 1] = i * 4 + 1;
1.118 + indexdata[i * 6 + 2] = i * 4 + 2;
1.119 +
1.120 + indexdata[i * 6 + 3] = i * 4 + 2;
1.121 + indexdata[i * 6 + 4] = i * 4 + 1;
1.122 + indexdata[i * 6 + 5] = i * 4 + 3;
1.123 + }
1.124 +
1.125 + id<MTLBuffer> mtlbufconstants = [data.mtldevice newBufferWithLength:CONSTANTS_LENGTH options:MTLResourceStorageModePrivate];
1.126 + data.mtlbufconstants = mtlbufconstants;
1.127 + data.mtlbufconstants.label = @"SDL constant data";
1.128 +
1.129 + id<MTLBuffer> mtlbufquadindices = [data.mtldevice newBufferWithLength:indicessize options:MTLResourceStorageModePrivate];
1.130 + data.mtlbufquadindices = mtlbufquadindices;
1.131 + data.mtlbufquadindices.label = @"SDL quad index buffer";
1.132 +
1.133 id<MTLCommandBuffer> cmdbuffer = [data.mtlcmdqueue commandBuffer];
1.134 id<MTLBlitCommandEncoder> blitcmd = [cmdbuffer blitCommandEncoder];
1.135
1.136 - [blitcmd copyFromBuffer:mtlbufconstantstaging sourceOffset:0 toBuffer:data.mtlbufconstants destinationOffset:0 size:CONSTANTS_LENGTH];
1.137 + [blitcmd copyFromBuffer:mtlbufconstantstaging sourceOffset:0 toBuffer:mtlbufconstants destinationOffset:0 size:CONSTANTS_LENGTH];
1.138 + [blitcmd copyFromBuffer:mtlbufquadindicesstaging sourceOffset:0 toBuffer:mtlbufquadindices destinationOffset:0 size:indicessize];
1.139
1.140 [blitcmd endEncoding];
1.141 [cmdbuffer commit];
1.142 @@ -1503,8 +1543,10 @@
1.143 #endif
1.144 #else
1.145 #ifdef __IPHONE_11_0
1.146 - if ([mtldevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily4_v1]) {
1.147 - maxtexsize = 16384;
1.148 + if (@available(iOS 11.0, *)) {
1.149 + if ([mtldevice supportsFeatureSet:MTLFeatureSet_iOS_GPUFamily4_v1]) {
1.150 + maxtexsize = 16384;
1.151 + }
1.152 } else
1.153 #endif
1.154 #ifdef __IPHONE_10_0
1.155 @@ -1529,6 +1571,7 @@
1.156 [mtlsamplernearest release];
1.157 [mtlsamplerlinear release];
1.158 [mtlbufconstants release];
1.159 + [mtlbufquadindices release];
1.160 [view release];
1.161 [data release];
1.162 [mtldevice release];