I am trying to compute the sum of a large array in parallel with Metal in Swift.
Is there a good way to do it?
My plan was to divide the array into sub-arrays, compute the sum of each sub-array in parallel, and then, once the parallel computation is finished, compute the sum of the sub-sums.
For example, if I have
array = [a_0, ..., a_n]
I divide the array into sub-arrays:
array_1 = [a_0, ..., a_i],
array_2 = [a_(i+1), ..., a_(2i)],
...
array_(n/i) = [a_(n-i+1), ..., a_n]
The sums of these sub-arrays are computed in parallel, giving
sum_1, sum_2, sum_3, ..., sum_(n/i)
and at the end I just compute the sum of the sub-sums.
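On the CPU, the plan would look roughly like this (the chunk size is arbitrary and just for illustration; on the GPU each chunk would be summed by one thread):
// Two-phase reduction on the CPU, for illustration only.
let array: [Float] = (0..<1_000).map { Float($0) }
let chunkSize = 100   // plays the role of the sub-array length i above

// Phase 1: one partial sum per chunk (the part I want to run in parallel on the GPU)
let partialSums: [Float] = stride(from: 0, to: array.count, by: chunkSize).map { start in
    let end = min(start + chunkSize, array.count)
    return array[start..<end].reduce(0, +)
}

// Phase 2: sum of the partial sums
let total = partialSums.reduce(0, +)   // equals array.reduce(0, +)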
I created an application that runs my Metal shader, but there are some things I don't quite understand.
var array:[[Float]] = [[1,2,3], [4,5,6], [7,8,9]]
// get device
let device: MTLDevice! = MTLCreateSystemDefaultDevice()
// get library
let defaultLibrary:MTLLibrary! = device.newDefaultLibrary()
// queue
let commandQueue:MTLCommandQueue! = device.newCommandQueue()
// function
let kernelFunction: MTLFunction! = defaultLibrary.newFunctionWithName("calculateSum")
// pipeline with function
let pipelineState: MTLComputePipelineState! = try device.newComputePipelineStateWithFunction(kernelFunction)
// buffer for function
let commandBuffer:MTLCommandBuffer! = commandQueue.commandBuffer()
// encode function
let commandEncoder:MTLComputeCommandEncoder = commandBuffer.computeCommandEncoder()
// add function to encode
commandEncoder.setComputePipelineState(pipelineState)
// options
let resourceOption = MTLResourceOptions()
let arrayByteLength = array.count * array[0].count * sizeofValue(array[0][0])
let arrayBuffer = device.newBufferWithBytes(&array, length: arrayByteLength, options: resourceOption)
commandEncoder.setBuffer(arrayBuffer, offset: 0, atIndex: 0)
var result:[Float] = [0,0,0]
let resultByteLength = sizeofValue(result[0])
let resultBuffer = device.newBufferWithBytes(&result, length: resultByteLength, options: resourceOption)
commandEncoder.setBuffer(resultBuffer, offset: 0, atIndex: 1)
let threadGroupSize = MTLSize(width: 1, height: 1, depth: 1)
let threadGroups = MTLSize(width: (array.count), height: 1, depth: 1)
commandEncoder.dispatchThreadgroups(threadGroups, threadsPerThreadgroup: threadGroupSize)
commandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let data = NSData(bytesNoCopy: resultBuffer.contents(), length: sizeof(Float), freeWhenDone: false)
data.getBytes(&result, length: result.count * sizeof(Float))
print(result)
That is my Swift code, and my shader is:
kernel void calculateSum(const device float *inFloat [[buffer(0)]],
                         device float *result [[buffer(1)]],
                         uint id [[ thread_position_in_grid ]]) {
    float * f = inFloat[id];
    float sum = 0;
    for (int i = 0 ; i < 3 ; ++i) {
        sum = sum + f[i];
    }
    result = sum;
}
I don't know how to specify that inFloat is an array of arrays. I don't know exactly what threadGroupSize and threadGroups are, and I don't know what device and uint mean in the shader parameters.
Is this the right approach?
I took the time to create a fully working example of this problem with Metal. The explanation is in the comments:
let count = 10_000_000
let elementsPerSum = 10_000
// Data type, has to be the same as in the shader
typealias DataType = CInt
let device = MTLCreateSystemDefaultDevice()!
let library = device.makeDefaultLibrary()!
let parsum = library.makeFunction(name: "parsum")!
let pipeline = try! device.makeComputePipelineState(function: parsum)
// Our data, randomly generated:
var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) }
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
// Number of individual results = count / elementsPerSum (rounded up):
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum
// Our data in a buffer (copied):
let dataBuffer = device.makeBuffer(bytes: &data, length: MemoryLayout<DataType>.stride * count, options: [])!
// A buffer for individual results (zero initialized)
let resultsBuffer = device.makeBuffer(length: MemoryLayout<DataType>.stride * resultsCount, options: [])!
// Our results in convenient form to compute the actual result later:
let pointer = resultsBuffer.contents().bindMemory(to: DataType.self, capacity: resultsCount)
let results = UnsafeBufferPointer<DataType>(start: pointer, count: resultsCount)
let queue = device.makeCommandQueue()!
let cmds = queue.makeCommandBuffer()!
let encoder = cmds.makeComputeCommandEncoder()!
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, index: 0)
encoder.setBytes(&dataCount, length: MemoryLayout<CUnsignedInt>.size, index: 1)
encoder.setBuffer(resultsBuffer, offset: 0, index: 2)
encoder.setBytes(&elementsPerSumC, length: MemoryLayout<CUnsignedInt>.size, index: 3)
// We need `resultsCount` partial sums => the number of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up), because each threadgroup runs `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Each threadgroup processes `threadExecutionWidth` threads; the only thing that matters for performance is that this number is a multiple of `threadExecutionWidth` (here, exactly 1x)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
var start, end : UInt64
var result : DataType = 0
start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
    result += elem
}
end = mach_absolute_time()
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
result = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
I used my Mac to test it, but it should work just fine on iOS.
Output:
Metal result: 494936505, time: 0.024611456
CPU result: 494936505, time: 0.163341018
The Metal version is about 7 times faster. I'm sure you could get even more speed by implementing something like divide-and-conquer with a cutoff; a rough sketch follows.
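For example, a CPU-side divide-and-conquer reduction with a cutoff could look like this (the `parallelSum` helper and the cutoff value are mine, not part of the answer above):
import Dispatch

// Recursively split the buffer in half, sum the two halves on separate threads,
// and fall back to a plain loop below the cutoff.
func parallelSum(_ buffer: UnsafeBufferPointer<DataType>, cutoff: Int = 100_000) -> DataType {
    if buffer.count <= cutoff {
        return buffer.reduce(0, +)
    }
    let mid = buffer.count / 2
    var left: DataType = 0
    var right: DataType = 0
    DispatchQueue.concurrentPerform(iterations: 2) { i in
        let half = i == 0
            ? UnsafeBufferPointer(rebasing: buffer[..<mid])
            : UnsafeBufferPointer(rebasing: buffer[mid...])
        let partial = parallelSum(half, cutoff: cutoff)
        if i == 0 { left = partial } else { right = partial }
    }
    return left + right
}

// Usage: data.withUnsafeBufferPointer { parallelSum($0) }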
The accepted answer is annoyingly missing the kernel that was written for it. The source is here, but below is the full program and shader that can be run as a Swift command-line application.
/*
* Command line Metal Compute Shader for data processing
*/
import Metal
import Foundation
//------------------------------------------------------------------------------
let count = 10_000_000
let elementsPerSum = 10_000
//------------------------------------------------------------------------------
typealias DataType = CInt // Data type, has to be the same as in the shader
//------------------------------------------------------------------------------
let device = MTLCreateSystemDefaultDevice()!
let library = device.makeDefaultLibrary()!
let parsum = library.makeFunction(name: "parsum")!
let pipeline = try! device.makeComputePipelineState(function: parsum)
//------------------------------------------------------------------------------
// Our data, randomly generated:
var data = (0..<count).map{ _ in DataType(arc4random_uniform(100)) }
var dataCount = CUnsignedInt(count)
var elementsPerSumC = CUnsignedInt(elementsPerSum)
// Number of individual results = count / elementsPerSum (rounded up):
let resultsCount = (count + elementsPerSum - 1) / elementsPerSum
//------------------------------------------------------------------------------
// Our data in a buffer (copied):
let dataBuffer = device.makeBuffer(bytes: &data, length: MemoryLayout<DataType>.stride * count, options: [])!
// A buffer for individual results (zero initialized)
let resultsBuffer = device.makeBuffer(length: MemoryLayout<DataType>.stride * resultsCount, options: [])!
// Our results in convenient form to compute the actual result later:
let pointer = resultsBuffer.contents().bindMemory(to: DataType.self, capacity: resultsCount)
let results = UnsafeBufferPointer<DataType>(start: pointer, count: resultsCount)
//------------------------------------------------------------------------------
let queue = device.makeCommandQueue()!
let cmds = queue.makeCommandBuffer()!
let encoder = cmds.makeComputeCommandEncoder()!
//------------------------------------------------------------------------------
encoder.setComputePipelineState(pipeline)
encoder.setBuffer(dataBuffer, offset: 0, index: 0)
encoder.setBytes(&dataCount, length: MemoryLayout<CUnsignedInt>.size, index: 1)
encoder.setBuffer(resultsBuffer, offset: 0, index: 2)
encoder.setBytes(&elementsPerSumC, length: MemoryLayout<CUnsignedInt>.size, index: 3)
//------------------------------------------------------------------------------
// We need `resultsCount` partial sums => the number of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up), because each threadgroup runs `threadExecutionWidth` threads
let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
// Each threadgroup processes `threadExecutionWidth` threads; the only thing that matters for performance is that this number is a multiple of `threadExecutionWidth` (here, exactly 1x)
let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
//------------------------------------------------------------------------------
encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
encoder.endEncoding()
//------------------------------------------------------------------------------
var start, end : UInt64
var result : DataType = 0
//------------------------------------------------------------------------------
start = mach_absolute_time()
cmds.commit()
cmds.waitUntilCompleted()
for elem in results {
    result += elem
}
end = mach_absolute_time()
//------------------------------------------------------------------------------
print("Metal result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
//------------------------------------------------------------------------------
result = 0
start = mach_absolute_time()
data.withUnsafeBufferPointer { buffer in
    for elem in buffer {
        result += elem
    }
}
end = mach_absolute_time()
print("CPU result: \(result), time: \(Double(end - start) / Double(NSEC_PER_SEC))")
//------------------------------------------------------------------------------
The shader (it has to live in a .metal file that gets compiled into the default library):
#include <metal_stdlib>
using namespace metal;

typedef unsigned int uint;
typedef int DataType;

kernel void parsum(const device DataType* data [[ buffer(0) ]],
                   const device uint& dataLength [[ buffer(1) ]],
                   device DataType* sums [[ buffer(2) ]],
                   const device uint& elementsPerSum [[ buffer(3) ]],
                   const uint tgPos [[ threadgroup_position_in_grid ]],
                   const uint tPerTg [[ threads_per_threadgroup ]],
                   const uint tPos [[ thread_position_in_threadgroup ]]) {
    uint resultIndex = tgPos * tPerTg + tPos;
    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    for (; dataIndex < endIndex; dataIndex++)
        sums[resultIndex] += data[dataIndex];
}
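A note on loading the library: `device.makeDefaultLibrary()` returns the library that Xcode builds from the `.metal` files in the target. If you build this as a bare command-line program without that setup, one alternative (just a sketch; the file name `Shaders.metal` is my assumption) is to compile the shader source at run time, in place of the `makeDefaultLibrary()` line above:
// Alternative to makeDefaultLibrary(): compile the MSL source at run time.
// "Shaders.metal" is an assumed file name containing the parsum kernel above.
let shaderSource = try String(contentsOfFile: "Shaders.metal", encoding: .utf8)
let library = try device.makeLibrary(source: shaderSource, options: nil)
let parsum = library.makeFunction(name: "parsum")!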