in the following iteration, the position in the vector "result" is determined by the "counter". Please, also look at the starting values in the FOR-loop, which shows that some iterations are left out.
int counter = (int)0;
for (z=0; z<N; z++)
for (y=z; y<N; y++)
for (x=y; x<N; x++)
{
result[counter] = A[z] + A[y] + A[x];
counter++;
}
However, if I translate this iteration into a 3d kernel I wonder how I can provide each thread with the correct position to write in the vector "result"? Is there a mathematical solution to calculate out the missing threads?
int idz = blockIdx.z*blockDim.z + threadIdx.z;
int idy = blockIdx.y*blockDim.y + threadIdx.y;
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx >= idy && idy >= idz)
{
result[?????] = A[idz] + A[idy] + A[idx];
}
I tried to skip the missings with the Gauss equation:
0.5 * idx * (idx + 1)
But I am not even close to the solution. Does someone has an idea?
I imagine there might be more than one way to tackle this. I shall sketch one approach.
The formulas for the sum of the first n integers and sum of first n integers squared are readily available as:
Σi i:1..n = (n)(n+1)/2
Σi^2 i:1..n = (n)(n+1)(2n+1)/6
Let's start by building a table showing the amount of loop iterations in x, for each z,y position. To visualize this it will help if we choose a small number for N, like 4:
z| 0 1 2 3
y
-
0 4
1 3 3
2 2 2 2
3 1 1 1 1
From the above table, we can observe that the sum of any column is the sum of the first n integers, where n can be deduced from N and z index. (n = N-z).
We can use this observation, plus the formula (indicated above) for the sum of the first n integers, to compute the offset (into the result array) for a given z value. This offset will be the sum of the columns to the left of z. We can observe from the above pattern that the offset contributed by the z index is:
Σ(i)(i+1)/2 i: (N-z)..N
which is:
1/2 * ( Σ(i^2) + Σi) i: (N-z)..N
now, summing over (N-z)..N is inconvenient, so to realize this using the formulas presented above, we shall rewrite each summation as the difference of two summations, one from 1..N minus one from 1..z:
1/2 * ( Σ(i^2)|1..N - Σ(i^2)|1..z + Σi|1..N - Σi|1..z) )
If we substitute our formulas for the sums presented in the beginning into the above expression, we can come up with a closed-form expression for the offset contributed by the z index.
(((N*(N+1)*(2*N + 1) - (N-z)*(N-z+1)*(2*(N-z)+1))/3 + (N*(N+1) - (N-z)*(N-z+1)))/2
Now we need to come up with a formula for the offset (into the result array) contributed by the y index. Again, by inspection of the pattern in the z,y array presented earlier, we can observe that this is given by:
Σi i: (N-y)..(N-z)
We can likewise convert this to the difference of two summations for calculation purposes, and then use our formula for the sum of the integers:
((N-z)(N-z+1) - (N-y)(N-y+1))/2
We also have to make an adjustment to the x index.
Here's a full example:
$ cat t112.cu
#include <iostream>
template <typename T>
void on_cpu(int N, T *A, T *result){
int counter = 0;
int x,y,z;
for (z=0; z<N; z++)
for (y=z; y<N; y++)
for (x=y; x<N; x++){
result[counter] = A[z] + A[y] + A[x];
counter++;}
std::cout << " counter = " << counter << std::endl;
}
__host__ __device__ int compute_z_offset(int N, int z){
int i = N-z;
// 0.5*(sum n^2|1..N - sum n^2|1..i + sum n|1..N - sum n|1..i )
int sumn2 = ((N*(N+1)*(2*N + 1)) - (i*(i+1)*(2*i + 1)))/3;
int sumn = ((N*(N+1)) - (i*(i+1)));
return (sumn2 + sumn)>>2;
}
__host__ __device__ int compute_y_offset(int N, int z, int y){
int i = N-z;
int j = N-y;
return (i*(i+1) - j*(j+1))>>1;
}
template <typename T>
__global__ void on_gpu(int N, T *A, T *result){
int idz = blockIdx.z*blockDim.z + threadIdx.z;
int idy = blockIdx.y*blockDim.y + threadIdx.y;
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx >= idy && idy >= idz && idx < N && idy < N && idz < N){
result[compute_z_offset(N, idz) + compute_y_offset(N, idz, idy) + (idx-idy)] = A[idz] + A[idy] + A[idx];
}
}
int main(){
const int N = 9;
int *A, *result, *d_A, *d_result, *h_result;
A = new int[N];
result = new int[N*N*N];
h_result = new int[N*N*N];
cudaMalloc(&d_result, N*N*N*sizeof(result[0]));
cudaMalloc(&d_A, N*sizeof(A[0]));
for (int i = 0; i < N; i++) A[i] = i+1;
cudaMemcpy(d_A, A, N*sizeof(A[0]), cudaMemcpyHostToDevice);
cudaMemset(d_result, 0, N*N*N*sizeof(result[0]));
on_cpu(N, A, result);
dim3 block(8,8,8);
dim3 grid((N+block.x-1)/block.x, (N+block.y-1)/block.y, (N+block.z-1)/block.z);
on_gpu<<<grid, block>>>(N, d_A, d_result);
cudaMemcpy(h_result, d_result, N*N*N*sizeof(result[0]), cudaMemcpyDeviceToHost);
for (int i = 0; i < N*N*N; i++) if (result[i] != h_result[i]) {std::cout << "mismatch at: " << i << " was: " << h_result[i] << " should be: " << result[i] << std::endl; break;}
}
$ nvcc -o t112 t112.cu
$ cuda-memcheck ./t112
========= CUDA-MEMCHECK
counter = 165
========= ERROR SUMMARY: 0 errors
$
I have not tested this thoroughly. My purpose is to demonstrate how it might be done, but there may be defects. Also, I imagine the indexing calculations perhaps could be simplified (combine like terms, etc.) For convenience, I simply set the size of the result arrays to N*N*N, although not all those locations would be "used". You can determine the necessary size analytically using the formulas I have already presented here.
Here is a slightly edited version of the code. Mostly it is just modified to allow for testing various N values, but I also did some like terms combination in the z-offset calculation. It seems to work up to N values of 1000, which is as far as I wanted to test. I've also demonstrated how to pre-compute the size needed for the output array.
#include <iostream>
#include <cstdlib>
template <typename T>
int on_cpu(int N, T *A, T *result){
int counter = 0;
int x,y,z;
for (z=0; z<N; z++)
for (y=z; y<N; y++)
for (x=y; x<N; x++){
result[counter] = A[z] + A[y] + A[x];
counter++;}
std::cout << " N: " << N << " counter = " << counter << std::endl;
return counter;
}
__host__ __device__ int compute_z_offset(int N, int z){
int i = N-z;
// 0.5*(sum n^2|1..N - sum n^2|1..i + sum n|1..N - sum n|1..i
//int sumn2 = ((N*(N+1)*(2*N + 1)) - (i*(i+1)*(2*i + 1)))/3;
//int sumn = ((N*(N+1)) - (i*(i+1)));
//return (sumn2 + sumn)>>2;
return ((N*(N+1)*(2*N + 4)) - (i*(i+1)*(2*i + 4)))/12;
}
__host__ __device__ int compute_y_offset(int N, int z, int y){
int i = N-z;
int j = N-y;
return (i*(i+1) - j*(j+1))>>1;
}
template <typename T>
__global__ void on_gpu(int N, T *A, T *result){
int idz = blockIdx.z*blockDim.z + threadIdx.z;
int idy = blockIdx.y*blockDim.y + threadIdx.y;
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx >= idy && idy >= idz && idx < N && idy < N && idz < N)
{
result[compute_z_offset(N, idz) + compute_y_offset(N, idz, idy) + (idx-idy)] = A[idz] + A[idy] + A[idx];
}
}
int main(int argc, char *argv[]){
int N = 32;
if (argc > 1) N = atoi(argv[1]);
int *A, *result, *d_A, *d_result, *h_result;
A = new int[N];
int max_N_size = compute_z_offset(N, N-1) + 1;
result = new int[max_N_size];
h_result = new int[max_N_size];
cudaMalloc(&d_result, sizeof(result[0])*max_N_size);
cudaMalloc(&d_A, N*sizeof(A[0]));
for (int my_N = 1; my_N<N; my_N++){
for (int i = 0; i < my_N; i++) A[i] = i+1;
int N_size = on_cpu(my_N, A, result);
int test_N_size = compute_z_offset(my_N, my_N-1) + 1;
if (test_N_size != N_size) std::cout << "mismatch on size calc" << std::endl;
cudaMemcpy(d_A, A, sizeof(A[0])*my_N, cudaMemcpyHostToDevice);
cudaMemset(d_result, 0, sizeof(result[0])*N_size);
dim3 block(8,8,8);
dim3 grid((my_N+block.x-1)/block.x, (my_N+block.y-1)/block.y, (my_N+block.z-1)/block.z);
on_gpu<<<grid, block>>>(my_N, d_A, d_result);
cudaMemcpy(h_result, d_result, sizeof(result[0])*N_size, cudaMemcpyDeviceToHost);
for (int i = 0; i < N_size; i++) if (result[i] != h_result[i]) {std::cout << "mismatch at: " << i << " was: " << h_result[i] << " should be: " << result[i] << std::endl; break;}
}
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With