I’m working with a matrix multiplication implementation from llm.c project, specifically from this file.
There are two implementations provided:
matmul_forward_cpu: A basic matrix multiplication loop.matmul_forward_ngc92: An optimized version using loop unrolling and caching.I’ve compiled these using gcc with the -Ofast flag, and the performance improvements have been impressive. Here are the results:
matmul_forward_cpu: 5.5 secondsmatmul_forward_ngc92: 2.8 secondsWithout -Ofast, the performance is much worse (more than 2 minutes), but with -Ofast, it’s clear that the compiler is applying heavy optimizations.
What I’ve Tried:
I’ve tried manually implementing AVX and NEON instructions to beat the matmul_forward_ngc92 implementation, but it seems the -Ofast flag is already applying SIMD optimizations, and I haven't been able to surpass the 2.8-second result.
My Question:
Does anyone have further suggestions on how to optimize the matmul_forward_ngc92 function, or are there specific CPU architecture features or other techniques that could further improve the performance? My goal is to push it beyond the 2.8-second mark.
One thing that comes to my mind is use OpenMP to utilize multiple cores
Code is as follows (from llm.c):
void matmul_forward_cpu(float* out,
const float* inp, const float* weight, const float* bias,
int B, int T, int C, int OC) {
// OC is short for "output channels"
// inp is (B,T,C), weight is (OC, C), bias is (OC)
// out will be (B,T,OC)
for (int b = 0; b < B; b++) {
for (int t = 0; t < T; t++) {
float* out_bt = out + b * T * OC + t * OC;
const float* inp_bt = inp + b * T * C + t * C;
for (int o = 0; o < OC; o++) {
float val = (bias != NULL) ? bias[o] : 0.0f;
const float* wrow = weight + o*C;
for (int i = 0; i < C; i++) {
val += inp_bt[i] * wrow[i];
}
out_bt[o] = val;
}
}
}
}
void matmul_forward_ngc92(float* out,
const float* inp, const float* weight, const float* bias,
int B, int T, int C, int OC) {
// most of the running time is spent here and in matmul_backward
// OC is short for "output channels"
// inp is (B,T,C), weight is (OC, C), bias is (OC)
// out will be (B,T,OC)
// make sure the tiled loop will be correct, otherwise, fallback to slow version
#define LOOP_UNROLL 8
if (B * T % LOOP_UNROLL != 0) {
printf("MUST BE A MULTIPLE OF 8"); // FIXME
return;
}
// collapse the B and T loops into one and turn it into a strided loop.
// then we can tile the inner loop, and reuse the loaded weight LOOP_UNROLL many times
// for significant speed-ups.
for (int obt = 0; obt < B * T; obt += LOOP_UNROLL) {
for (int o = 0; o < OC; o++) {
// keep LOOP_UNROLL many results in register, initialized by the bias term.
float result[LOOP_UNROLL];
for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) {
result[ibt] = (bias != NULL) ? bias[o] : 0.0f;
}
// inner loops. Because we do LOOP_UNROLL steps of inner bt, we can cache
// the value of weight[i + o * C] and reuse it.
// we compile with -Ofast, so the compiler will turn the inner loop into a bunch of FMAs
for (int i = 0; i < C; i++) {
float w = weight[i + o * C];
for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) {
int bt = obt + ibt;
result[ibt] += inp[bt * C + i] * w;
}
}
// write back results to main memory
for (int ibt = 0; ibt < LOOP_UNROLL; ++ibt) {
int bt = obt + ibt;
out[bt * OC + o] = result[ibt];
}
}
}
}
Here is the test code I used.
#define NUM_KERNELS 2
void matmul_forward(int kernel_num,
float* out,
const float* inp, const float* weight, const float* bias,
int B, int T, int C, int OC) {
switch (kernel_num) {
case 0:
matmul_forward_cpu(out, inp, weight, bias, B, T, C, OC);
break;
case 1:
matmul_forward_ngc92(out, inp, weight, bias, B, T, C, OC);
break;
default:
printf("Invalid kernel number\n");
exit(1);
}
}
void validate_results_cpu(const float* device_result, const float* cpu_reference, const char* name, int num_elements, float tolerance);
float* make_random_float(size_t N);
int main(int argc, char **argv) {
srand(0);
int B = 8;
int T = 1024;
int C = 768;
int OC = 768 * 4; // expansion of 4, e.g. in the MLP
int RUNS = 4; // number of times to run a kernel for benchmarks
srand(137);
float* out = make_random_float(B * T * OC);
float* inp = make_random_float(B * T * C);
float* weight = make_random_float(OC * C);
float* bias = make_random_float(OC);
printf("> Calculating reference\n");
matmul_forward_cpu(out, inp, weight, bias, B, T, C, OC);
for (int kernel_num = 0; kernel_num < NUM_KERNELS; kernel_num++) {
printf("> Verifying kernel #%d\n", kernel_num);
srand(137);
float* kernel_out = make_random_float(B * T * OC);
float* kernel_inp = make_random_float(B * T * C);
float* kernel_weight = make_random_float(OC * C);
float* kernel_bias = make_random_float(OC);
// float* kernel_bias = NULL;
matmul_forward(kernel_num, kernel_out, kernel_inp, kernel_weight, kernel_bias, B, T, C, OC);
validate_results_cpu(kernel_out, out, "out", B * T * OC, 1e-5);
free(kernel_out);
free(kernel_inp);
free(kernel_weight);
free(kernel_bias);
}
printf("All kernels passed! Starting benchmarks.\n\n");
for (int kernel_num = 0; kernel_num < NUM_KERNELS; kernel_num++) {
printf("> Running kernel #%d\n", kernel_num);
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
for (int i = 0; i < RUNS; i++) {
matmul_forward(kernel_num, out, inp, weight, bias, B, T, C, OC);
}
clock_gettime(CLOCK_MONOTONIC, &end);
double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
printf("> Kernel #%d, (took %f ms)\n", kernel_num, time_elapsed_s * 1000);
}
// free memory
free(out);
free(inp);
free(weight);
free(bias);
return 0;
}
float* make_random_float(size_t N) {
float* arr = (float*)malloc(N * sizeof(float));
for (size_t i = 0; i < N; i++) {
arr[i] = ((float)rand() / RAND_MAX) * 2.0 - 1.0; // range -1..1
}
return arr;
}
void validate_results_cpu(const float* kernel_result, const float* cpu_reference, const char* name, int num_elements, float tolerance) {
int nfaults = 0;
for (int i = 0; i < num_elements; i++) {
// print the first few comparisons
if (i < 5) {
printf("%f %f\n", cpu_reference[i], kernel_result[i]);
}
float t_eff = tolerance + fabs(cpu_reference[i]);
// ensure correctness for all elements.
if (fabs(cpu_reference[i] - kernel_result[i]) > t_eff) {
printf("Mismatch of %s at %d: CPU_ref: %f vs CPU_new: %f\n", name, i, cpu_reference[i], kernel_result[i]);
nfaults++;
if (nfaults >= 10) {
exit(EXIT_FAILURE);
}
}
}
if (nfaults > 0) {
exit(EXIT_FAILURE);
}
printf("OK\n");
}
I tested the following options based on the suggestions, and here are the results from my system.
I ran the tests on an Apple M2 CPU with 16GB RAM.
I didn't specify the -march flag, assuming it would use the default for my system, apple-m2.
| method | runtime for matmul_forward_cpu (ms) | runtime for matmul_forward_ngc92 (ms) |
|---|---|---|
| with -Ofast (my main method) | 5,578 | 2,881 |
| with restrict and -Ofast | 5,587 | 2,869 |
| with -O2 -fno-tree-vectorize | 81,621 | 11,648 |
| with -O3 | 55,174 | 11,642 |
restrict
Given a function like:
void matmul_forward_cpu(float* out,
const float* inp, const float* weight, const float* bias,
int B, int T, int C, int OC) {
...
The compiler is not allowed to assume the data pointed to by out does not overlap the data pointed to by inp, weight nor bias.
This restriction prevents some optimizations. See `restrict'
If there is never overlap in various pointers' data, let the compiler know that to allow additional optimizations:
void matmul_forward_cpu_alt(float* restrict out,
const float* restrict inp, const float* restrict weight, const float* restrict bias,
int B, int T, int C, int OC) {
...
A deep understanding of restrict is not trivial nor do compilers always use it well. Good luck.
This applies to matmul_forward_ngc92() too.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With