The code below uses a very straightforward approach to calculate the matrix product a * b and store the result in c. The code was compiled with -O3 on both GCC 4.4.6 (with -mtune=native) and Intel Compiler 13.0.1, and the speed with GCC is significantly worse (more than a factor of two slower for the sample data used).
I'm curious about the cause of these differences, but unfortunately I'm not familiar enough with the assembly output to understand what's going on here. At a glance it looks as though ICC is doing a better job of vectorizing the computation, but I can't decipher much more than that. (This is mainly for learning purposes since there's no way I would use this in production!)
void __attribute__ ((noinline)) mm( // Line 3
    int n,
    double *__restrict__ c,
    double *__restrict__ a,
    double *__restrict__ b
) {
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            c[i + n * j] = 0; // Line 12
            for (k = 0; k < n; k++) {
                c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
            }
        }
    }
}
Here is the output from GCC:
_Z2mmiPdS_S_:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
pushq %r14 #
.cfi_def_cfa_offset 16
.cfi_offset 14, -16
testl %edi, %edi # n
movq %rcx, %r14 # b, b
pushq %r13 #
.cfi_def_cfa_offset 24
.cfi_offset 13, -24
pushq %r12 #
.cfi_def_cfa_offset 32
.cfi_offset 12, -32
pushq %rbp #
.cfi_def_cfa_offset 40
.cfi_offset 6, -40
pushq %rbx #
.cfi_def_cfa_offset 48
.cfi_offset 3, -48
jle .L6 #,
leal -1(%rdi), %eax #, tmp96
movslq %edi, %r11 # n, n
movq %rdx, %rbx # a, ivtmp.54
xorl %r12d, %r12d # ivtmp.67
salq $3, %r11 #, D.2193
xorl %ebp, %ebp # prephitmp.37
leaq 8(,%rax,8), %r13 #, D.2208
.L3:
leaq (%rsi,%r12), %r10 #, ivtmp.61
movq %r14, %rcx # b, ivtmp.63
xorl %edx, %edx # j
.p2align 4,,10
.p2align 3
.L5:
movq $0, (%r10) #,* ivtmp.61
movq %rbp, -8(%rsp) # prephitmp.37,
movq %rcx, %r9 # ivtmp.63, ivtmp.70
movsd -8(%rsp), %xmm1 #, prephitmp.37
movq %rbx, %r8 # ivtmp.54, ivtmp.69
xorl %eax, %eax # k
.p2align 4,,10
.p2align 3
.L4:
movsd (%r8), %xmm0 #* ivtmp.69, tmp99
addl $1, %eax #, k
addq %r11, %r8 # D.2193, ivtmp.69
mulsd (%r9), %xmm0 #* ivtmp.70, tmp99
addq $8, %r9 #, ivtmp.70
cmpl %edi, %eax # n, k
addsd %xmm0, %xmm1 # tmp99, prephitmp.37
movsd %xmm1, (%r10) # prephitmp.37,* ivtmp.61
jne .L4 #,
addl $1, %edx #, j
addq %r11, %r10 # D.2193, ivtmp.61
addq %r11, %rcx # D.2193, ivtmp.63
cmpl %edi, %edx # n, j
jne .L5 #,
addq $8, %r12 #, ivtmp.67
addq $8, %rbx #, ivtmp.54
cmpq %r13, %r12 # D.2208, ivtmp.67
jne .L3 #,
.L6:
popq %rbx #
.cfi_def_cfa_offset 40
popq %rbp #
.cfi_def_cfa_offset 32
popq %r12 #
.cfi_def_cfa_offset 24
popq %r13 #
.cfi_def_cfa_offset 16
popq %r14 #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
And here is the output from ICC:
# -- Begin _Z2mmiPdS_S_
# mark_begin;
.align 16,0x90
.globl _Z2mmiPdS_S_
_Z2mmiPdS_S_:
# parameter 1: %edi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %rcx
..B1.1: # Preds ..B1.0
..___tag_value__Z2mmiPdS_S_.1: #8.3
pushq %r12 #8.3
..___tag_value__Z2mmiPdS_S_.3: #
pushq %r13 #8.3
..___tag_value__Z2mmiPdS_S_.5: #
pushq %r14 #8.3
..___tag_value__Z2mmiPdS_S_.7: #
pushq %r15 #8.3
..___tag_value__Z2mmiPdS_S_.9: #
pushq %rbx #8.3
..___tag_value__Z2mmiPdS_S_.11: #
pushq %rbp #8.3
..___tag_value__Z2mmiPdS_S_.13: #
subq $72, %rsp #8.3
..___tag_value__Z2mmiPdS_S_.15: #
movq %rsi, %r9 #
movslq %edi, %rax #
xorl %r10d, %r10d #11.9
testl %edi, %edi #11.25
jle ..B1.7 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r9 r12 r13 r14 r15 edi r10d
..B1.2: # Preds ..B1.1
movl %edi, %r11d #10.5
lea (,%rax,8), %r8 #
andl $-4, %r11d #10.5
movq %rax, %r14 #12.28
movslq %r11d, %r11 #10.5
movl %edi, %r12d #12.28
movq %rsi, 8(%rsp) #12.28
movq %r8, %rbp #12.28
movq %rdx, 32(%rsp) #12.28
movq %r9, %r13 #12.28
movq %rcx, (%rsp) #12.28
movl %r10d, %r15d #12.28
pxor %xmm0, %xmm0 #12.28
movq %r11, %rbx #12.28
# LOE rbx rbp r13 r14 r12d r15d
..B1.3: # Preds ..B1.5 ..B1.48 ..B1.45 ..B1.2
cmpl $12, %r12d #10.5
jle ..B1.38 # Prob 0% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.4: # Preds ..B1.3
movq %r13, %rdi #12.13
xorl %esi, %esi #12.13
movq %rbp, %rdx #12.13
call _intel_fast_memset #12.13
# LOE rbx rbp r13 r14 r12d r15d
..B1.5: # Preds ..B1.4
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.6: # Preds ..B1.48 ..B1.45 ..B1.5 # Infreq
movl %r12d, %edi #
movq %r14, %rax #
movq 8(%rsp), %rsi #
testl %edi, %edi #11.25
movq 32(%rsp), %rdx #
movq (%rsp), %rcx #
# LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 edi
..B1.7: # Preds ..B1.1 ..B1.6 # Infreq
movl $0, %r9d #11.9
movl $0, %r8d #
jle ..B1.33 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r8 r12 r13 r14 r15 edi r9d
..B1.8: # Preds ..B1.7 # Infreq
movq %rdx, 32(%rsp) #
# LOE rax rcx rsi r8 edi r9d
..B1.9: # Preds ..B1.31 ..B1.8 # Infreq
xorl %r12d, %r12d #
lea (%rsi,%r8,8), %r13 #14.17
movq %r13, %r15 #10.5
xorl %ebx, %ebx #13.13
andq $15, %r15 #10.5
xorl %r10d, %r10d #
movl %r15d, %r14d #10.5
lea (%rcx,%r8,8), %rbp #14.48
andl $7, %r14d #10.5
xorl %r11d, %r11d #
movl %r14d, 48(%rsp) #
xorl %edx, %edx #
movl %r15d, 56(%rsp) #
movq %r13, 40(%rsp) #
movq %r8, 16(%rsp) #
movl %r9d, 24(%rsp) #
movq %rsi, 8(%rsp) #
movq %rcx, (%rsp) #
movq 32(%rsp), %r14 #
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.10: # Preds ..B1.30 ..B1.9 # Infreq
cmpq $8, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.11: # Preds ..B1.10 # Infreq
movl 56(%rsp), %r9d #10.5
testl %r9d, %r9d #10.5
je ..B1.14 # Prob 50% #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.12: # Preds ..B1.11 # Infreq
cmpl $0, 48(%rsp) #10.5
jne ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.13: # Preds ..B1.12 # Infreq
movl $1, %r9d #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.14: # Preds ..B1.13 ..B1.11 # Infreq
movl %r9d, %r13d #10.5
lea 8(%r13), %rcx #10.5
cmpq %rcx, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r9 r10 r12 r13 r14 ebx edi r11d
..B1.15: # Preds ..B1.14 # Infreq
movl %edi, %r15d #10.5
xorl %ecx, %ecx #10.5
subl %r9d, %r15d #10.5
movslq %r11d, %r8 #14.33
andl $7, %r15d #10.5
negl %r15d #10.5
addl %edi, %r15d #10.5
movslq %r15d, %r15 #10.5
testq %r13, %r13 #10.5
lea (%r14,%r8,8), %rsi #14.33
jbe ..B1.35 # Prob 0% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d
..B1.16: # Preds ..B1.15 # Infreq
movsd (%r10,%rbp), %xmm0 #14.48
movq 40(%rsp), %r14 #14.48
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.17: # Preds ..B1.17 ..B1.16 # Infreq
movsd (%rsi,%rcx,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%r14,%rcx,8), %xmm1 #14.17
movsd %xmm1, (%r14,%rcx,8) #14.17
incq %rcx #10.5
cmpq %r13, %rcx #10.5
jb ..B1.17 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.18: # Preds ..B1.17 # Infreq
movq 32(%rsp), %r14 #
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.19: # Preds ..B1.18 ..B1.35 # Infreq
addq %r9, %r8 #14.33
lea (%r14,%r8,8), %rcx #14.33
testq $15, %rcx #10.5
je ..B1.23 # Prob 60% #10.5
# LOE rax rdx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.20: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.21: # Preds ..B1.21 ..B1.20 # Infreq
movsd (%rsi,%r13,8), %xmm1 #14.33
movsd 16(%rsi,%r13,8), %xmm2 #14.33
movsd 32(%rsi,%r13,8), %xmm3 #14.33
movsd 48(%rsi,%r13,8), %xmm4 #14.33
movhpd 8(%rsi,%r13,8), %xmm1 #14.33
movhpd 24(%rsi,%r13,8), %xmm2 #14.33
movhpd 40(%rsi,%r13,8), %xmm3 #14.33
movhpd 56(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.21 # Prob 82% #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.23: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
.align 16,0x90
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.24: # Preds ..B1.24 ..B1.23 # Infreq
movaps (%rsi,%r13,8), %xmm1 #14.33
movaps 16(%rsi,%r13,8), %xmm2 #14.33
movaps 32(%rsi,%r13,8), %xmm3 #14.33
movaps 48(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.24 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.26: # Preds ..B1.24 ..B1.21 ..B1.34 # Infreq
cmpq %rax, %r15 #10.5
jae ..B1.30 # Prob 0% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.27: # Preds ..B1.26 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
lea (%r14,%rdx,8), %rcx #14.33
movq 40(%rsp), %rsi #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.28: # Preds ..B1.28 ..B1.27 # Infreq
movsd (%rcx,%r15,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%rsi,%r15,8), %xmm1 #14.17
movsd %xmm1, (%rsi,%r15,8) #14.17
incq %r15 #10.5
cmpq %rax, %r15 #10.5
jb ..B1.28 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.30: # Preds ..B1.28 ..B1.26 # Infreq
incl %ebx #13.13
addq %rax, %rdx #13.13
addl %edi, %r11d #13.13
addq $8, %r10 #13.13
incq %r12 #13.13
cmpl %edi, %ebx #13.13
jb ..B1.10 # Prob 82% #13.13
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.31: # Preds ..B1.30 # Infreq
movl 24(%rsp), %r9d #
incl %r9d #11.9
movq 16(%rsp), %r8 #
addq %rax, %r8 #11.9
movq 8(%rsp), %rsi #
cmpl %edi, %r9d #11.9
movq (%rsp), %rcx #
jb ..B1.9 # Prob 82% #11.9
# LOE rax rcx rsi r8 edi r9d
..B1.33: # Preds ..B1.31 ..B1.7 # Infreq
addq $72, %rsp #18.1
..___tag_value__Z2mmiPdS_S_.16: #
popq %rbp #18.1
..___tag_value__Z2mmiPdS_S_.18: #
popq %rbx #18.1
..___tag_value__Z2mmiPdS_S_.20: #
popq %r15 #18.1
..___tag_value__Z2mmiPdS_S_.22: #
popq %r14 #18.1
..___tag_value__Z2mmiPdS_S_.24: #
popq %r13 #18.1
..___tag_value__Z2mmiPdS_S_.26: #
popq %r12 #18.1
..___tag_value__Z2mmiPdS_S_.28: #
ret #18.1
..___tag_value__Z2mmiPdS_S_.29: #
# LOE
..B1.34: # Preds ..B1.10 ..B1.14 ..B1.12 # Infreq
xorl %r15d, %r15d #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.35: # Preds ..B1.15 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
jmp ..B1.19 # Prob 100% #14.48
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.38: # Preds ..B1.3 # Infreq
cmpq $4, %r14 #10.5
jl ..B1.47 # Prob 10% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.39: # Preds ..B1.38 # Infreq
xorl %esi, %esi #10.5
movq %rbx, %rdx #10.5
movq %r13, %rcx #
xorl %eax, %eax #
pxor %xmm0, %xmm0 #
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.40: # Preds ..B1.40 ..B1.39 # Infreq
addq $4, %rsi #10.5
movq %rax, (%rcx) #12.13
movhpd %xmm0, 8(%rcx) #12.13
movq %rax, 16(%rcx) #12.13
movhpd %xmm0, 24(%rcx) #12.13
addq $32, %rcx #10.5
cmpq %rbx, %rsi #10.5
jb ..B1.40 # Prob 82% #10.5
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.42: # Preds ..B1.40 ..B1.47 # Infreq
cmpq %r14, %rdx #10.5
jae ..B1.48 # Prob 0% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.43: # Preds ..B1.42 # Infreq
xorl %ecx, %ecx #
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.44: # Preds ..B1.44 ..B1.43 # Infreq
movq %rcx, (%r13,%rdx,8) #12.13
incq %rdx #10.5
cmpq %r14, %rdx #10.5
jb ..B1.44 # Prob 82% #10.5
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.45: # Preds ..B1.44 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.47: # Preds ..B1.38 # Infreq
xorl %edx, %edx #10.5
jmp ..B1.42 # Prob 100% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.48: # Preds ..B1.42 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
.align 16,0x90
..___tag_value__Z2mmiPdS_S_.36: #
# LOE rbx rbp r13 r14 r12d r15d
# mark_end;
.type _Z2mmiPdS_S_,@function
.size _Z2mmiPdS_S_,.-_Z2mmiPdS_S_
.data
# -- End _Z2mmiPdS_S_
Edit: With the help of Olaf Dietsche's answer, it looks like the code below runs much faster with GCC 4.8.2, though still about 30% slower than ICC. The main differences are that the initialization is now done ahead of time (this by itself makes no difference) and that the loop ordering has been interchanged (this is what makes the major difference for GCC).
memset(c, 0, n * n * sizeof(double));
for (j = 0; j < n; j++) {
    for (k = 0; k < n; k++) {
        for (i = 0; i < n; i++) {
            c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
        }
    }
}
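Put together as a complete function, the interchanged-loop version could look like the sketch below (mm2 is a made-up name, and this is not the exact benchmark code; it assumes <cstring> for memset). The point is that the innermost loop now walks c and a with stride 1, which is what GCC manages to vectorize.
#include <cstring>

void __attribute__ ((noinline)) mm2(
    int n,
    double *__restrict__ c,
    double *__restrict__ a,
    double *__restrict__ b
) {
    memset(c, 0, sizeof(double) * (size_t)n * n);   // zero all n*n doubles
    for (int j = 0; j < n; j++) {
        for (int k = 0; k < n; k++) {
            for (int i = 0; i < n; i++) {            // contiguous in c and a
                c[i + n * j] += a[i + n * k] * b[k + n * j];
            }
        }
    }
}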
Your code seems to be wrong or not suitable for vectorization.
When I modify your code according to this blog post, Performance – GCC & auto-vectorization:
int i, j, k;
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        for (k = 0; k < n; k++) {
            c[n * i + k] += a[n * i + j] * b[n * j + k]; // Line 14
        }
    }
}
and compile it with
gcc-4.8 -O3 -S a.c
it uses at least some SIMD instructions:
.L8:
movsd (%rcx), %xmm1
addl $1, %r8d
movsd (%rdx,%rsi), %xmm2
unpcklpd %xmm1, %xmm1
movhpd 8(%rdx,%rsi), %xmm2
movsd (%rax,%rsi), %xmm0
mulpd %xmm2, %xmm1
movhpd 8(%rax,%rsi), %xmm0
addpd %xmm1, %xmm0
movlpd %xmm0, (%rax,%rsi)
movhpd %xmm0, 8(%rax,%rsi)
addq $16, %rsi
cmpl %r8d, %ebx
ja .L8
cmpl %edi, %r15d
je .L9
although not as much as ICC does.
Update:
Adding -funroll-loops enlarges the generated assembly code substantially, to about the length of your posted ICC assembly.
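For reference, the unrolled build can be produced with a command along these lines (the exact invocation is an assumption, based on the one shown above):
gcc-4.8 -O3 -funroll-loops -S a.c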
It looks like the Intel compiler is using packed SIMD instructions (mulpd, addpd, movaps, etc.), so it performs two double-precision operations (e.g. both a = b*c and d = e*f) per instruction, whereas the scalar GCC code needs two instructions to do the same. I'm not sure whether GCC can be persuaded to generate these automatically for this loop, but you can hand-write them with some work (see the sketch below).
It seems that passing the flags -msse, -msse2, and -msse3 to GCC causes it to attempt some SIMD optimization of its own.