Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Squeeze more speed from Common Lisp / SBCL

This paper claims to make a certain Lisp program run faster than its C equivalent. Trying to reproduce the results, I was able to get close (Lisp is 50% slower than C) but wanted to know if anyone knows ways to squeeze more perf out of SBCL 1.3.1.

The target problem is adding a constant single float to every cell in an 800 x 800 array of single floats. The method is to write the program in C and in Common Lisp and compare the times. Using this portable timer, the C code is as follows:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>

#include "./modules/tictoc/tictoc.h"

/* Image dimensions: 800 x 800 cells. */
const int HORZ = 800;
const int VERT = 800;

/* Number of timed repetitions of the add() kernel. */
#define PERF_REPS 1000

/* Cell type; single-precision float to match Lisp's single-float. */
typedef float DATA_T;

/* A 2-D image stored as one flat, heap-allocated buffer of n = w*h cells. */
struct image_s {
    size_t n;        /* total cell count, w * h */
    size_t w, h;     /* dimensions in cells */
    DATA_T * data;   /* flat cell buffer, owned by the image */
};
typedef struct image_s image;

/*
 * Allocate a w x h image together with its flat data buffer.
 * Allocation failure aborts via assert, matching the file's
 * error-handling convention.  The caller owns the result and must
 * release it with free_image().
 */
image * make_image (size_t w, size_t h) {
    /* Guard the w*h multiplication against size_t overflow. */
    assert (h == 0 || w <= SIZE_MAX / h);
    size_t n = w * h;
    /* Idiomatic C: no cast on malloc, and sizeof the dereferenced
     * pointer so the size stays correct if DATA_T ever changes. */
    DATA_T * data = malloc(sizeof *data * n);
    assert (NULL != data);
    image * result = malloc(sizeof *result);
    assert (NULL != result);
    result->n = n;
    result->w = w;
    result->h = h;
    result->data = data;
    return result;
}

/* Release an image created by make_image(): the cell buffer first,
 * then the struct itself.  Both pointers must be non-NULL. */
void free_image (image * it) {
    assert (it != NULL);
    assert (it->data != NULL);
    free (it->data);
    free (it);
}

/* Fill every cell of IT with VAL; returns IT so calls can be chained
 * (as main() does with make_image). */
image * init_to_value (image * it, DATA_T val) {
    assert (it != NULL);
    assert (it->data != NULL);
    DATA_T * cell = it->data;
    DATA_T * const end = it->data + it->n;
    while (cell < end) {
        *cell++ = val;
    }
    return it;
}

/*
 * Element-wise: to->data[i] = from->data[i] + val, for all to->n cells.
 * TO and FROM must be the same size (only TO's count is consulted).
 *
 * The data pointers are hoisted into restrict-qualified locals: the
 * no-aliasing guarantee lets the compiler auto-vectorize the loop,
 * which matters for exactly the kind of benchmark this file runs.
 */
void add (image * to, image * from, DATA_T val) {
    assert (NULL != to);
    assert (NULL != to->data);
    assert (NULL != from);
    assert (NULL != from->data);
    DATA_T * restrict dst = to->data;
    const DATA_T * restrict src = from->data;
    const size_t n = to->n;
    for (size_t i = 0; i < n; ++i) {
        dst[i] = src[i] + val;
    }
}

/* Benchmark driver: time PERF_REPS element-wise adds over HORZ x VERT floats. */
int main (int argc, char ** argv) {
    image * from = init_to_value (make_image (HORZ, VERT), 0.0f);
    image * to   = init_to_value (make_image (HORZ, VERT), 0.0f);

    TicTocTimer clock = tic();
    size_t rep = 0;
    while (rep < PERF_REPS) {
        add (to, from, 42.0);
        ++rep;
    }
    printf("Elapsed time %f seconds.\n",toc(&clock));

    free_image (to);
    free_image (from);
    return 0;
}

I compile and run the code as follows:

gcc -O3 image-add.c ./modules/tictoc/libtictoc.a && ./a.out

A typical time on my MacBook Pro is about 0.178 seconds. Pretty nice.

The equivalent Lisp code, using every option I was able to find in the Lisp hyperspec, in the new book Common Lisp Recipes, and in the SBCL user manual, is as follows. The comments indicate some things I tried that did not make a difference.

;;; None of the commented-out declarations made any difference in speed. 

;;; Global optimization policy: maximum speed, no runtime safety checks.
(declaim (optimize speed (safety 0)))

;; Add the single-float VAL to every element of FROM, storing the sums
;; into TO in place.  The simple-array single-float declarations let
;; SBCL use unboxed float arithmetic and direct array indexing.
;; The commented-out declarations below were tried and made no
;; measurable difference (see the note above).
(defun image-add (to from val)
  (declare (type (simple-array single-float (*))
                 to from))
  (declare (type single-float val))
  (let ((size (array-dimension to 0)))
    ;(declare (type fixnum size))
    (dotimes (i size)
      ;(declare (type fixnum i))
      (setf (aref to i) (+ (aref from i) val)))))

;; Image dimensions and benchmark repetition count.
(defparameter HORZ 800)
(defparameter VERT 800)

(defparameter PERF-REPS 1000)

;; Allocate two 800x800 images as flat single-float vectors and time
;; PERF-REPS calls to IMAGE-ADD.  :element-type 'single-float produces a
;; specialized (unboxed) array, which IMAGE-ADD's declarations rely on.
;; The commented-out declarations were tried without effect.
(let ((to (make-array (* HORZ VERT) :element-type 'single-float))
      (fm (make-array (* HORZ VERT) :element-type 'single-float)))
  ;(declare (type fixnum HORZ))
  ;(declare (type fixnum VERT))
  (time (dotimes (_ PERF-REPS)
          ;(declare (type fixnum PERF-REPS))
          ;(declare (type fixnum _))
          ;(declare (inline image-add))
          (image-add to fm 42.0))))

I compile and run it as follows:

sbcl --script image-perf.lisp

and typical run times are 0.276. Not bad, but I want it much better. Of course, the whole point of the exercise is that the Lisp code is shorter, but does anyone know a way to make it as fast or faster?

like image 339
Reb.Cabin Avatar asked Jan 25 '16 17:01

Reb.Cabin


4 Answers

For reference, here are some results with a slightly modified version.

C version

The C version takes on average 0.197s.

Lisp version

;; Same optimization policy as the question's code, with debug info off.
(declaim (optimize (speed 3) (debug 0) (safety 0)))

(defconstant HORZ 800)
(defconstant VERT 800)
(defconstant PERF-REPS 1000)

;; Benchmark using MAP-INTO: the closure is applied to each element of
;; SOURCE and the result written into TARGET in place, so nothing is
;; consed ("0 bytes consed" below).  #1=/#1# makes the reader share one
;; MAKE-ARRAY form between both bindings; each binding still evaluates
;; it, so TARGET and SOURCE are two distinct arrays.
(defun test ()
  (let ((target #1=(make-array (* HORZ VERT)
                               :element-type 'single-float
                               :initial-element 0f0))
        (source #1#))
    (declare (type (simple-array single-float (*)) target source))
    (time 
      (dotimes (_ PERF-REPS)
        (map-into target
                  (lambda (x)
                    (declare (single-float x))
                    (the single-float (+ x 42f0)))
                  source)))))

Here is the output:

Evaluation took:                                                                                                 
  0.372 seconds of real time                                                                                     
  0.372024 seconds of total run time (0.368023 user, 0.004001 system)                                            
  100.00% CPU                                                                                                    
  965,075,988 processor cycles                                                                                   
  0 bytes consed  

Replacing map-into with lparallel:pmap-into, the shortest time is obtained with a kernel composed of 4 workers and gives:

Evaluation took:                                                                                                 
 0.122 seconds of real time                                                                                     
 0.496031 seconds of total run time (0.492030 user, 0.004001 system)                                            
 406.56% CPU                                                                                                    
 316,445,789 processor cycles                                                                                   
 753,280 bytes consed

Note the difference in memory usage.

like image 104
coredump Avatar answered Oct 31 '22 03:10

coredump


SBCL provides a bunch of information

When I saved your code (and wrapped the final let example into a separate function), and compiled it with SBCL, I actually get a bunch of diagnostic output that informs us of some of the places where better code could be generated. There's a lot of it, but skim through here, though it's all in test, so it may or may not be helpful. But, since the test code might slow things down, it's worth a shot.

CL-USER> (compile-file ".../compile.lisp")
; compiling file ".../compile.lisp" (written 25 JAN 2016 01:53:23 PM):
; compiling (DECLAIM (OPTIMIZE SPEED ...))
; compiling (DEFUN IMAGE-ADD ...)
; compiling (DEFPARAMETER HORZ ...)
; compiling (DEFPARAMETER VERT ...)
; compiling (DEFPARAMETER PERF-REPS ...)
; compiling (DEFUN TEST ...)

; file: /home/taylorj/tmp/compile.lisp
; in: DEFUN TEST
;     (* HORZ VERT)
; 
; note: unable to
;   convert x*2^k to shift
; due to type uncertainty:
;   The first argument is a NUMBER, not a INTEGER.
;   The second argument is a NUMBER, not a INTEGER.
; 
; note: unable to
;   convert x*2^k to shift
; due to type uncertainty:
;   The first argument is a NUMBER, not a INTEGER.
;   The second argument is a NUMBER, not a INTEGER.

;     (DOTIMES (_ PERF-REPS) (IMAGE-ADD TO FM 42.0))
; --> DO BLOCK LET TAGBODY UNLESS IF >= IF 
; ==>
;   (< SB-C::X SB-C::Y)
; 
; note: forced to do GENERIC-< (cost 10)
;       unable to do inline fixnum comparison (cost 4) because:
;       The first argument is a UNSIGNED-BYTE, not a FIXNUM.
;       The second argument is a INTEGER, not a FIXNUM.

; --> DO BLOCK LET TAGBODY PSETQ PSETF LET* MULTIPLE-VALUE-BIND LET 1+ 
; ==>
;   (+ _ 1)
; 
; note: forced to do GENERIC-+ (cost 10)
;       unable to do inline fixnum arithmetic (cost 1) because:
;       The first argument is a UNSIGNED-BYTE, not a FIXNUM.
;       The result is a (VALUES (INTEGER 1) &OPTIONAL), not a (VALUES FIXNUM
;                                                                     &REST T).
;       unable to do inline fixnum arithmetic (cost 2) because:
;       The first argument is a UNSIGNED-BYTE, not a FIXNUM.
;       The result is a (VALUES (INTEGER 1) &OPTIONAL), not a (VALUES FIXNUM
;                                                                     &REST T).
;       etc.

;     (* HORZ VERT)
; 
; note: forced to do GENERIC-* (cost 30)
;       unable to do inline fixnum arithmetic (cost 4) because:
;       The first argument is a NUMBER, not a FIXNUM.
;       The second argument is a NUMBER, not a FIXNUM.
;       unable to do inline (signed-byte 64) arithmetic (cost 5) because:
;       The first argument is a NUMBER, not a (SIGNED-BYTE 64).
;       The second argument is a NUMBER, not a (SIGNED-BYTE 64).
;       etc.
; 
; note: forced to do GENERIC-* (cost 30)
;       unable to do inline fixnum arithmetic (cost 4) because:
;       The first argument is a NUMBER, not a FIXNUM.
;       The second argument is a NUMBER, not a FIXNUM.
;       unable to do inline (signed-byte 64) arithmetic (cost 5) because:
;       The first argument is a NUMBER, not a (SIGNED-BYTE 64).
;       The second argument is a NUMBER, not a (SIGNED-BYTE 64).
;       etc.
; 
; compilation unit finished
;   printed 6 notes

; .../compile.fasl written
; compilation finished in 0:00:00.009

That stuff was all in test, but since you are timing your dotimes loop, it might make sense to at least fix up that comparison. Here's the test code with declarations that should make the dotimes a bit quicker:

;; The question's timing code wrapped in a function so SBCL compiles it
;; as a unit.  The fixnum declaration covers the special variables HORZ,
;; VERT and PERF-REPS defined at top level, silencing the GENERIC-* /
;; GENERIC-< compiler notes shown above.
(defun test ()
  (declare (type fixnum HORZ VERT PERF-REPS))
  (let ((to (make-array (* HORZ VERT) :element-type 'single-float))
        (fm (make-array (* HORZ VERT) :element-type 'single-float)))
    (time (dotimes (_ PERF-REPS)
            (image-add to fm 42.0)))))

After that you might want to look into possible loop unrolling, caching the array dimension, and considering the memory locality of the arrays. Those are all pretty generic hints, though. I'm not sure what specific things could help more here.

Standard answer about "be sure to use optimize and safety"

Edit: shoot, I missed the toplevel declaim that does reference speed and safety. It's still worth checking into whether those are having the desired effect, but if they are, then this answer is mostly redundant.

For the most part, the compiler is allowed to completely ignore declarations. (The only exception is special declarations, which change the binding semantics for a variable.) So what a compiler does with them is up to it. A declaration like type could be used in at least two different ways. If you're trying to compile very safe code, type declarations let the compiler know that additional checks could be added. That would result in slower code, of course, but it would be safer. On the other hand, if you're trying to generate very fast code, then the compiler could take those type declarations as your guarantee that the values will always be of the right type, and thus generate faster code.

It looks like you're only adding type declarations. If you want faster (or safer) code, you'll want to add declarations to say that too. In this case, you'll probably want (declare (optimize (speed 3) (safety 0))). For instance, take a look at a few disassemblies of just a simple function that returns its fixnum argument. First, just declaring the type, the code ends up being 18 bytes:

(defun int-identity (x)
  (declare (type fixnum x))
  x)

INT-IDENTITY
CL-USER> (disassemble 'int-identity)
; disassembly for INT-IDENTITY
; Size: 18 bytes. Origin: #x100470619A
; 9A:       488BE5           MOV RSP, RBP                     ; no-arg-parsing entry point
; 9D:       F8               CLC
; 9E:       5D               POP RBP
; 9F:       C3               RET
; A0:       CC0A             BREAK 10                         ; error trap
; A2:       02               BYTE #X02
; A3:       19               BYTE #X19                        ; INVALID-ARG-COUNT-ERROR
; A4:       9A               BYTE #X9A                        ; RCX
; A5:       CC0A             BREAK 10                         ; error trap
; A7:       04               BYTE #X04
; A8:       08               BYTE #X08                        ; OBJECT-NOT-FIXNUM-ERROR
; A9:       FE1B01           BYTE #XFE, #X1B, #X01            ; RDX
NIL

Now, if we also add a speed optimization, the code size goes up a little bit. (That's not necessarily a bad thing, though. Some speed optimizations, such as loop unrolling or function inlining, will generate bigger code.)

CL-USER> 
(defun int-identity (x)
  (declare (type fixnum x)
           (optimize (speed 3)))
  x)

STYLE-WARNING: redefining COMMON-LISP-USER::INT-IDENTITY in DEFUN
INT-IDENTITY
CL-USER> (disassemble 'int-identity)
; disassembly for INT-IDENTITY
; Size: 20 bytes. Origin: #x1004A5D23D
; 3D:       488BE5           MOV RSP, RBP                     ; no-arg-parsing entry point
; 40:       F8               CLC
; 41:       5D               POP RBP
; 42:       C3               RET
; 43:       CC0A             BREAK 10                         ; error trap
; 45:       04               BYTE #X04
; 46:       19               BYTE #X19                        ; INVALID-ARG-COUNT-ERROR
; 47:       FE9A01           BYTE #XFE, #X9A, #X01            ; RBX
; 4A:       CC0A             BREAK 10                         ; error trap
; 4C:       04               BYTE #X04
; 4D:       08               BYTE #X08                        ; OBJECT-NOT-FIXNUM-ERROR
; 4E:       FE1B01           BYTE #XFE, #X1B, #X01            ; RDX
NIL

Finally, when we remove safety, we finally get some really short code, just 9 bytes:

CL-USER> 
(defun int-identity (x)
  (declare (type fixnum x)
           (optimize (speed 3)
                     (safety 0)))
  x)

STYLE-WARNING: redefining COMMON-LISP-USER::INT-IDENTITY in DEFUN
INT-IDENTITY
CL-USER> (disassemble 'int-identity)
; disassembly for INT-IDENTITY
; Size: 9 bytes. Origin: #x1004AFF3E2
; 2:       488BD1           MOV RDX, RCX                      ; no-arg-parsing entry point
; 5:       488BE5           MOV RSP, RBP
; 8:       F8               CLC
; 9:       5D               POP RBP
; A:       C3               RET
like image 34
Joshua Taylor Avatar answered Oct 31 '22 04:10

Joshua Taylor


Using the :lparallel suggestion by @coredump, I was able to get consistent runs of 0.125 seconds, distinctly faster than gcc -O3's 0.175. The various techniques suggested in comments of compiling the file and inlining the image-add function did not produce appreciable speedup. Here is the fastest code so far.

;; Load Quicklisp, then the lparallel library for data-parallel mapping.
(load "~/quicklisp/setup.lisp")
(ql:quickload :lparallel)

(declaim (optimize (speed 3) (safety 0)))

(defparameter HORZ 800)
(defparameter VERT 800)

(defparameter PERF-REPS 1000)

;; A kernel of 4 worker threads; pmap-into divides the array among them.
(setf lparallel:*kernel* (lparallel:make-kernel 4))

;; Same benchmark shape as before, but lparallel:pmap-into is the
;; parallel drop-in for map-into: apply the closure to each element of
;; FM and store the results into TO in place.
(defun test ()
  (declare (type fixnum HORZ VERT PERF-REPS))
  (let ((to (make-array (* HORZ VERT) :element-type 'single-float))
        (fm (make-array (* HORZ VERT) :element-type 'single-float)))
    (time (dotimes (_ PERF-REPS)
            (lparallel:pmap-into to (lambda (x) (+ x 42f0)) fm)))))

(test)

EDIT: I will note that this isn't exactly fair: I added explicit parallelism to the Lisp code but not to the C code. However, it's notable how easy this was to do with Lisp. Because the chief advantage of Lisp over C in this scenario is code brevity and the relative ease of adding features like parallelism, the trade-off is in the right direction (of illustrating Lisp's relative flexibility). I suspect that parallel C code (if and when I can get around to implementing it) will be faster again than the Lisp code.

like image 5
Reb.Cabin Avatar answered Oct 31 '22 03:10

Reb.Cabin


This is quite an old post but you might still be interested.

I would do this if I am after speed!

When we compare speed of C and Common Lisp we need to consider what the C compiler can do with high optimization setting.

Since C with -O3 does auto-vectorization and SBCL doesn't, we have to be careful when we do the comparison.

With vectorization, as we can see below, SBCL should be plenty fast.

Please install sb-simd first to ~/quicklisp/local-projects.

Then run (ql:quickload :sb-simd) once to build it.

You need sbcl-2.1.11 to run the latest sb-simd.

The generated vectorized SBCL executable image is faster than the C that is also doing vectorization and most likely loop unrolling as well with the -O3 compiler flag.

EDIT: I've just checked the generated assembly and it is vectorized. With -march=native -mavx2 the speed did improve slightly.

(declaim (optimize speed (safety 0)))
;; sb-simd exposes SIMD instructions to Lisp via typed operators
;; (f32+, f32-aref) and the do-vectorized loop macro.
(ql:quickload :sb-simd :silent t)
(use-package :sb-simd)

;; Vectorized add: do-vectorized compiles the loop body to SIMD code
;; over f32 (single-float) lanes, with an unroll factor of 2.
(declaim (ftype (function (f32vec f32vec f32) null) image-add))
(defun image-add (to from val)
  (do-vectorized (i 0 (array-total-size to))
    (:unroll 2)
    (setf (f32-aref to i) (f32+ (f32-aref from i) val))))

(defparameter *HORZ* 800)
(defparameter *VERT* 800)

(defparameter *PERF-REPS* 1000)

;; Benchmark entry point; becomes the executable's toplevel below.
(defun main ()
  (declare (type fixnum *HORZ* *VERT* *PERF-REPS*))  
  (let ((to   (make-array (* *HORZ* *VERT*) :element-type 'f32))
        (from (make-array (* *HORZ* *VERT*) :element-type 'f32)))
    (time (dotimes (_ *PERF-REPS*)
            (image-add to from 42.0)))))

;; Dump a standalone executable image whose entry point is MAIN.
(save-lisp-and-die "image-add" :executable t :toplevel #'main)

EDITED: On my i7-7700HQ CPU I get:

$ gcc -O3 image-add.c ./tictoc/build/libtictoc.a
$ ./a.out 
Elapsed time 0.124956 seconds.

$ gcc -O3 -march=native -mavx2 image-add.c ./tictoc/build/libtictoc.a
$ ./a.out
Elapsed time 0.116989 seconds.

$ ./image-add 
Evaluation took:
  0.072 seconds of real time
  0.070440 seconds of total run time (0.070236 user, 0.000204 system)
  97.22% CPU
  199,079,678 processor cycles
  0 bytes consed
like image 4
bpecsek Avatar answered Oct 31 '22 04:10

bpecsek