Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

why is clang optimization breaking my inline assembly code?

In an attempt to learn something about ARM assembly, I have written a simple test project that performs image downscaling using inline assembly and NEON instructions. You can see it here:

https://github.com/rmaz/NEON-Image-Downscaling

After some effort I managed to get it working, happy days. Except that it only works at optimization levels below -O2. I have taken a look at the generated assembly, but I cannot see any obvious reason why this should occur. Can anyone offer any insight? Here is the function containing the inline assembly:

static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow)
{
    const uint32_t * rowB = src + pixelsPerRow;

    // force the number of pixels per row to a mutliple of 8
    pixelsPerRow = 8 * (pixelsPerRow / 8);    

    __asm__ volatile("Lresizeloop:                      \n" // start loop
                     "vld1.32       {d0-d3}, [%1]!      \n" // load 8 pixels from the top row
                     "vld1.32       {d4-d7}, [%2]!      \n" // load 8 pixels from the bottom row
                     "vhadd.u8      q0, q0, q2          \n" // average the pixels vertically
                     "vhadd.u8      q1, q1, q3          \n"
                     "vtrn.32       q0, q2              \n" // transpose to put the horizontally adjacent pixels in different registers
                     "vtrn.32       q1, q3              \n"
                     "vhadd.u8      q0, q0, q2          \n" // average the pixels horizontally
                     "vhadd.u8      q1, q1, q3          \n"
                     "vtrn.32       d0, d1              \n" // fill the registers with pixels
                     "vtrn.32       d2, d3              \n"
                     "vswp          d1, d2              \n"
                     "vst1.64       {d0-d1}, [%0]!      \n" // store the result
                     "subs          %3, %3, #8          \n" // subtract 8 from the pixel count
                     "bne           Lresizeloop         \n" // repeat until the row is complete
                     : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
                     : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
                     : "q0", "q1", "q2", "q3"
                     );
}

The working generated output at -O1 for the surrounding function and loop is as follows:

.align  2
    .code   16                      @ @"\01-[BDPViewController downscaleImageNeon:]"
    .thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
    .cfi_startproc
Lfunc_begin4:
    .loc    1 86 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
@ BB#0:
    .loc    1 86 1 prologue_end     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
    push    {r4, r5, r6, r7, lr}
    add r7, sp, #12
    push.w  {r8, r10, r11}
    sub sp, #20
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
    .loc    1 88 20                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
    movw    r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
    mov r6, r2
Ltmp43:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
    movt    r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
    add r0, pc
    ldr.w   r11, [r0]
    mov r0, r6
    blx _objc_retain
    mov r4, r0
    mov r0, r6
    mov r1, r11
Ltmp44:
    blx _objc_msgSend
    blx _CGImageGetWidth
    mov r5, r0
Ltmp45:
    @DEBUG_VALUE: width <- R5+0
    .loc    1 89 21                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
    mov r0, r6
    mov r1, r11
    str r5, [sp, #16]           @ 4-byte Spill
    blx _objc_msgSend
    blx _CGImageGetHeight
    mov r10, r0
Ltmp46:
    @DEBUG_VALUE: height <- R10+0
    .loc    1 90 26                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetBytesPerRow
    str r0, [sp, #12]           @ 4-byte Spill
Ltmp47:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    .loc    1 91 35                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetAlphaInfo
    str r0, [sp, #4]            @ 4-byte Spill
Ltmp48:
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    mov r6, r0
Ltmp49:
    mov r0, r4
    blx _objc_release
    mov r0, r6
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    mul r8, r10, r5
Ltmp50:
    @DEBUG_VALUE: width <- [sp+#16]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    blx _CGImageGetDataProvider
    blx _CGDataProviderCopyData
Ltmp51:
    @DEBUG_VALUE: data <- R0+0
    str r0, [sp, #8]            @ 4-byte Spill
Ltmp52:
    @DEBUG_VALUE: data <- [sp+#8]+#0
    .loc    1 95 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
    blx _CFDataGetBytePtr
    mov r4, r0
Ltmp53:
    @DEBUG_VALUE: buffer <- R4+0
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    lsr.w   r0, r8, #2
    movs    r1, #4
    blx _calloc
    mov r5, r0
Ltmp54:
    @DEBUG_VALUE: outputBuffer <- R5+0
    mov r0, r10
Ltmp55:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    cmp r0, #0
Ltmp56:
    @DEBUG_VALUE: rowIndex <- 0+0
    beq LBB4_3
@ BB#1:                                 @ %.lr.ph
Ltmp57:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: buffer <- R4+0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    ldr r1, [sp, #12]           @ 4-byte Reload
Ltmp58:
    @DEBUG_VALUE: bytesPerRow <- R1+0
    mov.w   r8, #0
    lsl.w   r11, r1, #1
    .loc    1 104 74                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
    lsr.w   r10, r1, #1
Ltmp60:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2:                                 @ =>This Inner Loop Header: Depth=1
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    lsr.w   r1, r8, #1
Ltmp61:
    mov r6, r0
Ltmp62:
    @DEBUG_VALUE: height <- R6+0
    mla r0, r1, r10, r5
Ltmp63:
    @DEBUG_VALUE: destRow <- R1+0
    .loc    1 105 9                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
    ldr r2, [sp, #16]           @ 4-byte Reload
    mov r1, r4
Ltmp64:
    bl  _resizeRow
    mov r0, r6
Ltmp65:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 50                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
    add.w   r8, r8, #2
Ltmp66:
    @DEBUG_VALUE: rowIndex <- R8+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    add r4, r11
    cmp r8, r0
    blo LBB4_2
Ltmp67:
LBB4_3:                                 @ %._crit_edge
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    .loc    1 109 28                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
    ldr r1, [sp, #4]            @ 4-byte Reload
Ltmp68:
    lsrs    r2, r0, #1
    str r1, [sp]
    mov r6, r5
Ltmp69:
    @DEBUG_VALUE: outputBuffer <- R6+0
    ldr r1, [sp, #16]           @ 4-byte Reload
    ldr r0, [sp, #12]           @ 4-byte Reload
Ltmp70:
    lsrs    r1, r1, #1
    lsrs    r3, r0, #1
    mov r0, r5
    bl  _createBitmapContext
    mov r4, r0
Ltmp71:
    @DEBUG_VALUE: context <- R4+0
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    blx _CGBitmapContextCreateImage
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movw    r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    mov r5, r0
Ltmp72:
    @DEBUG_VALUE: scaledImage <- R5+0
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movt    r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    movw    r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
    movt    r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
    add r1, pc
LPC4_2:
    add r0, pc
    mov r2, r5
    ldr r1, [r1]
    ldr r0, [r0]
    blx _objc_msgSend
Ltmp73:
    @DEBUG_VALUE: returnImage <- R0+0
    @ InlineAsm Start
    mov r7, r7      @ marker for objc_retainAutoreleaseReturnValue
    @ InlineAsm End
    blx _objc_retainAutoreleasedReturnValue
Ltmp74:
    mov r8, r0
    .loc    1 112 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
    mov r0, r5
    blx _CGImageRelease
    .loc    1 113 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
    mov r0, r4
    blx _CGContextRelease
    .loc    1 114 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
    ldr r0, [sp, #8]            @ 4-byte Reload
    blx _CFRelease
    .loc    1 115 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
    mov r0, r6
    blx _free
Ltmp75:
    .loc    1 118 1                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
    mov r0, r8
    add sp, #20
    pop.w   {r8, r10, r11}
    pop.w   {r4, r5, r6, r7, lr}
Ltmp76:
    b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
    .cfi_endproc

    .align  2
    .code   16                      @ @resizeRow
    .thumb_func _resizeRow
_resizeRow:
    .cfi_startproc
Lfunc_begin5:
    .loc    1 26 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
@ BB#0:
    @DEBUG_VALUE: resizeRow:dst <- R0+0
    @DEBUG_VALUE: resizeRow:src <- R1+0
    @DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
    .loc    1 27 47 prologue_end    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
    add.w   r3, r1, r2, lsl #2
Ltmp78:
    @DEBUG_VALUE: rowB <- R3+0
    .loc    1 30 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
    bic r2, r2, #7
Ltmp79:
    .loc    1 32 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
    @ InlineAsm Start
    Lresizeloop:                      
vld1.32       {d0-d3}, [r1]!      
vld1.32       {d4-d7}, [r3]!      
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       q0, q2              
vtrn.32       q1, q3              
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       d0, d1              
vtrn.32       d2, d3              
vswp          d1, d2              
vst1.64       {d0-d1}, [r0]!      
subs          r2, r2, #8          
bne           Lresizeloop         

    @ InlineAsm End
Ltmp80:
    .loc    1 51 1                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
    bx  lr
Ltmp81:
Lfunc_end5:
    .cfi_endproc

And the non-working output at -O2 is as follows:

    .align  2
    .code   16                      @ @"\01-[BDPViewController downscaleImageNeon:]"
    .thumb_func "-[BDPViewController downscaleImageNeon:]"
"-[BDPViewController downscaleImageNeon:]":
    .cfi_startproc
Lfunc_begin4:
    .loc    1 86 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0
@ BB#0:
    .loc    1 86 1 prologue_end     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1
    push    {r4, r5, r6, r7, lr}
    add r7, sp, #12
    push.w  {r8, r10, r11}
    sub sp, #20
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0
    .loc    1 88 20                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20
Ltmp41:
    movw    r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
Ltmp42:
    mov r6, r2
Ltmp43:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0
    movt    r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4))
LPC4_0:
    add r0, pc
    ldr.w   r11, [r0]
    mov r0, r6
    blx _objc_retain
    mov r4, r0
    mov r0, r6
    mov r1, r11
Ltmp44:
    blx _objc_msgSend
    blx _CGImageGetWidth
    mov r5, r0
Ltmp45:
    @DEBUG_VALUE: width <- R5+0
    .loc    1 89 21                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21
    mov r0, r6
    mov r1, r11
    str r5, [sp, #16]           @ 4-byte Spill
    blx _objc_msgSend
    blx _CGImageGetHeight
    mov r10, r0
Ltmp46:
    @DEBUG_VALUE: height <- R10+0
    .loc    1 90 26                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetBytesPerRow
    str r0, [sp, #12]           @ 4-byte Spill
Ltmp47:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    .loc    1 91 35                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    blx _CGImageGetAlphaInfo
    str r0, [sp, #4]            @ 4-byte Spill
Ltmp48:
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    mov r0, r6
    mov r1, r11
    blx _objc_msgSend
    mov r6, r0
Ltmp49:
    mov r0, r4
    blx _objc_release
    mov r0, r6
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    mul r8, r10, r5
Ltmp50:
    @DEBUG_VALUE: width <- [sp+#16]+#0
    .loc    1 94 45                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45
    blx _CGImageGetDataProvider
    blx _CGDataProviderCopyData
Ltmp51:
    @DEBUG_VALUE: data <- R0+0
    str r0, [sp, #8]            @ 4-byte Spill
Ltmp52:
    @DEBUG_VALUE: data <- [sp+#8]+#0
    .loc    1 95 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29
    blx _CFDataGetBytePtr
    mov r4, r0
Ltmp53:
    @DEBUG_VALUE: buffer <- R4+0
    .loc    1 98 29                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29
    lsr.w   r0, r8, #2
    movs    r1, #4
    blx _calloc
    mov r5, r0
Ltmp54:
    @DEBUG_VALUE: outputBuffer <- R5+0
    mov r0, r10
Ltmp55:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    cmp r0, #0
Ltmp56:
    @DEBUG_VALUE: rowIndex <- 0+0
    beq LBB4_3
@ BB#1:                                 @ %.lr.ph
Ltmp57:
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: buffer <- R4+0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    ldr r1, [sp, #12]           @ 4-byte Reload
Ltmp58:
    @DEBUG_VALUE: bytesPerRow <- R1+0
    mov.w   r8, #0
    lsl.w   r11, r1, #1
    .loc    1 104 74                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74
Ltmp59:
    lsr.w   r10, r1, #1
Ltmp60:
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
LBB4_2:                                 @ =>This Inner Loop Header: Depth=1
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    @DEBUG_VALUE: rowIndex <- 0+0
    lsr.w   r1, r8, #1
Ltmp61:
    mov r6, r0
Ltmp62:
    @DEBUG_VALUE: height <- R6+0
    mla r0, r1, r10, r5
Ltmp63:
    @DEBUG_VALUE: destRow <- R1+0
    .loc    1 105 9                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9
    ldr r2, [sp, #16]           @ 4-byte Reload
    mov r1, r4
Ltmp64:
    bl  _resizeRow
    mov r0, r6
Ltmp65:
    @DEBUG_VALUE: height <- R0+0
    .loc    1 101 50                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50
    add.w   r8, r8, #2
Ltmp66:
    @DEBUG_VALUE: rowIndex <- R8+0
    .loc    1 101 29                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    add r4, r11
    cmp r8, r0
    blo LBB4_2
Ltmp67:
LBB4_3:                                 @ %._crit_edge
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0
    @DEBUG_VALUE: width <- [sp+#16]+#0
    @DEBUG_VALUE: height <- R0+0
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0
    @DEBUG_VALUE: data <- [sp+#8]+#0
    @DEBUG_VALUE: outputBuffer <- R5+0
    .loc    1 109 28                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28
    ldr r1, [sp, #4]            @ 4-byte Reload
Ltmp68:
    lsrs    r2, r0, #1
    str r1, [sp]
    mov r6, r5
Ltmp69:
    @DEBUG_VALUE: outputBuffer <- R6+0
    ldr r1, [sp, #16]           @ 4-byte Reload
    ldr r0, [sp, #12]           @ 4-byte Reload
Ltmp70:
    lsrs    r1, r1, #1
    lsrs    r3, r0, #1
    mov r0, r5
    bl  _createBitmapContext
    mov r4, r0
Ltmp71:
    @DEBUG_VALUE: context <- R4+0
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    blx _CGBitmapContextCreateImage
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movw    r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    .loc    1 110 30                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30
    mov r5, r0
Ltmp72:
    @DEBUG_VALUE: scaledImage <- R5+0
    .loc    1 111 66                @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66
    movt    r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4))
    movw    r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
    movt    r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4))
LPC4_1:
    add r1, pc
LPC4_2:
    add r0, pc
    mov r2, r5
    ldr r1, [r1]
    ldr r0, [r0]
    blx _objc_msgSend
Ltmp73:
    @DEBUG_VALUE: returnImage <- R0+0
    @ InlineAsm Start
    mov r7, r7      @ marker for objc_retainAutoreleaseReturnValue
    @ InlineAsm End
    blx _objc_retainAutoreleasedReturnValue
Ltmp74:
    mov r8, r0
    .loc    1 112 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5
    mov r0, r5
    blx _CGImageRelease
    .loc    1 113 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5
    mov r0, r4
    blx _CGContextRelease
    .loc    1 114 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5
    ldr r0, [sp, #8]            @ 4-byte Reload
    blx _CFRelease
    .loc    1 115 5                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5
    mov r0, r6
    blx _free
Ltmp75:
    .loc    1 118 1                 @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1
    mov r0, r8
    add sp, #20
    pop.w   {r8, r10, r11}
    pop.w   {r4, r5, r6, r7, lr}
Ltmp76:
    b.w _objc_autoreleaseReturnValue
Ltmp77:
Lfunc_end4:
    .cfi_endproc

    .align  2
    .code   16                      @ @resizeRow
    .thumb_func _resizeRow
_resizeRow:
    .cfi_startproc
Lfunc_begin5:
    .loc    1 26 0                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0
@ BB#0:
    @DEBUG_VALUE: resizeRow:dst <- R0+0
    @DEBUG_VALUE: resizeRow:src <- R1+0
    @DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0
    .loc    1 27 47 prologue_end    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47
    add.w   r3, r1, r2, lsl #2
Ltmp78:
    @DEBUG_VALUE: rowB <- R3+0
    .loc    1 30 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5
    bic r2, r2, #7
Ltmp79:
    .loc    1 32 5                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
    @ InlineAsm Start
    Lresizeloop:                      
vld1.32       {d0-d3}, [r1]!      
vld1.32       {d4-d7}, [r3]!      
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       q0, q2              
vtrn.32       q1, q3              
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       d0, d1              
vtrn.32       d2, d3              
vswp          d1, d2              
vst1.64       {d0-d1}, [r0]!      
subs          r2, r2, #8          
bne           Lresizeloop         

    @ InlineAsm End
Ltmp80:
    .loc    1 51 1                  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1
    bx  lr
Ltmp81:
Lfunc_end5:
    .cfi_endproc
like image 334
Tark Avatar asked Aug 16 '12 11:08

Tark


1 Answers

Here's a snippet of the assembly code I get from your Xcode project with -O2. (Building with -O1 doesn't bother to inline the function, so I'm not surprised it works fine.)

Ltmp55:
    @DEBUG_VALUE: rowIndex <- R3+0
    .loc    1 101 29                @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29
    add r8, r12
    cmp r3, r11
    .loc    1 32 5                  @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5
Ltmp56:
    @ InlineAsm Start
    Lresizeloop:                      
vld1.32       {d0-d3}, [r4]!      
vld1.32       {d4-d7}, [r5]!      
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       q0, q2              
vtrn.32       q1, q3              
vhadd.u8      q0, q0, q2          
vhadd.u8      q1, q1, q3          
vtrn.32       d0, d1              
vtrn.32       d2, d3              
vswp          d1, d2              
vst1.64       {d0-d1}, [r6]!      
subs          r2, r2, #8          
bne           Lresizeloop         

    @ InlineAsm End
Ltmp57:
    blo LBB2_2

See that blo (branch-if-lower) instruction on the final line? It uses the condition codes set by the cmp r3, r11 at the top of the assembly block. But of course your inline assembly code has totally trashed the condition code register by then. So is this a compiler bug?... Nope! You just forgot to tell the compiler that your inline assembly code trashes the condition codes. Replace

                 : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
                 : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
                 : "q0", "q1", "q2", "q3"
                 );

with

                 : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow)
                 : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow)
                 : "q0", "q1", "q2", "q3", "cc"
                 );

and the assembly output fixes itself. I haven't run the app, but I bet you'll find it's all better now. :)

like image 73
Quuxplusone Avatar answered Nov 16 '22 22:11

Quuxplusone