2012-08-16 24 views
5

En un intento de aprender algo sobre el ensamblador de ARM, he escrito un proyecto de prueba sencillo que reduce el tamaño de una imagen mediante ensamblador en línea e instrucciones NEON; se puede ver en el enlace de abajo. Mi pregunta es: ¿por qué la optimización de clang está rompiendo mi código ensamblador en línea?

https://github.com/rmaz/NEON-Image-Downscaling

Después de cierto esfuerzo conseguí que funcionara, ¡qué alegría! Excepto que solo funciona con niveles de optimización inferiores a -O2. He echado un vistazo al ensamblador generado, pero no veo ninguna razón obvia para que esto ocurra. ¿Alguien puede ofrecer alguna idea? Esta es la función que contiene el ensamblador en línea:

/*
 * resizeRow - produce one half-width output row from two source rows.
 *
 * Each loop iteration loads 8 32-bit pixels starting at `src` and 8 more
 * starting `pixelsPerRow` pixels after it, averages them vertically and then
 * horizontally with NEON halving adds (per byte), and stores the resulting
 * 4 pixels to `dst`.
 *
 * dst          - destination; receives 4 pixels per iteration.
 * src          - first of the two source rows.
 * pixelsPerRow - pixels per source row; rounded DOWN to a multiple of 8, so
 *                up to 7 trailing pixels per row are not processed.
 *
 * Assumes the second row begins exactly `pixelsPerRow` pixels after `src`
 * (i.e. no padding between rows) -- TODO confirm against the caller.
 *
 * NOTE(review): the loop counter is decremented before it is tested, so if
 * pixelsPerRow rounds down to 0 the first pass does not terminate the loop.
 * NOTE(review): `subs` updates the condition flags and `vst1` writes through
 * `dst`, but neither "cc" nor "memory" appears in the clobber list -- confirm
 * whether the compiler is being told everything this asm modifies.
 * NOTE(review): `Lresizeloop` is a fixed label name; presumably it would be
 * emitted more than once if this asm were expanded at several call sites --
 * verify.
 */
static void inline resizeRow(uint32_t *dst, uint32_t *src, uint32_t pixelsPerRow) 
{ 
    // second source row: the row immediately following `src` 
    const uint32_t * rowB = src + pixelsPerRow; 

    // force the number of pixels per row down to a multiple of 8, 
    // because the loop below consumes exactly 8 source pixels per pass 
    pixelsPerRow = 8 * (pixelsPerRow/8);  

    // operands: %0 = dst, %1 = src (top row), %2 = rowB (bottom row), 
    //           %3 = remaining pixel count 
    __asm__ volatile("Lresizeloop:      \n" // start of loop 
        "vld1.32  {d0-d3}, [%1]!  \n" // load 8 pixels from the top row into q0-q1, post-increment the pointer 
        "vld1.32  {d4-d7}, [%2]!  \n" // load 8 pixels from the bottom row into q2-q3, post-increment the pointer 
        "vhadd.u8  q0, q0, q2   \n" // average the two rows vertically (unsigned halving add per byte) 
        "vhadd.u8  q1, q1, q3   \n" 
        "vtrn.32  q0, q2    \n" // transpose to put the horizontally adjacent pixels in different registers 
        "vtrn.32  q1, q3    \n" 
        "vhadd.u8  q0, q0, q2   \n" // average the pixels horizontally 
        "vhadd.u8  q1, q1, q3   \n" 
        "vtrn.32  d0, d1    \n" // gather the averaged pixels so the result ends up in d0-d1 
        "vtrn.32  d2, d3    \n" 
        "vswp   d1, d2    \n" 
        "vst1.64  {d0-d1}, [%0]!  \n" // store the 4 result pixels, post-increment dst 
        "subs   %3, %3, #8   \n" // subtract 8 from the pixel count (this sets the condition flags) 
        "bne   Lresizeloop   \n" // repeat until the count reaches exactly zero 
        // outputs: all four values are modified by the asm ... 
        : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow) 
        // ... and each input is tied to the same register as its matching output 
        : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow) 
        // clobbers: the NEON registers used as scratch 
        : "q0", "q1", "q2", "q3" 
        ); 
} 

La salida generada con -O1 (que funciona) para la función que la llama y para el bucle es la siguiente:

.align 2 
    .code 16      @ @"\01-[BDPViewController downscaleImageNeon:]" 
    .thumb_func "-[BDPViewController downscaleImageNeon:]" 
"-[BDPViewController downscaleImageNeon:]": 
    .cfi_startproc 
Lfunc_begin4: 
    .loc 1 86 0     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0 
@ BB#0: 
    .loc 1 86 1 prologue_end  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1 
    push {r4, r5, r6, r7, lr} 
    add r7, sp, #12 
    push.w {r8, r10, r11} 
    sub sp, #20 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0 
    .loc 1 88 20     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20 
Ltmp41: 
    movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4)) 
Ltmp42: 
    mov r6, r2 
Ltmp43: 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0 
    movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4)) 
LPC4_0: 
    add r0, pc 
    ldr.w r11, [r0] 
    mov r0, r6 
    blx _objc_retain 
    mov r4, r0 
    mov r0, r6 
    mov r1, r11 
Ltmp44: 
    blx _objc_msgSend 
    blx _CGImageGetWidth 
    mov r5, r0 
Ltmp45: 
    @DEBUG_VALUE: width <- R5+0 
    .loc 1 89 21     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21 
    mov r0, r6 
    mov r1, r11 
    str r5, [sp, #16]   @ 4-byte Spill 
    blx _objc_msgSend 
    blx _CGImageGetHeight 
    mov r10, r0 
Ltmp46: 
    @DEBUG_VALUE: height <- R10+0 
    .loc 1 90 26     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26 
    mov r0, r6 
    mov r1, r11 
    blx _objc_msgSend 
    blx _CGImageGetBytesPerRow 
    str r0, [sp, #12]   @ 4-byte Spill 
Ltmp47: 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    .loc 1 91 35     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35 
    mov r0, r6 
    mov r1, r11 
    blx _objc_msgSend 
    blx _CGImageGetAlphaInfo 
    str r0, [sp, #4]   @ 4-byte Spill 
Ltmp48: 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    .loc 1 94 45     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45 
    mov r0, r6 
    mov r1, r11 
    blx _objc_msgSend 
    mov r6, r0 
Ltmp49: 
    mov r0, r4 
    blx _objc_release 
    mov r0, r6 
    .loc 1 98 29     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29 
    mul r8, r10, r5 
Ltmp50: 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    .loc 1 94 45     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45 
    blx _CGImageGetDataProvider 
    blx _CGDataProviderCopyData 
Ltmp51: 
    @DEBUG_VALUE: data <- R0+0 
    str r0, [sp, #8]   @ 4-byte Spill 
Ltmp52: 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    .loc 1 95 29     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29 
    blx _CFDataGetBytePtr 
    mov r4, r0 
Ltmp53: 
    @DEBUG_VALUE: buffer <- R4+0 
    .loc 1 98 29     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29 
    lsr.w r0, r8, #2 
    movs r1, #4 
    blx _calloc 
    mov r5, r0 
Ltmp54: 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    mov r0, r10 
Ltmp55: 
    @DEBUG_VALUE: height <- R0+0 
    .loc 1 101 29    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29 
    cmp r0, #0 
Ltmp56: 
    @DEBUG_VALUE: rowIndex <- 0+0 
    beq LBB4_3 
@ BB#1:         @ %.lr.ph 
Ltmp57: 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    @DEBUG_VALUE: height <- R0+0 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    @DEBUG_VALUE: buffer <- R4+0 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    @DEBUG_VALUE: rowIndex <- 0+0 
    ldr r1, [sp, #12]   @ 4-byte Reload 
Ltmp58: 
    @DEBUG_VALUE: bytesPerRow <- R1+0 
    mov.w r8, #0 
    lsl.w r11, r1, #1 
    .loc 1 104 74    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74 
Ltmp59: 
    lsr.w r10, r1, #1 
Ltmp60: 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
LBB4_2:         @ =>This Inner Loop Header: Depth=1 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    @DEBUG_VALUE: height <- R0+0 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    @DEBUG_VALUE: rowIndex <- 0+0 
    lsr.w r1, r8, #1 
Ltmp61: 
    mov r6, r0 
Ltmp62: 
    @DEBUG_VALUE: height <- R6+0 
    mla r0, r1, r10, r5 
Ltmp63: 
    @DEBUG_VALUE: destRow <- R1+0 
    .loc 1 105 9     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9 
    ldr r2, [sp, #16]   @ 4-byte Reload 
    mov r1, r4 
Ltmp64: 
    bl _resizeRow 
    mov r0, r6 
Ltmp65: 
    @DEBUG_VALUE: height <- R0+0 
    .loc 1 101 50    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50 
    add.w r8, r8, #2 
Ltmp66: 
    @DEBUG_VALUE: rowIndex <- R8+0 
    .loc 1 101 29    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29 
    add r4, r11 
    cmp r8, r0 
    blo LBB4_2 
Ltmp67: 
LBB4_3:         @ %._crit_edge 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    @DEBUG_VALUE: height <- R0+0 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    .loc 1 109 28    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28 
    ldr r1, [sp, #4]   @ 4-byte Reload 
Ltmp68: 
    lsrs r2, r0, #1 
    str r1, [sp] 
    mov r6, r5 
Ltmp69: 
    @DEBUG_VALUE: outputBuffer <- R6+0 
    ldr r1, [sp, #16]   @ 4-byte Reload 
    ldr r0, [sp, #12]   @ 4-byte Reload 
Ltmp70: 
    lsrs r1, r1, #1 
    lsrs r3, r0, #1 
    mov r0, r5 
    bl _createBitmapContext 
    mov r4, r0 
Ltmp71: 
    @DEBUG_VALUE: context <- R4+0 
    .loc 1 110 30    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30 
    blx _CGBitmapContextCreateImage 
    .loc 1 111 66    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66 
    movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4)) 
    .loc 1 110 30    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30 
    mov r5, r0 
Ltmp72: 
    @DEBUG_VALUE: scaledImage <- R5+0 
    .loc 1 111 66    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66 
    movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4)) 
    movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4)) 
    movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4)) 
LPC4_1: 
    add r1, pc 
LPC4_2: 
    add r0, pc 
    mov r2, r5 
    ldr r1, [r1] 
    ldr r0, [r0] 
    blx _objc_msgSend 
Ltmp73: 
    @DEBUG_VALUE: returnImage <- R0+0 
    @ InlineAsm Start 
    mov r7, r7  @ marker for objc_retainAutoreleaseReturnValue 
    @ InlineAsm End 
    blx _objc_retainAutoreleasedReturnValue 
Ltmp74: 
    mov r8, r0 
    .loc 1 112 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5 
    mov r0, r5 
    blx _CGImageRelease 
    .loc 1 113 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5 
    mov r0, r4 
    blx _CGContextRelease 
    .loc 1 114 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5 
    ldr r0, [sp, #8]   @ 4-byte Reload 
    blx _CFRelease 
    .loc 1 115 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5 
    mov r0, r6 
    blx _free 
Ltmp75: 
    .loc 1 118 1     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1 
    mov r0, r8 
    add sp, #20 
    pop.w {r8, r10, r11} 
    pop.w {r4, r5, r6, r7, lr} 
Ltmp76: 
    b.w _objc_autoreleaseReturnValue 
Ltmp77: 
Lfunc_end4: 
    .cfi_endproc 

    .align 2 
    .code 16      @ @resizeRow 
    .thumb_func _resizeRow 
_resizeRow: 
    .cfi_startproc 
Lfunc_begin5: 
    .loc 1 26 0     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0 
@ BB#0: 
    @DEBUG_VALUE: resizeRow:dst <- R0+0 
    @DEBUG_VALUE: resizeRow:src <- R1+0 
    @DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0 
    .loc 1 27 47 prologue_end @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47 
    add.w r3, r1, r2, lsl #2 
Ltmp78: 
    @DEBUG_VALUE: rowB <- R3+0 
    .loc 1 30 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5 
    bic r2, r2, #7 
Ltmp79: 
    .loc 1 32 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5 
    @ InlineAsm Start 
    Lresizeloop:      
vld1.32  {d0-d3}, [r1]!  
vld1.32  {d4-d7}, [r3]!  
vhadd.u8  q0, q0, q2   
vhadd.u8  q1, q1, q3   
vtrn.32  q0, q2    
vtrn.32  q1, q3    
vhadd.u8  q0, q0, q2   
vhadd.u8  q1, q1, q3   
vtrn.32  d0, d1    
vtrn.32  d2, d3    
vswp   d1, d2    
vst1.64  {d0-d1}, [r0]!  
subs   r2, r2, #8   
bne   Lresizeloop   

    @ InlineAsm End 
Ltmp80: 
    .loc 1 51 1     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1 
    bx lr 
Ltmp81: 
Lfunc_end5: 
    .cfi_endproc 

Y la salida con -O2, que no funciona, es la siguiente:

.align 2 
    .code 16      @ @"\01-[BDPViewController downscaleImageNeon:]" 
    .thumb_func "-[BDPViewController downscaleImageNeon:]" 
"-[BDPViewController downscaleImageNeon:]": 
    .cfi_startproc 
Lfunc_begin4: 
    .loc 1 86 0     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:0 
@ BB#0: 
    .loc 1 86 1 prologue_end  @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:86:1 
    push {r4, r5, r6, r7, lr} 
    add r7, sp, #12 
    push.w {r8, r10, r11} 
    sub sp, #20 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R2+0 
    .loc 1 88 20     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:88:20 
Ltmp41: 
    movw r0, :lower16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4)) 
Ltmp42: 
    mov r6, r2 
Ltmp43: 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:image <- R6+0 
    movt r0, :upper16:(L_OBJC_SELECTOR_REFERENCES_2-(LPC4_0+4)) 
LPC4_0: 
    add r0, pc 
    ldr.w r11, [r0] 
    mov r0, r6 
    blx _objc_retain 
    mov r4, r0 
    mov r0, r6 
    mov r1, r11 
Ltmp44: 
    blx _objc_msgSend 
    blx _CGImageGetWidth 
    mov r5, r0 
Ltmp45: 
    @DEBUG_VALUE: width <- R5+0 
    .loc 1 89 21     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:89:21 
    mov r0, r6 
    mov r1, r11 
    str r5, [sp, #16]   @ 4-byte Spill 
    blx _objc_msgSend 
    blx _CGImageGetHeight 
    mov r10, r0 
Ltmp46: 
    @DEBUG_VALUE: height <- R10+0 
    .loc 1 90 26     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:90:26 
    mov r0, r6 
    mov r1, r11 
    blx _objc_msgSend 
    blx _CGImageGetBytesPerRow 
    str r0, [sp, #12]   @ 4-byte Spill 
Ltmp47: 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    .loc 1 91 35     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:91:35 
    mov r0, r6 
    mov r1, r11 
    blx _objc_msgSend 
    blx _CGImageGetAlphaInfo 
    str r0, [sp, #4]   @ 4-byte Spill 
Ltmp48: 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    .loc 1 94 45     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45 
    mov r0, r6 
    mov r1, r11 
    blx _objc_msgSend 
    mov r6, r0 
Ltmp49: 
    mov r0, r4 
    blx _objc_release 
    mov r0, r6 
    .loc 1 98 29     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29 
    mul r8, r10, r5 
Ltmp50: 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    .loc 1 94 45     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:94:45 
    blx _CGImageGetDataProvider 
    blx _CGDataProviderCopyData 
Ltmp51: 
    @DEBUG_VALUE: data <- R0+0 
    str r0, [sp, #8]   @ 4-byte Spill 
Ltmp52: 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    .loc 1 95 29     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:95:29 
    blx _CFDataGetBytePtr 
    mov r4, r0 
Ltmp53: 
    @DEBUG_VALUE: buffer <- R4+0 
    .loc 1 98 29     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:98:29 
    lsr.w r0, r8, #2 
    movs r1, #4 
    blx _calloc 
    mov r5, r0 
Ltmp54: 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    mov r0, r10 
Ltmp55: 
    @DEBUG_VALUE: height <- R0+0 
    .loc 1 101 29    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29 
    cmp r0, #0 
Ltmp56: 
    @DEBUG_VALUE: rowIndex <- 0+0 
    beq LBB4_3 
@ BB#1:         @ %.lr.ph 
Ltmp57: 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    @DEBUG_VALUE: height <- R0+0 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    @DEBUG_VALUE: buffer <- R4+0 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    @DEBUG_VALUE: rowIndex <- 0+0 
    ldr r1, [sp, #12]   @ 4-byte Reload 
Ltmp58: 
    @DEBUG_VALUE: bytesPerRow <- R1+0 
    mov.w r8, #0 
    lsl.w r11, r1, #1 
    .loc 1 104 74    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:104:74 
Ltmp59: 
    lsr.w r10, r1, #1 
Ltmp60: 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
LBB4_2:         @ =>This Inner Loop Header: Depth=1 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    @DEBUG_VALUE: height <- R0+0 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    @DEBUG_VALUE: rowIndex <- 0+0 
    lsr.w r1, r8, #1 
Ltmp61: 
    mov r6, r0 
Ltmp62: 
    @DEBUG_VALUE: height <- R6+0 
    mla r0, r1, r10, r5 
Ltmp63: 
    @DEBUG_VALUE: destRow <- R1+0 
    .loc 1 105 9     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:105:9 
    ldr r2, [sp, #16]   @ 4-byte Reload 
    mov r1, r4 
Ltmp64: 
    bl _resizeRow 
    mov r0, r6 
Ltmp65: 
    @DEBUG_VALUE: height <- R0+0 
    .loc 1 101 50    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:50 
    add.w r8, r8, #2 
Ltmp66: 
    @DEBUG_VALUE: rowIndex <- R8+0 
    .loc 1 101 29    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29 
    add r4, r11 
    cmp r8, r0 
    blo LBB4_2 
Ltmp67: 
LBB4_3:         @ %._crit_edge 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:self <- R0+0 
    @DEBUG_VALUE: -[BDPViewController downscaleImageNeon:]:_cmd <- R1+0 
    @DEBUG_VALUE: width <- [sp+#16]+#0 
    @DEBUG_VALUE: height <- R0+0 
    @DEBUG_VALUE: bytesPerRow <- [sp+#12]+#0 
    @DEBUG_VALUE: imageAlpha <- [sp+#4]+#0 
    @DEBUG_VALUE: data <- [sp+#8]+#0 
    @DEBUG_VALUE: outputBuffer <- R5+0 
    .loc 1 109 28    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:109:28 
    ldr r1, [sp, #4]   @ 4-byte Reload 
Ltmp68: 
    lsrs r2, r0, #1 
    str r1, [sp] 
    mov r6, r5 
Ltmp69: 
    @DEBUG_VALUE: outputBuffer <- R6+0 
    ldr r1, [sp, #16]   @ 4-byte Reload 
    ldr r0, [sp, #12]   @ 4-byte Reload 
Ltmp70: 
    lsrs r1, r1, #1 
    lsrs r3, r0, #1 
    mov r0, r5 
    bl _createBitmapContext 
    mov r4, r0 
Ltmp71: 
    @DEBUG_VALUE: context <- R4+0 
    .loc 1 110 30    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30 
    blx _CGBitmapContextCreateImage 
    .loc 1 111 66    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66 
    movw r1, :lower16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4)) 
    .loc 1 110 30    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:110:30 
    mov r5, r0 
Ltmp72: 
    @DEBUG_VALUE: scaledImage <- R5+0 
    .loc 1 111 66    @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:111:66 
    movt r1, :upper16:(L_OBJC_SELECTOR_REFERENCES_4-(LPC4_1+4)) 
    movw r0, :lower16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4)) 
    movt r0, :upper16:(L_OBJC_CLASSLIST_REFERENCES_$_-(LPC4_2+4)) 
LPC4_1: 
    add r1, pc 
LPC4_2: 
    add r0, pc 
    mov r2, r5 
    ldr r1, [r1] 
    ldr r0, [r0] 
    blx _objc_msgSend 
Ltmp73: 
    @DEBUG_VALUE: returnImage <- R0+0 
    @ InlineAsm Start 
    mov r7, r7  @ marker for objc_retainAutoreleaseReturnValue 
    @ InlineAsm End 
    blx _objc_retainAutoreleasedReturnValue 
Ltmp74: 
    mov r8, r0 
    .loc 1 112 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:112:5 
    mov r0, r5 
    blx _CGImageRelease 
    .loc 1 113 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:113:5 
    mov r0, r4 
    blx _CGContextRelease 
    .loc 1 114 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:114:5 
    ldr r0, [sp, #8]   @ 4-byte Reload 
    blx _CFRelease 
    .loc 1 115 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:115:5 
    mov r0, r6 
    blx _free 
Ltmp75: 
    .loc 1 118 1     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:118:1 
    mov r0, r8 
    add sp, #20 
    pop.w {r8, r10, r11} 
    pop.w {r4, r5, r6, r7, lr} 
Ltmp76: 
    b.w _objc_autoreleaseReturnValue 
Ltmp77: 
Lfunc_end4: 
    .cfi_endproc 

    .align 2 
    .code 16      @ @resizeRow 
    .thumb_func _resizeRow 
_resizeRow: 
    .cfi_startproc 
Lfunc_begin5: 
    .loc 1 26 0     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:26:0 
@ BB#0: 
    @DEBUG_VALUE: resizeRow:dst <- R0+0 
    @DEBUG_VALUE: resizeRow:src <- R1+0 
    @DEBUG_VALUE: resizeRow:pixelsPerRow <- R2+0 
    .loc 1 27 47 prologue_end @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:27:47 
    add.w r3, r1, r2, lsl #2 
Ltmp78: 
    @DEBUG_VALUE: rowB <- R3+0 
    .loc 1 30 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:30:5 
    bic r2, r2, #7 
Ltmp79: 
    .loc 1 32 5     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5 
    @ InlineAsm Start 
    Lresizeloop:      
vld1.32  {d0-d3}, [r1]!  
vld1.32  {d4-d7}, [r3]!  
vhadd.u8  q0, q0, q2   
vhadd.u8  q1, q1, q3   
vtrn.32  q0, q2    
vtrn.32  q1, q3    
vhadd.u8  q0, q0, q2   
vhadd.u8  q1, q1, q3   
vtrn.32  d0, d1    
vtrn.32  d2, d3    
vswp   d1, d2    
vst1.64  {d0-d1}, [r0]!  
subs   r2, r2, #8   
bne   Lresizeloop   

    @ InlineAsm End 
Ltmp80: 
    .loc 1 51 1     @ NEON-Image-Downscaling/ImageResize/BDPViewController.m:51:1 
    bx lr 
Ltmp81: 
Lfunc_end5: 
    .cfi_endproc 
+1

¿Por qué no publicas el código generado? –

+0

Esos dos listados parecen idénticos. ¿Es esta la salida en ensamblador del compilador? Intenta usar objdump para obtener el desensamblado de los dos binarios compilados de forma diferente. – auselen

Respuesta

13

He aquí un fragmento del código ensamblador que obtengo de tu proyecto de Xcode con -O2. (Al compilar con -O1, el compilador no se molesta en insertar en línea la función, por lo que no me sorprende que funcione bien.)

Ltmp55: 
    @DEBUG_VALUE: rowIndex <- R3+0 
    .loc 1 101 29    @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:101:29 
    add r8, r12 
    cmp r3, r11 
    .loc 1 32 5     @ /tmp/NEON-Image-Downscaling/ImageResize/BDPViewController.m:32:5 
Ltmp56: 
    @ InlineAsm Start 
    Lresizeloop:      
vld1.32  {d0-d3}, [r4]!  
vld1.32  {d4-d7}, [r5]!  
vhadd.u8  q0, q0, q2   
vhadd.u8  q1, q1, q3   
vtrn.32  q0, q2    
vtrn.32  q1, q3    
vhadd.u8  q0, q0, q2   
vhadd.u8  q1, q1, q3   
vtrn.32  d0, d1    
vtrn.32  d2, d3    
vswp   d1, d2    
vst1.64  {d0-d1}, [r6]!  
subs   r2, r2, #8   
bne   Lresizeloop   

    @ InlineAsm End 
Ltmp57: 
    blo LBB2_2 

¿Ves esa instrucción blo (branch-if-lower, «saltar si es menor») en la última línea? Utiliza los códigos de condición establecidos por cmp r3, r11, justo antes del bloque de ensamblador en línea. Pero, por supuesto, para entonces tu código ensamblador en línea (la instrucción subs) ya ha sobrescrito por completo el registro de códigos de condición. Entonces, ¿es esto un error del compilador? ... ¡No! Simplemente olvidaste decirle al compilador que tu código ensamblador en línea modifica los códigos de condición. Reemplaza

    : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow) 
       : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow) 
       : "q0", "q1", "q2", "q3" 
       ); 

con

    : "=r"(dst), "=r"(src), "=r"(rowB), "=r"(pixelsPerRow) 
       : "0"(dst), "1"(src), "2"(rowB), "3"(pixelsPerRow) 
       : "q0", "q1", "q2", "q3", "cc" 
       ); 

y la salida en ensamblador se corrige sola. No he ejecutado la aplicación, pero apuesto a que verás que ahora todo funciona mejor. :)

+0

Muy bien, quux; eso resolvió el problema. ¡Máxima puntuación! – Tark

Cuestiones relacionadas