Skip to content

[BUG] Performance regressions in NDK r23b : arm64 vectorization bug (needs cherrypick) #1619

@Over17

Description

@Over17

I'm working at upgrading our custom build system from NDK r21d to r23b, and facing a number of performance regressions.

A weird one is a regression in strstr(), happens only in AArch64:
image

More or less, it's 2x slower (orange bars are runs on my branch). Our code that is being benchmarked is a simple wrapper around libc's strstr().

Assembly in libc.a for api30, NDK r21d:

Disassembly of section .text.strstr:

0000000000000000 <strstr>:
       0: f7 0f 1c f8  	str	x23, [sp, #-64]!
       4: f6 57 01 a9  	stp	x22, x21, [sp, #16]
       8: f4 4f 02 a9  	stp	x20, x19, [sp, #32]
       c: fd 7b 03 a9  	stp	x29, x30, [sp, #48]
      10: fd c3 00 91  	add	x29, sp, #48
      14: f3 03 01 aa  	mov	x19, x1
      18: 77 16 40 38  	ldrb	w23, [x19], #1
      1c: f4 03 00 aa  	mov	x20, x0
      20: f7 01 00 34  	cbz	w23, 0x5c <strstr+0x5c>
      24: e0 03 13 aa  	mov	x0, x19
      28: 00 00 00 94  	bl	0x28 <strstr+0x28>
      2c: f6 03 00 aa  	mov	x22, x0
      30: f5 03 14 aa  	mov	x21, x20
      34: 88 16 40 38  	ldrb	w8, [x20], #1
      38: 68 01 00 34  	cbz	w8, 0x64 <strstr+0x64>
      3c: 1f 01 17 6b  	cmp	w8, w23
      40: 81 ff ff 54  	b.ne	0x30 <strstr+0x30>
      44: e0 03 14 aa  	mov	x0, x20
      48: e1 03 13 aa  	mov	x1, x19
      4c: e2 03 16 aa  	mov	x2, x22
      50: 00 00 00 94  	bl	0x50 <strstr+0x50>
      54: e0 fe ff 35  	cbnz	w0, 0x30 <strstr+0x30>
      58: 04 00 00 14  	b	0x68 <strstr+0x68>
      5c: f5 03 14 aa  	mov	x21, x20
      60: 02 00 00 14  	b	0x68 <strstr+0x68>
      64: f5 03 1f aa  	mov	x21, xzr
      68: e0 03 15 aa  	mov	x0, x21
      6c: fd 7b 43 a9  	ldp	x29, x30, [sp, #48]
      70: f4 4f 42 a9  	ldp	x20, x19, [sp, #32]
      74: f6 57 41 a9  	ldp	x22, x21, [sp, #16]
      78: f7 07 44 f8  	ldr	x23, [sp], #64
      7c: c0 03 5f d6  	ret

Looked at the assembly of libc.a for api31, NDK r23b:

Disassembly of section .text.strstr:

0000000000000000 <strstr>:
       0: 3f 23 03 d5  	paciasp
       4: fd 7b be a9  	stp	x29, x30, [sp, #-32]!
       8: f3 0b 00 f9  	str	x19, [sp, #16]
       c: fd 03 00 91  	mov	x29, sp
      10: f3 03 01 aa  	mov	x19, x1
      14: 21 00 40 39  	ldrb	w1, [x1]
      18: 01 07 00 34  	cbz	w1, 0xf8 <strstr+0xf8>
      1c: 00 00 00 94  	bl	0x1c <strstr+0x1c>
      20: c0 06 00 b4  	cbz	x0, 0xf8 <strstr+0xf8>
      24: 68 06 40 39  	ldrb	w8, [x19, #1]
      28: 88 06 00 34  	cbz	w8, 0xf8 <strstr+0xf8>
      2c: 09 04 40 39  	ldrb	w9, [x0, #1]
      30: 29 06 00 34  	cbz	w9, 0xf4 <strstr+0xf4>
      34: 6a 0a 40 39  	ldrb	w10, [x19, #2]
      38: ca 01 00 34  	cbz	w10, 0x70 <strstr+0x70>
      3c: 0b 08 40 39  	ldrb	w11, [x0, #2]
      40: ab 05 00 34  	cbz	w11, 0xf4 <strstr+0xf4>
      44: 6c 0e 40 39  	ldrb	w12, [x19, #3]
      48: 2c 03 00 34  	cbz	w12, 0xac <strstr+0xac>
      4c: 08 0c 40 39  	ldrb	w8, [x0, #3]
      50: 28 05 00 34  	cbz	w8, 0xf4 <strstr+0xf4>
      54: 68 12 40 39  	ldrb	w8, [x19, #4]
      58: 88 05 00 34  	cbz	w8, 0x108 <strstr+0x108>
      5c: e1 03 13 aa  	mov	x1, x19
      60: f3 0b 40 f9  	ldr	x19, [sp, #16]
      64: fd 7b c2 a8  	ldp	x29, x30, [sp], #32
      68: bf 23 03 d5  	autiasp
      6c: 00 00 00 14  	b	0x6c <strstr+0x6c>
      70: 6a 02 40 39  	ldrb	w10, [x19]
      74: 0b 00 40 39  	ldrb	w11, [x0]
      78: 48 1d 18 33  	bfi	w8, w10, #8, #8
      7c: 69 1d 18 33  	bfi	w9, w11, #8, #8
      80: 3f 01 08 6b  	cmp	w9, w8
      84: a0 03 00 54  	b.eq	0xf8 <strstr+0xf8>
      88: 0a 08 40 39  	ldrb	w10, [x0, #2]
      8c: 00 04 00 91  	add	x0, x0, #1
      90: 0a 03 00 34  	cbz	w10, 0xf0 <strstr+0xf0>
      94: eb 03 0a 2a  	mov	w11, w10
      98: 2b 1d 18 33  	bfi	w11, w9, #8, #8
      9c: 7f 01 08 6b  	cmp	w11, w8
      a0: e9 03 0b 2a  	mov	w9, w11
      a4: 21 ff ff 54  	b.ne	0x88 <strstr+0x88>
      a8: 12 00 00 14  	b	0xf0 <strstr+0xf0>
      ac: 6c 02 40 39  	ldrb	w12, [x19]
      b0: 0d 00 40 39  	ldrb	w13, [x0]
      b4: 08 3d 10 53  	lsl	w8, w8, #16
      b8: 29 3d 10 53  	lsl	w9, w9, #16
      bc: 48 1d 18 33  	bfi	w8, w10, #8, #8
      c0: 69 1d 18 33  	bfi	w9, w11, #8, #8
      c4: 88 1d 08 33  	bfi	w8, w12, #24, #8
      c8: a9 1d 08 33  	bfi	w9, w13, #24, #8
      cc: 3f 01 08 6b  	cmp	w9, w8
      d0: 40 01 00 54  	b.eq	0xf8 <strstr+0xf8>
      d4: 0a 0c 40 39  	ldrb	w10, [x0, #3]
      d8: 00 04 00 91  	add	x0, x0, #1
      dc: aa 00 00 34  	cbz	w10, 0xf0 <strstr+0xf0>
      e0: 29 01 0a 2a  	orr	w9, w9, w10
      e4: 29 5d 18 53  	lsl	w9, w9, #8
      e8: 3f 01 08 6b  	cmp	w9, w8
      ec: 41 ff ff 54  	b.ne	0xd4 <strstr+0xd4>
      f0: 4a 00 00 35  	cbnz	w10, 0xf8 <strstr+0xf8>
      f4: e0 03 1f aa  	mov	x0, xzr
      f8: f3 0b 40 f9  	ldr	x19, [sp, #16]
      fc: fd 7b c2 a8  	ldp	x29, x30, [sp], #32
     100: bf 23 03 d5  	autiasp
     104: c0 03 5f d6  	ret
     108: e1 03 13 aa  	mov	x1, x19
     10c: f3 0b 40 f9  	ldr	x19, [sp, #16]
     110: fd 7b c2 a8  	ldp	x29, x30, [sp], #32
     114: bf 23 03 d5  	autiasp
     118: 00 00 00 14  	b	0x118 <strstr+0x118> 

The only expected change I'm seeing are PAC/BTI instructions, all the rest I'm having hard times to follow to be honest.

Is it known? Expected?

Another (but a much smaller: ~10%) regression happens in this simple code:

bool IsAllUnderscores(core::string_ref s)
{
    for (size_t i = 0, n = s.size(); i != n; ++i)
        if (s[i] != '_')
            return false;
    return true;
}

which becomes

0000000000a7fad4 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE>:
  a7fad4: 08 04 40 f9  	ldr	x8, [x0, #8]
  a7fad8: 08 01 00 b4  	cbz	x8, 0xa7faf8 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE+0x24>
  a7fadc: 09 00 40 f9  	ldr	x9, [x0]
  a7fae0: 2a 01 40 39  	ldrb	w10, [x9]
  a7fae4: 5f 7d 01 71  	cmp	w10, #95
  a7fae8: c1 00 00 54  	b.ne	0xa7fb00 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE+0x2c>
  a7faec: 08 05 00 f1  	subs	x8, x8, #1
  a7faf0: 29 05 00 91  	add	x9, x9, #1
  a7faf4: 61 ff ff 54  	b.ne	0xa7fae0 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE+0xc>
  a7faf8: 20 00 80 52  	mov	w0, #1
  a7fafc: c0 03 5f d6  	ret
  a7fb00: e0 03 1f 2a  	mov	w0, wzr
  a7fb04: c0 03 5f d6  	ret 

with r21d, and

000000000103d6c4 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE>:
 103d6c4: 09 04 40 f9  	ldr	x9, [x0, #8]
 103d6c8: 69 01 00 b4  	cbz	x9, 0x103d6f4 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE+0x30>
 103d6cc: 08 00 40 f9  	ldr	x8, [x0]
 103d6d0: 29 05 00 d1  	sub	x9, x9, #1
 103d6d4: 0a 15 40 38  	ldrb	w10, [x8], #1
 103d6d8: 5f 7d 01 71  	cmp	w10, #95
 103d6dc: e0 17 9f 1a  	cset	w0, eq
 103d6e0: 89 00 00 b4  	cbz	x9, 0x103d6f0 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE+0x2c>
 103d6e4: 5f 7d 01 71  	cmp	w10, #95
 103d6e8: 29 05 00 d1  	sub	x9, x9, #1
 103d6ec: 40 ff ff 54  	b.eq	0x103d6d4 <_Z16IsAllUnderscoresN4core16basic_string_refIcEE+0x10>
 103d6f0: c0 03 5f d6  	ret
 103d6f4: 20 00 80 52  	mov	w0, #1
 103d6f8: c0 03 5f d6  	ret 

with r23b. There's definitely a redundant cmp w10, 95. Godbolt suggests trunk LLVM is better at codegen.

Overall, due to perf issues introduced by https://reviews.llvm.org/D84108, we are seeing a number of cases where the code is no longer vectorized where it used to. I can try build up few cases if you're interested, a bit more complicated than here though.

Are there plans to update LLVM in NDK r23? It's LTS after all.
If not, what's the ETA for r24 then?

Thanks!

Metadata

Metadata

Assignees

Labels

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions