While working on an optimization for the FXVis system, I managed to make some code slower just by rearranging some statics it was using. Not one to take something like this lying down, I looked into it, and learned something that seems worth sharing.
The whole issue stemmed from the use of some static values that existed at the top of the function. Here’s the original function. Note that it has statics in various vector formats at the beginning.
// Packs the eight 32-bit floats held in a_ and b_ into eight 16-bit half-float
// bit patterns (a_'s four lanes first, then b_'s) and returns them as one qword.
// The mantissa is truncated to its top 10 bits (no rounding), and any lane that
// the underflow test below flags is forced to zero, sign included.
// NOTE(review): nothing here clamps large exponents or treats Inf/NaN specially;
// the re-biased exponent is simply masked to 5 bits -- confirm callers only pass
// values that fit in half range.
// The //{eN} and //{oN} tags appear to record the SPU pipe (even/odd) and
// latency of the instruction each line maps to.
qword SimdFloatToHalf2(vf32 a_, vf32 b_)
{
// shufb pattern: bytes 1-2 of every word, first from a_ (0x0?) then from b_ (0x1?).
static vu8 shuf_mid16 =
{ 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e };
// shufb pattern: bytes 0-1 (the high halfword) of every word, a_ then b_.
static vu8 shuf_hi16 =
{ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d };
// 0x03ff in every halfword (the 10 half mantissa bits). Unlike the other
// constants this one is produced by an intrinsic call, not a literal.
static vu32 m_mask = (vu32)si_ilh(0x03ff); //{e4}
// Sign bit of every halfword.
static vu32 s_mask = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 };
// e_mask: the 5 half exponent bits of every halfword.
// exp_off: 112 (= 127 - 15) positioned in the exponent field of the 'se' halfword layout.
static vu32 e_mask = { 0x7c007c00, 0x7c007c00, 0x7c007c00, 0x7c007c00 };static vu32 exp_off = { 0x38003800, 0x38003800, 0x38003800, 0x38003800 };
// byte index within each 32-bit float, and the float's bit layout:
// 0 1 2 3
// seeeeeee emmmmmmm mmmmmmmm mmmmmmmm
qword m = si_shufb((qword)a_, (qword)b_, (qword)shuf_mid16); //{o4} m in hword | emmmmmmm mmmmmmmm |
m = si_rothmi(m, -5); //{e4} | 00000emm mmmmmmmm |
m = si_and(m, (qword)m_mask); //{e2} | 000000mm mmmmmmmm |
qword se = si_shufb((qword)a_, (qword)b_, (qword)shuf_hi16); //{o4} se in hword | seeeeeee emmmmmmm |
qword s = si_and(se, (qword)s_mask); //{e2} mask | s0000000 00000000 |
qword e = si_andc(se, (qword)s_mask); //{e2} mask | 0eeeeeee emmmmmmm |
// Underflow mask: all ones in every halfword where exp_off > e.
// NOTE(review): cgth is a strict greater-than, so this flags e < 0x3800; the
// original note read "(e <= 112 ?)", which looks off by one -- confirm intent.
qword d_mask = si_cgth((qword)exp_off, e); //{e2} underflow test (exp_off > e)
// sfh subtracts its first operand from its second, i.e. e - exp_off.
e = si_sfh((qword)exp_off, e); //{e2} correct exponent (e = ((e - 127) + 15))
e = si_shlhi(e, 3); //{e4} e <<= 3 | eeeeeemm mmmmm000 |
e = si_and(e, (qword)e_mask); //{e4} mask | 0eeeee00 00000000 |
qword h = si_or(s, m); //{e2} s | m
h = si_or(h, e); //{e2} s | e | m
h = si_andc(h, d_mask); //{e2} set to 0 if underflow
// Tally of the tags above: 11 even-pipe instructions, 2 odd-pipe (the shuffles).
// e11, o2
// load/store odd
// shuffle odd
return h;
}
Here’s how this function disassembled:
00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>:
0: 30 80 00 02 lqa $2,0
4: 30 80 00 05 lqa $5,0
8: 30 80 00 06 lqa $6,0
c: 30 80 00 07 lqa $7,0
10: 3e 80 00 89 cbd $9,0($1)
14: b0 a1 01 85 shufb $5,$3,$4,$5
18: b0 61 01 82 shufb $3,$3,$4,$2
1c: 30 80 00 04 lqa $4,0
20: 58 21 82 82 andc $2,$5,$6
24: 0f be c1 88 rothmi $8,$3,-5
28: 09 00 83 83 sfh $3,$7,$2
2c: 00 20 00 00 lnop
30: 49 00 83 87 cgth $7,$7,$2
34: 30 80 00 02 lqa $2,0
38: 0f e0 c1 83 shlhi $3,$3,3
3c: 3f 83 42 04 rotqbyi $4,$4,13
40: 18 21 82 86 and $6,$5,$6
44: 7e 00 02 04 ceqbi $4,$4,0
48: 18 20 c1 05 and $5,$2,$3
4c: 56 c0 02 04 xsbh $4,$4
50: 40 20 00 7f nop $127
54: 22 00 03 84 brhz $4,70 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x70> # 70
58: 41 81 ff 82 ilh $2,1023 # 3ff
5c: 30 80 00 03 lqa $3,0
60: 20 80 00 02 stqa $2,0
64: 40 80 00 82 il $2,1
68: b0 60 c1 09 shufb $3,$2,$3,$9
6c: 20 80 00 03 stqa $3,0
70: 30 80 00 03 lqa $3,0
74: 18 22 01 83 and $3,$3,$8
78: 08 20 c3 03 or $3,$6,$3
7c: 08 21 41 83 or $3,$3,$5
80: 58 21 c1 83 andc $3,$3,$7
84: 35 00 00 00 bi $0
In trying to get this code cleaner and more consistent, I switched the statics to qwords (to match the rest of the code in the function) like so:
// Same six constants as the original listing, now all typed qword by casting
// a vector literal (or, for m_mask, by calling an intrinsic).
// NOTE(review): the disassembly that follows contains six separate
// load-flag / test / branch / store sequences, one per static, which looks like
// guarded first-call initialisation -- presumably the compiler does not treat
// these cast initialisers as constant data. Confirm against the compiler docs.
static qword shuf_mid16 = (qword)(vu8){ 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e };
static qword shuf_hi16 = (qword)(vu8){ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d };
static qword s_mask = (qword)(vu32){ 0x80008000, 0x80008000, 0x80008000, 0x80008000 };
static qword e_mask = (qword)(vu32){ 0x7c007c00, 0x7c007c00, 0x7c007c00, 0x7c007c00 }; //@@@ calc this in code!
static qword exp_off = (qword)(vu32){ 0x38003800, 0x38003800, 0x38003800, 0x38003800 }; //@@@ calc this in code!
static qword m_mask = si_ilh(0x03ff); //{e4}
This had… undesirable effects on the code generated. To show just how much the compiler dislikes this syntax, here’s the disassembly of the same function with only the above changes made to it (all lines of actual code were untouched; only these static declarations were reformatted):
00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>:
0: 04 00 01 87 ori $7,$3,0
4: 30 80 00 02 lqa $2,0
8: 04 00 02 05 ori $5,$4,0
c: 3f 83 41 02 rotqbyi $2,$2,13
10: 7e 00 01 02 ceqbi $2,$2,0
14: 56 c0 01 02 xsbh $2,$2
18: 22 00 04 02 brhz $2,38 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x38> # 38
1c: 30 80 00 02 lqa $2,0
20: 30 80 00 03 lqa $3,0
24: 3e 80 00 84 cbd $4,0($1)
28: 20 80 00 02 stqa $2,0
2c: 40 80 00 82 il $2,1
30: b0 60 c1 04 shufb $3,$2,$3,$4
34: 20 80 00 03 stqa $3,0
38: 30 80 00 02 lqa $2,0
3c: 3f 83 41 02 rotqbyi $2,$2,13
40: 7e 00 01 02 ceqbi $2,$2,0
44: 56 c0 01 02 xsbh $2,$2
48: 22 00 04 02 brhz $2,68 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x68> # 68
4c: 30 80 00 02 lqa $2,0
50: 30 80 00 03 lqa $3,0
54: 3e 80 00 84 cbd $4,0($1)
58: 20 80 00 02 stqa $2,0
5c: 40 80 00 82 il $2,1
60: b0 60 c1 04 shufb $3,$2,$3,$4
64: 20 80 00 03 stqa $3,0
68: 30 80 00 02 lqa $2,0
6c: 3f 83 41 02 rotqbyi $2,$2,13
70: 7e 00 01 02 ceqbi $2,$2,0
74: 56 c0 01 02 xsbh $2,$2
78: 23 00 1e 02 brhnz $2,168 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x168> # 168
7c: 30 80 00 02 lqa $2,0
80: 3f 83 41 02 rotqbyi $2,$2,13
84: 7e 00 01 02 ceqbi $2,$2,0
88: 56 c0 01 02 xsbh $2,$2
8c: 22 00 04 02 brhz $2,ac <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0xac> # ac
90: 41 c0 00 02 ilh $2,32768 # 8000
94: 30 80 00 03 lqa $3,0
98: 3e 80 00 84 cbd $4,0($1)
9c: 20 80 00 02 stqa $2,0
a0: 40 80 00 82 il $2,1
a4: b0 60 c1 04 shufb $3,$2,$3,$4
a8: 20 80 00 03 stqa $3,0
ac: 30 80 00 02 lqa $2,0
b0: 3f 83 41 02 rotqbyi $2,$2,13
b4: 7e 00 01 02 ceqbi $2,$2,0
b8: 56 c0 01 02 xsbh $2,$2
bc: 22 00 04 02 brhz $2,dc <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0xdc> # dc
c0: 41 be 00 02 ilh $2,31744 # 7c00
c4: 30 80 00 03 lqa $3,0
c8: 3e 80 00 84 cbd $4,0($1)
cc: 20 80 00 02 stqa $2,0
d0: 40 80 00 82 il $2,1
d4: b0 60 c1 04 shufb $3,$2,$3,$4
d8: 20 80 00 03 stqa $3,0
dc: 30 80 00 02 lqa $2,0
e0: 3f 83 41 02 rotqbyi $2,$2,13
e4: 7e 00 01 02 ceqbi $2,$2,0
e8: 56 c0 01 02 xsbh $2,$2
ec: 22 00 04 02 brhz $2,10c <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x10c> # 10c
f0: 41 9c 00 02 ilh $2,14336 # 3800
f4: 30 80 00 03 lqa $3,0
f8: 3e 80 00 84 cbd $4,0($1)
fc: 20 80 00 02 stqa $2,0
100: 40 80 00 82 il $2,1
104: b0 60 c1 04 shufb $3,$2,$3,$4
108: 20 80 00 03 stqa $3,0
10c: 30 80 00 03 lqa $3,0
110: 30 80 00 02 lqa $2,0
114: 30 80 00 04 lqa $4,0
118: 30 80 00 06 lqa $6,0
11c: 35 80 00 12 hbr 164 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x164>,$0
120: b0 61 43 83 shufb $3,$7,$5,$3
124: b0 41 43 82 shufb $2,$7,$5,$2
128: 58 21 01 87 andc $7,$3,$4
12c: 00 20 00 00 lnop
130: 18 21 01 83 and $3,$3,$4
134: 30 80 00 04 lqa $4,0
138: 0f be c1 02 rothmi $2,$2,-5
13c: 09 01 c3 05 sfh $5,$6,$7
140: 49 01 c3 06 cgth $6,$6,$7
144: 0f e0 c2 85 shlhi $5,$5,3
148: 18 20 82 04 and $4,$4,$2
14c: 30 80 00 02 lqa $2,0
150: 08 21 01 83 or $3,$3,$4
154: 18 21 41 02 and $2,$2,$5
158: 08 20 81 83 or $3,$3,$2
15c: 58 21 81 83 andc $3,$3,$6
160: 40 20 00 7f nop $127
164: 35 00 00 00 bi $0
168: 41 81 ff 82 ilh $2,1023 # 3ff
16c: 10 00 00 09 hbra 190 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x190>,0
170: 30 80 00 03 lqa $3,0
174: 3e 80 00 84 cbd $4,0($1)
178: 20 80 00 02 stqa $2,0
17c: 40 80 00 82 il $2,1
180: b0 60 c1 04 shufb $3,$2,$3,$4
184: 40 20 00 7f nop $127
188: 40 20 00 7f nop $127
18c: 20 80 00 03 stqa $3,0
190: 32 7f dd 80 br 7c <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x7c> # 7c
194: 00 20 00 00 lnop
Yup, lots of lovely badness, including a handful of lovely branches in our branchless code.
My next step was to simply move the statics out of the function. That’s the only change made to get this disassembly:
00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>:
0: 04 00 01 85 ori $5,$3,0
4: 30 80 00 03 lqa $3,0
8: 30 80 00 02 lqa $2,0
c: 30 80 00 09 lqa $9,0
10: 30 80 00 07 lqa $7,0
14: 35 80 00 10 hbr 54 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x54>,$0
18: b0 61 02 83 shufb $3,$5,$4,$3
1c: b0 a1 02 82 shufb $5,$5,$4,$2
20: 41 81 ff 84 ilh $4,1023 # 3ff
24: 30 80 00 02 lqa $2,0
28: 58 22 41 88 andc $8,$3,$9
2c: 0f be c2 85 rothmi $5,$5,-5
30: 09 02 03 86 sfh $6,$7,$8
34: 18 22 41 83 and $3,$3,$9
38: 0f e0 c3 06 shlhi $6,$6,3
3c: 18 21 42 04 and $4,$4,$5
40: 49 02 03 87 cgth $7,$7,$8
44: 08 21 01 83 or $3,$3,$4
48: 18 21 81 02 and $2,$2,$6
4c: 08 20 81 83 or $3,$3,$2
50: 58 21 c1 83 andc $3,$3,$7
54: 35 00 00 00 bi $0
Interesting: not only did all the badness go away, but the function is also now almost half the size (and the branch that was originally hanging out in the middle of the function is gone, too).
The last change I tried before calling it a day on this was to put the value declarations back into the function, but as const, rather than static, like so:
// Same six constants declared as function-local const values rather than
// statics. In the listing that follows there are no initialisation branches,
// and the three splat masks plus m_mask are built with ilh instead of loaded.
const qword shuf_mid16 = (qword)(vu8){ 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e };
const qword shuf_hi16 = (qword)(vu8){ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d };
const qword s_mask = (qword)(vu32){ 0x80008000, 0x80008000, 0x80008000, 0x80008000 };
const qword e_mask = (qword)(vu32){ 0x7c007c00, 0x7c007c00, 0x7c007c00, 0x7c007c00 }; //@@@ calc this in code!
const qword exp_off = (qword)(vu32){ 0x38003800, 0x38003800, 0x38003800, 0x38003800 }; //@@@ calc this in code!
const qword m_mask = si_ilh(0x03ff); //{e4}
This change ended up yielding the best results of all the setups:
00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>:
0: 04 00 01 85 ori $5,$3,0
4: 30 80 00 03 lqa $3,0
8: 41 c0 00 09 ilh $9,32768 # 8000
c: 30 80 00 02 lqa $2,0
10: 41 9c 00 07 ilh $7,14336 # 3800
14: 35 80 00 10 hbr 54 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x54>,$0
18: b0 61 02 83 shufb $3,$5,$4,$3
1c: b0 a1 02 82 shufb $5,$5,$4,$2
20: 41 81 ff 84 ilh $4,1023 # 3ff
24: 41 be 00 02 ilh $2,31744 # 7c00
28: 58 22 41 88 andc $8,$3,$9
2c: 0f be c2 85 rothmi $5,$5,-5
30: 09 02 03 86 sfh $6,$7,$8
34: 18 22 41 83 and $3,$3,$9
38: 0f e0 c3 06 shlhi $6,$6,3
3c: 18 21 42 04 and $4,$4,$5
40: 49 02 03 87 cgth $7,$7,$8
44: 08 21 01 83 or $3,$3,$4
48: 18 21 81 02 and $2,$2,$6
4c: 08 20 81 83 or $3,$3,$2
50: 58 21 c1 83 andc $3,$3,$7
54: 35 00 00 00 bi $0
Notice that it’s the same number of instructions as when the statics were moved out of the function, but a few of the lqa instructions were automatically changed to ilh instructions to better balance between even/odd.
So, the moral of the story is: be wary of using statics within a function. Use const instead. Also, for values that you know can be easily constructed by a single instruction such as ilh, just leave the value in there so the compiler can load it or construct it as needed to better balance your function.
As a small addendum to the problem of this function, Mike Day optimized that function over the last weekend as just something to do. He managed to make it four instructions shorter (six if it’s inlined, as there’s a nop and an lnop).
Here’s Mike’s version of the function:
// Shorter variant of the float -> half packer: takes the eight floats in a_ and
// b_ and returns eight 16-bit half-float bit patterns (a_'s lanes first, then
// b_'s) in one qword. Instead of extracting, re-biasing and re-assembling the
// exponent, it shifts the whole input left by 3 bits so the wanted exponent and
// mantissa bits already sit in the high halfword, then fixes up two bits.
// As in the bit diagrams below, the mantissa is truncated, and lanes flagged by
// the underflow test are forced to zero.
// NOTE(review): no clamping or Inf/NaN handling is visible here -- confirm
// callers only pass values that fit in half range.
qword SimdFloatToHalf( vf32 a_, vf32 b_ )
{
// shufb pattern: high halfword (bytes 0-1) of every word, first operand then second.
const qword shuf_hi16 = (qword)(vu16){ 0x0001, 0x0405, 0x0809, 0x0c0d, 0x1011, 0x1415, 0x1819, 0x1c1d };
// Sign bit of every halfword.
const qword s_mask = (qword)(vu16){ 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 };
// Bit 14 of every halfword: XORed in below to invert the bit shown as e4 -> e4'.
const qword blah = (qword)(vu16){ 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000 };
// Underflow threshold, compared against the sign-stripped high halfword.
const qword exp_off = (qword)(vu16){ 0x3800, 0x3800, 0x3800, 0x3800, 0x3800, 0x3800, 0x3800, 0x3800 };
// Despite the _rot5 names these are 3-bit left shifts of the whole quadword.
// Bits carried in from the neighbouring word land in the low halfword, which
// the shuffle below discards.
qword a_rot5 = si_shlqbii((qword)a_, 3);
qword b_rot5 = si_shlqbii((qword)b_, 3);
qword mid = si_shufb(a_rot5, b_rot5, shuf_hi16); // mid = | e5 e4 e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 m16 m15 m14 |
qword hi = si_shufb((qword)a_, (qword)b_, shuf_hi16); // hi = | s e7 e6 e5 e4 e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 |
qword h = si_xor(mid, blah); // h = | e5 e4' e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 m16 m15 m14 |
qword e = si_andc(hi, s_mask); // e = | 0 e7 e6 e5 e4 e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 |
// Take the sign bit from hi (where s_mask is set) and every other bit from h.
h = si_selb(h, hi, s_mask); // h = | s e4' e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 m16 m15 m14 |
// All ones in every halfword where exp_off > e (strict signed compare).
qword d_mask = si_cgth(exp_off, e);
h = si_andc(h, d_mask); // set to 0 if underflow
return h;
}
And here’s what it compiles into:
00000000 <_ZN3FXV15SimdFloatToHalfEU8__vectorfS0_>:
0: 04 00 01 86 ori $6,$3,0
4: 35 80 00 0f hbr 40 <_ZN3FXV15SimdFloatToHalfEU8__vectorfS0_+0x40>,$0
8: 41 c0 00 08 ilh $8,32768 # 8000
c: 30 80 00 07 lqa $7,0
10: 3f 60 c2 02 shlqbii $2,$4,3
14: 3f 60 c1 83 shlqbii $3,$3,3
18: 40 20 00 7f nop $127
1c: b0 c1 03 07 shufb $6,$6,$4,$7
20: 41 9c 00 04 ilh $4,14336 # 3800
24: b0 60 81 87 shufb $3,$3,$2,$7
28: 41 a0 00 02 ilh $2,16384 # 4000
2c: 58 22 03 05 andc $5,$6,$8
30: 48 20 81 83 xor $3,$3,$2
34: 49 01 42 04 cgth $4,$4,$5
38: 80 61 81 88 selb $3,$3,$6,$8
3c: 58 21 01 83 andc $3,$3,$4
40: 35 00 00 00 bi $0
44: 00 20 00 00 lnop