________________________________________ From: Carl Glave Sent: Friday, August 10, 2007 5:39 PM To: tech Cc: SPU Ninja Subject: Beware Statics Inside Functions! While working on an optimization for the FXVis system, I managed to make some code slower just by rearranging some statics it was using. Not one to take something like this lying down, I looked into it, and learned something that seems worth sharing. The whole issue stemmed from the use of some static values that existed at the top of the function. Here’s the original function. Note that it has statics in various vector formats at the beginning. qword SimdFloatToHalf2(vf32 a_, vf32 b_) { static vu8 shuf_mid16 = { 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e }; static vu8 shuf_hi16 = { 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d }; static vu32 m_mask = (vu32)si_ilh(0x03ff); //{e4} static vu32 s_mask = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 }; static vu32 e_mask = { 0x7c007c00, 0x7c007c00, 0x7c007c00, 0x7c007c00 }; //@@@ calc this in code! static vu32 exp_off = { 0x38003800, 0x38003800, 0x38003800, 0x38003800 }; //@@@ calc this in code! // 0 1 2 3 // seeeeeee emmmmmmm mmmmmmmm mmmmmmmm qword m = si_shufb((qword)a_, (qword)b_, (qword)shuf_mid16); //{o4} m in hword | emmmmmmm mmmmmmmm | m = si_rothmi(m, -5); //{e4} | 00000emm mmmmmmmm | m = si_and(m, (qword)m_mask); //{e2} | 000000mm mmmmmmmm | qword se = si_shufb((qword)a_, (qword)b_, (qword)shuf_hi16); //{o4} se in hword | seeeeeee emmmmmmm | qword s = si_and(se, (qword)s_mask); //{e2} mask | s0000000 00000000 | qword e = si_andc(se, (qword)s_mask); //{e2} mask | 0eeeeeee emmmmmmm | qword d_mask = si_cgth((qword)exp_off, e); //{e2} (e <= 112 ?) 
(if e <= 0x3800, then exp_off would make <= 0) e = si_sfh((qword)exp_off, e); //{e2} correct exponent (e = ((e - 127) + 15)) e = si_shlhi(e, 3); //{e4} e <<= 3 | eeeeeemm mmmmm000 | e = si_and(e, (qword)e_mask); //{e4} mask | 0eeeee00 00000000 | qword h = si_or(s, m); //{e2} s | m h = si_or(h, e); //{e2} s | e | m h = si_andc(h, d_mask); //{e2} set to 0 if underflow // e11, o2 // load/store odd // shuffle odd return h; } Here’s how this function disassembled: 00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>: 0: 30 80 00 02 lqa $2,0 4: 30 80 00 05 lqa $5,0 8: 30 80 00 06 lqa $6,0 c: 30 80 00 07 lqa $7,0 10: 3e 80 00 89 cbd $9,0($1) 14: b0 a1 01 85 shufb $5,$3,$4,$5 18: b0 61 01 82 shufb $3,$3,$4,$2 1c: 30 80 00 04 lqa $4,0 20: 58 21 82 82 andc $2,$5,$6 24: 0f be c1 88 rothmi $8,$3,-5 28: 09 00 83 83 sfh $3,$7,$2 2c: 00 20 00 00 lnop 30: 49 00 83 87 cgth $7,$7,$2 34: 30 80 00 02 lqa $2,0 38: 0f e0 c1 83 shlhi $3,$3,3 3c: 3f 83 42 04 rotqbyi $4,$4,13 40: 18 21 82 86 and $6,$5,$6 44: 7e 00 02 04 ceqbi $4,$4,0 48: 18 20 c1 05 and $5,$2,$3 4c: 56 c0 02 04 xsbh $4,$4 50: 40 20 00 7f nop $127 54: 22 00 03 84 brhz $4,70 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x70> # 70 58: 41 81 ff 82 ilh $2,1023 # 3ff 5c: 30 80 00 03 lqa $3,0 60: 20 80 00 02 stqa $2,0 64: 40 80 00 82 il $2,1 68: b0 60 c1 09 shufb $3,$2,$3,$9 6c: 20 80 00 03 stqa $3,0 70: 30 80 00 03 lqa $3,0 74: 18 22 01 83 and $3,$3,$8 78: 08 20 c3 03 or $3,$6,$3 7c: 08 21 41 83 or $3,$3,$5 80: 58 21 c1 83 andc $3,$3,$7 84: 35 00 00 00 bi $0 In trying to get this code cleaner and more consistent, I switched the statics to qwords (to match the rest of the code in the function) like so: static qword shuf_mid16 = (qword)(vu8){ 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e }; static qword shuf_hi16 = (qword)(vu8){ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d }; static qword s_mask = (qword)(vu32){ 0x80008000, 0x80008000, 
0x80008000, 0x80008000 }; static qword e_mask = (qword)(vu32){ 0x7c007c00, 0x7c007c00, 0x7c007c00, 0x7c007c00 }; //@@@ calc this in code! static qword exp_off = (qword)(vu32){ 0x38003800, 0x38003800, 0x38003800, 0x38003800 }; //@@@ calc this in code! static qword m_mask = si_ilh(0x03ff); //{e4} This had… undesirable effects on the code generated. To show just how much the compiler dislikes this syntax, here’s the disassembly of the same function with only the above changes made to it (all lines of actual code were untouched; only these static declarations were reformatted): 00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>: 0: 04 00 01 87 ori $7,$3,0 4: 30 80 00 02 lqa $2,0 8: 04 00 02 05 ori $5,$4,0 c: 3f 83 41 02 rotqbyi $2,$2,13 10: 7e 00 01 02 ceqbi $2,$2,0 14: 56 c0 01 02 xsbh $2,$2 18: 22 00 04 02 brhz $2,38 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x38> # 38 1c: 30 80 00 02 lqa $2,0 20: 30 80 00 03 lqa $3,0 24: 3e 80 00 84 cbd $4,0($1) 28: 20 80 00 02 stqa $2,0 2c: 40 80 00 82 il $2,1 30: b0 60 c1 04 shufb $3,$2,$3,$4 34: 20 80 00 03 stqa $3,0 38: 30 80 00 02 lqa $2,0 3c: 3f 83 41 02 rotqbyi $2,$2,13 40: 7e 00 01 02 ceqbi $2,$2,0 44: 56 c0 01 02 xsbh $2,$2 48: 22 00 04 02 brhz $2,68 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x68> # 68 4c: 30 80 00 02 lqa $2,0 50: 30 80 00 03 lqa $3,0 54: 3e 80 00 84 cbd $4,0($1) 58: 20 80 00 02 stqa $2,0 5c: 40 80 00 82 il $2,1 60: b0 60 c1 04 shufb $3,$2,$3,$4 64: 20 80 00 03 stqa $3,0 68: 30 80 00 02 lqa $2,0 6c: 3f 83 41 02 rotqbyi $2,$2,13 70: 7e 00 01 02 ceqbi $2,$2,0 74: 56 c0 01 02 xsbh $2,$2 78: 23 00 1e 02 brhnz $2,168 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x168> # 168 7c: 30 80 00 02 lqa $2,0 80: 3f 83 41 02 rotqbyi $2,$2,13 84: 7e 00 01 02 ceqbi $2,$2,0 88: 56 c0 01 02 xsbh $2,$2 8c: 22 00 04 02 brhz $2,ac <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0xac> # ac 90: 41 c0 00 02 ilh $2,32768 # 8000 94: 30 80 00 03 lqa $3,0 98: 3e 80 00 84 cbd $4,0($1) 9c: 20 80 00 02 stqa $2,0 a0: 40 80 00 82 il $2,1 a4: b0 
60 c1 04 shufb $3,$2,$3,$4 a8: 20 80 00 03 stqa $3,0 ac: 30 80 00 02 lqa $2,0 b0: 3f 83 41 02 rotqbyi $2,$2,13 b4: 7e 00 01 02 ceqbi $2,$2,0 b8: 56 c0 01 02 xsbh $2,$2 bc: 22 00 04 02 brhz $2,dc <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0xdc> # dc c0: 41 be 00 02 ilh $2,31744 # 7c00 c4: 30 80 00 03 lqa $3,0 c8: 3e 80 00 84 cbd $4,0($1) cc: 20 80 00 02 stqa $2,0 d0: 40 80 00 82 il $2,1 d4: b0 60 c1 04 shufb $3,$2,$3,$4 d8: 20 80 00 03 stqa $3,0 dc: 30 80 00 02 lqa $2,0 e0: 3f 83 41 02 rotqbyi $2,$2,13 e4: 7e 00 01 02 ceqbi $2,$2,0 e8: 56 c0 01 02 xsbh $2,$2 ec: 22 00 04 02 brhz $2,10c <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x10c> # 10c f0: 41 9c 00 02 ilh $2,14336 # 3800 f4: 30 80 00 03 lqa $3,0 f8: 3e 80 00 84 cbd $4,0($1) fc: 20 80 00 02 stqa $2,0 100: 40 80 00 82 il $2,1 104: b0 60 c1 04 shufb $3,$2,$3,$4 108: 20 80 00 03 stqa $3,0 10c: 30 80 00 03 lqa $3,0 110: 30 80 00 02 lqa $2,0 114: 30 80 00 04 lqa $4,0 118: 30 80 00 06 lqa $6,0 11c: 35 80 00 12 hbr 164 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x164>,$0 120: b0 61 43 83 shufb $3,$7,$5,$3 124: b0 41 43 82 shufb $2,$7,$5,$2 128: 58 21 01 87 andc $7,$3,$4 12c: 00 20 00 00 lnop 130: 18 21 01 83 and $3,$3,$4 134: 30 80 00 04 lqa $4,0 138: 0f be c1 02 rothmi $2,$2,-5 13c: 09 01 c3 05 sfh $5,$6,$7 140: 49 01 c3 06 cgth $6,$6,$7 144: 0f e0 c2 85 shlhi $5,$5,3 148: 18 20 82 04 and $4,$4,$2 14c: 30 80 00 02 lqa $2,0 150: 08 21 01 83 or $3,$3,$4 154: 18 21 41 02 and $2,$2,$5 158: 08 20 81 83 or $3,$3,$2 15c: 58 21 81 83 andc $3,$3,$6 160: 40 20 00 7f nop $127 164: 35 00 00 00 bi $0 168: 41 81 ff 82 ilh $2,1023 # 3ff 16c: 10 00 00 09 hbra 190 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x190>,0 170: 30 80 00 03 lqa $3,0 174: 3e 80 00 84 cbd $4,0($1) 178: 20 80 00 02 stqa $2,0 17c: 40 80 00 82 il $2,1 180: b0 60 c1 04 shufb $3,$2,$3,$4 184: 40 20 00 7f nop $127 188: 40 20 00 7f nop $127 18c: 20 80 00 03 stqa $3,0 190: 32 7f dd 80 br 7c <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x7c> # 7c 194: 00 20 00 00 lnop Yup, lots 
of lovely badness, including a handful of lovely branches in our branchless code. My next step was to simply move the statics out of the function. That’s the only change made to get this disassembly: 00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>: 0: 04 00 01 85 ori $5,$3,0 4: 30 80 00 03 lqa $3,0 8: 30 80 00 02 lqa $2,0 c: 30 80 00 09 lqa $9,0 10: 30 80 00 07 lqa $7,0 14: 35 80 00 10 hbr 54 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x54>,$0 18: b0 61 02 83 shufb $3,$5,$4,$3 1c: b0 a1 02 82 shufb $5,$5,$4,$2 20: 41 81 ff 84 ilh $4,1023 # 3ff 24: 30 80 00 02 lqa $2,0 28: 58 22 41 88 andc $8,$3,$9 2c: 0f be c2 85 rothmi $5,$5,-5 30: 09 02 03 86 sfh $6,$7,$8 34: 18 22 41 83 and $3,$3,$9 38: 0f e0 c3 06 shlhi $6,$6,3 3c: 18 21 42 04 and $4,$4,$5 40: 49 02 03 87 cgth $7,$7,$8 44: 08 21 01 83 or $3,$3,$4 48: 18 21 81 02 and $2,$2,$6 4c: 08 20 81 83 or $3,$3,$2 50: 58 21 c1 83 andc $3,$3,$7 54: 35 00 00 00 bi $0 Interestingly, not only did all the badness go away, but the function is also now almost half the size (and the branch that was originally hanging out in the middle of the function is gone). The last change I tried before calling it a day on this was to put the value declarations back into the function, but as const, rather than static, like so: const qword shuf_mid16 = (qword)(vu8){ 0x01, 0x02, 0x05, 0x06, 0x09, 0x0a, 0x0d, 0x0e, 0x11, 0x12, 0x15, 0x16, 0x19, 0x1a, 0x1d, 0x1e }; const qword shuf_hi16 = (qword)(vu8){ 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0x10, 0x11, 0x14, 0x15, 0x18, 0x19, 0x1c, 0x1d }; const qword s_mask = (qword)(vu32){ 0x80008000, 0x80008000, 0x80008000, 0x80008000 }; const qword e_mask = (qword)(vu32){ 0x7c007c00, 0x7c007c00, 0x7c007c00, 0x7c007c00 }; //@@@ calc this in code! const qword exp_off = (qword)(vu32){ 0x38003800, 0x38003800, 0x38003800, 0x38003800 }; //@@@ calc this in code! 
const qword m_mask = si_ilh(0x03ff); //{e4} This change ended up yielding the best results of all the setups: 00000000 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_>: 0: 04 00 01 85 ori $5,$3,0 4: 30 80 00 03 lqa $3,0 8: 41 c0 00 09 ilh $9,32768 # 8000 c: 30 80 00 02 lqa $2,0 10: 41 9c 00 07 ilh $7,14336 # 3800 14: 35 80 00 10 hbr 54 <_ZN3FXV16SimdFloatToHalf2EU8__vectorfS0_+0x54>,$0 18: b0 61 02 83 shufb $3,$5,$4,$3 1c: b0 a1 02 82 shufb $5,$5,$4,$2 20: 41 81 ff 84 ilh $4,1023 # 3ff 24: 41 be 00 02 ilh $2,31744 # 7c00 28: 58 22 41 88 andc $8,$3,$9 2c: 0f be c2 85 rothmi $5,$5,-5 30: 09 02 03 86 sfh $6,$7,$8 34: 18 22 41 83 and $3,$3,$9 38: 0f e0 c3 06 shlhi $6,$6,3 3c: 18 21 42 04 and $4,$4,$5 40: 49 02 03 87 cgth $7,$7,$8 44: 08 21 01 83 or $3,$3,$4 48: 18 21 81 02 and $2,$2,$6 4c: 08 20 81 83 or $3,$3,$2 50: 58 21 c1 83 andc $3,$3,$7 54: 35 00 00 00 bi $0 Notice that it’s the same number of instructions as when the statics were moved out of the function, but a few of the lqa instructions were automatically changed to ilh instructions to better balance between even/odd. So, the moral of the story is, be wary of using statics within a function. Use const instead. Also, for values that you know can be easily constructed by a single instruction such as ilh, just leave the value in there so the compiler can load it or construct it as needed to better balance your function.