Here’s Mike’s version of the function: qword SimdFloatToHalf( vf32 a_, vf32 b_ ) { const qword shuf_hi16 = (qword)(vu16){ 0x0001, 0x0405, 0x0809, 0x0c0d, 0x1011, 0x1415, 0x1819, 0x1c1d }; const qword s_mask = (qword)(vu16){ 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000 }; const qword blah = (qword)(vu16){ 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000 }; const qword exp_off = (qword)(vu16){ 0x3800, 0x3800, 0x3800, 0x3800, 0x3800, 0x3800, 0x3800, 0x3800 }; qword a_rot5 = si_shlqbii((qword)a_, 3); qword b_rot5 = si_shlqbii((qword)b_, 3); qword mid = si_shufb(a_rot5, b_rot5, shuf_hi16); // mid = | e5 e4 e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 m16 m15 m14 | qword hi = si_shufb((qword)a_, (qword)b_, shuf_hi16); // hi = | s e7 e6 e5 e4 e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 | qword h = si_xor(mid, blah); // h = | e5 e4' e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 m16 m15 m14 | qword e = si_andc(hi, s_mask); // e = | 0 e7 e6 e5 e4 e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 | h = si_selb(h, hi, s_mask); // h = | s e4' e3 e2 e1 e0 m23 m22 m21 m20 m19 m18 m17 m16 m15 m14 | qword d_mask = si_cgth(exp_off, e); h = si_andc(h, d_mask); // set to 0 if underflow return h; } And here’s what it compiles into: 00000000 <_ZN3FXV15SimdFloatToHalfEU8__vectorfS0_>: 0: 04 00 01 86 ori $6,$3,0 4: 35 80 00 0f hbr 40 <_ZN3FXV15SimdFloatToHalfEU8__vectorfS0_+0x40>,$0 8: 41 c0 00 08 ilh $8,32768 # 8000 c: 30 80 00 07 lqa $7,0 10: 3f 60 c2 02 shlqbii $2,$4,3 14: 3f 60 c1 83 shlqbii $3,$3,3 18: 40 20 00 7f nop $127 1c: b0 c1 03 07 shufb $6,$6,$4,$7 20: 41 9c 00 04 ilh $4,14336 # 3800 24: b0 60 81 87 shufb $3,$3,$2,$7 28: 41 a0 00 02 ilh $2,16384 # 4000 2c: 58 22 03 05 andc $5,$6,$8 30: 48 20 81 83 xor $3,$3,$2 34: 49 01 42 04 cgth $4,$4,$5 38: 80 61 81 88 selb $3,$3,$6,$8 3c: 58 21 01 83 andc $3,$3,$4 40: 35 00 00 00 bi $0 44: 00 20 00 00 lnop