大量の計算を常に行うような場合、C言語の速度をさらに上げるには、CPUのSIMD命令に置き換えられる組み込み関数を使う方法がある。 ただし、これもvDSPと同じで量が多く、また見やすい一覧がないので作ってみる。 なお、独断と偏見で、使いそうもないものは一覧に載せていない。
__m128 _mm_load_ss(float * p) r0 := *p r1 := 0 r2 := 0 r3 := 0 __m128 _mm_load1_ps(float * p) r0 := *p r1 := *p r2 := *p r3 := *p __m128 _mm_load_ps(float * p) r0 := p[0] r1 := p[1] r2 := p[2] r3 := p[3] The address must be 16-byte aligned __m128 _mm_loadu_ps(float* p) r0 := p[0] r1 := p[1] r2 := p[2] r3 := p[3] The address does not need to be 16-byte aligned __m128 _mm_loadr_ps(float* p) r0 := p[3] r1 := p[2] r2 := p[1] r3 := p[0] The address must be 16-byte aligned __m128 _mm_set_ss(float w) r0 := w r1 := 0 r2 := 0 r3 := 0 __m128 _mm_set1_ps(float w) r0 := w r1 := w r2 := w r3 := w __m128 _mm_set_ps(float z, float y, float x, float w) r0 := w r1 := x r2 := y r3 := z __m128 _mm_setr_ps(float z, float y, float x, float w) r0 := z r1 := y r2 := x r3 := w __m128 _mm_setzero_ps(void) r0 := 0 r1 := 0 r2 := 0 r3 := 0 void _mm_store_ss(float * p, __m128 a) *p := a0 void _mm_store1_ps(float * p, __m128 a) p[0] := a0 p[1] := a0 p[2] := a0 p[3] := a0 void _mm_store_ps(float *p, __m128 a) p[0] := a0 p[1] := a1 p[2] := a2 p[3] := a3 The address must be 16-byte aligned void _mm_storeu_ps(float *p, __m128 a) p[0] := a0 p[1] := a1 p[2] := a2 p[3] := a3 The address does not need to be 16-byte aligned void _mm_storer_ps(float * p, __m128 a) p[0] := a3 p[1] := a2 p[2] := a1 p[3] := a0 The address must be 16-byte aligned __m128 _mm_move_ss(__m128 a, __m128 b) r0 := b0 r1 := a1 r2 := a2 r3 := a3 __m128 _mm_add_ss(__m128 a, __m128 b) r0 := a0 + b0 r1 := a1 r2 := a2 r3 := a3 __m128 _mm_add_ps(__m128 a, __m128 b) r0 := a0 + b0 r1 := a1 + b1 r2 := a2 + b2 r3 := a3 + b3 __m128 _mm_sub_ss(__m128 a, __m128 b) __m128 _mm_sub_ps(__m128 a, __m128 b) __m128 _mm_mul_ss(__m128 a, __m128 b) __m128 _mm_mul_ps(__m128 a, __m128 b) __m128 _mm_div_ss(__m128 a, __m128 b) __m128 _mm_div_ps(__m128 a, __m128 b) __m128 _mm_sqrt_ss(__m128 a) ルート __m128 _mm_sqrt_ps(__m128 a) ルート __m128 _mm_rcp_ss(__m128 a) 逆数 __m128 _mm_rcp_ps(__m128 a) 逆数 __m128 _mm_rsqrt_ss(__m128 a) ルートを取ってから逆数
__m128 _mm_rsqrt_ps(__m128 a) ルートを取ってから逆数 __m128 _mm_min_ss(__m128 a, __m128 b) __m128 _mm_min_ps(__m128 a, __m128 b) __m128 _mm_max_ss(__m128 a, __m128 b) __m128 _mm_max_ps(__m128 a, __m128 b) __m128 _mm_shuffle_ps(__m128 a, __m128 b, int i) _MM_SHUFFLE(z, y, x, w) (z<<6) | (y<<4) | (x<<2) | w 例 a[0] = 0; a[1] = 1; a[2] = 2; a[3] = 3; b[0] = 4; b[1] = 5; b[2] = 6; b[3] = 7; __m128 m1 = _mm_loadu_ps(a); __m128 m2 = _mm_loadu_ps(b); __m128 m3 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(3, 2, 1, 0)); _mm_storeu_ps(c, m3); printf("c[0] %f c[1] %f c[2] %f c[3] %f", c[0], c[1], c[2], c[3]); c[0] 0.0 c[1] 1.0 c[2] 6.0 c[3] 7.0 _MM_SHUFFLEの第3,4引数がm1から選択するもの、第1,2引数がm2から選択するもの __m128 m3 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(1, 1, 1, 1)); c[0] 1.0 c[1] 1.0 c[2] 5.0 c[3] 5.0 __m128 _mm_unpackhi_ps(__m128 a, __m128 b) r0 := a2 r1 := b2 r2 := a3 r3 := b3 __m128 _mm_unpacklo_ps(__m128 a, __m128 b) r0 := a0 r1 := b0 r2 := a1 r3 := b1 __m128 _mm_movehl_ps(__m128 a, __m128 b) r3 := a3 r2 := a2 r1 := b3 r0 := b2 __m128 _mm_movelh_ps(__m128 a, __m128 b) r3 := b1 r2 := b0 r1 := a1 r0 := a0 int _mm_movemask_ps(__m128 a) r := sign(a3)<<3 | sign(a2)<<2 | sign(a1)<<1 | sign(a0) __m128 _mm_and_ps(__m128 a , __m128 b ) r0 := a0 & b0 r1 := a1 & b1 r2 := a2 & b2 r3 := a3 & b3 __m128 _mm_andnot_ps(__m128 a , __m128 b ) r0 := ~a0 & b0 r1 := ~a1 & b1 r2 := ~a2 & b2 r3 := ~a3 & b3 __m128 _mm_or_ps(__m128 a , __m128 b ) r0 := a0 | b0 r1 := a1 | b1 r2 := a2 | b2 r3 := a3 | b3 __m128 _mm_xor_ps(__m128 a , __m128 b ) r0 := a0 ^ b0 r1 := a1 ^ b1 r2 := a2 ^ b2 r3 := a3 ^ b3 __m128 _mm_blend_ps( __m128 a, __m128 b, const int mask ) r0 := (mask0 == 0) ? a0 : b0 r1 := (mask1 == 0) ? a1 : b1 r2 := (mask2 == 0) ? a2 : b2 r3 := (mask3 == 0) ? a3 : b3 __m128 _mm_blendv_ps( __m128 a, __m128 b, __m128 mask ) r0 := (mask0 & 0x80000000) ? b0 : a0 r1 := (mask1 & 0x80000000) ? b1 : a1 r2 := (mask2 & 0x80000000) ? b2 : a2 r3 := (mask3 & 0x80000000) ?
b3 : a3 int _mm_extract_ps( __m128 a, const int ndx ) r := (ndx == 0) ? a0 : ((ndx == 1) ? a1 : ((ndx == 2) ? a2 : a3)) __m128 _mm_hadd_ps( __m128 a, __m128 b ) The result of the operation on operand a (A3, A2, A1, A0) and operand b (B3, B2, B1, B0) is (B3 + B2, B1 + B0, A3 + A2, A1 + A0). __m128 _mm_hsub_ps( __m128 a, __m128 b ) void _mm_prefetch(char * p , int i ) Loads one cache line of data from address p to a location closer to the processor. The value i specifies the type of prefetch operation: the constants _MM_HINT_T0, _MM_HINT_T1, _MM_HINT_T2, and _MM_HINT_NTA, corresponding to the type of prefetch instruction, should be used void _mm_stream_ps(float * p , __m128 a ) Stores the data in a to the address p without polluting the caches. The address must be 16-byte aligned. void _mm_sfence(void) Guarantees that every preceding store is globally visible before any subsequent store. SSE2 : SSE の doubleバージョン __m128 : float(32bit) * 4 __m128d : double(64bit) * 2 __m128i : 整数 shortなら * 8 AVX : 2011 : Sandy Bridge 2012 : Ivy Bridge 2013 : Haswell 2014 : Broadwell 2015 : Skylake 2016 : Kaby Lake 2017 : Cannonlake FMA(Fused Multiply Add) Haswell : 2013 AVX-512 : 2013 SSE Pentium 3 以降 SSE2 Pentium 4 以降 SSE4.1 Core 2 のPenryn 以降 SSE4.2 Nehalem 以降 Intel Core 2 processor family for Laptop Merom dual (65 nm) July 2006 Penryn dual (45 nm) January 2008