#include <m4ri/m4ri_config.h>
#include <m4ri/misc.h>   /* word (packed 64-bit word), wi_t (word index type), alignment macros */
#include <assert.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>   /* __m128i, _mm_xor_si128 */
#endif
/*
 * Compute m[i] ^= t[0][i] ^ t[1][i] ^ ... ^ t[N-1][i] for 0 <= i < wide,
 * i.e. XOR the N source rows t[0..N-1] into the destination row m,
 * wide machine words at a time.  The file is meant to be included with
 * the macro N set to the number of source rows (1 <= N <= 8).
 */
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);
#if __M4RI_HAVE_SSE2

  assert((__M4RI_ALIGNMENT(m, 16) == 8) || (__M4RI_ALIGNMENT(m, 16) == 0));

  /* If m is only 8-byte aligned, XOR one leading word with plain word
     operations so that the 16-byte SSE2 loop below works on aligned data. */
  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m, 16) == 8)) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
    wide--;
  }
  __m128i *m__ = (__m128i *)m;
  __m128i *t__[N];

  /* Deliberate fall-through: initialise t__[0] .. t__[N-1] for the given N. */
  switch (N) {
  case 8: t__[N - 8] = (__m128i *)t[N - 8];
  case 7: t__[N - 7] = (__m128i *)t[N - 7];
  case 6: t__[N - 6] = (__m128i *)t[N - 6];
  case 5: t__[N - 5] = (__m128i *)t[N - 5];
  case 4: t__[N - 4] = (__m128i *)t[N - 4];
  case 3: t__[N - 3] = (__m128i *)t[N - 3];
  case 2: t__[N - 2] = (__m128i *)t[N - 2];
  case 1: t__[N - 1] = (__m128i *)t[N - 1];
  }
  __m128i xmm0, xmm1, xmm2, xmm3;

  wi_t i = 0;
  /* Main loop: combine 4 x 128 bits (eight words) per iteration. */
  for (; i + 4 <= (wide >> 1); i += 4) {
    xmm0 = m__[0]; xmm1 = m__[1]; xmm2 = m__[2]; xmm3 = m__[3];
    /* Deliberate fall-through: XOR in t__[N-1] down to t__[0]. */
    switch (N) {
    case 8: xmm0 = _mm_xor_si128(xmm0, t__[7][0]); xmm1 = _mm_xor_si128(xmm1, t__[7][1]); xmm2 = _mm_xor_si128(xmm2, t__[7][2]); xmm3 = _mm_xor_si128(xmm3, t__[7][3]); t__[7] += 4;
    case 7: xmm0 = _mm_xor_si128(xmm0, t__[6][0]); xmm1 = _mm_xor_si128(xmm1, t__[6][1]); xmm2 = _mm_xor_si128(xmm2, t__[6][2]); xmm3 = _mm_xor_si128(xmm3, t__[6][3]); t__[6] += 4;
    case 6: xmm0 = _mm_xor_si128(xmm0, t__[5][0]); xmm1 = _mm_xor_si128(xmm1, t__[5][1]); xmm2 = _mm_xor_si128(xmm2, t__[5][2]); xmm3 = _mm_xor_si128(xmm3, t__[5][3]); t__[5] += 4;
    case 5: xmm0 = _mm_xor_si128(xmm0, t__[4][0]); xmm1 = _mm_xor_si128(xmm1, t__[4][1]); xmm2 = _mm_xor_si128(xmm2, t__[4][2]); xmm3 = _mm_xor_si128(xmm3, t__[4][3]); t__[4] += 4;
    case 4: xmm0 = _mm_xor_si128(xmm0, t__[3][0]); xmm1 = _mm_xor_si128(xmm1, t__[3][1]); xmm2 = _mm_xor_si128(xmm2, t__[3][2]); xmm3 = _mm_xor_si128(xmm3, t__[3][3]); t__[3] += 4;
    case 3: xmm0 = _mm_xor_si128(xmm0, t__[2][0]); xmm1 = _mm_xor_si128(xmm1, t__[2][1]); xmm2 = _mm_xor_si128(xmm2, t__[2][2]); xmm3 = _mm_xor_si128(xmm3, t__[2][3]); t__[2] += 4;
    case 2: xmm0 = _mm_xor_si128(xmm0, t__[1][0]); xmm1 = _mm_xor_si128(xmm1, t__[1][1]); xmm2 = _mm_xor_si128(xmm2, t__[1][2]); xmm3 = _mm_xor_si128(xmm3, t__[1][3]); t__[1] += 4;
    case 1: xmm0 = _mm_xor_si128(xmm0, t__[0][0]); xmm1 = _mm_xor_si128(xmm1, t__[0][1]); xmm2 = _mm_xor_si128(xmm2, t__[0][2]); xmm3 = _mm_xor_si128(xmm3, t__[0][3]); t__[0] += 4;
    }
    m__[0] = xmm0; m__[1] = xmm1; m__[2] = xmm2; m__[3] = xmm3;
    m__ += 4;
  }
  /* Remaining full 128-bit words, one at a time. */
  for (; i < (wide >> 1); i++) {
    switch (N) {
    case 8:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++); xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm2 = _mm_xor_si128(xmm2, xmm3);
      xmm0 = _mm_xor_si128(xmm0, xmm2); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 7:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[6]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 6:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 5:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 4:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 3:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*m__, *t__[2]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); break;
    case 2:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 1:
      xmm0 = _mm_xor_si128(*m__, *t__[0]++); break;
    }
    *m__++ = xmm0;
  }
  /* If wide is odd, one word remains; handle it with plain word operations. */
  if (wide & 0x1) {
    m = (word *)m__;
    /* Deliberate fall-through: convert the SSE2 pointers back to word pointers. */
    switch (N) {
    case 8: t[N - 8] = (word *)t__[N - 8];
    case 7: t[N - 7] = (word *)t__[N - 7];
    case 6: t[N - 6] = (word *)t__[N - 6];
    case 5: t[N - 5] = (word *)t__[N - 5];
    case 4: t[N - 4] = (word *)t__[N - 4];
    case 3: t[N - 3] = (word *)t__[N - 3];
    case 2: t[N - 2] = (word *)t__[N - 2];
    case 1: t[N - 1] = (word *)t__[N - 1];
    }
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
  }
#else // __M4RI_HAVE_SSE2: plain word-by-word fallback

  for (wi_t i = 0; i < wide; i++) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
  }
#endif // __M4RI_HAVE_SSE2
}
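/*
 * Usage sketch (an illustration, not the library's exact wiring): this
 * template header is designed to be included once per value of N, with N
 * defined beforehand, so that one specialised combine function is emitted
 * per table count.  The real __M4RI_TEMPLATE_NAME macro lives in m4ri's own
 * headers; the two-step token-pasting stand-in below and the include name
 * "this_template.h" are assumptions for illustration only.
 */
#if 0  /* illustration only -- not part of the template itself */
#define __M4RI_TEMPLATE_PASTE(fn, n)  fn##_##n
#define __M4RI_TEMPLATE_EXPAND(fn, n) __M4RI_TEMPLATE_PASTE(fn, n)
#define __M4RI_TEMPLATE_NAME(fn)      __M4RI_TEMPLATE_EXPAND(fn, N)

#define N 4
#include "this_template.h"  /* emits _mzd_combine_4(word *m, word const *t[4], wi_t wide) */
#undef N

/* A caller would then XOR four source rows, each wide words long, into m: */
static void combine_four(word *m, word const *t[4], wi_t wide) {
  _mzd_combine_4(m, t, wide);
}
#endif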