M4RI  20140914
1 #include <m4ri/m4ri_config.h>
2 #include <m4ri/misc.h>
12 static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
13  assert(1 <= N && N <= 8);
15 #if __M4RI_HAVE_SSE2
17  assert( (__M4RI_ALIGNMENT(m,16) == 8) | (__M4RI_ALIGNMENT(m,16) == 0) );
19  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
20  case 8: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[7],16));
21  case 7: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[6],16));
22  case 6: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[5],16));
23  case 5: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[4],16));
24  case 4: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[3],16));
25  case 3: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[2],16));
26  case 2: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[1],16));
27  case 1: assert(__M4RI_ALIGNMENT(m,16) == __M4RI_ALIGNMENT(t[0],16));
28  };
30  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m,16) == 8)) {
31  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
32  case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
33  case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
34  case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
35  case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
36  case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
37  case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
38  case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
39  case 1: *m++ ^= *t[0]++; break;
40  };
41  wide--;
42  }
44  __m128i *m__ = (__m128i*)m;
45  __m128i *t__[N];
47  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
48  case 8: t__[N-8] = (__m128i*)t[N-8];
49  case 7: t__[N-7] = (__m128i*)t[N-7];
50  case 6: t__[N-6] = (__m128i*)t[N-6];
51  case 5: t__[N-5] = (__m128i*)t[N-5];
52  case 4: t__[N-4] = (__m128i*)t[N-4];
53  case 3: t__[N-3] = (__m128i*)t[N-3];
54  case 2: t__[N-2] = (__m128i*)t[N-2];
55  case 1: t__[N-1] = (__m128i*)t[N-1];
56  };
58  __m128i xmm0, xmm1, xmm2, xmm3;
60  wi_t i=0;
61  for(; i+4<= (wide>>1); i+=4) {
62  xmm0 = m__[0]; xmm1 = m__[1]; xmm2 = m__[2]; xmm3 = m__[3];
63  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
64  case 8: xmm0 = _mm_xor_si128(xmm0, t__[7][0]); xmm1 = _mm_xor_si128(xmm1, t__[7][1]); xmm2 = _mm_xor_si128(xmm2, t__[7][2]); xmm3 = _mm_xor_si128(xmm3, t__[7][3]); t__[7]+=4;
65  case 7: xmm0 = _mm_xor_si128(xmm0, t__[6][0]); xmm1 = _mm_xor_si128(xmm1, t__[6][1]); xmm2 = _mm_xor_si128(xmm2, t__[6][2]); xmm3 = _mm_xor_si128(xmm3, t__[6][3]); t__[6]+=4;
66  case 6: xmm0 = _mm_xor_si128(xmm0, t__[5][0]); xmm1 = _mm_xor_si128(xmm1, t__[5][1]); xmm2 = _mm_xor_si128(xmm2, t__[5][2]); xmm3 = _mm_xor_si128(xmm3, t__[5][3]); t__[5]+=4;
67  case 5: xmm0 = _mm_xor_si128(xmm0, t__[4][0]); xmm1 = _mm_xor_si128(xmm1, t__[4][1]); xmm2 = _mm_xor_si128(xmm2, t__[4][2]); xmm3 = _mm_xor_si128(xmm3, t__[4][3]); t__[4]+=4;
68  case 4: xmm0 = _mm_xor_si128(xmm0, t__[3][0]); xmm1 = _mm_xor_si128(xmm1, t__[3][1]); xmm2 = _mm_xor_si128(xmm2, t__[3][2]); xmm3 = _mm_xor_si128(xmm3, t__[3][3]); t__[3]+=4;
69  case 3: xmm0 = _mm_xor_si128(xmm0, t__[2][0]); xmm1 = _mm_xor_si128(xmm1, t__[2][1]); xmm2 = _mm_xor_si128(xmm2, t__[2][2]); xmm3 = _mm_xor_si128(xmm3, t__[2][3]); t__[2]+=4;
70  case 2: xmm0 = _mm_xor_si128(xmm0, t__[1][0]); xmm1 = _mm_xor_si128(xmm1, t__[1][1]); xmm2 = _mm_xor_si128(xmm2, t__[1][2]); xmm3 = _mm_xor_si128(xmm3, t__[1][3]); t__[1]+=4;
71  case 1: xmm0 = _mm_xor_si128(xmm0, t__[0][0]); xmm1 = _mm_xor_si128(xmm1, t__[0][1]); xmm2 = _mm_xor_si128(xmm2, t__[0][2]); xmm3 = _mm_xor_si128(xmm3, t__[0][3]); t__[0]+=4;
72  }
73  m__[0] = xmm0; m__[1] = xmm1; m__[2] = xmm2; m__[3] = xmm3;
74  m__ += 4;
75  }
77  for(; i< (wide>>1); i++) {
78  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
79  case 8:
80  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
81  xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++); xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
82  xmm0 = _mm_xor_si128(xmm0, xmm1); xmm2 = _mm_xor_si128(xmm2, xmm3);
83  xmm0 = _mm_xor_si128(xmm0, xmm2); xmm0 = _mm_xor_si128(*m__, xmm0);
84  break;
85  case 7:
86  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
87  xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
88  xmm0 = _mm_xor_si128(xmm0, *t__[6]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
89  xmm0 = _mm_xor_si128(*m__, xmm0);
90  break;
91  case 6:
92  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
93  xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
94  xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0);
95  break;
96  case 5:
97  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
98  xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
99  xmm0 = _mm_xor_si128(*m__, xmm0);
100  break;
101  case 4:
102  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
103  xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0);
104  break;
105  case 3:
106  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*m__, *t__[2]++);
107  xmm0 = _mm_xor_si128(xmm0, xmm1);
108  break;
109  case 2:
110  xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm0 = _mm_xor_si128(*m__, xmm0);
111  break;
112  case 1:
113  xmm0 = _mm_xor_si128(*m__, *t__[0]++);
114  break;
115  };
116  *m__++ = xmm0;
117  }
119  if(wide & 0x1) {
120  m = (word*)m__;
121  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
122  case 8: t[N-8] = (word*)t__[N-8];
123  case 7: t[N-7] = (word*)t__[N-7];
124  case 6: t[N-6] = (word*)t__[N-6];
125  case 5: t[N-5] = (word*)t__[N-5];
126  case 4: t[N-4] = (word*)t__[N-4];
127  case 3: t[N-3] = (word*)t__[N-3];
128  case 2: t[N-2] = (word*)t__[N-2];
129  case 1: t[N-1] = (word*)t__[N-1];
130  }
132  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
133  case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
134  case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
135  case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
136  case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
137  case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
138  case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
139  case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
140  case 1: *m++ ^= *t[0]++; break;
141  }
142  }
143  return;
144 #else
146  for(wi_t i=0; i< wide; i++) {
147  switch(N) { /* we rely on the compiler to optimise this switch away, it reads nicer than #if */
148  case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
149  case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
150  case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
151  case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
152  case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
153  case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
154  case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
155  case 1: *m++ ^= *t[0]++; break;
156  }
157  }
159  return;
160 #endif // __M4RI_HAVE_SSE2
161 }
#define __M4RI_ALIGNMENT(addr, n)
Return the alignment of addr w.r.t. n, i.e. the offset of addr from the preceding multiple of n. For example, the address 17 has alignment 1 w.r.t. 16.
Definition: misc.h:421
#define __M4RI_UNLIKELY(cond)
Macro to help with branch prediction.
Definition: misc.h:449
uint64_t word
A word is the typical packed data structure to represent packed bits.
Definition: misc.h:87
int wi_t
Type of word indexes.
Definition: misc.h:80