1 #include <m4ri/m4ri_config.h> 
   12 static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(
word *m, 
word const *t[N], 
wi_t wide) {
 
   13   assert(1 <= N && N <= 8);
 
   32     case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; 
break;
 
   33     case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; 
break;
 
   34     case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; 
break;
 
   35     case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; 
break;
 
   36     case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; 
break;
 
   37     case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; 
break;
 
   38     case 2: *m++ ^= *t[0]++ ^ *t[1]++; 
break;
 
   39     case 1: *m++ ^= *t[0]++; 
break;
 
   44   __m128i *m__ = (__m128i*)m;
 
   48   case 8: t__[N-8] = (__m128i*)t[N-8];
 
   49   case 7: t__[N-7] = (__m128i*)t[N-7];
 
   50   case 6: t__[N-6] = (__m128i*)t[N-6];
 
   51   case 5: t__[N-5] = (__m128i*)t[N-5];
 
   52   case 4: t__[N-4] = (__m128i*)t[N-4];
 
   53   case 3: t__[N-3] = (__m128i*)t[N-3];
 
   54   case 2: t__[N-2] = (__m128i*)t[N-2];
 
   55   case 1: t__[N-1] = (__m128i*)t[N-1];
 
   58   __m128i xmm0, xmm1, xmm2, xmm3;
 
   61   for(; i+4<= (wide>>1); i+=4) {
 
   62     xmm0 = m__[0];  xmm1 = m__[1];  xmm2 = m__[2];  xmm3 = m__[3];
 
   64     case 8:  xmm0 = _mm_xor_si128(xmm0, t__[7][0]);  xmm1 = _mm_xor_si128(xmm1, t__[7][1]);  xmm2 = _mm_xor_si128(xmm2, t__[7][2]);  xmm3 = _mm_xor_si128(xmm3, t__[7][3]); t__[7]+=4;
 
   65     case 7:  xmm0 = _mm_xor_si128(xmm0, t__[6][0]);  xmm1 = _mm_xor_si128(xmm1, t__[6][1]);  xmm2 = _mm_xor_si128(xmm2, t__[6][2]);  xmm3 = _mm_xor_si128(xmm3, t__[6][3]); t__[6]+=4;
 
   66     case 6:  xmm0 = _mm_xor_si128(xmm0, t__[5][0]);  xmm1 = _mm_xor_si128(xmm1, t__[5][1]);  xmm2 = _mm_xor_si128(xmm2, t__[5][2]);  xmm3 = _mm_xor_si128(xmm3, t__[5][3]); t__[5]+=4;
 
   67     case 5:  xmm0 = _mm_xor_si128(xmm0, t__[4][0]);  xmm1 = _mm_xor_si128(xmm1, t__[4][1]);  xmm2 = _mm_xor_si128(xmm2, t__[4][2]);  xmm3 = _mm_xor_si128(xmm3, t__[4][3]); t__[4]+=4;
 
   68     case 4:  xmm0 = _mm_xor_si128(xmm0, t__[3][0]);  xmm1 = _mm_xor_si128(xmm1, t__[3][1]);  xmm2 = _mm_xor_si128(xmm2, t__[3][2]);  xmm3 = _mm_xor_si128(xmm3, t__[3][3]); t__[3]+=4;
 
   69     case 3:  xmm0 = _mm_xor_si128(xmm0, t__[2][0]);  xmm1 = _mm_xor_si128(xmm1, t__[2][1]);  xmm2 = _mm_xor_si128(xmm2, t__[2][2]);  xmm3 = _mm_xor_si128(xmm3, t__[2][3]); t__[2]+=4;
 
   70     case 2:  xmm0 = _mm_xor_si128(xmm0, t__[1][0]);  xmm1 = _mm_xor_si128(xmm1, t__[1][1]);  xmm2 = _mm_xor_si128(xmm2, t__[1][2]);  xmm3 = _mm_xor_si128(xmm3, t__[1][3]); t__[1]+=4;
 
   71     case 1:  xmm0 = _mm_xor_si128(xmm0, t__[0][0]);  xmm1 = _mm_xor_si128(xmm1, t__[0][1]);  xmm2 = _mm_xor_si128(xmm2, t__[0][2]);  xmm3 = _mm_xor_si128(xmm3, t__[0][3]); t__[0]+=4;
 
   73     m__[0] = xmm0;  m__[1] = xmm1; m__[2] = xmm2;  m__[3] = xmm3;
 
   77   for(; i< (wide>>1); i++) {
 
   80       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
 
   81       xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++); xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
 
   82       xmm0 = _mm_xor_si128(xmm0, xmm1);  xmm2 = _mm_xor_si128(xmm2, xmm3);
 
   83       xmm0 = _mm_xor_si128(xmm0, xmm2);  xmm0 = _mm_xor_si128(*m__, xmm0);
 
   86       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
 
   87       xmm0 = _mm_xor_si128(xmm0, *t__[4]++);      xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
 
   88       xmm0 = _mm_xor_si128(xmm0, *t__[6]++);      xmm0 = _mm_xor_si128(xmm0, xmm1);
 
   89       xmm0 = _mm_xor_si128(*m__, xmm0);
 
   92       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
 
   93       xmm0 = _mm_xor_si128(xmm0, *t__[4]++);      xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
 
   94       xmm0 = _mm_xor_si128(xmm0, xmm1);           xmm0 = _mm_xor_si128(*m__, xmm0);
 
   97       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
 
   98       xmm0 = _mm_xor_si128(xmm0, *t__[4]++);      xmm0 = _mm_xor_si128(xmm0, xmm1);
 
   99       xmm0 = _mm_xor_si128(*m__, xmm0);
 
  102       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
 
  103       xmm0 = _mm_xor_si128(xmm0, xmm1);           xmm0 = _mm_xor_si128(*m__, xmm0);
 
  106       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*m__, *t__[2]++);
 
  107       xmm0 = _mm_xor_si128(xmm0, xmm1);
 
  110       xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm0 = _mm_xor_si128(*m__, xmm0);
 
  113       xmm0 = _mm_xor_si128(*m__, *t__[0]++); 
 
  122     case 8: t[N-8] = (
word*)t__[N-8];
 
  123     case 7: t[N-7] = (
word*)t__[N-7];
 
  124     case 6: t[N-6] = (
word*)t__[N-6];
 
  125     case 5: t[N-5] = (
word*)t__[N-5];
 
  126     case 4: t[N-4] = (
word*)t__[N-4];
 
  127     case 3: t[N-3] = (
word*)t__[N-3];
 
  128     case 2: t[N-2] = (
word*)t__[N-2];
 
  129     case 1: t[N-1] = (
word*)t__[N-1];
 
  133     case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; 
break;
 
  134     case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; 
break;
 
  135     case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; 
break;
 
  136     case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; 
break;
 
  137     case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; 
break;
 
  138     case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; 
break;
 
  139     case 2: *m++ ^= *t[0]++ ^ *t[1]++; 
break;
 
  140     case 1: *m++ ^= *t[0]++; 
break;
 
  146   for(
wi_t i=0; i< wide; i++) {
 
  148     case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; 
break;
 
  149     case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; 
break;
 
  150     case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; 
break;
 
  151     case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; 
break;
 
  152     case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; 
break;
 
  153     case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; 
break;
 
  154     case 2: *m++ ^= *t[0]++ ^ *t[1]++; 
break;
 
  155     case 1: *m++ ^= *t[0]++; 
break;
 
  160 #endif // __M4RI_HAVE_SSE2 
#define __M4RI_ALIGNMENT(addr, n)
Return the alignment of addr with respect to n, i.e. addr modulo n. For example, the address 17 has alignment 1 with respect to 16. 
Definition: misc.h:421
 
#define __M4RI_UNLIKELY(cond)
Macro to help with branch prediction. 
Definition: misc.h:449
 
uint64_t word
A word is the basic packed data type used to represent a group of bits. 
Definition: misc.h:87
 
int wi_t
Type of word indexes. 
Definition: misc.h:80