#include <m4ri/m4ri_config.h>
#include <m4ri/misc.h>   /* word (packed 64-bit word), wi_t (word index type), alignment macros */
#include <assert.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>   /* __m128i, _mm_xor_si128 */
#endif
/*
 * Compute m[i] ^= t[0][i] ^ t[1][i] ^ ... ^ t[N-1][i] for 0 <= i < wide,
 * i.e. XOR the N source rows t[0..N-1] into the destination row m,
 * wide machine words at a time.  The file is meant to be included with
 * the macro N set to the number of source rows (1 <= N <= 8).
 */
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);
#if __M4RI_HAVE_SSE2

  assert((__M4RI_ALIGNMENT(m, 16) == 8) || (__M4RI_ALIGNMENT(m, 16) == 0));

  /* If m is only 8-byte aligned, XOR one leading word with plain word
     operations so that the 16-byte SSE2 loop below works on aligned data. */
  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m, 16) == 8)) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
    wide--;
  }
  __m128i *m__ = (__m128i *)m;
  __m128i *t__[N];

  /* Deliberate fall-through: initialise t__[0] .. t__[N-1] for the given N. */
  switch (N) {
  case 8: t__[N - 8] = (__m128i *)t[N - 8];
  case 7: t__[N - 7] = (__m128i *)t[N - 7];
  case 6: t__[N - 6] = (__m128i *)t[N - 6];
  case 5: t__[N - 5] = (__m128i *)t[N - 5];
  case 4: t__[N - 4] = (__m128i *)t[N - 4];
  case 3: t__[N - 3] = (__m128i *)t[N - 3];
  case 2: t__[N - 2] = (__m128i *)t[N - 2];
  case 1: t__[N - 1] = (__m128i *)t[N - 1];
  }
  __m128i xmm0, xmm1, xmm2, xmm3;

  wi_t i = 0;
  /* Main loop: combine 4 x 128 bits (eight words) per iteration. */
  for (; i + 4 <= (wide >> 1); i += 4) {
    xmm0 = m__[0]; xmm1 = m__[1]; xmm2 = m__[2]; xmm3 = m__[3];
    /* Deliberate fall-through: XOR in t__[N-1] down to t__[0]. */
    switch (N) {
    case 8: xmm0 = _mm_xor_si128(xmm0, t__[7][0]); xmm1 = _mm_xor_si128(xmm1, t__[7][1]); xmm2 = _mm_xor_si128(xmm2, t__[7][2]); xmm3 = _mm_xor_si128(xmm3, t__[7][3]); t__[7] += 4;
    case 7: xmm0 = _mm_xor_si128(xmm0, t__[6][0]); xmm1 = _mm_xor_si128(xmm1, t__[6][1]); xmm2 = _mm_xor_si128(xmm2, t__[6][2]); xmm3 = _mm_xor_si128(xmm3, t__[6][3]); t__[6] += 4;
    case 6: xmm0 = _mm_xor_si128(xmm0, t__[5][0]); xmm1 = _mm_xor_si128(xmm1, t__[5][1]); xmm2 = _mm_xor_si128(xmm2, t__[5][2]); xmm3 = _mm_xor_si128(xmm3, t__[5][3]); t__[5] += 4;
    case 5: xmm0 = _mm_xor_si128(xmm0, t__[4][0]); xmm1 = _mm_xor_si128(xmm1, t__[4][1]); xmm2 = _mm_xor_si128(xmm2, t__[4][2]); xmm3 = _mm_xor_si128(xmm3, t__[4][3]); t__[4] += 4;
    case 4: xmm0 = _mm_xor_si128(xmm0, t__[3][0]); xmm1 = _mm_xor_si128(xmm1, t__[3][1]); xmm2 = _mm_xor_si128(xmm2, t__[3][2]); xmm3 = _mm_xor_si128(xmm3, t__[3][3]); t__[3] += 4;
    case 3: xmm0 = _mm_xor_si128(xmm0, t__[2][0]); xmm1 = _mm_xor_si128(xmm1, t__[2][1]); xmm2 = _mm_xor_si128(xmm2, t__[2][2]); xmm3 = _mm_xor_si128(xmm3, t__[2][3]); t__[2] += 4;
    case 2: xmm0 = _mm_xor_si128(xmm0, t__[1][0]); xmm1 = _mm_xor_si128(xmm1, t__[1][1]); xmm2 = _mm_xor_si128(xmm2, t__[1][2]); xmm3 = _mm_xor_si128(xmm3, t__[1][3]); t__[1] += 4;
    case 1: xmm0 = _mm_xor_si128(xmm0, t__[0][0]); xmm1 = _mm_xor_si128(xmm1, t__[0][1]); xmm2 = _mm_xor_si128(xmm2, t__[0][2]); xmm3 = _mm_xor_si128(xmm3, t__[0][3]); t__[0] += 4;
    }
    m__[0] = xmm0; m__[1] = xmm1; m__[2] = xmm2; m__[3] = xmm3;
    m__ += 4;
  }
  /* Remaining full 128-bit words, one at a time. */
  for (; i < (wide >> 1); i++) {
    switch (N) {
    case 8:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++); xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm2 = _mm_xor_si128(xmm2, xmm3);
      xmm0 = _mm_xor_si128(xmm0, xmm2); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 7:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[6]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 6:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 5:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 4:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 3:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*m__, *t__[2]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); break;
    case 2:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm0 = _mm_xor_si128(*m__, xmm0); break;
    case 1:
      xmm0 = _mm_xor_si128(*m__, *t__[0]++); break;
    }
    *m__++ = xmm0;
  }
  /* If wide is odd, one word remains; handle it with plain word operations. */
  if (wide & 0x1) {
    m = (word *)m__;
    /* Deliberate fall-through: convert the SSE2 pointers back to word pointers. */
    switch (N) {
    case 8: t[N - 8] = (word *)t__[N - 8];
    case 7: t[N - 7] = (word *)t__[N - 7];
    case 6: t[N - 6] = (word *)t__[N - 6];
    case 5: t[N - 5] = (word *)t__[N - 5];
    case 4: t[N - 4] = (word *)t__[N - 4];
    case 3: t[N - 3] = (word *)t__[N - 3];
    case 2: t[N - 2] = (word *)t__[N - 2];
    case 1: t[N - 1] = (word *)t__[N - 1];
    }
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
  }
#else // __M4RI_HAVE_SSE2: plain word-by-word fallback

  for (wi_t i = 0; i < wide; i++) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    }
  }
#endif // __M4RI_HAVE_SSE2
}
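/*
 * Usage sketch (an illustration, not the library's exact wiring): this
 * template header is designed to be included once per value of N, with N
 * defined beforehand, so that one specialised combine function is emitted
 * per table count.  The real __M4RI_TEMPLATE_NAME macro lives in m4ri's own
 * headers; the two-step token-pasting stand-in below and the include name
 * "this_template.h" are assumptions for illustration only.
 */
#if 0  /* illustration only -- not part of the template itself */
#define __M4RI_TEMPLATE_PASTE(fn, n)  fn##_##n
#define __M4RI_TEMPLATE_EXPAND(fn, n) __M4RI_TEMPLATE_PASTE(fn, n)
#define __M4RI_TEMPLATE_NAME(fn)      __M4RI_TEMPLATE_EXPAND(fn, N)

#define N 4
#include "this_template.h"  /* emits _mzd_combine_4(word *m, word const *t[4], wi_t wide) */
#undef N

/* A caller would then XOR four source rows, each wide words long, into m: */
static void combine_four(word *m, word const *t[4], wi_t wide) {
  _mzd_combine_4(m, t, wide);
}
#endif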