#include <inttypes.h>
#include <immintrin.h>
#include "m.h"

// Tail-safe load of four consecutive 64-bit words.
//
// Given an array x[0]...x[n-1] (the caller is expected to have n >= 4),
// fill y[0]...y[3] as follows, for i = 0,1,2,3 in order:
//   y[i] = x[m+i]  while m+i < n
//   y[i] = c       from the first i with m+i >= n onwards
// m is not a parameter of this function; presumably it is supplied by m.h
// (confirm its type and range there).
void int64x4_loadtail(int64_t *y,const int64_t *x,long long n,int64_t c)
{
  // per-lane selector: all-ones = take the word from x, zero = take c
  int64_t lanes[4] = {0,0,0,0};
  long long k = 0;

  // enable the leading lanes whose source index is still below n
  while (k < 4 && m+k < n) {
    lanes[k] = -1;
    ++k;
  }

  __m256i fill = _mm256_set1_epi64x(c);
  __m256i select = _mm256_loadu_si256((const __m256i *) lanes);

  // masked load: only lanes whose selector has the top bit set are fetched;
  // the remaining lanes come back as zero and are replaced by the blend below
  __m256i loaded = _mm256_maskload_epi64((const long long *) (x+m),select);

  // selector bytes are uniformly 0x00 or 0xff within a lane, so the
  // byte-granular blend picks whole 64-bit words
  __m256i result = _mm256_blendv_epi8(fill,loaded,select);

  _mm256_storeu_si256((__m256i *) y,result);
}
