#include <inttypes.h>
#include <immintrin.h>
#include "crypto_int32.h"
#include "crypto_int64.h"
#include "m.h"

// have an array x[0]...x[n-1] with n >= 8
// want to load x[m]...x[m+7] into y[0]...y[7], but only what fits within
// x[0]...x[n-1]; every window word at index n or beyond is set to c instead
//
// m is not a parameter: it is resolved from the enclosing scope
// (presumably supplied by "m.h" -- TODO confirm against that header).
// NOTE(review): the load starts at x[min(m+8,n)-8], so this looks like it
// requires m >= 0 -- confirm with callers.
void int32x8_loadtail(int32_t *y,const int32_t *x,long long n,int32_t c)
{
  n = crypto_int64_max(8,n); // no-op when caller properly guarantees n >= 8

  // pos = end (exclusive) of the 8 words actually read: the window end m+8,
  // pulled back to n so that the load never reaches x[n] or beyond
  long long pos = crypto_int64_min(m+8,n);
  __m256i xpart = _mm256_loadu_si256((const __m256i *) (x+pos-8));

  // lane i of diff is (m+i)-pos; it is never below -8, and it is negative
  // when m+i < n, i.e. when window word i was covered by the load
  __m256i posvec = _mm256_set1_epi32(pos);
  __m256i mplus = _mm256_setr_epi32(m,m+1,m+2,m+3,m+4,m+5,m+6,m+7);
  __m256i diff = _mm256_sub_epi32(mplus,posvec);

  // the permute uses only the low 3 bits of each index; for diff in -8...-1
  // those bits equal diff+8, the position of x[m+i] inside xpart
  __m256i xrotate = _mm256_permutevar8x32_epi32(xpart,diff);

  // blendv_epi8 tests the top bit of every mask *byte*, so the raw diff is
  // only a valid mask while each lane is in -8...7; a larger non-negative
  // lane (e.g. 0x80) would splice loaded bytes into c. Broadcasting the
  // sign bit across each 32-bit lane makes the selection all-or-nothing.
  __m256i inside = _mm256_srai_epi32(diff,31);
  __m256i cvec = _mm256_set1_epi32(c);
  __m256i xc = _mm256_blendv_epi8(cvec,xrotate,inside);

  _mm256_storeu_si256((__m256i *) y,xc);
}
