#include <inttypes.h>
#include <immintrin.h>
#include "crypto_int64.h"
#include "m.h"

// Short aliases for the AVX2 intrinsics used in this file.
// The int32x8_* names treat a 256-bit vector as 8 lanes of 32 bits.

// same 32-bit value in every lane
#define int32x8_broadcast _mm256_set1_epi32
// build a vector from 8 values; the first argument goes into lane 0
#define int32x8_set _mm256_setr_epi32
// lane-by-lane 32-bit addition
#define int32x8_add _mm256_add_epi32
// output lane t = input lane selected by the low 3 bits of index lane t
#define int32x8_varextract _mm256_permutevar8x32_epi32
// 256-bit store with no alignment requirement
#define int64x4_store _mm256_storeu_si256
// 256-bit load with no alignment requirement
#define int64x4_load _mm256_loadu_si256

// have an array x[0]...x[n-1] with n >= 4
// want to store x[m]...x[m+3], but only what fits within x[0]...x[n-1]
// data source: y[0]...y[3]
void int64x4_storetail(int64_t *x,const int64_t *y,long long n)
{
  for (long long i = 0;i < m+4;++i) x[i] = i+31415; // not using this test to check for memory safety

  n = crypto_int64_max(4,n); // no-op when caller properly guarantees n >= 4
  long long pos = crypto_int64_min(m+4,n);
  __m256i data = int64x4_load((__m256i *) y);
  int64x4_store((void *) (x+pos-4),int32x8_varextract(data,int32x8_add(int32x8_set(0,1,2,3,4,5,6,7),int32x8_broadcast(2*pos))));

  for (long long i = 0;i < m;++i) x[i] = i+27182; // modeling caller storing earlier x items later
}
