// -rw-r--r-- 164966 nttcompiler-20220411/ops-512/avx/ntt.c
// auto-generated; do not edit
#include "ntt_ops_512.h"
#include <assert.h>
#include <immintrin.h>
#define _mm256_permute2x128_si256_lo(f0,f1) _mm256_permute2x128_si256(f0,f1,0x20) /* low 128-bit lanes of f0 and f1 */
#define _mm256_permute2x128_si256_hi(f0,f1) _mm256_permute2x128_si256(f0,f1,0x31) /* high 128-bit lanes of f0 and f1 */
#define int16x16 __m256i
typedef int16_t int16;
typedef int32_t int32;
#include "ntt_ops.h"
// operation counters, accumulated into the ntt_ops_* globals (presumably declared in ntt_ops.h)
#define nummul_count(n) ntt_ops_mul += (n)
#define numadd_count(n) ntt_ops_add += (n)
#define nummul_x16_count(n) ntt_ops_mul_x16 += (n)
#define numadd_x16_count(n) ntt_ops_add_x16 += (n)
#define nummulmod_count(n) ntt_ops_mulmod += (n)
#define numreduce_count(n) ntt_ops_reduce += (n)
static const int16 __attribute((aligned(32))) qdata_7681[] = {
#define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0)
-3593,-3593,-3593,-3593,-3625,-3625,-3625,-3625,-3593,-3593,-3593,-3593,-3625,-3625,-3625,-3625,
#define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16)
-3777,-3777,-3777,-3777,3182,3182,3182,3182,-3777,-3777,-3777,-3777,3182,3182,3182,3182,
#define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32)
-3593,-3593,-3593,-3593,-3182,-3182,-3182,-3182,-3593,-3593,-3593,-3593,-3182,-3182,-3182,-3182,
#define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48)
3777,3777,3777,3777,3625,3625,3625,3625,3777,3777,3777,3777,3625,3625,3625,3625,
#define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64)
-3593,-3593,-3593,-3593,2194,2194,2194,2194,-3593,-3593,-3593,-3593,2194,2194,2194,2194,
#define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80)
-3625,-3625,-3625,-3625,-1100,-1100,-1100,-1100,-3625,-3625,-3625,-3625,-1100,-1100,-1100,-1100,
#define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+96)
-3593,-3593,-3593,-3593,3696,3696,3696,3696,-3593,-3593,-3593,-3593,3696,3696,3696,3696,
#define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112)
-3182,-3182,-3182,-3182,-2456,-2456,-2456,-2456,-3182,-3182,-3182,-3182,-2456,-2456,-2456,-2456,
#define precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128)
-3593,1701,2194,834,-3625,2319,-1100,121,-3593,1701,2194,834,-3625,2319,-1100,121,
#define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144)
-3777,1414,2456,2495,3182,2876,-3696,2250,-3777,1414,2456,2495,3182,2876,-3696,2250,
#define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160)
-3593,-2250,3696,-2876,-3182,-2495,-2456,-1414,-3593,-2250,3696,-2876,-3182,-2495,-2456,-1414,
#define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176)
3777,-121,1100,-2319,3625,-834,-2194,-1701,3777,-121,1100,-2319,3625,-834,-2194,-1701,
#define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192)
-3593,3364,1701,-1599,2194,2557,834,-2816,-3593,3364,1701,-1599,2194,2557,834,-2816,
#define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208)
-3625,617,2319,2006,-1100,-1296,121,1986,-3625,617,2319,2006,-1100,-1296,121,1986,
#define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224)
-3593,2237,-2250,-1483,3696,3706,-2876,1921,-3593,2237,-2250,-1483,3696,3706,-2876,1921,
#define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240)
-3182,2088,-2495,-1525,-2456,1993,-1414,2830,-3182,2088,-2495,-1525,-2456,1993,-1414,2830,
#define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256)
-3593,514,3364,438,1701,2555,-1599,-1738,2194,103,2557,1881,834,-549,-2816,638,
#define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+272)
-3625,-1399,617,-1760,2319,2535,2006,3266,-1100,-1431,-1296,3174,121,3153,1986,-810,
#define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+288)
-3777,2956,-2830,-679,1414,2440,-1993,-3689,2456,2804,1525,3555,2495,1535,-2088,-7,
#define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304)
3182,-1321,-1921,-1305,2876,-3772,-3706,3600,-3696,-2043,1483,-396,2250,-2310,-2237,1887,
#define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320)
-3593,-1887,2237,2310,-2250,396,-1483,2043,3696,-3600,3706,3772,-2876,1305,1921,1321,
#define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336)
-3182,7,2088,-1535,-2495,-3555,-1525,-2804,-2456,3689,1993,-2440,-1414,679,2830,-2956,
#define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352)
3777,810,-1986,-3153,-121,-3174,1296,1431,1100,-3266,-2006,-2535,-2319,1760,-617,1399,
#define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368)
3625,-638,2816,549,-834,-1881,-2557,-103,-2194,1738,1599,-2555,-1701,-438,-3364,-514,
#define precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384)
-3593,-1532,514,-373,3364,-3816,438,-3456,1701,783,2555,2883,-1599,727,-1738,-2385,
#define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400)
2194,-2160,103,-2391,2557,2762,1881,-2426,834,3310,-549,-1350,-2816,1386,638,-194,
#define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416)
-3625,404,-1399,-3692,617,-2764,-1760,-1054,2319,1799,2535,-3588,2006,1533,3266,2113,
#define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432)
-1100,-2579,-1431,-1756,-1296,1598,3174,-2,121,-3480,3153,-2572,1986,2743,-810,2919,
#define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+448)
-3593,2789,-1887,-921,2237,-1497,2310,-2133,-2250,-915,396,1390,-1483,3135,2043,-859,
#define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464)
3696,2732,-3600,-1464,3706,2224,3772,-2665,-2876,1698,1305,2835,1921,730,1321,486,
#define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480)
-3182,3417,7,-3428,2088,-3145,-1535,1168,-2495,-3831,-3555,-3750,-1525,660,-2804,2649,
#define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496)
-2456,3405,3689,-1521,1993,1681,-2440,1056,-1414,1166,679,-2233,2830,2175,-2956,-1919,
#define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512)
-3593,-1404,-1532,451,514,-402,-373,1278,3364,-509,-3816,-3770,438,-2345,-3456,-226,
#define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528)
1701,-1689,783,-1509,2555,2963,2883,1242,-1599,1669,727,2719,-1738,642,-2385,-436,
#define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544)
2194,3335,-2160,1779,103,3745,-2391,17,2557,2812,2762,-1144,1881,83,-2426,-1181,
#define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560)
834,-1519,3310,3568,-549,-796,-1350,2072,-2816,-2460,1386,2891,638,-2083,-194,-715,
#define precomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576)
-3593,451,-373,-509,438,-226,783,2963,-1599,2719,-2385,3335,103,17,2762,83,
#define precomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592)
834,3568,-1350,-2460,638,-715,404,1931,617,1295,-1054,-2262,2535,2059,1533,-791,
#define precomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+608)
-1100,-1151,-1756,-2005,3174,151,-3480,-3781,1986,-3550,2919,-2874,2956,-929,2233,1338,
#define precomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624)
1414,2918,-1681,692,-3689,-236,-2649,3366,1525,1072,3831,-188,1535,-3177,3428,-3312,
#define precomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640)
-3593,370,-921,1649,2310,893,-915,2589,-1483,3214,-859,1121,-3600,-3287,2224,-2339,
#define precomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656)
-2876,-2515,2835,-1348,1321,2130,3417,-2340,2088,-3163,1168,1203,-3555,3763,660,3547,
#define precomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672)
-2456,1837,-1521,-179,-2440,-777,1166,3450,2830,429,-1919,1476,810,-3677,2572,-1586,
#define precomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688)
-121,1526,-1598,-2001,1431,1441,-2113,-3314,-2006,3208,-1799,-2767,1760,-3343,3692,-222,
#define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704)
-3593,658,2789,370,-1887,-3434,-921,-3752,2237,1649,-1497,2258,2310,3581,-2133,893,
#define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720)
-2250,3794,-915,826,396,2589,1390,592,-1483,-2422,3135,3214,2043,-434,-859,-2532,
#define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736)
3696,1121,2732,2965,-3600,2998,-1464,-3287,3706,1070,2224,-589,3772,-2339,-2665,2070,
#define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752)
-2876,2378,1698,-2515,1305,-2815,2835,-2937,1921,-1348,730,-3723,1321,1712,486,2130,
#define q_x16 *(const int16x16 *)(qdata+768)
7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,7681,
#define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784)
-9,-9,-9,-9,-16425,-16425,-16425,-16425,-9,-9,-9,-9,-16425,-16425,-16425,-16425,
#define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800)
-28865,-28865,-28865,-28865,10350,10350,10350,10350,-28865,-28865,-28865,-28865,10350,10350,10350,10350,
#define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816)
-9,-9,-9,-9,-10350,-10350,-10350,-10350,-9,-9,-9,-9,-10350,-10350,-10350,-10350,
#define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832)
28865,28865,28865,28865,16425,16425,16425,16425,28865,28865,28865,28865,16425,16425,16425,16425,
#define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848)
-9,-9,-9,-9,-4974,-4974,-4974,-4974,-9,-9,-9,-9,-4974,-4974,-4974,-4974,
#define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864)
-16425,-16425,-16425,-16425,-7244,-7244,-7244,-7244,-16425,-16425,-16425,-16425,-7244,-7244,-7244,-7244,
#define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880)
-9,-9,-9,-9,-4496,-4496,-4496,-4496,-9,-9,-9,-9,-4496,-4496,-4496,-4496,
#define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+896)
-10350,-10350,-10350,-10350,-14744,-14744,-14744,-14744,-10350,-10350,-10350,-10350,-14744,-14744,-14744,-14744,
#define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912)
-9,-20315,-4974,18242,-16425,18191,-7244,-11655,-9,-20315,-4974,18242,-16425,18191,-7244,-11655,
#define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928)
-28865,20870,14744,-22593,10350,828,4496,23754,-28865,20870,14744,-22593,10350,828,4496,23754,
#define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944)
-9,-23754,-4496,-828,-10350,22593,-14744,-20870,-9,-23754,-4496,-828,-10350,22593,-14744,-20870,
#define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960)
28865,11655,7244,-18191,16425,-18242,4974,20315,28865,11655,7244,-18191,16425,-18242,4974,20315,
#define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976)
-9,-10972,-20315,23489,-4974,25597,18242,-2816,-9,-10972,-20315,23489,-4974,25597,18242,-2816,
#define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992)
-16425,-19351,18191,-3114,-7244,-9488,-11655,19394,-16425,-19351,18191,-3114,-7244,-9488,-11655,19394,
#define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008)
-9,-7491,-23754,-15307,-4496,-15750,-828,-5759,-9,-7491,-23754,-15307,-4496,-15750,-828,-5759,
#define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024)
-10350,22568,22593,-20469,-14744,31177,-20870,26382,-10350,22568,22593,-20469,-14744,31177,-20870,26382,
#define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040)
-9,-14846,-10972,-21066,-20315,-24581,23489,-23242,-4974,-4505,25597,-26279,18242,21467,-2816,15998,
#define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056)
-16425,-4983,-19351,14624,18191,-2073,-3114,20674,-7244,-21399,-9488,6246,-11655,-29103,19394,-5930,
#define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072)
-28865,-23668,-26382,-28839,20870,6536,-31177,16279,14744,29428,20469,29667,-22593,9215,-22568,-11783,
#define qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088)
10350,-14121,5759,-5913,828,-1724,15750,11792,4496,25093,15307,26228,23754,-21766,7491,-6817,
#define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104)
-9,6817,-7491,21766,-23754,-26228,-15307,-25093,-4496,-11792,-15750,1724,-828,5913,-5759,14121,
#define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120)
-10350,11783,22568,-9215,22593,-29667,-20469,-29428,-14744,-16279,31177,-6536,-20870,28839,26382,23668,
#define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136)
28865,5930,-19394,29103,11655,-6246,9488,21399,7244,-20674,3114,2073,-18191,-14624,19351,4983,
#define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152)
16425,-15998,2816,-21467,-18242,26279,-25597,4505,4974,23242,-23489,24581,20315,21066,10972,14846,
#define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168)
-9,-32252,-14846,-19317,-10972,8472,-21066,-3456,-20315,16655,-24581,12611,23489,-12073,-23242,29871,
#define qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184)
-4974,6032,-4505,10409,25597,24266,-26279,17030,18242,10478,21467,11962,-2816,-26262,15998,-17602,
#define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200)
-16425,-22124,-4983,-26220,-19351,-8908,14624,32738,18191,13575,-2073,27132,-3114,24573,20674,27201,
#define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216)
-7244,12269,-21399,-16092,-9488,-15810,6246,15358,-11655,-15768,-29103,24052,19394,-26441,-5930,-1689,
#define qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232)
-9,13541,6817,-5529,-7491,26663,21766,-4693,-23754,13933,-26228,8558,-15307,-21953,-25093,-22875,
#define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248)
-4496,-7508,-11792,-30136,-15750,26800,1724,17303,-828,2722,5913,-12013,-5759,30426,14121,3558,
#define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264)
-10350,-24743,11783,-21860,22568,-32329,-9215,9360,22593,-7415,-29667,25946,-20469,-21868,-29428,-25511,
#define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280)
-14744,1869,-16279,14351,31177,2193,-6536,17440,-20870,24718,28839,-23225,26382,9855,23668,-9599,
#define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296)
-9,-32124,-32252,10179,-14846,6766,-19317,16638,-10972,-23549,8472,-17082,-21066,-15145,-3456,31518,
#define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312)
-20315,-6297,16655,-12261,-24581,-11885,12611,30938,23489,28805,-12073,26783,-23242,-14718,29871,5708,
#define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328)
-4974,15111,6032,-29453,-4505,12449,10409,529,25597,-32004,24266,2952,-26279,18003,17030,24931,
#define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344)
18242,-1007,10478,-4624,21467,17636,11962,14360,-2816,15972,-26262,16715,15998,4573,-17602,-14539,
#define qinvprecomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360)
-9,10179,-19317,-23549,-21066,31518,16655,-11885,23489,26783,29871,15111,-4505,529,24266,18003,
#define qinvprecomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376)
18242,-4624,11962,15972,15998,-14539,-22124,-17013,-19351,17167,32738,2858,-2073,-16885,24573,-20759,
#define qinvprecomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392)
-7244,-8831,-16092,-4565,6246,20119,-15768,1851,19394,-2526,-1689,-16186,-23668,-9633,23225,14650,
#define qinvprecomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408)
20870,5990,-2193,-5452,16279,-22764,25511,-26330,20469,25648,7415,1860,9215,16791,21860,4880,
#define qinvprecomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424)
-9,-23182,-5529,-14223,21766,23933,13933,-23523,-15307,26766,-22875,-22943,-11792,9513,26800,4317,
#define qinvprecomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440)
-828,-20435,-12013,-3396,14121,27730,-24743,11996,22568,-25179,9360,2739,-29667,5299,-21868,25563,
#define qinvprecomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1456)
-14744,-16083,14351,-1715,-6536,2807,24718,-16006,26382,-17491,-9599,3524,5930,-10333,-24052,-10802,
#define qinvprecomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472)
11655,12790,15810,30255,21399,10145,-27201,20238,3114,7304,-13575,14129,-14624,-19215,26220,802,
#define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488)
-9,-6510,13541,-23182,6817,24214,-5529,-24232,-7491,-14223,26663,27858,21766,26621,-4693,23933,
#define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504)
-23754,29394,13933,14138,-26228,-23523,8558,-23984,-15307,-13686,-21953,26766,-25093,-9650,-22875,-20964,
#define qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520)
-4496,-22943,-7508,-27243,-11792,-18506,-30136,9513,-15750,-24530,26800,947,1724,4317,17303,29718,
#define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536)
-828,23882,2722,-20435,5913,-10495,-12013,8839,-5759,-3396,30426,15221,14121,26288,3558,27730,
#define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552)
-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,-28865,
#define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568)
28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,28865,
#define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584)
-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,-16425,
#define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600)
-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,-10350,
#define qround32_x16 *(const int16x16 *)(qdata+1616)
4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
#define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632)
-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,-3777,
#define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648)
3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,3777,
#define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664)
-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,-3625,
#define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680)
-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,-3182,
#define q qdata[1696]
7681,
} ;
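// Note on the table above: the precomp_* vectors hold twist constants
// pre-scaled by 2^16 mod q (e.g. -3593 = 2^16 mod 7681 represents 1), and each
// qinvprecomp_* vector is the matching precomp_* vector multiplied by
// q^-1 mod 2^16 (e.g. -9 = -3593*7681^-1 mod 2^16): exactly the (y,yqinv)
// pairs consumed by mulmod_scaled_x16 below.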
static const int16 __attribute((aligned(32))) qdata_10753[] = {
// precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
1018,1018,1018,1018,3688,3688,3688,3688,1018,1018,1018,1018,3688,3688,3688,3688,
// precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
-223,-223,-223,-223,-4188,-4188,-4188,-4188,-223,-223,-223,-223,-4188,-4188,-4188,-4188,
// precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
1018,1018,1018,1018,4188,4188,4188,4188,1018,1018,1018,1018,4188,4188,4188,4188,
// precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
223,223,223,223,-3688,-3688,-3688,-3688,223,223,223,223,-3688,-3688,-3688,-3688,
// precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
1018,1018,1018,1018,-376,-376,-376,-376,1018,1018,1018,1018,-376,-376,-376,-376,
// precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
3688,3688,3688,3688,-3686,-3686,-3686,-3686,3688,3688,3688,3688,-3686,-3686,-3686,-3686,
// precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
1018,1018,1018,1018,-2413,-2413,-2413,-2413,1018,1018,1018,1018,-2413,-2413,-2413,-2413,
// precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
4188,4188,4188,4188,-357,-357,-357,-357,4188,4188,4188,4188,-357,-357,-357,-357,
// precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
1018,-3364,-376,4855,3688,425,-3686,2695,1018,-3364,-376,4855,3688,425,-3686,2695,
// precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
-223,-3784,357,-2236,-4188,4544,2413,730,-223,-3784,357,-2236,-4188,4544,2413,730,
// precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
1018,-730,-2413,-4544,4188,2236,-357,3784,1018,-730,-2413,-4544,4188,2236,-357,3784,
// precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
223,-2695,3686,-425,-3688,-4855,376,3364,223,-2695,3686,-425,-3688,-4855,376,3364,
// precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
1018,-5175,-3364,2503,-376,1341,4855,-4875,1018,-5175,-3364,2503,-376,1341,4855,-4875,
// precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
3688,-2629,425,-4347,-3686,3823,2695,-4035,3688,-2629,425,-4347,-3686,3823,2695,-4035,
// precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
1018,5063,-730,341,-2413,-3012,-4544,-5213,1018,5063,-730,341,-2413,-3012,-4544,-5213,
// precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
4188,1520,2236,1931,-357,918,3784,4095,4188,1520,2236,1931,-357,918,3784,4095,
// precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,3085,-5175,2982,-3364,-4744,2503,-4129,-376,-2576,1341,-193,4855,3062,-4875,4,
// precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
3688,2388,-2629,-4513,425,4742,-4347,2935,-3686,-544,3823,-2178,2695,847,-4035,268,
// precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-223,-1299,-4095,-1287,-3784,-4876,-918,3091,357,-4189,-1931,4616,-2236,2984,-1520,-3550,
// precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-4188,-1009,5213,-205,4544,-4102,3012,2790,2413,-1085,-341,-2565,730,-4379,-5063,-1284,
// precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,1284,5063,4379,-730,2565,341,1085,-2413,-2790,-3012,4102,-4544,205,-5213,1009,
// precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
4188,3550,1520,-2984,2236,-4616,1931,4189,-357,-3091,918,4876,3784,1287,4095,1299,
// precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
223,-268,4035,-847,-2695,2178,-3823,544,3686,-2935,4347,-4742,-425,4513,2629,-2388,
// precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-3688,-4,4875,-3062,-4855,193,-1341,2576,376,4129,-2503,4744,3364,-2982,5175,-3085,
// precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,5116,3085,-3615,-5175,400,2982,3198,-3364,2234,-4744,-4828,2503,326,-4129,-512,
// precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-376,1068,-2576,-4580,1341,3169,-193,-2998,4855,-635,3062,-4808,-4875,-2740,4,675,
// precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
3688,-1324,2388,5114,-2629,5294,-4513,-794,425,-864,4742,-886,-4347,336,2935,-2045,
// precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-3686,-3715,-544,4977,3823,-2737,-2178,3441,2695,467,847,454,-4035,-779,268,2213,
// precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,1615,1284,2206,5063,5064,4379,472,-730,-5341,2565,-4286,341,2981,1085,-1268,
// precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-2413,-3057,-2790,-2884,-3012,-1356,4102,-3337,-4544,5023,205,-636,-5213,909,1009,-2973,
// precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
4188,2271,3550,-1572,1520,1841,-2984,970,2236,-4734,-4616,578,1931,-116,4189,1586,
// precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-357,-2774,-3091,-1006,918,-5156,4876,4123,3784,-567,1287,151,4095,1458,1299,2684,
// precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,-3260,5116,-1722,3085,5120,-3615,3760,-5175,73,400,4254,2982,2788,3198,-2657,
// precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-3364,569,2234,1930,-4744,-2279,-4828,5215,2503,-4403,326,1639,-4129,5068,-512,-5015,
// precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-376,-4859,1068,-40,-2576,4003,-4580,-4621,1341,2487,3169,-2374,-193,2625,-2998,4784,
// precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
4855,825,-635,2118,3062,-2813,-4808,-4250,-4875,-2113,-2740,-4408,4,-1893,675,458,
// precomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,-1722,-3615,73,2982,-2657,2234,-2279,2503,1639,-512,-4859,-2576,-4621,3169,2625,
// precomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
4855,2118,-4808,-2113,4,458,-1324,-1056,-2629,-5313,-794,-4889,4742,5309,336,-4540,
// precomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-3686,-2680,4977,5334,-2178,-2062,467,5083,-4035,-5005,2213,693,-1299,-3570,-151,-1160,
// precomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-3784,-3148,5156,-1053,3091,4447,-1586,1204,-1931,-663,4734,4393,2984,-2428,1572,-2807,
// precomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,5268,2206,1381,4379,-4000,-5341,1409,341,5356,-1268,3135,-2790,-4720,-1356,-4144,
// precomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-4544,2449,-636,2624,1009,-3419,2271,1992,1520,3535,970,-1635,-4616,-2529,-116,-3241,
// precomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-357,3096,-1006,-854,4876,4973,-567,1122,4095,-4519,2684,1573,-268,1782,-454,-2117,
// precomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-2695,-3827,2737,-2230,544,2963,2045,-2283,4347,2151,864,-4782,4513,-4891,-5114,-2909,
// precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
1018,-3524,1615,5268,1284,4428,2206,-834,5063,1381,5064,279,4379,2439,472,-4000,
// precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-730,-2015,-5341,3891,2565,1409,-4286,2605,341,573,2981,5356,1085,-2087,-1268,-554,
// precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-2413,3135,-3057,3125,-2790,-778,-2884,-4720,-3012,-3453,-1356,-355,4102,-4144,-3337,-152,
// precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-4544,-3410,5023,2449,205,-97,-636,1927,-5213,2624,909,-1689,1009,-4359,-2973,-3419,
// q_x16
10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,10753,
// qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
-6,-6,-6,-6,-408,-408,-408,-408,-6,-6,-6,-6,-408,-408,-408,-408,
// qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
-27359,-27359,-27359,-27359,1956,1956,1956,1956,-27359,-27359,-27359,-27359,1956,1956,1956,1956,
// qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
-6,-6,-6,-6,-1956,-1956,-1956,-1956,-6,-6,-6,-6,-1956,-1956,-1956,-1956,
// qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
27359,27359,27359,27359,408,408,408,408,27359,27359,27359,27359,408,408,408,408,
// qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
-6,-6,-6,-6,-20856,-20856,-20856,-20856,-6,-6,-6,-6,-20856,-20856,-20856,-20856,
// qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
-408,-408,-408,-408,-21094,-21094,-21094,-21094,-408,-408,-408,-408,-21094,-21094,-21094,-21094,
// qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
-6,-6,-6,-6,-10093,-10093,-10093,-10093,-6,-6,-6,-6,-10093,-10093,-10093,-10093,
// qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
-1956,-1956,-1956,-1956,-28517,-28517,-28517,-28517,-1956,-1956,-1956,-1956,-28517,-28517,-28517,-28517,
// qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
-6,-9508,-20856,-29449,-408,18345,-21094,-7033,-6,-9508,-20856,-29449,-408,18345,-21094,-7033,
// qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
-27359,-16072,28517,-12476,1956,-28224,10093,16090,-27359,-16072,28517,-12476,1956,-28224,10093,16090,
// qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
-6,-16090,-10093,28224,-1956,12476,-28517,16072,-6,-16090,-10093,28224,-1956,12476,-28517,16072,
// qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
27359,7033,21094,-18345,408,29449,20856,9508,27359,7033,21094,-18345,408,29449,20856,9508,
// qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
-6,-3639,-9508,25543,-20856,829,-29449,-17675,-6,-3639,-9508,25543,-20856,829,-29449,-17675,
// qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
-408,18363,18345,7429,-21094,-10001,-7033,-4547,-408,18363,18345,7429,-21094,-10001,-7033,-4547,
// qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
-6,28103,-16090,3925,-10093,7228,28224,11683,-6,28103,-16090,3925,-10093,7228,28224,11683,
// qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
-1956,-23056,12476,14731,-28517,26518,16072,14847,-1956,-23056,12476,14731,-28517,26518,16072,14847,
// qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,-5619,-3639,-12378,-9508,15736,25543,23007,-20856,-27152,829,-22209,-29449,-20490,-17675,22532,
// qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-408,16724,18363,22623,18345,5766,7429,-31369,-21094,15840,-10001,19326,-7033,3407,-4547,2316,
// qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-27359,6381,-14847,8441,-16072,-6924,-26518,-4589,28517,12707,-14731,-15864,-12476,31656,23056,24098,
// qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
1956,-31217,-11683,-24269,-28224,-5126,-7228,20198,10093,-573,-3925,-14341,16090,23781,-28103,-23812,
// qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,23812,28103,-23781,-16090,14341,3925,573,-10093,-20198,7228,5126,28224,24269,11683,31217,
// qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-1956,-24098,-23056,-31656,12476,15864,14731,-12707,-28517,4589,26518,6924,16072,-8441,14847,-6381,
// qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
27359,-2316,4547,-3407,7033,-19326,10001,-15840,21094,31369,-7429,-5766,-18345,-22623,-18363,-16724,
// qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
408,-22532,17675,20490,29449,22209,-829,27152,20856,-23007,-25543,-15736,9508,12378,3639,5619,
// qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,-17412,-5619,2017,-3639,24976,-12378,24702,-9508,-31558,15736,1316,25543,-31418,23007,-512,
// qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-20856,-13268,-27152,22044,829,8801,-22209,-12214,-29449,11141,-20490,-17096,-17675,32076,22532,17571,
// qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-408,13012,16724,4090,18363,-30546,22623,16614,18345,-17248,5766,22666,7429,-7856,-31369,31235,
// qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-21094,28541,15840,-30351,-10001,-177,19326,-31887,-7033,25555,3407,-31290,-4547,-13579,2316,-2395,
// qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,4175,23812,7326,28103,17352,-23781,-28200,-16090,11555,14341,6978,3925,-1627,573,780,
// qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-10093,32271,-20198,7356,7228,29364,5126,27895,28224,-609,24269,21892,11683,-7795,31217,-18845,
// qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-1956,29407,-24098,-7716,-23056,-719,-31656,-8246,12476,-26238,15864,11842,14731,1932,-12707,-11726,
// qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-28517,4394,4589,2066,26518,-11300,6924,-24037,16072,969,-8441,14999,14847,-11854,-6381,-19844,
// qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,-13500,-17412,32070,-5619,5120,2017,11952,-3639,1609,24976,9374,-12378,-23836,24702,-8289,
// qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-9508,-22471,-31558,25482,15736,-8935,1316,32351,25543,19661,-31418,8295,23007,-25652,-512,-19863,
// qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-20856,6917,-13268,-28712,-27152,20899,22044,4083,829,951,8801,29370,-22209,24641,-12214,12976,
// qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-29449,-22215,11141,-29626,-20490,30467,-17096,13158,-17675,-24129,32076,7880,22532,-30053,17571,-8758,
// qinvprecomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,32070,2017,1609,-12378,-8289,-31558,-8935,25543,8295,-512,6917,-27152,4083,8801,24641,
// qinvprecomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-29449,-29626,-17096,-24129,22532,-8758,13012,15328,18363,-27329,16614,1767,5766,4797,-7856,-14780,
// qinvprecomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-21094,-23160,-30351,-1834,19326,17394,25555,9691,-4547,3699,-2395,20661,6381,-23026,-14999,19320,
// qinvprecomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
-16072,27572,11300,-16925,-4589,31583,11726,31924,-14731,-15511,26238,22313,31656,20100,7716,31497,
// qinvprecomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,-13164,7326,29541,-23781,12384,11555,-9343,3925,23788,780,-18881,-20198,19856,29364,-12336,
// qinvprecomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
28224,16273,21892,-30144,31217,-8027,29407,14280,-23056,6095,-8246,14237,15864,-8161,1932,-21161,
// qinvprecomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-28517,7192,2066,6314,6924,12653,969,-3998,14847,21593,-19844,-3035,-2316,-21770,31290,18875,
// qinvprecomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
7033,-12531,177,-11446,-15840,-4717,-31235,-31467,-7429,8807,17248,31058,-22623,23269,-4090,13987,
// qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
-6,6716,4175,-13164,23812,-26292,7326,-12098,28103,29541,17352,15127,-23781,-7289,-28200,12384,
// qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
-16090,-29151,11555,-20173,14341,-9343,6978,-22483,3925,61,-1627,23788,573,24025,780,-7722,
// qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
-10093,-18881,32271,23093,-20198,-24330,7356,19856,7228,29827,29364,15517,5126,-12336,27895,-4248,
// qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
28224,26286,-609,16273,24269,-5729,21892,-7801,11683,-30144,-7795,4967,31217,5369,-18845,-8027,
// qinvscaledzeta_x16_4_1
-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,-27359,
// qinvscaledzeta_x16_4_3
27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,27359,
// qinvscaledzeta_x16_8_1
-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,-408,
// qinvscaledzeta_x16_8_7
-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,-1956,
// qround32_x16
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// scaledzeta_x16_4_1
-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,-223,
// scaledzeta_x16_4_3
223,223,223,223,223,223,223,223,223,223,223,223,223,223,223,223,
// scaledzeta_x16_8_1
3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,3688,
// scaledzeta_x16_8_7
4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,4188,
// q
10753,
} ;
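// qdata_10753 has the same layout as qdata_7681, entry for entry, so the
// precomp_*/qinvprecomp_*/q_x16/qround32_x16 macros defined above index into
// either table through the qdata pointer passed to the functions below; the
// entries here carry comments only, presumably because the macros were
// already emitted once.  The scaling convention is the same:
// 1018 = 2^16 mod 10753 and -6 = 1018*10753^-1 mod 2^16.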
static inline int16x16 add_x16(int16x16 a,int16x16 b)
{
numadd_count(16);
numadd_x16_count(1);
return _mm256_add_epi16(a,b);
}
static inline int16x16 sub_x16(int16x16 a,int16x16 b)
{
numadd_count(16);
numadd_x16_count(1);
return _mm256_sub_epi16(a,b);
}
static inline int16x16 mulmod_scaled_x16(int16x16 x,int16x16 y,int16x16 yqinv,const int16 *qdata)
{
nummulmod_count(16);
nummul_count(48);
nummul_x16_count(3);
int16x16 b = _mm256_mulhi_epi16(x,y); /* floor(x*y/2^16) per lane */
int16x16 d = _mm256_mullo_epi16(x,yqinv); /* x*yqinv mod 2^16 per lane */
int16x16 e = _mm256_mulhi_epi16(d,q_x16); /* floor(d*q/2^16) per lane */
/* given yqinv = y*q^-1 mod 2^16: b-e = (x*y-d*q)/2^16, congruent to x*y/2^16 mod q */
return sub_x16(b,e);
}
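// A scalar, one-lane model of mulmod_scaled_x16, kept here only as
// documentation; it assumes two's-complement wrap on the int16 cast and an
// arithmetic right shift, which matches what the AVX2 intrinsics compute.
// With y a constant pre-scaled by 2^16 mod q and yqinv = y*q^-1 mod 2^16,
// the result is congruent to x times the unscaled constant mod q, with
// magnitude on the order of q for the coefficient ranges asserted below.
static inline int16 mulmod_scaled_scalar(int16 x,int16 y,int16 yqinv,int16 qq)
{
int16 b = ((int32) x*y) >> 16; /* as _mm256_mulhi_epi16 */
int16 d = (int16) (x*yqinv); /* as _mm256_mullo_epi16 */
int16 e = ((int32) d*qq) >> 16; /* as _mm256_mulhi_epi16 */
return b-e; /* = (x*y-d*qq)/2^16, congruent to x*y/2^16 mod qq */
}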
static inline int16x16 reduce_x16(int16x16 x,const int16 *qdata)
{
numreduce_count(16);
nummul_count(32);
nummul_x16_count(2);
int16x16 y = _mm256_mulhrs_epi16(x,qround32_x16); /* y = round(x*qround32/2^15), an approximation of x/q */
y = _mm256_mullo_epi16(y,q_x16); /* y*q */
return sub_x16(x,y); /* x - y*q, congruent to x mod q */
}
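// A scalar, one-lane model of reduce_x16, kept here only as documentation,
// under the same two's-complement assumptions as mulmod_scaled_scalar above.
// c is the qround32 constant (4 for q=7681, 3 for q=10753, close to 2^15/q),
// so y approximates round(x/q) and x - y*q is congruent to x mod q with much
// smaller magnitude; the exact output ranges are what the assertions below track.
static inline int16 reduce_scalar(int16 x,int16 qq,int16 c)
{
int16 y = (int16) ((((int32) x*c) + 16384) >> 15); /* as _mm256_mulhrs_epi16 */
return (int16) (x - y*qq); /* congruent to x mod qq */
}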
// ----- codegen pass 1
//
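// (legend, an inferred reading of the trace notation below -- not authoritative:
//  butterfly a b stride len n k: len butterflies pairing f[a+j] with f[b+j],
//   each (u,v) -> (u + z*v, u - z*v) with z = zeta_n^k (n=1, k=0 means z=1);
//  vector_butterfly a b n k: the same on one pair of 16-coefficient vectors;
//  twist a b stride n k: multiply f[a..b) by successive powers of zeta_n^k,
//   mapping an x^m-zeta modulus back to x^m-1 form;
//  vector_twist a n k i0 ... i15: lane-wise twist of the vector at f+a by
//   zeta_n^(k*i), via mulmod_scaled_x16 and the precomp_n_k_i0_..._i15 tables;
//  reduce / vector_reduce: apply reduce_x16; reduce_ifforward/_ifreverse are
//   the same but only in one transform direction;
//  physical_map / physical_unmap / physical_permute: bookkeeping for how
//   logical coefficient-index bits map to physical array-index bits, realized
//   by the F macro in the generated code;
//  assertranges: coefficient-range assertions, elided here and emitted as the
//   q == 7681 / q == 10753 assert loops in the generated code.)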
// startntt 512
// startbatch 512
// // ----- PRECONDITIONS
// physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// // transform size 512
// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
// // transforms per batch 1
// // batch indexing []
// // total batch size 512
//
// // modulus x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800)
// assertranges ...
//
// // ----- LAYER 1
//
// // butterflies()
// butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // ----- POSTCONDITIONS AFTER LAYER 1
// // transform size 512
// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
// // transforms per batch 1
// // batch indexing []
// // total batch size 512
//
// // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600)
// assertranges ...
//
// // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600)
// assertranges ...
//
// // ----- LAYER 2
//
// // reduce_ifreverse(0,64,1)
// reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // reduce_ifreverse(256,320,1)
// reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterflies()
// butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // ----- POSTCONDITIONS AFTER LAYER 2
// // transform size 512
// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
// // transforms per batch 1
// // batch indexing []
// // total batch size 512
//
// // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200)
// assertranges ...
//
// // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200)
// assertranges ...
//
// // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016)
// assertranges ...
//
// // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016)
// assertranges ...
//
// // ----- LAYER 3
//
// // reduce_ifforward(64,128,1)
// reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterflies()
// butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twists()
// reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 320 384 1 512 509 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// twist 448 512 1 512 3 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // physical_permute(3,6)
// physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
//
// // fold(256)
// physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
// physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,)
//
// // fold(128)
// physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,)
// physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8)
//
// // fold(64)
// physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8)
// physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8)
//
// // nextbatch()
// stopbatch 512
// startbatch 512
//
// // halfbatch()
// physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8)
// stopbatch 512
// doublereps
// startbatch 256
// physical_map (0, 1, 2, 6, 4, 5) (3, 7)
//
// // halfbatch()
// physical_unmap (0, 1, 2, 6, 4, 5) (3, 7)
// stopbatch 256
// doublereps
// startbatch 128
// physical_map (0, 1, 2, 6, 4, 5) (3,)
//
// // ----- POSTCONDITIONS AFTER LAYER 3
// // transform size 64
// // transform indexing [0, 1, 2, 6, 4, 5]
// // transforms per batch 2
// // batch indexing [3]
// // total batch size 128
//
// // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ...
// assertranges ...
//
// // ----- LAYER 4
//
// // butterflies()
// butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,)
//
// // ----- POSTCONDITIONS AFTER LAYER 4
// // transform size 64
// // transform indexing [0, 1, 2, 6, 4, 5]
// // transforms per batch 2
// // batch indexing [3]
// // total batch size 128
//
// // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14753) 1*(11258,15282) 1*(11258,14641) 1*(11258,14402) ...
// assertranges ...
//
// // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14753) 1*(11258,15282) 1*(11258,14641) 1*(11258,14402) ...
// assertranges ...
//
// // ----- LAYER 5
//
// // butterflies()
// butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,)
// butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,)
//
// // twists()
// reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,)
// twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,)
// twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,)
// twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,)
//
// // physical_permute(0,1,2,5)
// physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,)
//
// // fold(32)
// physical_unmap (1, 2, 5, 6, 4, 0) (3,)
// physical_map (1, 2, 5, 6, 4) (0, 3)
//
// // fold(16)
// physical_unmap (1, 2, 5, 6, 4) (0, 3)
// physical_map (1, 2, 5, 6) (0, 3, 4)
//
// // ----- POSTCONDITIONS AFTER LAYER 5
// // transform size 16
// // transform indexing [1, 2, 5, 6]
// // transforms per batch 8
// // batch indexing [0, 3, 4]
// // total batch size 128
//
// // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5802) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7020) 1*(5629,6328) 1*(5629,7033) 1*(5629,6954) ...
// assertranges ...
//
// // ----- LAYER 6
//
// // butterflies()
// butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4)
//
// // physical_permute(1,2,4)
// physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1)
//
// // nextbatch()
// stopbatch 128
// startbatch 128
//
// // ----- POSTCONDITIONS AFTER LAYER 6
// // transform size 16
// // transform indexing [2, 4, 5, 6]
// // transforms per batch 8
// // batch indexing [0, 3, 1]
// // total batch size 128
//
// // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12424) 1*(11258,14021) 1*(11258,12488) 1*(11258,14310) 1*(11258,14290) 1*(11258,13681) 1*(11258,13574) 1*(11258,13540)
// assertranges ...
//
// // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12424) 1*(11258,14021) 1*(11258,12488) 1*(11258,14310) 1*(11258,14290) 1*(11258,13681) 1*(11258,13574) 1*(11258,13540)
// assertranges ...
//
// // ----- LAYER 7
//
// // butterflies()
// butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1)
// butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1)
//
// // twists()
// reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1)
// twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1)
// twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1)
// twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1)
//
// // physical_permute(2,6)
// physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1)
//
// // fold(8)
// physical_unmap (6, 4, 5, 2) (0, 3, 1)
// physical_map (6, 4, 5) (0, 1, 2, 3)
//
// // fold(4)
// physical_unmap (6, 4, 5) (0, 1, 2, 3)
// physical_map (6, 4) (0, 1, 2, 3, 5)
//
// // ----- POSTCONDITIONS AFTER LAYER 7
// // transform size 4
// // transform indexing [6, 4]
// // transforms per batch 32
// // batch indexing [0, 1, 2, 3, 5]
// // total batch size 128
//
// // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6935) 1*(5629,6521) 1*(5629,7156)
// assertranges ...
//
// // ----- LAYER 8
//
// // butterflies()
// butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5)
//
// // ----- POSTCONDITIONS AFTER LAYER 8
// // transform size 4
// // transform indexing [6, 4]
// // transforms per batch 32
// // batch indexing [0, 1, 2, 3, 5]
// // total batch size 128
//
// // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14091)
// assertranges ...
//
// // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14091)
// assertranges ...
//
// // ----- LAYER 9
//
// // butterflies()
// butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5)
// butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5)
//
// // ----- POSTCONDITIONS AFTER LAYER 9
// // transform size 4
// // transform indexing [6, 4]
// // transforms per batch 32
// // batch indexing [0, 1, 2, 3, 5]
// // total batch size 128
//
// // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26412)
// assertranges ...
//
// // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26412)
// assertranges ...
//
// // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745)
// assertranges ...
//
// // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745)
// assertranges ...
// stopbatch 128
// physical_unmap (6, 4) (0, 1, 2, 3, 5)
// stopntt 512
// ----- codegen pass 2
//
// startntt 512
// startbatch 512
// physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// assertranges ...
// vector_butterfly 0 256 1 0
// vector_butterfly 128 384 1 0
// vector_butterfly 64 320 1 0
// vector_butterfly 192 448 1 0
// vector_butterfly 32 288 1 0
// vector_butterfly 160 416 1 0
// vector_butterfly 96 352 1 0
// vector_butterfly 224 480 1 0
// vector_butterfly 16 272 1 0
// vector_butterfly 144 400 1 0
// vector_butterfly 80 336 1 0
// vector_butterfly 208 464 1 0
// vector_butterfly 48 304 1 0
// vector_butterfly 176 432 1 0
// vector_butterfly 112 368 1 0
// vector_butterfly 240 496 1 0
// assertranges ...
// assertranges ...
// vector_reduce_ifreverse 0
// vector_reduce_ifreverse 32
// vector_reduce_ifreverse 16
// vector_reduce_ifreverse 48
// vector_reduce_ifreverse 256
// vector_reduce_ifreverse 288
// vector_reduce_ifreverse 272
// vector_reduce_ifreverse 304
// vector_butterfly 0 128 1 0
// vector_butterfly 64 192 1 0
// vector_butterfly 32 160 1 0
// vector_butterfly 96 224 1 0
// vector_butterfly 16 144 1 0
// vector_butterfly 80 208 1 0
// vector_butterfly 48 176 1 0
// vector_butterfly 112 240 1 0
// vector_butterfly 256 384 4 1
// vector_butterfly 320 448 4 1
// vector_butterfly 288 416 4 1
// vector_butterfly 352 480 4 1
// vector_butterfly 272 400 4 1
// vector_butterfly 336 464 4 1
// vector_butterfly 304 432 4 1
// vector_butterfly 368 496 4 1
// assertranges ...
// assertranges ...
// assertranges ...
// assertranges ...
// vector_reduce_ifforward 64
// vector_butterfly 0 64 1 0
// vector_butterfly 128 192 4 1
// vector_butterfly 256 320 8 1
// vector_butterfly 384 448 8 7
// vector_reduce 0
// vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 320 512 509 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 448 512 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// startbatch 512
// vector_reduce_ifforward 80
// vector_butterfly 16 80 1 0
// vector_butterfly 144 208 4 1
// vector_butterfly 272 336 8 1
// vector_butterfly 400 464 8 7
// vector_reduce 16
// vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 336 512 509 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 464 512 3 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// startbatch 512
// vector_reduce_ifforward 96
// vector_butterfly 32 96 1 0
// vector_butterfly 160 224 4 1
// vector_butterfly 288 352 8 1
// vector_butterfly 416 480 8 7
// vector_reduce 32
// vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 352 512 509 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 480 512 3 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// startbatch 512
// vector_reduce_ifforward 112
// vector_butterfly 48 112 1 0
// vector_butterfly 176 240 4 1
// vector_butterfly 304 368 8 1
// vector_butterfly 432 496 8 7
// vector_reduce 48
// vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 368 512 509 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 496 512 3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// doublereps
// doublereps
// startbatch 128
// physical_unmap (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// physical_map (0, 1, 2, 6, 4, 5) (3,)
// assertranges ...
// vector_butterfly 0 32 1 0
// vector_butterfly 64 96 1 0
// stopbatch 128
// startbatch 128
// vector_butterfly 16 48 1 0
// vector_butterfly 80 112 1 0
// stopbatch 128
// startbatch 128
// assertranges ...
// assertranges ...
// vector_butterfly 0 16 1 0
// vector_butterfly 64 80 1 0
// vector_butterfly 32 48 4 1
// vector_butterfly 96 112 4 1
// vector_reduce 0
// vector_reduce 64
// vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
// vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
// vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
// vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// stopbatch 128
// startbatch 128
// physical_unmap (0, 1, 2, 6, 4, 5) (3,)
// physical_map (1, 2, 5, 6) (0, 3, 4)
// assertranges ...
// vector_butterfly 0 64 1 0
// vector_butterfly 32 96 1 0
// vector_butterfly 16 80 1 0
// vector_butterfly 48 112 1 0
// vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// stopbatch 128
// startbatch 128
// physical_unmap (1, 2, 5, 6) (0, 3, 4)
// physical_map (2, 4, 5, 6) (0, 3, 1)
// assertranges ...
// assertranges ...
// vector_butterfly 0 32 1 0
// vector_butterfly 64 96 4 1
// vector_reduce 0
// vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
// vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
// vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
// vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// stopbatch 128
// startbatch 128
// vector_butterfly 16 48 1 0
// vector_butterfly 80 112 4 1
// vector_reduce 16
// vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
// vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
// vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
// vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// stopbatch 128
// startbatch 128
// physical_unmap (2, 4, 5, 6) (0, 3, 1)
// physical_map (6, 4) (0, 1, 2, 3, 5)
// assertranges ...
// vector_butterfly 0 16 1 0
// vector_butterfly 64 80 1 0
// vector_butterfly 32 48 1 0
// vector_butterfly 96 112 1 0
// stopbatch 128
// startbatch 128
// assertranges ...
// assertranges ...
// vector_butterfly 0 64 1 0
// stopbatch 128
// startbatch 128
// vector_butterfly 16 80 4 1
// stopbatch 128
// startbatch 128
// vector_butterfly 32 96 1 0
// stopbatch 128
// startbatch 128
// vector_butterfly 48 112 4 1
// stopbatch 128
// startbatch 128
// assertranges ...
// assertranges ...
// assertranges ...
// assertranges ...
// stopbatch 128
// physical_unmap (6, 4) (0, 1, 2, 3, 5)
// stopntt 512
// startntt 512
static void ntt512(int16 *f,long long reps,const int16 *qdata)
{
// startbatch 512
for (long long r = 0;r < reps;++r) {
// physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
#define F(t,v) f[((((v)>>0)&1)<<0)+((((v)>>1)&1)<<1)+((((v)>>2)&1)<<2)+((((v)>>3)&1)<<3)+((((v)>>4)&1)<<4)+((((v)>>5)&1)<<5)+((((v)>>6)&1)<<6)+((((v)>>7)&1)<<7)+((((v)>>8)&1)<<8)]
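// F(t,v) maps the logical coefficient index v, bit by bit, to a physical
// array offset; with the identity physical_map (0,...,8) above it is simply
// f[v], and the transform-selector argument t is unused since the batch bit
// list is still empty.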
// assertranges ...
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 0;j != 512;j += 1)
assert(F(t,j) >= -5629 && F(t,j) <= 5629);
}
if (q == 10753) {
for (long long j = 0;j != 512;j += 1)
assert(F(t,j) >= -5800 && F(t,j) <= 5800);
}
}
// vector_butterfly 0 256 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f+256));
int16x16 b0 = add_x16(a0,a16);
int16x16 b16 = sub_x16(a0,a16);
// vector_butterfly 128 384 1 0
int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f+128));
int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f+384));
int16x16 b8 = add_x16(a8,a24);
int16x16 b24 = sub_x16(a8,a24);
// vector_butterfly 64 320 1 0
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f+320));
int16x16 b4 = add_x16(a4,a20);
int16x16 b20 = sub_x16(a4,a20);
// vector_butterfly 192 448 1 0
int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f+192));
int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f+448));
int16x16 b12 = add_x16(a12,a28);
int16x16 b28 = sub_x16(a12,a28);
// vector_butterfly 32 288 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f+288));
int16x16 b2 = add_x16(a2,a18);
int16x16 b18 = sub_x16(a2,a18);
// vector_butterfly 160 416 1 0
int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f+160));
int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f+416));
int16x16 b10 = add_x16(a10,a26);
int16x16 b26 = sub_x16(a10,a26);
// vector_butterfly 96 352 1 0
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f+352));
int16x16 b6 = add_x16(a6,a22);
int16x16 b22 = sub_x16(a6,a22);
// vector_butterfly 224 480 1 0
int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f+224));
int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f+480));
int16x16 b14 = add_x16(a14,a30);
int16x16 b30 = sub_x16(a14,a30);
// vector_butterfly 16 272 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f+272));
int16x16 b1 = add_x16(a1,a17);
int16x16 b17 = sub_x16(a1,a17);
// vector_butterfly 144 400 1 0
int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f+144));
int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f+400));
int16x16 b9 = add_x16(a9,a25);
int16x16 b25 = sub_x16(a9,a25);
// vector_butterfly 80 336 1 0
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f+336));
int16x16 b5 = add_x16(a5,a21);
int16x16 b21 = sub_x16(a5,a21);
// vector_butterfly 208 464 1 0
int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f+208));
int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f+464));
int16x16 b13 = add_x16(a13,a29);
int16x16 b29 = sub_x16(a13,a29);
// vector_butterfly 48 304 1 0
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f+304));
int16x16 b3 = add_x16(a3,a19);
int16x16 b19 = sub_x16(a3,a19);
// vector_butterfly 176 432 1 0
int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f+176));
int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f+432));
int16x16 b11 = add_x16(a11,a27);
int16x16 b27 = sub_x16(a11,a27);
// vector_butterfly 112 368 1 0
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f+368));
int16x16 b7 = add_x16(a7,a23);
int16x16 b23 = sub_x16(a7,a23);
// vector_butterfly 240 496 1 0
int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f+240));
int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f+496));
int16x16 b15 = add_x16(a15,a31);
int16x16 b31 = sub_x16(a15,a31);
// assertranges ...
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
_mm256_storeu_si256((int16x16 *) (f+128),b8);
_mm256_storeu_si256((int16x16 *) (f+144),b9);
_mm256_storeu_si256((int16x16 *) (f+160),b10);
_mm256_storeu_si256((int16x16 *) (f+176),b11);
_mm256_storeu_si256((int16x16 *) (f+192),b12);
_mm256_storeu_si256((int16x16 *) (f+208),b13);
_mm256_storeu_si256((int16x16 *) (f+224),b14);
_mm256_storeu_si256((int16x16 *) (f+240),b15);
_mm256_storeu_si256((int16x16 *) (f+256),b16);
_mm256_storeu_si256((int16x16 *) (f+272),b17);
_mm256_storeu_si256((int16x16 *) (f+288),b18);
_mm256_storeu_si256((int16x16 *) (f+304),b19);
_mm256_storeu_si256((int16x16 *) (f+320),b20);
_mm256_storeu_si256((int16x16 *) (f+336),b21);
_mm256_storeu_si256((int16x16 *) (f+352),b22);
_mm256_storeu_si256((int16x16 *) (f+368),b23);
_mm256_storeu_si256((int16x16 *) (f+384),b24);
_mm256_storeu_si256((int16x16 *) (f+400),b25);
_mm256_storeu_si256((int16x16 *) (f+416),b26);
_mm256_storeu_si256((int16x16 *) (f+432),b27);
_mm256_storeu_si256((int16x16 *) (f+448),b28);
_mm256_storeu_si256((int16x16 *) (f+464),b29);
_mm256_storeu_si256((int16x16 *) (f+480),b30);
_mm256_storeu_si256((int16x16 *) (f+496),b31);
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 0;j != 256;j += 1)
assert(F(t,j) >= -11258 && F(t,j) <= 11258);
}
if (q == 10753) {
for (long long j = 0;j != 256;j += 1)
assert(F(t,j) >= -11600 && F(t,j) <= 11600);
}
}
// assertranges ...
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 256;j != 512;j += 1)
assert(F(t,j) >= -11258 && F(t,j) <= 11258);
}
if (q == 10753) {
for (long long j = 256;j != 512;j += 1)
assert(F(t,j) >= -11600 && F(t,j) <= 11600);
}
}
// vector_reduce_ifreverse 0
// vector_reduce_ifreverse 32
// vector_reduce_ifreverse 16
// vector_reduce_ifreverse 48
// vector_reduce_ifreverse 256
// vector_reduce_ifreverse 288
// vector_reduce_ifreverse 272
// vector_reduce_ifreverse 304
// vector_butterfly 0 128 1 0
int16x16 c0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 c8 = _mm256_loadu_si256((int16x16 *) (f+128));
int16x16 d0 = add_x16(c0,c8);
int16x16 d8 = sub_x16(c0,c8);
// vector_butterfly 64 192 1 0
int16x16 c4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 c12 = _mm256_loadu_si256((int16x16 *) (f+192));
int16x16 d4 = add_x16(c4,c12);
int16x16 d12 = sub_x16(c4,c12);
// vector_butterfly 32 160 1 0
int16x16 c2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 c10 = _mm256_loadu_si256((int16x16 *) (f+160));
int16x16 d2 = add_x16(c2,c10);
int16x16 d10 = sub_x16(c2,c10);
// vector_butterfly 96 224 1 0
int16x16 c6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 c14 = _mm256_loadu_si256((int16x16 *) (f+224));
int16x16 d6 = add_x16(c6,c14);
int16x16 d14 = sub_x16(c6,c14);
// vector_butterfly 16 144 1 0
int16x16 c1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 c9 = _mm256_loadu_si256((int16x16 *) (f+144));
int16x16 d1 = add_x16(c1,c9);
int16x16 d9 = sub_x16(c1,c9);
// vector_butterfly 80 208 1 0
int16x16 c5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 c13 = _mm256_loadu_si256((int16x16 *) (f+208));
int16x16 d5 = add_x16(c5,c13);
int16x16 d13 = sub_x16(c5,c13);
// vector_butterfly 48 176 1 0
int16x16 c3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 c11 = _mm256_loadu_si256((int16x16 *) (f+176));
int16x16 d3 = add_x16(c3,c11);
int16x16 d11 = sub_x16(c3,c11);
// vector_butterfly 112 240 1 0
int16x16 c7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 c15 = _mm256_loadu_si256((int16x16 *) (f+240));
int16x16 d7 = add_x16(c7,c15);
int16x16 d15 = sub_x16(c7,c15);
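// The butterflies on the upper half (offsets 256..496) first multiply their
// second input by scaledzeta_x16_4_1, a scaled 4th root of unity mod q;
// mulmod_scaled_x16 presumably combines it with qinvscaledzeta_x16_4_1 in a
// Montgomery-style mullo/mulhi product so the result stays in int16 range.
// Per-lane scalar sketch, assuming zeta is that root of unity:
//   t = (zeta * x[hi]) mod q;  y[lo] = x[lo] + t;  y[hi] = x[lo] - t;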
// vector_butterfly 256 384 4 1
int16x16 c16 = _mm256_loadu_si256((int16x16 *) (f+256));
int16x16 c24 = _mm256_loadu_si256((int16x16 *) (f+384));
c24 = mulmod_scaled_x16(c24,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d16 = add_x16(c16,c24);
int16x16 d24 = sub_x16(c16,c24);
// vector_butterfly 320 448 4 1
int16x16 c20 = _mm256_loadu_si256((int16x16 *) (f+320));
int16x16 c28 = _mm256_loadu_si256((int16x16 *) (f+448));
c28 = mulmod_scaled_x16(c28,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d20 = add_x16(c20,c28);
int16x16 d28 = sub_x16(c20,c28);
// vector_butterfly 288 416 4 1
int16x16 c18 = _mm256_loadu_si256((int16x16 *) (f+288));
int16x16 c26 = _mm256_loadu_si256((int16x16 *) (f+416));
c26 = mulmod_scaled_x16(c26,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d18 = add_x16(c18,c26);
int16x16 d26 = sub_x16(c18,c26);
// vector_butterfly 352 480 4 1
int16x16 c22 = _mm256_loadu_si256((int16x16 *) (f+352));
int16x16 c30 = _mm256_loadu_si256((int16x16 *) (f+480));
c30 = mulmod_scaled_x16(c30,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d22 = add_x16(c22,c30);
int16x16 d30 = sub_x16(c22,c30);
// vector_butterfly 272 400 4 1
int16x16 c17 = _mm256_loadu_si256((int16x16 *) (f+272));
int16x16 c25 = _mm256_loadu_si256((int16x16 *) (f+400));
c25 = mulmod_scaled_x16(c25,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d17 = add_x16(c17,c25);
int16x16 d25 = sub_x16(c17,c25);
// vector_butterfly 336 464 4 1
int16x16 c21 = _mm256_loadu_si256((int16x16 *) (f+336));
int16x16 c29 = _mm256_loadu_si256((int16x16 *) (f+464));
c29 = mulmod_scaled_x16(c29,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d21 = add_x16(c21,c29);
int16x16 d29 = sub_x16(c21,c29);
// vector_butterfly 304 432 4 1
int16x16 c19 = _mm256_loadu_si256((int16x16 *) (f+304));
int16x16 c27 = _mm256_loadu_si256((int16x16 *) (f+432));
c27 = mulmod_scaled_x16(c27,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d19 = add_x16(c19,c27);
int16x16 d27 = sub_x16(c19,c27);
// vector_butterfly 368 496 4 1
int16x16 c23 = _mm256_loadu_si256((int16x16 *) (f+368));
int16x16 c31 = _mm256_loadu_si256((int16x16 *) (f+496));
c31 = mulmod_scaled_x16(c31,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 d23 = add_x16(c23,c31);
int16x16 d31 = sub_x16(c23,c31);
// assertranges ...
_mm256_storeu_si256((int16x16 *) (f+0),d0);
_mm256_storeu_si256((int16x16 *) (f+16),d1);
_mm256_storeu_si256((int16x16 *) (f+32),d2);
_mm256_storeu_si256((int16x16 *) (f+48),d3);
_mm256_storeu_si256((int16x16 *) (f+64),d4);
_mm256_storeu_si256((int16x16 *) (f+80),d5);
_mm256_storeu_si256((int16x16 *) (f+96),d6);
_mm256_storeu_si256((int16x16 *) (f+112),d7);
_mm256_storeu_si256((int16x16 *) (f+128),d8);
_mm256_storeu_si256((int16x16 *) (f+144),d9);
_mm256_storeu_si256((int16x16 *) (f+160),d10);
_mm256_storeu_si256((int16x16 *) (f+176),d11);
_mm256_storeu_si256((int16x16 *) (f+192),d12);
_mm256_storeu_si256((int16x16 *) (f+208),d13);
_mm256_storeu_si256((int16x16 *) (f+224),d14);
_mm256_storeu_si256((int16x16 *) (f+240),d15);
_mm256_storeu_si256((int16x16 *) (f+256),d16);
_mm256_storeu_si256((int16x16 *) (f+272),d17);
_mm256_storeu_si256((int16x16 *) (f+288),d18);
_mm256_storeu_si256((int16x16 *) (f+304),d19);
_mm256_storeu_si256((int16x16 *) (f+320),d20);
_mm256_storeu_si256((int16x16 *) (f+336),d21);
_mm256_storeu_si256((int16x16 *) (f+352),d22);
_mm256_storeu_si256((int16x16 *) (f+368),d23);
_mm256_storeu_si256((int16x16 *) (f+384),d24);
_mm256_storeu_si256((int16x16 *) (f+400),d25);
_mm256_storeu_si256((int16x16 *) (f+416),d26);
_mm256_storeu_si256((int16x16 *) (f+432),d27);
_mm256_storeu_si256((int16x16 *) (f+448),d28);
_mm256_storeu_si256((int16x16 *) (f+464),d29);
_mm256_storeu_si256((int16x16 *) (f+480),d30);
_mm256_storeu_si256((int16x16 *) (f+496),d31);
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 0;j != 128;j += 1)
assert(F(t,j) >= -22516 && F(t,j) <= 22516);
}
if (q == 10753) {
for (long long j = 0;j != 128;j += 1)
assert(F(t,j) >= -23200 && F(t,j) <= 23200);
}
}
// assertranges ...
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 128;j != 256;j += 1)
assert(F(t,j) >= -22516 && F(t,j) <= 22516);
}
if (q == 10753) {
for (long long j = 128;j != 256;j += 1)
assert(F(t,j) >= -23200 && F(t,j) <= 23200);
}
}
// assertranges ...
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 256;j != 384;j += 1)
assert(F(t,j) >= -15747 && F(t,j) <= 15747);
}
if (q == 10753) {
for (long long j = 256;j != 384;j += 1)
assert(F(t,j) >= -17016 && F(t,j) <= 17016);
}
}
// assertranges ...
for (long long t = 0;t < 1;++t) {
if (q == 7681) {
for (long long j = 384;j != 512;j += 1)
assert(F(t,j) >= -15747 && F(t,j) <= 15747);
}
if (q == 10753) {
for (long long j = 384;j != 512;j += 1)
assert(F(t,j) >= -17016 && F(t,j) <= 17016);
}
}
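// vector_reduce / vector_reduce_ifforward steps call reduce_x16, which
// presumably folds each lane back into a small range modulo q so the next
// butterflies and twists cannot overflow int16. The _ifforward/_ifreverse
// variants are only emitted as code in the matching transform direction,
// which is why the vector_reduce_ifreverse lines above are comments only.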
// vector_reduce_ifforward 64
int16x16 e4 = _mm256_loadu_si256((int16x16 *) (f+64));
e4 = reduce_x16(e4,qdata);
// vector_butterfly 0 64 1 0
int16x16 e0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 f0 = add_x16(e0,e4);
int16x16 f4 = sub_x16(e0,e4);
// vector_butterfly 128 192 4 1
int16x16 e8 = _mm256_loadu_si256((int16x16 *) (f+128));
int16x16 e12 = _mm256_loadu_si256((int16x16 *) (f+192));
e12 = mulmod_scaled_x16(e12,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 f8 = add_x16(e8,e12);
int16x16 f12 = sub_x16(e8,e12);
// vector_butterfly 256 320 8 1
int16x16 e16 = _mm256_loadu_si256((int16x16 *) (f+256));
int16x16 e20 = _mm256_loadu_si256((int16x16 *) (f+320));
e20 = mulmod_scaled_x16(e20,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
int16x16 f16 = add_x16(e16,e20);
int16x16 f20 = sub_x16(e16,e20);
// vector_butterfly 384 448 8 7
int16x16 e24 = _mm256_loadu_si256((int16x16 *) (f+384));
int16x16 e28 = _mm256_loadu_si256((int16x16 *) (f+448));
e28 = mulmod_scaled_x16(e28,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
int16x16 f24 = add_x16(e24,e28);
int16x16 f28 = sub_x16(e24,e28);
// vector_reduce 0
f0 = reduce_x16(f0,qdata);
// vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f4 = mulmod_scaled_x16(f4,precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f8 = mulmod_scaled_x16(f8,precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f12 = mulmod_scaled_x16(f12,precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f16 = mulmod_scaled_x16(f16,precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// vector_twist 320 512 509 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f20 = mulmod_scaled_x16(f20,precomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f24 = mulmod_scaled_x16(f24,precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// vector_twist 448 512 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
f28 = mulmod_scaled_x16(f28,precomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
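// The vector_twist steps above multiply the 16 lanes by individual powers of
// a root of unity; the precomp_<order>_<power>_<lane exponents> names appear
// to encode the root order, the base power, and the per-lane exponents, with
// the matching qinvprecomp_* tables holding the precomputed scaled values
// that mulmod_scaled_x16 needs.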
// vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 g0 = _mm256_permute2x128_si256_lo(f0,f4);
int16x16 g4 = _mm256_permute2x128_si256_hi(f0,f4);
// vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 g8 = _mm256_permute2x128_si256_lo(f8,f12);
int16x16 g12 = _mm256_permute2x128_si256_hi(f8,f12);
// vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 g16 = _mm256_permute2x128_si256_lo(f16,f20);
int16x16 g20 = _mm256_permute2x128_si256_hi(f16,f20);
// vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 g24 = _mm256_permute2x128_si256_lo(f24,f28);
int16x16 g28 = _mm256_permute2x128_si256_hi(f24,f28);
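// In the permutes above, _mm256_permute2x128_si256_lo gathers the low 128-bit
// halves of its two inputs and _mm256_permute2x128_si256_hi the high halves,
// regrouping the coefficients into the layout the later 128-wide batches
// expect to load contiguously.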
// stopbatch 512
_mm256_storeu_si256((int16x16 *) (f+0),g0);
_mm256_storeu_si256((int16x16 *) (f+64),g4);
_mm256_storeu_si256((int16x16 *) (f+128),g8);
_mm256_storeu_si256((int16x16 *) (f+192),g12);
_mm256_storeu_si256((int16x16 *) (f+256),g16);
_mm256_storeu_si256((int16x16 *) (f+320),g20);
_mm256_storeu_si256((int16x16 *) (f+384),g24);
_mm256_storeu_si256((int16x16 *) (f+448),g28);
f += 512;
}
f -= 512*reps;
// startbatch 512
for (long long r = 0;r < reps;++r) {
// vector_reduce_ifforward 80
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
a5 = reduce_x16(a5,qdata);
// vector_butterfly 16 80 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 b1 = add_x16(a1,a5);
int16x16 b5 = sub_x16(a1,a5);
// vector_butterfly 144 208 4 1
int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f+144));
int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f+208));
a13 = mulmod_scaled_x16(a13,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b9 = add_x16(a9,a13);
int16x16 b13 = sub_x16(a9,a13);
// vector_butterfly 272 336 8 1
int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f+272));
int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f+336));
a21 = mulmod_scaled_x16(a21,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
int16x16 b17 = add_x16(a17,a21);
int16x16 b21 = sub_x16(a17,a21);
// vector_butterfly 400 464 8 7
int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f+400));
int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f+464));
a29 = mulmod_scaled_x16(a29,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
int16x16 b25 = add_x16(a25,a29);
int16x16 b29 = sub_x16(a25,a29);
// vector_reduce 16
b1 = reduce_x16(b1,qdata);
// vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b5 = mulmod_scaled_x16(b5,precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b9 = mulmod_scaled_x16(b9,precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b13 = mulmod_scaled_x16(b13,precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b17 = mulmod_scaled_x16(b17,precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_twist 336 512 509 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b21 = mulmod_scaled_x16(b21,precomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b25 = mulmod_scaled_x16(b25,precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_twist 464 512 3 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b29 = mulmod_scaled_x16(b29,precomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c1 = _mm256_permute2x128_si256_lo(b1,b5);
int16x16 c5 = _mm256_permute2x128_si256_hi(b1,b5);
// vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c9 = _mm256_permute2x128_si256_lo(b9,b13);
int16x16 c13 = _mm256_permute2x128_si256_hi(b9,b13);
// vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c17 = _mm256_permute2x128_si256_lo(b17,b21);
int16x16 c21 = _mm256_permute2x128_si256_hi(b17,b21);
// vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c25 = _mm256_permute2x128_si256_lo(b25,b29);
int16x16 c29 = _mm256_permute2x128_si256_hi(b25,b29);
// stopbatch 512
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+144),c9);
_mm256_storeu_si256((int16x16 *) (f+208),c13);
_mm256_storeu_si256((int16x16 *) (f+272),c17);
_mm256_storeu_si256((int16x16 *) (f+336),c21);
_mm256_storeu_si256((int16x16 *) (f+400),c25);
_mm256_storeu_si256((int16x16 *) (f+464),c29);
f += 512;
}
f -= 512*reps;
// startbatch 512
for (long long r = 0;r < reps;++r) {
// vector_reduce_ifforward 96
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
a6 = reduce_x16(a6,qdata);
// vector_butterfly 32 96 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 b2 = add_x16(a2,a6);
int16x16 b6 = sub_x16(a2,a6);
// vector_butterfly 160 224 4 1
int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f+160));
int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f+224));
a14 = mulmod_scaled_x16(a14,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b10 = add_x16(a10,a14);
int16x16 b14 = sub_x16(a10,a14);
// vector_butterfly 288 352 8 1
int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f+288));
int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f+352));
a22 = mulmod_scaled_x16(a22,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
int16x16 b18 = add_x16(a18,a22);
int16x16 b22 = sub_x16(a18,a22);
// vector_butterfly 416 480 8 7
int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f+416));
int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f+480));
a30 = mulmod_scaled_x16(a30,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
int16x16 b26 = add_x16(a26,a30);
int16x16 b30 = sub_x16(a26,a30);
// vector_reduce 32
b2 = reduce_x16(b2,qdata);
// vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b6 = mulmod_scaled_x16(b6,precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b10 = mulmod_scaled_x16(b10,precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b14 = mulmod_scaled_x16(b14,precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b18 = mulmod_scaled_x16(b18,precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_twist 352 512 509 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b22 = mulmod_scaled_x16(b22,precomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b26 = mulmod_scaled_x16(b26,precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_twist 480 512 3 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b30 = mulmod_scaled_x16(b30,precomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c2 = _mm256_permute2x128_si256_lo(b2,b6);
int16x16 c6 = _mm256_permute2x128_si256_hi(b2,b6);
// vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c10 = _mm256_permute2x128_si256_lo(b10,b14);
int16x16 c14 = _mm256_permute2x128_si256_hi(b10,b14);
// vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c18 = _mm256_permute2x128_si256_lo(b18,b22);
int16x16 c22 = _mm256_permute2x128_si256_hi(b18,b22);
// vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c26 = _mm256_permute2x128_si256_lo(b26,b30);
int16x16 c30 = _mm256_permute2x128_si256_hi(b26,b30);
// stopbatch 512
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
_mm256_storeu_si256((int16x16 *) (f+160),c10);
_mm256_storeu_si256((int16x16 *) (f+224),c14);
_mm256_storeu_si256((int16x16 *) (f+288),c18);
_mm256_storeu_si256((int16x16 *) (f+352),c22);
_mm256_storeu_si256((int16x16 *) (f+416),c26);
_mm256_storeu_si256((int16x16 *) (f+480),c30);
f += 512;
}
f -= 512*reps;
// startbatch 512
for (long long r = 0;r < reps;++r) {
// vector_reduce_ifforward 112
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
a7 = reduce_x16(a7,qdata);
// vector_butterfly 48 112 1 0
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b3 = add_x16(a3,a7);
int16x16 b7 = sub_x16(a3,a7);
// vector_butterfly 176 240 4 1
int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f+176));
int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f+240));
a15 = mulmod_scaled_x16(a15,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b11 = add_x16(a11,a15);
int16x16 b15 = sub_x16(a11,a15);
// vector_butterfly 304 368 8 1
int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f+304));
int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f+368));
a23 = mulmod_scaled_x16(a23,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
int16x16 b19 = add_x16(a19,a23);
int16x16 b23 = sub_x16(a19,a23);
// vector_butterfly 432 496 8 7
int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f+432));
int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f+496));
a31 = mulmod_scaled_x16(a31,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
int16x16 b27 = add_x16(a27,a31);
int16x16 b31 = sub_x16(a27,a31);
// vector_reduce 48
b3 = reduce_x16(b3,qdata);
// vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b7 = mulmod_scaled_x16(b7,precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b11 = mulmod_scaled_x16(b11,precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b15 = mulmod_scaled_x16(b15,precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b19 = mulmod_scaled_x16(b19,precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_twist 368 512 509 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b23 = mulmod_scaled_x16(b23,precomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b27 = mulmod_scaled_x16(b27,precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_twist 496 512 3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b31 = mulmod_scaled_x16(b31,precomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c3 = _mm256_permute2x128_si256_lo(b3,b7);
int16x16 c7 = _mm256_permute2x128_si256_hi(b3,b7);
// vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c11 = _mm256_permute2x128_si256_lo(b11,b15);
int16x16 c15 = _mm256_permute2x128_si256_hi(b11,b15);
// vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c19 = _mm256_permute2x128_si256_lo(b19,b23);
int16x16 c23 = _mm256_permute2x128_si256_hi(b19,b23);
// vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 c27 = _mm256_permute2x128_si256_lo(b27,b31);
int16x16 c31 = _mm256_permute2x128_si256_hi(b27,b31);
// stopbatch 512
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
_mm256_storeu_si256((int16x16 *) (f+176),c11);
_mm256_storeu_si256((int16x16 *) (f+240),c15);
_mm256_storeu_si256((int16x16 *) (f+304),c19);
_mm256_storeu_si256((int16x16 *) (f+368),c23);
_mm256_storeu_si256((int16x16 *) (f+432),c27);
_mm256_storeu_si256((int16x16 *) (f+496),c31);
f += 512;
}
f -= 512*reps;
// doublereps
reps *= 2;
// doublereps
reps *= 2;
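// doublereps: the remaining layers act independently on 128-coefficient
// blocks, so each doubling of reps halves the batch size; after two doublings
// the loops below walk the data in steps of 128 instead of 512.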
// startbatch 128
for (long long r = 0;r < reps;++r) {
// physical_unmap (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
#undef F
// physical_map (0, 1, 2, 6, 4, 5) (3,)
#define F(t,v) f[((((t)>>0)&1)<<3)+((((v)>>0)&1)<<0)+((((v)>>1)&1)<<1)+((((v)>>2)&1)<<2)+((((v)>>3)&1)<<6)+((((v)>>4)&1)<<4)+((((v)>>5)&1)<<5)]
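// After the twists, different coefficients carry different accumulated
// factors, so for q = 10753 the generator records a separate proved bound per
// logical index below, while a single bound still covers every index for
// q = 7681.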
// assertranges ...
for (long long t = 0;t < 2;++t) {
if (q == 7681) {
assert(F(t,0) >= -5629 && F(t,0) <= 5629);
assert(F(t,1) >= -5629 && F(t,1) <= 5629);
assert(F(t,2) >= -5629 && F(t,2) <= 5629);
assert(F(t,3) >= -5629 && F(t,3) <= 5629);
assert(F(t,4) >= -5629 && F(t,4) <= 5629);
assert(F(t,5) >= -5629 && F(t,5) <= 5629);
assert(F(t,6) >= -5629 && F(t,6) <= 5629);
assert(F(t,7) >= -5629 && F(t,7) <= 5629);
assert(F(t,8) >= -5629 && F(t,8) <= 5629);
assert(F(t,9) >= -5629 && F(t,9) <= 5629);
assert(F(t,10) >= -5629 && F(t,10) <= 5629);
assert(F(t,11) >= -5629 && F(t,11) <= 5629);
assert(F(t,12) >= -5629 && F(t,12) <= 5629);
assert(F(t,13) >= -5629 && F(t,13) <= 5629);
assert(F(t,14) >= -5629 && F(t,14) <= 5629);
assert(F(t,15) >= -5629 && F(t,15) <= 5629);
assert(F(t,16) >= -5629 && F(t,16) <= 5629);
assert(F(t,17) >= -5629 && F(t,17) <= 5629);
assert(F(t,18) >= -5629 && F(t,18) <= 5629);
assert(F(t,19) >= -5629 && F(t,19) <= 5629);
assert(F(t,20) >= -5629 && F(t,20) <= 5629);
assert(F(t,21) >= -5629 && F(t,21) <= 5629);
assert(F(t,22) >= -5629 && F(t,22) <= 5629);
assert(F(t,23) >= -5629 && F(t,23) <= 5629);
assert(F(t,24) >= -5629 && F(t,24) <= 5629);
assert(F(t,25) >= -5629 && F(t,25) <= 5629);
assert(F(t,26) >= -5629 && F(t,26) <= 5629);
assert(F(t,27) >= -5629 && F(t,27) <= 5629);
assert(F(t,28) >= -5629 && F(t,28) <= 5629);
assert(F(t,29) >= -5629 && F(t,29) <= 5629);
assert(F(t,30) >= -5629 && F(t,30) <= 5629);
assert(F(t,31) >= -5629 && F(t,31) <= 5629);
assert(F(t,32) >= -5629 && F(t,32) <= 5629);
assert(F(t,33) >= -5629 && F(t,33) <= 5629);
assert(F(t,34) >= -5629 && F(t,34) <= 5629);
assert(F(t,35) >= -5629 && F(t,35) <= 5629);
assert(F(t,36) >= -5629 && F(t,36) <= 5629);
assert(F(t,37) >= -5629 && F(t,37) <= 5629);
assert(F(t,38) >= -5629 && F(t,38) <= 5629);
assert(F(t,39) >= -5629 && F(t,39) <= 5629);
assert(F(t,40) >= -5629 && F(t,40) <= 5629);
assert(F(t,41) >= -5629 && F(t,41) <= 5629);
assert(F(t,42) >= -5629 && F(t,42) <= 5629);
assert(F(t,43) >= -5629 && F(t,43) <= 5629);
assert(F(t,44) >= -5629 && F(t,44) <= 5629);
assert(F(t,45) >= -5629 && F(t,45) <= 5629);
assert(F(t,46) >= -5629 && F(t,46) <= 5629);
assert(F(t,47) >= -5629 && F(t,47) <= 5629);
assert(F(t,48) >= -5629 && F(t,48) <= 5629);
assert(F(t,49) >= -5629 && F(t,49) <= 5629);
assert(F(t,50) >= -5629 && F(t,50) <= 5629);
assert(F(t,51) >= -5629 && F(t,51) <= 5629);
assert(F(t,52) >= -5629 && F(t,52) <= 5629);
assert(F(t,53) >= -5629 && F(t,53) <= 5629);
assert(F(t,54) >= -5629 && F(t,54) <= 5629);
assert(F(t,55) >= -5629 && F(t,55) <= 5629);
assert(F(t,56) >= -5629 && F(t,56) <= 5629);
assert(F(t,57) >= -5629 && F(t,57) <= 5629);
assert(F(t,58) >= -5629 && F(t,58) <= 5629);
assert(F(t,59) >= -5629 && F(t,59) <= 5629);
assert(F(t,60) >= -5629 && F(t,60) <= 5629);
assert(F(t,61) >= -5629 && F(t,61) <= 5629);
assert(F(t,62) >= -5629 && F(t,62) <= 5629);
assert(F(t,63) >= -5629 && F(t,63) <= 5629);
}
if (q == 10753) {
assert(F(t,0) >= -5827 && F(t,0) <= 5827);
assert(F(t,1) >= -7613 && F(t,1) <= 7613);
assert(F(t,2) >= -7666 && F(t,2) <= 7666);
assert(F(t,3) >= -7264 && F(t,3) <= 7264);
assert(F(t,4) >= -7639 && F(t,4) <= 7639);
assert(F(t,5) >= -7591 && F(t,5) <= 7591);
assert(F(t,6) >= -7291 && F(t,6) <= 7291);
assert(F(t,7) >= -7204 && F(t,7) <= 7204);
assert(F(t,8) >= -7220 && F(t,8) <= 7220);
assert(F(t,9) >= -7712 && F(t,9) <= 7712);
assert(F(t,10) >= -7451 && F(t,10) <= 7451);
assert(F(t,11) >= -7487 && F(t,11) <= 7487);
assert(F(t,12) >= -7525 && F(t,12) <= 7525);
assert(F(t,13) >= -7058 && F(t,13) <= 7058);
assert(F(t,14) >= -7534 && F(t,14) <= 7534);
assert(F(t,15) >= -6853 && F(t,15) <= 6853);
assert(F(t,16) >= -7116 && F(t,16) <= 7116);
assert(F(t,17) >= -6713 && F(t,17) <= 6713);
assert(F(t,18) >= -7290 && F(t,18) <= 7290);
assert(F(t,19) >= -7379 && F(t,19) <= 7379);
assert(F(t,20) >= -7067 && F(t,20) <= 7067);
assert(F(t,21) >= -7475 && F(t,21) <= 7475);
assert(F(t,22) >= -7300 && F(t,22) <= 7300);
assert(F(t,23) >= -7235 && F(t,23) <= 7235);
assert(F(t,24) >= -7499 && F(t,24) <= 7499);
assert(F(t,25) >= -7573 && F(t,25) <= 7573);
assert(F(t,26) >= -7068 && F(t,26) <= 7068);
assert(F(t,27) >= -7479 && F(t,27) <= 7479);
assert(F(t,28) >= -7656 && F(t,28) <= 7656);
assert(F(t,29) >= -7279 && F(t,29) <= 7279);
assert(F(t,30) >= -7162 && F(t,30) <= 7162);
assert(F(t,31) >= -7163 && F(t,31) <= 7163);
assert(F(t,32) >= -7208 && F(t,32) <= 7208);
assert(F(t,33) >= -7108 && F(t,33) <= 7108);
assert(F(t,34) >= -7189 && F(t,34) <= 7189);
assert(F(t,35) >= -7613 && F(t,35) <= 7613);
assert(F(t,36) >= -7114 && F(t,36) <= 7114);
assert(F(t,37) >= -7691 && F(t,37) <= 7691);
assert(F(t,38) >= -7350 && F(t,38) <= 7350);
assert(F(t,39) >= -7198 && F(t,39) <= 7198);
assert(F(t,40) >= -6836 && F(t,40) <= 6836);
assert(F(t,41) >= -7446 && F(t,41) <= 7446);
assert(F(t,42) >= -7450 && F(t,42) <= 7450);
assert(F(t,43) >= -7419 && F(t,43) <= 7419);
assert(F(t,44) >= -7277 && F(t,44) <= 7277);
assert(F(t,45) >= -6861 && F(t,45) <= 6861);
assert(F(t,46) >= -7208 && F(t,46) <= 7208);
assert(F(t,47) >= -7081 && F(t,47) <= 7081);
assert(F(t,48) >= -7230 && F(t,48) <= 7230);
assert(F(t,49) >= -7001 && F(t,49) <= 7001);
assert(F(t,50) >= -7683 && F(t,50) <= 7683);
assert(F(t,51) >= -7553 && F(t,51) <= 7553);
assert(F(t,52) >= -7387 && F(t,52) <= 7387);
assert(F(t,53) >= -7631 && F(t,53) <= 7631);
assert(F(t,54) >= -7508 && F(t,54) <= 7508);
assert(F(t,55) >= -7179 && F(t,55) <= 7179);
assert(F(t,56) >= -7244 && F(t,56) <= 7244);
assert(F(t,57) >= -6317 && F(t,57) <= 6317);
assert(F(t,58) >= -7073 && F(t,58) <= 7073);
assert(F(t,59) >= -7080 && F(t,59) <= 7080);
assert(F(t,60) >= -7167 && F(t,60) <= 7167);
assert(F(t,61) >= -7314 && F(t,61) <= 7314);
assert(F(t,62) >= -7617 && F(t,62) <= 7617);
assert(F(t,63) >= -6601 && F(t,63) <= 6601);
}
}
// vector_butterfly 0 32 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 b0 = add_x16(a0,a2);
int16x16 b2 = sub_x16(a0,a2);
// vector_butterfly 64 96 1 0
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b4 = add_x16(a4,a6);
int16x16 b6 = sub_x16(a4,a6);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// vector_butterfly 16 48 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b1 = add_x16(a1,a3);
int16x16 b3 = sub_x16(a1,a3);
// vector_butterfly 80 112 1 0
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b5 = add_x16(a5,a7);
int16x16 b7 = sub_x16(a5,a7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// assertranges ...
for (long long t = 0;t < 2;++t) {
if (q == 7681) {
assert(F(t,0) >= -11258 && F(t,0) <= 11258);
assert(F(t,1) >= -11258 && F(t,1) <= 11258);
assert(F(t,2) >= -11258 && F(t,2) <= 11258);
assert(F(t,3) >= -11258 && F(t,3) <= 11258);
assert(F(t,4) >= -11258 && F(t,4) <= 11258);
assert(F(t,5) >= -11258 && F(t,5) <= 11258);
assert(F(t,6) >= -11258 && F(t,6) <= 11258);
assert(F(t,7) >= -11258 && F(t,7) <= 11258);
assert(F(t,8) >= -11258 && F(t,8) <= 11258);
assert(F(t,9) >= -11258 && F(t,9) <= 11258);
assert(F(t,10) >= -11258 && F(t,10) <= 11258);
assert(F(t,11) >= -11258 && F(t,11) <= 11258);
assert(F(t,12) >= -11258 && F(t,12) <= 11258);
assert(F(t,13) >= -11258 && F(t,13) <= 11258);
assert(F(t,14) >= -11258 && F(t,14) <= 11258);
assert(F(t,15) >= -11258 && F(t,15) <= 11258);
assert(F(t,16) >= -11258 && F(t,16) <= 11258);
assert(F(t,17) >= -11258 && F(t,17) <= 11258);
assert(F(t,18) >= -11258 && F(t,18) <= 11258);
assert(F(t,19) >= -11258 && F(t,19) <= 11258);
assert(F(t,20) >= -11258 && F(t,20) <= 11258);
assert(F(t,21) >= -11258 && F(t,21) <= 11258);
assert(F(t,22) >= -11258 && F(t,22) <= 11258);
assert(F(t,23) >= -11258 && F(t,23) <= 11258);
assert(F(t,24) >= -11258 && F(t,24) <= 11258);
assert(F(t,25) >= -11258 && F(t,25) <= 11258);
assert(F(t,26) >= -11258 && F(t,26) <= 11258);
assert(F(t,27) >= -11258 && F(t,27) <= 11258);
assert(F(t,28) >= -11258 && F(t,28) <= 11258);
assert(F(t,29) >= -11258 && F(t,29) <= 11258);
assert(F(t,30) >= -11258 && F(t,30) <= 11258);
assert(F(t,31) >= -11258 && F(t,31) <= 11258);
}
if (q == 10753) {
assert(F(t,0) >= -13035 && F(t,0) <= 13035);
assert(F(t,1) >= -14721 && F(t,1) <= 14721);
assert(F(t,2) >= -14855 && F(t,2) <= 14855);
assert(F(t,3) >= -14877 && F(t,3) <= 14877);
assert(F(t,4) >= -14753 && F(t,4) <= 14753);
assert(F(t,5) >= -15282 && F(t,5) <= 15282);
assert(F(t,6) >= -14641 && F(t,6) <= 14641);
assert(F(t,7) >= -14402 && F(t,7) <= 14402);
assert(F(t,8) >= -14056 && F(t,8) <= 14056);
assert(F(t,9) >= -15158 && F(t,9) <= 15158);
assert(F(t,10) >= -14901 && F(t,10) <= 14901);
assert(F(t,11) >= -14906 && F(t,11) <= 14906);
assert(F(t,12) >= -14802 && F(t,12) <= 14802);
assert(F(t,13) >= -13919 && F(t,13) <= 13919);
assert(F(t,14) >= -14742 && F(t,14) <= 14742);
assert(F(t,15) >= -13934 && F(t,15) <= 13934);
assert(F(t,16) >= -14346 && F(t,16) <= 14346);
assert(F(t,17) >= -13714 && F(t,17) <= 13714);
assert(F(t,18) >= -14973 && F(t,18) <= 14973);
assert(F(t,19) >= -14932 && F(t,19) <= 14932);
assert(F(t,20) >= -14454 && F(t,20) <= 14454);
assert(F(t,21) >= -15106 && F(t,21) <= 15106);
assert(F(t,22) >= -14808 && F(t,22) <= 14808);
assert(F(t,23) >= -14414 && F(t,23) <= 14414);
assert(F(t,24) >= -14743 && F(t,24) <= 14743);
assert(F(t,25) >= -13890 && F(t,25) <= 13890);
assert(F(t,26) >= -14141 && F(t,26) <= 14141);
assert(F(t,27) >= -14559 && F(t,27) <= 14559);
assert(F(t,28) >= -14823 && F(t,28) <= 14823);
assert(F(t,29) >= -14593 && F(t,29) <= 14593);
assert(F(t,30) >= -14779 && F(t,30) <= 14779);
assert(F(t,31) >= -13764 && F(t,31) <= 13764);
}
}
// assertranges ...
for (long long t = 0;t < 2;++t) {
if (q == 7681) {
assert(F(t,32) >= -11258 && F(t,32) <= 11258);
assert(F(t,33) >= -11258 && F(t,33) <= 11258);
assert(F(t,34) >= -11258 && F(t,34) <= 11258);
assert(F(t,35) >= -11258 && F(t,35) <= 11258);
assert(F(t,36) >= -11258 && F(t,36) <= 11258);
assert(F(t,37) >= -11258 && F(t,37) <= 11258);
assert(F(t,38) >= -11258 && F(t,38) <= 11258);
assert(F(t,39) >= -11258 && F(t,39) <= 11258);
assert(F(t,40) >= -11258 && F(t,40) <= 11258);
assert(F(t,41) >= -11258 && F(t,41) <= 11258);
assert(F(t,42) >= -11258 && F(t,42) <= 11258);
assert(F(t,43) >= -11258 && F(t,43) <= 11258);
assert(F(t,44) >= -11258 && F(t,44) <= 11258);
assert(F(t,45) >= -11258 && F(t,45) <= 11258);
assert(F(t,46) >= -11258 && F(t,46) <= 11258);
assert(F(t,47) >= -11258 && F(t,47) <= 11258);
assert(F(t,48) >= -11258 && F(t,48) <= 11258);
assert(F(t,49) >= -11258 && F(t,49) <= 11258);
assert(F(t,50) >= -11258 && F(t,50) <= 11258);
assert(F(t,51) >= -11258 && F(t,51) <= 11258);
assert(F(t,52) >= -11258 && F(t,52) <= 11258);
assert(F(t,53) >= -11258 && F(t,53) <= 11258);
assert(F(t,54) >= -11258 && F(t,54) <= 11258);
assert(F(t,55) >= -11258 && F(t,55) <= 11258);
assert(F(t,56) >= -11258 && F(t,56) <= 11258);
assert(F(t,57) >= -11258 && F(t,57) <= 11258);
assert(F(t,58) >= -11258 && F(t,58) <= 11258);
assert(F(t,59) >= -11258 && F(t,59) <= 11258);
assert(F(t,60) >= -11258 && F(t,60) <= 11258);
assert(F(t,61) >= -11258 && F(t,61) <= 11258);
assert(F(t,62) >= -11258 && F(t,62) <= 11258);
assert(F(t,63) >= -11258 && F(t,63) <= 11258);
}
if (q == 10753) {
assert(F(t,32) >= -13035 && F(t,32) <= 13035);
assert(F(t,33) >= -14721 && F(t,33) <= 14721);
assert(F(t,34) >= -14855 && F(t,34) <= 14855);
assert(F(t,35) >= -14877 && F(t,35) <= 14877);
assert(F(t,36) >= -14753 && F(t,36) <= 14753);
assert(F(t,37) >= -15282 && F(t,37) <= 15282);
assert(F(t,38) >= -14641 && F(t,38) <= 14641);
assert(F(t,39) >= -14402 && F(t,39) <= 14402);
assert(F(t,40) >= -14056 && F(t,40) <= 14056);
assert(F(t,41) >= -15158 && F(t,41) <= 15158);
assert(F(t,42) >= -14901 && F(t,42) <= 14901);
assert(F(t,43) >= -14906 && F(t,43) <= 14906);
assert(F(t,44) >= -14802 && F(t,44) <= 14802);
assert(F(t,45) >= -13919 && F(t,45) <= 13919);
assert(F(t,46) >= -14742 && F(t,46) <= 14742);
assert(F(t,47) >= -13934 && F(t,47) <= 13934);
assert(F(t,48) >= -14346 && F(t,48) <= 14346);
assert(F(t,49) >= -13714 && F(t,49) <= 13714);
assert(F(t,50) >= -14973 && F(t,50) <= 14973);
assert(F(t,51) >= -14932 && F(t,51) <= 14932);
assert(F(t,52) >= -14454 && F(t,52) <= 14454);
assert(F(t,53) >= -15106 && F(t,53) <= 15106);
assert(F(t,54) >= -14808 && F(t,54) <= 14808);
assert(F(t,55) >= -14414 && F(t,55) <= 14414);
assert(F(t,56) >= -14743 && F(t,56) <= 14743);
assert(F(t,57) >= -13890 && F(t,57) <= 13890);
assert(F(t,58) >= -14141 && F(t,58) <= 14141);
assert(F(t,59) >= -14559 && F(t,59) <= 14559);
assert(F(t,60) >= -14823 && F(t,60) <= 14823);
assert(F(t,61) >= -14593 && F(t,61) <= 14593);
assert(F(t,62) >= -14779 && F(t,62) <= 14779);
assert(F(t,63) >= -13764 && F(t,63) <= 13764);
}
}
// vector_butterfly 0 16 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 b0 = add_x16(a0,a1);
int16x16 b1 = sub_x16(a0,a1);
// vector_butterfly 64 80 1 0
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b4 = add_x16(a4,a5);
int16x16 b5 = sub_x16(a4,a5);
// vector_butterfly 32 48 4 1
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
a3 = mulmod_scaled_x16(a3,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b2 = add_x16(a2,a3);
int16x16 b3 = sub_x16(a2,a3);
// vector_butterfly 96 112 4 1
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
a7 = mulmod_scaled_x16(a7,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b6 = add_x16(a6,a7);
int16x16 b7 = sub_x16(a6,a7);
// vector_reduce 0
b0 = reduce_x16(b0,qdata);
// vector_reduce 64
b4 = reduce_x16(b4,qdata);
// vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
b1 = mulmod_scaled_x16(b1,precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata);
// vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
b5 = mulmod_scaled_x16(b5,precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata);
// vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
b2 = mulmod_scaled_x16(b2,precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata);
// vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
b6 = mulmod_scaled_x16(b6,precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata);
// vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
b3 = mulmod_scaled_x16(b3,precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata);
// vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
b7 = mulmod_scaled_x16(b7,precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata);
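// The unpack permutes below interleave 16-bit elements of the twisted vectors
// (_mm256_unpacklo_epi16 takes the low halves of each 128-bit lane,
// _mm256_unpackhi_epi16 the high halves); later passes use the epi32 and
// epi64 variants to complete the in-register reordering.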
// vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 c0 = _mm256_unpacklo_epi16(b0,b2);
int16x16 c2 = _mm256_unpackhi_epi16(b0,b2);
// vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 c1 = _mm256_unpacklo_epi16(b1,b3);
int16x16 c3 = _mm256_unpackhi_epi16(b1,b3);
// vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 c4 = _mm256_unpacklo_epi16(b4,b6);
int16x16 c6 = _mm256_unpackhi_epi16(b4,b6);
// vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 c5 = _mm256_unpacklo_epi16(b5,b7);
int16x16 c7 = _mm256_unpackhi_epi16(b5,b7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),c0);
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+64),c4);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// physical_unmap (0, 1, 2, 6, 4, 5) (3,)
#undef F
// physical_map (1, 2, 5, 6) (0, 3, 4)
#define F(t,v) f[((((t)>>0)&1)<<0)+((((t)>>1)&1)<<3)+((((t)>>2)&1)<<4)+((((v)>>0)&1)<<1)+((((v)>>1)&1)<<2)+((((v)>>2)&1)<<5)+((((v)>>3)&1)<<6)]
// assertranges ...
for (long long t = 0;t < 8;++t) {
if (q == 7681) {
assert(F(t,0) >= -5629 && F(t,0) <= 5629);
assert(F(t,1) >= -5629 && F(t,1) <= 5629);
assert(F(t,2) >= -5629 && F(t,2) <= 5629);
assert(F(t,3) >= -5629 && F(t,3) <= 5629);
assert(F(t,4) >= -5629 && F(t,4) <= 5629);
assert(F(t,5) >= -5629 && F(t,5) <= 5629);
assert(F(t,6) >= -5629 && F(t,6) <= 5629);
assert(F(t,7) >= -5629 && F(t,7) <= 5629);
assert(F(t,8) >= -5629 && F(t,8) <= 5629);
assert(F(t,9) >= -5629 && F(t,9) <= 5629);
assert(F(t,10) >= -5629 && F(t,10) <= 5629);
assert(F(t,11) >= -5629 && F(t,11) <= 5629);
assert(F(t,12) >= -5629 && F(t,12) <= 5629);
assert(F(t,13) >= -5629 && F(t,13) <= 5629);
assert(F(t,14) >= -5629 && F(t,14) <= 5629);
assert(F(t,15) >= -5629 && F(t,15) <= 5629);
}
if (q == 10753) {
assert(F(t,0) >= -5802 && F(t,0) <= 5802);
assert(F(t,1) >= -6967 && F(t,1) <= 6967);
assert(F(t,2) >= -6418 && F(t,2) <= 6418);
assert(F(t,3) >= -7585 && F(t,3) <= 7585);
assert(F(t,4) >= -7020 && F(t,4) <= 7020);
assert(F(t,5) >= -6328 && F(t,5) <= 6328);
assert(F(t,6) >= -7033 && F(t,6) <= 7033);
assert(F(t,7) >= -6954 && F(t,7) <= 6954);
assert(F(t,8) >= -6622 && F(t,8) <= 6622);
assert(F(t,9) >= -7054 && F(t,9) <= 7054);
assert(F(t,10) >= -6070 && F(t,10) <= 6070);
assert(F(t,11) >= -6725 && F(t,11) <= 6725);
assert(F(t,12) >= -7270 && F(t,12) <= 7270);
assert(F(t,13) >= -7353 && F(t,13) <= 7353);
assert(F(t,14) >= -6541 && F(t,14) <= 6541);
assert(F(t,15) >= -6586 && F(t,15) <= 6586);
}
}
// vector_butterfly 0 64 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 b0 = add_x16(a0,a4);
int16x16 b4 = sub_x16(a0,a4);
// vector_butterfly 32 96 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b2 = add_x16(a2,a6);
int16x16 b6 = sub_x16(a2,a6);
// vector_butterfly 16 80 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b1 = add_x16(a1,a5);
int16x16 b5 = sub_x16(a1,a5);
// vector_butterfly 48 112 1 0
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b3 = add_x16(a3,a7);
int16x16 b7 = sub_x16(a3,a7);
// vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 c0 = _mm256_unpacklo_epi32(b0,b1);
int16x16 c1 = _mm256_unpackhi_epi32(b0,b1);
// vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 c2 = _mm256_unpacklo_epi32(b2,b3);
int16x16 c3 = _mm256_unpackhi_epi32(b2,b3);
// vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 c4 = _mm256_unpacklo_epi32(b4,b5);
int16x16 c5 = _mm256_unpackhi_epi32(b4,b5);
// vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 c6 = _mm256_unpacklo_epi32(b6,b7);
int16x16 c7 = _mm256_unpackhi_epi32(b6,b7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),c0);
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+64),c4);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// physical_unmap (1, 2, 5, 6) (0, 3, 4)
#undef F
// physical_map (2, 4, 5, 6) (0, 3, 1)
#define F(t,v) f[((((t)>>0)&1)<<0)+((((t)>>1)&1)<<3)+((((t)>>2)&1)<<1)+((((v)>>0)&1)<<2)+((((v)>>1)&1)<<4)+((((v)>>2)&1)<<5)+((((v)>>3)&1)<<6)]
// assertranges ...
for (long long t = 0;t < 8;++t) {
if (q == 7681) {
assert(F(t,0) >= -11258 && F(t,0) <= 11258);
assert(F(t,1) >= -11258 && F(t,1) <= 11258);
assert(F(t,2) >= -11258 && F(t,2) <= 11258);
assert(F(t,3) >= -11258 && F(t,3) <= 11258);
assert(F(t,4) >= -11258 && F(t,4) <= 11258);
assert(F(t,5) >= -11258 && F(t,5) <= 11258);
assert(F(t,6) >= -11258 && F(t,6) <= 11258);
assert(F(t,7) >= -11258 && F(t,7) <= 11258);
}
if (q == 10753) {
assert(F(t,0) >= -12424 && F(t,0) <= 12424);
assert(F(t,1) >= -14021 && F(t,1) <= 14021);
assert(F(t,2) >= -12488 && F(t,2) <= 12488);
assert(F(t,3) >= -14310 && F(t,3) <= 14310);
assert(F(t,4) >= -14290 && F(t,4) <= 14290);
assert(F(t,5) >= -13681 && F(t,5) <= 13681);
assert(F(t,6) >= -13574 && F(t,6) <= 13574);
assert(F(t,7) >= -13540 && F(t,7) <= 13540);
}
}
// assertranges ...
for (long long t = 0;t < 8;++t) {
if (q == 7681) {
assert(F(t,8) >= -11258 && F(t,8) <= 11258);
assert(F(t,9) >= -11258 && F(t,9) <= 11258);
assert(F(t,10) >= -11258 && F(t,10) <= 11258);
assert(F(t,11) >= -11258 && F(t,11) <= 11258);
assert(F(t,12) >= -11258 && F(t,12) <= 11258);
assert(F(t,13) >= -11258 && F(t,13) <= 11258);
assert(F(t,14) >= -11258 && F(t,14) <= 11258);
assert(F(t,15) >= -11258 && F(t,15) <= 11258);
}
if (q == 10753) {
assert(F(t,8) >= -12424 && F(t,8) <= 12424);
assert(F(t,9) >= -14021 && F(t,9) <= 14021);
assert(F(t,10) >= -12488 && F(t,10) <= 12488);
assert(F(t,11) >= -14310 && F(t,11) <= 14310);
assert(F(t,12) >= -14290 && F(t,12) <= 14290);
assert(F(t,13) >= -13681 && F(t,13) <= 13681);
assert(F(t,14) >= -13574 && F(t,14) <= 13574);
assert(F(t,15) >= -13540 && F(t,15) <= 13540);
}
}
// vector_butterfly 0 32 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 b0 = add_x16(a0,a2);
int16x16 b2 = sub_x16(a0,a2);
// vector_butterfly 64 96 4 1
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
a6 = mulmod_scaled_x16(a6,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b4 = add_x16(a4,a6);
int16x16 b6 = sub_x16(a4,a6);
// vector_reduce 0
b0 = reduce_x16(b0,qdata);
// vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
b2 = mulmod_scaled_x16(b2,precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata);
// vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
b4 = mulmod_scaled_x16(b4,precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata);
// vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
b6 = mulmod_scaled_x16(b6,precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata);
// vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 c0 = _mm256_unpacklo_epi64(b0,b4);
int16x16 c4 = _mm256_unpackhi_epi64(b0,b4);
// vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 c2 = _mm256_unpacklo_epi64(b2,b6);
int16x16 c6 = _mm256_unpackhi_epi64(b2,b6);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),c0);
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+64),c4);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// vector_butterfly 16 48 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b1 = add_x16(a1,a3);
int16x16 b3 = sub_x16(a1,a3);
// vector_butterfly 80 112 4 1
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
a7 = mulmod_scaled_x16(a7,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b5 = add_x16(a5,a7);
int16x16 b7 = sub_x16(a5,a7);
// vector_reduce 16
b1 = reduce_x16(b1,qdata);
// vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
b3 = mulmod_scaled_x16(b3,precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata);
// vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
b5 = mulmod_scaled_x16(b5,precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata);
// vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
b7 = mulmod_scaled_x16(b7,precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata);
// vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 c1 = _mm256_unpacklo_epi64(b1,b5);
int16x16 c5 = _mm256_unpackhi_epi64(b1,b5);
// vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 c3 = _mm256_unpacklo_epi64(b3,b7);
int16x16 c7 = _mm256_unpackhi_epi64(b3,b7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// physical_unmap (2, 4, 5, 6) (0, 3, 1)
#undef F
// physical_map (6, 4) (0, 1, 2, 3, 5)
#define F(t,v) f[((((t)>>0)&1)<<0)+((((t)>>1)&1)<<1)+((((t)>>2)&1)<<2)+((((t)>>3)&1)<<3)+((((t)>>4)&1)<<5)+((((v)>>0)&1)<<6)+((((v)>>1)&1)<<4)]
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,0) >= -5629 && F(t,0) <= 5629);
assert(F(t,1) >= -5629 && F(t,1) <= 5629);
assert(F(t,2) >= -5629 && F(t,2) <= 5629);
assert(F(t,3) >= -5629 && F(t,3) <= 5629);
}
if (q == 10753) {
assert(F(t,0) >= -5800 && F(t,0) <= 5800);
assert(F(t,1) >= -6935 && F(t,1) <= 6935);
assert(F(t,2) >= -6521 && F(t,2) <= 6521);
assert(F(t,3) >= -7156 && F(t,3) <= 7156);
}
}
// vector_butterfly 0 16 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 b0 = add_x16(a0,a1);
int16x16 b1 = sub_x16(a0,a1);
// vector_butterfly 64 80 1 0
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b4 = add_x16(a4,a5);
int16x16 b5 = sub_x16(a4,a5);
// vector_butterfly 32 48 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b2 = add_x16(a2,a3);
int16x16 b3 = sub_x16(a2,a3);
// vector_butterfly 96 112 1 0
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b6 = add_x16(a6,a7);
int16x16 b7 = sub_x16(a6,a7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,0) >= -11258 && F(t,0) <= 11258);
assert(F(t,1) >= -11258 && F(t,1) <= 11258);
}
if (q == 10753) {
assert(F(t,0) >= -12321 && F(t,0) <= 12321);
assert(F(t,1) >= -14091 && F(t,1) <= 14091);
}
}
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,2) >= -11258 && F(t,2) <= 11258);
assert(F(t,3) >= -11258 && F(t,3) <= 11258);
}
if (q == 10753) {
assert(F(t,2) >= -12321 && F(t,2) <= 12321);
assert(F(t,3) >= -14091 && F(t,3) <= 14091);
}
}
// vector_butterfly 0 64 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 b0 = add_x16(a0,a4);
int16x16 b4 = sub_x16(a0,a4);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// vector_butterfly 16 80 4 1
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
a5 = mulmod_scaled_x16(a5,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b1 = add_x16(a1,a5);
int16x16 b5 = sub_x16(a1,a5);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// vector_butterfly 32 96 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b2 = add_x16(a2,a6);
int16x16 b6 = sub_x16(a2,a6);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// vector_butterfly 48 112 4 1
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
a7 = mulmod_scaled_x16(a7,scaledzeta_x16_4_1,qinvscaledzeta_x16_4_1,qdata);
int16x16 b3 = add_x16(a3,a7);
int16x16 b7 = sub_x16(a3,a7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
f += 128;
}
f -= 128*reps;
// startbatch 128
for (long long r = 0;r < reps;++r) {
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,0) >= -22516 && F(t,0) <= 22516);
}
if (q == 10753) {
assert(F(t,0) >= -26412 && F(t,0) <= 26412);
}
}
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,1) >= -22516 && F(t,1) <= 22516);
}
if (q == 10753) {
assert(F(t,1) >= -26412 && F(t,1) <= 26412);
}
}
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,2) >= -15747 && F(t,2) <= 15747);
}
if (q == 10753) {
assert(F(t,2) >= -17745 && F(t,2) <= 17745);
}
}
// assertranges ...
for (long long t = 0;t < 32;++t) {
if (q == 7681) {
assert(F(t,3) >= -15747 && F(t,3) <= 15747);
}
if (q == 10753) {
assert(F(t,3) >= -17745 && F(t,3) <= 17745);
}
}
// stopbatch 128
f += 128;
}
f -= 128*reps;
// physical_unmap (6, 4) (0, 1, 2, 3, 5)
#undef F
// stopntt 512
}
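// Exported forward entry points: both call the same generated kernel and
// differ only in which modulus-specific constant table is passed as qdata.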
void ntt_ops_512_7681(int16 *f,long long reps)
{
ntt512(f,reps,qdata_7681);
}
void ntt_ops_512_10753(int16 *f,long long reps)
{
ntt512(f,reps,qdata_10753);
}
// inv stopntt 512
static void invntt512(int16 *f,long long reps,const int16 *qdata)
{
reps *= 4;
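// The inverse transform starts on 128-coefficient batches (4 per 512-point
// transform), so the batch count is quadrupled here; it is halved twice at the
// "inv doublereps" steps below before the 512-coefficient passes.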
// inv physical_unmap (6, 4) (0, 1, 2, 3, 5)
#define F(t,v) f[((((t)>>0)&1)<<0)+((((t)>>1)&1)<<1)+((((t)>>2)&1)<<2)+((((t)>>3)&1)<<3)+((((t)>>4)&1)<<5)+((((v)>>0)&1)<<6)+((((v)>>1)&1)<<4)]
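// F(t,v) indexes f through a fixed bit permutation of t and v; it is used by
// the assertranges checks and is redefined whenever a physical_map/unmap step
// changes the data layout.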
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// assertranges ...
// assertranges ...
// assertranges ...
// assertranges ...
// inv startbatch 128
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 48 112 4 1
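// Inverting a forward butterfly that applied zeta_4^1 uses the reverse order:
// sum and difference first, then the difference is multiplied by zeta_4^3,
// i.e. zeta_4^{-1}.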
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b3 = add_x16(a3,a7);
int16x16 b7 = sub_x16(a3,a7);
b7 = mulmod_scaled_x16(b7,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 32 96 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b2 = add_x16(a2,a6);
int16x16 b6 = sub_x16(a2,a6);
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 16 80 4 1
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b1 = add_x16(a1,a5);
int16x16 b5 = sub_x16(a1,a5);
b5 = mulmod_scaled_x16(b5,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 0 64 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 b0 = add_x16(a0,a4);
int16x16 b4 = sub_x16(a0,a4);
// assertranges ...
// assertranges ...
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 96 112 1 0
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b6 = add_x16(a6,a7);
int16x16 b7 = sub_x16(a6,a7);
// inv vector_butterfly 32 48 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b2 = add_x16(a2,a3);
int16x16 b3 = sub_x16(a2,a3);
// inv vector_butterfly 64 80 1 0
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b4 = add_x16(a4,a5);
int16x16 b5 = sub_x16(a4,a5);
// inv vector_butterfly 0 16 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 b0 = add_x16(a0,a1);
int16x16 b1 = sub_x16(a0,a1);
// assertranges ...
// inv physical_map (6, 4) (0, 1, 2, 3, 5)
#undef F
// inv physical_unmap (2, 4, 5, 6) (0, 3, 1)
#define F(t,v) f[((((t)>>0)&1)<<0)+((((t)>>1)&1)<<3)+((((t)>>2)&1)<<1)+((((v)>>0)&1)<<2)+((((v)>>1)&1)<<4)+((((v)>>2)&1)<<5)+((((v)>>3)&1)<<6)]
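// The physical_map/unmap markers appear to record the index relabeling
// realized by the surrounding permutes; F is redefined here to match the new
// layout.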
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
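// The inv vector_permute steps undo the lane shuffles of the forward transform
// with unpacklo/unpackhi pairs; one round suffices for 64-bit lanes, while the
// 32-bit and 16-bit variants further below need two and three rounds.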
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b3 = _mm256_unpacklo_epi64(a3,a7);
int16x16 b7 = _mm256_unpackhi_epi64(a3,a7);
// inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b1 = _mm256_unpacklo_epi64(a1,a5);
int16x16 b5 = _mm256_unpackhi_epi64(a1,a5);
// inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
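// Inverse twists multiply by the opposite power: undoing a forward twist by
// zeta_16^15 uses the precomp table for zeta_16^1. The precomp_N_k_... vectors
// presumably hold scaled per-lane powers of a primitive N-th root of unity,
// with qinvprecomp_* as the matching companions consumed by mulmod_scaled_x16.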
b7 = mulmod_scaled_x16(b7,precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata);
// inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
b5 = mulmod_scaled_x16(b5,precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata);
// inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
b3 = mulmod_scaled_x16(b3,precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3,qdata);
// inv vector_reduce 16
b1 = reduce_x16(b1,qdata);
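// Lanes that are not twisted (here f+16) are reduced instead, presumably to
// keep all vectors in a comparable range before the following butterflies.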
// inv vector_butterfly 80 112 4 1
int16x16 c5 = add_x16(b5,b7);
int16x16 c7 = sub_x16(b5,b7);
c7 = mulmod_scaled_x16(c7,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 16 48 1 0
int16x16 c1 = add_x16(b1,b3);
int16x16 c3 = sub_x16(b1,b3);
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b2 = _mm256_unpacklo_epi64(a2,a6);
int16x16 b6 = _mm256_unpackhi_epi64(a2,a6);
// inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 b0 = _mm256_unpacklo_epi64(a0,a4);
int16x16 b4 = _mm256_unpackhi_epi64(a0,a4);
// inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
b6 = mulmod_scaled_x16(b6,precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata);
// inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
b4 = mulmod_scaled_x16(b4,precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata);
// inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
b2 = mulmod_scaled_x16(b2,precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1,qdata);
// inv vector_reduce 0
b0 = reduce_x16(b0,qdata);
// inv vector_butterfly 64 96 4 1
int16x16 c4 = add_x16(b4,b6);
int16x16 c6 = sub_x16(b4,b6);
c6 = mulmod_scaled_x16(c6,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 0 32 1 0
int16x16 c0 = add_x16(b0,b2);
int16x16 c2 = sub_x16(b0,b2);
// assertranges ...
// assertranges ...
// inv physical_map (2, 4, 5, 6) (0, 3, 1)
#undef F
// inv physical_unmap (1, 2, 5, 6) (0, 3, 4)
#define F(t,v) f[((((t)>>0)&1)<<0)+((((t)>>1)&1)<<3)+((((t)>>2)&1)<<4)+((((v)>>0)&1)<<1)+((((v)>>1)&1)<<2)+((((v)>>2)&1)<<5)+((((v)>>3)&1)<<6)]
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),c0);
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+64),c4);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b6 = _mm256_unpacklo_epi32(a6,a7);
int16x16 b7 = _mm256_unpackhi_epi32(a6,a7);
int16x16 c6 = _mm256_unpacklo_epi32(b6,b7);
int16x16 c7 = _mm256_unpackhi_epi32(b6,b7);
// inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b4 = _mm256_unpacklo_epi32(a4,a5);
int16x16 b5 = _mm256_unpackhi_epi32(a4,a5);
int16x16 c4 = _mm256_unpacklo_epi32(b4,b5);
int16x16 c5 = _mm256_unpackhi_epi32(b4,b5);
// inv vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b2 = _mm256_unpacklo_epi32(a2,a3);
int16x16 b3 = _mm256_unpackhi_epi32(a2,a3);
int16x16 c2 = _mm256_unpacklo_epi32(b2,b3);
int16x16 c3 = _mm256_unpackhi_epi32(b2,b3);
// inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 b0 = _mm256_unpacklo_epi32(a0,a1);
int16x16 b1 = _mm256_unpackhi_epi32(a0,a1);
int16x16 c0 = _mm256_unpacklo_epi32(b0,b1);
int16x16 c1 = _mm256_unpackhi_epi32(b0,b1);
// inv vector_butterfly 48 112 1 0
int16x16 d3 = add_x16(c3,c7);
int16x16 d7 = sub_x16(c3,c7);
// inv vector_butterfly 16 80 1 0
int16x16 d1 = add_x16(c1,c5);
int16x16 d5 = sub_x16(c1,c5);
// inv vector_butterfly 32 96 1 0
int16x16 d2 = add_x16(c2,c6);
int16x16 d6 = sub_x16(c2,c6);
// inv vector_butterfly 0 64 1 0
int16x16 d0 = add_x16(c0,c4);
int16x16 d4 = sub_x16(c0,c4);
// assertranges ...
// inv physical_map (1, 2, 5, 6) (0, 3, 4)
#undef F
// inv physical_unmap (0, 1, 2, 6, 4, 5) (3,)
#define F(t,v) f[((((t)>>0)&1)<<3)+((((v)>>0)&1)<<0)+((((v)>>1)&1)<<1)+((((v)>>2)&1)<<2)+((((v)>>3)&1)<<6)+((((v)>>4)&1)<<4)+((((v)>>5)&1)<<5)]
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),d0);
_mm256_storeu_si256((int16x16 *) (f+16),d1);
_mm256_storeu_si256((int16x16 *) (f+32),d2);
_mm256_storeu_si256((int16x16 *) (f+48),d3);
_mm256_storeu_si256((int16x16 *) (f+64),d4);
_mm256_storeu_si256((int16x16 *) (f+80),d5);
_mm256_storeu_si256((int16x16 *) (f+96),d6);
_mm256_storeu_si256((int16x16 *) (f+112),d7);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b5 = _mm256_unpacklo_epi16(a5,a7);
int16x16 b7 = _mm256_unpackhi_epi16(a5,a7);
int16x16 c5 = _mm256_unpacklo_epi16(b5,b7);
int16x16 c7 = _mm256_unpackhi_epi16(b5,b7);
int16x16 d5 = _mm256_unpacklo_epi16(c5,c7);
int16x16 d7 = _mm256_unpackhi_epi16(c5,c7);
// inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b4 = _mm256_unpacklo_epi16(a4,a6);
int16x16 b6 = _mm256_unpackhi_epi16(a4,a6);
int16x16 c4 = _mm256_unpacklo_epi16(b4,b6);
int16x16 c6 = _mm256_unpackhi_epi16(b4,b6);
int16x16 d4 = _mm256_unpacklo_epi16(c4,c6);
int16x16 d6 = _mm256_unpackhi_epi16(c4,c6);
// inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b1 = _mm256_unpacklo_epi16(a1,a3);
int16x16 b3 = _mm256_unpackhi_epi16(a1,a3);
int16x16 c1 = _mm256_unpacklo_epi16(b1,b3);
int16x16 c3 = _mm256_unpackhi_epi16(b1,b3);
int16x16 d1 = _mm256_unpacklo_epi16(c1,c3);
int16x16 d3 = _mm256_unpackhi_epi16(c1,c3);
// inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 b0 = _mm256_unpacklo_epi16(a0,a2);
int16x16 b2 = _mm256_unpackhi_epi16(a0,a2);
int16x16 c0 = _mm256_unpacklo_epi16(b0,b2);
int16x16 c2 = _mm256_unpackhi_epi16(b0,b2);
int16x16 d0 = _mm256_unpacklo_epi16(c0,c2);
int16x16 d2 = _mm256_unpackhi_epi16(c0,c2);
// inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
d7 = mulmod_scaled_x16(d7,precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
d3 = mulmod_scaled_x16(d3,precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata);
// inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
d6 = mulmod_scaled_x16(d6,precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
d2 = mulmod_scaled_x16(d2,precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata);
// inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
d5 = mulmod_scaled_x16(d5,precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
d1 = mulmod_scaled_x16(d1,precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7,qdata);
// inv vector_reduce 64
d4 = reduce_x16(d4,qdata);
// inv vector_reduce 0
d0 = reduce_x16(d0,qdata);
// inv vector_butterfly 96 112 4 1
int16x16 e6 = add_x16(d6,d7);
int16x16 e7 = sub_x16(d6,d7);
e7 = mulmod_scaled_x16(e7,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 32 48 4 1
int16x16 e2 = add_x16(d2,d3);
int16x16 e3 = sub_x16(d2,d3);
e3 = mulmod_scaled_x16(e3,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 64 80 1 0
int16x16 e4 = add_x16(d4,d5);
int16x16 e5 = sub_x16(d4,d5);
// inv vector_butterfly 0 16 1 0
int16x16 e0 = add_x16(d0,d1);
int16x16 e1 = sub_x16(d0,d1);
// assertranges ...
// assertranges ...
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),e0);
_mm256_storeu_si256((int16x16 *) (f+16),e1);
_mm256_storeu_si256((int16x16 *) (f+32),e2);
_mm256_storeu_si256((int16x16 *) (f+48),e3);
_mm256_storeu_si256((int16x16 *) (f+64),e4);
_mm256_storeu_si256((int16x16 *) (f+80),e5);
_mm256_storeu_si256((int16x16 *) (f+96),e6);
_mm256_storeu_si256((int16x16 *) (f+112),e7);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 80 112 1 0
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b5 = add_x16(a5,a7);
int16x16 b7 = sub_x16(a5,a7);
// inv vector_butterfly 16 48 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 b1 = add_x16(a1,a3);
int16x16 b3 = sub_x16(a1,a3);
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+16),b1);
_mm256_storeu_si256((int16x16 *) (f+48),b3);
_mm256_storeu_si256((int16x16 *) (f+80),b5);
_mm256_storeu_si256((int16x16 *) (f+112),b7);
f += 128;
}
f -= 128*reps;
// inv stopbatch 128
for (long long r = 0;r < reps;++r) {
// inv vector_butterfly 64 96 1 0
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b4 = add_x16(a4,a6);
int16x16 b6 = sub_x16(a4,a6);
// inv vector_butterfly 0 32 1 0
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 b0 = add_x16(a0,a2);
int16x16 b2 = sub_x16(a0,a2);
// assertranges ...
// inv physical_map (0, 1, 2, 6, 4, 5) (3,)
#undef F
// inv physical_unmap (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
#define F(t,v) f[((((v)>>0)&1)<<0)+((((v)>>1)&1)<<1)+((((v)>>2)&1)<<2)+((((v)>>3)&1)<<3)+((((v)>>4)&1)<<4)+((((v)>>5)&1)<<5)+((((v)>>6)&1)<<6)+((((v)>>7)&1)<<7)+((((v)>>8)&1)<<8)]
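// At this point the layout is back to the natural order: F(t,v) = f[v], with t
// unused.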
// inv startbatch 128
_mm256_storeu_si256((int16x16 *) (f+0),b0);
_mm256_storeu_si256((int16x16 *) (f+32),b2);
_mm256_storeu_si256((int16x16 *) (f+64),b4);
_mm256_storeu_si256((int16x16 *) (f+96),b6);
f += 128;
}
f -= 128*reps;
// inv doublereps
reps /= 2;
// inv doublereps
reps /= 2;
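// After the two halvings, reps again counts full 512-coefficient transforms
// and the remaining passes advance f by 512 per batch.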
// inv stopbatch 512
for (long long r = 0;r < reps;++r) {
// inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f+432));
int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f+496));
int16x16 b27 = _mm256_permute2x128_si256_lo(a27,a31);
int16x16 b31 = _mm256_permute2x128_si256_hi(a27,a31);
// inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f+304));
int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f+368));
int16x16 b19 = _mm256_permute2x128_si256_lo(a19,a23);
int16x16 b23 = _mm256_permute2x128_si256_hi(a19,a23);
// inv vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f+176));
int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f+240));
int16x16 b11 = _mm256_permute2x128_si256_lo(a11,a15);
int16x16 b15 = _mm256_permute2x128_si256_hi(a11,a15);
// inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 b3 = _mm256_permute2x128_si256_lo(a3,a7);
int16x16 b7 = _mm256_permute2x128_si256_hi(a3,a7);
// inv vector_twist 496 512 3 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b31 = mulmod_scaled_x16(b31,precomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_509_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b27 = mulmod_scaled_x16(b27,precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_twist 368 512 509 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b23 = mulmod_scaled_x16(b23,precomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_3_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b19 = mulmod_scaled_x16(b19,precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b15 = mulmod_scaled_x16(b15,precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b11 = mulmod_scaled_x16(b11,precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
b7 = mulmod_scaled_x16(b7,precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63,qdata);
// inv vector_reduce 48
b3 = reduce_x16(b3,qdata);
// inv vector_butterfly 432 496 8 7
int16x16 c27 = add_x16(b27,b31);
int16x16 c31 = sub_x16(b27,b31);
c31 = mulmod_scaled_x16(c31,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
// inv vector_butterfly 304 368 8 1
int16x16 c19 = add_x16(b19,b23);
int16x16 c23 = sub_x16(b19,b23);
c23 = mulmod_scaled_x16(c23,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
// inv vector_butterfly 176 240 4 1
int16x16 c11 = add_x16(b11,b15);
int16x16 c15 = sub_x16(b11,b15);
c15 = mulmod_scaled_x16(c15,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 48 112 1 0
int16x16 c3 = add_x16(b3,b7);
int16x16 c7 = sub_x16(b3,b7);
// inv vector_reduce_ifforward 112
// inv startbatch 512
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
_mm256_storeu_si256((int16x16 *) (f+176),c11);
_mm256_storeu_si256((int16x16 *) (f+240),c15);
_mm256_storeu_si256((int16x16 *) (f+304),c19);
_mm256_storeu_si256((int16x16 *) (f+368),c23);
_mm256_storeu_si256((int16x16 *) (f+432),c27);
_mm256_storeu_si256((int16x16 *) (f+496),c31);
f += 512;
}
f -= 512*reps;
// inv stopbatch 512
for (long long r = 0;r < reps;++r) {
// inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f+416));
int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f+480));
int16x16 b26 = _mm256_permute2x128_si256_lo(a26,a30);
int16x16 b30 = _mm256_permute2x128_si256_hi(a26,a30);
// inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f+288));
int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f+352));
int16x16 b18 = _mm256_permute2x128_si256_lo(a18,a22);
int16x16 b22 = _mm256_permute2x128_si256_hi(a18,a22);
// inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f+160));
int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f+224));
int16x16 b10 = _mm256_permute2x128_si256_lo(a10,a14);
int16x16 b14 = _mm256_permute2x128_si256_hi(a10,a14);
// inv vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 b2 = _mm256_permute2x128_si256_lo(a2,a6);
int16x16 b6 = _mm256_permute2x128_si256_hi(a2,a6);
// inv vector_twist 480 512 3 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b30 = mulmod_scaled_x16(b30,precomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_509_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b26 = mulmod_scaled_x16(b26,precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_twist 352 512 509 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b22 = mulmod_scaled_x16(b22,precomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_3_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b18 = mulmod_scaled_x16(b18,precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b14 = mulmod_scaled_x16(b14,precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b10 = mulmod_scaled_x16(b10,precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
b6 = mulmod_scaled_x16(b6,precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47,qdata);
// inv vector_reduce 32
b2 = reduce_x16(b2,qdata);
// inv vector_butterfly 416 480 8 7
int16x16 c26 = add_x16(b26,b30);
int16x16 c30 = sub_x16(b26,b30);
c30 = mulmod_scaled_x16(c30,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
// inv vector_butterfly 288 352 8 1
int16x16 c18 = add_x16(b18,b22);
int16x16 c22 = sub_x16(b18,b22);
c22 = mulmod_scaled_x16(c22,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
// inv vector_butterfly 160 224 4 1
int16x16 c10 = add_x16(b10,b14);
int16x16 c14 = sub_x16(b10,b14);
c14 = mulmod_scaled_x16(c14,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 32 96 1 0
int16x16 c2 = add_x16(b2,b6);
int16x16 c6 = sub_x16(b2,b6);
// inv vector_reduce_ifforward 96
// inv startbatch 512
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
_mm256_storeu_si256((int16x16 *) (f+160),c10);
_mm256_storeu_si256((int16x16 *) (f+224),c14);
_mm256_storeu_si256((int16x16 *) (f+288),c18);
_mm256_storeu_si256((int16x16 *) (f+352),c22);
_mm256_storeu_si256((int16x16 *) (f+416),c26);
_mm256_storeu_si256((int16x16 *) (f+480),c30);
f += 512;
}
f -= 512*reps;
// inv stopbatch 512
for (long long r = 0;r < reps;++r) {
// inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f+400));
int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f+464));
int16x16 b25 = _mm256_permute2x128_si256_lo(a25,a29);
int16x16 b29 = _mm256_permute2x128_si256_hi(a25,a29);
// inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f+272));
int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f+336));
int16x16 b17 = _mm256_permute2x128_si256_lo(a17,a21);
int16x16 b21 = _mm256_permute2x128_si256_hi(a17,a21);
// inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f+144));
int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f+208));
int16x16 b9 = _mm256_permute2x128_si256_lo(a9,a13);
int16x16 b13 = _mm256_permute2x128_si256_hi(a9,a13);
// inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 b1 = _mm256_permute2x128_si256_lo(a1,a5);
int16x16 b5 = _mm256_permute2x128_si256_hi(a1,a5);
// inv vector_twist 464 512 3 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b29 = mulmod_scaled_x16(b29,precomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_509_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b25 = mulmod_scaled_x16(b25,precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_twist 336 512 509 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b21 = mulmod_scaled_x16(b21,precomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_3_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b17 = mulmod_scaled_x16(b17,precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b13 = mulmod_scaled_x16(b13,precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b9 = mulmod_scaled_x16(b9,precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
b5 = mulmod_scaled_x16(b5,precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31,qdata);
// inv vector_reduce 16
b1 = reduce_x16(b1,qdata);
// inv vector_butterfly 400 464 8 7
int16x16 c25 = add_x16(b25,b29);
int16x16 c29 = sub_x16(b25,b29);
c29 = mulmod_scaled_x16(c29,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
// inv vector_butterfly 272 336 8 1
int16x16 c17 = add_x16(b17,b21);
int16x16 c21 = sub_x16(b17,b21);
c21 = mulmod_scaled_x16(c21,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
// inv vector_butterfly 144 208 4 1
int16x16 c9 = add_x16(b9,b13);
int16x16 c13 = sub_x16(b9,b13);
c13 = mulmod_scaled_x16(c13,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 16 80 1 0
int16x16 c1 = add_x16(b1,b5);
int16x16 c5 = sub_x16(b1,b5);
// inv vector_reduce_ifforward 80
// inv startbatch 512
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+144),c9);
_mm256_storeu_si256((int16x16 *) (f+208),c13);
_mm256_storeu_si256((int16x16 *) (f+272),c17);
_mm256_storeu_si256((int16x16 *) (f+336),c21);
_mm256_storeu_si256((int16x16 *) (f+400),c25);
_mm256_storeu_si256((int16x16 *) (f+464),c29);
f += 512;
}
f -= 512*reps;
// inv stopbatch 512
for (long long r = 0;r < reps;++r) {
// inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f+384));
int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f+448));
int16x16 b24 = _mm256_permute2x128_si256_lo(a24,a28);
int16x16 b28 = _mm256_permute2x128_si256_hi(a24,a28);
// inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f+256));
int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f+320));
int16x16 b16 = _mm256_permute2x128_si256_lo(a16,a20);
int16x16 b20 = _mm256_permute2x128_si256_hi(a16,a20);
// inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f+128));
int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f+192));
int16x16 b8 = _mm256_permute2x128_si256_lo(a8,a12);
int16x16 b12 = _mm256_permute2x128_si256_hi(a8,a12);
// inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f+0));
int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f+64));
int16x16 b0 = _mm256_permute2x128_si256_lo(a0,a4);
int16x16 b4 = _mm256_permute2x128_si256_hi(a0,a4);
// inv vector_twist 448 512 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b28 = mulmod_scaled_x16(b28,precomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_509_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b24 = mulmod_scaled_x16(b24,precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 320 512 509 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b20 = mulmod_scaled_x16(b20,precomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_3_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b16 = mulmod_scaled_x16(b16,precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b12 = mulmod_scaled_x16(b12,precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b8 = mulmod_scaled_x16(b8,precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
b4 = mulmod_scaled_x16(b4,precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15,qdata);
// inv vector_reduce 0
b0 = reduce_x16(b0,qdata);
// inv vector_butterfly 384 448 8 7
int16x16 c24 = add_x16(b24,b28);
int16x16 c28 = sub_x16(b24,b28);
c28 = mulmod_scaled_x16(c28,scaledzeta_x16_8_1,qinvscaledzeta_x16_8_1,qdata);
// inv vector_butterfly 256 320 8 1
int16x16 c16 = add_x16(b16,b20);
int16x16 c20 = sub_x16(b16,b20);
c20 = mulmod_scaled_x16(c20,scaledzeta_x16_8_7,qinvscaledzeta_x16_8_7,qdata);
// inv vector_butterfly 128 192 4 1
int16x16 c8 = add_x16(b8,b12);
int16x16 c12 = sub_x16(b8,b12);
c12 = mulmod_scaled_x16(c12,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 0 64 1 0
int16x16 c0 = add_x16(b0,b4);
int16x16 c4 = sub_x16(b0,b4);
// inv vector_reduce_ifforward 64
// assertranges ...
// assertranges ...
// assertranges ...
// assertranges ...
// inv vector_butterfly 368 496 4 1
int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f+368));
int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f+496));
int16x16 b23 = add_x16(a23,a31);
int16x16 b31 = sub_x16(a23,a31);
b31 = mulmod_scaled_x16(b31,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 304 432 4 1
int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f+304));
int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f+432));
int16x16 b19 = add_x16(a19,a27);
int16x16 b27 = sub_x16(a19,a27);
b27 = mulmod_scaled_x16(b27,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 336 464 4 1
int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f+336));
int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f+464));
int16x16 b21 = add_x16(a21,a29);
int16x16 b29 = sub_x16(a21,a29);
b29 = mulmod_scaled_x16(b29,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 272 400 4 1
int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f+272));
int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f+400));
int16x16 b17 = add_x16(a17,a25);
int16x16 b25 = sub_x16(a17,a25);
b25 = mulmod_scaled_x16(b25,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 352 480 4 1
int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f+352));
int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f+480));
int16x16 b22 = add_x16(a22,a30);
int16x16 b30 = sub_x16(a22,a30);
b30 = mulmod_scaled_x16(b30,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 288 416 4 1
int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f+288));
int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f+416));
int16x16 b18 = add_x16(a18,a26);
int16x16 b26 = sub_x16(a18,a26);
b26 = mulmod_scaled_x16(b26,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 320 448 4 1
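// This butterfly reuses c20/c28 computed above in the same iteration instead
// of reloading from f, since offsets 320 and 448 have not been stored back yet.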
int16x16 d20 = add_x16(c20,c28);
int16x16 d28 = sub_x16(c20,c28);
d28 = mulmod_scaled_x16(d28,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 256 384 4 1
int16x16 d16 = add_x16(c16,c24);
int16x16 d24 = sub_x16(c16,c24);
d24 = mulmod_scaled_x16(d24,scaledzeta_x16_4_3,qinvscaledzeta_x16_4_3,qdata);
// inv vector_butterfly 112 240 1 0
int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f+112));
int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f+240));
int16x16 b7 = add_x16(a7,a15);
int16x16 b15 = sub_x16(a7,a15);
// inv vector_butterfly 48 176 1 0
int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f+48));
int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f+176));
int16x16 b3 = add_x16(a3,a11);
int16x16 b11 = sub_x16(a3,a11);
// inv vector_butterfly 80 208 1 0
int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f+80));
int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f+208));
int16x16 b5 = add_x16(a5,a13);
int16x16 b13 = sub_x16(a5,a13);
// inv vector_butterfly 16 144 1 0
int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f+16));
int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f+144));
int16x16 b1 = add_x16(a1,a9);
int16x16 b9 = sub_x16(a1,a9);
// inv vector_butterfly 96 224 1 0
int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f+96));
int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f+224));
int16x16 b6 = add_x16(a6,a14);
int16x16 b14 = sub_x16(a6,a14);
// inv vector_butterfly 32 160 1 0
int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f+32));
int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f+160));
int16x16 b2 = add_x16(a2,a10);
int16x16 b10 = sub_x16(a2,a10);
// inv vector_butterfly 64 192 1 0
int16x16 d4 = add_x16(c4,c12);
int16x16 d12 = sub_x16(c4,c12);
// inv vector_butterfly 0 128 1 0
int16x16 d0 = add_x16(c0,c8);
int16x16 d8 = sub_x16(c0,c8);
// inv vector_reduce_ifreverse 304
b19 = reduce_x16(b19,qdata);
// inv vector_reduce_ifreverse 272
b17 = reduce_x16(b17,qdata);
// inv vector_reduce_ifreverse 288
b18 = reduce_x16(b18,qdata);
// inv vector_reduce_ifreverse 256
d16 = reduce_x16(d16,qdata);
// inv vector_reduce_ifreverse 48
b3 = reduce_x16(b3,qdata);
// inv vector_reduce_ifreverse 16
b1 = reduce_x16(b1,qdata);
// inv vector_reduce_ifreverse 32
b2 = reduce_x16(b2,qdata);
// inv vector_reduce_ifreverse 0
d0 = reduce_x16(d0,qdata);
// assertranges ...
// assertranges ...
// inv vector_butterfly 240 496 1 0
int16x16 c15 = add_x16(b15,b31);
int16x16 c31 = sub_x16(b15,b31);
// inv vector_butterfly 112 368 1 0
int16x16 c7 = add_x16(b7,b23);
int16x16 c23 = sub_x16(b7,b23);
// inv vector_butterfly 176 432 1 0
int16x16 c11 = add_x16(b11,b27);
int16x16 c27 = sub_x16(b11,b27);
// inv vector_butterfly 48 304 1 0
int16x16 c3 = add_x16(b3,b19);
int16x16 c19 = sub_x16(b3,b19);
// inv vector_butterfly 208 464 1 0
int16x16 c13 = add_x16(b13,b29);
int16x16 c29 = sub_x16(b13,b29);
// inv vector_butterfly 80 336 1 0
int16x16 c5 = add_x16(b5,b21);
int16x16 c21 = sub_x16(b5,b21);
// inv vector_butterfly 144 400 1 0
int16x16 c9 = add_x16(b9,b25);
int16x16 c25 = sub_x16(b9,b25);
// inv vector_butterfly 16 272 1 0
int16x16 c1 = add_x16(b1,b17);
int16x16 c17 = sub_x16(b1,b17);
// inv vector_butterfly 224 480 1 0
int16x16 c14 = add_x16(b14,b30);
int16x16 c30 = sub_x16(b14,b30);
// inv vector_butterfly 96 352 1 0
int16x16 c6 = add_x16(b6,b22);
int16x16 c22 = sub_x16(b6,b22);
// inv vector_butterfly 160 416 1 0
int16x16 c10 = add_x16(b10,b26);
int16x16 c26 = sub_x16(b10,b26);
// inv vector_butterfly 32 288 1 0
int16x16 c2 = add_x16(b2,b18);
int16x16 c18 = sub_x16(b2,b18);
// inv vector_butterfly 192 448 1 0
int16x16 e12 = add_x16(d12,d28);
int16x16 e28 = sub_x16(d12,d28);
// inv vector_butterfly 64 320 1 0
int16x16 e4 = add_x16(d4,d20);
int16x16 e20 = sub_x16(d4,d20);
// inv vector_butterfly 128 384 1 0
int16x16 e8 = add_x16(d8,d24);
int16x16 e24 = sub_x16(d8,d24);
// inv vector_butterfly 0 256 1 0
int16x16 e0 = add_x16(d0,d16);
int16x16 e16 = sub_x16(d0,d16);
// assertranges ...
// inv physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
#undef F
// inv startbatch 512
_mm256_storeu_si256((int16x16 *) (f+0),e0);
_mm256_storeu_si256((int16x16 *) (f+16),c1);
_mm256_storeu_si256((int16x16 *) (f+32),c2);
_mm256_storeu_si256((int16x16 *) (f+48),c3);
_mm256_storeu_si256((int16x16 *) (f+64),e4);
_mm256_storeu_si256((int16x16 *) (f+80),c5);
_mm256_storeu_si256((int16x16 *) (f+96),c6);
_mm256_storeu_si256((int16x16 *) (f+112),c7);
_mm256_storeu_si256((int16x16 *) (f+128),e8);
_mm256_storeu_si256((int16x16 *) (f+144),c9);
_mm256_storeu_si256((int16x16 *) (f+160),c10);
_mm256_storeu_si256((int16x16 *) (f+176),c11);
_mm256_storeu_si256((int16x16 *) (f+192),e12);
_mm256_storeu_si256((int16x16 *) (f+208),c13);
_mm256_storeu_si256((int16x16 *) (f+224),c14);
_mm256_storeu_si256((int16x16 *) (f+240),c15);
_mm256_storeu_si256((int16x16 *) (f+256),e16);
_mm256_storeu_si256((int16x16 *) (f+272),c17);
_mm256_storeu_si256((int16x16 *) (f+288),c18);
_mm256_storeu_si256((int16x16 *) (f+304),c19);
_mm256_storeu_si256((int16x16 *) (f+320),e20);
_mm256_storeu_si256((int16x16 *) (f+336),c21);
_mm256_storeu_si256((int16x16 *) (f+352),c22);
_mm256_storeu_si256((int16x16 *) (f+368),c23);
_mm256_storeu_si256((int16x16 *) (f+384),e24);
_mm256_storeu_si256((int16x16 *) (f+400),c25);
_mm256_storeu_si256((int16x16 *) (f+416),c26);
_mm256_storeu_si256((int16x16 *) (f+432),c27);
_mm256_storeu_si256((int16x16 *) (f+448),e28);
_mm256_storeu_si256((int16x16 *) (f+464),c29);
_mm256_storeu_si256((int16x16 *) (f+480),c30);
_mm256_storeu_si256((int16x16 *) (f+496),c31);
f += 512;
}
f -= 512*reps;
// inv startntt 512
}
void ntt_ops_512_7681_inv(int16 *f,long long reps)
{
invntt512(f,reps,qdata_7681);
}
void ntt_ops_512_10753_inv(int16 *f,long long reps)
{
invntt512(f,reps,qdata_10753);
}