QOJ.ac

QOJ

IDProblemSubmitterResultTimeMemoryLanguageFile sizeSubmit timeJudge time
#110169#4946. 青蕈领主xiaoyaowudi100 ✓323ms7268kbC++1746.6kb2023-05-31 22:43:332023-05-31 22:43:35

Judging History

你现在查看的是最新测评结果

  • [2023-08-10 23:21:45]
  • System Update: QOJ starts to keep a history of the judgings of all the submissions.
  • [2023-05-31 22:43:35]
  • 评测
  • 测评结果:100
  • 用时:323ms
  • 内存:7268kb
  • [2023-05-31 22:43:33]
  • 提交

answer

#define __AVX__ 1
#define __AVX2__ 1
#define __AVX512F__ 1
#define __AVX512DQ__ 1
#pragma GCC target("avx,avx2,avx512f,avx512dq")
// #pragma GCC target("avx,avx2")
#include <iostream>
#include <immintrin.h>
#include <cstdint>
#include <memory>
#include <new>
#include <vector>
#include <array>
#include <exception>
#include <cstring>
#include <algorithm>
namespace library
{
	typedef std::uint32_t ui;
	typedef std::int32_t i32;
	typedef std::int64_t ll;
	typedef std::uint64_t ull;
	typedef __uint128_t u128;
	typedef __int128 i128;
	constexpr ui default_mod=998244353;
	struct fast_mod_32 {
		ull b, m;
		fast_mod_32(ull b) : b(b), m(ull((u128(1) << 64) / b)) {}
		fast_mod_32(){}
		fast_mod_32(const fast_mod_32 &d):b(d.b),m(d.m){}
		ull reduce(ull a) {
			ull q = (ull)((u128(m) * a) >> 64);
			ull r = a - q * b;
			return r >= b ? r - b : r;
		}
	};
	fast_mod_32 global_fast_mod_32(default_mod);
	struct montgomery_mi_lib{
		constexpr static ui calc_k(ui MOD,ui len){ui ans=1;for(ui i=1;i<len;++i) ans=(ans*(MOD+1)+1);return ans;}
		ui P,P2,NP,Pk;static constexpr ui ui_len = sizeof(ui)*8;
		montgomery_mi_lib(ui P0):P(P0),P2(P0*2),NP((-ull(P0))%P0),Pk(calc_k(P0,ui_len)){}
		montgomery_mi_lib(){}
		#define INLINE_OP __attribute__((__always_inline__))
		INLINE_OP ui redd(ui k) const {return k>=P2?k-P2:k;}INLINE_OP ui reds(ui k) const {return k>=P?k-P:k;}INLINE_OP ui redu(ull k) const {return (k+ull(ui(k)*Pk)*P)>>ui_len;}
		INLINE_OP ui add(ui a,ui b) const {return redd(a+b);}INLINE_OP ui sub(ui a,ui b) const {return redd(a-b+P2);}INLINE_OP ui mul(ui a,ui b) const {return redu(ull(a)*b);}
		INLINE_OP ui neg(ui a) const {return redd(P2-a);}INLINE_OP ui v(ui a) const {return redu(ull(a)*NP);}INLINE_OP ui rv(ui a) const {return reds(redu(a));}
		#undef INLINE_OP
	};
	struct montgomery_mi{
		static montgomery_mi_lib mlib;
		ui val;
		void init(ui a){val=mlib.redu(ull(a)*mlib.NP);}
		montgomery_mi(){val=0;} montgomery_mi(const montgomery_mi &a):val(a.val){} montgomery_mi(ui v):val(mlib.redu(ull(v)*mlib.NP)){}
		montgomery_mi& operator=(const montgomery_mi &b) {val=b.val;return *this;}
		montgomery_mi& operator+=(const montgomery_mi &b) {val=mlib.redd(val+b.val);return *this;}
		montgomery_mi& operator-=(const montgomery_mi &b) {val=mlib.redd(val-b.val+mlib.P2);return *this;}
		montgomery_mi operator+(const montgomery_mi &b) const {return montgomery_mi(*this)+=b;}
		montgomery_mi operator-(const montgomery_mi &b) const {return montgomery_mi(*this)-=b;}
		montgomery_mi operator*=(const montgomery_mi &b) {val=mlib.redu(ull(val)*b.val);return *this;}
		montgomery_mi operator*(const montgomery_mi &b) const {return montgomery_mi(*this)*=b;}
		montgomery_mi operator-() const {montgomery_mi b;return b-=(*this);}
		ui real_val() const {return mlib.reds(mlib.redu(val));}
		ui get_val() const {return val;}
		friend std::istream& operator>>(std::istream &in, montgomery_mi &m_int){ui inp;in>>inp;m_int.init(inp);return in;}
		friend std::ostream& operator<<(std::ostream &out, const montgomery_mi &m_int){out<<m_int.real_val();return out;}
	};
	static __attribute__((__always_inline__)) inline montgomery_mi ui2mi(ui k){
		union ui2mi_impl
		{
			ui uval;
			montgomery_mi mval;
			ui2mi_impl(){}
		}T;
		T.uval=k;
		return T.mval;
	}
	typedef montgomery_mi mi;
	typedef montgomery_mi_lib lmi;
	#if defined(__AVX__) && defined(__AVX2__)
	struct montgomery_mm256_lib{
		alignas(32) __m256i P,P2,NP,Pk;static constexpr ui ui_len=sizeof(ui)*8;
		montgomery_mm256_lib(ui P0){P=_mm256_set1_epi32(P0),P2=_mm256_set1_epi32(P0*2),
			NP=_mm256_set1_epi32(ui((-ull(P0))%P0)),Pk=_mm256_set1_epi32(montgomery_mi_lib::calc_k(P0,ui_len));}
		montgomery_mm256_lib(){}
		#define INLINE_OP __attribute__((__always_inline__))
		INLINE_OP __m256i redd(__m256i k){__m256i a=_mm256_sub_epi32(k,P2);__m256i b=_mm256_and_si256(_mm256_cmpgt_epi32(_mm256_setzero_si256(),a),P2);return _mm256_add_epi32(a,b);}
		INLINE_OP __m256i reds(__m256i k){__m256i a=_mm256_sub_epi32(k,P); __m256i b=_mm256_and_si256(_mm256_cmpgt_epi32(_mm256_setzero_si256(),a),P); return _mm256_add_epi32(a,b);}
		INLINE_OP __m256i redu(__m256i k){return _mm256_srli_epi64(_mm256_add_epi64(_mm256_mul_epu32(_mm256_mul_epu32(k,Pk),P),k),32);}
		INLINE_OP __m256i mul(__m256i k1,__m256i k2){
			return _mm256_or_si256(redu(_mm256_mul_epu32(k1,k2)),_mm256_slli_epi64(redu(_mm256_mul_epu32(_mm256_srli_epi64(k1,32),_mm256_srli_epi64(k2,32))),32));}
		INLINE_OP __m256i add(__m256i k1,__m256i k2){return redd(_mm256_add_epi32(k1,k2));}
		INLINE_OP __m256i sub(__m256i k1,__m256i k2){return redd(_mm256_add_epi32(P2,_mm256_sub_epi32(k1,k2)));}
		#undef INLINE_OP
	};
	struct montgomery_mm256_int{
		static montgomery_mm256_lib mlib;
		__m256i val;
		void init(__m256i a){val=mlib.mul(a,mlib.NP);}
		montgomery_mm256_int(){val=_mm256_setzero_si256();} montgomery_mm256_int(const montgomery_mm256_int &a):val(a.val){} montgomery_mm256_int(__m256i v):val(mlib.mul(v,mlib.NP)){}
		montgomery_mm256_int(ui v){init(_mm256_set1_epi32(v));}
		montgomery_mm256_int& operator=(const montgomery_mm256_int &b) {val=b.val;return *this;}
		montgomery_mm256_int& operator+=(const montgomery_mm256_int &b) {val=mlib.add(val,b.val);return *this;}
		montgomery_mm256_int& operator-=(const montgomery_mm256_int &b) {val=mlib.sub(val,b.val);return *this;}
		montgomery_mm256_int operator+(const montgomery_mm256_int &b) const {return montgomery_mm256_int(*this)+=b;}
		montgomery_mm256_int operator-(const montgomery_mm256_int &b) const {return montgomery_mm256_int(*this)-=b;}
		montgomery_mm256_int operator*=(const montgomery_mm256_int &b) {val=mlib.mul(val,b.val);return *this;}
		montgomery_mm256_int operator*(const montgomery_mm256_int &b) const {return montgomery_mm256_int(*this)*=b;}
		montgomery_mm256_int operator-() const {montgomery_mm256_int b;return b-=(*this);}
		__m256i real_val() const {return mlib.reds(_mm256_or_si256(_mm256_slli_epi64(mlib.redu(_mm256_srli_epi64(val,32)),32),
			mlib.redu(_mm256_srli_epi64(_mm256_slli_epi64(val,32),32))));}
		__m256i get_val() const {return val;}
	};
	typedef montgomery_mm256_int mai;
	typedef montgomery_mm256_lib lma;
	#endif
	#if defined(__AVX512F__) && defined(__AVX512DQ__)
	struct montgomery_mm512_lib{
		alignas(64) __m512i P,P2,NP,Pk;static constexpr ui ui_len=sizeof(ui)*8;
		montgomery_mm512_lib(ui P0){P=_mm512_set1_epi32(P0),P2=_mm512_set1_epi32(P0*2),
			NP=_mm512_set1_epi32(ui((-ull(P0))%P0)),Pk=_mm512_set1_epi32(montgomery_mi_lib::calc_k(P0,ui_len));}
		montgomery_mm512_lib(){}
		#define INLINE_OP __attribute__((__always_inline__))
		INLINE_OP __m512i redd(__m512i k){__m512i a=_mm512_sub_epi32(k,P2);return _mm512_mask_add_epi32(a,_mm512_movepi32_mask(a),a,P2);}
		INLINE_OP __m512i reds(__m512i k){__m512i a=_mm512_sub_epi32(k,P);return _mm512_mask_add_epi32(a,_mm512_movepi32_mask(a),a,P);}
		INLINE_OP __m512i redu(__m512i k){return _mm512_srli_epi64(_mm512_add_epi64(_mm512_mul_epu32(_mm512_mul_epu32(k,Pk),P),k),32);}
		INLINE_OP __m512i mul(__m512i k1,__m512i k2){
			return _mm512_or_si512(redu(_mm512_mul_epu32(k1,k2)),_mm512_slli_epi64(redu(_mm512_mul_epu32(_mm512_srli_epi64(k1,32),_mm512_srli_epi64(k2,32))),32));}
		INLINE_OP __m512i add(__m512i k1,__m512i k2){return redd(_mm512_add_epi32(k1,k2));}
		INLINE_OP __m512i sub(__m512i k1,__m512i k2){return redd(_mm512_add_epi32(P2,_mm512_sub_epi32(k1,k2)));}
		#undef INLINE_OP
	};
	struct montgomery_mm512_int{
		static montgomery_mm512_lib mlib;
		__m512i val;
		void init(__m512i a){val=mlib.mul(a,mlib.NP);}
		montgomery_mm512_int(){val=_mm512_setzero_si512();} montgomery_mm512_int(const montgomery_mm512_int &a):val(a.val){} montgomery_mm512_int(__m512i v):val(mlib.mul(v,mlib.NP)){}
		montgomery_mm512_int(ui v){init(_mm512_set1_epi32(v));}
		montgomery_mm512_int& operator=(const montgomery_mm512_int &b) {val=b.val;return *this;}
		montgomery_mm512_int& operator+=(const montgomery_mm512_int &b) {val=mlib.add(val,b.val);return *this;}
		montgomery_mm512_int& operator-=(const montgomery_mm512_int &b) {val=mlib.sub(val,b.val);return *this;}
		montgomery_mm512_int operator+(const montgomery_mm512_int &b) const {return montgomery_mm512_int(*this)+=b;}
		montgomery_mm512_int operator-(const montgomery_mm512_int &b) const {return montgomery_mm512_int(*this)-=b;}
		montgomery_mm512_int operator*=(const montgomery_mm512_int &b) {val=mlib.mul(val,b.val);return *this;}
		montgomery_mm512_int operator*(const montgomery_mm512_int &b) const {return montgomery_mm512_int(*this)*=b;}
		montgomery_mm512_int operator-() const {montgomery_mm512_int b;return b-=(*this);}
		__m512i real_val() const {return mlib.reds(_mm512_or_si512(_mm512_slli_epi64(mlib.redu(_mm512_srli_epi64(val,32)),32),
			mlib.redu(_mm512_srli_epi64(_mm512_slli_epi64(val,32),32))));}
		__m512i get_val() const {return val;}
	};
	typedef montgomery_mm512_int m5i;
	typedef montgomery_mm512_lib lm5;
	#endif
	ui global_mod_mi=default_mod;
	void set_mod_mi(ui p);
	#if defined(__AVX__) && defined(__AVX2__)
	ui global_mod_mai=default_mod;
	void set_mod_mai(ui p);
	#endif
	#if defined(__AVX512F__) && defined(__AVX512DQ__)
	ui global_mod_m5i=default_mod;
	void set_mod_m5i(ui p);
	#endif
	montgomery_mi_lib mi::mlib(default_mod);
	void set_mod_mi(ui p){
		mi::mlib=montgomery_mi_lib(p);
		global_fast_mod_32=fast_mod_32(p);
		global_mod_mi=p;
	}
	#if defined(__AVX__) && defined(__AVX2__)
	montgomery_mm256_lib mai::mlib(default_mod);
	void set_mod_mai(ui p){
		mai::mlib=montgomery_mm256_lib(p);
		global_mod_mai=p;
	}
	#endif
	#if defined(__AVX512F__) && defined(__AVX512DQ__)
	montgomery_mm512_lib m5i::mlib(default_mod);
	void set_mod_m5i(ui p){
		m5i::mlib=montgomery_mm512_lib(p);
		global_mod_m5i=p;
	}
	#endif
	template<typename T,size_t align_val>
	struct aligned_delete {
		void operator()(T* ptr) const {
			operator delete[](ptr,std::align_val_t(align_val));
		}
	};
	template<typename T,size_t align_val>
	using aligned_array=std::unique_ptr<T[],aligned_delete<T,align_val>>;
	template<typename T,size_t align_val>
	aligned_array<T,align_val> create_aligned_array(size_t size){
		return aligned_array<T,align_val>(new(std::align_val_t(align_val)) T[size]);
	}
	#define restrict __restrict
	#define NTT_partition_size 10
	typedef std::vector<mi> poly;
	class polynomial_kernel_mtt;
	class polynomial_kernel_ntt
	{
		friend class polynomial_kernel_mtt;
	private:
		static constexpr ui tmp_size=9;
		aligned_array<ui,64> ws0,ws1,_inv,tt[tmp_size],num;ui P,G;
		ui fn,fb,mx;
		void release(){
			ws0.reset();ws1.reset();
			_inv.reset();num.reset();
			fn=fb=mx=0;
			for(ui i=0;i<tmp_size;++i) tt[i].reset();
		}
		ui _fastpow(ui a,ui b){ui ans=li.v(1),off=a;while(b){if(b&1) ans=li.mul(ans,off);off=li.mul(off,off);b>>=1;}return ans;}
		void dif(ui* restrict p,ui n){
			ui len=(1<<n);
			ui* restrict ws=ws0.get();
			if(len<16){
				ui t1,t2;
				for(ui l=len;l>=2;l>>=1) for(ui j=0,mid=(l>>1);j<len;j+=l){
					ui restrict *p1=p+j,*p2=p+j+mid,*ww=ws+mid;
					for(ui i=0;i<mid;++i,++p1,++p2,++ww) t1=*p1,t2=*p2,*p1=li.add(t1,t2),*p2=li.mul(li.sub(t1,t2),(*ww));
				}
			}else if(len<=(1<<NTT_partition_size)){
				__m512i* pp=(__m512i*)p,*p1,*p2,*ww;
				__m512i msk,val;__mmask16 smsk;
				for(ui l=len;l>16;l>>=1){
					ui mid=(l>>1);
					for(ui j=0;j<len;j+=l){
						p1=(__m512i*)(p+j),p2=(__m512i*)(p+j+mid),ww=(__m512i*)(ws+mid);
						for(ui i=0;i<mid;i+=16,++p1,++p2,++ww){
							__m512i x=*p1,y=*p2;
							*p1=l5.add(x,y);
							*p2=l5.mul(l5.sub(x,y),*ww);
						}
					}
				}
				val=_mm512_setr_epi32(ws[8],ws[8],ws[8],ws[8],
									  ws[8],ws[8],ws[8],ws[8],
									  ws[8],ws[9],ws[10],ws[11],
									  ws[12],ws[13],ws[14],ws[15]);
				msk=_mm512_setr_epi32(0,0,0,0,0,0,0,0,P*2,P*2,P*2,P*2,P*2,P*2,P*2,P*2);
				smsk=0xff00;
				pp=(__m512i*)p;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_i64x2(*pp,*pp,_MM_PERM_BADC);
					__m512i y=_mm512_mask_sub_epi32(*pp,smsk,msk,*pp);
					*pp=l5.mul(l5.add(x,y),val);
				}
				val=_mm512_setr_epi32(ws[4],ws[4],ws[4],ws[4],
									  ws[4],ws[5],ws[6],ws[7],
									  ws[4],ws[4],ws[4],ws[4],
									  ws[4],ws[5],ws[6],ws[7]);
				smsk=0xf0f0;
				msk=_mm512_setr_epi32(0,0,0,0,P*2,P*2,P*2,P*2,0,0,0,0,P*2,P*2,P*2,P*2);
				pp=(__m512i*)p;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_i64x2(*pp,*pp,_MM_PERM_CDAB);
					__m512i y=_mm512_mask_sub_epi32(*pp,smsk,msk,*pp);
					*pp=l5.mul(l5.add(x,y),val);
				}
				val=_mm512_setr_epi32(ws[2],ws[2],ws[2],ws[3],
									  ws[2],ws[2],ws[2],ws[3],
									  ws[2],ws[2],ws[2],ws[3],
									  ws[2],ws[2],ws[2],ws[3]);
				msk=_mm512_setr_epi32(0,0,P*2,P*2,0,0,P*2,P*2,0,0,P*2,P*2,0,0,P*2,P*2);
				pp=(__m512i*)p;
				smsk=0xcccc;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_epi32(*pp,_MM_PERM_BADC);
					__m512i y=_mm512_mask_sub_epi32(*pp,smsk,msk,*pp);
					*pp=l5.mul(l5.add(x,y),val);
				}
				msk=_mm512_setr_epi32(0,P*2,0,P*2,0,P*2,0,P*2,0,P*2,0,P*2,0,P*2,0,P*2);
				pp=(__m512i*)p;
				smsk=0xaaaa;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_epi32(*pp,_MM_PERM_CDAB);
					__m512i y=_mm512_mask_sub_epi32(*pp,smsk,msk,*pp);
					*pp=l5.add(x,y);
				}
			}
			else{
				__m512i *p1=(__m512i*)(p),*p2=(__m512i*)(p+(len>>2)),*p3=(__m512i*)(p+(len>>1)),*p4=(__m512i*)(p+(len>>2)*3),*w1=(__m512i*)(ws0.get()+(len>>1)),
				*w2=(__m512i*)(ws0.get()+(len>>1)+(len>>2)),*w3=(__m512i*)(ws0.get()+(len>>2));
				for(ui i=0;i<(len>>2);i+=16,++p1,++p2,++p3,++p4,++w2,++w3,++w1){
					__m512i x=(*(p1)),y=(*(p2)),z=(*(p3)),w=(*(p4));
					__m512i r=l5.add(x,z),s=l5.mul(l5.sub(x,z),*w1);
					__m512i t=l5.add(y,w),q=l5.mul(l5.sub(y,w),*w2);
					(*(p1))=l5.add(r,t);(*(p2))=l5.mul(l5.sub(r,t),*w3);
					(*(p3))=l5.add(s,q);(*(p4))=l5.mul(l5.sub(s,q),*w3);
				}
				dif(p,n-2);dif(p+(1<<(n-2)),n-2);dif(p+(1<<(n-1)),n-2);dif(p+(1<<(n-2))*3,n-2);
			}
		}
		void dit(ui* restrict p,ui n,bool inverse_coef=true){
			ui len=(1<<n);
			ui* restrict ws=ws1.get();
			if(len<16){
				ui t1,t2;
				for(ui l=2;l<=len;l<<=1) for(ui j=0,mid=(l>>1);j<len;j+=l){
					ui restrict *p1=p+j,*p2=p+j+mid,*ww=ws+mid;
					for(ui i=0;i<mid;++i,++p1,++p2,++ww) t1=*p1,t2=li.mul((*p2),(*ww)),*p1=li.add(t1,t2),*p2=li.sub(t1,t2);
				}
				ui co=_inv[len-1];ui* restrict p1=p;
				for(ui i=0;i<len;++i,++p1) (*p1)=li.mul(co,(*p1));
			}else if(len<=(1<<NTT_partition_size)){
				__m512i* pp=(__m512i*)p,*p1,*p2,*ww;
				__m512i msk,val;__mmask16 smsk;
				msk=_mm512_setr_epi32(0,P*2,0,P*2,0,P*2,0,P*2,0,P*2,0,P*2,0,P*2,0,P*2);
				smsk=0xaaaa;
				pp=(__m512i*)p;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_epi32(*pp,_MM_PERM_CDAB);
					__m512i y=_mm512_mask_sub_epi32(*pp,smsk,msk,*pp);
					*pp=l5.add(x,y);
				}
				val=_mm512_setr_epi32(ws[2],ws[3],li.neg(ws[2]),li.neg(ws[3]),
									  ws[2],ws[3],li.neg(ws[2]),li.neg(ws[3]),
									  ws[2],ws[3],li.neg(ws[2]),li.neg(ws[3]),
									  ws[2],ws[3],li.neg(ws[2]),li.neg(ws[3]));
				pp=(__m512i*)p;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_epi32(*pp,_MM_PERM_BABA);
					__m512i y=_mm512_shuffle_epi32(*pp,_MM_PERM_DCDC);
					*pp=l5.add(x,l5.mul(y,val));
				}
				val=_mm512_setr_epi32(  ws[4],   ws[5],   ws[6],   ws[7],
									  li.neg(ws[4]),li.neg(ws[5]),li.neg(ws[6]),li.neg(ws[7]),
									    ws[4],   ws[5],   ws[6],   ws[7],
									  li.neg(ws[4]),li.neg(ws[5]),li.neg(ws[6]),li.neg(ws[7]));
				pp=(__m512i*)p;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_i64x2(*pp,*pp,_MM_PERM_CCAA);
					__m512i y=_mm512_shuffle_i64x2(*pp,*pp,_MM_PERM_DDBB);
					*pp=l5.add(x,l5.mul(y,val));
				}
				val=_mm512_setr_epi32(  ws[8],    ws[9],    ws[10],   ws[11],
									    ws[12],   ws[13],   ws[14],   ws[15],
									  li.neg(ws[8]), li.neg(ws[9]), li.neg(ws[10]),li.neg(ws[11]),
									  li.neg(ws[12]),li.neg(ws[13]),li.neg(ws[14]),li.neg(ws[15]));
				pp=(__m512i*)p;
				for(ui j=0;j<len;j+=16,++pp){
					__m512i x=_mm512_shuffle_i64x2(*pp,*pp,_MM_PERM_BABA);
					__m512i y=_mm512_shuffle_i64x2(*pp,*pp,_MM_PERM_DCDC);
					*pp=l5.add(x,l5.mul(y,val));
				}
				for(ui l=32;l<=len;l<<=1){
					ui mid=(l>>1);
					for(ui j=0;j<len;j+=l){
						p1=(__m512i*)(p+j),p2=(__m512i*)(p+j+mid),ww=(__m512i*)(ws+mid);
						for(ui i=0;i<mid;i+=16,++p1,++p2,++ww){
							__m512i x=*p1,y=l5.mul(*p2,*ww);
							*p1=l5.add(x,y);
							*p2=l5.sub(x,y);
						}
					}
				}
				if(inverse_coef){
					__m512i co=_mm512_set1_epi32(_inv[len-1]);
					pp=(__m512i*)p;
					for(ui i=0;i<len;i+=16,++pp) (*pp)=l5.mul(*pp,co);
				}
			}
			else{
				dit(p,n-2,false);dit(p+(1<<(n-2)),n-2,false);dit(p+(1<<(n-1)),n-2,false);dit(p+(1<<(n-2))*3,n-2,false);
				__m512i *p1=(__m512i*)(p),*p2=(__m512i*)(p+(len>>2)),*p3=(__m512i*)(p+(len>>1)),*p4=(__m512i*)(p+(len>>2)*3),*w1=(__m512i*)(ws+(len>>1)),
				*w2=(__m512i*)(ws+(len>>1)+(len>>2)),*w3=(__m512i*)(ws+(len>>2));
				for(ui i=0;i<(len>>2);i+=16,++p1,++p2,++p3,++p4,++w2,++w3,++w1){
					__m512i x=(*(p1)),y=(*(p2)),z=(*(p3)),w=(*(p4));
					__m512i h=l5.mul(y,*w3),
							k=l5.mul(w,*w3);
					__m512i	t=l5.mul(l5.add(z,k),*w1),q=l5.mul(l5.sub(z,k),*w2);
					__m512i r=l5.add(x,h),s=l5.sub(x,h);
					(*(p1))=l5.add(r,t);(*(p2))=l5.add(s,q);
					(*(p3))=l5.sub(r,t);(*(p4))=l5.sub(s,q);
				}
				if(inverse_coef){
					__m512i co=_mm512_set1_epi32(_inv[len-1]);
					p1=(__m512i*)p;
					for(ui i=0;i<len;i+=16,++p1) (*p1)=l5.mul(*p1,co);
				}
			}
		}
		void dif_xni(ui* restrict arr,ui n){
			ui* restrict ws=ws0.get();
			ui len=(1<<n);
			if(len<=4){
				for(ui i=0;i<len;++i) arr[i]=li.mul(arr[i],ws[(len<<1)+i]);
			}else{
				__m256i restrict *p1=(__m256i*)arr,*p2=(__m256i*)(ws+(len<<1));
				for(ui i=0;i<len;i+=8,++p1,++p2) *p1=la.mul(*p1,*p2);
			}
			dif(arr,n);
		}
		void dit_xni(ui* restrict arr,ui n){
			dit(arr,n);
			ui* restrict ws=ws1.get();
			ui len=(1<<n);
			if(len<=4){
				for(ui i=0;i<len;++i) arr[i]=li.mul(arr[i],ws[(len<<1)+i]);
			}else{
				__m256i restrict *p1=(__m256i*)arr,*p2=(__m256i*)(ws+(len<<1));
				for(ui i=0;i<len;i+=8,++p1,++p2) *p1=la.mul(*p1,*p2);
			}
		}
		void internal_mul(ui* restrict src1,ui* restrict src2,ui* restrict dst,ui m){
			dif(src1,m);
			dif(src2,m);
			if((1<<m)<8){
				for(ui i=0;i<(1<<m);++i) dst[i]=li.mul(src1[i],src2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)src1, *p2=(__m256i*)src2, *p3=(__m256i*)dst;
				for(ui i=0;i<(1<<m);i+=8,++p1,++p2,++p3) *p3=la.mul(*p1,*p2);
			}
			dit(dst,m);
		}
		void internal_transpose_mul(ui* restrict src1,ui* restrict src2,ui* restrict dst,ui m){
			std::reverse(src1,src1+(1<<m));
			internal_mul(src1,src2,dst,m);
			std::reverse(dst,dst+(1<<m));
		}
		void internal_inv(ui* restrict src,ui* restrict dst,ui* restrict tmp,ui* restrict tmp2,ui len){//10E(n) x^n->x^{2n}
			if(len==1){dst[0]=_fastpow(src[0],P-2);return;}
			internal_inv(src,dst,tmp,tmp2,len>>1);
			std::memcpy(tmp,src,sizeof(ui)*len);std::memcpy(tmp2,dst,sizeof(ui)*(len>>1));std::memset(tmp2+(len>>1),0,sizeof(ui)*(len>>1));
			std::memset(dst+(len>>1),0,sizeof(ui)*(len>>1));
			dif(tmp,__builtin_ctz(len));dif(tmp2,__builtin_ctz(len));
			if(len<=4){
				for(ui i=0;i<len;++i) tmp[i]=li.mul(tmp[i],tmp2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp2,*p2=(__m256i*)tmp;
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p2)=la.mul((*p1),(*p2));
			}
			dit(tmp,__builtin_ctz(len));std::memset(tmp,0,sizeof(ui)*(len>>1));dif(tmp,__builtin_ctz(len));
			if(len<=4){
				for(ui i=0;i<len;++i) tmp[i]=li.mul(tmp[i],tmp2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp2,*p2=(__m256i*)tmp;
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p2)=la.mul((*p1),(*p2));
			}
			dit(tmp,__builtin_ctz(len));
			if(len<=8){
				for(ui i=(len>>1);i<len;++i) dst[i]=li.neg(tmp[i]);
			}else{
				__m256i restrict *p1=(__m256i*)(tmp+(len>>1)),*p2=(__m256i*)(dst+(len>>1));
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p2)=la.sub(_mm256_setzero_si256(),(*p1));
			}
		}
		void internal_inv_faster(ui* restrict src,ui* restrict dst,ui* restrict tmp,ui* restrict tmp2,ui* restrict tmp3,ui len){//9E(n) x^n->x^{2n}
			if(len==1){dst[0]=_fastpow(src[0],P-2);return;}
			internal_inv_faster(src,dst,tmp,tmp2,tmp3,len>>1);
			std::memcpy(tmp,src,sizeof(ui)*(len>>1));std::memcpy(tmp2,dst,sizeof(ui)*(len>>1));
			if(len<=8){
				ui mip=ws0[3];
				for(ui i=0;i<(len>>1);++i) tmp[i]=li.add(tmp[i],li.mul(mip,src[i+(len>>1)]));
			}
			else{
				__m256i mip=_mm256_set1_epi32(ws0[3]);
				__m256i restrict *p1=(__m256i*)(src+(len>>1)),*p2=(__m256i*)tmp;
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p2)=la.add((*p2),la.mul((*p1),mip));
			}
			dif_xni(tmp,__builtin_ctz(len>>1));
			dif_xni(tmp2,__builtin_ctz(len>>1));
			if(len<=8){
				for(ui i=0;i<(len>>1);++i) tmp[i]=li.mul(li.mul(tmp2[i],tmp2[i]),tmp[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp2,*p2=(__m256i*)tmp;
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p2)=la.mul((*p2),la.mul((*p1),(*p1)));
			}
			dit_xni(tmp,__builtin_ctz(len>>1));
			std::memcpy(tmp2,src,sizeof(ui)*len);std::memcpy(tmp3,dst,sizeof(ui)*(len>>1));std::memset(tmp3+(len>>1),0,sizeof(ui)*(len>>1));
			dif(tmp2,__builtin_ctz(len));dif(tmp3,__builtin_ctz(len));
			if(len<=8){
				for(ui i=0;i<len;++i) tmp2[i]=li.mul(li.mul(tmp3[i],tmp3[i]),tmp2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp3,*p2=(__m256i*)tmp2;
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p2)=la.mul((*p2),la.mul((*p1),(*p1)));
			}
			dit(tmp2,__builtin_ctz(len));
			if(len<=8){
				ui mip=ws0[3],iv2=_inv[1];
				for(ui i=0;i<(len>>1);++i) dst[i+(len>>1)]=li.mul(li.sub(li.mul(li.sub(li.add(tmp[i],tmp2[i]),li.add(dst[i],dst[i])),mip),tmp2[i+(len>>1)]),iv2);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp,*p2=(__m256i*)(tmp2+(len>>1)),*p3=(__m256i*)(tmp2),*p4=(__m256i*)(dst+(len>>1)),*p5=(__m256i*)(dst);
				__m256i mip=_mm256_set1_epi32(ws0[3]),iv2=_mm256_set1_epi32(_inv[1]);
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2,++p3,++p4,++p5) (*p4)=la.mul(la.sub(la.mul(la.sub(la.add((*p1),(*p3)),la.add((*p5),(*p5))),mip),(*p2)),iv2);
			}
		}
		void internal_ln(ui* restrict src,ui* restrict dst,ui* restrict tmp1,ui* restrict tmp2,ui* restrict tmp3,ui len){
			#if defined(__AVX__) && defined(__AVX2__)
			ui pos=1;__m256i restrict *pp=(__m256i*)tmp1,*iv=(__m256i*)num.get();ui restrict *p1=src+1;
			for(;pos+8<=len;pos+=8,p1+=8,++pp,++iv) *pp=la.mul(_mm256_loadu_si256((__m256i*)p1),*iv);
			for(;pos<len;++pos) tmp1[pos-1]=li.mul(src[pos],num[pos-1]);tmp1[len-1]=li.v(0);
			#else
			ui restrict *p1=src+1,*p2=tmp1,*p3=num.get();
			for(ui i=1;i<len;++i,++p1,++p2,++p3) *p2=li.mul((*p1),(*p3));
			tmp1[len-1]=li.v(0);
			#endif
			internal_inv(src,dst,tmp2,tmp3,len);
			std::memset(dst+len,0,sizeof(ui)*len);std::memset(tmp1+len,0,sizeof(ui)*len);
			internal_mul(tmp1,dst,tmp2,__builtin_ctz(len<<1));
			#if defined(__AVX__) && defined(__AVX2__)
			ui ps=1;__m256i restrict *pp0=(__m256i*)tmp2,*iv0=(__m256i*)_inv.get();ui restrict *p10=dst+1;
			for(;ps+8<=len;ps+=8,p10+=8,++pp0,++iv0) _mm256_storeu_si256((__m256i*)p10,la.mul(*pp0,*iv0));
			dst[0]=li.v(0);
			for(;ps<len;++ps) dst[ps]=li.mul(_inv[ps-1],tmp2[ps-1]);
			#else
			dst[0]=li.v(0);
			ui restrict *p10=dst+1,*p20=tmp2,*p30=_inv.get();
			for(ui i=1;i<len;++i,++p10,++p20,++p30) *p10=li.mul((*p20),(*p30));
			#endif
		}
		void internal_ln_faster(ui* restrict src,ui* restrict dst,ui* restrict tmp1,ui* restrict tmp2,ui* restrict tmp3,ui* restrict tmp4,ui len){
			if(len<4){internal_ln(src,dst,tmp1,tmp2,tmp3,len);return;}
			#if defined(__AVX__) && defined(__AVX2__)
			ui pos=1;__m256i restrict *pp=(__m256i*)tmp1,*iv=(__m256i*)num.get();ui restrict *p1=src+1;
			for(;pos+8<=len;pos+=8,p1+=8,++pp,++iv) *pp=la.mul(_mm256_loadu_si256((__m256i*)p1),*iv);
			for(;pos<len;++pos) tmp1[pos-1]=li.mul(src[pos],num[pos-1]);tmp1[len-1]=li.v(0);
			#else
			ui restrict *p1=src+1,*p2=tmp1,*p3=num.get();
			for(ui i=1;i<len;++i,++p1,++p2,++p3) *p2=li.mul((*p1),(*p3));
			tmp1[len-1]=li.v(0);
			#endif
			internal_inv_faster(src,dst,tmp2,tmp3,tmp4,(len>>1));//tmp4=F_{n/2}(g0 mod x^{n/4})
			std::memcpy(dst,dst+(len>>2),sizeof(ui)*(len>>2));std::memset(dst+(len>>2),0,sizeof(ui)*(len>>2));
			dif(dst,__builtin_ctz(len>>1));
			std::memcpy(tmp2,tmp1+(len>>2),sizeof(ui)*(len>>2));std::memset(tmp2+(len>>2),0,sizeof(ui)*(len>>2));
			dif(tmp2,__builtin_ctz(len>>1));
			std::memset(tmp1+(len>>2),0,sizeof(ui)*(len>>2));
			dif(tmp1,__builtin_ctz(len>>1));
			if(len<=8){
				for(ui i=0;i<(len>>1);++i) tmp2[i]=li.add(li.mul(tmp2[i],tmp4[i]),li.mul(dst[i],tmp1[i])),tmp1[i]=li.mul(tmp1[i],tmp4[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp2,*p2=(__m256i*)tmp4,*p3=(__m256i*)dst,*p4=(__m256i*)tmp1;
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2,++p3,++p4) (*p1)=la.add(la.mul((*p1),(*p2)),la.mul((*p3),(*p4))),(*p4)=la.mul((*p2),(*p4));
			}
			dit(tmp1,__builtin_ctz(len>>1));dit(tmp2,__builtin_ctz(len>>1));
			if(len<=16){
				for(ui i=0;i<(len>>2);++i) tmp1[i+(len>>2)]=li.add(tmp1[i+(len>>2)],tmp2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)(tmp1+(len>>2)),*p2=(__m256i*)tmp2;
				for(ui i=0;i<(len>>2);i+=8,++p1,++p2) (*p1)=la.add((*p1),(*p2));
			}
			std::memcpy(tmp2,tmp1,sizeof(ui)*(len>>1));
			std::memset(tmp2+(len>>1),0,sizeof(ui)*(len>>1));
			dif(tmp2,__builtin_ctz(len));std::memcpy(tmp3,src,sizeof(ui)*len);
			dif(tmp3,__builtin_ctz(len));
			if(len<=4){
				for(ui i=0;i<len;++i) tmp3[i]=li.mul(tmp3[i],tmp2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)(tmp3),*p2=(__m256i*)tmp2;
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p1)=la.mul((*p1),(*p2));
			}
			dit(tmp3,__builtin_ctz(len));
			if(len<=8){
				for(ui i=0;i<(len>>1);++i) tmp3[i+(len>>1)]=li.sub(tmp1[i+(len>>1)],tmp3[i+(len>>1)]);
			}
			else{
				__m256i restrict *p1=(__m256i*)(tmp3+(len>>1)),*p2=(__m256i*)(tmp1+(len>>1));
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p1)=la.sub((*p2),(*p1));
			}
			std::memcpy(tmp3,tmp3+(len>>2)*3,sizeof(ui)*(len>>2));std::memset(tmp3+(len>>2),0,sizeof(ui)*(len>>2));
			std::memcpy(tmp2,tmp3+(len>>1),sizeof(ui)*(len>>2));std::memset(tmp2+(len>>2),0,sizeof(ui)*(len>>2));
			dif(tmp3,__builtin_ctz(len>>1));dif(tmp2,__builtin_ctz(len>>1));
			if(len<=8){
				for(ui i=0;i<(len>>1);++i) tmp3[i]=li.add(li.mul(tmp3[i],tmp4[i]),li.mul(dst[i],tmp2[i])),tmp2[i]=li.mul(tmp2[i],tmp4[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp3,*p2=(__m256i*)tmp4,*p3=(__m256i*)dst,*p4=(__m256i*)tmp2;
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2,++p3,++p4) (*p1)=la.add(la.mul((*p1),(*p2)),la.mul((*p3),(*p4))),(*p4)=la.mul((*p2),(*p4));
			}
			dit(tmp3,__builtin_ctz(len>>1));dit(tmp2,__builtin_ctz(len>>1));
			std::memcpy(tmp1+(len>>1),tmp2,sizeof(mi)*(len>>2));
			if(len<=16){
				for(ui i=0;i<(len>>2);++i) tmp1[i+(len>>2)*3]=li.add(tmp2[i+(len>>2)],tmp3[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)(tmp1+(len>>2)*3),*p2=(__m256i*)(tmp2+(len>>2)),*p3=(__m256i*)(tmp3);
				for(ui i=0;i<(len>>2);i+=8,++p1,++p2,++p3) (*p1)=la.add((*p2),(*p3));
			}
			ui ps=1;__m256i restrict *pp0=(__m256i*)tmp1,*iv0=(__m256i*)_inv.get();ui restrict *p10=dst+1;
			for(;ps+8<=len;ps+=8,p10+=8,++pp0,++iv0) _mm256_storeu_si256((__m256i*)p10,la.mul(*pp0,*iv0));
			dst[0]=li.v(0);
			for(;ps<len;++ps) dst[ps]=li.mul(_inv[ps-1],tmp1[ps-1]);
		}
		void internal_exp(ui* restrict src,ui* restrict dst,ui* restrict gn,ui* restrict gxni,
						  ui* restrict h,ui* restrict tmp1,ui* restrict tmp2,ui* restrict tmp3,ui len,bool calc_h=false){
			if(len==1){dst[0]=li.v(1);return;}
			else if(len==2){dst[0]=li.v(1);dst[1]=src[1];gn[0]=li.add(dst[0],dst[1]),gn[1]=li.sub(dst[0],dst[1]);gxni[0]=li.add(li.mul(dst[1],ws0[3]),dst[0]);h[0]=li.v(1);h[1]=li.neg(dst[1]);return;}
			internal_exp(src,dst,gn,gxni,h,tmp1,tmp2,tmp3,(len>>1),true);
			{
				ui pos=1;__m256i restrict *pp=(__m256i*)tmp1,*iv=(__m256i*)num.get();ui restrict *p1=src+1;
				for(;pos+8<=(len>>1);pos+=8,p1+=8,++pp,++iv) *pp=la.mul(_mm256_loadu_si256((__m256i*)p1),*iv);
				for(;pos<(len>>1);++pos) tmp1[pos-1]=li.mul(src[pos],num[pos-1]);tmp1[(len>>1)-1]=li.v(0);
			}
			dif(tmp1,__builtin_ctz(len>>1));
			if(len<8){
				for(ui i=0;i<(len>>1);++i) tmp1[i]=li.mul(tmp1[i],gn[i]);
			}else{
				__m256i restrict *p1=(__m256i*)(tmp1),*p2=(__m256i*)(gn);
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p1)=la.mul((*p1),(*p2));
			}
			dit(tmp1,__builtin_ctz(len>>1));
			{
				ui pos=1;__m256i restrict *pp=(__m256i*)tmp1,*iv=(__m256i*)num.get();ui restrict *p1=dst+1;
				for(;pos+8<=(len>>1);pos+=8,p1+=8,++pp,++iv) *pp=la.sub(la.mul(_mm256_loadu_si256((__m256i*)p1),*iv),(*pp));
				for(;pos<(len>>1);++pos) tmp1[pos-1]=li.sub(li.mul(dst[pos],num[pos-1]),tmp1[pos-1]);tmp1[(len>>1)-1]=li.neg(tmp1[(len>>1)-1]);
			}
			std::memmove(tmp1+1,tmp1,sizeof(ui)*(len>>1));tmp1[0]=tmp1[(len>>1)];
			std::memset(tmp1+(len>>1),0,sizeof(ui)*(len>>1));
			dif(tmp1,__builtin_ctz(len));
			std::memcpy(tmp3,h,sizeof(ui)*(len>>1));std::memset(tmp3+(len>>1),0,sizeof(ui)*(len>>1));
			dif(tmp3,__builtin_ctz(len));
			if(len<8){
				for(ui i=0;i<len;++i) tmp1[i]=li.mul(tmp3[i],tmp1[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp1,*p2=(__m256i*)tmp3;
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p1)=la.mul((*p1),(*p2));
			}
			dit(tmp1,__builtin_ctz(len));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=8){
				for(ui i=0;i<(len>>1);++i) tmp2[i]=li.sub(src[i+(len>>1)],li.mul(_inv[i+(len>>1)-1],tmp1[i]));
			}else{
				__m256i restrict *p1=(__m256i*)tmp1,*p3=(__m256i*)(tmp2),*p4=(__m256i*)(src+(len>>1));ui* restrict p2=_inv.get()+(len>>1)-1;
				for(ui i=0;i<(len>>1);i+=8,++p1,p2+=8,++p3,++p4) (*p3)=la.sub((*p4),la.mul((*p1),_mm256_loadu_si256((__m256i*)p2)));
			}
			#else
			{
				for(ui i=0;i<(len>>1);++i) tmp2[i]=li.sub(src[i+(len>>1)],li.mul(_inv[i+(len>>1)-1],tmp1[i]));
			}
			#endif
			std::memset(tmp2+(len>>1),0,sizeof(ui)*(len>>1));
			dif(tmp2,__builtin_ctz(len));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=16){
				ui mip=ws1[3];
				for(ui i=0;i<(len>>2);++i) tmp1[i]=li.mul(li.mul(li.add(dst[i],li.mul(dst[i+(len>>2)],mip)),ws0[(len>>1)+i]),ws0[(len>>2)+i]);
			}else{
				__m256i restrict *p1=(__m256i*)dst,*p2=(__m256i*)(dst+(len>>2)),*p3=(__m256i*)(tmp1),*p4=(__m256i*)(ws0.get()+(len>>1)),*p5=(__m256i*)(ws0.get()+(len>>2));
				__m256i mip=_mm256_set1_epi32(ws1[3]);
				for(ui i=0;i<(len>>2);i+=8,++p1,++p2,++p3,++p4,++p5) (*p3)=la.mul(la.add((*p1),la.mul((*p2),mip)),la.mul((*p4),(*p5)));
			}
			#else
			{
				ui mip=ws1[3];
				for(ui i=0;i<(len>>2);++i) tmp1[i]=li.mul(li.mul(li.add(dst[i],li.mul(dst[i+(len>>2)],mip)),ws0[(len>>1)+i]),ws0[(len>>2)+i]);
			}
			#endif
			dif(tmp1,__builtin_ctz(len>>2));
			std::memcpy(tmp1+(len>>2)*3,tmp1,sizeof(ui)*(len>>2));
			std::memcpy(tmp1,gn,sizeof(ui)*(len>>1));
			std::memcpy(tmp1+(len>>1),gxni,sizeof(ui)*(len>>2));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=4){
				for(ui i=0;i<len;++i) tmp1[i]=li.mul(tmp2[i],tmp1[i]);
			}else{
				__m256i restrict *p1=(__m256i*)tmp1,*p2=(__m256i*)(tmp2);
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p1)=la.mul((*p1),(*p2));
			}
			#else
			for(ui i=0;i<len;++i) tmp1[i]=li.mul(tmp2[i],tmp1[i]);
			#endif
			dit(tmp1,__builtin_ctz(len));
			std::memcpy(dst+(len>>1),tmp1,sizeof(ui)*(len>>1));
			//inv iteration start
			if(!calc_h) return;
			std::memcpy(gxni,dst,sizeof(ui)*(len>>1));std::memcpy(tmp2,h,sizeof(ui)*(len>>1));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=8){
				ui mip=ws0[3];
				for(ui i=0;i<(len>>1);++i) gxni[i]=li.add(gxni[i],li.mul(mip,dst[i+(len>>1)]));
			}
			else{
				__m256i mip=_mm256_set1_epi32(ws0[3]);
				__m256i restrict *p1=(__m256i*)(dst+(len>>1)),*p2=(__m256i*)gxni;
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p2)=la.add((*p2),la.mul((*p1),mip));
			}
			#else
			{
				ui mip=ws0[3];
				for(ui i=0;i<(len>>1);++i) gxni[i]=li.add(gxni[i],li.mul(mip,dst[i+(len>>1)]));
			}
			#endif
			dif_xni(gxni,__builtin_ctz(len>>1));
			dif_xni(tmp2,__builtin_ctz(len>>1));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=8){
				for(ui i=0;i<(len>>1);++i) tmp2[i]=li.mul(li.mul(tmp2[i],gxni[i]),tmp2[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp2,*p2=(__m256i*)gxni;
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2) (*p1)=la.mul((*p2),la.mul((*p1),(*p1)));
			}
			#else
			for(ui i=0;i<(len>>1);++i) tmp2[i]=li.mul(li.mul(tmp2[i],gxni[i]),tmp2[i]);
			#endif
			dit_xni(tmp2,__builtin_ctz(len>>1));
			std::memcpy(gn,dst,sizeof(ui)*len);
			dif(gn,__builtin_ctz(len));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=8){
				for(ui i=0;i<len;++i) tmp3[i]=li.mul(li.mul(tmp3[i],gn[i]),tmp3[i]);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp3,*p2=(__m256i*)gn;
				for(ui i=0;i<len;i+=8,++p1,++p2) (*p1)=la.mul((*p2),la.mul((*p1),(*p1)));
			}
			#else
			for(ui i=0;i<len;++i) tmp3[i]=li.mul(li.mul(tmp3[i],gn[i]),tmp3[i]);
			#endif
			dit(tmp3,__builtin_ctz(len));
			#if defined(__AVX__) && defined(__AVX2__)
			if(len<=8){
				ui mip=ws0[3],iv2=_inv[1];
				for(ui i=0;i<(len>>1);++i) h[i+(len>>1)]=li.mul(li.sub(li.mul(li.sub(li.add(tmp2[i],tmp3[i]),li.add(h[i],h[i])),mip),tmp3[i+(len>>1)]),iv2);
			}
			else{
				__m256i restrict *p1=(__m256i*)tmp2,*p2=(__m256i*)(tmp3+(len>>1)),*p3=(__m256i*)(tmp3),*p4=(__m256i*)(h+(len>>1)),*p5=(__m256i*)(h);
				__m256i mip=_mm256_set1_epi32(ws0[3]),iv2=_mm256_set1_epi32(_inv[1]);
				for(ui i=0;i<(len>>1);i+=8,++p1,++p2,++p3,++p4,++p5) (*p4)=la.mul(la.sub(la.mul(la.sub(la.add((*p1),(*p3)),la.add((*p5),(*p5))),mip),(*p2)),iv2);
			}
			#else
			{
				ui mip=ws0[3],iv2=_inv[1];
				for(ui i=0;i<(len>>1);++i) h[i+(len>>1)]=li.mul(li.sub(li.mul(li.sub(li.add(tmp2[i],tmp3[i]),li.add(h[i],h[i])),mip),tmp3[i+(len>>1)]),iv2);
			}
			#endif
		}
		void internal_multipoint_eval_interpolation_calc_Q(std::vector<poly> &Q_storage,const poly &input_coef,ui l,ui r,ui id);
		void internal_multipoint_eval_interpolation_calc_P(const std::vector<poly> &Q_storage,std::vector<poly> &P_stack,poly &result_coef,ui l,ui r,ui id,ui dep);
		void internal_lagrange_interpolation_dvc_mul(ui l,ui r,const poly &a,ui id,std::vector<std::pair<poly,poly>> &R_storage);
		void internal_lagrange_interpolation_calc_P(const std::vector<std::pair<poly,poly>> &R_storage,std::vector<poly> &P_stack,poly &result_coef,ui l,ui r,ui id,ui dep);
		poly internal_lagrange_interpolation_dvc_mul_ans(ui l,ui r,const poly &a,ui id,const std::vector<std::pair<poly,poly>> &R_storage);
		lmi li;
		#if defined(__AVX__) && defined(__AVX2__)
		lma la;
		#endif
		#if defined(__AVX512F__) && defined(__AVX512DQ__)
		lm5 l5;
		#endif
	public:
		polynomial_kernel_ntt(ui max_conv_size,ui P0,ui G0){init(max_conv_size,P0,G0);}
		void init(ui max_conv_size,ui P0,ui G0){
			if(max_conv_size>=(1u<<30)) throw std::runtime_error("invalid range!");
			max_conv_size=std::max(max_conv_size,16u);
			li=lmi(P0);
			#if defined(__AVX__) && defined(__AVX2__)
			la=lma(P0);
			#endif
			#if defined(__AVX512F__) && defined(__AVX512DQ__)
			l5=lm5(P0);
			#endif
			release();P=P0,G=G0;mx=max_conv_size;
			fn=1;fb=0;while(fn<(max_conv_size<<1)) fn<<=1,++fb;
			if((P0-1)%fn) throw std::runtime_error("invalid range!");
			_inv=create_aligned_array<ui,64>(fn+32);ws0 =create_aligned_array<ui,64>(fn+32);
			ws1 =create_aligned_array<ui,64>(fn+32);num =create_aligned_array<ui,64>(fn+32);
			for(ui i=0;i<tmp_size;++i)	tt[i] =create_aligned_array<ui,64>(fn+32);
			_inv[0]=li.v(1);for(ui i=2;i<=fn+32;++i) _inv[i-1]=li.mul(li.v(P-P/i),_inv[(P%i)-1]);
			for(ui i=1;i<=fn+32;++i) num[i-1]=li.v(i);
			ui j0=_fastpow(li.v(G),(P-1)/fn),j1=_fastpow(_fastpow(li.v(G),(P-2)),(P-1)/fn);
			for(ui mid=(fn>>1);mid>=1;mid>>=1,j0=li.mul(j0,j0),j1=li.mul(j1,j1)){
				ui w0=li.v(1),w1=li.v(1);
				for(ui i=0;i<mid;++i,w0=li.mul(w0,j0),w1=li.mul(w1,j1)) ws0[i+mid]=w0,ws1[i+mid]=w1;
			}
		}
		polynomial_kernel_ntt(const polynomial_kernel_ntt &d);
		polynomial_kernel_ntt(){fn=fb=mx=0;}
		~polynomial_kernel_ntt(){release();}
		poly rev(const poly &a){
			poly ret(a);
			std::reverse(ret.begin(),ret.end());
			return ret;
		}
		poly mul(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size();if((!la) && (!lb)) return poly();
			if(la>mx || lb>mx) throw std::runtime_error("Convolution size out of range!");
			ui m=0;if(la+lb>2) m=32-__builtin_clz(la+lb-2);
			std::memcpy(tt[0].get(),&a[0],sizeof(ui)*la);std::memset(tt[0].get()+la,0,sizeof(ui)*((1<<m)-la));
			std::memcpy(tt[1].get(),&b[0],sizeof(ui)*lb);std::memset(tt[1].get()+lb,0,sizeof(ui)*((1<<m)-lb));
			internal_mul(tt[0].get(),tt[1].get(),tt[2].get(),m);
			poly ret(la+lb-1);
			std::memcpy(&ret[0],tt[2].get(),sizeof(ui)*(la+lb-1));
			return ret;
		}
		poly transpose_mul(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size();if((!la) && (!lb)) return poly();
			if(la>mx || lb>mx) throw std::runtime_error("Convolution size out of range!");
			ui m=0;if(la+lb>2) m=32-__builtin_clz(la+lb-2);
			std::memcpy(tt[0].get(),&a[0],sizeof(ui)*la);std::memset(tt[0].get()+la,0,sizeof(ui)*((1<<m)-la));
			std::memcpy(tt[1].get(),&b[0],sizeof(ui)*lb);std::memset(tt[1].get()+lb,0,sizeof(ui)*((1<<m)-lb));
			internal_transpose_mul(tt[0].get(),tt[1].get(),tt[2].get(),m);
			poly ret(la);
			std::memcpy(&ret[0],tt[2].get(),sizeof(ui)*(la));
			return ret;
		}
		poly multipoint_eval_interpolation(const poly &a,const poly &b);
		poly lagrange_interpolation(const std::vector<std::pair<mi,mi>> &a);
		poly inv(const poly &src){
			ui la=src.size();if(!la) throw std::runtime_error("Inversion calculation of empty polynomial!");
			if((la*4)>fn) throw std::runtime_error("Inversion calculation size out of range!");
			if(!li.rv(src[0].get_val())){
				throw std::runtime_error("Inversion calculation of polynomial which has constant not equal to 1!");
			}
			ui m=0;if(la>1) m=32- __builtin_clz(la-1);
			std::memcpy(tt[0].get(),&src[0],sizeof(ui)*la);std::memset(tt[0].get()+la,0,sizeof(ui)*((1<<m)-la));
			// internal_inv(tt[0].get(),tt[1].get(),tt[2].get(),tt[3].get(),(1<<m));
			internal_inv_faster(tt[0].get(),tt[1].get(),tt[2].get(),tt[3].get(),tt[4].get(),(1<<m));
			poly ret(la);
			std::memcpy(&ret[0],tt[1].get(),sizeof(ui)*la);
			return ret;
		}
		poly ln(const poly &src){
			ui la=src.size();if(!la) throw std::runtime_error("Ln calculation of empty polynomial!");
			if((la*2)>fn) throw std::runtime_error("Ln calculation size out of range!");
			if(li.rv(src[0].get_val())!=1){
				throw std::runtime_error("Ln calculation of polynomial which has constant not equal to 1!");
			}
			ui m=0;if(la>1) m=32- __builtin_clz(la-1);
			std::memcpy(tt[0].get(),&src[0],sizeof(ui)*la);std::memset(tt[0].get()+la,0,sizeof(ui)*((1<<m)-la));
			// internal_ln(tt[0].get(),tt[1].get(),tt[2].get(),tt[3].get(),tt[4].get(),(1<<m));
			internal_ln_faster(tt[0].get(),tt[1].get(),tt[2].get(),tt[3].get(),tt[4].get(),tt[5].get(),(1<<m));
			poly ret(la);
			std::memcpy(&ret[0],tt[1].get(),sizeof(ui)*la);
			return ret;
		}
		poly exp(const poly &src){
			ui la=src.size();if(!la) throw std::runtime_error("Exp calculation of empty polynomial!");
			if((la*2)>fn) throw std::runtime_error("Exp calculation size out of range!");
			if(li.rv(src[0].get_val())!=0){
				throw std::runtime_error("Exp calculation of polynomial which has constant not equal to 0!");
			}
			ui m=0;if(la>1) m=32- __builtin_clz(la-1);
			std::memcpy(tt[0].get(),&src[0],sizeof(ui)*la);std::memset(tt[0].get()+la,0,sizeof(ui)*((1<<m)-la));
			internal_exp(tt[0].get(),tt[1].get(),tt[2].get(),tt[3].get(),tt[4].get(),tt[5].get(),tt[6].get(),tt[7].get(),(1<<m));
			poly ret(la);
			std::memcpy(&ret[0],tt[1].get(),sizeof(ui)*la);
			return ret;
		}
		poly derivative(const poly &a){
			if(!a.size()) throw std::runtime_error("Derivative calculation of empty polynomial!");
			if(a.size()>fn) throw std::runtime_error("Derivative calculation size out of range!");
			ui len=a.size();poly ret(len-1);
			for(ui i=0;i<len-1;++i) ret[i]=ui2mi(li.mul(li.v(i+1),a[i+1].get_val()));
			return ret;
		}
		poly integrate(const poly &a){
			if(!a.size()) throw std::runtime_error("Integrate calculation of empty polynomial!");
			if(a.size()>=fn) throw std::runtime_error("Integrate calculation size out of range!");
			ui len=a.size();poly ret(len+1);ret[0]=ui2mi(li.v(0));
			for(ui i=1;i<=len;++i) ret[i]=ui2mi(li.mul(_inv[i-1],a[i-1].get_val()));
			return ret;
		}
		poly add(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size(),len=std::max(la,lb);poly ret(len);
			std::memcpy(&ret[0],&b[0],sizeof(mi)*(lb));
			for(ui i=0;i<la;++i) ret[i]=ui2mi(li.add(ret[i].get_val(),a[i].get_val()));
			return ret;
		}
		poly sub(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size(),len=std::max(la,lb);poly ret(len);
			std::memcpy(&ret[0],&a[0],sizeof(mi)*(la));
			for(ui i=0;i<lb;++i) ret[i]=ui2mi(li.sub(ret[i].get_val(),b[i].get_val()));
			return ret;
		}
	};
	class polynomial_kernel_mtt
	{
	private:
		static constexpr ui P1=167772161,G1=3,P2=469762049,G2=3,P3=754974721,G3=11,I1=104391568,I2=190329765;
		polynomial_kernel_ntt k1,k2,k3;ui P,fn;lmi li,li1,li2,li3;
		fast_mod_32 F,F1,F2,F3;
		aligned_array<ui,64> _inv;
		void release(){
			k1.release();k2.release();k3.release();P=fn=0;
			_inv.reset();
		}
		ui _fastpow(ui a,ui b){ui ans=li.v(1),off=a;while(b){if(b&1) ans=li.mul(ans,off);off=li.mul(off,off);b>>=1;}return ans;}
	public:
		void init(ui max_conv_size,ui P0){
			if(P0>=(1u<<30) || !P0) throw std::runtime_error("invalid prime!");
			// if(!factorization::miller_rabin_u32(P0)) throw std::runtime_error("invalid prime!");
			try{
				release();
				k1.init(max_conv_size,P1,G1);
				k2.init(max_conv_size,P2,G2);
				k3.init(max_conv_size,P3,G3);
				P=P0;fn=k1.fn;li=lmi(P);
				F=fast_mod_32(P);
				if(P<=fn+32) throw std::runtime_error("invalid prime!");
				_inv=create_aligned_array<ui,64>(fn+32);
				_inv[0]=li.v(1);for(ui i=2;i<=fn+32;++i) _inv[i-1]=li.mul(li.v(P-P/i),_inv[(P%i)-1]);
			}catch(std::exception &e){
				P=0;
				throw std::runtime_error(e.what());
			}
		}
		polynomial_kernel_mtt(ui max_conv_size,ui P0):F1(P1),F2(P2),F3(P3),li1(P1),li2(P2),li3(P3){
			init(max_conv_size,P0);
		}
		polynomial_kernel_mtt(const polynomial_kernel_mtt &d):
		P(d.P),fn(d.fn),k1(d.k1),k2(d.k2),k3(d.k3),F1(P1),F2(P2),F3(P3),F(d.F),li(d.li),li1(P1),li2(P2),li3(P3){}
		polynomial_kernel_mtt():F1(P1),F2(P2),F3(P3),li1(P1),li2(P2),li3(P3){
			P=fn=0;
		}
		~polynomial_kernel_mtt(){release();}
		poly add(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size(),len=std::max(la,lb);poly ret(len);
			std::memcpy(&ret[0],&b[0],sizeof(mi)*(lb));
			for(ui i=0;i<la;++i) ret[i]=ui2mi(li.add(ret[i].get_val(),a[i].get_val()));
			return ret;
		}
		poly sub(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size(),len=std::max(la,lb);poly ret(len);
			std::memcpy(&ret[0],&a[0],sizeof(mi)*(la));
			for(ui i=0;i<lb;++i) ret[i]=ui2mi(li.sub(ret[i].get_val(),b[i].get_val()));
			return ret;
		}
		poly mul(const poly &a,const poly &b){
			ui la=a.size(),lb=b.size();if((!la) && (!lb)) return poly();
			if((la+lb-1)>fn) throw std::runtime_error("Convolution size out of range!");
			poly a1(a.size()),a2(a.size()),a3(a.size()),b1(b.size()),b2(b.size()),b3(b.size());
			for(ui i=0;i<a.size();++i){
				ui ra=li.rv(a[i].get_val());
				a1[i]=ui2mi(li1.v(ra));
				a2[i]=ui2mi(li2.v(ra));
				a3[i]=ui2mi(li3.v(ra));
			}
			for(ui i=0;i<b.size();++i){
				ui rb=li.rv(b[i].get_val());
				b1[i]=ui2mi(li1.v(rb));
				b2[i]=ui2mi(li2.v(rb));
				b3[i]=ui2mi(li3.v(rb));
			}
			poly r1=k1.mul(a1,b1),r2=k2.mul(a2,b2),r3=k3.mul(a3,b3);
			poly ret(la+lb-1);
			ui I3=F.reduce(1ull*P1*P2);
			for(int i=0;i<la+lb-1;++i){
				ui x1=li1.rv(r1[i].get_val());
				ui x2=li2.rv(r2[i].get_val());
				ui x3=li3.rv(r3[i].get_val());
				ui k1=F2.reduce(1ull*(x2-x1+P2)*I1);
				ull x4=1ull*k1*P1+x1;
				ui k2=F3.reduce((x3-F3.reduce(x4)+P3)*I2);
				ui x=F.reduce(x4+1ull*k2*I3);
				ret[i]=ui2mi(li.v(x));
			}
			return ret;
		}
		poly inv(const poly &a)
		{
			ui l(a.size());if(!l) throw std::runtime_error("Inversion calculation of empty polynomial!");
			if(l==1) return {ui2mi(_fastpow(a[0].get_val(),P-2))};
			poly b(a);ui k((l+1)/2);b.resize(k);
			b=inv(b);poly r(mul(b,a));r.resize(l);
			for(mi &v:r) v=ui2mi(li.neg(v.get_val()));
			r[0]=ui2mi(li.add(li.v(2),r[0].get_val()));
			r=mul(b,r);r.resize(l);return r;
		}
		poly ln(const poly &a)
		{
			ui l(a.size());poly b(l-1);
			for(ui i(1);i<l;++i) b[i-1]=ui2mi(li.mul(a[i].get_val(),li.v(i)));
			b=mul(b,inv(a));
			poly r(l);r[0]=ui2mi(li.v(0));
			for(ui i(1);i<l;++i) r[i]=ui2mi(li.mul(_inv[i-1],b[i-1].get_val()));
			return r;
		}
		poly exp(const poly &a)
		{
			ui l(a.size());
			if(l==1) return {ui2mi(li.v(1))};
			poly b(a);ui k((l+1)/2);b.resize(k);
			b=exp(b);poly c(b);c.resize(l);
			c=sub(a,ln(c));c[0]=ui2mi(li.add(li.v(1),c[0].get_val()));
			c=mul(b,c);c.resize(l);return c;
		}
		mi linear_recurrence(poly P,poly Q,ll k)
		{
			while(k)
			{
				poly nQ(Q);
				{
					ui l(nQ.size());
					for(ui i(1);i<l;i+=2) nQ[i]=ui2mi(li.neg(nQ[i].get_val()));
				}
				poly _lQ(mul(nQ,Q));
				{
					ui l(Q.size());
					for(ui i(0);i<l;i++) Q[i]=_lQ[i*2];
				}
				poly _lP(mul(P,nQ));
				{
					ui l(_lP.size());
					ui t((l+1-(k&1))/2);
					P.resize(t);
					for(ui i(0),j(k&1);i<t;++i,j+=2) P[i]=_lP[j];
				}
				k>>=1;
			}
			return P[0];
		}
	};
	#undef restrict
	#undef NTT_partition_size
}
using library::polynomial_kernel_ntt;
using library::poly;
using library::mi;
constexpr int N(5e4+10),p(998244353);
mi f[N];
void solve(int l,int r,polynomial_kernel_ntt &ker)
{
	if(l==r)
	{
		f[l]+=mi(3-l+p)*f[l-1];
		return;
	}
	int mid((l+r)>>1);
	solve(l,mid,ker);
	poly p1(mid-l+1);
	std::copy(f+l,f+mid+1,p1.begin());
	int len(std::min(mid,r-l));
	poly p2(len);for(int i(1);i<=len;++i) p2[i-1]=mi(i-1)*f[i];
	poly R=ker.mul(p1,p2);R.resize(r-l);
	for(int i(mid+1);i<=r;++i) f[i]+=R[i-l-1];
	if(l>1)
	{
		for(int i(l);i<=mid;++i) p1[i-l]=p1[i-l]*mi(i-1);
		len=std::min(l-1,r-l);
		p2.resize(len);std::copy(f+1,f+len+1,p2.begin());
		R=ker.mul(p1,p2);R.resize(r-l);
		for(int i(mid+1);i<=r;++i) f[i]+=R[i-l-1];
	}
	solve(mid+1,r,ker);
}
int main()
{
	std::ios::sync_with_stdio(false);
	std::cin.tie(nullptr);
	int T,n;std::cin>>T>>n;f[0]=1;
	polynomial_kernel_ntt ker(n+1,p,3);
	solve(1,n,ker);
	while(T--)
	{
		static int a[N];for(int i(1);i<=n;++i) std::cin>>a[i];
		static int stk[N];int top(0);
		bool ok(true);mi ans(1);
		for(int i(1);i<=n && ok;++i)
		{
			int cnt(0);
			while(top && stk[top]-a[stk[top]]+1>=i-a[i]+1) --top,++cnt;
			if(top && stk[top]>=i-a[i]+1){ok=false;break;}
			else stk[++top]=i,ans*=f[cnt];
		}
		if(!ok || top!=1) std::cout<<"0\n";
		else std::cout<<ans<<"\n";
	}
	// for(int i(1);i<=n;++i) std::cout<<f[i]<<" \n"[i==n];
	return 0;
}

Details

Tip: Click on the bar to expand more detailed information

Test #1:

score: 5
Accepted
time: 2ms
memory: 3704kb

input:

1 10
1 1 3 1 2 1 4 1 2 10

output:

64

result:

ok 1 number(s): "64"

Test #2:

score: 5
Accepted
time: 2ms
memory: 3704kb

input:

1 10
1 1 3 1 2 1 2 7 9 10

output:

0

result:

ok 1 number(s): "0"

Test #3:

score: 5
Accepted
time: 0ms
memory: 3600kb

input:

100 10
1 2 1 2 1 1 1 1 5 10
1 1 1 1 1 1 1 1 9 10
1 2 1 1 1 1 1 8 1 10
1 1 1 1 1 1 2 1 1 10
1 1 2 4 1 1 1 1 5 10
1 1 3 1 1 1 1 1 9 10
1 1 3 1 2 1 2 1 4 10
1 1 1 1 1 3 1 8 1 10
1 1 1 1 1 1 1 1 3 10
1 1 3 1 1 1 2 3 1 10
1 1 1 1 1 1 6 1 2 10
1 1 1 3 1 1 1 1 5 10
1 1 3 1 5 1 1 1 9 10
1 1 1 4 1 1 1 1 5 10...

output:

256
87360
2400
87360
128
2400
64
352
9600
704
704
128
128
128
2400
128
2400
2400
2400
64
128
1408
128
9600
443296
128
128
9600
9600
443296
0
443296
256
2400
0
64
87360
443296
443296
128
0
443296
0
256
443296
443296
1408
9600
128
443296
704
443296
0
443296
2400
704
443296
87360
2400
704
443296
87360
...

result:

ok 100 numbers

Test #4:

score: 5
Accepted
time: 0ms
memory: 3620kb

input:

100 10
1 1 1 1 1 1 7 1 2 10
1 1 1 1 1 6 1 1 3 10
1 1 1 1 1 1 1 3 1 10
1 2 1 2 1 1 3 1 1 10
1 1 1 1 5 1 7 1 1 10
1 1 1 1 1 2 1 1 1 10
1 2 1 1 1 4 1 1 3 10
1 1 1 1 1 2 1 8 1 10
1 1 1 1 5 1 1 1 2 10
1 1 1 2 1 1 1 1 8 10
1 1 1 1 5 1 1 1 6 10
1 1 1 1 1 5 1 8 1 10
1 2 1 2 1 2 1 1 3 10
1 1 1 1 1 1 1 1 1 10...

output:

2400
352
9600
704
128
87360
64
2400
512
2400
0
128
256
443296
9600
2400
87360
0
443296
87360
87360
1408
64
2400
64
443296
256
64
128
352
443296
0
443296
128
0
0
704
443296
704
443296
9600
128
2400
9600
87360
352
128
443296
128
704
0
1408
128
9600
443296
704
2400
2400
443296
0
128
2400
443296
9600
35...

result:

ok 100 numbers

Test #5:

score: 5
Accepted
time: 2ms
memory: 3760kb

input:

1 300
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 10...

output:

731696071

result:

ok 1 number(s): "731696071"

Test #6:

score: 5
Accepted
time: 0ms
memory: 3752kb

input:

1 300
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...

output:

654818711

result:

ok 1 number(s): "654818711"

Test #7:

score: 5
Accepted
time: 4ms
memory: 3716kb

input:

100 300
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 14 1 1 1 1 1 25 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 8 1 1 17 1...

output:

629582282
936488602
14854846
0
972850406
335643474
401801367
518359524
0
42431632
964390278
0
735641393
402072933
0
315498454
714104802
235533741
71997442
584545233
143385672
0
243017113
986234659
955040013
187506887
47264449
0
134183782
140053056
645326653
706116063
793388530
755635337
660249932
35...

result:

ok 100 numbers

Test #8:

score: 5
Accepted
time: 1ms
memory: 3760kb

input:

100 300
1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 16 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 1 9 1 1 1 1 46 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 1 13 1 1 1 1 21 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 12 1 40 1 1 1 1 1 1 1 1 1 1 ...

output:

213721346
242599023
719011592
0
422638207
728716801
710719728
177328520
0
443663955
995539661
0
0
0
582405109
301065834
482480764
189294107
882417753
263472408
583100647
887062292
558090353
374255923
176488287
958825157
132492749
27789769
297287037
0
0
2733249
643517198
0
20415575
953067857
74118762...

result:

ok 100 numbers

Test #9:

score: 5
Accepted
time: 3ms
memory: 3808kb

input:

1 997
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...

output:

495842312

result:

ok 1 number(s): "495842312"

Test #10:

score: 5
Accepted
time: 0ms
memory: 3816kb

input:

100 1000
1 1 1 2 4 1 6 1 2 3 11 1 1 2 1 4 1 1 1 2 5 1 11 1 1 2 3 1 18 1 2 1 1 1 6 1 1 2 4 5 1 1 3 9 1 1 2 1 5 1 7 1 2 1 1 2 1 4 24 1 1 1 1 3 1 1 7 1 2 1 2 61 1 2 3 1 2 1 4 8 9 82 1 2 3 1 1 6 1 2 9 1 1 3 1 5 1 1 8 1 1 3 1 1 23 1 2 1 4 1 1 7 1 2 3 4 1 2 1 8 1 2 1 12 1 1 2 1 1 2 1 1 3 4 8 1 2 3 15 1 1 ...

output:

639441690
325951010
772826730
707904063
750757689
685150437
3586144
711874830
882896695
312252618
0
0
625591512
0
0
953293767
310960913
665863278
61999458
536746177
269479775
755348118
0
816896717
0
0
0
890648230
0
435292445
0
528885058
277580975
916252312
0
958726742
640472413
682980893
661293405
0...

result:

ok 100 numbers

Test #11:

score: 5
Accepted
time: 9ms
memory: 3804kb

input:

100 1000
1 1 1 2 3 1 1 3 1 2 6 1 1 12 1 1 3 4 1 2 1 1 1 3 4 1 9 1 1 29 1 2 1 4 1 2 1 1 1 1 3 5 7 1 1 1 3 1 2 16 1 2 1 4 1 1 2 3 1 2 6 8 9 1 1 16 1 1 3 40 1 2 3 74 1 1 1 2 1 2 1 4 1 1 11 1 2 3 1 2 6 1 2 3 4 1 1 3 1 9 10 1 2 3 4 1 1 1 2 3 5 1 2 1 1 11 1 2 1 2 5 1 7 8 34 1 2 37 1 2 1 1 1 49 1 1 1 2 4 6...

output:

0
0
706651534
439786013
0
70411622
0
0
0
835773719
0
0
0
107157784
27212808
568144383
407628078
308471256
890881779
0
0
0
0
389644399
0
452029310
55646018
0
183690719
0
0
0
0
609100480
611850278
81354283
0
89798335
375537513
0
902011601
957022140
0
903873199
0
276820482
0
247728618
866180585
4914605...

result:

ok 100 numbers

Test #12:

score: 5
Accepted
time: 9ms
memory: 3804kb

input:

100 1000
1 1 1 2 3 1 1 3 1 2 6 1 1 12 1 1 3 4 1 2 1 1 1 3 4 1 9 1 1 29 1 2 1 4 1 2 1 1 1 1 3 5 7 1 1 1 3 1 2 16 1 2 1 4 1 1 2 3 1 2 6 8 9 1 1 16 1 1 3 40 1 2 3 74 1 1 1 2 1 2 1 4 1 1 11 1 2 3 1 2 6 1 2 3 4 1 1 3 1 9 10 1 2 3 4 1 1 1 2 3 5 1 2 1 1 11 1 2 1 2 5 1 7 8 34 1 2 37 1 2 1 1 1 49 1 1 1 2 4 6...

output:

0
0
706651534
439786013
0
70411622
0
0
0
835773719
0
0
0
107157784
27212808
568144383
407628078
308471256
890881779
0
0
0
0
389644399
0
452029310
55646018
0
183690719
0
0
0
0
609100480
611850278
81354283
0
89798335
375537513
0
902011601
957022140
0
903873199
0
276820482
0
247728618
866180585
4914605...

result:

ok 100 numbers

Test #13:

score: 5
Accepted
time: 27ms
memory: 4180kb

input:

100 5000
1 1 1 2 4 1 1 7 1 1 3 1 2 1 2 1 9 1 2 1 2 1 6 1 17 1 19 1 1 1 2 1 1 1 1 7 1 1 1 39 1 1 1 2 1 2 1 6 48 1 1 51 1 1 2 1 4 5 1 2 1 1 1 6 1 13 1 1 2 1 4 1 1 3 1 10 1 1 3 1 15 1 2 1 2 3 1 1 36 1 1 1 2 3 1 1 1 7 1 1 2 1 4 1 1 1 1 10 1 20 1 2 1 1 3 1 1 66 1 1 1 2 3 1 1 1 2 4 1 1 1 1 5 1 17 1 1 1 4 ...

output:

0
292588696
344051249
351335346
0
985560804
0
0
744141033
0
318360253
509056631
897631526
210708530
0
282499786
0
0
298272534
0
866815275
708474811
0
0
533348880
0
0
0
496288915
0
288732736
613115755
275961405
582960568
0
777792174
304810404
835564816
0
809847872
0
0
890711747
27774048
0
838137435
8...

result:

ok 100 numbers

Test #14:

score: 5
Accepted
time: 31ms
memory: 4164kb

input:

100 5000
1 1 2 3 1 5 1 1 1 1 3 1 2 1 9 1 1 1 2 3 1 21 1 1 3 1 5 6 1 2 3 4 1 1 2 4 36 38 1 1 1 1 2 4 1 1 3 1 9 1 11 1 1 1 1 1 2 1 20 1 2 1 24 1 1 1 1 2 3 31 1 33 1 2 1 4 1 2 7 1 1 2 4 12 1 2 3 4 1 1 7 8 1 10 1 1 13 1 2 1 4 1 19 1 2 1 4 5 1 7 1 2 3 1 2 1 2 1 6 1 2 3 20 1 1 2 3 1 2 1 2 1 1 3 135 1 1 1 ...

output:

153322044
150492767
0
994490869
0
0
0
855074869
0
788845897
849135324
558695774
329454393
0
0
0
739924836
452723945
0
786225052
0
0
0
185397693
73624630
988094148
0
836735505
57947855
705987810
106069205
0
0
885756183
494461680
182794483
0
482961814
31686100
0
0
249571642
543812012
0
722599260
64625...

result:

ok 100 numbers

Test #15:

score: 5
Accepted
time: 34ms
memory: 4156kb

input:

100 5000
1 1 1 2 1 2 6 1 1 2 10 1 1 1 2 1 6 1 8 1 1 3 1 2 1 2 1 1 1 1 12 1 2 3 1 1 3 1 1 2 1 11 1 2 3 4 5 6 1 19 1 1 3 1 1 2 3 1 2 3 4 8 1 1 11 1 1 1 68 1 2 3 4 5 1 75 1 2 1 2 1 2 1 8 1 2 3 1 1 2 15 1 2 1 4 96 1 1 3 1 2 1 7 1 9 106 1 2 1 1 1 1 7 1 2 1 1 2 1 1 6 1 8 1 12 1 1 2 1 5 1 2 3 1 2 1 2 1 2 1...

output:

946701850
983965823
543510180
0
234696913
532101369
36100829
817231065
772588292
709229991
29442326
651065274
0
837031094
0
0
316959187
97361210
0
989103922
345758410
347379164
0
897533810
488460689
0
0
497682855
33909352
194898623
0
0
118515226
0
284902892
251441025
850822408
357498752
892873025
0
...

result:

ok 100 numbers

Test #16:

score: 5
Accepted
time: 23ms
memory: 4236kb

input:

100 5000
1 1 1 3 4 1 1 2 3 1 1 6 1 8 14 1 1 2 3 1 2 3 1 1 1 4 8 1 2 3 1 17 1 19 1 2 3 1 2 1 2 1 6 1 2 1 46 1 1 1 1 1 4 1 2 1 4 9 1 1 2 15 1 1 3 4 5 1 2 1 4 71 1 1 3 1 5 78 1 2 3 1 2 6 1 1 2 1 1 1 4 1 8 9 1 1 2 1 2 3 1 1 9 1 1 22 1 1 2 1 5 1 29 1 2 1 4 5 41 1 2 1 1 5 1 1 1 2 1 1 1 1 5 8 1 1 3 4 1 1 6...

output:

0
0
0
387445433
53260884
0
253952014
537163051
36559794
0
511384985
0
832504854
740181150
330078867
405525929
569684673
0
0
827053910
613985912
647990716
75829151
918618470
295766219
845315492
0
0
129206830
666746707
586695222
81186019
0
35517922
0
392622181
0
722722877
685721668
0
788434307
7688927...

result:

ok 100 numbers

Test #17:

score: 5
Accepted
time: 320ms
memory: 7204kb

input:

100 50000
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 10 1 1 1 1 1 1 5 1 22 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...

output:

938322083
0
627807333
0
553973202
0
0
14048401
0
950421047
0
622576077
0
744029553
793127481
645301924
533609841
852154523
625396972
62789084
413008358
292040298
656049825
581436181
922659811
0
0
0
91815535
506836479
156002945
205212058
316354153
344645416
521003850
532634702
926445048
641763410
519...

result:

ok 100 numbers

Test #18:

score: 5
Accepted
time: 323ms
memory: 7252kb

input:

100 50000
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 6 1 8 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 9 1 32 1 1 1 1 2 1 1 1 1 1 1 1 1 1 3 1 1 65 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...

output:

624885832
833309143
968068549
16720417
467661900
0
983405586
43591434
547668905
950286049
530347421
0
0
909087345
658815045
0
794306626
14976636
357941460
150818230
715847749
638450467
0
0
155568509
0
879137806
428155583
300267776
779360110
39013493
751776921
685994614
809532425
0
0
28163073
6789027...

result:

ok 100 numbers

Test #19:

score: 5
Accepted
time: 302ms
memory: 7168kb

input:

100 50000
1 1 2 3 1 5 1 1 1 3 5 1 2 14 15 1 2 1 1 2 1 1 1 3 4 1 10 11 14 15 1 2 3 1 20 1 2 3 1 1 2 4 1 2 10 1 1 3 4 1 2 1 4 5 1 7 1 2 10 1 1 1 1 2 1 2 6 1 1 3 1 5 1 1 8 15 1 2 34 1 1 1 2 1 1 2 3 1 2 7 1 1 2 4 1 1 97 1 99 1 1 1 1 2 6 7 1 1 1 1 3 5 6 1 1 2 11 1 2 1 15 1 1 124 1 1 1 2 5 1 2 8 9 1 1 1 3...

output:

475040820
821423600
161407909
880735181
500071559
337741049
134440032
894840892
976405119
528519244
473904119
936942481
875766540
93375618
980295619
621222155
0
393457466
0
0
281655372
418588257
269174342
31667135
0
0
462648500
0
528191112
831811998
548436806
0
0
917468062
373787292
425247061
235798...

result:

ok 100 numbers

Test #20:

score: 5
Accepted
time: 311ms
memory: 7268kb

input:

100 50000
1 1 1 1 1 3 1 1 6 1 1 1 10 1 12 1 1 1 4 1 2 1 4 1 2 3 1 1 2 3 5 1 18 1 2 3 1 1 1 1 1 2 1 2 1 10 1 2 1 1 1 16 1 2 3 23 1 2 3 1 5 1 2 1 1 5 6 1 2 3 1 11 1 1 1 4 1 2 1 2 3 1 5 1 9 1 11 1 1 1 3 1 2 79 1 1 2 3 4 1 1 1 2 5 1 2 3 1 5 15 17 1 1 1 1 1 1 2 5 1 27 1 29 1 2 1 2 3 1 7 8 1 2 3 1 133 1 2...

output:

0
0
682944657
750632782
263950446
693440161
405059290
525305126
44971279
123548737
0
741272096
576480062
0
677196555
0
145863649
0
0
0
987541820
0
714088381
470820324
0
893509858
0
0
258145392
0
374442624
393753011
495298252
455139328
0
454995497
462625493
552202545
35811919
51255305
465265116
94156...

result:

ok 100 numbers

Extra Test:

score: 0
Extra Test Passed