QOJ.ac

QOJ

ID题目提交者结果用时内存语言文件大小提交时间测评时间
#330039#618. 多项式乘法NOI_AK_ME100 ✓38ms39100kbC++2325.8kb2024-02-17 11:15:572024-02-17 11:15:59

Judging History

你现在查看的是最新测评结果

  • [2024-02-17 11:15:59]
  • 评测
  • 测评结果:100
  • 用时:38ms
  • 内存:39100kb
  • [2024-02-17 11:15:57]
  • 提交

answer

#pragma GCC target("avx512f")
#include <immintrin.h>
#include <array>
#include <cstring>
#include <cstdint>
#include <cstdio>
#include <algorithm>
#include <sys/mman.h>
#include <sys/stat.h>
using u32=uint32_t;
using u64=uint64_t;
using idt=std::size_t;
using I128=__m128i;
using I256=__m256i;
using I512=__m512i;
struct LMont_32{
    u32 M,M2,niv,R,R2;
    LMont_32()=default;
    LMont_32(u32 m):M{m},M2{m*2},niv{2+m},R{-m%m},R2((-u64(m))%m){
        for(u32 i=0;i<4;++i){niv*=2+m*niv;}
    }
    template<bool cond=true>u32 shr(u32 x)const{
        if constexpr(cond){
            return std::min(x,x-M);
        }
        return x;
    }
    u32 shr2(u32 x)const{
        return std::min(x,x-M2);
    }
    u32 dil(u32 x)const{
        return std::min(x,x+M);
    }
    u32 dil2(u32 x)const{
        return std::min(x,x+M2);
    }
    u32 reduce(u64 x)const{
        return (x+u64(u32(x)*niv)*M)>>32;
    }
    template<bool shrk=false>u32 mul(u32 x,u32 y)const{
        return shr<shrk>(reduce(u64(x)*y));
    }
    template<bool shrk=false>u32 qpw(u32 a,u32 b,u32 r)const{
        for(;b;b>>=1,a=mul(a,a)){
            b&1?r=mul(r,a):r;
        }
        return shr<shrk>(r);
    }
    template<bool shrk=false>u32 qpw(u32 a,u32 b)const{
        return qpw<shrk>(a,b,R);
    }
    template<bool shrk=false>u32 inv(u32 a)const{
        return qpw<shrk>(a,M-2);
    }
    template<bool shrk=false>u32 in(u32 x)const{
        return mul<shrk>(x,R2);
    }
    u32 add(u32 x,u32 y)const{
        return shr2(x+y);
    }
    u32 sub(u32 x,u32 y)const{
        return dil2(x-y);
    }
    u32 Ladd(u32 x,u32 y)const{
        return x+y;
    }
    u32 Lsub(u32 x,u32 y)const{
        return x+M2-y;
    }
    u32 neg(u32 x)const{
        return M2-x;
    }
};
struct LMont_32_I512{
    I512 M,M2,niv;
    LMont_32_I512()=default;
    LMont_32_I512(LMont_32 mt):M{_mm512_set1_epi32(mt.M)},M2{_mm512_set1_epi32(mt.M2)},niv{_mm512_set1_epi32(mt.niv)}{}
    template<bool cond=true>I512 shr(I512 x)const{
        if constexpr(cond){
            return _mm512_min_epu32(x,_mm512_sub_epi32(x,M));
        }
        return x;
    }
    I512 shr2(I512 x)const{
        return _mm512_min_epu32(x,_mm512_sub_epi32(x,M2));
    }
    I512 dil(I512 x)const{
        return _mm512_min_epu32(x,_mm512_add_epi32(x,M));
    }
    I512 dil2(I512 x)const{
        return _mm512_min_epu32(x,_mm512_add_epi32(x,M2));
    }
    I512 _mul_hi(I512 x,I512 y)const{
        I512 a=_mm512_mul_epu32(x,y);
        return _mm512_add_epi64(a,_mm512_mul_epu32(_mm512_mul_epu32(a,niv),M));
    }
    template<bool shrk=false>I512 mul(I512 x,I512 y)const{
        I512 a=_mm512_mul_epu32(x,y),b=_mm512_mul_epu32(_mm512_srli_epi64(x,32),_mm512_srli_epi64(y,32));
        I512 c=_mm512_mul_epu32(a,niv),d=_mm512_mul_epu32(b,niv);
        c=_mm512_mul_epu32(c,M),d=_mm512_mul_epu32(d,M);
        return shr<shrk>(_mm512_mask_blend_epi32(0xaaaa,_mm512_srli_epi64(_mm512_add_epi64(a,c),32),_mm512_add_epi64(b,d)));
    }
    template<bool shrk=false>I512 mul_sm(I512 x,I512 y)const{
        I512 a=_mm512_mul_epu32(x,y),b=_mm512_mul_epu32(_mm512_srli_epi64(x,32),y);
        I512 c=_mm512_mul_epu32(a,niv),d=_mm512_mul_epu32(b,niv);
        c=_mm512_mul_epu32(c,M),d=_mm512_mul_epu32(d,M);
        return shr<shrk>(_mm512_mask_blend_epi32(0xaaaa,_mm512_srli_epi64(_mm512_add_epi64(a,c),32),_mm512_add_epi64(b,d)));
    }
    template<bool shrk=false>I512 mul_hi(I512 x,I512 y)const{
        return shr<shrk>(_mm512_srli_epi64(_mul_hi(x,y),32));
    }
    I512 add(I512 x,I512 y)const{
        return shr2(_mm512_add_epi32(x,y));
    }
    I512 sub(I512 x,I512 y)const{
        return dil2(_mm512_sub_epi32(x,y));
    }
    I512 Ladd(I512 x,I512 y)const{
        return _mm512_add_epi32(x,y);
    }
    I512 Lsub(I512 x,I512 y)const{
        return _mm512_add_epi32(_mm512_sub_epi32(x,y),M2);
    }
    I512 neg(I512 x)const{
        return _mm512_sub_epi32(M2,x);
    }
    template<u32 z>I512 neg_m(I512 x)const{
        return _mm512_mask_sub_epi32(x,z,M2,x);
    }
};
template<class T>concept trivialT=std::is_trivial_v<T>;
inline u32*alc(idt n){return new(std::align_val_t(64))u32[n];}
inline void fre(u32*p){::operator delete[](p,std::align_val_t(64));}
template<trivialT T>inline T*cpy(T*f,const T*g,idt n){return (T*)memcpy(f,g,n*sizeof(T));}
template<trivialT T>inline T*clr(T*f,idt n){return (T*)memset(f,0,n*sizeof(T));}
constexpr u32 mxlg=26,_lg_iter_thre=6;
struct fntt_info_32{
    LMont_32 mt;
    u32 RT1[mxlg];
    LMont_32_I512 ms;
    alignas(32) std::array<u64,4> rt3[mxlg-6],rt3i[mxlg-6],st[(mxlg-_lg_iter_thre)>>1],st2[_lg_iter_thre>>1],bwb;
    alignas(64) std::array<u32,16> rt4[mxlg-4],rt4i[mxlg-4];
    I512 bwd,pr2,pr4,pr8,imgx16,_R4;
    alignas(64) static constexpr u32
    idx[]={0,2,0,4,0,2,0,4,0,2,0,4,0,2,0,4},
    id1x[]={7,3,7,3,7,3,7,3,7,11,7,11,7,11,7,11},
    id2x[]={8,0,9,1,10,2,11,3,12,4,13,5,14,6,15,7},
    id2i[]={0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
    id3x[]={8,0,10,2,12,4,14,6,9,1,11,3,13,5,15,7},
    id4x[]={4,0,6,2,5,1,7,3,12,8,14,10,13,9,15,11};
    fntt_info_32()=default;
    fntt_info_32(u32 M,u32 k,u32 _g):mt{M},ms{mt}{
        u32 rt1[mxlg-1],rt1i[mxlg-1];
        rt1[k-2]=_g,rt1i[k-2]=mt.inv(_g);
        for(u32 i=k-2;i>0;--i){
            rt1[i-1]=mt.mul(rt1[i],rt1[i]);
            rt1i[i-1]=mt.mul(rt1i[i],rt1i[i]);
        }
        for(u32 i=1;i<=k;++i){
			RT1[i-1]=mt.qpw<true>(_g,3<<(k-i));
        }
        u32 pr=mt.R,pri=mt.R;
        bwb={mt.R,mt.R,mt.M-mt.R};
        for(u32 i=0;i<k-6;pr=mt.mul(pr,rt1i[i+1]),pri=mt.mul(pri,rt1[i+1]),++i){
            u32 r=mt.mul<true>(pr,rt1[i+1]),ri=mt.mul<true>(pri,rt1i[i+1]);
            u32 r2=mt.mul<true>(r,r),r2i=mt.mul<true>(ri,ri);
            u32 r3=mt.mul<true>(r,r2),r3i=mt.mul<true>(ri,r2i);
            rt3[i]={r,r2,r3},rt3i[i]={ri,r2i,r3i};
        }
        pr=mt.R,pri=mt.R;
        for(u32 i=0;i<k-4;pr=mt.mul(pr,rt1i[i+3]),pri=mt.mul(pri,rt1[i+3]),++i){
            u32 r=mt.mul<true>(pr,rt1[i+3]),ri=mt.mul<true>(pri,rt1i[i+3]);
            u32 r2=mt.mul<true>(r,r),r4=mt.mul<true>(r2,r2),r8=mt.mul<true>(r4,r4);
            rt4[i]={r,r2,r,r4,r,r2,r,r8,r,r2,r,r4,r,r2,r},rt4i[i][0]=mt.R;
            for(u32 j=1;j<16;++j){
                rt4i[i][j]=mt.mul<true>(rt4i[i][j-1],ri);
            }
        }
        u32 w[8],wi[8];
        w[0]=mt.R,wi[0]=mt.R;
        for(u32 i=0;i<3;++i){
            pr=rt1[i],pri=rt1i[i];
            for(u32 j=1<<i,k=0;k<j;++k){
                w[j+k]=mt.mul<true>(w[k],pr);
                wi[j+k]=mt.mul<true>(wi[k],pri);
            }
        }
        bwd=_mm512_setr_epi32(w[0],w[0],w[1],w[0],w[2],w[1],w[3],w[0],w[4],w[2],w[5],w[1],w[6],w[3],w[7],0);
        pr2=_mm512_setr_epi64(mt.R,mt.R,mt.R,mt.R,mt.R,mt.R,wi[1],wi[1]);
        pr4=_mm512_setr_epi64(wi[0],wi[0],wi[1],wi[1],wi[2],wi[2],wi[3],wi[3]);
        pr8=_mm512_setr_epi64(mt.M-wi[0],mt.M-wi[1],mt.M-wi[2],mt.M-wi[3],mt.M-wi[4],mt.M-wi[5],mt.M-wi[6],mt.M-wi[7]);
        imgx16=_mm512_set1_epi32(w[1]);
    }
    template<bool first>static void __butterfly4(I512*p0,I512*p1,I512*p2,I512*p3,I512 img,I512 r1,I512 r2,I512 nr3,const LMont_32_I512&ms){
        if constexpr(first){
            //in : [0,2mod)
            auto f1=_mm512_load_si512(p1);
            auto f3=_mm512_load_si512(p3);
            auto g1=ms.add(f1,f3);
            auto g3=ms.mul_sm(ms.Lsub(f1,f3),img);
            auto f0=_mm512_load_si512(p0);
            auto f2=_mm512_load_si512(p2);
            auto g0=ms.add(f0,f2);
            auto g2=ms.sub(f0,f2);
            _mm512_store_si512(p0,ms.add(g0,g1));
            _mm512_store_si512(p1,ms.Lsub(g0,g1));
            _mm512_store_si512(p2,ms.Ladd(g2,g3));
            _mm512_store_si512(p3,ms.Lsub(g2,g3));
            return;
        }
        //in : [0,4mod)
        auto f1=ms.mul_sm(_mm512_load_si512(p1),r1);
        auto nf3=ms.mul_sm(_mm512_load_si512(p3),nr3);
        auto g1=ms.sub(f1,nf3);
        auto f0=ms.shr2(_mm512_load_si512(p0));
        auto f2=ms.mul_sm(_mm512_load_si512(p2),r2);
        auto g3=ms.mul_sm(ms.Ladd(f1,nf3),img);
        auto g0=ms.add(f0,f2);
        auto g2=ms.sub(f0,f2);
        _mm512_store_si512(p0,ms.Ladd(g0,g1));
        _mm512_store_si512(p1,ms.Lsub(g0,g1));
        _mm512_store_si512(p2,ms.Ladd(g2,g3));
        _mm512_store_si512(p3,ms.Lsub(g2,g3));
    }
    template<bool first,bool complete>void __dif4(I512*f,idt lm,idt ix){
        const auto ms=this->ms;
        if(lm<=(idt(1)<<_lg_iter_thre)){
            u32 p=0;
            auto id=_mm512_load_si512(idx),img=imgx16;
            idt l=lm,L=lm>>2,yk=ix;
            for(;L;l=L,L>>=2,++p,yk<<=2){
                if constexpr(first){st2[p]=bwb;}
                auto rt=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(st2+p)));
                for(idt i=0,k=yk;i<lm;i+=l,++k){
                    auto r1=_mm512_permutexvar_epi32(id,rt);
                    auto r2=_mm512_shuffle_epi32(r1,_MM_PERM_BBBB);
                    auto nr3=_mm512_shuffle_epi32(r1,_MM_PERM_DDDD);
                    I512 tr=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(rt3+__builtin_ctzll(~k))));
                    rt=ms.mul_hi<true>(rt,tr);
                    for(idt j=0;j<L;++j){
                        __butterfly4<false>(f+i+j+L*0,f+i+j+L*1,f+i+j+L*2,f+i+j+L*3,img,r1,r2,nr3,ms); 
                    }
                }
                _mm256_store_si256((I256*)(st2+p),_mm512_castsi512_si256(rt)); 
            }
            if constexpr(complete){
                if constexpr(first){
                    _R4=bwd;
                }
                auto rt=_R4,id1=_mm512_load_si512(id1x),id2=_mm512_load_si512(id2x),id3=_mm512_load_si512(id3x),id4=_mm512_load_si512(id4x);
                for(idt i=0;i<lm;++i,++yk){
                    auto fi=_mm512_load_si512(f+i);
                    auto tr=_mm512_load_si512(rt4+__builtin_ctzll(~yk));
                    auto r8=_mm512_permutexvar_epi32(id1,rt);
                    auto r4=_mm512_shuffle_epi32(r8,_MM_PERM_DDBB);
                    auto r2=_mm512_shuffle_epi32(rt,_MM_PERM_BBBB);
                    auto r1=rt;
                    fi=_mm512_permutexvar_epi32(id2,fi);
                    I512 b=_mm512_mul_epu32(rt,tr),a=_mm512_mul_epu32(fi,r8);
                    I512 d=_mm512_mul_epu32(b,ms.niv),c=_mm512_mul_epu32(a,ms.niv);
                    rt=_mm512_srli_epi64(rt,32),tr=_mm512_srli_epi64(tr,32);
                    d=_mm512_mul_epu32(d,ms.M),c=_mm512_mul_epu32(c,ms.M);
                    auto pr1=_mm512_shuffle_epi32(ms.shr2(fi),_MM_PERM_DDBB);
                    d=_mm512_add_epi64(b,d),c=_mm512_add_epi64(a,c);
                    fi=ms.Ladd(pr1,ms.neg_m<0xaaaa>(_mm512_shuffle_epi32(c,_MM_PERM_DDBB)));
                    fi=_mm512_permutexvar_epi32(id3,fi);
                    b=_mm512_mul_epu32(rt,tr),a=_mm512_mul_epu32(fi,r4);
                    I512 kil=_mm512_srli_epi64(d,32);
                    d=_mm512_mul_epu32(b,ms.niv),c=_mm512_mul_epu32(a,ms.niv);
                    d=_mm512_mul_epu32(d,ms.M),c=_mm512_mul_epu32(c,ms.M);
                    pr1=_mm512_shuffle_epi32(ms.shr2(fi),_MM_PERM_DDBB);
                    d=_mm512_add_epi64(b,d),c=_mm512_add_epi64(a,c);
                    fi=ms.Ladd(pr1,ms.neg_m<0xaaaa>(_mm512_shuffle_epi32(c,_MM_PERM_DDBB)));
                    fi=_mm512_permutexvar_epi32(id4,fi);
                    rt=ms.shr(_mm512_mask_blend_epi32(0xaaaa,kil,d));
                    fi=ms.Ladd(_mm512_shuffle_epi32(ms.shr2(fi),_MM_PERM_DDBB),ms.neg_m<0xaaaa>(_mm512_shuffle_epi32(ms._mul_hi(fi,r2),_MM_PERM_DDBB)));
                    fi=_mm512_shuffle_epi32(fi,_MM_PERM_BDAC);
                    fi=ms.add(_mm512_shuffle_epi32(ms.shr2(fi),_MM_PERM_DDBB),ms.neg_m<0xaaaa>(_mm512_shuffle_epi32(ms._mul_hi(fi,r1),_MM_PERM_DDBB)));
                    _mm512_store_si512(f+i,fi);
                }
                _R4=rt;
            }
            return;
        }
        idt qlm=lm>>2;
        const u32 p=(__builtin_ctzll(lm)-_lg_iter_thre-1)>>1;
        if constexpr(first){
            st[p]=bwb;
        }
        auto rt=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(st+p)));
        auto r1=_mm512_permutexvar_epi32(_mm512_load_si512(idx),rt),img=imgx16;
        auto r2=_mm512_shuffle_epi32(r1,_MM_PERM_BBBB),nr3=_mm512_shuffle_epi32(r1,_MM_PERM_DDDD);
        I512 tr=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(rt3+__builtin_ctzll(~ix))));
        _mm256_store_si256((I256*)(st+p),_mm512_castsi512_si256(ms.mul_hi<true>(rt,tr)));
        for(idt j=0;j<qlm;++j){
            __butterfly4<first>(f+j+qlm*0,f+j+qlm*1,f+j+qlm*2,f+j+qlm*3,img,r1,r2,nr3,ms); 
        }
        __dif4<first,complete>(f+qlm*0,qlm,ix<<2|0);
        __dif4<false,complete>(f+qlm*1,qlm,ix<<2|1);
        __dif4<false,complete>(f+qlm*2,qlm,ix<<2|2);
        __dif4<false,complete>(f+qlm*3,qlm,ix<<2|3);
    }
    template<bool complete>void __dif(I512*f,idt n){
        if(__builtin_ctzll(n)&1){
            const auto ms=this->ms;
            n>>=1;
            for(idt i=0;i<n;++i){
                auto x=_mm512_load_si512(f+i),y=_mm512_load_si512(f+n+i);
                _mm512_store_si512(f+i,ms.add(x,y)),_mm512_store_si512(f+n+i,ms.Lsub(x,y));
            }
            __dif4<true,complete>(f,n,0),__dif4<false,complete>(f+n,n,1);
        }
        else{
            __dif4<true,complete>(f,n,0);
        }
    }
    template<bool first,bool shrk=false>static void __ylfrettub4(I512*p0,I512*p1,I512*p2,I512*p3,I512 img,I512 r1,I512 r2,I512 nr3,const LMont_32_I512&ms){
        if constexpr(first){
            auto f2=_mm512_load_si512(p2);
            auto f3=_mm512_load_si512(p3);
            auto g3=ms.mul_sm(ms.Lsub(f3,f2),img);
            auto g2=ms.add(f2,f3);
            auto f0=_mm512_load_si512(p0);
            auto f1=_mm512_load_si512(p1);
            auto g0=ms.add(f0,f1);
            auto g1=ms.sub(f0,f1);
            _mm512_store_si512(p0,ms.shr<shrk>(ms.add(g0,g2)));
            _mm512_store_si512(p1,ms.shr<shrk>(ms.add(g1,g3)));
            _mm512_store_si512(p2,ms.shr<shrk>(ms.sub(g0,g2)));
            _mm512_store_si512(p3,ms.shr<shrk>(ms.sub(g1,g3)));
            return;
        }
        auto f0=_mm512_load_si512(p0);
        auto f1=_mm512_load_si512(p1);
        auto nf2=ms.neg(_mm512_load_si512(p2));
        auto f3=_mm512_load_si512(p3);
        auto g0=ms.add(f0,f1);
        auto ng2=ms.sub(nf2,f3);
        auto g3=ms.mul_sm(ms.Ladd(nf2,f3),img);
        _mm512_store_si512(p2,ms.mul_sm<shrk>(ms.Ladd(g0,ng2),r2));
        _mm512_store_si512(p0,ms.shr<shrk>(ms.sub(g0,ng2)));
        auto g1=ms.sub(f0,f1);
        _mm512_store_si512(p1,ms.mul_sm<shrk>(ms.Ladd(g1,g3),r1));
        _mm512_store_si512(p3,ms.mul_sm<shrk>(ms.Lsub(g3,g1),nr3));
    }
    template<bool first,bool complete,bool shrk=false>void __dit4(I512*f,idt lm,idt ix){
        const auto ms=this->ms;
        if(lm<=(idt(1)<<_lg_iter_thre)){
            idt yk=ix<<__builtin_ctzll(lm);
            if constexpr(complete){
                auto rt=_R4;
                auto r2=pr2,r4=pr4,r8=pr8;
                idt k=yk;
                for(idt i=0;i<lm;++i,++k){
                    auto fi=_mm512_load_si512(f+i);
                    auto tr=_mm512_load_si512(rt4i+__builtin_ctzll(~k));
                    auto r1=rt;
                    fi=ms.Ladd(ms.neg_m<0x5555>(fi),_mm512_shuffle_epi32(fi,_MM_PERM_CDAB));
                    I512 b=_mm512_mul_epu32(rt,tr),a=_mm512_mul_epu32(fi,r8);
                    I512 d=_mm512_mul_epu32(b,ms.niv),c=_mm512_mul_epu32(a,ms.niv);
                    rt=_mm512_srli_epi64(rt,32),tr=_mm512_srli_epi64(tr,32);
                    d=_mm512_mul_epu32(d,ms.M),c=_mm512_mul_epu32(c,ms.M);
                    auto pr1=_mm512_srli_epi64(ms.shr2(fi),32);
                    d=_mm512_add_epi64(b,d),c=_mm512_add_epi64(a,c);
                    fi=_mm512_mask_blend_epi32(0xaaaa,pr1,c);
                    I512 jok=_mm512_srli_epi64(d,32);
                    fi=ms.Ladd(ms.neg_m<0xcccc>(fi),_mm512_shuffle_epi32(fi,_MM_PERM_BADC));
                    b=_mm512_mul_epu32(rt,tr),a=_mm512_mul_epu32(_mm512_shuffle_epi32(fi,_MM_PERM_BDAC),r4);
                    d=_mm512_mul_epu32(b,ms.niv),c=_mm512_mul_epu32(a,ms.niv);
                    pr1=ms.shr2(fi);
                    d=_mm512_mul_epu32(d,ms.M),c=_mm512_mul_epu32(c,ms.M);
                    d=_mm512_add_epi64(b,d),c=_mm512_add_epi64(a,c);
                    rt=ms.shr(_mm512_mask_blend_epi32(0xaaaa,jok,d));
                    fi=_mm512_mask_shuffle_epi32(pr1,0xcccc,c,_MM_PERM_DBCA);
                    fi=ms.mul_sm(ms.Ladd(ms.neg_m<0xf0f0>(fi),_mm512_shuffle_i64x2(fi,fi,0xb1)),r2);
                    fi=ms.mul(ms.Ladd(ms.neg_m<0xff00>(fi),_mm512_shuffle_i64x2(fi,fi,0x4e)),r1);
                    _mm512_store_si512(f+i,fi);
                }
                _R4=rt;
            }
            auto id=_mm512_load_si512(idx),img=imgx16;
            u32 p=0;
            idt l=4,L=1;
            for(;yk>>=2,L<lm;L=l,l<<=2,++p){
                if constexpr(first){st2[p]=bwb;}
                auto rt=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(st2+p)));
                for(idt i=0,k=yk;i<lm;i+=l,++k){
                    auto r1=_mm512_permutexvar_epi32(id,rt);
                    auto r2=_mm512_shuffle_epi32(r1,_MM_PERM_BBBB);
                    auto nr3=_mm512_shuffle_epi32(r1,_MM_PERM_DDDD);
                    I512 tr=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(rt3i+__builtin_ctzll(~k))));
                    rt=ms.mul_hi<true>(rt,tr);
                    for(idt j=0;j<L;++j){
                        __ylfrettub4<false>(f+i+j+L*0,f+i+j+L*1,f+i+j+L*2,f+i+j+L*3,img,r1,r2,nr3,ms); 
                    }
                }
                _mm256_store_si256((I256*)(st2+p),_mm512_castsi512_si256(rt)); 
            }
            if constexpr(shrk){
                for(idt i=0;i<lm;++i){
                    _mm512_store_si512(f+i,ms.shr(_mm512_load_si512(f+i)));
                }
            }
            return;
        }
        idt qlm=lm>>2;
        __dit4<first,complete>(f+qlm*0,qlm,ix<<2|0);
        __dit4<false,complete>(f+qlm*1,qlm,ix<<2|1);
        __dit4<false,complete>(f+qlm*2,qlm,ix<<2|2);
        __dit4<false,complete>(f+qlm*3,qlm,ix<<2|3);
        const u32 p=(__builtin_ctzll(lm)-_lg_iter_thre-1)>>1;
        if constexpr(first){st[p]=bwb;}
        auto rt=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(st+p)));
        auto r1=_mm512_permutexvar_epi32(_mm512_load_si512(idx),rt),img=imgx16;
        auto r2=_mm512_shuffle_epi32(r1,_MM_PERM_BBBB),nr3=_mm512_shuffle_epi32(r1,_MM_PERM_DDDD);
        I512 tr=_mm512_zextsi256_si512(_mm256_load_si256((const I256*)(rt3i+__builtin_ctzll(~ix))));
        _mm256_store_si256((I256*)(st+p),_mm512_castsi512_si256(ms.mul_hi<true>(rt,tr)));
        for(idt j=0;j<qlm;++j){
            __ylfrettub4<first,shrk>(f+j+qlm*0,f+j+qlm*1,f+j+qlm*2,f+j+qlm*3,img,r1,r2,nr3,ms); 
        }
    }
    template<bool complete,bool shrk=false>void __dit(I512*f,idt n){
        _R4=_mm512_set1_epi32(mt.in<true>(mt.in(mt.M-((mt.M-1)>>(__builtin_ctzll(n)+4)))));
		//_R4=_mm512_set1_epi32(mt.in<true>(mt.M-((mt.M-1)>>(__builtin_ctzll(n)+4))));
        if(__builtin_ctzll(n)&1){
            n>>=1;
            __dit4<true,complete>(f,n,0),__dit4<false,complete>(f+n,n,1);
            const auto ms=this->ms;
            for(idt i=0;i<n;++i){
                auto x=_mm512_load_si512(f+i),y=_mm512_load_si512(f+n+i);
                _mm512_store_si512(f+i,ms.shr<shrk>(ms.add(x,y)));
                _mm512_store_si512(f+n+i,ms.shr<shrk>(ms.sub(x,y)));
            }
        }
        else{
            __dit4<true,complete,shrk>(f,n,0);
        }
    }
    //mod x^16-w
    static void __conv16(I512*a,I512*b,I512 w,I512 fx,const LMont_32_I512&ms){
        auto aa=ms.shr(ms.shr2(_mm512_load_si512(a))),bb=ms.mul_sm<true>(_mm512_load_si512(b),fx);
        auto ix=_mm512_set1_epi64(u64(8)<<32),al1=_mm512_set1_epi32(1);
        auto a1=_mm512_permutexvar_epi32(_mm512_load_si512(id2x),aa),a0=_mm512_shuffle_epi32(a1,_MM_PERM_CDAB);
        auto a1w=ms.mul_sm<true>(a1,w),a0w=_mm512_shuffle_epi32(a1w,_MM_PERM_CDAB);
        //a0w a1w a0 a1
        auto res0=_mm512_setzero_si512(),res1=_mm512_setzero_si512();
        auto res2=_mm512_setzero_si512(),res3=_mm512_setzero_si512();
        #pragma GCC unroll 8 //Can't remove because _mm512_alignr_epi64 need immediate data.
        for(u32 i=0;i<8;++i){
            auto b0=_mm512_permutexvar_epi32(ix,bb),b1=_mm512_shuffle_epi32(b0,_MM_PERM_CDAB);
            //i,i+8 i+8,i
            auto pr1=(i==0)?a1:_mm512_alignr_epi64(a1,a0,8-i);
            auto pr2=(i==0)?a0:_mm512_alignr_epi64(a0,a1w,8-i);
            auto pr3=(i==0)?a1w:_mm512_alignr_epi64(a1w,a0w,8-i);
            res0=_mm512_add_epi64(res0,_mm512_mul_epu32(b0,pr2));
            res1=_mm512_add_epi64(res1,_mm512_mul_epu32(b0,pr1));
            res2=_mm512_add_epi64(res2,_mm512_mul_epu32(b1,pr3));
            res3=_mm512_add_epi64(res3,_mm512_mul_epu32(b1,pr2));
            ix=_mm512_add_epi32(ix,al1);
        }
        res0=_mm512_add_epi64(res0,res2),res1=_mm512_add_epi64(res1,res3);
        res2=_mm512_sub_epi64(res0,ms.M2),res3=_mm512_sub_epi64(res1,ms.M2);
        res0=_mm512_min_epu64(res0,res2),res1=_mm512_min_epu64(res1,res3);
        res2=_mm512_mul_epu32(res0,ms.niv),res3=_mm512_mul_epu32(res1,ms.niv);
        res2=_mm512_mul_epu32(res2,ms.M),res3=_mm512_mul_epu32(res3,ms.M);
        res0=_mm512_add_epi64(res0,res2),res1=_mm512_add_epi64(res1,res3);
        res0=_mm512_mask_blend_epi32(0xaaaa,_mm512_srli_epi64(res0,32),res1);
        _mm512_store_si512(a,_mm512_permutexvar_epi32(_mm512_load_si512(id2i),ms.shr2(res0)));
    }
	void dot_I512(I512*a,const I512*b,idt lm){
		const auto ms=this->ms;
		for(idt i=0;i<lm;++i){
            _mm512_store_si512(a+i,ms.mul(_mm512_load_si512(a+i),_mm512_load_si512(b+i)));
		}
	}
    void __dot(I512*a,I512*b,idt lm){
        u32 R=mt.R;
        const auto ms=this->ms;
        const auto fx=mt.in<true>(mt.in(mt.M-((mt.M-1)>>(__builtin_ctzll(lm)))));
        for(idt i=0;i<lm;++i){
            __conv16(a+i,b+i,_mm512_set1_epi32(R),_mm512_set1_epi32(fx),ms);
            R=mt.mul(R,RT1[__builtin_ctzll(~i)]);
        }
    }
};
namespace __yzlf{
using u8=unsigned char;
using u16=unsigned short;
using u32=unsigned;
constexpr std::size_t buf_def_size=262144;
constexpr std::size_t buf_flush_threshold=32;
constexpr std::size_t string_copy_threshold=512;
constexpr u64 E16=1e16,E12=1e12,E8=1e8,E4=1e4;
struct _io_t{
    #define u32 u32
    u8 t_i[1<<15];
    u32 t_o[10000];
    constexpr _io_t(){
        std::fill(t_i,t_i+(1<<15),u8(-1));
        for(u32 i=0;i<10;++i){
            for(u32 j=0;j<10;++j){
                t_i[0x3030+256*j+i]=j+10*i;
            }
        }
        for(u32 e0=(48<<0),j=0;e0<(58<<0);e0+=(1<<0)){
			for(u32 e1=(48<<8);e1<(58<<8);e1+=(1<<8)){
				for(u32 e2=(48<<16);e2<(58<<16);e2+=(1<<16)){
					for(u32 e3=(48<<24);e3<(58<<24);e3+=(1<<24)){
						t_o[j++]=e0^e1^e2^e3;
					}
				}
			}
		}
    }
    #undef u32
    void get(char*s,u32 p)const{
        *((u32*)s)=t_o[p];
    }
};
constexpr _io_t _iot={};
struct Qinf{
    explicit Qinf(FILE*fi):f(fi){
		auto fd=fileno(f);
		fstat(fd,&Fl);
		bg=(char*)mmap(0,Fl.st_size+1,PROT_READ,MAP_PRIVATE,fd,0);
		p=bg,ed=bg+Fl.st_size;
	}
	~Qinf(){
		munmap(bg,Fl.st_size+1);
	}
	template<std::unsigned_integral T>Qinf&operator>>(T&x){
		skip_space();
        x=*p++-'0';
        for(;;){
            T y=_iot.t_i[*reinterpret_cast<u16*>(p)];
            if(y>99){break;}
            x=x*100+y,p+=2;
        }
        if(*p>' '){
            x=x*10+(*p++&15);
        }
        return *this;
    }
	private:
	void skip_space(){
		while(*p<=' '){
			++p;
		}	
	}
	FILE*f;
	char*bg,*ed,*p;
	struct stat Fl;
}qin(stdin);
struct Qoutf{
    explicit Qoutf(FILE*fi,std::size_t sz=buf_def_size):f(fi),bg(new char[sz]),ed(bg+sz-buf_flush_threshold),p(bg){}
    ~Qoutf(){
		flush();
		delete[] bg;
	}
	void flush(){
		fwrite_unlocked(bg,1,p-bg,f),p=bg;
	}
	Qoutf&operator<<(u32 x){
		if(x>=E8){
			put2(x/E8),x%=E8,putb(x/E4),putb(x%E4);
		}
		else if(x>=E4) {
			put4(x/E4),putb(x%E4);
		}
		else{
			put4(x);
		}
		chk();
		return *this;
	}
	Qoutf&operator<<(u64 x){
		if(x>=E8){
			u64 q0=x/E8,r0=x%E8;
			if(x>=E16){
				u64 q1=q0/E8,r1=q0%E8;
				put4(q1),putb(r1/E4),putb(r1%E4);
			} 
			else if(x>=E12){
				put4(q0/E4),putb(q0%E4);
			}
			else{
				put4(q0);
			}
			putb(r0/E4),putb(r0%E4);
		}
		else{
			if(x>=E4){
				put4(x/E4),putb(x%E4);
			}
			else{
				put4(x);
			}
		}
		chk();
		return *this;
	}
	Qoutf&operator<<(char ch){
		*p++=ch;
		return *this;
	}
    private:
	void putb(u32 x){
		_iot.get(p,x),p+=4;
	}
	void put4(u32 x){
		if(x>99){
			if(x>999){
				putb(x);
			}
			else{
				_iot.get(p,x*10),p+=3;
			}	
		}
		else{
			put2(x);
		}
	}
	void put2(u32 x){
		if(x>9){
			_iot.get(p,x*100),p+=2;
		}
		else{
			*p++=x+'0';
		}
	}
	void chk(){
		if(p>ed)[[unlikely]]{
			flush();
		}
	}
	FILE *f;
	char *bg,*ed,*p;
}qout(stdout);
}
using __yzlf::qin;
using __yzlf::qout;
constexpr const u32 mod=998244353;
void solve(){
    idt n,m;
    qin>>n>>m;
    ++m,++n;
    idt lm=idt(2)<<std::__lg(std::max<idt>(16,n+m-1)-1);
    auto f=alc(lm),g=alc(lm);
    for(idt i=0;i<n;++i){
        qin>>f[i];
    }
    clr(f+n,lm-n);
    for(idt i=0;i<m;++i){
        qin>>g[i];
    }
    clr(g+m,lm-m);
    fntt_info_32 ntt{mod,23,752838388};
    {
        ntt.__dif<false>((I512*)f,lm>>4);
        ntt.__dif<false>((I512*)g,lm>>4);
        ntt.__dot((I512*)f,(I512*)g,lm>>4);
        ntt.__dit<false,true>((I512*)f,lm>>4); 
    }
    for(idt i=0;i<n+m-1;++i){
        qout<<f[i]<<' ';
    }
}
main(){
    solve();
}

详细

Test #1:

score: 20
Accepted
time: 1ms
memory: 3712kb

input:

96 96
600395131 184265451 942971382 534262851 830366882 542271170 294355449 501371170 797809599 964826049 276651245 375755165 662619442 941920605 328216963 507795473 460271147 874920847 818231910 156789488 590591583 732194508 793983630 93566697 836155866 305319153 432040686 621119061 835023373 57138...

output:

683858396 5532883 499734624 910262414 221004044 924081841 392466229 64190174 260661815 939986106 283456690 260629512 990528995 704246427 991946815 236857583 903415172 900324859 938555797 225258152 874945420 516870315 74759441 769850097 353889928 300397164 63689540 115003940 872945378 407694641 91843...

result:

ok 193 numbers

Test #2:

score: 20
Accepted
time: 0ms
memory: 3880kb

input:

4992 4994
471194390 313639917 705341086 119536186 430124603 244978845 185588341 13731324 707132801 88167972 927324568 846658454 523684029 5133605 767200778 502476309 539772401 778154025 266136872 183516351 260704325 49303370 475056182 928574546 740424153 277920667 708439854 746983628 839491869 53579...

output:

700935456 677302967 772159864 479386810 109686665 263919131 29567167 960045078 636326916 585682137 409426717 14510019 441964472 92801447 551536199 216995135 59736203 790078879 55883568 796138076 265361608 66124731 150347029 93682849 205256362 672081205 86396898 573029352 541084997 293480941 90518071...

result:

ok 9987 numbers

Test #3:

score: 20
Accepted
time: 2ms
memory: 4336kb

input:

29995 29992
417238081 53580806 733071257 224121793 786137422 127072245 129083351 988357079 246853229 150935424 596994106 975796660 838029970 619117898 328485797 948485083 574261409 79312345 596346086 489787404 929520168 515647000 211731260 50868568 811515357 428215135 498099163 644485329 802849075 3...

output:

115270920 49832720 758693293 745763567 322999821 510579248 697424729 850661201 678364508 817667211 668544763 136619207 562899653 692811546 351397117 768369036 573254435 891143982 717302438 707939747 41743610 540709722 240732780 931265491 38731999 642520590 630812534 632188732 342954490 225414102 836...

result:

ok 59988 numbers

Test #4:

score: 20
Accepted
time: 6ms
memory: 7476kb

input:

100000 99993
812398607 947396010 797321381 883949986 56052416 586258761 193247973 611124334 773505112 142179482 565466227 140875825 79890768 893500101 553768089 648879319 480419657 915530184 799329430 494818755 793895824 851865180 459534006 259473419 610037701 472768430 868914058 887444584 588850309...

output:

821875273 646409297 701893040 744951544 891720486 338002304 134405948 686576985 653633849 704180950 763960458 160339533 773107048 630019221 467173934 675237413 824356289 394352126 870024535 473719536 246319541 372709664 656104889 677100818 890131281 374587639 160832628 144239351 450760970 646488586 ...

result:

ok 199994 numbers

Test #5:

score: 20
Accepted
time: 38ms
memory: 39100kb

input:

999993 999994
388529697 811245378 165909114 295553883 667981275 78502012 400874009 139394758 249494489 4636487 997712665 259780805 431039016 716944209 709300152 356513646 823185021 699568300 650937921 859190797 899514799 785648601 933470757 627225124 349752104 471458923 456404256 48134357 315599086 ...

output:

199012842 735467570 660520906 870291510 102406003 509914017 591503608 692425397 149848591 232605296 411728228 285507919 90090498 682749099 507720817 425946949 937188332 619041823 738654334 153862895 272311969 793838225 260785140 350903642 151151058 631242104 304026658 123734332 23714740 438936743 77...

result:

ok 1999988 numbers