QOJ.ac

QOJ

ID题目提交者结果用时内存语言文件大小提交时间测评时间
#312198#8010. Hierarchies of JudgeskkioWA 3637ms112060kbC++1735.5kb2024-01-23 16:23:162024-01-23 16:23:17

Judging History

你现在查看的是最新测评结果

  • [2024-01-23 16:23:17]
  • 评测
  • 测评结果:WA
  • 用时:3637ms
  • 内存:112060kb
  • [2024-01-23 16:23:16]
  • 提交

answer

#include <bits/stdc++.h>
//#define Kachang 1
#ifdef Kachang
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,avx2")
#pragma GCC optimize("Ofast","unroll-loops","inline","no-stack-protector")
#else
#pragma GCC optmize("2")
#endif
using namespace std;
namespace Def{
    #define fir first
    #define sec second
    #define lson(i) (tr[i].ls)
    #define rson(i) (tr[i].rs)
    #define FIO(file) freopen(file".in","r",stdin), freopen(file".out","w",stdout)
    #define Untie() ios::sync_with_stdio(0), cin.tie(0),cout.tie(0)
    typedef long long ll;
    typedef double db;
    typedef long double ldb;
    typedef unsigned int uint;
    typedef unsigned long long ull;
    typedef pair<int,int> pii;
    typedef pair<ll,ll> pll;
    typedef __int128_t i128;
    typedef __uint128_t u128;
}
using namespace Def;
namespace FastIO {
    struct IO {
        char ibuf[(1 << 20) + 1], *iS, *iT, obuf[(1 << 20) + 1], *oS;
        IO() : iS(ibuf), iT(ibuf), oS(obuf) {} ~IO() { fwrite(obuf, 1, oS - obuf, stdout); }
        #if ONLINE_JUDGE
        #define gh() (iS == iT ? iT = (iS = ibuf) + fread(ibuf, 1, (1 << 20) + 1, stdin), (iS == iT ? EOF : *iS++) : *iS++)
        #else
        #define gh() getchar()
        #endif
        inline bool eof (const char &ch) { return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t' || ch == EOF; }
        inline long long read() {
            char ch = gh();
            long long x = 0;
            bool t = 0;
            while (ch < '0' || ch > '9') t |= ch == '-', ch = gh();
            while (ch >= '0' && ch <= '9') x = (x << 1) + (x << 3) + (ch ^ 48), ch = gh();
            return t ? ~(x - 1) : x;
        }
        inline void read (char *s) {
            char ch = gh(); int l = 0;
            while (eof(ch)) ch = gh();
            while (!eof(ch)) s[l++] = ch, ch = gh();
            s[l] = 0;
        }
        inline void read (double &x) {
            char ch = gh(); bool t = 0;
            while (ch < '0' || ch > '9') t |= ch == '-', ch = gh();
            while (ch >= '0' && ch <= '9') x = x * 10 + (ch ^ 48), ch = gh();
            if (ch != '.') return t && (x = -x), void(); ch = gh();
            for (double cf = 0.1; '0' <= ch && ch <= '9'; ch = gh(), cf *= 0.1) x += cf * (ch ^ 48);
            t && (x = -x);
        }
        inline void pc (char ch) {
            #ifdef ONLINE_JUDGE
            if (oS == obuf + (1 << 20) + 1) fwrite(obuf, 1, oS - obuf, stdout), oS = obuf; 
            *oS++ = ch;
            #else
            putchar(ch);
            #endif
        }
        inline void write (char *s)
        {
            int len = strlen(s);
            for(int i = 0; i < len; i++)pc(s[i]);
        }
        template<typename _Tp>
        inline void write (_Tp x) {
            static char stk[64], *tp = stk;
            if (x < 0) x = ~(x - 1), pc('-');
            do *tp++ = x % 10, x /= 10;
            while (x);
            while (tp != stk) pc((*--tp) | 48);
        }
        inline void puts(const char *s){
            int len = strlen(s);
            for (int i = 0; i < len; i++)pc(s[i]);
        }
    } io;
    inline long long read () { return io.read(); }
    template<typename Tp>
    inline void read (Tp &x) { io.read(x); }
    template<typename _Tp>
    inline void write (_Tp x) { io.write(x); }
}
using namespace FastIO;
namespace misc{
    constexpr int infi=1e9;
    constexpr int minfi=0x3f3f3f3f;
    constexpr ll infl=1e18;
    constexpr ll minfl=0x3f3f3f3f3f3f3f3f;
    constexpr int MOD=998244353;
    constexpr int inv2=(MOD+1)/2;
    constexpr int inv3=(MOD+1)/3;
    constexpr double eps=1e-6;
    mt19937_64 rnd(0x3408532);
    template<typename T,typename E>
        inline T ksm(T b,E p){T ret=1;while(p){if(p&1)ret=1ll*ret*b%MOD;b=1ll*b*b%MOD;p>>=1;}return ret;}
    template<typename T,typename E,typename R>
        inline T ksm(T b,E p,R mod){T ret=1;while(p){if(p&1)ret=1ll*ret*b%mod;b=1ll*b*b%mod;p>>=1;}return ret;}
    template<typename T> 
        inline T ginv(T v){return ksm(v,MOD-2);}
    template<typename T,typename E>
        inline void cmax(T &a,E b){a<b?(a=b,1):0;}
    template<typename T,typename E>
        inline void cmin(T &a,E b){a>b?(a=b,1):0;}
    template<typename T,typename E>
        inline void cadd(T &a,E b){(a+=b)>=MOD?(a-=MOD):0;}
    template<typename T,typename E>
        inline void csub(T &a,E b){(a-=b)<0?(a+=MOD):0;}
    template<typename T,typename E>
        inline void cmul(T &a,E b){a=(ll)a*b%MOD;}
    template<typename T,typename E>
        inline T madd(T a,E b){return (a+=b)>=MOD?(a-MOD):a;}
    template<typename T,typename E>
        inline T msub(T a,E b){return (a-=b)<0?(a+MOD):a;}
    template<typename T,typename E>
        inline T mmul(T a,E b){return (ll)a*b%MOD;}
    template<typename T>
        struct dseg{T *first,*last;dseg(T* _l,T* _r):first(_l),last(_r){}};
    inline void debug(void){cerr<<'\n';}
    template<typename T,typename... arg>
        inline void debug(dseg<T> A,arg... v){cerr<<"[ ";for(T* i=A.first;i!=A.last;++i)cerr<<*i<<' ';cerr<<"] ";debug(v...);}
    template<typename T,typename... arg>
        inline void debug(T x,arg... r){cerr<<x<<' ';debug(r...);}
    template<typename T>
        inline T randseg(T l,T r){assert(l<=r);return rnd()%(r-l+1)+l;}
    template<typename T>
        inline bool gbit(T v,int bit){return v>>bit&1;}
    template<typename T> 
        inline void FWTXor(T *a,int n){for(int i=2;i<=n;i<<=1)for(int p=i>>1,j=0;j<n;j+=i)for(int k=j;k<j+p;k++){T x=a[k],y=a[k+p];a[k]=madd(x,y),a[k+p]=msub(x,y);}}
    template<typename T> 
        inline void iFWTXor(T *a,int n){for(int i=2;i<=n;i<<=1)for(int p=i>>1,j=0;j<n;j+=i)for(int k=j;k<j+p;k++){T x=a[k],y=a[k+p];a[k]=mmul(madd(x,y),inv2),a[k+p]=mmul(msub(x,y),inv2);}}
    int timest=0;
    inline void record(){timest=clock()*1000/CLOCKS_PER_SEC;}
    inline int timegap(){int nowtime=clock()*1000/CLOCKS_PER_SEC;return nowtime-timest;}
    inline ll gcd(ll a,ll b){if(!b||!a) return a+b;ll az=__builtin_ctz(a),bz=__builtin_ctz(b),z=(az>bz)?bz:az,t;b>>=bz;while(a) a>>=az,t=a-b,az=__builtin_ctz(t),b=a<b?a:b,a=t<0?-t:t;return b<<z;}
    inline ll exgcd(ll a,ll b,ll &x,ll &y){if(!b){x=1,y=0;return a;}ll g=exgcd(b,a%b,y,x);y-=x*(a/b);return g;}
    inline ll Sum1(ll n){return n*(n+1)/2;}
    inline ll Sum2(ll n){return n*(n+1)*(2*n+1)/6;}
    inline ll Sqr(ll n){return n*n;}
    #define binom(n,m) ((n)<0||(m)<0||(n)<(m)?0:1ll*fac[(n)]*ifac[(m)]%mod*ifac[(n)-(m)]%mod)
    #define likely(x) (__builtin_expect(!!(x),1))
    #define unlikely(x) (__builtin_expect(!!(x),0))
}
using namespace misc;
namespace Barret
{
    class reduction
    {
    private:
        __uint128_t brt;
        int mod;
    public:
        reduction(){};
        reduction(int __mod):brt(((__uint128_t)1<<64)/__mod),mod(__mod){}
        inline void setmod(int __mod){brt=((__uint128_t)1<<64)/__mod,mod=__mod;}
        template<typename T> inline void fix(T& val){val-=mod*(brt*val>>64);while(val>=mod)val-=mod;}
        template<typename T> inline int fixv(T val){val-=mod*(brt*val>>64);return val>=mod?val-mod:val;}
    };
}
using namespace Barret;
#include <immintrin.h>
namespace Modint{
    template <uint32_t mod>
    struct LazyMontgomeryModInt {
        using mint = LazyMontgomeryModInt;
        using i32 = int32_t;
        using u32 = uint32_t;
        using u64 = uint64_t;

        static constexpr u32 get_r() {
            u32 ret = mod;
            for (i32 i = 0; i < 4; ++i) ret *= 2 - mod * ret;
            return ret;
        }

        static constexpr u32 r = get_r();
        static constexpr u32 n2 = -u64(mod) % mod;
        static_assert(mod < (1 << 30), "invalid, mod >= 2 ^ 30");
        static_assert((mod & 1) == 1, "invalid, mod % 2 == 0");
        static_assert(r * mod == 1, "this code has bugs.");

        u32 a;

        constexpr LazyMontgomeryModInt() : a(0) {}
        constexpr LazyMontgomeryModInt(const int64_t &b)
            : a(reduce(u64(b % mod + mod) * n2)){};

        static constexpr u32 reduce(const u64 &b) {
            return (b + u64(u32(b) * u32(-r)) * mod) >> 32;
        }

        constexpr mint &operator+=(const mint &b) {
            if (i32(a += b.a - 2 * mod) < 0) a += 2 * mod;
            return *this;
        }

        constexpr mint &operator-=(const mint &b) {
            if (i32(a -= b.a) < 0) a += 2 * mod;
            return *this;
        }

        constexpr mint &operator*=(const mint &b) {
            a = reduce(u64(a) * b.a);
            return *this;
        }

        constexpr mint &operator/=(const mint &b) {
            *this *= b.inverse();
            return *this;
        }

        constexpr mint operator+(const mint &b) const { return mint(*this) += b; }
        constexpr mint operator-(const mint &b) const { return mint(*this) -= b; }
        constexpr mint operator*(const mint &b) const { return mint(*this) *= b; }
        constexpr mint operator/(const mint &b) const { return mint(*this) /= b; }
        constexpr bool operator==(const mint &b) const {
            return (a >= mod ? a - mod : a) == (b.a >= mod ? b.a - mod : b.a);
        }
        constexpr bool operator!=(const mint &b) const {
            return (a >= mod ? a - mod : a) != (b.a >= mod ? b.a - mod : b.a);
        }
        constexpr mint operator-() const { return mint() - mint(*this); }
        constexpr mint operator+() const { return mint(*this); }

        constexpr mint pow(u64 n) const {
            mint ret(1), mul(*this);
            while (n > 0) {
            if (n & 1) ret *= mul;
            mul *= mul;
            n >>= 1;
            }
            return ret;
        }

        constexpr mint inverse() const {
            int x = get(), y = mod, u = 1, v = 0, t = 0, tmp = 0;
            while (y > 0) {
            t = x / y;
            x -= t * y, u -= t * v;
            tmp = x, x = y, y = tmp;
            tmp = u, u = v, v = tmp;
            }
            return mint{u};
        }

        friend ostream &operator<<(ostream &os, const mint &b) {
            return os << b.get();
        }

        friend istream &operator>>(istream &is, mint &b) {
            int64_t t;
            is >> t;
            b = LazyMontgomeryModInt<mod>(t);
            return (is);
        }

        constexpr u32 get() const {
            u32 ret = reduce(a);
            return ret >= mod ? ret - mod : ret;
        }

        static constexpr u32 get_mod() { return mod; }
    };

}
namespace nttsse42{
    __attribute__((target("sse4.2"))) inline __m128i my128_mullo_epu32(
        const __m128i &a, const __m128i &b) {
    return _mm_mullo_epi32(a, b);
    }

    __attribute__((target("sse4.2"))) inline __m128i my128_mulhi_epu32(
        const __m128i &a, const __m128i &b) {
    __m128i a13 = _mm_shuffle_epi32(a, 0xF5);
    __m128i b13 = _mm_shuffle_epi32(b, 0xF5);
    __m128i prod02 = _mm_mul_epu32(a, b);
    __m128i prod13 = _mm_mul_epu32(a13, b13);
    __m128i prod = _mm_unpackhi_epi64(_mm_unpacklo_epi32(prod02, prod13),
                                        _mm_unpackhi_epi32(prod02, prod13));
    return prod;
    }

    __attribute__((target("sse4.2"))) inline __m128i montgomery_mul_128(
        const __m128i &a, const __m128i &b, const __m128i &r, const __m128i &m1) {
    return _mm_sub_epi32(
        _mm_add_epi32(my128_mulhi_epu32(a, b), m1),
        my128_mulhi_epu32(my128_mullo_epu32(my128_mullo_epu32(a, b), r), m1));
    }

    __attribute__((target("sse4.2"))) inline __m128i montgomery_add_128(
        const __m128i &a, const __m128i &b, const __m128i &m2, const __m128i &m0) {
    __m128i ret = _mm_sub_epi32(_mm_add_epi32(a, b), m2);
    return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
    }

    __attribute__((target("sse4.2"))) inline __m128i montgomery_sub_128(
        const __m128i &a, const __m128i &b, const __m128i &m2, const __m128i &m0) {
    __m128i ret = _mm_sub_epi32(a, b);
    return _mm_add_epi32(_mm_and_si128(_mm_cmpgt_epi32(m0, ret), m2), ret);
    }

    __attribute__((target("avx2"))) inline __m256i my256_mullo_epu32(
        const __m256i &a, const __m256i &b) {
    return _mm256_mullo_epi32(a, b);
    }

    __attribute__((target("avx2"))) inline __m256i my256_mulhi_epu32(
        const __m256i &a, const __m256i &b) {
    __m256i a13 = _mm256_shuffle_epi32(a, 0xF5);
    __m256i b13 = _mm256_shuffle_epi32(b, 0xF5);
    __m256i prod02 = _mm256_mul_epu32(a, b);
    __m256i prod13 = _mm256_mul_epu32(a13, b13);
    __m256i prod = _mm256_unpackhi_epi64(_mm256_unpacklo_epi32(prod02, prod13),
                                        _mm256_unpackhi_epi32(prod02, prod13));
    return prod;
    }

    __attribute__((target("avx2"))) inline __m256i montgomery_mul_256(
        const __m256i &a, const __m256i &b, const __m256i &r, const __m256i &m1) {
    return _mm256_sub_epi32(
        _mm256_add_epi32(my256_mulhi_epu32(a, b), m1),
        my256_mulhi_epu32(my256_mullo_epu32(my256_mullo_epu32(a, b), r), m1));
    }

    __attribute__((target("avx2"))) inline __m256i montgomery_add_256(
        const __m256i &a, const __m256i &b, const __m256i &m2, const __m256i &m0) {
    __m256i ret = _mm256_sub_epi32(_mm256_add_epi32(a, b), m2);
    return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
                            ret);
    }

    __attribute__((target("avx2"))) inline __m256i montgomery_sub_256(
        const __m256i &a, const __m256i &b, const __m256i &m2, const __m256i &m0) {
    __m256i ret = _mm256_sub_epi32(a, b);
    return _mm256_add_epi32(_mm256_and_si256(_mm256_cmpgt_epi32(m0, ret), m2),
                            ret);
    }
    constexpr int SZ_FFT_BUF = 1 << 23;
    uint32_t buf1_[SZ_FFT_BUF] __attribute__((aligned(64)));
    uint32_t buf2_[SZ_FFT_BUF] __attribute__((aligned(64)));
    template <typename mint>
    struct NTT {
        
    
        static constexpr uint32_t get_pr() {
            uint32_t mod = mint::get_mod();
            using u64 = uint64_t;
            u64 ds[32] = {};
            int idx = 0;
            u64 m = mod - 1;
            for (u64 i = 2; i * i <= m; ++i) {
            if (m % i == 0) {
                ds[idx++] = i;
                while (m % i == 0) m /= i;
            }
            }
            if (m != 1) ds[idx++] = m;

            uint32_t pr = 2;
            while (1) {
            int flg = 1;
            for (int i = 0; i < idx; ++i) {
                u64 a = pr, b = (mod - 1) / ds[i], r = 1;
                while (b) {
                if (b & 1) r = r * a % mod;
                a = a * a % mod;
                b >>= 1;
                }
                if (r == 1) {
                flg = 0;
                break;
                }
            }
            if (flg == 1) break;
            ++pr;
            }
            return pr;
        };

        static constexpr uint32_t mod = mint::get_mod();
        static constexpr uint32_t pr = get_pr();
        static constexpr int level = __builtin_ctzll(mod - 1);
        mint dw[level], dy[level];
        mint *buf1, *buf2;

        NTT() {
            setwy(level);
            buf1 = reinterpret_cast<mint *>(buf1_);
            buf2 = reinterpret_cast<mint *>(buf2_);
        }

        constexpr void setwy(int k) {
            mint w[level], y[level];
            w[k - 1] = mint(pr).pow((mod - 1) / (1 << k));
            y[k - 1] = w[k - 1].inverse();
            for (int i = k - 2; i > 0; --i)
            w[i] = w[i + 1] * w[i + 1], y[i] = y[i + 1] * y[i + 1];
            dw[1] = w[1], dy[1] = y[1], dw[2] = w[2], dy[2] = y[2];
            for (int i = 3; i < k; ++i) {
            dw[i] = dw[i - 1] * y[i - 2] * w[i];
            dy[i] = dy[i - 1] * w[i - 2] * y[i];
            }
        }

        __attribute__((target("sse4.2"))) void ntt(mint *a, int n) {
            int k = n ? __builtin_ctz(n) : 0;
            if (k == 0) return;
            if (k == 1) {
            mint a1 = a[1];
            a[1] = a[0] - a[1];
            a[0] = a[0] + a1;
            return;
            }
            if (k & 1) {
            int v = 1 << (k - 1);
            for (int j = 0; j < v; ++j) {
                mint ajv = a[j + v];
                a[j + v] = a[j] - ajv;
                a[j] += ajv;
            }
            }
            int u = 1 << (2 + (k & 1));
            int v = 1 << (k - 2 - (k & 1));
            mint one = mint(1);
            mint imag = dw[1];
            const __m128i m0 = _mm_set1_epi32(0);
            const __m128i m1 = _mm_set1_epi32(mod);
            const __m128i m2 = _mm_set1_epi32(mod + mod);
            const __m128i r = _mm_set1_epi32(mint::r);
            const __m128i Imag = _mm_set1_epi32(imag.a);
            while (v) {
            if (v == 1) {
                mint ww = one, xx = one, wx = one;
                for (int jh = 0; jh < u;) {
                ww = xx * xx, wx = ww * xx;
                mint t0 = a[jh + 0], t1 = a[jh + 1] * xx;
                mint t2 = a[jh + 2] * ww, t3 = a[jh + 3] * wx;
                mint t0p2 = t0 + t2, t1p3 = t1 + t3;
                mint t0m2 = t0 - t2, t1m3 = (t1 - t3) * imag;
                a[jh + 0] = t0p2 + t1p3, a[jh + 1] = t0p2 - t1p3;
                a[jh + 2] = t0m2 + t1m3, a[jh + 3] = t0m2 - t1m3;
                xx *= dw[__builtin_ctz((jh += 4))];
                }
            } else {
                mint ww = one, xx = one, wx = one;
                for (int jh = 0; jh < u;) {
                if (jh == 0) {
                    int j0 = 0;
                    int j1 = v;
                    int j2 = j1 + v;
                    int j3 = j2 + v;
                    int je = v;
                    for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
                    __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
                    __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
                    __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
                    __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
                    __m128i T0P2 = montgomery_add_128(T0, T2, m2, m0);
                    __m128i T1P3 = montgomery_add_128(T1, T3, m2, m0);
                    __m128i T0M2 = montgomery_sub_128(T0, T2, m2, m0);
                    __m128i T1M3 =
                        montgomery_mul_128(montgomery_sub_128(T1, T3, m2, m0), Imag, r, m1);
                    _mm_storeu_si128((__m128i *)(a + j0),
                                    montgomery_add_128(T0P2, T1P3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j1),
                                    montgomery_sub_128(T0P2, T1P3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j2),
                                    montgomery_add_128(T0M2, T1M3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j3),
                                    montgomery_sub_128(T0M2, T1M3, m2, m0));
                    }
                } else {
                    ww = xx * xx, wx = ww * xx;
                    __m128i WW = _mm_set1_epi32(ww.a);
                    __m128i WX = _mm_set1_epi32(wx.a);
                    __m128i XX = _mm_set1_epi32(xx.a);
                    int j0 = jh * v;
                    int j1 = j0 + v;
                    int j2 = j1 + v;
                    int j3 = j2 + v;
                    int je = j1;
                    for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
                    __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
                    __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
                    __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
                    __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
                    T1 = montgomery_mul_128(T1, XX, r, m1);
                    T2 = montgomery_mul_128(T2, WW, r, m1);
                    T3 = montgomery_mul_128(T3, WX, r, m1);
                    __m128i T0P2 = montgomery_add_128(T0, T2, m2, m0);
                    __m128i T1P3 = montgomery_add_128(T1, T3, m2, m0);
                    __m128i T0M2 = montgomery_sub_128(T0, T2, m2, m0);
                    __m128i T1M3 =
                        montgomery_mul_128(montgomery_sub_128(T1, T3, m2, m0), Imag, r, m1);
                    _mm_storeu_si128((__m128i *)(a + j0),
                                    montgomery_add_128(T0P2, T1P3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j1),
                                    montgomery_sub_128(T0P2, T1P3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j2),
                                    montgomery_add_128(T0M2, T1M3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j3),
                                    montgomery_sub_128(T0M2, T1M3, m2, m0));
                    }
                }
                xx *= dw[__builtin_ctz((jh += 4))];
                }
            }
            u <<= 2;
            v >>= 2;
            }
        }

        __attribute__((target("sse4.2"))) void intt(mint *a, int n,
                                                    int normalize = true) {
            int k = n ? __builtin_ctz(n) : 0;
            if (k == 0) return;
            if (k == 1) {
            mint a1 = a[1];
            a[1] = a[0] - a[1];
            a[0] = a[0] + a1;
            return;
            }
            int u = 1 << (k - 2);
            int v = 1;
            mint one = mint(1);
            mint imag = dy[1];
            const __m128i m0 = _mm_set1_epi32(0);
            const __m128i m1 = _mm_set1_epi32(mod);
            const __m128i m2 = _mm_set1_epi32(mod + mod);
            const __m128i r = _mm_set1_epi32(mint::r);
            const __m128i Imag = _mm_set1_epi32(imag.a);
            while (u) {
            if (v == 1) {
                mint ww = one, xx = one, yy = one;
                u <<= 2;
                for (int jh = 0; jh < u;) {
                ww = xx * xx, yy = xx * imag;
                mint t0 = a[jh + 0], t1 = a[jh + 1];
                mint t2 = a[jh + 2], t3 = a[jh + 3];
                mint t0p1 = t0 + t1, t2p3 = t2 + t3;
                mint t0m1 = (t0 - t1) * xx, t2m3 = (t2 - t3) * yy;
                a[jh + 0] = t0p1 + t2p3, a[jh + 2] = (t0p1 - t2p3) * ww;
                a[jh + 1] = t0m1 + t2m3, a[jh + 3] = (t0m1 - t2m3) * ww;
                xx *= dy[__builtin_ctz(jh += 4)];
                }
            } else {
                mint ww = one, xx = one, yy = one;
                u <<= 2;
                for (int jh = 0; jh < u;) {
                if (jh == 0) {
                    int j0 = 0;
                    int j1 = v;
                    int j2 = v + v;
                    int j3 = j2 + v;
                    for (; j0 < v; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
                    __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
                    __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
                    __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
                    __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
                    __m128i T0P1 = montgomery_add_128(T0, T1, m2, m0);
                    __m128i T2P3 = montgomery_add_128(T2, T3, m2, m0);
                    __m128i T0M1 = montgomery_sub_128(T0, T1, m2, m0);
                    __m128i T2M3 =
                        montgomery_mul_128(montgomery_sub_128(T2, T3, m2, m0), Imag, r, m1);
                    _mm_storeu_si128((__m128i *)(a + j0),
                                    montgomery_add_128(T0P1, T2P3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j2),
                                    montgomery_sub_128(T0P1, T2P3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j1),
                                    montgomery_add_128(T0M1, T2M3, m2, m0));
                    _mm_storeu_si128((__m128i *)(a + j3),
                                    montgomery_sub_128(T0M1, T2M3, m2, m0));
                    }
                } else {
                    ww = xx * xx, yy = xx * imag;
                    __m128i WW = _mm_set1_epi32(ww.a);
                    __m128i XX = _mm_set1_epi32(xx.a);
                    __m128i YY = _mm_set1_epi32(yy.a);
                    int j0 = jh * v;
                    int j1 = j0 + v;
                    int j2 = j1 + v;
                    int j3 = j2 + v;
                    int je = j1;
                    for (; j0 < je; j0 += 4, j1 += 4, j2 += 4, j3 += 4) {
                    __m128i T0 = _mm_loadu_si128((__m128i *)(a + j0));
                    __m128i T1 = _mm_loadu_si128((__m128i *)(a + j1));
                    __m128i T2 = _mm_loadu_si128((__m128i *)(a + j2));
                    __m128i T3 = _mm_loadu_si128((__m128i *)(a + j3));
                    __m128i T0P1 = montgomery_add_128(T0, T1, m2, m0);
                    __m128i T2P3 = montgomery_add_128(T2, T3, m2, m0);
                    __m128i T0M1 =
                        montgomery_mul_128(montgomery_sub_128(T0, T1, m2, m0), XX, r, m1);
                    __m128i T2M3 =
                        montgomery_mul_128(montgomery_sub_128(T2, T3, m2, m0), YY, r, m1);
                    _mm_storeu_si128((__m128i *)(a + j0),
                                    montgomery_add_128(T0P1, T2P3, m2, m0));
                    _mm_storeu_si128(
                        (__m128i *)(a + j2),
                        montgomery_mul_128(montgomery_sub_128(T0P1, T2P3, m2, m0), WW, r,
                                        m1));
                    _mm_storeu_si128((__m128i *)(a + j1),
                                    montgomery_add_128(T0M1, T2M3, m2, m0));
                    _mm_storeu_si128(
                        (__m128i *)(a + j3),
                        montgomery_mul_128(montgomery_sub_128(T0M1, T2M3, m2, m0), WW, r,
                                        m1));
                    }
                }
                xx *= dy[__builtin_ctz(jh += 4)];
                }
            }
            u >>= 4;
            v <<= 2;
            }
            if (k & 1) {
            u = 1 << (k - 1);
            for (int j = 0; j < u; ++j) {
                mint ajv = a[j] - a[j + u];
                a[j] += a[j + u];
                a[j + u] = ajv;
            }
            }
            if (normalize) {
            mint invn = one / mint(n);
            for (int i = 0; i < n; i++) a[i] *= invn;
            }
        }

        constexpr vector<mint> multiply(const vector<mint> &a,
                                        const vector<mint> &b) {
            int l = a.size() + b.size() - 1;
            if (min<int>(a.size(), b.size()) <= 40) {
            vector<mint> s(l);
            for (int i = 0; i < (int)a.size(); ++i)
                for (int j = 0; j < (int)b.size(); ++j) s[i + j] += a[i] * b[j];
            return s;
            }
            int M = 4;
            while (M < l) M <<= 1;
            for (int i = 0; i < (int)a.size(); ++i) buf1[i].a = a[i].a;
            for (int i = (int)a.size(); i < M; ++i) buf1[i].a = 0;
            for (int i = 0; i < (int)b.size(); ++i) buf2[i].a = b[i].a;
            for (int i = (int)b.size(); i < M; ++i) buf2[i].a = 0;
            ntt(buf1, M);
            ntt(buf2, M);
            for (int i = 0; i < M; ++i)
            buf1[i].a = mint::reduce(uint64_t(buf1[i].a) * buf2[i].a);
            intt(buf1, M, false);
            vector<mint> s(l);
            mint invm = mint(M).inverse();
            for (int i = 0; i < l; ++i) s[i] = buf1[i] * invm;
            return s;
        }
    };
}
namespace ZPoly
{
using mint=Modint::LazyMontgomeryModInt<998244353>;
nttsse42::NTT<mint> Calculator;
using LL=long long;
constexpr int MOD=998244353,G=114514,MAXN=1<<21;
inline mint qpow(mint a,LL b) { mint r=1;for(;b;(b&1)?r=r*a:0,a=a*a,b>>=1);return r; }
inline mint madd(mint x) { return x; }
inline mint mmul(mint x) { return x; }
inline mint msub(mint x,mint y) { return x-y; }
inline mint mdiv(mint x,mint y) { return x*qpow(y,MOD-2); }
template<typename ...Args>inline mint madd(mint x,Args ...y) { return x+=madd(y...); }
template<typename ...Args>inline mint mmul(mint x,Args ...y) { return x*mmul(y...); }
class Polynomial
{
 private:
	static constexpr int NTT_LIM=180;
	static mint g[MAXN+5],c1[MAXN+5],c2[MAXN+5];
	int deg;
	vector<mint> c;
 public:
	static void init()
	{
        mint gn;
		for(int i=2;i<=MAXN;i<<=1)
		{
			g[i>>1]=1,gn=qpow(G,(MOD-1)/i);
			for(int j=(i>>1)+1;j<i;j++) g[j]=mmul(g[j-1],gn);
		}
	}
	static void DIT(mint *a,int len)
	{Calculator.ntt(a,len);}
	static void DIF(mint *a,int len)
	{Calculator.intt(a,len);}
 private:
	static void __polyinv(const mint *a,mint *b,int len)
	{
		if(len==1) return b[0]=qpow(a[0],MOD-2),void();
		__polyinv(a,b,(len+1)>>1);
		int nn=1<<(__lg((len<<1)-1)+1);
		memcpy(c1,a,len<<2);
		memset(b+len,0,(nn-len)<<2);
		memset(c1+len,0,(nn-len)<<2);
		DIT(b,nn),DIT(c1,nn);
		for(int i=0;i<nn;i++) b[i]=mmul(b[i],msub(2,mmul(b[i],c1[i])));
		DIF(b,nn),memset(b+len,0,(nn-len)<<2);
	}
	static void __polyln(const mint *a,mint *b,int len)
	{
		__polyinv(a,b,len);
		for(int i=1;i<len;i++) c1[i-1]=mmul(i,a[i]);
		int nn=1<<(__lg((len<<1)-1)+1);
		memset(b+len,0,(nn-len)<<2);
		memset(c1+len,0,(nn-len)<<2);
		DIT(b,nn),DIT(c1,nn);
		for(int i=0;i<nn;i++) b[i]=mmul(b[i],c1[i]);
		DIF(b,nn),memset(b+len,0,(nn-len)<<2);
		for(int i=len-1;i>0;i--) b[i]=mdiv(b[i-1],i);
		b[0]=0;
	}
	static void __polyexp(const mint *a,mint *b,int l,int r)
	{
		if(l==r-1) return b[l]=(l?mdiv(b[l],l):1),void();
		int len=r-l,mid=(l+r)>>1;
		__polyexp(a,b,l,mid);
		for(int i=0;i<len;i++) c1[i]=a[i];
		memcpy(c2,b+l,(mid-l)<<2);
		memset(c2+mid-l,0,(r-mid)<<2);
		if(len<=NTT_LIM) for(int i=len-1;i>=0;i--)
		{
			c1[i]=mmul(c1[i],c2[0]);
			for(int j=0;j<i;j++) c1[i]=madd(c1[i],mmul(c1[j],c2[i-j]));
		}
		else
		{
			DIT(c1,len),DIT(c2,len);
			for(int i=0;i<len;i++) c1[i]=mmul(c1[i],c2[i]);
			DIF(c1,len);
		}
		for(int i=mid;i<r;i++) b[i]=madd(b[i],c1[i-l]);
		__polyexp(a,b,mid,r);
	}
 public:
	Polynomial(): deg(1),c(1){}
	Polynomial(const Polynomial &p): deg(p.deg),c(p.c){}
	Polynomial(Polynomial &&p): deg(p.deg),c(move(p.c)){}
	explicit Polynomial(int d): deg(d),c(d){}
	explicit Polynomial(const vector<mint> &v): deg(v.size()),c(v){}
	explicit Polynomial(const initializer_list<mint> &l): deg(l.size()),c(l){}
	inline mint &operator [](int i) { return c[i]; }
	inline mint operator [](int i)const { return c[i]; }
	inline int degree()const { return deg; }
	inline void resize(int d) { c.resize(deg=d); }
	inline Polynomial &operator +=(const Polynomial &p)
	{
		if(deg<p.deg) resize(p.deg);
		for(int i=0;i<p.deg;i++) c[i]=madd(c[i],p[i]);
		return *this;
	}
	inline Polynomial &operator -=(const Polynomial &p)
	{
		if(deg<p.deg) resize(p.deg);
		for(int i=0;i<p.deg;i++) c[i]=msub(c[i],p[i]);
		return *this;
	}
	inline Polynomial &operator *=(const Polynomial &p)
	{
		int n=deg,m=p.deg;resize(n+m-1);
		if(n+m<NTT_LIM)
		{
			memcpy(c1,c.data(),n<<2);
			memset(c2,0,(n+m-1)<<2);
			for(int i=0;i<n;i++)
				for(int j=0;j<m;j++)
					c2[i+j]=madd(c2[i+j],mmul(c1[i],p[j]));
			memcpy(c.data(),c2,(n+m-1)<<2);
		}
		else
		{
			int nn=1<<(__lg(n+m-1)+1);
			memcpy(c1,c.data(),n<<2),memcpy(c2,p.c.data(),m<<2);
			memset(c1+n,0,(nn-n)<<2),memset(c2+m,0,(nn-m)<<2);
			DIT(c1,nn),DIT(c2,nn);
			for(int i=0;i<nn;i++) c1[i]=mmul(c1[i],c2[i]);
			DIF(c1,nn),memcpy(c.data(),c1,deg<<2);
		}
		return *this;
	}
	friend inline Polynomial derivative(const Polynomial &p)
	{
		Polynomial q(p.deg-1);
		for(int i=1;i<p.deg;i++) q[i-1]=mmul(p[i],i);
		return q;
	}
	friend inline Polynomial integral(const Polynomial &p)
	{
		Polynomial q(p.deg+1);
		for(int i=1;i<p.deg;i++) q[i+1]=mdiv(p[i],i+1);
		return q;
	}
	inline Polynomial inv()const
	{
		if(c[0]==0) cerr<<"[x^0]f(x)=0, f(x)^-1 doesn't exist.\n",abort();
		int nn=1<<(__lg((deg<<1)-1)+1);
		Polynomial q(nn);
		__polyinv(c.data(),q.c.data(),deg);
		return q.resize(deg),q;
	}
	friend inline Polynomial ln(const Polynomial &p)
	{
		if(p[0]!=1) cerr<<"[x^0]f(x)!=1, ln(f(x)) doesn't exist.\n",abort();
		int nn=1<<(__lg((p.deg<<1)-1)+1);
		Polynomial q(nn);
		__polyln(p.c.data(),q.c.data(),p.deg);
		return q.resize(p.deg),q;
	}
	friend inline Polynomial exp(const Polynomial &p)
	{
		if(p[0]!=0) cerr<<"[x^0]f(x)!=0, exp(f(x)) doesn't exist.\n",abort();
		static mint c[MAXN];
		int nn=1<<(__lg(p.deg-1)+1);
		for(int i=0;i<p.deg;i++) c[i]=mmul(i,p[i]).get();
		Polynomial q(nn);
		__polyexp(c,q.c.data(),0,nn);
		return q.resize(p.deg),q;
	}
	friend inline pair<Polynomial,Polynomial> div(const Polynomial &f,const Polynomial &g)
	{
		if(f.deg<g.deg) return make_pair(Polynomial{0},f);
		int n=f.deg-1,m=g.deg-1;
		Polynomial fr(n+1),gr(m+1);
		for(int i=0;i<=n;i++) fr[i]=f[n-i];
		for(int i=0;i<=m;i++) gr[i]=g[m-i];
		fr.resize(n-m+1),gr.resize(n-m+1),fr*=gr.inv();
		fr.resize(n-m+1),reverse(fr.c.begin(),fr.c.end());
		gr=f-fr*g,gr.resize(m);
		return make_pair(fr,gr);
	}
	inline Polynomial &operator =(const Polynomial &p)
		{ return deg=p.deg,c=p.c,*this; }
	inline Polynomial &operator =(Polynomial &&p)
		{ return deg=p.deg,c=move(p.c),*this; }
	inline Polynomial &operator *=(int k)
		{ for(auto &i: c) i=mmul(i,k);return *this; }
	inline Polynomial &operator /=(const Polynomial &rhs)
		{ return (*this)*=rhs.inv(); }
	inline Polynomial &operator %=(const Polynomial &rhs)
		{ return (*this)=div(*this,rhs).second; }
	inline Polynomial operator +(const Polynomial &rhs)const
		{ return Polynomial(*this)+=rhs; }
	inline Polynomial operator -(const Polynomial &rhs)const
		{ return Polynomial(*this)-=rhs; }
	inline Polynomial operator *(const Polynomial &rhs)const
		{ return Polynomial(*this)*=rhs; }
	inline Polynomial operator /(const Polynomial &rhs)const
		{ return Polynomial(*this)/=rhs; }
	inline Polynomial operator %(const Polynomial &rhs)const
		{ return div(*this,rhs).second; }
	friend inline Polynomial operator *(const Polynomial &p,int k)
		{ return Polynomial(p)*=k; }
	friend inline Polynomial operator *(int k,const Polynomial &p)
		{ return Polynomial(p)*=k; } 
};
mint Polynomial::g[]={},Polynomial::c1[]={},Polynomial::c2[]={};
}


const int mod=998244353;

using Poly=ZPoly::Polynomial;
Poly x1({0,1}),x0({1});
Poly G0(Poly F0,Poly F1,Poly e01,Poly e1)
{return x1*(e01-e1)-F0*F0+F0;}
Poly G1(Poly F0,Poly F1,Poly e01,Poly e1)
{return x1*(F0*F0*e01-e1)-F0*F1+F1;}
Poly dr00(Poly F0,Poly F1,Poly e01,Poly e1)
{return x1*F1*e01-F0*2+x0;}
Poly dr01(Poly F0,Poly F1,Poly e01,Poly e1)
{return x1*(F0*e01-e1);}
Poly dr10(Poly F0,Poly F1,Poly e01,Poly e1)
{return x1*(2*F0*e01+F0*F0*F1*e01)-F1;}
Poly dr11(Poly F0,Poly F1,Poly e01,Poly e1)
{return x1*(F0*F0*F0*e01-e1)-F0+x0;}
pair<Poly,Poly> newton(int n)
{
    if(n==1)return {Poly({0}),Poly({0})};
    int m=(n+1)/2;
    Poly F0,F1;
    tie(F0,F1)=newton(m);F0.resize(n);F1.resize(n);
    x0.resize(n);x1.resize(n);
    Poly e01=exp(F0*F1);e01.resize(n);
    Poly e1=exp(F1);e1.resize(n);
    Poly d00=dr00(F0,F1,e01,e1),d01=dr01(F0,F1,e01,e1),d10=dr10(F0,F1,e01,e1),d11=dr11(F0,F1,e01,e1);
    Poly g0=G0(F0,F1,e01,e1),g1=G1(F0,F1,e01,e1);
    Poly invd=(d00*d11-d01*d10).inv();
    Poly nxtF0=F0-(g0*d11-g1*d01)*invd;
    Poly nxtF1=F1-(g1*d00-g0*d10)*invd;
    nxtF0.resize(n),nxtF1.resize(n);
    return {nxtF0,nxtF1};
}
int main()
{
    Poly::init();
    Poly F0,F1;
    int n;
    cin>>n;
    tie(F0,F1)=newton(n+1);
    int sum=(F0[n]+F1[n]).get();
    for(int i=1;i<=n;i++)cmul(sum,i);
    cout<<sum<<'\n';
}

詳細信息

Test #1:

score: 100
Accepted
time: 10ms
memory: 20068kb

input:

1

output:

1

result:

ok 1 number(s): "1"

Test #2:

score: 0
Accepted
time: 7ms
memory: 20364kb

input:

3

output:

24

result:

ok 1 number(s): "24"

Test #3:

score: 0
Accepted
time: 7ms
memory: 20364kb

input:

5

output:

3190

result:

ok 1 number(s): "3190"

Test #4:

score: 0
Accepted
time: 11ms
memory: 20172kb

input:

100

output:

413875584

result:

ok 1 number(s): "413875584"

Test #5:

score: 0
Accepted
time: 7ms
memory: 20068kb

input:

1

output:

1

result:

ok 1 number(s): "1"

Test #6:

score: 0
Accepted
time: 10ms
memory: 20076kb

input:

2

output:

4

result:

ok 1 number(s): "4"

Test #7:

score: 0
Accepted
time: 7ms
memory: 20152kb

input:

3

output:

24

result:

ok 1 number(s): "24"

Test #8:

score: 0
Accepted
time: 10ms
memory: 20064kb

input:

4

output:

236

result:

ok 1 number(s): "236"

Test #9:

score: 0
Accepted
time: 7ms
memory: 20132kb

input:

5

output:

3190

result:

ok 1 number(s): "3190"

Test #10:

score: 0
Accepted
time: 10ms
memory: 20080kb

input:

6

output:

55182

result:

ok 1 number(s): "55182"

Test #11:

score: 0
Accepted
time: 3ms
memory: 20156kb

input:

7

output:

1165220

result:

ok 1 number(s): "1165220"

Test #12:

score: 0
Accepted
time: 10ms
memory: 20168kb

input:

8

output:

29013896

result:

ok 1 number(s): "29013896"

Test #13:

score: 0
Accepted
time: 7ms
memory: 20364kb

input:

9

output:

832517514

result:

ok 1 number(s): "832517514"

Test #14:

score: 0
Accepted
time: 0ms
memory: 20168kb

input:

10

output:

96547079

result:

ok 1 number(s): "96547079"

Test #15:

score: 0
Accepted
time: 10ms
memory: 20080kb

input:

11

output:

296100513

result:

ok 1 number(s): "296100513"

Test #16:

score: 0
Accepted
time: 7ms
memory: 20144kb

input:

12

output:

672446962

result:

ok 1 number(s): "672446962"

Test #17:

score: 0
Accepted
time: 10ms
memory: 20084kb

input:

13

output:

986909297

result:

ok 1 number(s): "986909297"

Test #18:

score: 0
Accepted
time: 3ms
memory: 20340kb

input:

14

output:

306542229

result:

ok 1 number(s): "306542229"

Test #19:

score: 0
Accepted
time: 10ms
memory: 20092kb

input:

15

output:

8548107

result:

ok 1 number(s): "8548107"

Test #20:

score: 0
Accepted
time: 7ms
memory: 20092kb

input:

16

output:

773960239

result:

ok 1 number(s): "773960239"

Test #21:

score: 0
Accepted
time: 7ms
memory: 20184kb

input:

17

output:

611627547

result:

ok 1 number(s): "611627547"

Test #22:

score: 0
Accepted
time: 10ms
memory: 20388kb

input:

18

output:

91793980

result:

ok 1 number(s): "91793980"

Test #23:

score: 0
Accepted
time: 3ms
memory: 20352kb

input:

19

output:

689202618

result:

ok 1 number(s): "689202618"

Test #24:

score: 0
Accepted
time: 10ms
memory: 20112kb

input:

20

output:

605957782

result:

ok 1 number(s): "605957782"

Test #25:

score: 0
Accepted
time: 208ms
memory: 29948kb

input:

10000

output:

713782215

result:

ok 1 number(s): "713782215"

Test #26:

score: 0
Accepted
time: 421ms
memory: 35596kb

input:

20000

output:

337916836

result:

ok 1 number(s): "337916836"

Test #27:

score: 0
Accepted
time: 484ms
memory: 42888kb

input:

30000

output:

580803285

result:

ok 1 number(s): "580803285"

Test #28:

score: 0
Accepted
time: 907ms
memory: 51864kb

input:

40000

output:

732660392

result:

ok 1 number(s): "732660392"

Test #29:

score: 0
Accepted
time: 937ms
memory: 57276kb

input:

50000

output:

660835595

result:

ok 1 number(s): "660835595"

Test #30:

score: 0
Accepted
time: 990ms
memory: 62816kb

input:

60000

output:

323452070

result:

ok 1 number(s): "323452070"

Test #31:

score: 0
Accepted
time: 1755ms
memory: 72192kb

input:

70000

output:

307315915

result:

ok 1 number(s): "307315915"

Test #32:

score: 0
Accepted
time: 1862ms
memory: 78680kb

input:

80000

output:

951757567

result:

ok 1 number(s): "951757567"

Test #33:

score: 0
Accepted
time: 1963ms
memory: 84028kb

input:

90000

output:

426123208

result:

ok 1 number(s): "426123208"

Test #34:

score: 0
Accepted
time: 1960ms
memory: 90000kb

input:

100000

output:

827418228

result:

ok 1 number(s): "827418228"

Test #35:

score: 0
Accepted
time: 2077ms
memory: 96012kb

input:

110000

output:

541614559

result:

ok 1 number(s): "541614559"

Test #36:

score: 0
Accepted
time: 2054ms
memory: 100752kb

input:

120000

output:

184688986

result:

ok 1 number(s): "184688986"

Test #37:

score: 0
Accepted
time: 2062ms
memory: 107828kb

input:

130000

output:

898089371

result:

ok 1 number(s): "898089371"

Test #38:

score: -100
Wrong Answer
time: 3637ms
memory: 112060kb

input:

140000

output:

914010165

result:

wrong answer 1st numbers differ - expected: '949540221', found: '914010165'