QOJ.ac

QOJ

ID题目提交者结果用时内存语言文件大小提交时间测评时间
#420373#8721. 括号序列alpha1022AC ✓196ms21552kbC++1447.7kb2024-05-24 16:53:112024-05-24 16:53:14

Judging History

你现在查看的是最新测评结果

  • [2024-05-24 16:53:14]
  • 评测
  • 测评结果:AC
  • 用时:196ms
  • 内存:21552kb
  • [2024-05-24 16:53:11]
  • 提交

answer

#include <bits/stdc++.h>

using ll = long long;

using namespace std;

const int mod = 998244353, K = 23, Root = 31;
int norm(int x) { return x >= mod ? x - mod : x; }
int reduce(int x) { return x < 0 ? x + mod : x; }
int neg(int x) { return x ? mod - x : 0; }
int quo2(int x) { return (x + (x & 1 ? mod : 0)) >> 1; }
void add(int &x, int y) { x += y, x = x >= mod ? x - mod : x; }
void sub(int &x, int y) { x -= y, x = x < 0 ? x + mod : x; }
void fam(int &x, int y, int z) { x = (x + (ll)y * z) % mod; }
int mpow(int a, int b) {
  int ret = 1;
  for (; b; b >>= 1) {
    if (b & 1) ret = (ll)ret * a % mod;
    a = (ll)a * a % mod;
  }
  return ret;
}

const int BRUTE_N2_LIMIT = 50;

struct NumberTheory {
  typedef pair<int, int> P2_Field;
  mt19937 rng;
  NumberTheory(): rng(chrono::steady_clock::now().time_since_epoch().count()) {}
  void exGcd(int a, int b, int &x, int &y) {
    if (!b) {
      x = 1, y = 0;
      return;
    }
    exGcd(b, a % b, y, x), y -= a / b * x;
  }
  int inv(int a, int p = mod) {
    int x, y;
    exGcd(a, p, x, y);
    if (x < 0) x += p;
    return x;
  }
  template<class Integer>
  bool quadRes(Integer a, Integer b) {
    if (a <= 1) return 1;
    while (a % 4 == 0) a /= 4;
    if (a % 2 == 0) return (b % 8 == 1 || b % 8 == 7) == quadRes(a / 2, b);
    return ((a - 1) % 4 == 0 || (b - 1) % 4 == 0) == quadRes(b % a, a);
  }
  int sqrt(int x, int p = mod) {
    if (p == 2 || x <= 1) return x;
    int w, v, k = (p + 1) / 2;
    do w = rng() % p;
    while (quadRes(v = ((ll)w * w - x + p) % p, p));
    P2_Field res(1, 0), a(w, 1);
    for (; k; k >>= 1) {
      if (k & 1)
        res = P2_Field(
        ((ll)res.first * a.first + (ll)res.second * a.second % p * v) % p,
        ((ll)res.first * a.second + (ll)res.second * a.first) % p
        );
      a = P2_Field(((ll)a.first * a.first + (ll)a.second * a.second % p * v) % p, 2LL * a.first * a.second % p);
    }
    return min(res.first, p - res.first);
  }
} nt;

namespace Simple {
  int n = 1;
  vector<int> fac({1, 1}), ifac({1, 1}), inv({0, 1});
  void build(int m) {
    n = m;
    fac.resize(n + 1), ifac.resize(n + 1), inv.resize(n + 1);
    inv[1] = 1;
    for (int i = 2; i <= n; ++i) inv[i] = (ll)(mod - mod / i) * inv[mod % i] % mod;
    fac[0] = ifac[0] = 1;
    for (int i = 1; i <= n; ++i) fac[i] = (ll)fac[i - 1] * i % mod, ifac[i] = (ll)ifac[i - 1] * inv[i] % mod;
  }
  void check(int k) {
    int m = n;
    if (k > m) {
      while (k > m) m <<= 1;
      build(m);
    }
  }
  int gfac(int k) {
    check(k);
    return fac[k];
  }
  int gifac(int k) {
    check(k);
    return ifac[k];
  }
  int ginv(int k) {
    check(k);
    return inv[k];
  }
}

struct SimpleSequence {
  function<int(int)> func;
  SimpleSequence(const function<int(int)> &func): func(func) {}
  int operator[](int k) const { return func(k); }
} gfac(Simple::gfac), gifac(Simple::gifac), ginv(Simple::ginv);

int binom(int n, int m) {
  if (m > n || m < 0) return 0;
  return (ll)gfac[n] * gifac[m] % mod * gifac[n - m] % mod;
}

namespace NTT {
  int L = -1;
  vector<int> root;
  void init(int l) {
    L = l;
    root.resize((1 << L) + 1);
    int n = 1 << L, *w = root.data();
    w[0] = 1, w[1 << L] = mpow(Root, 1 << (K - 2 - L));
    for (int i = L; i; --i) w[1 << (i - 1)] = (ll)w[1 << i] * w[1 << i] % mod;
    for (int i = 1; i < n; ++i) w[i] = (ll)w[i & (i - 1)] * w[i & -i] % mod;
  }
  void DIF(int *a, int l) {
    int n = 1 << l;
    for (int len = n >> 1; len; len >>= 1)
      for (int *j = a, *o = root.data(); j != a + n; j += len << 1, ++o)
        for (int *k = j; k != j + len; ++k) {
          int r = (ll)*o * k[len] % mod;
          k[len] = reduce(*k - r), add(*k, r);
        }
  }
  void DIT(int *a, int l) {
    int n = 1 << l;
    for (int len = 1; len < n; len <<= 1)
      for (int *j = a, *o = root.data(); j != a + n; j += len << 1, ++o)
        for (int *k = j; k != j + len; ++k) {
          int r = norm(*k + k[len]);
          k[len] = (ll)*o * (*k - k[len] + mod) % mod, *k = r;
        }
  }
  void fft(int *a, int lgn, int d = 1) {
    if (L < lgn) init(lgn);
    int n = 1 << lgn;
    if (d == 1) DIF(a, lgn);
    else {
      DIT(a, lgn), reverse(a + 1, a + n);
      int nInv = mod - (mod - 1) / n;
      for (int i = 0; i < n; ++i) a[i] = (ll)a[i] * nInv % mod;
    }
  }
}

struct poly {
  vector<int> a;
  poly(ll v = 0): a(1) {
    if ((v %= mod) < 0) v += mod;
    a[0] = v;
  }
  poly(const poly &o): a(o.a) {}
  poly(const vector<int> &o): a(o) {}
  poly(initializer_list<int> o): a(o) {}
  operator vector<int>() const { return a; }
  int operator[](int k) const { return k < a.size() ? a[k] : 0; }
  int &operator[](int k) {
    if (k >= a.size()) a.resize(k + 1);
    return a[k];
  }
  int deg() const { return (int)a.size() - 1; }
  void redeg(int d) { a.resize(d + 1); }
  int size() const { return a.size(); }
  void resize(int s) { a.resize(s); }
  bool empty() const { return a.empty(); }
  int back() const { return a.back(); }
  poly slice(int d) const {
    if (d < a.size()) return vector<int>(a.begin(), a.begin() + d + 1);
    vector<int> ret = a;
    ret.resize(d + 1);
    return ret;
  }
  poly shift(int k) const {
    if (size() + k <= 0) return 0;
    vector<int> ret(size() + k);
    for (int i = max(0, k); i < ret.size(); ++i) ret[i] = a[i - k];
    return ret;
  }
  int eval(int x) const {
    int ret = 0;
    for (int i = deg(); i >= 0; --i)
      ret = ((ll)ret * x + a[i]) % mod;
    return ret;
  }
  bool operator<(const poly &o) const {
    for (int i = 0; i <= min(deg(), o.deg()); ++i)
      if (a[i] != o[i])
        return a[i] < o[i];
    return deg() < o.deg();
  }
  bool operator==(const poly &o) const {
    if (deg() != o.deg()) return 0;
    for (int i = 0; i <= deg(); ++i)
      if (a[i] != o[i])
        return 0;
    return 1;
  }
  int *base() { return a.data(); }
  const int *base() const { return a.data(); }
  poly println(FILE *fp = stdout) const {
    for (int i = 0; i < a.size(); ++i) fprintf(fp, "%d%c", a[i], " \n"[i == a.size() - 1]);
    return *this;
  }

  poly &operator+=(const poly &o) {
    if (o.size() > a.size()) a.resize(o.size());
    for (int i = 0; i < o.size(); ++i) add(a[i], o[i]);
    return *this;
  }
  poly operator+(const poly &o) const {
    poly ret(a);
    ret += o;
    return ret;
  }
  poly operator-() const {
    poly ret = a;
    for (int i = 0; i < a.size(); ++i) ret[i] = neg(ret[i]);
    return ret;
  }
  poly &operator-=(const poly &o) { return operator+=(-o); }
  poly operator-(const poly &o) const { return operator+(-o); }
  poly operator*(const poly &) const;
  poly operator/(const poly &) const;
  poly operator%(const poly &) const;
  poly &operator*=(const poly &o) {
    *this = operator*(o);
    return *this;
  }
  poly &operator/=(const poly &o) {
    *this = operator/(o);
    return *this;
  }
  poly &operator%=(const poly &o) {
    *this = operator%(o);
    return *this;
  }
  poly deriv() const;
  poly integ() const;
  poly inv() const;
  poly sqrt() const;
  poly ln() const;
  poly exp() const;
  poly srcExp() const;
  pair<poly, poly> sqrti() const;
  pair<poly, poly> expi() const;
  poly quo(const poly &) const;
  pair<poly, poly> iquo() const;
  pair<poly, poly> div(const poly &) const;
  poly taylor(int k) const;
  poly pow(int k) const;
  poly exp(int k) const;
  poly comp(const poly &) const;
  poly compInv() const;
  poly compCompInv(const poly &) const;
};

void trim(vector<int> &a) { for (; !a.empty() && !a.back(); a.pop_back()); }
void trim(poly &a) { trim(a.a); }

poly zeroes(int d) { return vector<int>(d + 1); }

namespace NTT {
  void fft(poly &a, int lgn, int d = 1) { fft(a.base(), lgn, d); }
  void fft(vector<int> &a, int lgn, int d = 1) { fft(a.data(), lgn, d); }
}

using NTT::fft;

struct SemiRelaxedConvolution {
  static const int LG = 19;
  static const int B = 16;
  void run(int *f, const int *g, int n, const function<void(int)> &relax) {
    vector<vector<int>> save(LG + 1);
    function<void(int, int)> divideConquer = [&](int l, int r) {
      if (r - l <= BRUTE_N2_LIMIT) {
        for (int i = l + 1; i <= r; ++i) {
          for (int j = l; j < i; ++j) fam(f[i], f[j], g[i - j]);
          relax(i);
        }
        return;
      }
      int d = (r - l) / B + 1, lgd = 0, lg;
      while ((1 << lgd) <= d * 2) ++lgd;
      d = (1 << (lgd - 1)) - 1, lg = (r - l + d - 1) / d;

      vector<int> dft = save[lgd];
      dft.resize(lg << (lgd + 1));
      for (int i = lg << lgd; i < (lg << (lgd + 1)); ++i) dft[i] = 0;
      int ef = lg;
      for (int i = 0; i < lg; ++i) {
        if ((i << lgd) < save[lgd].size()) continue;
        if ((i + 2) * d >= l) --ef;
        for (int j = 0; j <= min(d * 2, n - i * d); ++j) dft[(i << lgd) + j] = g[i * d + j];
        fft(dft.data() + (i << lgd), lgd);
      }
      if (save[lgd].size() < (ef << lgd)) save[lgd] = vector<int>(dft.begin(), dft.begin() + (ef << lgd));
      for (int i = 0; i < lg; ++i) {
        if (i) fft(dft.data() + ((lg + i) << lgd), lgd, -1);
        for (int j = 1; j <= min(d, r - l - i * d); ++j) add(f[l + i * d + j], dft[((lg + i) << lgd) + d + j]);
        divideConquer(l + i * d, min(l + (i + 1) * d, r));
        if (i + 1 < lg) {
          for (int j = 0; j < d; ++j) dft[((lg + i) << lgd) + j] = f[l + i * d + j];
          for (int j = d; j < (1 << lgd); ++j) dft[((lg + i) << lgd) + j] = 0;
          fft(dft.data() + ((lg + i) << lgd), lgd);
        }
        for (int j = i + 1; j < lg; ++j)
          for (int k = 0; k < (1 << lgd); ++k)
            fam(dft[((lg + j) << lgd) + k], dft[((j - i - 1) << lgd) + k], dft[((lg + i) << lgd) + k]);
      }
    };
    relax(0), divideConquer(0, n);
  }
} src;

poly poly::srcExp() const {
  int n = deg();
  poly f, g;
  f.redeg(n), g.redeg(n);
  for (int i = 0; i <= n; ++i) g[i] = (ll)a[i] * i % mod;
  src.run(f.base(), g.base(), n, [&](int i) {
    if (i == 0) f[i] = 1;
    else f[i] = (ll)f[i] * ginv[i] % mod;
  });
  return f;
}

struct Newton {
  void inv(const poly &f, const poly &dftf, poly &g, const poly &dftg, int t) {
    int n = 1 << t;
    poly prod = dftf;
    for (int i = 0; i < (n << 1); ++i) prod[i] = (ll)prod[i] * dftg[i] % mod;
    fft(prod, t + 1, -1);
    memset(prod.base(), 0, sizeof(int) << t);
    fft(prod, t + 1);
    for (int i = 0; i < (n << 1); ++i) prod[i] = (ll)prod[i] * dftg[i] % mod;
    fft(prod, t + 1, -1);
    for (int i = 0; i < n; ++i) prod[i] = 0;
    g -= prod;
  }
  void inv(const poly &f, const poly &dftf, poly &g, int t) {
    poly dftg(g);
    dftg.resize(2 << t);
    fft(dftg, t + 1);
    inv(f, dftf, g, dftg, t);
  }
  void inv(const poly &f, poly &g, int t) {
    poly dftf(f), dftg(g);
    dftf.resize(2 << t), dftg.resize(2 << t);
    fft(dftf, t + 1), fft(dftg, t + 1);
    inv(f, dftf, g, dftg, t);
  }
  void sqrt(const poly &f, poly &g, poly &dftg, poly &h, int t) {
    for (int i = 0; i < (1 << t); ++i) dftg[i] = (ll)dftg[i] * dftg[i] % mod;
    fft(dftg, t, -1);
    dftg -= f;
    for (int i = 0; i < (1 << t); ++i) add(dftg[i | (1 << t)], dftg[i]), dftg[i] = 0;
    fft(dftg, t + 1);
    poly tmp = h;
    tmp.resize(2 << t);
    fft(tmp, t + 1);
    for (int i = 0; i < (2 << t); ++i) tmp[i] = (ll)tmp[i] * dftg[i] % mod;
    fft(tmp, t + 1, -1);
    memset(tmp.base(), 0, sizeof(int) << t);
    g -= tmp * ginv[2];
  }
  void exp(const poly &f, poly &g, poly &dftg, poly &h, int t) {
    poly dfth = h;
    fft(dfth, t);
    poly dg = g.deriv().slice((1 << t) - 1);
    fft(dg, t);
    poly tmp = zeroes((1 << t) - 1);
    for (int i = 0; i < (1 << t); ++i) tmp[i] = (ll)dftg[i] * dfth[i] % mod, dg[i] = (ll)dg[i] * dfth[i] % mod;
    fft(tmp, t, -1), fft(dg, t, -1);
    sub(tmp[0], 1);
    dg.resize(2 << t);
    poly df0 = f.deriv().slice((1 << t) - 1);
    df0[(1 << t) - 1] = 0;
    for (int i = 0; i < (1 << t); ++i) dg[i | (1 << t)] = reduce(dg[i] - df0[i]);
    memcpy(dg.base(), df0.base(), sizeof(int) * ((1 << t) - 1));
    tmp.resize(2 << t), fft(tmp, t + 1);
    df0.resize(2 << t), fft(df0, t + 1);
    for (int i = 0; i < (2 << t); ++i) df0[i] = (ll)df0[i] * tmp[i] % mod;
    fft(df0, t + 1, -1);
    for (int i = 0; i < (1 << t); ++i) df0[i | (1 << t)] = df0[i], df0[i] = 0;
    dg = (dg - df0).integ().slice((2 << t) - 1) - f;
    fft(dg, t + 1);
    for (int i = 0; i < (2 << t); ++i) tmp[i] = (ll)dg[i] * dftg[i] % mod;
    fft(tmp, t + 1, -1);
    g.resize(2 << t);
    for (int i = 1 << t; i < (2 << t); ++i) g[i] = neg(tmp[i]);
  }
} nit;

struct Eval {
  vector<int> ls, rs, pos;
  virtual void _init(int n) {}
  void _build(int n) {
    ls.assign(n * 2 - 1, 0), rs.assign(n * 2 - 1, 0), pos.resize(n), _init(n);
    int cnt = 0;
    function<int(int, int)> dfs = [&](int l, int r) {
      if (l == r) { pos[l] = cnt; return cnt++; }
      int ret = cnt++;
      int mid = (l + r) >> 1;
      ls[ret] = dfs(l, mid), rs[ret] = dfs(mid + 1, r);
      return ret;
    };
    dfs(0, n - 1);
  }
};

struct Transposition : Eval {
  vector<int> _mul(int l, vector<int> res, const poly &b) {
    vector<int> tmp(1 << l);
    memcpy(tmp.data(), b.base(), sizeof(int) * (b.deg() + 1));
    reverse(tmp.begin() + 1, tmp.end());
    fft(tmp.data(), l);
    for (int i = 0; i < (1 << l); ++i) res[i] = (ll)res[i] * tmp[i] % mod;
    fft(res.data(), l, -1);
    return res;
  }
  poly bfMul(const poly &a, const poly &b) {
    int n = a.deg(), m = b.deg();
    poly ret = zeroes(n - m);
    for (int i = 0; i <= n - m; ++i)
      for (int j = 0; j <= m; ++j) ret[i] = (ret[i] + (ll)a[i + j] * b[j]) % mod;
    return ret;
  }
  poly mul(const poly &a, const poly &b) {
    if (a.deg() < b.deg()) return 0;
    if (a.deg() <= BRUTE_N2_LIMIT) return bfMul(a, b);
    int l = 0;
    while ((1 << l) <= a.deg()) ++l;
    vector<int> res(1 << l);
    memcpy(res.data(), a.base(), sizeof(int) * (a.deg() + 1));
    fft(res.data(), l);
    res = _mul(l, res, b);
    res.resize(a.deg() - b.deg() + 1);
    return res;
  }
  pair<poly, poly> mul2(const poly &a, const poly &b1, const poly &b2) {
    if (a.deg() <= BRUTE_N2_LIMIT) return make_pair(bfMul(a, b1), bfMul(a, b2));
    int l = 0;
    while ((1 << l) <= a.deg()) ++l;
    vector<int> fa(1 << l);
    memcpy(fa.data(), a.base(), sizeof(int) * (a.deg() + 1));
    fft(fa.data(), l);
    vector<int> res1 = _mul(l, fa, b1), res2 = _mul(l, fa, b2);
    res1.resize(a.deg() - b1.deg() + 1), res2.resize(a.deg() - b2.deg() + 1);
    return {res1, res2};
  }
  vector<poly> p, q;
  void _init(int n) { p.assign(n * 2 - 1, 0), q.assign(n * 2 - 1, 0); }
  vector<int> _eval(vector<int> f, const vector<int> &x) {
    int n = f.size();
    _build(n);
    for (int i = 0; i < n; ++i) q[pos[i]] = {1, neg(x[i])};
    for (int i = n * 2 - 2; i >= 0; --i)
      if (ls[i]) q[i] = q[ls[i]] * q[rs[i]];
    f.resize(n * 2);
    p[0] = mul(f, q[0].inv());
    for (int i = 0; i < n * 2 - 1; ++i)
      if (ls[i]) tie(p[ls[i]], p[rs[i]]) = mul2(p[i], q[rs[i]], q[ls[i]]);
    vector<int> ret(n);
    for (int i = 0; i < n; ++i) ret[i] = p[pos[i]][0];
    return ret;
  }
  vector<int> eval(const poly &f, const vector<int> &x) {
    int n = f.deg() + 1, m = x.size();
    vector<int> tmpf = f.a, tmpx = x;
    tmpf.resize(max(n, m)), tmpx.resize(max(n, m));
    vector<int> ret = _eval(tmpf, tmpx);
    ret.resize(m);
    return ret;
  }
  poly inter(const vector<int> &x, const vector<int> &y) {
    int n = x.size();
    _build(n);
    for (int i = 0; i < n; ++i) q[pos[i]] = {1, norm(mod - x[i])};
    for (int i = n * 2 - 2; i >= 0; --i)
      if (ls[i]) q[i] = q[ls[i]] * q[rs[i]];
    poly tmp = q[0];
    reverse(tmp.a.begin(), tmp.a.end());
    vector<int> f = tmp.deriv().a;
    f.resize(n * 2);
    p[0] = mul(f, q[0].inv());
    for (int i = 0; i < n * 2 - 1; ++i)
      if (ls[i]) tie(p[ls[i]], p[rs[i]]) = mul2(p[i], q[rs[i]], q[ls[i]]);
    for (int i = 0; i < n; ++i) p[pos[i]] = nt.inv(p[pos[i]][0]) * (ll)y[i] % mod;
    for (int i = 0; i < n * 2 - 1; ++i) reverse(q[i].a.begin(), q[i].a.end());
    for (int i = n * 2 - 2; i >= 0; --i)
      if (ls[i]) p[i] = p[ls[i]] * q[rs[i]] + p[rs[i]] * q[ls[i]];
    return p[0];
  }
  vector<poly> mul(const vector<poly> &a, const vector<poly> &b);
} tp;

struct FactorialPolynomials : Eval {
  pair<poly, poly> mul2(const poly &a, const poly &b1, const poly &b2) {
    int n = a.deg() + max(b1.deg(), b2.deg());
    if (n <= BRUTE_N2_LIMIT) return {a * b1, a * b2};
    int l = 0;
    while ((1 << l) <= n) ++l;
    vector<int> fa(1 << l);
    memcpy(fa.data(), a.base(), sizeof(int) * (a.deg() + 1));
    fft(fa.data(), l);
    vector<int> res1(1 << l), res2(1 << l);
    memcpy(res1.data(), b1.base(), sizeof(int) * (b1.deg() + 1)), memcpy(res2.data(), b2.base(), sizeof(int) * (b2.deg() + 1));
    fft(res1.data(), l), fft(res2.data(), l);
    for (int i = 0; i < (1 << l); ++i) res1[i] = (ll)res1[i] * fa[i] % mod, res2[i] = (ll)res2[i] * fa[i] % mod;
    fft(res1.data(), l, -1), fft(res2.data(), l, -1);
    res1.resize(a.deg() + b1.deg() + 1), res2.resize(a.deg() + b2.deg() + 1);
    return {res1, res2};
  }
  vector<poly> p, q;
  void _init(int n) { p.assign(n * 2 - 1, 0), q.assign(n * 2 - 1, 0); }
  vector<int> toPoly(vector<int> f) {
    int n = f.size();
    _build(n);
    for (int i = 0; i < n; ++i) p[pos[i]] = f[i], q[pos[i]] = {neg(i), 1};
    for (int i = n * 2 - 2; i >= 0; --i)
      if (ls[i]) tie(p[i], q[i]) = mul2(q[ls[i]], p[rs[i]], q[rs[i]]), p[i] += p[ls[i]];
    return p[0];
  }
  vector<int> lagrange(int x, vector<int> y) {
    int m = y.size(), n = m * 2 - 1;
    vector<int> tinv(n);
    if (x == m)
      for (int i = 0; i < n; ++i) tinv[i] = ginv[x - (m - 1 - i)];
    else {
      vector<int> tfac(n), tifac(n);
      tfac[0] = reduce(x - (m - 1));
      for (int i = 1; i < n; ++i) tfac[i] = (ll)tfac[i - 1] * (x - (m - 1 - i) + mod) % mod;
      tifac[n - 1] = nt.inv(tfac[n - 1]);
      for (int i = n - 1; i; --i) tifac[i - 1] = (ll)tifac[i] * (x - (m - 1 - i) + mod) % mod;
      tinv[0] = tifac[0];
      for (int i = 1; i < n; ++i) tinv[i] = (ll)tifac[i] * tfac[i - 1] % mod;
    }
    for (int i = 0; i < m; ++i)
      y[i] = (ll)((m - i) & 1 ? y[i] : mod - y[i]) * gifac[i] % mod * gifac[m - 1 - i] % mod;
    vector<int> ret(m);
    if (n <= BRUTE_N2_LIMIT) {
      for (int i = 0; i < m; ++i)
        for (int j = 0; j <= m - 1 + i && j < m; ++j)
          fam(ret[i], y[j], tinv[m - 1 + i - j]);
    } else {
      int l = 0;
      while ((1 << l) < n) ++l;
      auto tmp = tinv;
      y.resize(1 << l), tmp.resize(1 << l);
      fft(y.data(), l), fft(tmp.data(), l);
      for (int i = 0; i < (1 << l); ++i) tmp[i] = (ll)tmp[i] * y[i] % mod;
      fft(tmp.data(), l, -1);
      memcpy(ret.data(), tmp.data() + m - 1, sizeof(int) * m);
    }
    int fac = 1;
    for (int i = 0; i < m; ++i) fac = (ll)fac * (x - i) % mod;
    for (int i = 0; i < m; ++i)
      ret[i] = (ll)ret[i] * fac % mod,
      fac = (ll)fac * (x + i + 1) % mod * tinv[i] % mod;
    return ret;
  }
  pair<vector<int>, vector<int>> lagrange2(int x, vector<int> y1, vector<int> y2) {
    assert(y1.size() == y2.size());
    int m = y1.size(), n = m * 2 - 1;
    vector<int> tinv(n);
    if (x == m)
      for (int i = 0; i < n; ++i) tinv[i] = ginv[x - (m - 1 - i)];
    else {
      vector<int> tfac(n), tifac(n);
      tfac[0] = reduce(x - (m - 1));
      for (int i = 1; i < n; ++i) tfac[i] = (ll)tfac[i - 1] * (x - (m - 1 - i) + mod) % mod;
      tifac[n - 1] = nt.inv(tfac[n - 1]);
      for (int i = n - 1; i; --i) tifac[i - 1] = (ll)tifac[i] * (x - (m - 1 - i) + mod) % mod;
      tinv[0] = tifac[0];
      for (int i = 1; i < n; ++i) tinv[i] = (ll)tifac[i] * tfac[i - 1] % mod;
    }
    for (int i = 0; i < m; ++i)
      y1[i] = (ll)((m - i) & 1 ? y1[i] : mod - y1[i]) * gifac[i] % mod * gifac[m - 1 - i] % mod,
      y2[i] = (ll)((m - i) & 1 ? y2[i] : mod - y2[i]) * gifac[i] % mod * gifac[m - 1 - i] % mod;
    vector<int> ret1(m), ret2(m);
    if (n <= BRUTE_N2_LIMIT) {
      for (int i = 0; i < m; ++i)
        for (int j = 0; j <= m - 1 + i && j < m; ++j)
          fam(ret1[i], y1[j], tinv[m - 1 + i - j]),
          fam(ret2[i], y2[j], tinv[m - 1 + i - j]);
    } else {
      int l = 0;
      while ((1 << l) < n) ++l;
      auto tmp = tinv;
      y1.resize(1 << l), y2.resize(1 << l), tmp.resize(1 << l);
      fft(y1.data(), l), fft(y2.data(), l), fft(tmp.data(), l);
      for (int i = 0; i < (1 << l); ++i)
        y1[i] = (ll)tmp[i] * y1[i] % mod, y2[i] = (ll)tmp[i] * y2[i] % mod;
      fft(y1.data(), l, -1), fft(y2.data(), l, -1);
      memcpy(ret1.data(), y1.data() + m - 1, sizeof(int) * m),
      memcpy(ret2.data(), y2.data() + m - 1, sizeof(int) * m);
    }
    int fac = 1;
    for (int i = 0; i < m; ++i) fac = (ll)fac * (x - i) % mod;
    for (int i = 0; i < m; ++i)
      ret1[i] = (ll)ret1[i] * fac % mod, ret2[i] = (ll)ret2[i] * fac % mod,
      fac = (ll)fac * (x + i + 1) % mod * tinv[i] % mod;
    return {ret1, ret2};
  }
  vector<int> lagrangeDoubling(vector<int> f) {
    auto tmp = lagrange(f.size(), f);
    f.insert(f.end(), tmp.begin(), tmp.end());
    return f;
  }
  pair<vector<int>, vector<int>> lagrangeDoubling2(vector<int> f1, vector<int> f2) {
    vector<int> tmp1, tmp2;
    tie(tmp1, tmp2) = lagrange2(f1.size(), f1, f2);
    f1.insert(f1.end(), tmp1.begin(), tmp1.end()),
    f2.insert(f2.end(), tmp2.begin(), tmp2.end());
    return {f1, f2};
  }
  vector<int> lagrangeExtending(vector<int> f) {
    int n = f.size();
    int res = 0;
    for (int i = 0; i < n; ++i)
      res = (res + (ll)((n - i) & 1 ? f[i] : mod - f[i]) * gifac[i] % mod * gifac[n - 1 - i] % mod * ginv[n - i]) % mod;
    for (int i = 1; i <= n; ++i) res = (ll)res * i % mod;
    f.push_back(res);
    return f;
  }
  vector<int> evalFacPoly(vector<int> f) {
    int n = f.size();
    poly g = zeroes(n - 1);
    for (int i = 0; i < n; ++i) g[i] = gifac[i];
    g = (g * f).slice(n - 1);
    for (int i = 0; i < n; ++i) g[i] = (ll)g[i] * gfac[i] % mod;
    return g;
  }
  vector<int> evalPoly(vector<int> f) {
    int n = f.size();
    _build(n);
    for (int i = 0; i < n; ++i) p[pos[i]] = f[i];
    for (int i = n * 2 - 2; i >= 0; --i)
      if (ls[i]) {
        vector<int> l = p[ls[i]], r = p[rs[i]];
        if (l.size() > r.size())
          tie(l, r) = lagrangeDoubling2(l, lagrangeExtending(r)),
          l.pop_back(), r.pop_back();
        else tie(l, r) = lagrangeDoubling2(l, r);
        p[i].resize(p[ls[i]].size() + p[rs[i]].size());
        for (int j = 0; j < p[i].size(); ++j)
          p[i][j] = (l[j] + (ll)r[j] * mpow(j, p[ls[i]].size())) % mod;
      }
    return p[0];
  }
  vector<int> interFacPoly(vector<int> y) {
    int n = y.size();
    poly g = zeroes(n - 1);
    for (int i = 0; i < n; ++i)
      g[i] = i & 1 ? mod - gifac[i] : gifac[i], y[i] = (ll)y[i] * gifac[i] % mod;
    return (g * y).slice(n - 1);
  }
  vector<int> interPoly(vector<int> y) { return toPoly(interFacPoly(y)); }
  vector<int> toFacPoly(vector<int> f) { return interFacPoly(evalPoly(f)); }
  vector<int> mul(vector<int> a, vector<int> b) {
    int n = a.size() + b.size() - 1;
    a.resize(n), b.resize(n);
    a = evalFacPoly(a), b = evalFacPoly(b);
    for (int i = 0; i < n; ++i) a[i] = (ll)a[i] * b[i] % mod;
    return interFacPoly(a);
  }
} fp;

poly poly::operator*(const poly &o) const {
  int n = deg(), m = o.deg();
  if (n == -1 || m == -1) return vector<int>();
  if (n <= 10 || m <= 10 || n + m <= BRUTE_N2_LIMIT) {
    poly ret = zeroes(n + m);
    for (int i = 0; i <= n; ++i)
      for (int j = 0; j <= m; ++j) fam(ret[i + j], a[i], o[j]);
    return ret;
  }
  n += m;
  int l = 0;
  while ((1 << l) <= n) ++l;
  poly ret = a, tmp = o;
  ret.resize(1 << l), tmp.resize(1 << l);
  fft(ret, l), fft(tmp, l);
  for (int i = 0; i < (1 << l); ++i) ret[i] = (ll)ret[i] * tmp[i] % mod;
  fft(ret, l, -1);
  return ret.slice(n);
}
poly poly::inv() const {
  poly g = nt.inv(a[0]);
  for (int t = 0; (1 << t) <= deg(); ++t) nit.inv(slice((2 << t) - 1), g, t);
  return g.slice(deg());
}
poly poly::taylor(int k) const {
  int n = deg();
  poly f = zeroes(n), g = zeroes(n);
  for (int i = 0, pw = 1; i <= n; ++i)
    f[n - i] = (ll)a[i] * gfac[i] % mod, g[i] = (ll)pw * gifac[i] % mod, pw = (ll)pw * k % mod;
  f *= g;
  for (int i = 0; i <= n; ++i) g[i] = (ll)f[n - i] * gifac[i] % mod;
  return g;
}
poly poly::pow(int k) const {
  if (k == 0) return 1;
  if (k == 1) return *this;
  int n = deg() * k;
  int l = 0;
  while ((1 << l) <= n) ++l;
  poly ret = a;
  ret.resize(1 << l), fft(ret, l);
  for (int i = 0; i < (1 << l); ++i) ret[i] = mpow(ret[i], k);
  fft(ret, -1);
  return ret.slice(n);
}
poly poly::deriv() const {
  if (deg() == 0) return 0;
  vector<int> ret(deg());
  for (int i = 0; i < deg(); ++i) ret[i] = (ll)a[i + 1] * (i + 1) % mod;
  return ret;
}
poly poly::integ() const {
  vector<int> ret(size() + 1);
  for (int i = 0; i <= deg(); ++i) ret[i + 1] = (ll)a[i] * ginv[i + 1] % mod;
  return ret;
}
poly poly::quo(const poly &o) const {
  if (o.deg() == 0) return (ll)a[0] * nt.inv(o[0]) % mod;
  poly g = nt.inv(o[0]);
  int t = 0, n;
  for (n = 1; (n << 1) <= o.deg(); ++t, n <<= 1) nit.inv(o.slice((n << 1) - 1), g, t);
  poly dftg = g;
  dftg.resize(n << 1), fft(dftg, t + 1);
  poly eps1 = o;
  eps1.resize(n << 1), fft(eps1, t + 1);
  for (int i = 0; i < (n << 1); ++i) eps1[i] = (ll)eps1[i] * dftg[i] % mod;
  fft(eps1, t + 1, -1);
  for (int i = 0; i < n; ++i) eps1[i] = eps1[i + n], eps1[i + n] = 0;
  fft(eps1, t + 1);
  poly h0 = slice(n - 1);
  h0.resize(n << 1), fft(h0, t + 1);
  poly h0g0 = zeroes((n << 1) - 1);
  for (int i = 0; i < (n << 1); ++i) h0g0[i] = (ll)h0[i] * dftg[i] % mod;
  fft(h0g0, t + 1, -1);
  poly h0eps1 = zeroes((n << 1) - 1);
  for (int i = 0; i < (n << 1); ++i) h0eps1[i] = (ll)h0[i] * eps1[i] % mod;
  fft(h0eps1, t + 1, -1);
  for (int i = 0; i < n; ++i) h0eps1[i] = reduce(operator[](i + n) - h0eps1[i]);
  memset(h0eps1.base() + n, 0, sizeof(int) << t);
  fft(h0eps1, t + 1);
  for (int i = 0; i < (n << 1); ++i) h0eps1[i] = (ll)h0eps1[i] * dftg[i] % mod;
  fft(h0eps1, t + 1, -1);
  for (int i = 0; i < n; ++i) h0eps1[i + n] = h0eps1[i], h0eps1[i] = 0;
  return (h0g0 + h0eps1).slice(o.deg());
}
poly poly::ln() const {
  if (deg() == 0) return 0;
  return deriv().quo(slice(deg() - 1)).integ();
}
pair<poly, poly> poly::sqrti() const {
  poly g = nt.sqrt(a[0]), h = nt.inv(a[0]), dftg = g;
  for (int t = 0; (1 << t) <= deg(); ++t) {
    nit.sqrt(slice((2 << t) - 1), g, dftg, h, t);
    dftg = g, fft(dftg, t + 1);
    nit.inv(g, dftg, h, t);
  }
  return make_pair(g.slice(deg()), h.slice(deg()));
}
poly poly::sqrt() const {
  poly g = nt.sqrt(a[0]), h = nt.inv(a[0]), dftg = g;
  for (int t = 0; (1 << t) <= deg(); ++t) {
    nit.sqrt(slice((2 << t) - 1), g, dftg, h, t);
    if ((2 << t) <= deg()) {
      dftg = g, fft(dftg, t + 1);
      nit.inv(g, dftg, h, t);
    }
  }
  return g.slice(deg());
}
pair<poly, poly> poly::expi() const {
  poly g = 1, h = 1, dftg = {1, 1};
  for (int t = 0; (1 << t) <= deg(); ++t) {
    nit.exp(slice((2 << t) - 1), g, dftg, h, t);
    dftg = g, dftg.resize(4 << t);
    fft(dftg, t + 2);
    poly f2n = dftg.slice((2 << t) - 1);
    nit.inv(g, f2n, h, t);
  }
  return make_pair(g.slice(deg()), h.slice(deg()));
}
poly poly::exp() const {
  poly g = 1, h = 1, dftg = {1, 1};
  for (int t = 0; (1 << t) <= deg(); ++t) {
    nit.exp(slice((2 << t) - 1), g, dftg, h, t);
    if ((2 << t) <= deg()) {
      dftg = g, dftg.resize(4 << t);
      fft(dftg, t + 2);
      poly f2n = dftg.slice((2 << t) - 1);
      nit.inv(g, f2n, h, t);
    }
  }
  return g.slice(deg());
}
poly poly::exp(int k) const {
  int lead, lz = 0;
  while (lz < deg() && !a[lz]) ++lz;
  if (lz == deg() && !a[lz]) return *this;
  lead = a[lz];
  if ((ll)lz * k > deg()) return zeroes(deg());
  poly part = poly(vector<int>(a.begin() + lz, a.begin() + deg() - lz * (k - 1) + 1)) * nt.inv(lead);
  part = (part.ln() * k).exp() * mpow(lead, k);
  vector<int> ret(deg() + 1);
  memcpy(ret.data() + lz * k, part.base(), sizeof(int) * (deg() - lz * k + 1));
  return ret;
}
poly poly::operator/(const poly &o) const {
  int n = deg(), m = o.deg();
  if (n < m) return 0;
  poly ta(vector<int>(a.rbegin(), a.rend())), tb(vector<int>(o.a.rbegin(), o.a.rend()));
  ta.redeg(n - m), tb.redeg(n - m);
  poly q = ta.quo(tb);
  return vector<int>(q.a.rbegin(), q.a.rend());
}
pair<poly, poly> poly::div(const poly &o) const {
  if (deg() < o.deg()) return make_pair(0, *this);
  int n = deg(), m = o.deg();
  poly q = operator/(o), r;
  int l = 0;
  while ((1 << l) < o.deg()) ++l;
  int t = (1 << l) - 1;
  poly tmp = zeroes(t);
  r = zeroes(t);
  for (int i = 0; i <= m; ++i) add(r[i & t], o[i]);
  for (int i = 0; i <= n - m; ++i) add(tmp[i & t], q[i]);
  fft(r, l), fft(tmp, l);
  for (int i = 0; i <= t; ++i) tmp[i] = (ll)tmp[i] * r[i] % mod;
  fft(tmp, l, -1);
  memset(r.base(), 0, sizeof(int) << l);
  for (int i = 0; i <= n; ++i) add(r[i & t], a[i]);
  for (int i = 0; i < m; ++i) sub(r[i], tmp[i]);
  return make_pair(q, r.slice(m - 1));
}
poly poly::operator%(const poly &o) const { return div(o).second; }


vector<poly> operator*(vector<poly> a, vector<poly> b) {
  int m0 = a.size() - 1, n0 = a[0].deg(), m1 = b.size() - 1, n1 = b[0].deg();
  poly A = zeroes((n0 + n1 + 1) * m0), B = zeroes((n0 + n1 + 1) * m1);
  for (int i = 0; i <= m0; ++i)
    for (int j = 0; j <= n0; ++j)
      A[i * (n0 + n1 + 1) + j] = a[i][j];
  for (int i = 0; i <= m1; ++i)
    for (int j = 0; j <= n1; ++j)
      B[i * (n0 + n1 + 1) + j] = b[i][j];
  A *= B;
  vector<poly> ret(m0 + m1 + 1, zeroes(n0 + n1));
  for (int i = 0; i <= m0 + m1; ++i)
    for (int j = 0; j <= n0 + n1; ++j)
      ret[i][j] = A[i * (n0 + n1 + 1) + j];
  return ret;
}
vector<poly> Transposition::mul(const vector<poly> &a, const vector<poly> &b) {
  int m0 = a.size() - 1, n0 = a[0].deg(), m1 = b.size() - 1, n1 = b[0].deg();
  poly A = zeroes((n0 + 1) * m0), B = zeroes((n0 + 1) * m1);
  for (int i = 0; i <= m0; ++i)
    for (int j = 0; j <= n0; ++j)
      A[i * (n0 + 1) + j] = a[i][j];
  for (int i = 0; i <= m1; ++i)
    for (int j = 0; j <= n1; ++j)
      B[i * (n0 + 1) + j] = b[i][j];
  A = tp.mul(A, B);
  vector<poly> ret(m0 - m1 + 1, zeroes(n0 - n1));
  for (int i = 0; i <= m0 - m1; ++i)
    for (int j = 0; j <= n0 - n1; ++j)
      ret[i][j] = A[i * (n0 + 1) + j];
  return ret;
}
poly poly::comp(const poly &o) const {
  if (o[0] != 0) return taylor(o[0]).comp(o - o[0]);
  function<vector<poly>(int, int, vector<poly>)> recurse = [&](int n, int k, vector<poly> denom) {
    if (!n) {
      vector<poly> ret(k, 1);
      for (int i = k - 1, j = 0; i >= 0 && j <= deg(); --i, ++j) ret[i] = a[j];
      return ret;
    }
    int l = 0;
    while ((1 << l) <= ((k << 1) + 1) * ((n + 1) << 1) - 2) ++l;
    vector<int> dt(1 << l);
    for (int i = 0; i <= k; ++i)
      for (int j = 0; j <= n; ++j)
        dt[i * ((n + 1) << 1) + j] = denom[i][j];
    fft(dt, l);
    vector<int> sq(1 << (l - 1));
    for (int i = 0; i < 1 << l; i += 2) sq[i >> 1] = (ll)dt[i] * dt[i ^ 1] % mod;
    fft(sq, l - 1, -1);
    vector<poly> sub(k << 1 | 1, zeroes(n >> 1));
    for (int i = 0; i <= k << 1; ++i)
      for (int j = 0; j <= n >> 1; ++j)
        sub[i][j] = sq[i * (n + 1) + j];
    auto res = recurse(n >> 1, k << 1, sub);
    vector<int> ing(1 << (l - 1));
    for (int i = 0; i < k << 1; ++i)
      for (int j = 0; j <= n >> 1; ++j)
        ing[i * (n + 1) + j] = res[i][j];
    sq.resize(1 << l);
    fft(ing, l - 1);
    for (int i = 0; i < 1 << l; ++i) sq[i] = (ll)ing[i >> 1] * dt[i ^ 1] % mod;
    fft(sq, l, -1);
    vector<poly> ret(k, zeroes(n));
    for (int i = 0; i < k; ++i)
      for (int j = 0; j <= n; ++j)
        ret[i][j] = sq[(i + k) * ((n + 1) << 1) + j];
    return ret;
  };
  int n = deg();
  poly ret = recurse(n, 1, {poly(1).slice(n), -o})[0];
  return ret;
}
poly recurse2d(int m, int n, int k, vector<poly> numer, vector<poly> denom) {
  if (!n) {
    poly ret = zeroes(k - 1);
    for (int i = 0; i < k; ++i) ret[i] = numer[i][0];
    return ret.slice(m);
  }
  int l = 0;
  while ((1 << l) <= ((k << 1) + 1) * ((n + 1) << 1) - 2) ++l;
  vector<int> nt(1 << l), dt(1 << l), sqn(1 << (l - 1)), sqd(1 << (l - 1));
  for (int i = 0; i < k; ++i)
    for (int j = 0; j <= n; ++j)
      nt[i * ((n + 1) << 1) + j] = numer[i][j];
  for (int i = 0; i <= k; ++i)
    for (int j = 0; j <= n; ++j)
      dt[i * ((n + 1) << 1) + j] = denom[i][j];
  fft(nt, l), fft(dt, l);
  if (n & 1) for (int i = 1; i < 1 << l; i += 2) nt[i] = neg(nt[i]);
  for (int i = 0; i < 1 << l; i += 2)
    sqd[i >> 1] = (ll)dt[i] * dt[i ^ 1] % mod,
    sqn[i >> 1] = ((ll)nt[i] * dt[i ^ 1] + (ll)nt[i ^ 1] * dt[i]) % mod * ginv[2] % mod;
  if (n & 1) for (int i = 1; i < 1 << (l - 1); ++i) sqn[i] = (ll)sqn[i] * NTT::root[i] % mod;
  fft(sqn, l - 1, -1), fft(sqd, l - 1, -1);
  vector<poly> subn(k << 1, zeroes(n >> 1)), subd(k << 1 | 1, zeroes(n >> 1));
  for (int i = 0; i < k << 1; ++i)
    for (int j = 0; j <= n >> 1; ++j)
      subn[i][j] = sqn[i * (n + 1) + j + (n & 1)];
  for (int i = 0; i <= k << 1; ++i)
    for (int j = 0; j <= n >> 1; ++j)
      subd[i][j] = sqd[i * (n + 1) + j];
  return recurse2d(m, n >> 1, k << 1, subn, subd);
};
poly poly::compInv() const {
  if (a[1] != 1) {
    int nv = mpow(a[1], mod - 2);
    poly ret = operator*(nv).compInv();
    for (int i = 1, pw = 1; i <= deg(); ++i) ret[i] = (ll)(pw = (ll)pw * nv % mod) * ret[i] % mod;
    return ret;
  }
  int n = deg();
  poly ret = recurse2d(n - 1, n - 1, 1, {(shift(-1).exp(mod - n)).slice(n - 1)}, {poly(1).slice(n), operator-()});
  reverse(ret.a.begin(), ret.a.end()), ret = ret.shift(1);
  for (int i = 1; i <= n; ++i) ret[i] = (ll)ret[i] * ginv[i] % mod;
  return ret;
}
poly poly::compCompInv(const poly &o) const {
  if (o[1] != 1) {
    int nv = mpow(o[1], mod - 2);
    poly ret = compCompInv(o * nv);
    for (int i = 1, pw = 1; i <= deg(); ++i) ret[i] = (ll)(pw = (ll)pw * nv % mod) * ret[i] % mod;
    return ret;
  }
  int n = deg();
  poly ret = recurse2d(n - 1, n - 1, 1, {(deriv() * o.shift(-1).exp(mod - n)).slice(n - 1)}, {poly(1).slice(n), -o});
  reverse(ret.a.begin(), ret.a.end()), ret = ret.shift(1) + a[0];
  for (int i = 1; i <= n; ++i) ret[i] = (ll)ret[i] * ginv[i] % mod;
  return ret;
}

using polyMat = vector<vector<poly>>;

polyMat operator*(const polyMat &lhs, const polyMat &rhs) {
  int x = lhs.size(), y = rhs.size(), z = rhs.back().size();
  int m = 0;
  for (int i = 0; i < x; ++i)
    for (int j = 0; j < z; ++j)
      for (int k = 0; k < y; ++k)
        m = max(m, lhs[i][k].deg() + rhs[k][j].deg());
  polyMat ret(x, vector<poly>(z));
  if (m <= BRUTE_N2_LIMIT) {
    for (int i = 0; i < x; ++i)
      for (int j = 0; j < z; ++j)
        for (int k = 0; k < y; ++k)
          ret[i][j] += lhs[i][k] * rhs[k][j];
    for (int i = 0; i < x; ++i)
      for (int j = 0; j < z; ++j)
        trim(ret[i][j]);
  } else {
    int l = 0;
    for (; (1 << l) <= m; ++l);
    vector<vector<vector<int>>> lDFT(x, vector<vector<int>>(y, vector<int>(1 << l))),
           rDFT(y, vector<vector<int>>(z, vector<int>(1 << l)));
    for (int i = 0; i < x; ++i)
      for (int j = 0; j < y; ++j) {
        memcpy(lDFT[i][j].begin().base(), lhs[i][j].a.begin().base(),
               sizeof(int) * lhs[i][j].a.size());
        fft(lDFT[i][j].begin().base(), l, 1);
      }
    for (int i = 0; i < y; ++i)
      for (int j = 0; j < z; ++j) {
        memcpy(rDFT[i][j].begin().base(), rhs[i][j].a.begin().base(),
               sizeof(int) * rhs[i][j].a.size());
        fft(rDFT[i][j].begin().base(), l, 1);
      }
    vector<int> res;
    for (int i = 0; i < x; ++i)
      for (int j = 0; j < z; ++j) {
        res.assign(1 << l, 0);
        for (int k = 0; k < y; ++k)
          for (int t = 0; t < (1 << l); ++t)
            res[t] = (res[t] + lDFT[i][k][t] * (ll)rDFT[k][j][t]) % mod;
        fft(res.begin().base(), l, -1);
        trim(ret[i][j] = res);
      }
  }
  return ret;
}
polyMat &operator*=(polyMat &a, const polyMat &b) { a = a * b; }

tuple<poly, poly, polyMat> halfEuclid(poly a, poly b, int l) {
  if (b.deg() < 1 << (l - 1)) return {a, b, {{1, 0}, {0, 1}}};
  polyMat ret = get<2>(halfEuclid(a.shift(-(1 << (l - 1))), b.shift(-(1 << (l - 1))), l - 1));
  auto t0 = ret * polyMat{{a}, {b}};
  a = t0[0][0], b = t0[1][0];
  if (b.deg() < 1 << (l - 1)) return {a, b, ret};
  auto [q, r] = a.div(b); trim(r);
  ret = polyMat{{0, 1}, {1, -q}} * ret, a = exchange(b, r);
  if (b.deg() < 1 << (l - 1)) return {a, b, ret};
  t0 = get<2>(halfEuclid(a.shift(-(1 << (l - 2))), b.shift(-(1 << (l - 2))), l - 1));
  auto t1 = t0 * polyMat{{a}, {b}};
  a = t1[0][0], b = t1[1][0], ret = t0 * ret;
  return {a, b, ret};
}
pair<poly, polyMat> exEuclid(poly a, poly b) {
  trim(a), trim(b);
  if (b.empty()) return {a, {{1, 0}}};
  polyMat ret = {{1, 0}, {0, 1}};
  auto [q, r] = a.div(b); trim(r);
  ret = polyMat{{0, 1}, {1, -q}} * ret, a = exchange(b, r);
  if (b.empty()) return {a, {ret[0]}};
  int l = 0;
  for (; 1 << l <= a.deg(); ++l);
  while (!b.empty()) {
    auto res = halfEuclid(a, b, l);
    a = get<0>(res), b = get<1>(res), ret = get<2>(res) * ret;
    if (b.empty()) break;
    auto [q, r] = a.div(b); trim(r);
    ret = polyMat{{0, 1}, {1, -q}} * ret, a = exchange(b, r);
    for (; l && 1 << (l - 1) > a.deg(); --l);
  }
  return {a, {ret[0]}};
}
pair<poly, bool> modInv(poly p, poly q) {
  auto [a, mat] = exEuclid(p, q);
  if (!a.empty()) {
    int nv = nt.inv(a.back());
    a *= nv, mat = polyMat{{nv, 0}, {0, nv}} * mat;
  }
  return {mat[0][0], a == 1};
}

const int img = mpow(3, (mod - 1) >> 2);

poly chebyshev(int n) {
  poly ret = zeroes(n);
  for (int k = 0; k * 2 <= n; ++k)
    ret[n - k * 2] = k & 1 ? mod - binom(n - k, k) : binom(n - k, k);
  return ret;
}

poly bostanMori(int n, poly m, poly p0, poly p1, poly q1, poly q2) {
  for (; n; n >>= 1) {
    if (n & 1) p0 = (p1 - p0 * q1) % m, p1 = p1 * q2 % m;
    else p1 = (p0 * q2 - p1 * q1) % m;
    q1 = (q2 * 2 - q1 * q1) % m,
    q2 = q2 * q2 % m;
  }
  return p0;
}

int resultant(poly a, poly b) {
  trim(a), trim(b);
  if (a.empty() || b.empty()) return 0;
  int ret = 1;
  if (a.back() != 1)
    ret = (ll)ret * mpow(a.back(), b.deg()) % mod, a *= nt.inv(a.back());
  if (b.back() != 1)
    ret = (ll)ret * mpow(b.back(), a.deg()) % mod, b *= nt.inv(b.back());
  if (!b.deg()) return ret;
  if (a.deg() & b.deg() & 1) ret = mod - ret;
  a = exchange(b, a % b), trim(b);
  if (b.empty()) return 0;
  if (b.back() != 1)
    ret = (ll)ret * mpow(b.back(), a.deg()) % mod, b *= nt.inv(b.back());
  if (!b.deg()) return ret;
  int l = 0;
  for (; 1 << l <= a.deg(); ++l);
  function<tuple<int, poly, poly, polyMat>(poly, poly, int, int)> halfEuclid =
    [&](poly a, poly b, int offset, int l) -> tuple<int, poly, poly, polyMat> {
      if (b.deg() < 1 << (l - 1)) return {1, a, b, {{1, 0}, {0, 1}}};
      auto [res, _, __, ret] = halfEuclid(a.shift(-(1 << (l - 1))), b.shift(-(1 << (l - 1))), offset + (1 << (l - 1)), l - 1);
      auto t0 = ret * polyMat{{a}, {b}};
      a = t0[0][0], b = t0[1][0];
      if (b.deg() < 1 << (l - 1)) return {res, a, b, ret};
      if (b.back() != 1) {
        res = (ll)res * mpow(b.back(), a.deg() + offset) % mod;
        int nv = nt.inv(b.back());
        b *= nv, ret = polyMat{{1, 0}, {0, nv}} * ret;
      }
      if ((a.deg() + offset) & (b.deg() + offset) & 1) res = mod - res;
      auto [q, r] = a.div(b); trim(r);
      ret = polyMat{{0, 1}, {1, -q}} * ret, a = exchange(b, r);
      if (b.deg() < 1 << (l - 1)) return {res, a, b, ret};
      if (b.back() != 1) {
        res = (ll)res * mpow(b.back(), a.deg() + offset) % mod;
        int nv = nt.inv(b.back());
        b *= nv, ret = polyMat{{1, 0}, {0, nv}} * ret;
      }
      auto tmp = halfEuclid(a.shift(-(1 << (l - 2))), b.shift(-(1 << (l - 2))), offset + (1 << (l - 2)), l - 1);
      t0 = get<3>(tmp) * polyMat{{a}, {b}},
      res = (ll)res * get<0>(tmp) % mod, a = t0[0][0], b = t0[1][0], ret = get<3>(tmp) * ret;
      return {res, a, b, ret};
    };
  while (b.deg()) {
    auto [nres, na, nb, _] = halfEuclid(a, b, 0, l);
    ret = (ll)ret * nres % mod, a = na, b = nb;
    if (b.empty()) return 0;
    if (b.back() != 1)
      ret = (ll)ret * mpow(b.back(), a.deg()) % mod, b *= nt.inv(b.back());
    if (!b.deg()) break;
    if (a.deg() & b.deg() & 1) ret = mod - ret;
    a = exchange(b, a % b), trim(b);
    if (b.back() != 1)
      ret = (ll)ret * mpow(b.back(), a.deg()) % mod,
      b *= nt.inv(b.back());
    for (; l && 1 << (l - 1) > a.deg(); --l);
  }
  return ret;
}

int bostanMori(int n, poly p, poly q) {
  trim(p), trim(q);
  if (p.deg() >= q.deg()) {
    auto [k, r] = p.div(q);
    int ret = bostanMori(n, r, q);
    if (n <= k.deg()) add(ret, k[n]);
    return ret;
  }
  int k = 0;
  for (; 1 << k < q.size() << 1; ++k);
  vector<int> root(1 << k), rev(1 << k);
  for (int i = 0; i < 1 << k; ++i)
    rev[i] = (rev[i >> 1] >> 1) | ((i & 1) << (k - 1));
  int w = mpow(Root, 1 << (K - k));
  root[0] = 1;
  for (int i = 1; i < 1 << k; ++i) root[i] = (ll)root[i - 1] * w % mod;
  p.resize(1 << k), q.resize(1 << k); vector<int> r(1 << k);
  fft(p, k - 1), fft(q, k - 1);
  for (; n; n >>= 1) {
    auto extend = [&](poly &f) {
      memcpy(f.base() + (1 << (k - 1)), f.base(), sizeof(int) << (k - 1));
      fft(f.base() + (1 << (k - 1)), k - 1, -1);
      for (int i = 1 << (k - 1); i < 1 << k; ++i) f[i] = (ll)f[i] * root[i ^ (1 << (k - 1))] % mod;
      fft(f.base() + (1 << (k - 1)), k - 1);
    };
    extend(p), extend(q);
    for (int i = 0; i < 1 << k; ++i) r[i] = q[i ^ 1];
    for (int i = 0; i < 1 << k; ++i) p[i] = (ll)p[i] * r[i] % mod, q[i] = (ll)q[i] * r[i] % mod;
    if (n & 1) for (int i = 1; i < 1 << k; ++i) p[i] = (ll)p[i] * root[(1 << k) - rev[i]] % mod;
    for (int i = 0; i < 1 << (k - 1); ++i) p[i] = quo2(norm(p[i << 1] + p[i << 1 | 1])), q[i] = q[i << 1];
  }
  int p0 = 0, q0 = 0;
  for (int i = 0; i < 1 << (k - 1); ++i) add(p0, p[i]), add(q0, q[i]);
  return (ll)p0 * mpow(q0, mod - 2) % mod;
}
int bostanMori(int n, pair<vector<int>, vector<int>> f) {
  return bostanMori(n, f.first, f.second);
}

pair<poly, poly> fracApprox(poly a, poly b, int k) {
  trim(a), trim(b);
  if (a.deg() < k) return {a, 1};
  polyMat mat = {{1, 0}, {0, 1}};
  auto [q, r] = a.div(b); trim(r);
  mat = polyMat{{0, 1}, {1, -q}} * mat, a = exchange(b, r);
  if (a.deg() < k) return {a, mat[0][0]};
  int l = 0;
  for (; 1 << l <= a.deg(); ++l);
  function<tuple<poly, poly, polyMat>(poly, poly, int, int)> halfEuclid =
    [&](poly a, poly b, int k, int l) -> tuple<poly, poly, polyMat> {
      if (b.deg() < 1 << (l - 1)) return {a, b, {{1, 0}, {0, 1}}};
      polyMat ret = get<2>(halfEuclid(a.shift(-(1 << (l - 1))), b.shift(-(1 << (l - 1))), k - (1 << (l - 1)), l - 1));
      auto t0 = ret * polyMat{{a}, {b}};
      a = t0[0][0], b = t0[1][0];
      if (a.deg() < k || b.deg() < 1 << (l - 1)) return {a, b, ret};
      auto [q, r] = a.div(b); trim(r);
      ret = polyMat{{0, 1}, {1, -q}} * ret, a = exchange(b, r);
      if (a.deg() < k || b.deg() < 1 << (l - 1)) return {a, b, ret};
      t0 = get<2>(halfEuclid(a.shift(-(1 << (l - 2))), b.shift(-(1 << (l - 2))), k - (1 << (l - 2)), l - 1));
      auto t1 = t0 * polyMat{{a}, {b}};
      a = t1[0][0], b = t1[1][0], ret = t0 * ret;
      return {a, b, ret};
    };
  while (a.deg() >= k) {
    auto res = halfEuclid(a, b, k, l);
    a = get<0>(res), b = get<1>(res), mat = get<2>(res) * mat;
    if (a.deg() < k) break;
    auto [q, r] = a.div(b); trim(r);
    mat = polyMat{{0, 1}, {1, -q}} * mat, a = exchange(b, r);
    for (; l && 1 << (l - 1) > a.deg(); --l);
  }
  return {a, mat[0][0]};
}
pair<poly, poly> berlekampMassey(poly f) {
  auto [p, q] = fracApprox(f, poly{1}.shift(f.size()), f.size() >> 1);
  if (q[0] != 1) {
    int nv = nt.inv(q[0]);
    p *= nv, q *= nv;
  }
  return {p, q};
}

struct RelaxedConvolution {
  static const int LG = 19;
  static const int B = 16;
  void run(int *f, int *g, int n, const function<void(int)> &relax) {
    vector<vector<int>> savef(LG + 1), saveg(LG + 1);
    function<void(int, int)> divideConquer = [&](int l, int r) {
      if (r - l <= BRUTE_N2_LIMIT) {
        for (int i = l + 1; i <= r; ++i) {
          for (int j = l; j < i; ++j)
            fam(f[i], f[j], g[i - j]);
          if (l > 0)
            for (int j = l; j < i; ++j)
              fam(f[i], g[j], f[i - j]);
          relax(i);
        }
        return ;
      }
      if (l == 0) {
        int mid = ((l + r) >> 1) + 5;
        divideConquer(l, mid);
        int lgd = 0;
        while ((1 << lgd) <= r)
          ++lgd;
        vector<int> tf(1 << lgd), tg(1 << lgd);
        for (int i = 0; i < mid; ++i)
          tf[i] = f[i], tg[i] = g[i];
        fft(tf.data(), lgd), fft(tg.data(), lgd);
        for (int i = 0; i < (1 << lgd); ++i)
          tf[i] = (ll)tf[i] * tg[i] % mod;
        fft(tf.data(), lgd, -1);
        for (int i = mid + 1; i <= r; ++i)
          add(f[i], tf[i]);
        divideConquer(mid, r);
        return ;
      }
      int d = (r - l) / B + 1, lgd = 0, lg;
      while ((1 << lgd) <= d * 2)
        ++lgd;
      d = (1 << (lgd - 1)) - 1, lg = (r - l + d - 1) / d;

      vector<int> dftf = savef[lgd], dftg = saveg[lgd];
      dftf.resize(lg << (lgd + 1)), dftg.resize(lg << (lgd + 1));
      for (int i = lg << lgd; i < (lg << (lgd + 1)); ++i)
        dftf[i] = dftg[i] = 0;
      int ef = lg;
      for (int i = 0; i < lg; ++i) {
        if ((i << lgd) < savef[lgd].size())
          continue;
        if ((i + 2) * d >= l)
          --ef;
        for (int j = 0; j <= min(d * 2, n - i * d); ++j)
          dftf[(i << lgd) + j] = f[i * d + j],
          dftg[(i << lgd) + j] = g[i * d + j];
        fft(dftf.data() + (i << lgd), lgd), fft(dftg.data() + (i << lgd), lgd);
      }
      if (savef[lgd].size() < (ef << lgd))
        savef[lgd] = vector<int>(dftf.begin(), dftf.begin() + (ef << lgd)),
        saveg[lgd] = vector<int>(dftg.begin(), dftg.begin() + (ef << lgd));
      for (int i = 0; i < lg; ++i) {
        if (i) {
          for (int j = 0; j < (1 << lgd); ++j)
            add(dftf[((lg + i) << lgd) + j], dftg[((lg + i) << lgd) + j]);
          fft(dftf.data() + ((lg + i) << lgd), lgd, -1);
        }
        for (int j = 1; j <= min(d, r - l - i * d); ++j)
          add(f[l + i * d + j], dftf[((lg + i) << lgd) + d + j]);
        divideConquer(l + i * d, min(l + (i + 1) * d, r));
        if (i + 1 < lg) {
          for (int j = 0; j < d; ++j)
            dftf[((lg + i) << lgd) + j] = f[l + i * d + j],
            dftg[((lg + i) << lgd) + j] = g[l + i * d + j];
          for (int j = d; j < (1 << lgd); ++j)
            dftf[((lg + i) << lgd) + j] = dftg[((lg + i) << lgd) + j] = 0;
          fft(dftf.data() + ((lg + i) << lgd), lgd), fft(dftg.data() + ((lg + i) << lgd), lgd);
        }
        for (int j = i + 1; j < lg; ++j)
          for (int k = 0; k < (1 << lgd); ++k)
            fam(dftf[((lg + j) << lgd) + k], dftg[((j - i - 1) << lgd) + k], dftf[((lg + i) << lgd) + k]),
            fam(dftg[((lg + j) << lgd) + k], dftf[((j - i - 1) << lgd) + k], dftg[((lg + i) << lgd) + k]);
      }
    };
    relax(0), divideConquer(0, n);
  }
} rc;

int main() {
  int n; scanf("%d", &n);
  poly f = zeroes(n); vector<int> g(n + 1);
  rc.run(f.base(), g.data(), n, [&](int k) {
    if (!k) return ;
    if (k == 1) f[k] = 1;
    fam(f[k], k - 1, f[k - 1]), g[k] = exchange(f[k], (ll)f[k] * ginv[k] % mod);
  });
  int ans = (ll)(-f + 1).inv()[n] * gfac[n] % mod;
  printf("%d\n", ans);
}

这程序好像有点Bug,我给组数据试试?

详细

Test #1:

score: 100
Accepted
time: 0ms
memory: 3952kb

input:

3

output:

28

result:

ok 1 number(s): "28"

Test #2:

score: 0
Accepted
time: 0ms
memory: 3980kb

input:

1

output:

1

result:

ok 1 number(s): "1"

Test #3:

score: 0
Accepted
time: 0ms
memory: 3884kb

input:

2

output:

4

result:

ok 1 number(s): "4"

Test #4:

score: 0
Accepted
time: 0ms
memory: 4180kb

input:

4

output:

282

result:

ok 1 number(s): "282"

Test #5:

score: 0
Accepted
time: 0ms
memory: 3972kb

input:

5

output:

3718

result:

ok 1 number(s): "3718"

Test #6:

score: 0
Accepted
time: 0ms
memory: 4048kb

input:

6

output:

60694

result:

ok 1 number(s): "60694"

Test #7:

score: 0
Accepted
time: 0ms
memory: 3960kb

input:

7

output:

1182522

result:

ok 1 number(s): "1182522"

Test #8:

score: 0
Accepted
time: 0ms
memory: 3956kb

input:

8

output:

26791738

result:

ok 1 number(s): "26791738"

Test #9:

score: 0
Accepted
time: 0ms
memory: 4044kb

input:

9

output:

692310518

result:

ok 1 number(s): "692310518"

Test #10:

score: 0
Accepted
time: 0ms
memory: 3884kb

input:

10

output:

135061370

result:

ok 1 number(s): "135061370"

Test #11:

score: 0
Accepted
time: 0ms
memory: 3968kb

input:

100

output:

423669705

result:

ok 1 number(s): "423669705"

Test #12:

score: 0
Accepted
time: 1ms
memory: 4044kb

input:

1234

output:

878522960

result:

ok 1 number(s): "878522960"

Test #13:

score: 0
Accepted
time: 36ms
memory: 7812kb

input:

54321

output:

827950477

result:

ok 1 number(s): "827950477"

Test #14:

score: 0
Accepted
time: 54ms
memory: 9832kb

input:

65536

output:

380835743

result:

ok 1 number(s): "380835743"

Test #15:

score: 0
Accepted
time: 125ms
memory: 15016kb

input:

131072

output:

842796122

result:

ok 1 number(s): "842796122"

Test #16:

score: 0
Accepted
time: 102ms
memory: 12460kb

input:

131071

output:

981021531

result:

ok 1 number(s): "981021531"

Test #17:

score: 0
Accepted
time: 100ms
memory: 12532kb

input:

131070

output:

480197639

result:

ok 1 number(s): "480197639"

Test #18:

score: 0
Accepted
time: 129ms
memory: 17012kb

input:

131074

output:

383000585

result:

ok 1 number(s): "383000585"

Test #19:

score: 0
Accepted
time: 133ms
memory: 17036kb

input:

131073

output:

316664839

result:

ok 1 number(s): "316664839"

Test #20:

score: 0
Accepted
time: 196ms
memory: 21212kb

input:

250000

output:

119658643

result:

ok 1 number(s): "119658643"

Test #21:

score: 0
Accepted
time: 193ms
memory: 21552kb

input:

249999

output:

78110138

result:

ok 1 number(s): "78110138"

Test #22:

score: 0
Accepted
time: 177ms
memory: 21440kb

input:

249998

output:

297253469

result:

ok 1 number(s): "297253469"

Extra Test:

score: 0
Extra Test Passed