HackerRank Definite Random Walks Solution
In this post, we will solve HackerRank Definite Random Walks Problem Solution.
Alex has a board game consisting of:
- A chip for marking his current location on the board.
n fields numbered from 1 to n. Each position i has a value. fi. denoting the next position for the chip to jump to from that field. - A die with m faces numbered from 0 to m – 1. Each face j has a probability, p,, of being rolled.
Alex then performs the following actions:
Begins the game by placing the chip at a position in a field randomly and with equiprobability. - Takes & turns; during each turn he:
- Rolls the die. We’ll denote the number rolled during a turn as d.
- Jumps the chip d times. Recall that each field contains a value denoting the next field number to jump to.
- After completing & turns, the game ends and he must calculate the respective probabilities for each field as to whether the game ended with the chip in that field.
Given n. m. k. the game board, and the probabilities for each die face, print n lines where each line i contains the probability that the chip is on field & at the end of the game.
Note: All the probabilities in this task are rational numbers modulo M = 998244353. That is, if the probability can be expressed as the irreducible fraction where q mod M 0. then it corresponds to the number (px q¹) mod M (or, alternatively,
px q = x( mod M)). Click here to learn about Modular Multiplicative Inverse.
Input Format
The first line contains three space-separated integers describing the respective values of n (the number of positions), m (the number of die faces), and (the number of turns). The second line contains n space-separated integers describing the respective values of each f. (i.e., the index of the field that field i can transition to).
The third line contains m space-separated integers describing the respective values of each Pj (where 0 < Pj < M) describing the probabilities of the faces of the m-sided die.
Output Format
Print n lines of output in which each line i contains a single integer,, (where
0 < xi < M), denoting the probability that the chip will be on field & after & turns.
Sample Input 0
4 5 1
2 3 2 4
332748118 332748118 332748118 0 0
Sample Output 0
582309206
332748118
332748118
748683265
Definite Random Walks C++ Solution
#include <bits/stdc++.h>
using namespace std;
#define sz(x) ((int) (x).size())
#define forn(i,n) for (int i = 0; i < int(n); ++i)
typedef long long ll;
typedef long long i64;
typedef long double ld;
typedef pair<int, int> pii;
const int inf = int(1e9) + int(1e5);
const ll infl = ll(2e18) + ll(1e10);
const int mod = 998244353;
const int root = 787603194;
const int LG = 19;
int add(int a, int b) {
a += b;
if (a >= mod)
a -= mod;
return a;
}
int sub(int a, int b) {
return add(a, mod - b);
}
void addTo(int &a, int b) {
a += b;
if (a >= mod)
a -= mod;
}
int mul(ll a, ll b) {
return a * b % mod;
}
int binpow(int a, int deg) {
int res = 1;
while (deg) {
if (deg & 1)
res = mul(res, a);
deg >>= 1;
a = mul(a, a);
}
return res;
}
int rev(int x) {
assert(x != 0);
return binpow(x, mod - 2);
}
int divide(int a, int b) {
return mul(a, rev(b));
}
vector<int> ang[2][LG + 1];
void initFFT() {
int rroot = rev(root);
int x0 = 1, x1 = 1;
ang[0][LG].resize(1 << LG);
ang[1][LG].resize(1 << LG);
forn (i, 1 << LG) {
ang[0][LG][i] = x0;
ang[1][LG][i] = x1;
x0 = mul(x0, root);
x1 = mul(x1, rroot);
}
for (int lg = LG - 1; lg >= 0; --lg) {
forn (q, 2) {
ang[q][lg].resize(1 << lg);
forn (i, 1 << lg)
ang[q][lg][i] = ang[q][lg + 1][i * 2];
}
}
}
void recFFT(int *a, int lg, bool inv) {
if (lg == 0)
return;
int hlen = (1 << lg) >> 1;
recFFT(a, lg - 1, inv);
recFFT(a + hlen, lg - 1, inv);
forn (i, hlen) {
int u = a[i];
int v = mul(ang[inv][lg][i], a[i + hlen]);
a[i] = add(u, v);
a[i + hlen] = sub(u, v);
}
}
void FFT(int *a, int n, bool inv) {
int lg = 0;
while ((1 << lg) < n)
++lg;
assert(n == (1 << lg));
int j = 0, bit;
for (int i = 1; i < n; ++i) {
for (bit = n >> 1; j & bit; bit >>= 1)
j ^= bit;
j ^= bit;
if (i < j)
swap(a[i], a[j]);
}
recFFT(a, lg, inv);
if (inv) {
int rn = rev(n);
forn (i, n)
a[i] = mul(a[i], rn);
}
}
void mul(int *a, int *b, int n) {
FFT(a, n, false);
if (a != b)
FFT(b, n, false);
forn (i, n)
a[i] = mul(a[i], b[i]);
FFT(a, n, true);
}
void cyclicMul(int *a, int *b, int n, int pre, int len) {
mul(a, b, n);
int j = 0;
forn (i, n) {
if (j < i) {
addTo(a[j], a[i]);
a[i] = 0;
}
++j;
if (j == pre + len)
j -= len;
}
}
const int maxn = 100100;
int ans[maxn];
int to[maxn];
vector<int> g[maxn];
int p[maxn];
bool used[maxn];
map<int, vector<int>> byLen;
int a[1 << LG];
int _a[1 << LG];
int cur[1 << LG];
int N, M, K;
int calcSize(int u, int root) {
int cnt = 1;
for (int v: g[u])
if (v != root)
cnt += calcSize(v, root);
return cnt;
}
int tmp[1 << LG];
void powk(int n, int pre, int len) {
forn (i, n)
a[i] = 0;
a[0] = rev(N);
int deg = K;
while (deg) {
if (deg & 1) {
forn (i, n)
tmp[i] = cur[i];
cyclicMul(a, tmp, n, pre, len);
}
deg >>= 1;
cyclicMul(cur, cur, n, pre, len);
}
}
int ROOT;
int in[maxn];
int out[maxn];
int ord[maxn];
int byh[maxn];
int h[maxn];
int timer;
void dfs1(int u, int ch) {
ord[timer] = u;
in[u] = timer++;
h[u] = ch;
for (int v: g[u]) {
if (v == ROOT)
continue;
dfs1(v, ch + 1);
}
out[u] = timer;
}
const int bs = 3500;
const int blocks = maxn / bs + 2;
int bl[blocks][1 << LG];
void calcBlock(int l, int r, int cnt, int lg, int *to) {
forn (i, 1 << lg)
to[i] = 0;
for (int i = l; i < r; ++i) {
int ch = h[ord[i]];
to[cnt - ch]++;
}
FFT(to, 1 << lg, false);
forn (i, 1 << lg)
to[i] = mul(to[i], _a[i]);
FFT(to, 1 << lg, true);
}
void solveComponent(int root, int m) {
ROOT = root, timer = 0;
dfs1(root, 0);
const int cnt = timer;
forn (i, cnt)
byh[i] = 0;
forn (i, cnt) {
int u = ord[i];
byh[h[u]]++;
}
int lg = 0;
while ((1 << lg) < m + cnt)
++lg;
assert(lg <= LG);
forn (i, m)
_a[i] = a[i];
for (int i = m; i < (1 << lg); ++i)
_a[i] = 0;
FFT(_a, 1 << lg, false);
for (int l = 0; l + bs <= cnt; l += bs)
calcBlock(l, l + bs, cnt, lg, bl[l / bs]);
forn (ii, cnt) {
int u = ord[ii];
int ch = h[u];
int l = in[u];
int r = out[u];
int lto = min(r, ((l + bs - 1) / bs) * bs);
for (; l < lto; ++l)
addTo(ans[u], a[h[ord[l]] - ch]);
int rto = max(l, (r / bs) * bs);
for (; r > rto; --r)
addTo(ans[u], a[h[ord[r - 1]] - ch]);
while (l < r) {
addTo(ans[u], bl[l / bs][cnt - ch]);
l += bs;
}
}
calcBlock(0, cnt, cnt, lg, bl[blocks - 1]);
int u = root;
for (int i = cnt + 1; i < (1 << lg); ++i) {
u = to[u];
addTo(ans[u], bl[blocks - 1][i]);
}
}
void solveLen(int len) {
//cerr << "solveLen " << len << ":";
//for (int x: byLen[len])
//cerr << ' ' << x + 1;
//cerr << '\n';
vector<pii> v;
for (int u: byLen[len])
v.emplace_back(calcSize(u, u), u);
sort(v.begin(), v.end());
reverse(v.begin(), v.end());
int pre = v[0].first;
int lg = 0;
while ((1 << lg) < (pre + len) * 2)
++lg;
assert(lg <= LG);
forn (i, 1 << lg)
cur[i] = 0;
int j = 0;
forn (i, M) {
addTo(cur[j], p[i]);
++j;
if (j == pre + len)
j -= len;
}
powk(1 << lg, pre, len);
for (auto p: v) {
while (p.first <= pre - len) {
for (int i = pre - len; i < pre; ++i) {
addTo(a[i], a[i + len]);
a[i + len] = 0;
}
pre -= len;
}
solveComponent(p.second, pre + len);
}
}
void solve() {
forn (i, N) {
int u = i;
vector<int> st;
while (!used[u]) {
st.push_back(u);
used[u] = true;
u = to[u];
}
int len = 1;
while (!st.empty() && st.back() != u)
st.pop_back(), ++len;
if (!st.empty())
byLen[len].push_back(u);
}
for (const auto &p: byLen)
solveLen(p.first);
}
int main() {
#ifdef LOCAL
assert(freopen("test.in", "r", stdin));
#else
#endif
initFFT();
scanf("%d%d%d", &N, &M, &K);
forn (i, N) {
scanf("%d", &to[i]);
--to[i];
g[to[i]].push_back(i);
}
int sum = 0;
forn (i, M) {
scanf("%d", &p[i]);
addTo(sum, p[i]);
}
assert(sum == 1);
solve();
sum = 0;
forn (i, N) {
cout << ans[i] << '\n';
addTo(sum, ans[i]);
}
assert(sum == 1);
cerr << "time: " << clock() / 1000 << "ms\n";
}
Definite Random Walks Java Solution
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.InputMismatchException;
import java.util.Map;
public class G2 {
InputStream is;
PrintWriter out;
String INPUT = "";
// String INPUT = "7 3 1\r\n" +
// "4 2 7 2 4 3 5 \r\n" +
// "0 1 0 ";
// String INPUT = "4 2 1\r\n" +
// "1 4 2 3 \r\n" +
// "748683265 249561089";
// 1 2->4->3->2
// String INPUT = "4 2 2\r\n" +
// "2 3 1 3 \r\n" +
// "0 1 ";
// 1/4 1/4 1/2
// 1/2 1/4 1/4
// String INPUT = "3 1 2\r\n" +
// "1 3 1 \r\n" +
// "1 ";
// String INPUT = "4 5 1\r\n" +
// "2 3 2 4\r\n" +
// "332748118 332748118 332748118 0 0";
void solve()
{
int n = ni(), m = ni(), K = ni();
int[] f = na(n);
for(int i = 0;i < n;i++)f[i]--;
long[] ps = new long[m];
for(int i = 0;i < m;i++)ps[i] = nl();
int mod = 998244353;
// tr("phase 0");
long[] made = make(ps, K, n+1, 1);
int H = (int)Math.sqrt(n)*8; // naive height limit
int B = (int)Math.sqrt(n)*8; // cycle split period
SplitResult sres = split(f);
int[] tclus = new int[n];
Arrays.fill(tclus, -1);
for(int i = n-1;i >= 0;i--){
int cur = sres.ord[i];
if(sres.incycle[cur]){
tclus[cur] = cur;
}else{
tclus[cur] = tclus[f[cur]];
}
}
// tr("phase 1");
long[] rets = new long[n];
int[][] maps = makeBuckets(tclus, n);
for(int i = 0;i < n;i++){
if(maps[i].length > 0){
int[] map = maps[i];
int[] lpar = new int[map.length];
int p = 0;
for(int x : maps[i]){
if(sres.incycle[x]){
lpar[p++] = -1;
}else{
lpar[p++] = Arrays.binarySearch(map, f[x]);
}
}
long[] res = solve(parentToG(lpar), lpar, made, H, Arrays.binarySearch(map, i));
for(int j = 0;j < res.length;j++){
if(!sres.incycle[map[j]]){
rets[map[j]] += res[j];
}
}
}
}
// tr("phase 2");
int[] maxdep = new int[n];
for(int i = 0;i < n;i++){
int cur = sres.ord[i];
if(!sres.incycle[cur]){
maxdep[f[cur]] = Math.max(maxdep[f[cur]], maxdep[cur]+1);
}
}
int[] tdep = new int[n];
for(int i = n-1;i >= 0;i--){
int cur = sres.ord[i];
if(!sres.incycle[cur]){
tdep[cur] = tdep[f[cur]]+1;
}
}
// tr("phase 3");
// for(int j = made.length-1-1;j >= 0;j--){
// made[j] += made[j+1];
// if(made[j] >= mod)made[j] -= mod;
// }
boolean[] ved = new boolean[n];
int[] cycle = new int[n];
Map<Long, long[]> cache = new HashMap<>();
for(int i = 0;i < n;i++){
if(sres.incycle[i] && !ved[i]){
int p = 0;
ved[i] = true;
cycle[p++] = i;
int lmaxdep = maxdep[i];
for(int j = f[i];!ved[j];j = f[j]){
ved[j] = true;
cycle[p++] = j;
lmaxdep = Math.max(lmaxdep, maxdep[j]);
}
int tail = lmaxdep+p+1;
int fp = p;
long[] di = cache.computeIfAbsent((long)tail<<32|fp, (z) -> {
long[] res = make(ps, K, tail, fp);
for(int j = res.length-fp-1;j >= 0;j--){
res[j] += res[j+fp];
if(res[j] >= mod)res[j] -= mod;
}
return res;
});
// 0.1 0.2 0.3 0.4
// 0.4 0.6
// 0.1 0.6 0.3
// 0.1 0.2 0.3 0.4
// 0.4 0.6
// 0.6 0.3
// 0.3 0.4
if(p <= B){
for(int j = 0;j < p;j++){
for(int v : maps[cycle[j]]){
for(int k = tdep[v], l = j;k < tdep[v]+p;k++,l++){
if(l == p)l = 0;
rets[cycle[l]] += di[k];
}
}
}
}else{
inputed = null;
for(int b = 0;b < p;b+=B){
long[] ents = new long[tail+1];
for(int j = 0;j < b;j++){
for(int v : maps[cycle[j]]){
ents[tail-(b-j)-tdep[v]]++;
}
}
for(int j = b+B;j < p;j++){
for(int v : maps[cycle[j]]){
ents[tail-b-(p-j)-tdep[v]]++;
}
}
long[] ced = convoluteSimply(ents, di, mod, 3);
inputed = saved;
for(int k = b;k < p && k < b+B;k++){
rets[cycle[k]] += ced[tail+k-b];
}
}
inputed = null;
// remainder
for(int j = 0;j < p;j++){
for(int v : maps[cycle[j]]){
for(int k = tdep[v], l = j;k < tdep[v]+p && l < p && l < j/B*B+B;k++,l++){
rets[cycle[l]] += di[k];
}
for(int k = tdep[v]+p-j+j/B*B, l = j/B*B;k < tdep[v]+p && l < j;k++,l++){
rets[cycle[l]] += di[k];
}
}
}
}
}
}
// tr("phase 4");
long RN = invl(n, mod);
for(long ret : rets){
out.println(ret%mod*RN%mod);
}
}
int mod = 998244353;
long[] make(long[] ps, int K, int tail, int period)
{
long[] ms = ps;
if(ps.length > tail+period){
ms = Arrays.copyOf(ps, tail+period);
for(int j = tail+period, k = tail;j < ps.length;j++,k++){
if(k == tail+period)k -= period;
ms[k] += ps[j];
if(ms[k] >= mod)ms[k] -= mod;
}
}
long[] pps = new long[1];
pps[0] = 1;
for(int i = 0;1<<i <= K;i++){
if(K<<~i<0){
long[] res = convoluteSimply(pps, ms, mod, 3);
for(int j = res.length-1-period;j >= tail;j--){
res[j] += res[j+period];
res[j+period] = 0;
if(res[j] >= mod)res[j] -= mod;
}
pps = Arrays.copyOf(res, Math.min(tail+period, pps.length+ms.length));
}
if(1<<i+1 <= K){
long[] res = convoluteSimply(ms, ms, mod, 3);
for(int j = res.length-1-period;j >= tail;j--){
res[j] += res[j+period];
res[j+period] = 0;
if(res[j] >= mod)res[j] -= mod;
}
ms = Arrays.copyOf(res, Math.min(tail+period, ms.length+ms.length));
}
}
if(pps.length < tail+period)pps = Arrays.copyOf(pps, tail+period);
return pps;
}
long[] solve(int[][] g, int[] par, long[] di, int H, int root)
{
int n = g.length;
long[] ret = new long[n];
int[][] pars = parents3(g, root);
int[] des = new int[n];
long[] ws = new long[n];
int[] ord = pars[1], dep = pars[2];
int[] marked = new int[n];
for(int i = n-1;i >= 0;i--){
int cur = ord[i];
des[cur]++;
ws[cur] += des[cur];
if(marked[cur] == 0 && ws[cur] > (long)H*des[cur]){
marked[cur] = 1;
}
if(i > 0){
des[par[cur]] += des[cur];
ws[par[cur]] += ws[cur];
if(marked[cur] >= 1)marked[par[cur]] = 2;
}
}
// tr(g, root);
// tr(marked);
// large
// marked node
for(int i = 0;i < n;i++){
if(marked[i] == 1){
int[] fdep = new int[n];
collect(i, par[i], g, dep, fdep);
for(int j = par[i];j != -1 && marked[j] == 2;j = par[j]){
fdep[dep[j]]++;
marked[j] = 3;
}
int lmaxdep = n;
for(int j = n-1;j >= 0;j--){
if(fdep[j] > 0){
lmaxdep = j;
break;
}
}
long[] rfdep = new long[lmaxdep+1];
for(int j = 0;j <= lmaxdep;j++){
rfdep[lmaxdep-j] = fdep[j];
}
// tr("fdep", fdep, marked);
long[] ced = convoluteSimply(rfdep, Arrays.copyOf(di, lmaxdep+1), mod, 3);
for(int j = i;j != -1;j = par[j]){
ret[j] += ced[lmaxdep-dep[j]];
}
}
}
// small
for(int i = 0;i < n;i++){
if(marked[i] == 0){
for(int j = i;j != -1 && marked[j] != 1;j = par[j]){
ret[j] += di[dep[i]-dep[j]];
}
}
}
for(int i = 0;i < n;i++){
ret[i] %= mod;
}
return ret;
}
void collect(int cur, int par, int[][] g, int[] dep, int[] fdep)
{
fdep[dep[cur]]++;
for(int e : g[cur]){
if(e != par)collect(e, cur, g, dep, fdep);
}
}
// library
public static int[][] parents3(int[][] g, int root) {
int n = g.length;
int[] par = new int[n];
Arrays.fill(par, -1);
int[] depth = new int[n];
depth[0] = 0;
int[] q = new int[n];
q[0] = root;
for (int p = 0, r = 1; p < r; p++) {
int cur = q[p];
for (int nex : g[cur]) {
if (par[cur] != nex) {
q[r++] = nex;
par[nex] = cur;
depth[nex] = depth[cur] + 1;
}
}
}
return new int[][] { par, q, depth };
}
public static final int[] NTTPrimes = {1053818881, 1051721729, 1045430273, 1012924417, 1007681537, 1004535809, 998244353, 985661441, 976224257, 975175681};
public static final int[] NTTPrimitiveRoots = {7, 6, 3, 5, 3, 3, 3, 3, 3, 17};
// public static final int[] NTTPrimes = {1012924417, 1004535809, 998244353, 985661441, 975175681, 962592769, 950009857, 943718401, 935329793, 924844033};
// public static final int[] NTTPrimitiveRoots = {5, 3, 3, 3, 17, 7, 7, 7, 3, 5};
static long[] inputed;
static long[] saved;
public static long[] convoluteSimply(long[] a, long[] b, int P, int g)
{
int m = Math.max(2, Integer.highestOneBit(Math.max(a.length, b.length)-1)<<2);
long[] fa = nttmb(a, m, false, P, g);
long[] fb = a == b ? fa : inputed != null ? inputed : nttmb(b, m, false, P, g);
saved = fb;
for(int i = 0;i < m;i++){
fa[i] = fa[i]*fb[i]%P;
}
return nttmb(fa, m, true, P, g);
}
public static long[] convolute(long[] a, long[] b)
{
int USE = 2;
int m = Math.max(2, Integer.highestOneBit(Math.max(a.length, b.length)-1)<<2);
long[][] fs = new long[USE][];
for(int k = 0;k < USE;k++){
int P = NTTPrimes[k], g = NTTPrimitiveRoots[k];
long[] fa = nttmb(a, m, false, P, g);
long[] fb = a == b ? fa : nttmb(b, m, false, P, g);
for(int i = 0;i < m;i++){
fa[i] = fa[i]*fb[i]%P;
}
fs[k] = nttmb(fa, m, true, P, g);
}
int[] mods = Arrays.copyOf(NTTPrimes, USE);
long[] gammas = garnerPrepare(mods);
int[] buf = new int[USE];
for(int i = 0;i < fs[0].length;i++){
for(int j = 0;j < USE;j++)buf[j] = (int)fs[j][i];
long[] res = garnerBatch(buf, mods, gammas);
long ret = 0;
for(int j = res.length-1;j >= 0;j--)ret = ret * mods[j] + res[j];
fs[0][i] = ret;
}
return fs[0];
}
public static long[] convolute(long[] a, long[] b, int USE, int mod)
{
int m = Math.max(2, Integer.highestOneBit(Math.max(a.length, b.length)-1)<<2);
long[][] fs = new long[USE][];
for(int k = 0;k < USE;k++){
int P = NTTPrimes[k], g = NTTPrimitiveRoots[k];
long[] fa = nttmb(a, m, false, P, g);
long[] fb = a == b ? fa : nttmb(b, m, false, P, g);
for(int i = 0;i < m;i++){
fa[i] = fa[i]*fb[i]%P;
}
fs[k] = nttmb(fa, m, true, P, g);
}
int[] mods = Arrays.copyOf(NTTPrimes, USE);
long[] gammas = garnerPrepare(mods);
int[] buf = new int[USE];
for(int i = 0;i < fs[0].length;i++){
for(int j = 0;j < USE;j++)buf[j] = (int)fs[j][i];
long[] res = garnerBatch(buf, mods, gammas);
long ret = 0;
for(int j = res.length-1;j >= 0;j--)ret = (ret * mods[j] + res[j]) % mod;
fs[0][i] = ret;
}
return fs[0];
}
// static int[] wws = new int[270000]; // outer faster
// Modifed Montgomery + Barrett
private static long[] nttmb(long[] src, int n, boolean inverse, int P, int g)
{
long[] dst = Arrays.copyOf(src, n);
int h = Integer.numberOfTrailingZeros(n);
long K = Integer.highestOneBit(P)<<1;
int H = Long.numberOfTrailingZeros(K)*2;
long M = K*K/P;
int[] wws = new int[1<<h-1];
long dw = inverse ? pow(g, P-1-(P-1)/n, P) : pow(g, (P-1)/n, P);
long w = (1L<<32)%P;
for(int k = 0;k < 1<<h-1;k++){
wws[k] = (int)w;
w = modh(w*dw, M, H, P);
}
long J = invl(P, 1L<<32);
for(int i = 0;i < h;i++){
for(int j = 0;j < 1<<i;j++){
for(int k = 0, s = j<<h-i, t = s|1<<h-i-1;k < 1<<h-i-1;k++,s++,t++){
long u = (dst[s] - dst[t] + 2*P)*wws[k];
dst[s] += dst[t];
if(dst[s] >= 2*P)dst[s] -= 2*P;
// long Q = (u&(1L<<32)-1)*J&(1L<<32)-1;
long Q = (u<<32)*J>>>32;
dst[t] = (u>>>32)-(Q*P>>>32)+P;
}
}
if(i < h-1){
for(int k = 0;k < 1<<h-i-2;k++)wws[k] = wws[k*2];
}
}
for(int i = 0;i < n;i++){
if(dst[i] >= P)dst[i] -= P;
}
for(int i = 0;i < n;i++){
int rev = Integer.reverse(i)>>>-h;
if(i < rev){
long d = dst[i]; dst[i] = dst[rev]; dst[rev] = d;
}
}
if(inverse){
long in = invl(n, P);
for(int i = 0;i < n;i++)dst[i] = modh(dst[i]*in, M, H, P);
}
return dst;
}
// Modified Shoup + Barrett
private static long[] nttsb(long[] src, int n, boolean inverse, int P, int g)
{
long[] dst = Arrays.copyOf(src, n);
int h = Integer.numberOfTrailingZeros(n);
long K = Integer.highestOneBit(P)<<1;
int H = Long.numberOfTrailingZeros(K)*2;
long M = K*K/P;
long dw = inverse ? pow(g, P-1-(P-1)/n, P) : pow(g, (P-1)/n, P);
long[] wws = new long[1<<h-1];
long[] ws = new long[1<<h-1];
long w = 1;
for(int k = 0;k < 1<<h-1;k++){
wws[k] = (w<<32)/P;
ws[k] = w;
w = modh(w*dw, M, H, P);
}
for(int i = 0;i < h;i++){
for(int j = 0;j < 1<<i;j++){
for(int k = 0, s = j<<h-i, t = s|1<<h-i-1;k < 1<<h-i-1;k++,s++,t++){
long ndsts = dst[s] + dst[t];
if(ndsts >= 2*P)ndsts -= 2*P;
long T = dst[s] - dst[t] + 2*P;
long Q = wws[k]*T>>>32;
dst[s] = ndsts;
dst[t] = ws[k]*T-Q*P&(1L<<32)-1;
}
}
// dw = dw * dw % P;
if(i < h-1){
for(int k = 0;k < 1<<h-i-2;k++){
wws[k] = wws[k*2];
ws[k] = ws[k*2];
}
}
}
for(int i = 0;i < n;i++){
if(dst[i] >= P)dst[i] -= P;
}
for(int i = 0;i < n;i++){
int rev = Integer.reverse(i)>>>-h;
if(i < rev){
long d = dst[i]; dst[i] = dst[rev]; dst[rev] = d;
}
}
if(inverse){
long in = invl(n, P);
for(int i = 0;i < n;i++){
dst[i] = modh(dst[i] * in, M, H, P);
}
}
return dst;
}
static final long mask = (1L<<31)-1;
public static long modh(long a, long M, int h, int mod)
{
long r = a-((M*(a&mask)>>>31)+M*(a>>>31)>>>h-31)*mod;
return r < mod ? r : r-mod;
}
private static long[] garnerPrepare(int[] m)
{
int n = m.length;
assert n == m.length;
if(n == 0)return new long[0];
long[] gamma = new long[n];
for(int k = 1;k < n;k++){
long prod = 1;
for(int i = 0;i < k;i++){
prod = prod * m[i] % m[k];
}
gamma[k] = invl(prod, m[k]);
}
return gamma;
}
private static long[] garnerBatch(int[] u, int[] m, long[] gamma)
{
int n = u.length;
assert n == m.length;
long[] v = new long[n];
v[0] = u[0];
for(int k = 1;k < n;k++){
long temp = v[k-1];
for(int j = k-2;j >= 0;j--){
temp = (temp * m[j] + v[j]) % m[k];
}
v[k] = (u[k] - temp) * gamma[k] % m[k];
if(v[k] < 0)v[k] += m[k];
}
return v;
}
private static long pow(long a, long n, long mod) {
// a %= mod;
long ret = 1;
int x = 63 - Long.numberOfLeadingZeros(n);
for (; x >= 0; x--) {
ret = ret * ret % mod;
if (n << 63 - x < 0)
ret = ret * a % mod;
}
return ret;
}
private static long invl(long a, long mod) {
long b = mod;
long p = 1, q = 0;
while (b > 0) {
long c = a / b;
long d;
d = a;
a = b;
b = d % b;
d = p;
p = q;
q = d - c * q;
}
return p < 0 ? p + mod : p;
}
public static int[][] parentToG(int[] par)
{
int n = par.length;
int[] ct = new int[n];
for(int i = 0;i < n;i++){
if(par[i] >= 0){
ct[i]++;
ct[par[i]]++;
}
}
int[][] g = new int[n][];
for(int i = 0;i < n;i++){
g[i] = new int[ct[i]];
}
for(int i = 0;i < n;i++){
if(par[i] >= 0){
g[par[i]][--ct[par[i]]] = i;
g[i][--ct[i]] = par[i];
}
}
return g;
}
public static int[][] makeBuckets(int[] a, int sup)
{
int n = a.length;
int[][] bucket = new int[sup+1][];
int[] bp = new int[sup+1];
for(int i = 0;i < n;i++)bp[a[i]]++;
for(int i = 0;i <= sup;i++)bucket[i] = new int[bp[i]];
for(int i = n-1;i >= 0;i--)bucket[a[i]][--bp[a[i]]] = i;
return bucket;
}
public static class SplitResult
{
public boolean[] incycle;
public int[] ord;
}
public static SplitResult split(int[] f)
{
int n = f.length;
boolean[] incycle = new boolean[n];
Arrays.fill(incycle, true);
int[] indeg = new int[n];
for(int i = 0;i < n;i++)indeg[f[i]]++;
int[] q = new int[n];
int qp = 0;
for(int i = 0;i < n;i++){
if(indeg[i] == 0)q[qp++] = i;
}
for(int r = 0;r < qp;r++){
int cur = q[r];
indeg[cur] = -9999999;
incycle[cur] = false;
int e = f[cur];
indeg[e]--;
if(indeg[e] == 0)q[qp++] = e;
}
for(int i = 0;i < n;i++){
if(indeg[i] == 1){
q[qp++] = i;
}
}
assert qp == n;
SplitResult ret = new SplitResult();
ret.incycle = incycle;
ret.ord = q;
return ret;
}
void run() throws Exception
{
is = INPUT.isEmpty() ? System.in : new ByteArrayInputStream(INPUT.getBytes());
out = new PrintWriter(System.out);
long s = System.currentTimeMillis();
solve();
out.flush();
if(!INPUT.isEmpty())tr(System.currentTimeMillis()-s+"ms");
}
public static void main(String[] args) throws Exception { new G2().run(); }
private byte[] inbuf = new byte[1024];
public int lenbuf = 0, ptrbuf = 0;
private int readByte()
{
if(lenbuf == -1)throw new InputMismatchException();
if(ptrbuf >= lenbuf){
ptrbuf = 0;
try { lenbuf = is.read(inbuf); } catch (IOException e) { throw new InputMismatchException(); }
if(lenbuf <= 0)return -1;
}
return inbuf[ptrbuf++];
}
private boolean isSpaceChar(int c) { return !(c >= 33 && c <= 126); }
private int skip() { int b; while((b = readByte()) != -1 && isSpaceChar(b)); return b; }
private double nd() { return Double.parseDouble(ns()); }
private char nc() { return (char)skip(); }
private String ns()
{
int b = skip();
StringBuilder sb = new StringBuilder();
while(!(isSpaceChar(b))){ // when nextLine, (isSpaceChar(b) && b != ' ')
sb.appendCodePoint(b);
b = readByte();
}
return sb.toString();
}
private char[] ns(int n)
{
char[] buf = new char[n];
int b = skip(), p = 0;
while(p < n && !(isSpaceChar(b))){
buf[p++] = (char)b;
b = readByte();
}
return n == p ? buf : Arrays.copyOf(buf, p);
}
private char[][] nm(int n, int m)
{
char[][] map = new char[n][];
for(int i = 0;i < n;i++)map[i] = ns(m);
return map;
}
private int[] na(int n)
{
int[] a = new int[n];
for(int i = 0;i < n;i++)a[i] = ni();
return a;
}
private int ni()
{
int num = 0, b;
boolean minus = false;
while((b = readByte()) != -1 && !((b >= '0' && b <= '9') || b == '-'));
if(b == '-'){
minus = true;
b = readByte();
}
while(true){
if(b >= '0' && b <= '9'){
num = num * 10 + (b - '0');
}else{
return minus ? -num : num;
}
b = readByte();
}
}
private long nl()
{
long num = 0;
int b;
boolean minus = false;
while((b = readByte()) != -1 && !((b >= '0' && b <= '9') || b == '-'));
if(b == '-'){
minus = true;
b = readByte();
}
while(true){
if(b >= '0' && b <= '9'){
num = num * 10 + (b - '0');
}else{
return minus ? -num : num;
}
b = readByte();
}
}
private static void tr(Object... o) { System.out.println(Arrays.deepToString(o)); }
}