QOJ.ac
QOJ
ID | 题目 | 提交者 | 结果 | 用时 | 内存 | 语言 | 文件大小 | 提交时间 | 测评时间 |
---|---|---|---|---|---|---|---|---|---|
#601468 | #8781. Element-Wise Comparison | ericmegalovania | RE | 48ms | 618368kb | C++20 | 9.6kb | 2024-09-30 00:51:03 | 2024-09-30 00:51:04 |
Judging History
answer
#define __AVX__ 1
#define __AVX2__ 1
#pragma GCC optimize("Ofast,unroll-loops")
#pragma GCC target("avx2,bmi,fma,popcnt")
#include<immintrin.h>
#include<bits/stdc++.h>
using namespace std;
//#define ONLINE
#ifndef ONLINE
char DEBUG_BUFFER[1000];
#define debug(...) {sprintf(DEBUG_BUFFER,##__VA_ARGS__);\
cerr<<"\033[1;36m"<<DEBUG_BUFFER<<"\033[0;2m"<<"\033[0m";}
#else
#define debug(...) ;
#endif
using LL=long long;
using PII=pair<int,int>;
#define all(x) (x).begin(),(x).end()
#define allr(x) (x).rbegin(),(x).rend()
#define FAST_IO {ios::sync_with_stdio(false);cin.tie(nullptr);}
inline int read(){static int x; cin>>x; return x;}
inline LL readLL(){static LL x; cin>>x; return x;}
mt19937_64 rng(chrono::system_clock::now().time_since_epoch().count());
#define __JYC_CP
namespace avx256_bitset{
using vec=__m256i;
using u32=unsigned int;
using u64=unsigned long long;
using cup=const u32*; //const-u32-pointer
const u32 o=1,inf=0xffffffff;
#define v32(x) _mm256_##x
#define B Bitset<N>
#define vec_zero (v32(set1_epi32)(0))
#define vec_one (v32(set1_epi32)(inf))
#define load(x) (v32(load_si256)((vec*)(x)))
#define loadu(x) (v32(loadu_si256)((vec*)(x)))
template<const int N>
class Bitset{
private:
static_assert(N%256==0);
static const int T=(N-1)/32+1,T2=((N-1)/256+1)<<3,back_siz=N-32*(T-1);
static_assert(back_siz>=0 && back_siz<=32);
static const u32 set_last=(o<<back_siz-1)-1+(o<<back_siz-1);
alignas(32) u32 a[T2]={};
inline void revise(){
a[T-1]&=set_last;
for(int i=T;i<T2;++i) a[i]=0;
}
public:
Bitset(){}
inline void reset(){
static const vec all_zero=vec_zero;
cup p=a,pend=a+T2;
for(;p!=pend;p+=8){
v32(store_si256)((vec*)p,all_zero);
}
}
inline void set(){
static const vec all_one=vec_one;
cup p=a,pend=a+T2;
for(;p!=pend;p+=8){
v32(store_si256)((vec*)p,all_one);
}
revise();
}
inline void reset(const int& x){
a[x>>5]&=~(o<<x&31);
if((x>>5)==T-1) a[T-1]&=set_last;
}
inline void set(const int& x){
a[x>>5]|=o<<(x&31);
}
inline void flip(const int& x){
a[x>>5]^=o<<(x&31);
}
inline int test(int x){
return (a[x>>5]>>(x&31))&1;
}
inline bool operator [](int x) const{
return (a[x>>5]>>(x&31))&1;
}
inline bool none() const{
cup p=a,pend=a+T2;
for(;p!=pend;p+=8){
const vec& v=load(p);
if(!v32(testz_si256)(v,v)) return 0;
}
return 1;
}
inline bool any() const{
return !none();
}
inline int _Find_first() const{
for(int i=0;i<T2;i+=8){
const vec& v=load(&a[i]);
if(v32(testz_si256)(v,v)) continue;
for(int j=i;j<i+8;j++) if(a[j]){
return j*32+__builtin_ctz(a[j]);
}
}
return N;
}
inline int _Find_next(const int& pos) const{
if((pos&31)!=31){
int i=pos/32,j=(pos+1)&31,nw=pos+1;
for(;j<32;++j,++nw){
if(a[i]>>j&1) return nw;
}
}
int i=pos/32+1;
for(;i%8;++i) if(a[i]){
return i*32+__builtin_ctz(a[i]);
}
for(;i<T2;i+=8){
const vec& v=load(&a[i]);
if(v32(testz_si256)(v,v)) continue;
for(int j=i;j<i+8;j++) if(a[j]){
return j*32+__builtin_ctz(a[j]);
}
}
return N;
}
inline int count() const{
int ans=0;
for(int i=0;i<T2/2;++i){
ans+=__builtin_popcountll( ((u64*)(a)) [i]);
}
return ans;
//needs to support avx512
//flags: AVX512VPOPCNTDQ + AVX512VL
/*
int ans[8]={};
vec res=vec_zero,tmp;
cup p=a,pend=a+T2;
for(;p!=pend;p+=8){
tmp=v32(popcnt_epi32)(load(p));
res=v32(add_epi32)(res,tmp);
}
v32(store_si256)((vec*)ans,res);
return accumulate(ans,ans+8,0);
*/
}
inline B& operator =(const B& b){
u32* p=a;
cup pend=a+T2,p2=b.a;
for(;p!=pend;p+=8,p2+=8){
v32(store_si256)((vec*)p,load(p2));
}
return *this;
}
inline B operator ~() const{
static const __m256i all_one=vec_one;
B ans; cup p=a,pend=a+T2;
u32* pans=ans.a;
for(;p!=pend;p+=8,pans+=8){
auto v=v32(load_si256)((vec*)p);
v=v32(andnot_si256)(v,all_one);
v32(store_si256)((vec*)pans,v);
}
revise();
return ans;
}
#define OP(b,op) \
B ans; cup p=a,pend=a+T2,p2=b.a;\
u32* pans=ans.a;\
for(;p!=pend;p+=8,p2+=8,pans+=8){\
const vec& tmp=v32(op##_si256)(load(p),load(p2));\
v32(store_si256)((vec*)pans,tmp);\
}return ans
inline B operator &(const B& b) const{
OP(b,and);
}
inline B operator |(const B& b) const{
OP(b,or);
}
inline B operator ^(const B& b) const{
OP(b,xor);
}
#undef OP
#define OP(b,op) \
cup p=a,pend=a+T2,p2=b.a;\
for(;p!=pend;p+=8,p2+=8){\
const vec& tmp=v32(op##_si256)(load(p),load(p2));\
v32(store_si256)((vec*)p,tmp);\
}return *this
inline B& operator &=(const B& b){
OP(b,and);
}
inline B& operator |=(const B& b){
OP(b,or);
}
inline B& operator ^=(const B& b){
OP(b,xor);
}
#undef OP
inline B operator <<(const int& t) const{
B ans;
if(t>=N) return ans;
const int div=t/32,rem=t%32;
cup p=a+T2-8;
u32* pans=ans.a+T2-8;
if(rem){
for(;p-div-1>=a;p-=8,pans-=8){
vec v1=v32(slli_epi32)(loadu(p-div),rem);
vec v2=v32(srli_epi32)(loadu(p-div-1),32-rem);
v32(store_si256)((vec*)pans,v32(or_si256)(v1,v2));
}
}
else{
for(;p-div-1>=a;p-=8,pans-=8){
v32(store_si256)((vec*)pans,loadu(p-div));
}
}
p+=7,pans+=7;
for(;p-div-1>=a;--p,--pans){
(*pans)=(*(p-div))<<rem;
if(rem) (*pans)|=(*(p-div-1))>>(32-rem);
}
(*pans)=a[0]<<rem;
ans.revise();
return ans;
}
inline B& operator <<=(const int& t){
return (*this)=(*this)<<t;
}
inline B operator >>(const int& t) const{
B ans;
if(t>=N) return ans;
const int div=t/32,rem=t%32;
cup p=a;
u32* pans=ans.a;
if(rem){
for(;p+div+1<=a+T2-8;p+=8,pans+=8){
vec v1=v32(srli_epi32)(loadu(p+div),rem);
vec v2=v32(slli_epi32)(loadu(p+div+1),32-rem);
v32(store_si256)((vec*)pans,v32(or_si256)(v1,v2));
}
}
else{
for(;p+div+1<=a+T2-8;p+=8,pans+=8){
v32(store_si256)((vec*)pans,loadu(p+div));
}
}
p-=7,pans-=7;
for(;p+div+1<=a+T2-8;++p,++pans){
(*pans)=(*(p+div))>>rem;
if(rem) (*pans)|=(*(p+div+1))<<(32-rem);
}
(*pans)=a[T2-1]>>rem;
ans.revise();
return ans;
}
inline B& operator >>=(const int& t){
return (*this)=(*this)>>t;
}
//the following parts are for competitive programming
#ifdef __JYC_CP
#define OP(x,y,z,op) \
z.reset(); cup p=x.a,pend=x.a+T2,p2=y.a;\
u32* pans=z.a;\
for(;p!=pend;p+=8,p2+=8,pans+=8){\
const vec& tmp=v32(op##_si256)(load(p),load(p2));\
v32(store_si256)((vec*)pans,tmp);}
inline static void And(const B& x,const B& y,B& z){
OP(x,y,z,and);
}
inline static void Or(const B& x,const B& y,B& z){
OP(x,y,z,or);
}
inline static void Xor(const B& x,const B& y,B& z){
OP(x,y,z,xor);
}
#undef OP
inline static void Lsh(const B& x,const int& t,B& ans){
ans.reset();
if(t>=N) return;
const int div=t/32,rem=t%32;
cup p=x.a+T2-8;
u32* pans=ans.a+T2-8;
if(rem){
for(;p-div-1>=x.a;p-=8,pans-=8){
vec v1=v32(slli_epi32)(loadu(p-div),rem);
vec v2=v32(srli_epi32)(loadu(p-div-1),32-rem);
v32(store_si256)((vec*)pans,v32(or_si256)(v1,v2));
}
}
else{
for(;p-div-1>=x.a;p-=8,pans-=8){
v32(store_si256)((vec*)pans,loadu(p-div));
}
}
p+=7,pans+=7;
for(;p-div-1>=x.a;--p,--pans){
(*pans)=(*(p-div))<<rem;
if(rem) (*pans)|=(*(p-div-1))>>(32-rem);
}
(*pans)=x.a[0]<<rem;
ans.revise();
}
inline static void Rsh(const B& x,const int& t,B& ans){
ans.reset();
if(t>=N) return;
const int div=t/32,rem=t%32;
cup p=x.a;
u32* pans=ans.a;
if(rem){
for(;p+div+1<=x.a+T2-8;p+=8,pans+=8){
vec v1=v32(srli_epi32)(loadu(p+div),rem);
vec v2=v32(slli_epi32)(loadu(p+div+1),32-rem);
v32(store_si256)((vec*)pans,v32(or_si256)(v1,v2));
}
}
else{
for(;p+div+1<=x.a+T2-8;p+=8,pans+=8){
v32(store_si256)((vec*)pans,loadu(p+div));
}
}
p-=7,pans-=7;
for(;p+div+1<=x.a+T2-8;++p,++pans){
(*pans)=(*(p+div))>>rem;
if(rem) (*pans)|=(*(p+div+1))<<(32-rem);
}
(*pans)=x.a[T2-1]>>rem;
ans.revise();
}
inline int count(int n) const{
int ans=0;
for(int i=0;i<n+63;i+=64){
ans+=__builtin_popcountll(*( ((u64*)(a)) + i/32 ));
}
return ans;
}
#endif
string str() const{
string s="";
for(int i=N-1;i>=0;--i){
s.push_back('0'+(int)operator[](i));
}
return s;
}
void output() const{
debug("%s\n",this->str().data());
}
};
}
#undef B
//#define N 16
#define N (196*256)
using B=avx256_bitset::Bitset<N>;
int pos[N];
B f[N],suf[N],tmp,res;
int main(){
FAST_IO;
int n=read(),m=read();
for(int i=0;i<n;i++){
pos[read()-1]=i;
}
tmp.reset();
for(int x=n-1;x>=0;x--){
B::Rsh(tmp,pos[x],f[pos[x]]);
tmp.set(pos[x]);
// f[pos[x]]=tmp>>pos[x];
// tmp[pos[x]]=1;
}
LL ans=0;
for(int i=0,last=-1;i<n-m+1;i++){
if(i>last){
last+=m;
suf[0]=f[last];
for(int j=1;j<m;j++){
B::And(suf[j-1],f[last-j],suf[j]);
// suf[j]=suf[j-1]&f[last-j];
}
tmp.set();
}
B::And(suf[last-i],tmp,res);
ans+=(suf[last-i]&tmp).count(n+63);
// ans+=(suf[last-i]&tmp).count();
tmp&=f[i+m];
}
cout<<ans;
return 0;
}
/* stuff you should look for
* int overflow, array bounds
* special cases (n=1?)
* do smth instead of nothing and stay organized
* WRITE STUFF DOWN
* DON'T GET STUCK ON ONE APPROACH
*/
詳細信息
Test #1:
score: 100
Accepted
time: 36ms
memory: 618292kb
input:
5 3 5 2 1 3 4
output:
0
result:
ok answer is '0'
Test #2:
score: 0
Accepted
time: 27ms
memory: 618144kb
input:
5 2 3 1 4 2 5
output:
2
result:
ok answer is '2'
Test #3:
score: 0
Accepted
time: 35ms
memory: 618292kb
input:
4 2 1 2 3 4
output:
3
result:
ok answer is '3'
Test #4:
score: 0
Accepted
time: 23ms
memory: 618200kb
input:
4 2 4 3 2 1
output:
0
result:
ok answer is '0'
Test #5:
score: 0
Accepted
time: 48ms
memory: 618368kb
input:
1 1 1
output:
0
result:
ok answer is '0'
Test #6:
score: -100
Runtime Error
input:
50000 2 44045 29783 5389 7756 44022 45140 21967 5478 10868 49226 21775 31669 49836 13511 46116 14229 27206 31168 37389 3158 10658 41154 14635 18526 40540 6451 23197 46719 30593 13517 8604 46666 39189 43746 12778 3684 3194 36979 43020 14652 19549 31178 17144 27177 44336 2849 40220 11751 41993 32209 4...