社区讨论
求助编译期展开性能损耗
学术版 · 参与者 3 · 已保存回复 5
讨论操作
快速查看讨论及其快照的属性,并进行相关操作。
- 当前回复
- 5 条
- 当前快照
- 1 份
- 快照标识符
- @mhpi1ckj
- 此快照首次捕获于
- 2025/11/08 07:41 3 个月前
- 此快照最后确认于
- 2025/11/08 07:41 3 个月前
CPP
// Build-time switches for the IO helpers below. Comment a #define out to disable it.
// Read input through an fread-filled buffer.
#define INPUT_OPTIMIZE
// Write output through an fwrite-flushed buffer.
#define OUTPUT_OPTIMIZE
// Force inlining of the IO helpers (see IOINLINE).
#define INLINE_OPTIMIZE
// Use mmap for input; mmap_init() must be called manually after freopen.
#define USING_MMAP
// Promise that numbers are separated only by blanks, newlines, EOF and similar characters;
// speeds up parsing when the input format is well-behaved.
#define ONLY_BLANK
#define ONLY_TWO_BLANK // Promise at most two separator characters (space, CR, LF) between any two numbers.
// #define ONLY_ONE_BLANK // Likewise, but promises at most one separator.
// Table-lookup optimisation (two digits per lookup).
#ifdef USING_MMAP
#ifdef __unix__
#define UINT16_OPTIMIZE // Only implemented for the mmap path that maps the whole input at once: with fread the lookup could run slightly past the buffer end; to be fixed later.
#endif
#endif
// Buffered input/output helpers; the namespace is closed at the end of the snippet.
namespace IO {
// Capacity in bytes of the fread input buffer (1 << 21 = 2 MiB).
constexpr int Read_Bufsize = 1 << 21;
// Capacity in bytes of the output buffer (1 << 21 = 2 MiB).
constexpr int Write_Bufsize = 1 << 21;
// IOINLINE: `inline` plus the GNU always_inline attribute when INLINE_OPTIMIZE is defined,
// plain `inline` otherwise.
#ifdef INLINE_OPTIMIZE
#define IOINLINE [[gnu::always_inline]] inline
#else
#define IOINLINE inline
#endif
// Default no-op so that calls to mmap_init() still compile when the mmap backend is not
// selected; the mmap backend #undefs this macro and defines a real function instead.
#define mmap_init() 0
// Input backend selection: mmap (unix + USING_MMAP), fread_unlocked (unix) or fread (elsewhere).
#ifdef INPUT_OPTIMIZE
#ifdef __unix__
#ifdef USING_MMAP
// NOTE(review): these system headers are included while namespace IO is open. Unless they were
// already included earlier at global scope (include guards then make these lines no-ops),
// their declarations end up inside IO — confirm an earlier include exists.
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
// Drop the no-op placeholder macro; the real mmap_init() function follows.
#undef mmap_init
// Descriptor that gets mapped: 0 (presumably standard input, i.e. POSIX STDIN_FILENO).
constexpr int fd = 0;
// p1: read cursor into the mapped input. stat_: filled by fstat in mmap_init(), supplies the
// mapping length.
inline static const unsigned char *p1; struct stat stat_;
// Maps the whole file behind descriptor `fd` read-only and points the read cursor `p1` at its
// first byte. Must be called once before the first read when the mmap backend is active.
//
// Aborts with a diagnostic on stderr when the descriptor cannot be inspected or mapped
// (for example when input comes from a pipe or terminal, or the file is empty), instead of
// leaving `p1` equal to MAP_FAILED and crashing on the first read.
//
// NOTE(review): bytes between the end of the file and the end of its last page read as zero,
// but if the file size is an exact multiple of the page size there is no such slack — readers
// that look one byte past the last number can then fault. Confirm inputs end with a separator.
IOINLINE void mmap_init() {
    if (fstat(fd, &stat_) != 0) {
        perror("mmap_init: fstat");
        abort();
    }
    // st_size is the byte length of the file; mmap rejects a zero length with EINVAL.
    void *mapping = mmap(nullptr, static_cast<size_t>(stat_.st_size), PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapping == MAP_FAILED) {
        perror("mmap_init: mmap");
        abort();
    }
    p1 = static_cast<const unsigned char *>(mapping);
}
// mmap backend: getchar() is a bare load-and-advance on the mapped bytes.
// NOTE(review): this form performs no end-of-input check and never evaluates to EOF.
#define getchar() (*p1++)
#else
// Unix without mmap: p1 is the read cursor and p2 the end of valid data in buf. When the
// cursor reaches p2 the buffer is refilled with fread_unlocked; if the refill returns no
// bytes the macro evaluates to EOF, otherwise to the next byte.
static unsigned char buf[Read_Bufsize], *p1 = buf, *p2 = buf;
#define getchar() (p1 == p2 && (p2 = (p1 = buf) + fread_unlocked(buf, 1, Read_Bufsize, stdin), p1 == p2) ? EOF : *p1++)
#endif
#else
// Non-unix targets: same refill scheme, using plain fread.
static unsigned char buf[Read_Bufsize], *p1 = buf, *p2 = buf;
#define getchar() (p1 == p2 && (p2 = (p1 = buf) + fread(buf, 1, Read_Bufsize, stdin), p1 == p2) ? EOF : *p1++)
#endif
#endif
#ifdef UINT16_OPTIMIZE
// Lookup table indexed by a 16-bit key made of two bytes.
// When both bytes are ASCII digits the entry is the two-digit number they spell, with the
// LOW byte as the tens digit and the HIGH byte as the ones digit; every other entry is -1.
// (That byte order matches a little-endian 16-bit load of two consecutive characters.)
struct uint16_table {
    int table[65536];
    constexpr uint16_table() : table{} {
        // Start with every key marked as "not a digit pair".
        int key = 0;
        while (key < 65536) {
            table[key] = -1;
            ++key;
        }
        // Then fill in the 100 keys whose two bytes are both digits.
        for (int value = 0; value < 100; ++value) {
            const int tens_char = '0' + value / 10;
            const int ones_char = '0' + value % 10;
            table[(ones_char << 8) | tens_char] = value;
        }
    }
};
// The single instance of the digit-pair lookup table, built at compile time.
constexpr uint16_table Table;
// Shorthand for the raw array inside Table.
#define TABLE Table.table
// Returns how many decimal digits are needed to print `num`.
// Any value below 10 (including zero and negative values) counts as one digit.
template<typename T>
constexpr T digit_count_unsigned(T num) {
    T digits = 1;
    // Strip one decimal digit per round until a single digit is left.
    while (!(num < 10)) {
        num /= 10;
        ++digits;
    }
    return digits;
}
#endif
// Reads one whitespace-delimited token into `t` and NUL-terminates it.
//
// Leading whitespace is skipped; the token ends at the next whitespace character or at end
// of input. The caller must provide a buffer large enough for the token plus the terminator —
// the size is not known here, so nothing can be bounds-checked.
//
// Returns true when end of input was reached while reading, false otherwise.
//
// NOTE(review): with the mmap backend getchar() never evaluates to EOF, so end of input is
// not detected there; confirm the input ends with whitespace when that backend is used.
IOINLINE bool read(unsigned char *t) {
    unsigned char *cursor = t;
    // Keep the character in an int: stored in an unsigned char, EOF would turn into a byte
    // value that never compares equal to EOF.
    int c = getchar();
    while (c != EOF && isspace(c)) c = getchar();
    while (c != EOF && !isspace(c)) {
        *cursor++ = static_cast<unsigned char>(c);
        c = getchar();
    }
    // Terminate explicitly; `sizeof t` is only the size of the pointer, so clearing the
    // buffer up front cannot be relied on for termination.
    *cursor = 0;
    return c == EOF;
}
// Parses one decimal integer (optionally preceded by '-') from the input into `t`.
//
// t: receives the parsed value; it is reset to 0 first and multiplied by the sign at the end.
// Returns the result of comparing the last character examined with EOF.
//
// ASCII codes used below: 32 = space, 45 = '-'; `c & 15` is the numeric value of a digit
// character. Which parsing strategy is compiled depends on UINT16_OPTIMIZE, ONLY_BLANK,
// ONLY_TWO_BLANK and ONLY_ONE_BLANK.
template <typename T>
IOINLINE bool read(T &t) {
t=0;T sgn=1;
int c = getchar();
#ifdef UINT16_OPTIMIZE
// --- Table-driven path: consumes two digits per lookup, reading directly through p1. ---
#ifdef ONLY_BLANK
#ifdef ONLY_TWO_BLANK
// At most one more separator can precede the number, so a single test replaces the loop.
if(c<=32) c = getchar();
#elif defined(ONLY_ONE_BLANK)
// Nothing to skip: c is taken to be the first character of the number already.
#else
// Skip any run of characters with code <= 32.
while(c<=32) c = getchar();
#endif
if(c==45) sgn = -1, c = getchar();
#else
// General separators: skip everything that is not a digit, flipping the sign on each '-'.
while(!isdigit(c)) {if(c == 45) sgn *= -1; c = getchar();}
#endif
// Both macros look at the two bytes starting at the current character (p1 - 1, since
// getchar() already advanced p1). If TABLE has an entry other than -1 for them, the pair is
// appended to t as two decimal digits, p1 advances by two and the expression is true;
// otherwise it is false and nothing is consumed. io_next_likely adds a branch hint that the
// pair is expected to be present.
// NOTE(review): the 16-bit load is a type-punned access at an arbitrary byte offset and the
// cast drops const; this relies on the target tolerating unaligned loads.
// NOTE(review): these macros are defined inside the function body and never #undef'd here.
#define io_next_likely __builtin_expect(!!~TABLE[*(uint16_t*)(p1 - 1)],1) && ((t = t * 100 + TABLE[*(uint16_t*)(p1 - 1)], p1+=2, 1))
#define io_next (~TABLE[*(uint16_t*)(p1 - 1)]) && ((t = t * 100 + TABLE[*(uint16_t*)(p1 - 1)], p1+=2, 1))
// max_io_count: number of digit pairs to attempt = decimal digit count of T's maximum, halved.
// max_likely_count: pairs with index <= this value use the branch-hinted form.
constexpr int max_io_count = digit_count_unsigned( numeric_limits<T>::max() ) >> 1, max_likely_count = max_io_count / 4;
// Compile-time unrolling: the fold over && expands to max_io_count chained pair steps and
// short-circuits at the first step that does not find two digits.
// NOTE(review): the lambda's call operator is not marked with IOINLINE and reaches `t`
// through a by-reference capture; whether the optimizer inlines it and keeps `t` in a
// register is up to the compiler — a candidate explanation for a slowdown versus
// hand-unrolled code; verify in the generated assembly.
[&]<size_t... I>(std::index_sequence<I...>) {
(
(I <= max_likely_count ? io_next_likely : io_next) && ...
);
}(std::make_index_sequence<max_io_count>{});
// Re-read the current character: either one leftover digit (odd digit count) or the separator.
c = *(p1 - 1);
#ifdef ONLY_BLANK
if(c>32) t = t * 10 + (c & 15), ++p1;
#else
if(isdigit(c)) t = t * 10 + (c & 15), ++p1;
#endif
// NOTE(review): on this path c was loaded through p1 (a pointer to unsigned char), so it is a
// byte value and the final `c == EOF` comparison cannot be true, assuming EOF is negative.
#else
// --- Character-at-a-time path. ---
#ifdef ONLY_BLANK
// NOTE(review): `c<=32` is also true for a negative EOF, so this skip loop would not stop at
// end of input — confirm callers never read past the last number.
while(c<=32) c = getchar();
if(c==45) sgn = -1, c=getchar();
while(c>32) t = t * 10 + (c & 15), c = getchar();
#else
// NOTE(review): `!isdigit(c)` likewise stays true at EOF; same caveat as above.
while(!isdigit(c)) {if(c == 45)sgn *= -1;c = getchar();}
while(isdigit(c)) t = t * 10 + (c & 15), c = getchar();
#endif
#endif
t*=sgn;
return c == EOF;
}
// Reads several values in order: `t` first, then each of `args`.
// Reading stops as soon as one read reports true; that result is returned. Otherwise the
// result of the last read is returned.
template <typename T, typename... Args>
IOINLINE bool read(T &t, Args&... args) {
    if (read(t)) {
        return true;
    }
    return read(args...);
}
#ifdef OUTPUT_OPTIMIZE
// Output is collected in outbuf; `out` is the write cursor (next free byte).
static unsigned char outbuf[Write_Bufsize], *out = outbuf;
// Writes every pending byte to stdout and rewinds the cursor, so calling flush() more than
// once (for example manually and then again from the exit handler) never emits data twice.
// Returns the number of bytes fwrite reports as written.
IOINLINE size_t flush_pending() {
    const size_t pending = static_cast<size_t>(out - outbuf);
    out = outbuf;
#ifdef __unix__
    return fwrite_unlocked(outbuf, 1, pending, stdout);
#else
    return fwrite(outbuf, 1, pending, stdout);
#endif
}
#define flush() flush_pending()
// Appends one byte to the buffer. A full buffer is flushed first, so output larger than
// Write_Bufsize no longer writes past the end of outbuf. The argument is parenthesized so
// that any expression can be passed safely.
#define putchar(x) ((out == outbuf + Write_Bufsize ? (void)flush_pending() : (void)0), *out++ = (x))
// Registers a final flush at program exit while the global `initializer` is constructed.
struct auto_flush { auto_flush() { atexit([]() {flush();}); } } initializer;
#else
// Unbuffered build: flush() is a no-op expression and putchar stays the stdio function.
#define flush() 0
#endif
// Writes the integer `x` in decimal through putchar, with a leading '-' for negative values.
//
// x: value of an integer type T; zero is written as a single '0'.
template <typename T>
IOINLINE void write(T x) {
    if (!x) return putchar('0'), void();
    // digits10 is the number of decimal digits T can always represent; its largest values
    // need one more. This avoids depending on a helper that exists only in some builds.
    constexpr int max_digit_count = std::numeric_limits<T>::digits10 + 1;
    unsigned char digits[max_digit_count];
    int length = 0;
    const bool negative = x < 0;
    // Peel digits off least-significant first. The absolute value is taken per digit rather
    // than negating x, so the most negative value of T is handled without overflow.
    while (x) {
        const int digit = static_cast<int>(x % 10);
        digits[length++] = static_cast<unsigned char>('0' + (digit < 0 ? -digit : digit));
        x /= 10;
    }
    if (negative) putchar('-');
    // Emit most-significant digit first.
    while (length) putchar(digits[--length]);
}
// Close namespace IO, then make its names usable without qualification.
} using namespace IO;
以上是完整代码。注意到其中使用了 lambda 和折叠表达式,在编译期展开了这个 IO 优化的逻辑。
但是不知为何,在 GCC 15 的 -O2 编译条件下测试发现,与手动展开(直接写死)的版本相比,存在约 5%~8% 的稳定性能损耗。这是为什么呢?
回复
共 5 条回复,欢迎继续交流。
正在加载回复...