求助编译期展开性能损耗

CPP

// Read input in bulk with fread instead of the default character input.
#define INPUT_OPTIMIZE
// Buffer output and emit it with fwrite.
#define OUTPUT_OPTIMIZE
// Force inlining of the I/O routines (IOINLINE becomes always_inline).
#define INLINE_OPTIMIZE
// Read input through mmap; mmap_init() must be called manually after freopen.
#define USING_MMAP
// Promise that numbers are separated only by blanks, newlines, EOF and similar
// characters; speeds up reading when the input format is well-behaved.
#define ONLY_BLANK
#define ONLY_TWO_BLANK // promise: at most two separator characters (space, \r, \n) between numbers
// #define ONLY_ONE_BLANK // likewise, but promises at most one separator character
// Enable the table-lookup optimization.
#ifdef USING_MMAP
	#ifdef __unix__
		#define UINT16_OPTIMIZE // the lookup is currently only done when mmap reads the whole input at once, because the fread buffer could be overrun slightly; to be fixed later
	#endif
#endif

namespace IO {

	// Size in bytes of the input buffer `buf` that fread fills (1 << 21 = 2 MiB).
	constexpr int Read_Bufsize = 1 << 21;
	// Size in bytes of the output staging buffer `outbuf` (1 << 21 = 2 MiB).
	constexpr int Write_Bufsize = 1 << 21;

// IOINLINE is the specifier put on every I/O routine: forced inlining when
// INLINE_OPTIMIZE is defined, plain `inline` otherwise.
#ifdef INLINE_OPTIMIZE
	#define IOINLINE [[gnu::always_inline]] inline
#else
	#define IOINLINE inline
#endif

// Fallback so that a call to mmap_init() still compiles when the mmap input
// path is not built: it expands to 0. The macro is #undef'd and replaced by a
// real function when the mmap path is compiled in.
#define mmap_init() 0

#ifdef INPUT_OPTIMIZE
	#ifdef __unix__
		#ifdef USING_MMAP
			#include <fcntl.h>
			#include <sys/mman.h>
			#include <sys/stat.h>
			#include <unistd.h>
			
			#undef mmap_init
			
			// File descriptor that gets mapped: 0 is standard input.
			constexpr int fd = 0;
			
			// Read cursor into the mapped input; getchar() dereferences and advances it.
			inline static const unsigned char *p1;
			// Result of fstat() on fd; its st_size is used as the mapping length.
			struct stat stat_;
			/**
			 * Maps the whole of standard input read-only and points p1 at its first byte.
			 *
			 * Must be called once before the first read, after any freopen of stdin.
			 * If fstat() or mmap() fails (for example when stdin is a pipe or an empty
			 * file), an error is reported via perror() and the program exits, instead
			 * of leaving p1 at MAP_FAILED and crashing on the first read.
			 */
			IOINLINE void mmap_init() {
				if (fstat(fd, &stat_) != 0) {
					perror("mmap_init: fstat");
					exit(EXIT_FAILURE);
				}
				void *mapped = mmap(nullptr, stat_.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
				if (mapped == MAP_FAILED) {
					perror("mmap_init: mmap");
					exit(EXIT_FAILURE);
				}
				p1 = static_cast<const unsigned char *>(mapped);
			}
			
			// mmap variant: yield the byte under the cursor and advance it.
			// There is no end-of-input check here, so this form never produces EOF.
			#define getchar() (*p1++)
		#else
			// Buffered variant for unix: p1 is the read cursor, p2 the end of valid data in buf.
			static unsigned char buf[Read_Bufsize], *p1 = buf, *p2 = buf;
			// When the buffer is exhausted, refill it with fread_unlocked; if nothing
			// could be read, yield EOF, otherwise yield the next byte and advance.
			#define getchar() (p1 == p2 && (p2 = (p1 = buf) + fread_unlocked(buf, 1, Read_Bufsize, stdin), p1 == p2) ? EOF : *p1++)
		#endif
	#else
		// Buffered variant for non-unix targets: same scheme, using plain fread.
		static unsigned char buf[Read_Bufsize], *p1 = buf, *p2 = buf;
		// Refill on exhaustion; yield EOF when fread returns no data, else the next byte.
		#define getchar() (p1 == p2 && (p2 = (p1 = buf) + fread(buf, 1, Read_Bufsize, stdin), p1 == p2) ? EOF : *p1++)
	#endif
#endif

#ifdef UINT16_OPTIMIZE
	/**
	 * Compile-time lookup table indexed by a 16-bit value built from two bytes.
	 *
	 * An entry holds the two-digit number formed by the bytes when both are
	 * ASCII digits, with the low byte as the tens digit and the high byte as the
	 * ones digit; every other entry is -1.
	 */
	struct uint16_table {
		int table[65536];
		constexpr uint16_table() : table{} {
			// Mark every byte pair as "not two digits" first.
			for (int &slot : table) slot = -1;
			// Then fill in the 100 pairs made of two ASCII digits.
			for (int high = '0'; high <= '9'; ++high) {
				for (int low = '0'; low <= '9'; ++low) {
					const int tens = low - '0';
					const int ones = high - '0';
					table[(high << 8) | low] = tens * 10 + ones;
				}
			}
		}
	};
	// The single table instance, built at compile time.
	constexpr uint16_table Table;
	// Shorthand for the underlying array.
	#define TABLE Table.table
	
	/**
	 * Counts the decimal digits of num.
	 *
	 * Any value below 10 (including 0 and negative values) counts as one digit.
	 * Usable in constant expressions.
	 */
	template<typename T>
	constexpr T digit_count_unsigned(T num) {
		T digits = 1;
		// Strip one decimal digit per iteration until a single digit is left.
		while (!(num < 10)) {
			num = num / 10;
			++digits;
		}
		return digits;
	}
#endif

	/**
	 * Reads one whitespace-delimited token into t as a NUL-terminated byte string.
	 *
	 * Leading whitespace is skipped, then bytes are copied until the next
	 * whitespace character or end of input.
	 *
	 * @param t  Destination buffer; the caller must provide room for the token
	 *           plus the terminating NUL byte.
	 * @return   true when end of input (EOF) was reached while reading.
	 *
	 * NOTE(review): when getchar() is the mmap variant it never yields EOF, so
	 * end of input is not detected in that configuration.
	 */
	IOINLINE bool read(unsigned char *t) {
		// Keep the character in an int so that EOF stays distinguishable from
		// byte 255 and the loops below terminate at end of input.
		int c = getchar();
		while (c != EOF && isspace(c)) c = getchar();
		unsigned char *pd = t;
		while (c != EOF && !isspace(c)) {
			*pd++ = static_cast<unsigned char>(c);
			c = getchar();
		}
		// Terminate explicitly: the buffer size is unknown here, so it cannot be
		// cleared up front (sizeof on the pointer is not the buffer size).
		*pd = 0;
		return c == EOF;
	}

	// Reads one decimal integer, with an optional '-' sign (ASCII 45), into t.
	// Returns true when the last character examined equals EOF.
	// Which parsing strategy is compiled depends on UINT16_OPTIMIZE, ONLY_BLANK,
	// ONLY_TWO_BLANK and ONLY_ONE_BLANK.
	template <typename T> 
	IOINLINE bool read(T &t) {
		t=0;T sgn=1;
		int c = getchar();
		#ifdef UINT16_OPTIMIZE
			// Table-lookup path: digits are consumed two at a time straight from p1.
			#ifdef ONLY_BLANK
				#ifdef ONLY_TWO_BLANK
					// At most two separators are promised, so skip at most one more.
					if(c<=32) c = getchar();
				#elif defined(ONLY_ONE_BLANK)
					// Nothing is skipped in this configuration.
				#else
					// Skip every character with code <= 32.
					while(c<=32) c = getchar();
				#endif
				if(c==45) sgn = -1, c = getchar();
			#else
				// Skip everything that is not a digit; each '-' seen flips the sign.
				while(!isdigit(c)) {if(c == 45) sgn *= -1; c = getchar();}
			#endif
		
			// One unrolled step: look up the two bytes starting at p1 - 1 (the
			// character fetched last and the one after it). TABLE yields -1 unless
			// both are digits, and ~(-1) == 0, so the step fails on a non-digit pair.
			// On success t gains two decimal digits and p1 moves forward by two.
			// io_next_likely additionally hints that the lookup usually succeeds.
			// NOTE(review): the uint16_t load goes through a cast of a byte pointer at
			// an arbitrary offset; alignment/aliasing assumptions should be confirmed.
			#define io_next_likely __builtin_expect(!!~TABLE[*(uint16_t*)(p1 - 1)],1) && ((t = t * 100 + TABLE[*(uint16_t*)(p1 - 1)], p1+=2, 1))
			#define io_next (~TABLE[*(uint16_t*)(p1 - 1)]) && ((t = t * 100 + TABLE[*(uint16_t*)(p1 - 1)], p1+=2, 1))
			
			// max_io_count: number of two-digit steps, i.e. half the digit count of
			// T's maximum value. max_likely_count: a quarter of that; steps with an
			// index up to it use the "likely" variant.
			constexpr int max_io_count = digit_count_unsigned( numeric_limits<T>::max() ) >> 1, max_likely_count = max_io_count / 4;
			
			// Expand max_io_count steps at compile time. The fold over && stops at
			// the first step whose lookup fails.
			[&]<size_t... I>(std::index_sequence<I...>) {
				(
					(I <= max_likely_count ? io_next_likely : io_next) && ...
				);
			}(std::make_index_sequence<max_io_count>{});
			
			// Inspect the byte at p1 - 1 after the pair loop: if it still belongs to
			// the number (a single remaining digit), append it and advance.
			c = *(p1 - 1);
			#ifdef ONLY_BLANK
				// Under ONLY_BLANK every character above 32 is treated as a digit.
				if(c>32) t = t * 10 + (c & 15), ++p1;
			#else
				if(isdigit(c)) t = t * 10 + (c & 15), ++p1;
			#endif
		#else
			// Plain path: consume the number one character at a time via getchar().
			#ifdef ONLY_BLANK
				while(c<=32) c = getchar();
				if(c==45) sgn = -1, c=getchar();
				// Every character above 32 is treated as a digit.
				while(c>32) t = t * 10 + (c & 15), c = getchar();
			#else
				// Skip non-digits (each '-' flips the sign), then accumulate digits.
				while(!isdigit(c)) {if(c == 45)sgn *= -1;c = getchar();}
				while(isdigit(c)) t = t * 10 + (c & 15), c = getchar();
			#endif
		#endif
		t*=sgn;
		// NOTE(review): on the table-lookup path c was loaded from the input bytes
		// at p1 - 1, so presumably it can never equal EOF there - confirm.
		return c == EOF;
	}

	// Reads several values in order, left to right.
	// Stops at the first read that reports true and returns true in that case;
	// otherwise returns what the read of the remaining arguments reports.
	template <typename T, typename... Args> 
	IOINLINE bool read(T &t, Args&... args) {
		if (read(t)) {
			return true;
		}
		return read(args...);
	}

#ifdef OUTPUT_OPTIMIZE
	// Output staging buffer; `out` points one past the last byte queued so far.
	static unsigned char outbuf[Write_Bufsize], *out = outbuf;
	/**
	 * Writes all queued bytes to stdout and empties the staging buffer.
	 *
	 * Resetting `out` afterwards means the same bytes are never written twice,
	 * e.g. by a manual flush() followed by the flush registered with atexit.
	 *
	 * @return the number of bytes fwrite reported as written.
	 */
	IOINLINE size_t flush_outbuf() {
		const size_t pending = static_cast<size_t>(out - outbuf);
	#ifdef __unix__
		const size_t written = fwrite_unlocked(outbuf, 1, pending, stdout);
	#else
		const size_t written = fwrite(outbuf, 1, pending, stdout);
	#endif
		out = outbuf;
		return written;
	}
	#define flush() flush_outbuf()
	// Queues one byte. When the staging buffer is full it is flushed first, so
	// output larger than Write_Bufsize no longer runs past the end of outbuf.
	#define putchar(x) ((out == outbuf + Write_Bufsize ? (void)flush() : (void)0), *out++ = (x))
	// Registers a final flush at program exit when the object is constructed.
	struct auto_flush { auto_flush() { atexit([]() {flush();}); } } initializer;
#else 
	// Without output buffering there is nothing to flush.
	#define flush() 0
#endif

	/**
	 * Writes the decimal representation of the integer x through putchar.
	 *
	 * Zero is written as "0"; negative values are written with a leading '-'.
	 *
	 * @tparam T  an integer type.
	 * @param  x  the value to write.
	 */
	template <typename T> 
	IOINLINE void write(T x) {
		if (!x) return putchar(48), void();
		// An integer of sizeof(T) bytes has fewer than 3 * sizeof(T) + 1 decimal
		// digits, so this scratch buffer can never overflow.
		unsigned char t[sizeof(T) * 3 + 1];
		int p = 0;
		const bool negative = x < 0;
		// Collect digits least-significant first. Taking the absolute value of
		// each remainder (rather than of x) also handles the minimum value.
		while (x) {
			const int digit = static_cast<int>(x % 10);
			t[p++] = static_cast<unsigned char>(48 + (digit < 0 ? -digit : digit));
			x /= 10;
		}
		if (negative) putchar(45);
		// Emit the digits most-significant first, as ASCII characters.
		while (p) putchar(t[--p]);
	}
} using namespace IO;

如上代码所示，`read` 中使用了模板 lambda 和折叠表达式（配合 `std::index_sequence`），在编译期展开了查表读入这段 IO 优化的逻辑。

但是不知为何，在 GCC 15、`-O2` 的编译条件下测试发现，与手写展开（直接把每一步写死）的版本相比，存在约 5%~8% 的稳定性能损耗。这是为什么呢？

求助编译期展开性能损耗

讨论操作

回复

相关推荐

求助编译期展开性能损耗