Glibc的memset源码分析 环境说明 CPU配置:
1 2 3 model name : Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single pti intel_ppin ssbd mba ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req pku ospke md_clear flush_l1d
可以发现其中支持avx的flag: avx2
,avx512f
… 以及ERMS(erms
)的支持.
使用的glibc
库是libc-2.27.so
.
代码分析 查看glibc-2.27
的版本代码,在sysdeps/x86_64/multiarch/
目录下, 在文件ifunc-memset.h
中定义了是否使用erms
和avx512
:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_ERMS)) return OPTIMIZE (erms); if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) { if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx512_no_vzeroupper); if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) return OPTIMIZE (avx512_unaligned_erms); return OPTIMIZE (avx512_unaligned); } if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)) { if (CPU_FEATURES_CPU_P (cpu_features, ERMS)) return OPTIMIZE (avx2_unaligned_erms); else return OPTIMIZE (avx2_unaligned); }
无论avx2
还是avx512
都共享使用了sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
中的代码,其中大致逻辑是如果支持erms就使用erms相关代码, 并针对对齐和非对齐的情况进行处理, 只有在erms
支持的条件下才会执行到L(stosb)
处的代码. 而非erms
支持的则执行L(more_2x_vec):
进入循环执行mov操作, VMOVU(A)
在avx2
和avx512
下分别是vmovdqu/vmovdqa
和vmovdqu64/vmovdqa64,
VEC_SIZE
在avx512
下是64,在avx2
下是32.
相关代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 /* memset is implemented as: 1. Use overlapping store to avoid branch. 2. If size is less than VEC, use integer register stores. 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with 4 VEC stores and store 4 * VEC at a time until done. */ // stosb的相关代码在这里: L(stosb): /* Issue vzeroupper before rep stosb. */ VZEROUPPER movq %rdx, %rcx movzbl %sil, %eax movq %rdi, %rdx rep stosb movq %rdx, %rax ret // 触发进入stosb的条件是:rdx值大于REP_STOSB_THRESHOLD, 且满足大于2*VEC_SIZE ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) cmpq $VEC_SIZE, %rdx jb L(less_vec) cmpq $(VEC_SIZE * 2), %rdx ja L(stosb_more_2x_vec) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) VZEROUPPER ret L(stosb_more_2x_vec): cmpq $REP_STOSB_THRESHOLD, %rdx ja L(stosb) /* Threshold to use Enhanced REP STOSB. Since there is overhead to set up REP STOSB operation, REP STOSB isn't faster on short data. The memset micro benchmark in glibc shows that 2KB is the approximate value above which REP STOSB becomes faster on processors with Enhanced REP STOSB. Since the stored value is fixed, larger register size has minimal impact on threshold. 
*/ #ifndef REP_STOSB_THRESHOLD # define REP_STOSB_THRESHOLD 2048 #endif L(more_2x_vec): cmpq $(VEC_SIZE * 4), %rdx ja L(loop_start) VMOVU %VEC(0), (%rdi) VMOVU %VEC(0), VEC_SIZE(%rdi) VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): VZEROUPPER ret L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx VMOVU %VEC(0), (%rdi) andq $-(VEC_SIZE * 4), %rcx VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), VEC_SIZE(%rdi) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) addq %rdi, %rdx andq $-(VEC_SIZE * 4), %rdx cmpq %rdx, %rcx je L(return) L(loop): VMOVA %VEC(0), (%rcx) VMOVA %VEC(0), VEC_SIZE(%rcx) VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) addq $(VEC_SIZE * 4), %rcx cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN ret
glibc代码逻辑验证 测试验证以上逻辑:
当rdx>2048
时:
1 2 3 4 5 6 7 8 9 10 #include <string.h> int main () { char str[5000 ]; while (1 ){ memset (str, 'a' , 3000 ); } return 0 ; }
使用perf top
查看是L(stosb)
处的代码:
1 2 3 4 5 6 7 8 │ 000000000018ef20 <__nss_group_lookup@GLIBC_2.2.5+0x251e0>: 0.91 │ vzeroupper │ mov %rdx,%rcx │ movzbl %sil,%eax │ mov %rdi,%rdx 98.32 │ rep stos %al,%es:(%rdi) 0.76 │ mov %rdx,%rax │ ← retq
当rdx<=2048
时:
1 2 3 4 5 6 7 8 9 10 #include <string.h> int main () { char str[5000 ]; while (1 ){ memset (str, 'a' , 2048 ); } return 0 ; }
使用perf top
查看是L(loop)
处的代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 __memset_avx2_unaligned_erms /lib/x86_64-linux-gnu/libc-2.27.so Percent│ vmovdqu %ymm0,(%rdi) │ and $0xffffffffffffff80,%rcx 6.41 │ vmovdqu %ymm0,-0x20(%rdi,%rdx,1) │ vmovdqu %ymm0,0x20(%rdi) │ vmovdqu %ymm0,-0x40(%rdi,%rdx,1) │ vmovdqu %ymm0,0x40(%rdi) 3.71 │ vmovdqu %ymm0,-0x60(%rdi,%rdx,1) │ vmovdqu %ymm0,0x60(%rdi) │ vmovdqu %ymm0,-0x80(%rdi,%rdx,1) │ add %rdi,%rdx 3.61 │ and $0xffffffffffffff80,%rdx │ cmp %rdx,%rcx │ ↑ je 51 14.78 │ 97: vmovdqa %ymm0,(%rcx) 37.20 │ vmovdqa %ymm0,0x20(%rcx) 13.57 │ vmovdqa %ymm0,0x40(%rcx) 13.06 │ vmovdqa %ymm0,0x60(%rcx) 3.45 │ add $0x80,%rcx │ cmp %rcx,%rdx │ ↑ jne 97 0.94 │ vzeroupper │ ← retq │ ba: cmp $0x10,%dl │ ↓ jae db │ vmovq %xmm0,%rcx │ cmp $0x8,%dl │ ↓ jae e9
当64<rdx<=128
时:
1 2 3 4 5 6 7 8 9 10 #include <string.h> int main () { char str[5000 ]; while (1 ){ memset (str, 'a' , 100 ); } return 0 ; }
使用perf top
查看是L(more_2x_vec)
处的代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 __memset_avx2_unaligned_erms /lib/x86_64-linux-gnu/libc-2.27.so Percent│ ↓ jb ba │ cmp $0x40,%rdx │ ↓ ja 2a │ vmovdqu %ymm0,-0x20(%rdi,%rdx,1) │ vmovdqu %ymm0,(%rdi) │ vzeroupper │ ← retq │ 2a: cmp $0x800,%rdx │ → ja 18ef20 <__nss_group_lookup@GLIBC_2.2.5+0x251e0> │ cmp $0x80,%rdx │ ↓ ja 55 24.76 │ vmovdqu %ymm0,(%rdi) 0.01 │ vmovdqu %ymm0,0x20(%rdi) │ vmovdqu %ymm0,-0x20(%rdi,%rdx,1) 61.51 │ vmovdqu %ymm0,-0x40(%rdi,%rdx,1) │ 51: vzeroupper │ ← retq
由于cpu
均支持erms
, 所以进入的逻辑是erms
相关代码.
avx指令可能导致cpu降频的问题 根据https://bugs.launchpad.net/linux/+bug/1727136 和 https://bugs.launchpad.net/linux/+bug/1727136 中的描述, glibc
不使用avx
相关代码实现memcpy/memset
是为了避免可能导致的cpu降频, 以及由此引起的性能下降.
可以通过以下perf命令检测是否发生了降频:
1 sudo perf stat -e cpu/event=0x28,umask=0x18,name=core_power_lvl1_turbo_license/,cpu/event=0x28,umask=0x20,name=core_power_lvl2_turbo_license/,cpu/event=0x28,umask=0x40,name=core_power_throttle/,cycles -a -I 1000 sleep 10
性能验证测试 参考: https://stackoverflow.com/questions/33480999/how-can-the-rep-stosb-instruction-execute-faster-than-the-equivalent-loop 和 https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy 中给出的测试, REP STOSB
对于大块的数据是最优的选择.
在支持ERMS
的CPU
上,会对REP STOSB
指令进行优化,使得其执行效率优于循环执行mov
的逻辑.而在不支持ERMS
的CPU
上则只能选择循环mov
的操作了. 这里如果CPU
支持AVX
, 则mov
会变为vmovdqa/vmovdqa64
等AVX
的mov
指令.
以下进一步在当前CPU上进行代码对比测试:
测试代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 #include <string.h> int main () { char str[5000 ]; int i =0 ; int max = 1 <<30 ; while (i<max ){ memset (str, 'a' , 3000 ); i++; } return 0 ; }
这里使用一个第三方的优化过的libc
库, 其中使用了avx512
实现了memset
下载地址: https://www.agner.org/optimize/asmlib.zip
参考链接: https://www.agner.org/optimize/#asmlib
分别进行编译后运行perf
测试性能:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 new_acu:pilot@quanta2:~/wuding$ gcc 1.c -o asmlib ./libaelf64o.a new_acu:pilot@quanta2:~/wuding$ gcc 1.c -o origin new_acu:pilot@quanta2:~/wuding$ perf stat ./asmlib Performance counter stats for './asmlib': 33809.757694 task-clock (msec) # 1.000 CPUs utilized 11 context-switches # 0.000 K/sec 0 cpu-migrations # 0.000 K/sec 50 page-faults # 0.001 K/sec 108,191,627,848 cycles # 3.200 GHz 181,495,657,395 instructions # 1.68 insn per cycle 56,914,104,466 branches # 1683.363 M/sec 153,916 branch-misses # 0.00% of all branches 33.810266264 seconds time elapsed 33.809945000 seconds user 0.000000000 seconds sys new_acu:pilot@quanta2:~/wuding$ perf stat ./origin Performance counter stats for './origin': 34796.854964 task-clock (msec) # 1.000 CPUs utilized 15 context-switches # 0.000 K/sec 1 cpu-migrations # 0.000 K/sec 50 page-faults # 0.001 K/sec 111,350,368,185 cycles # 3.200 GHz 29,025,326,422 instructions # 0.26 insn per cycle 7,522,153,422 branches # 216.173 M/sec 168,441 branch-misses # 0.00% of all branches 34.805927226 seconds time elapsed 34.797053000 seconds user 0.000000000 seconds sys
对比可以发现运行性能仅提升约3%(耗时33.81秒对比34.81秒).
查看使用asmlib
运行时的perf top
可见热点代码, 可验证的确使用的是avx512
相关指令: vmovdqu64
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 │ │ 00000000000008ee <L050>: │ L050(): 95.91 │ vmovdqu64 %zmm16,(%rdi) 0.01 │ add $0x40,%rdi │ and $0xffffffffffffffc0,%rdi │ lea (%rcx,%rdx,1),%rax 4.08 │ and $0xffffffffffffffc0,%rax │ sub %rax,%rdi │ cmp MemsetCacheLimit,%rdx │ → ja L200 │ Disassembly of section .text: │ │ 0000000000000910 <L100>: │ L100(): 55.17 │0: vmovdqa64 %zmm16,(%rax,%rdi,1) │ add $0x40,%rdi 44.83 │ ↑ jne 0
结论 综合分析来看, glibc
的memset
已经进行了优化. rep stos
指令的性能并不差.