0%

Glibc的memset源码分析

Glibc的memset源码分析

环境说明

CPU配置:

1
2
3
model name      : Intel(R) Xeon(R) Gold 6134 CPU @ 3.20GHz

flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cdp_l3 invpcid_single pti intel_ppin ssbd mba ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm cqm mpx rdt_a avx512f avx512dq rdseed adx smap clflushopt clwb intel_pt avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local dtherm arat pln pts hwp hwp_act_window hwp_epp hwp_pkg_req pku ospke md_clear flush_l1d

可以发现其中支持avx的flag: avx2,avx512f… 以及ERMS(erms)的支持.

使用的glibc库是libc-2.27.so.

代码分析

查看glibc-2.27的版本代码, 在sysdeps/x86_64/multiarch/目录下, 文件ifunc-memset.h中定义了是否使用erms、avx512等实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
 if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_ERMS))
return OPTIMIZE (erms);

if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx512_no_vzeroupper);

if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
return OPTIMIZE (avx512_unaligned_erms);

return OPTIMIZE (avx512_unaligned);
}

if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
{
if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
return OPTIMIZE (avx2_unaligned_erms);
else
return OPTIMIZE (avx2_unaligned);
}

无论avx2还是avx512都共享使用了sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S中的代码, 其中大致逻辑是: 如果支持erms就使用erms相关代码, 并针对对齐和非对齐的情况进行处理, 只有在支持erms的条件下才会执行到L(stosb)处的代码; 而不支持erms的则执行L(more_2x_vec), 进入循环执行mov操作(支持erms但大小不超过REP_STOSB_THRESHOLD时, 同样会落入L(more_2x_vec)). VMOVU(A)在avx2和avx512下分别是vmovdqu(a)和vmovdqu(a)64.

VEC_SIZE在avx512下是64, 在avx2下是32.

相关代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/* memset is implemented as:
1. Use overlapping store to avoid branch.
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
4 VEC stores and store 4 * VEC at a time until done. */

// stosb的相关代码在这里:
L(stosb):
/* Issue vzeroupper before rep stosb. */
VZEROUPPER
movq %rdx, %rcx
movzbl %sil, %eax
movq %rdi, %rdx
rep stosb
movq %rdx, %rax
ret

// 触发进入stosb的条件是:rdx值大于REP_STOSB_THRESHOLD, 且满足大于2*VEC_SIZE
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
cmpq $(VEC_SIZE * 2), %rdx
ja L(stosb_more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), (%rdi)
VZEROUPPER
ret

L(stosb_more_2x_vec):
cmpq $REP_STOSB_THRESHOLD, %rdx
ja L(stosb)


/* Threshold to use Enhanced REP STOSB. Since there is overhead to set
up REP STOSB operation, REP STOSB isn't faster on short data. The
memset micro benchmark in glibc shows that 2KB is the approximate
value above which REP STOSB becomes faster on processors with
Enhanced REP STOSB. Since the stored value is fixed, larger register
size has minimal impact on threshold. */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD 2048
#endif


L(more_2x_vec):
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_start)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(0), VEC_SIZE(%rdi)
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
VZEROUPPER
ret

L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
VMOVU %VEC(0), (%rdi)
andq $-(VEC_SIZE * 4), %rcx
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(0), VEC_SIZE(%rdi)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
cmpq %rdx, %rcx
je L(return)
L(loop):
VMOVA %VEC(0), (%rcx)
VMOVA %VEC(0), VEC_SIZE(%rcx)
VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
VZEROUPPER_SHORT_RETURN
ret

glibc代码逻辑验证

测试验证以上逻辑:

rdx>2048时:

1
2
3
4
5
6
7
8
9
10
#include <string.h>

int main()
{
char str[5000];
while(1){
memset(str, 'a', 3000);
}
return 0;
}

使用perf top查看是L(stosb)处的代码:

1
2
3
4
5
6
7
8
    │    000000000018ef20 <__nss_group_lookup@GLIBC_2.2.5+0x251e0>:
0.91 │ vzeroupper
│ mov %rdx,%rcx
│ movzbl %sil,%eax
│ mov %rdi,%rdx
98.32 │ rep stos %al,%es:(%rdi)
0.76 │ mov %rdx,%rax
│ ← retq

128<rdx<=2048时(以avx2为例, 4*VEC_SIZE=128):

1
2
3
4
5
6
7
8
9
10
#include <string.h>

int main()
{
char str[5000];
while(1){
memset(str, 'a', 2048);
}
return 0;
}

使用perf top查看是L(loop)处的代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
__memset_avx2_unaligned_erms  /lib/x86_64-linux-gnu/libc-2.27.so
Percent│ vmovdqu %ymm0,(%rdi)
│ and $0xffffffffffffff80,%rcx
6.41 │ vmovdqu %ymm0,-0x20(%rdi,%rdx,1)
│ vmovdqu %ymm0,0x20(%rdi)
│ vmovdqu %ymm0,-0x40(%rdi,%rdx,1)
│ vmovdqu %ymm0,0x40(%rdi)
3.71 │ vmovdqu %ymm0,-0x60(%rdi,%rdx,1)
│ vmovdqu %ymm0,0x60(%rdi)
│ vmovdqu %ymm0,-0x80(%rdi,%rdx,1)
│ add %rdi,%rdx
3.61 │ and $0xffffffffffffff80,%rdx
│ cmp %rdx,%rcx
│ ↑ je 51
14.78 │ 97: vmovdqa %ymm0,(%rcx)
37.20 │ vmovdqa %ymm0,0x20(%rcx)
13.57 │ vmovdqa %ymm0,0x40(%rcx)
13.06 │ vmovdqa %ymm0,0x60(%rcx)
3.45 │ add $0x80,%rcx
│ cmp %rcx,%rdx
│ ↑ jne 97
0.94 │ vzeroupper
│ ← retq
│ ba: cmp $0x10,%dl
│ ↓ jae db
│ vmovq %xmm0,%rcx
│ cmp $0x8,%dl
│ ↓ jae e9

64<rdx<=128时(以avx2为例, 即2*VEC_SIZE<rdx<=4*VEC_SIZE):

1
2
3
4
5
6
7
8
9
10
#include <string.h>

int main()
{
char str[5000];
while(1){
memset(str, 'a', 100);
}
return 0;
}

使用perf top查看是L(more_2x_vec)处的代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
__memset_avx2_unaligned_erms  /lib/x86_64-linux-gnu/libc-2.27.so
Percent│ ↓ jb ba
│ cmp $0x40,%rdx
│ ↓ ja 2a
│ vmovdqu %ymm0,-0x20(%rdi,%rdx,1)
│ vmovdqu %ymm0,(%rdi)
│ vzeroupper
│ ← retq
│ 2a: cmp $0x800,%rdx
│ → ja 18ef20 <__nss_group_lookup@GLIBC_2.2.5+0x251e0>
│ cmp $0x80,%rdx
│ ↓ ja 55
24.76 │ vmovdqu %ymm0,(%rdi)
0.01 │ vmovdqu %ymm0,0x20(%rdi)
│ vmovdqu %ymm0,-0x20(%rdi,%rdx,1)
61.51 │ vmovdqu %ymm0,-0x40(%rdi,%rdx,1)
│ 51: vzeroupper
│ ← retq

由于cpu均支持erms, 所以进入的逻辑是erms相关代码.

avx指令可能导致cpu降频的问题

根据 https://bugs.launchpad.net/linux/+bug/1727136 中的描述, glibc不使用avx相关代码实现memcpy/memset是为了避免可能导致cpu降频, 并引起性能下降.

可以通过以下perf命令检测CPU是否因执行AVX指令而进入降频(turbo license)状态:

1
sudo perf stat -e cpu/event=0x28,umask=0x18,name=core_power_lvl1_turbo_license/,cpu/event=0x28,umask=0x20,name=core_power_lvl2_turbo_license/,cpu/event=0x28,umask=0x40,name=core_power_throttle/,cycles -a -I 1000 sleep 10

性能验证测试

参考: https://stackoverflow.com/questions/33480999/how-can-the-rep-stosb-instruction-execute-faster-than-the-equivalent-loop 和 https://stackoverflow.com/questions/43343231/enhanced-rep-movsb-for-memcpy
中给出的测试, REP STOSB对于大块的数据是最优的选择.

在支持ERMS的CPU上, 会对REP STOSB指令进行优化, 使得其执行效率优于循环执行mov的逻辑. 而在不支持ERMS的CPU上则只能选择循环mov的操作了. 这里如果CPU支持AVX, 则mov会变为vmovdqa/vmovdqa64等AVX的mov指令.

以下进一步在当前CPU上进行代码对比测试:

测试代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include <string.h>

int main()
{
char str[5000];
int i =0;
int max = 1<<30;
while(i<max){
memset(str, 'a', 3000);
i++;
}
return 0;

}

这里使用一个第三方的优化过的libc库, 其中使用了avx512实现了memset

下载地址: https://www.agner.org/optimize/asmlib.zip

参考链接: https://www.agner.org/optimize/#asmlib

分别进行编译后运行perf测试性能:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
new_acu:pilot@quanta2:~/wuding$ gcc 1.c -o asmlib ./libaelf64o.a
new_acu:pilot@quanta2:~/wuding$ gcc 1.c -o origin

new_acu:pilot@quanta2:~/wuding$ perf stat ./asmlib

Performance counter stats for './asmlib':

33809.757694 task-clock (msec) # 1.000 CPUs utilized
11 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
50 page-faults # 0.001 K/sec
108,191,627,848 cycles # 3.200 GHz
181,495,657,395 instructions # 1.68 insn per cycle
56,914,104,466 branches # 1683.363 M/sec
153,916 branch-misses # 0.00% of all branches

33.810266264 seconds time elapsed

33.809945000 seconds user
0.000000000 seconds sys

new_acu:pilot@quanta2:~/wuding$ perf stat ./origin

Performance counter stats for './origin':

34796.854964 task-clock (msec) # 1.000 CPUs utilized
15 context-switches # 0.000 K/sec
1 cpu-migrations # 0.000 K/sec
50 page-faults # 0.001 K/sec
111,350,368,185 cycles # 3.200 GHz
29,025,326,422 instructions # 0.26 insn per cycle
7,522,153,422 branches # 216.173 M/sec
168,441 branch-misses # 0.00% of all branches

34.805927226 seconds time elapsed

34.797053000 seconds user
0.000000000 seconds sys

对比可以发现, 使用asmlib后运行时间从约34.8秒降到约33.8秒, 运行性能仅提升约3%.

查看使用asmlib运行时的perf top可见热点代码, 可验证的确使用的是avx512相关指令: vmovdqu64

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

│ 00000000000008ee <L050>:
│ L050():
95.91 │ vmovdqu64 %zmm16,(%rdi)
0.01 │ add $0x40,%rdi
│ and $0xffffffffffffffc0,%rdi
│ lea (%rcx,%rdx,1),%rax
4.08 │ and $0xffffffffffffffc0,%rax
│ sub %rax,%rdi
│ cmp MemsetCacheLimit,%rdx
│ → ja L200


│ Disassembly of section .text:

│ 0000000000000910 <L100>:
│ L100():
55.17 │0: vmovdqa64 %zmm16,(%rax,%rdi,1)
│ add $0x40,%rdi
44.83 │ ↑ jne 0

结论

综合分析来看, glibc的memset已经进行了优化. rep stos指令的性能并不差.