#include <stdio.h>
#include <sys/time.h>

/*
 * Micro-benchmark comparing a plain increment with an atomic increment.
 *
 * Runs two loops of ITERATIONS steps each, timing both with gettimeofday():
 *   1. "add"    - m++ on a volatile int, so the compiler must perform a
 *                 real load/add/store on every iteration instead of
 *                 collapsing the loop.
 *   2. "atomic" - __sync_fetch_and_add(&n, 1), the GCC atomic builtin.
 *
 * The elapsed wall-clock time of each loop is printed in microseconds.
 * Always returns 0.
 */
int main()
{
    enum { ITERATIONS = 1000000, USEC_PER_SEC = 1000000 };

    struct timeval start;
    struct timeval end;

    /* Initialised so the first m++ does not read an indeterminate value. */
    volatile int m = 0;

    gettimeofday(&start, NULL);
    for (int i = 0; i < ITERATIONS; i++) {
        m++;
    }
    gettimeofday(&end, NULL);
    /*
     * tv_sec / tv_usec have implementation-defined integer types; cast to
     * long long so the argument really matches the %lld conversion.
     */
    printf("add cost %lldus\n",
           (long long)(end.tv_sec - start.tv_sec) * USEC_PER_SEC
               + (long long)(end.tv_usec - start.tv_usec));

    /* Initialised for the same reason as m. */
    int n = 0;

    gettimeofday(&start, NULL);
    for (int i = 0; i < ITERATIONS; i++) {
        __sync_fetch_and_add(&n, 1);
    }
    gettimeofday(&end, NULL);
    printf("atomic cost %lldus\n",
           (long long)(end.tv_sec - start.tv_sec) * USEC_PER_SEC
               + (long long)(end.tv_usec - start.tv_usec));

    return 0;
}
之所以用 volatile 修饰 m,是为了阻止编译器把循环中的 m++ 优化掉(例如把一百万次自增合并成一次加法,或者直接删除整个循环),从而保证每次迭代都真正执行一次读-加-写。
使用O2编译并查看性能:
$gcc -O2 -std=c99 -o perf atomic_perf.c
$./perf
add cost 2638us
atomic cost 8510us
可见如果你的变量压根不会被多线程访问,并且对性能极度苛刻的话,还是不要用原子变量了吧。因为在有些平台上“A full memory barrier is created when this function is invoked”。
可以通过下面的方法看到m++和原子操作的汇编之间的区别:
$gcc -O2 -std=c99 -g -c atomic_perf.c
$objdump -Sl atomic_perf.o
atomic_perf.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <main>:
main():
/home/admin/jinxin/test/atomic_perf.c:5
#include <stdio.h>
#include <sys/time.h>
int main()
{
0: 55 push %rbp
/home/admin/jinxin/test/atomic_perf.c:9
volatile int m;
struct timeval start;
gettimeofday(&start, NULL);
1: 31 f6 xor %esi,%esi
/home/admin/jinxin/test/atomic_perf.c:5
3: 53 push %rbx
4: 48 83 ec 38 sub $0x38,%rsp
/home/admin/jinxin/test/atomic_perf.c:9
8: 48 8d 6c 24 10 lea 0x10(%rsp),%rbp
d: 48 89 ef mov %rbp,%rdi
10: e8 00 00 00 00 callq 15 <main+0x15>
15: 31 d2 xor %edx,%edx
/home/admin/jinxin/test/atomic_perf.c:11
for (int i = 0; i < 1000000; i++) {
m++;
17: 8b 44 24 2c mov 0x2c(%rsp),%eax
/home/admin/jinxin/test/atomic_perf.c:10
1b: 83 c2 01 add $0x1,%edx
/home/admin/jinxin/test/atomic_perf.c:11
1e: 83 c0 01 add $0x1,%eax
/home/admin/jinxin/test/atomic_perf.c:10
21: 81 fa 40 42 0f 00 cmp $0xf4240,%edx
/home/admin/jinxin/test/atomic_perf.c:11
27: 89 44 24 2c mov %eax,0x2c(%rsp)
/home/admin/jinxin/test/atomic_perf.c:10
2b: 75 ea jne 17 <main+0x17>
/home/admin/jinxin/test/atomic_perf.c:14
}
struct timeval end;
gettimeofday(&end, NULL);
2d: 31 f6 xor %esi,%esi
2f: 48 89 e7 mov %rsp,%rdi
32: e8 00 00 00 00 callq 37 <main+0x37>
/home/admin/jinxin/test/atomic_perf.c:16
printf("add cost %lldus\n", (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec));
37: 48 8b 04 24 mov (%rsp),%rax
3b: 48 2b 44 24 10 sub 0x10(%rsp),%rax
40: bf 00 00 00 00 mov $0x0,%edi
45: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
4a: 48 2b 74 24 18 sub 0x18(%rsp),%rsi
4f: 48 69 c0 40 42 0f 00 imul $0xf4240,%rax,%rax
56: 48 01 c6 add %rax,%rsi
59: 31 c0 xor %eax,%eax
5b: e8 00 00 00 00 callq 60 <main+0x60>
/home/admin/jinxin/test/atomic_perf.c:19
int n;
gettimeofday(&start, NULL);
60: 31 f6 xor %esi,%esi
62: 48 89 ef mov %rbp,%rdi
65: e8 00 00 00 00 callq 6a <main+0x6a>
6a: 48 8d 54 24 28 lea 0x28(%rsp),%rdx
6f: 31 c0 xor %eax,%eax
/home/admin/jinxin/test/atomic_perf.c:21
for (int i = 0; i < 1000000; i++) {
__sync_fetch_and_add(&n, 1);
71: f0 83 02 01 lock addl $0x1,(%rdx)
/home/admin/jinxin/test/atomic_perf.c:20
75: 83 c0 01 add $0x1,%eax
78: 3d 40 42 0f 00 cmp $0xf4240,%eax
7d: 75 f2 jne 71 <main+0x71>
/home/admin/jinxin/test/atomic_perf.c:23
}
gettimeofday(&end, NULL);
7f: 48 89 e7 mov %rsp,%rdi
82: 31 f6 xor %esi,%esi
84: e8 00 00 00 00 callq 89 <main+0x89>
/home/admin/jinxin/test/atomic_perf.c:24
printf("atomic cost %lldus\n", (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec));
89: 48 8b 04 24 mov (%rsp),%rax
8d: 48 2b 44 24 10 sub 0x10(%rsp),%rax
92: bf 00 00 00 00 mov $0x0,%edi
97: 48 8b 74 24 08 mov 0x8(%rsp),%rsi
9c: 48 2b 74 24 18 sub 0x18(%rsp),%rsi
a1: 48 69 c0 40 42 0f 00 imul $0xf4240,%rax,%rax
a8: 48 01 c6 add %rax,%rsi
ab: 31 c0 xor %eax,%eax
ad: e8 00 00 00 00 callq b2 <main+0xb2>
/home/admin/jinxin/test/atomic_perf.c:27
return 0;
}
b2: 48 83 c4 38 add $0x38,%rsp
b6: 31 c0 xor %eax,%eax
b8: 5b pop %rbx
b9: 5d pop %rbp
ba: c3 retq