I find it solves about 80% of server issues with 5% of the effort.
top - 17:13:49 up 83 days, 23:10, 1 user, load average: 433.52, 422.54, 438.70
Tasks: 2765 total, 23 running, 1621 sleeping, 0 stopped, 34 zombie
%Cpu(s): 23.4 us, 9.5 sy, 0.0 ni, 65.5 id, 0.7 wa, 0.0 hi, 1.0 si, 0.0 st
load average: 433.52, 422.54, 438.70
#dstat -tp
----system---- ---procs---
time |run blk new
07-03 17:56:50|204 1.0 202
07-03 17:56:51|212 0 238
07-03 17:56:52|346 1.0 266
07-03 17:56:53|279 5.0 262
07-03 17:56:54|435 7.0 177
07-03 17:56:55|442 3.0 251
07-03 17:56:56|792 8.0 419
07-03 17:56:57|504 16 152
07-03 17:56:58|547 3.0 156
07-03 17:56:59|606 2.0 212
07-03 17:57:00|770 0 186
#free -g
total used free shared buff/cache available
Mem: 503 193 7 2 301 301
Swap: 0 0 0
更详细的信息可以直接去读/proc/meminfo:
#cat /proc/meminfo
MemTotal: 527624224 kB
MemFree: 8177852 kB
MemAvailable: 316023388 kB
Buffers: 23920716 kB
Cached: 275403332 kB
SwapCached: 0 kB
Active: 59079772 kB
Inactive: 431064908 kB
Active(anon): 1593580 kB
Inactive(anon): 191649352 kB
Active(file): 57486192 kB
Inactive(file): 239415556 kB
Unevictable: 249700 kB
Mlocked: 249700 kB
SwapTotal: 0 kB
SwapFree: 0 kB
[...]
#sar -B 1
11:00:16 AM pgscank/s pgscand/s pgsteal/s %vmeff
11:00:17 AM 0.00 0.00 3591.00 0.00
11:00:18 AM 0.00 0.00 10313.00 0.00
11:00:19 AM 0.00 0.00 8452.00 0.00
Node 0 Node 1 Node 2 Node 3
--------------- --------------- --------------- ---------------
MemTotal 31511.92 32255.18 32255.18 32255.18
MemFree 2738.79 131.89 806.50 10352.02
MemUsed 28773.12 32123.29 31448.69 21903.16
Active 7580.58 419.80 9597.45 5780.64
Inactive 17081.27 26844.28 19806.99 13504.79
Active(anon) 6.63 0.93 2.08 5.64
Inactive(anon) 12635.75 25560.53 12754.29 9053.80
Active(file) 7573.95 418.87 9595.37 5775.00
Inactive(file) 4445.52 1283.75 7052.70 4450.98
mkdir /sys/fs/cgroup/cpuset/overloaded
echo 0-1 > /sys/fs/cgroup/cpuset/cpuset.cpus
echo 0 > /sys/fs/cgroup/cpuset/cpuset.mems
echo $$
for i in {0..1023}; do /tmp/busy & done
#uptime
14:10:54 up 6 days, 18:52, 10 users, load average: 920.92, 411.61, 166.95
#pidstat -p `pgrep myserv` -t 1
15:47:05 UID TGID TID %usr %system %guest %CPU CPU Command
15:47:06 0 71942 - 415.00 0.00 0.00 415.00 22 myserv
15:47:06 0 - 71942 0.00 0.00 0.00 0.00 22 |__myserv
...
15:47:06 0 - 72079 7.00 94.00 0.00 101.00 21 |__myserv
15:47:06 0 - 72080 10.00 90.00 0.00 100.00 19 |__myserv
15:47:06 0 - 72081 9.00 91.00 0.00 100.00 35 |__myserv
15:47:06 0 - 72082 5.00 95.00 0.00 100.00 29 |__myserv
largest=70
while :; do
mem=$(ps -p `pidof mysqld` -o %mem | tail -1)
imem=$(printf %.0f $mem)
if [ $imem -gt $largest ]; then
echo 'p malloc_stats_print(0,0,0)' | gdb --quiet -nx -p `pidof mysqld`
fi
sleep 10
done
#perf list | grep Hardware
branch-misses [Hardware event]
bus-cycles [Hardware event]
cache-misses [Hardware event]
cache-references [Hardware event]
cpu-cycles OR cycles [Hardware event]
instructions [Hardware event]
L1-dcache-load-misses [Hardware cache event]
L1-dcache-loads [Hardware cache event]
L1-dcache-store-misses [Hardware cache event]
L1-dcache-stores [Hardware cache event]
L1-icache-load-misses [Hardware cache event]
L1-icache-loads [Hardware cache event]
branch-load-misses [Hardware cache event]
branch-loads [Hardware cache event]
dTLB-load-misses [Hardware cache event]
iTLB-load-misses [Hardware cache event]
mem:<addr>[/len][:access] [Hardware breakpoint]
#perf stat -p 31925 sleep 1
Performance counter stats for process id '31925':
2184.986720 task-clock (msec) # 2.180 CPUs utilized
3,210 context-switches # 0.001 M/sec
345 cpu-migrations # 0.158 K/sec
0 page-faults # 0.000 K/sec
4,311,798,055 cycles # 1.973 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
409,465,681 instructions # 0.09 insns per cycle
<not supported> branches
8,680,257 branch-misses # 0.00% of all branches
1.002506001 seconds time elapsed
void busy(long us) {
struct timeval tv1, tv2;
long delta = 0;
gettimeofday(&tv1, NULL);
do {
gettimeofday(&tv2, NULL);
delta = (tv2.tv_sec - tv1.tv_sec) * 1000000 + tv2.tv_usec - tv1.tv_usec;
} while (delta < us);
}
void A() { busy(2000); }
void B() { busy(8000); }
int main() {
while (1) {
A(); B();
}
return 0;
}
Samples: 27K of event 'cycles', Event count (approx.): 14381317911
Children Self Command Shared Object Symbol
+ 99.99% 0.00% a.out [unknown] [.] 0x0000fffffb925137
+ 99.99% 0.00% a.out a.out [.] _start
+ 99.99% 0.00% a.out libc-2.17.so [.] __libc_start_main
+ 99.99% 0.00% a.out a.out [.] main
+ 99.06% 25.95% a.out a.out [.] busy
+ 79.98% 0.00% a.out a.out [.] B
- 71.31% 71.31% a.out [vdso] [.] __kernel_gettimeofday
__kernel_gettimeofday
- busy
+ 79.84% B
+ 20.16% A
+ 20.01% 0.00% a.out a.out [.] A
#strace -v perf record -g -e cycles ./a.out
perf_event_open({type=PERF_TYPE_HARDWARE, size=PERF_ATTR_SIZE_VER5, config=PERF_COUNT_HW_CPU_CYCLES, sample_freq=4000, sample_type=PERF_SAMPLE_IP|PERF_SAMPLE_TID|PERF_SAMPLE_TIME|PERF_SAMPLE_CALLCHAIN|PERF_SAMPLE_PERIOD, read_format=0, disabled=1, inherit=1, pinned=0, exclusive=0, exclusive_user=0, exclude_kernel=0, exclude_hv=0, exclude_idle=0, mmap=1, comm=1, freq=1, inherit_stat=0, enable_on_exec=1, task=1, watermark=0, precise_ip=0 /* arbitrary skid */, mmap_data=0, sample_id_all=1, exclude_host=0, exclude_guest=1, exclude_callchain_kernel=0, exclude_callchain_user=0, mmap2=1, comm_exec=1, use_clockid=0, context_switch=0, write_backward=0, namespaces=0, wakeup_events=0, config1=0, config2=0, sample_regs_user=0, sample_regs_intr=0, aux_watermark=0, sample_max_stack=0}, 51876, 25, -1, PERF_FLAG_FD_CLOEXEC) = 30
sudo btrace /dev/sda
8,0 0 1 0.000000000 1024 A WS 302266328 + 8 <- (8,5) 79435736
8,0 0 2 0.000001654 1024 Q WS 302266328 + 8 [jbd2/sda5-8]
8,0 0 3 0.000010042 1024 G WS 302266328 + 8 [jbd2/sda5-8]
8,0 0 4 0.000011605 1024 P N [jbd2/sda5-8]
8,0 0 5 0.000014993 1024 I WS 302266328 + 8 [jbd2/sda5-8]
8,0 0 0 0.000018026 0 m N cfq1024SN / insert_request
8,0 0 0 0.000019598 0 m N cfq1024SN / add_to_rr
8,0 0 6 0.000022546 1024 U N [jbd2/sda5-8] 1
$ sudo blktrace -d /dev/sdb -w 5
$ blkparse sdb -d sdb.bin
$ btt -i sdb.bin
==================== All Devices ====================
ALL MIN AVG MAX N
--------------- ------------- ------------- ------------- -----------
Q2Q 0.000000001 0.000014397 0.008275391 347303
Q2G 0.000000499 0.000071615 0.010518692 347298
S2G 0.000128160 0.002107990 0.010517875 11512
G2I 0.000000600 0.000001570 0.000040010 347298
I2D 0.000000395 0.000000929 0.000003743 347298
D2C 0.000116199 0.000144157 0.008443855 347288
Q2C 0.000118211 0.000218273 0.010678657 347288
==================== Device Overhead ====================
DEV | Q2G G2I Q2M I2D D2C
---------- | --------- --------- --------- --------- ---------
( 8, 16) | 32.8106% 0.7191% 0.0000% 0.4256% 66.0447%
---------- | --------- --------- --------- --------- ---------
Overall | 32.8106% 0.7191% 0.0000% 0.4256% 66.0447%
二 进阶篇
lat_mem_rd -P 1 -N 1 10240 512
trace-cmd record -p function --func-stack -l generic_make_request dd if=/dev/zero of=file bs=4k count=1 oflag=direct
#trace-cmd report
cpus=128
dd-11344 [104] 4148325.319997: function: generic_make_request
dd-11344 [104] 4148325.320002: kernel_stack: <stack trace>
=> ftrace_graph_call (ffff00000809849c)
=> generic_make_request (ffff000008445b80)
=> submit_bio (ffff000008445f00)
=> __blockdev_direct_IO (ffff00000835a0a8)
=> ext4_direct_IO_write (ffff000001615ff8)
=> ext4_direct_IO (ffff0000016164c4)
=> generic_file_direct_write (ffff00000825c4e0)
=> __generic_file_write_iter (ffff00000825c684)
=> ext4_file_write_iter (ffff0000016013b8)
=> __vfs_write (ffff00000830c308)
=> vfs_write (ffff00000830c564)
=> ksys_write (ffff00000830c884)
=> __arm64_sys_write (ffff00000830c918)
=> el0_svc_common (ffff000008095f38)
=> el0_svc_handler (ffff0000080960b0)
=> el0_svc (ffff000008084088)
$ sudo trace-cmd record -p function_graph -g generic_make_request dd if=/dev/zero of=file bs=4k count=1 oflag=direct
这样就可以拿到整个调用过程 (report结果稍微整理过):
$ trace-cmd report
dd-22961 | generic_make_request() {
dd-22961 | generic_make_request_checks() {
dd-22961 0.080 us | _cond_resched();
dd-22961 | create_task_io_context() {
dd-22961 0.485 us | kmem_cache_alloc_node();
dd-22961 0.042 us | _raw_spin_lock();
dd-22961 0.039 us | _raw_spin_unlock();
dd-22961 1.820 us | }
dd-22961 | blk_throtl_bio() {
dd-22961 0.302 us | throtl_update_dispatch_stats();
dd-22961 1.748 us | }
dd-22961 6.110 us | }
dd-22961 | blk_queue_bio() {
dd-22961 0.491 us | blk_queue_split();
dd-22961 0.299 us | blk_queue_bounce();
dd-22961 0.200 us | bio_integrity_enabled();
dd-22961 0.183 us | blk_attempt_plug_merge();
dd-22961 0.042 us | _raw_spin_lock_irq();
dd-22961 | elv_merge() {
dd-22961 0.176 us | elv_rqhash_find.isra.9();
dd-22961 | deadline_merge() {
dd-22961 0.108 us | elv_rb_find();
dd-22961 0.852 us | }
dd-22961 2.229 us | }
dd-22961 | get_request() {
dd-22961 0.130 us | elv_may_queue();
dd-22961 | mempool_alloc() {
dd-22961 0.040 us | _cond_resched();
dd-22961 | mempool_alloc_slab() {
dd-22961 0.395 us | kmem_cache_alloc();
dd-22961 0.744 us | }
dd-22961 1.650 us | }
dd-22961 0.334 us | blk_rq_init();
dd-22961 0.055 us | elv_set_request();
dd-22961 4.565 us | }
dd-22961 | init_request_from_bio() {
dd-22961 | blk_rq_bio_prep() {
dd-22961 | blk_recount_segments() {
dd-22961 0.222 us | __blk_recalc_rq_segments();
dd-22961 0.653 us | }
dd-22961 1.141 us | }
dd-22961 1.620 us | }
dd-22961 | blk_account_io_start() {
dd-22961 0.137 us | disk_map_sector_rcu();
dd-22961 | part_round_stats() {
dd-22961 0.195 us | part_round_stats_single();
dd-22961 0.054 us | part_round_stats_single();
dd-22961 0.955 us | }
dd-22961 2.148 us | }
dd-22961 + 15.847 us | }
dd-22961 + 23.642 us | }
[ ] | main() {
[ ] | A() {
0.160 us [ 69439] | busy();
1.080 us [ 69439] | } /* A */
[ ] | B() {
0.050 us [ 69439] | busy();
0.240 us [ 69439] | } /* B */
1.720 us [ 69439] | } /* main */
#include <linux/kvm_host.h>
BEGIN {
@qemu_pid = $1;
@mmio_start = 0xa000a00000;
@mmio_end = 0xa000a00000 + 16384;
@devid = 1536;
}
tracepoint:kvm:kvm_mmio /pid == @qemu_pid/ {
if (args->gpa >= @mmio_start && args->gpa < @mmio_end) {
@start = nsecs;
}
}
kprobe:kvm_set_msi {
$e = (struct kvm_kernel_irq_routing_entry *)arg0;
$kvm = (struct kvm *)arg1;
if (@start > 0 && $kvm->userspace_pid == @qemu_pid && $e->msi.devid == @devid) {
@dur = stats(nsecs - @start);
@start = 0;
}
}
interval:s:1 {
print(@dur); clear(@dur);
}
598, average 1606320, total 960579533 : count
543, average 1785906, total 969747196 : count
644, average 1495419, total 963049914 : count
624, average 1546575, total 965062935 : count
645, average 1495250, total 964436299 : count
5 * This file contains the magic bits required to compute the global loadavg
6 * figure. Its a silly number but people think its important. We go through
7 * great pains to make it work on big machines and tickless kernels.
kprobe:calc_load_fold_active /cpu == 0/ {
printf("%ld\n", nsecs / 1000000000);
}
#include "kernel/sched/sched.h"
kprobe:calc_global_load_tick /cpu == 0/ {
$rq = (struct rq *)arg0;
@[$rq->calc_load_update] = count();
}
interval:s:5 {
print(@); clear(@);
}
#./calc_load.bt -I /kernel-source
@[4465886482]: 61
@[4465887733]: 1189
@[4465887733]: 62
@[4465888984]: 1188
kprobe:id_nr_invalid /cpu == 0/ {
printf("%ld\n", nsecs / 1000000000);
}
while :; do
sec=$(awk -F. '{print $1}' /proc/uptime)
rem=$((sec % 5))
if [ $rem -eq 2 ]; then # 1s after updating load
break;
fi
sleep 0.1
done
for i in {0..63}; do
./busy 3 & # run 3s
done
top - 09:55:41 up 127 days, 1:44, 1 user, load average: 111.70, 87.08, 79.41
Tasks: 3685 total, 21 running, 2977 sleeping, 1 stopped, 8 zombie
%Cpu(s): 27.3 us, 8.9 sy, 0.0 ni, 59.8 id, 0.1 wa, 0.0 hi, 4.0 si, 0.0 st
#sar -B 1
11:00:16 AM pgscank/s pgscand/s pgsteal/s %vmeff
11:00:17 AM 0.00 0.00 3591.00 0.00
11:00:18 AM 0.00 0.00 10313.00 0.00
11:00:19 AM 0.00 0.00 8452.00 0.00
(gdb) b read_vmstat_paging
(gdb) set follow-fork-mode child
(gdb) r
Breakpoint 1, read_vmstat_paging (st_paging=0x424f40) at rd_stats.c:751
751 if ((fp = fopen(VMSTAT, "r")) == NULL)
(gdb) n
754 st_paging->pgsteal = 0;
(gdb)
757 while (fgets(line, sizeof(line), fp) != NULL) {
(gdb)
759 if (!strncmp(line, "pgpgin ", 7)) {
(gdb)
763 else if (!strncmp(line, "pgpgout ", 8)) {
(gdb)
767 else if (!strncmp(line, "pgfault ", 8)) {
(gdb)
771 else if (!strncmp(line, "pgmajfault ", 11)) {
(gdb)
775 else if (!strncmp(line, "pgfree ", 7)) {
(gdb)
779 else if (!strncmp(line, "pgsteal_", 8)) {
(gdb)
784 else if (!strncmp(line, "pgscan_kswapd", 13)) {
(gdb)
789 else if (!strncmp(line, "pgscan_direct", 13)) {
(gdb)
757 while (fgets(line, sizeof(line), fp) != NULL) {
(gdb)
#grep pgsteal_ /proc/vmstat
pgsteal_kswapd 168563
pgsteal_direct 0
pgsteal_anon 0
pgsteal_file 978205
#grep pgscan_ /proc/vmstat
pgscan_kswapd 204242
pgscan_direct 0
pgscan_direct_throttle 0
pgscan_anon 0
pgscan_file 50583828
if (current_is_kswapd()) {
if (!cgroup_reclaim(sc))
__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
nr_scanned);
} else {
if (!cgroup_reclaim(sc))
__count_vm_events(PGSCAN_DIRECT, nr_scanned);
count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
nr_scanned);
}
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
#df -h .
Filesystem Size Used Avail Use% Mounted on
cgroup 0 0 0 - /sys/fs/cgroup/memory
#grep -c 'pgscan\|pgsteal' memory.stat
0
[1]https://www.cs.rice.edu/~johnmc/comp528/lecture-notes/
[2]https://brendangregg.com/
[3]http://dtrace.org/blogs/bmc/
[4]https://blog.stgolabs.net/
[5]https://lwn.net/
[6]https://github.com/namhyung/uftrace