SimplePerf C++

承接上文，本文主要记录simpleperf C++部分的代码的阅读笔记。

Main

main main.cpp
- RunSimpleperfCmd command.cpp
  - CreateCommandInstance()
    - command->Run(args)

RecordCommand

RecordCommand : Command

Run()
- CheckPerfEventLimit() // 系统默认会限制用户开启perf权限
- setrlimit(RLIMIT_NOFILE) // 设置打开文件句柄的限制为最大值
- RunInAppContext // 使用run-as在被监控的app权限下重新运行simpleperf
  - run-as packagename
  - cp simpleperf to /data/data/packagename
  - shell ./simpleperf
  - pkill simpleperf
  - cp output_file to output_path
- PrepareRecording // 准备录制
  - PrepareVdsoFile // vdso是一个特殊的so文件, 由内核映射到所有的进程中
  - event_selection_set_.AddEventType // Add default event type.
  - event_selection_set_.SetSampleSpeed // 设置采样率
  - ExcludeKernel
  - TraceOffCpu // 默认只统计在cpu上的事件, 如果开启--trace-offcpu则还会记录线程off cpu的时间, 包括在内核里等待执行的时间
  - SetEventSelectionFlags
    - event_selection_set_.SetBranchSampling
    - event_selection_set_.EnableFpCallChainSampling() // 开启FP的堆栈回溯模式
    - or event_selection_set_.EnableDwarfCallChainSampling() // 或者开启dwarf模式的堆栈回溯
    - event_selection_set_.SetInherit()
    - event_selection_set_.SetClockId() // 设置生成采样时间戳所使用的时钟 (对应 --clockid 参数)
  - offline_unwinder_.reset(new OfflineUnwinder()) // TODO
  - callchain_joiner // TODO
  - event_selection_set_.AddMonitoredProcesses(pids) // Add monitored targets. 设置监控的进程pid
  - event_selection_set_.OpenEventFiles(cpus) // Open perf event files 打开perf的event files
  - event_selection_set_.MMapEventFiles() // create mapped buffers. 映射内核的循环队列缓存(内核会往这个buffer里写event, simpleperf会去读event)
  - event_selection_set_.PrepareToReadMmapEventData(callback) // 设置读取event的回调函数为 RecordCommand::ProcessRecord
  - CreateAndInitRecordFile() // 创建 perf.data 文件
  - event_selection_set_.GetIOEventLoop()->AddSignalEvents( SIGCHLD, SIGINT, SIGTERM ) // Ctrl+C退出模式, 监听退出相关的信号
  - or event_selection_set_.GetIOEventLoop()->AddPeriodicEvent(duration_in_sec_) // duration自动退出模式, 时长到了自动退出
  - jit_debug_reader_ // TODO
- ProcessRecord // 录制过程中数据的回调函数
  - UpdateRecord
    - UpdateMmapRecord if record type == PERF_RECORD_MMAP // TODO
    - UpdateMmapRecord if record type == PERF_RECORD_MMAP2
    - record->SetCommandName if record type == PERF_RECORD_COMM
  - ShouldOmitRecord() // TODO
  - jit_debug_reader_->UpdateRecord() // TODO
  - SaveRecord
    - SaveRecordForPostUnwinding()
      - record_file_writer_->WriteRecord()
    - or SaveRecordAfterUnwinding()
      - record.AdjustCallChainGeneratedByKernel()
      - UnwindRecord()
        
        offline_unwinder_->UnwindCallChain() // 回溯堆栈
        
        r.ReplaceRegAndStackWithCallChain(ips) // TODO
      - ExcludeKernelCallChain()
      - thread_tree_.Update() // TODO
      - record_file_writer_->WriteRecord()
    - or SaveRecordWithoutUnwinding()
      - record.AdjustCallChainGeneratedByKernel()
      - ExcludeKernelCallChain()
      - record_file_writer_->WriteRecord()
DoRecording // 开始录制
- write "STARTED" to start_profiling_fd_ file // 开始前往参数里的start_profiling文件写入一个字符串 "STARTED"并立刻关闭该文件, 标记已经开启录制
- event_selection_set_.GetIOEventLoop()->RunLoop() // 开始IO Loop
- event_selection_set_.FinishReadMmapEventData() // Loop结束后, 结束对buffer的读取
PostProcessRecording // 结束录制
- PostUnwindRecords() // Post unwind dwarf callchain 默认是在录制的过程中回溯堆栈, 会有时间上的消耗, 在post-unwind模式下是在录制完成后再去回溯堆栈.
- JoinCallChains() // Optionally join Callchains.
- DumpAdditionalFeatures(args) // Dump additional features, and close record file.
- record_file_writer_->Close() // 关闭报告文件
- print result // Show brief record result 打印简要的报告结果

EventSelectionSet

AddEventType

SetSampleFreq

OpenEventFiles

ReadCounters

PrepareToReadMmapEventData

FinishReadMmapEventData

OfflineUnwinder

UnwindCallChain()

使用 libunwindstack 库来回溯堆栈: https://android.googlesource.com/platform/system/core/+/master/libunwindstack/tools/
保存堆栈的sp和pc

perf

perf_record_type

PERF_RECORD_MMAP 映射so事件, 用于关联IP到代码, 返回so映射内存的地址, 长度, 名称等信息
PERF_RECORD_MMAP2 mmap调用事件, executable mappings
PERF_RECORD_COMM 进程名改变事件, 包括新的进程名称等信息
PERF_RECORD_EXIT 进程退出事件
PERF_RECORD_SAMPLE sample事件, 包括sample数据

参考资料: http://www.man7.org/linux/man-pages/man2/perf_event_open.2.html

sample事件:



struct {
  struct perf_event_header header;
  u64    sample_id;   /* if PERF_SAMPLE_IDENTIFIER */
  u64    ip;          /* if PERF_SAMPLE_IP */
  u32    pid, tid;    /* if PERF_SAMPLE_TID */
  u64    time;        /* if PERF_SAMPLE_TIME */
  u64    addr;        /* if PERF_SAMPLE_ADDR */
  u64    id;          /* if PERF_SAMPLE_ID */
  u64    stream_id;   /* if PERF_SAMPLE_STREAM_ID */
  u32    cpu, res;    /* if PERF_SAMPLE_CPU */
  u64    period;      /* if PERF_SAMPLE_PERIOD */
  struct read_format v; /* if PERF_SAMPLE_READ */
  u64    nr;          /* if PERF_SAMPLE_CALLCHAIN */
  u64    ips[nr];     /* if PERF_SAMPLE_CALLCHAIN */
  u32    size;        /* if PERF_SAMPLE_RAW */
  char  data[size];   /* if PERF_SAMPLE_RAW */
  u64    bnr;         /* if PERF_SAMPLE_BRANCH_STACK */
  struct perf_branch_entry lbr[bnr]; /* if PERF_SAMPLE_BRANCH_STACK */
  u64    abi;         /* if PERF_SAMPLE_REGS_USER */
  u64    regs[weight(mask)]; /* if PERF_SAMPLE_REGS_USER */
  u64    size;        /* if PERF_SAMPLE_STACK_USER */
  char   data[size];  /* if PERF_SAMPLE_STACK_USER */
  u64    dyn_size;    /* if PERF_SAMPLE_STACK_USER && size != 0 */
  u64    weight;      /* if PERF_SAMPLE_WEIGHT */
  u64    data_src;    /* if PERF_SAMPLE_DATA_SRC */
  u64    transaction; /* if PERF_SAMPLE_TRANSACTION */
  u64    abi;         /* if PERF_SAMPLE_REGS_INTR */
  u64    regs[weight(mask)]; /* if PERF_SAMPLE_REGS_INTR */
};

record

开始录制

simpleperf record

参数:

#if defined(__ANDROID__)
"--app package_name    Profile the process of an Android application.\n"
"                      On non-rooted devices, the app must be debuggable,\n"
"                      because we use run-as to switch to the app's context.\n"
#endif
"-p pid1,pid2,...       Record events on existing processes. Mutually exclusive\n"
"                       with -a.\n"
"-t tid1,tid2,... Record events on existing threads. Mutually exclusive with -a.\n"
"\n"
"Select monitored event types:\n"
"-e event1[:modifier1],event2[:modifier2],...\n"
"             Select a list of events to record. An event can be:\n"
"               1) an event name listed in `simpleperf list`;\n"
"               2) a raw PMU event in rN format. N is a hex number.\n"
"                  For example, r1b selects event number 0x1b.\n"
"             Modifiers can be added to define how the event should be\n"
"             monitored. Possible modifiers are:\n"
"                u - monitor user space events only\n"
"                k - monitor kernel space events only\n"
"--group event1[:modifier],event2[:modifier2],...\n"
"             Similar to -e option. But events specified in the same --group\n"
"             option are monitored as a group, and scheduled in and out at the\n"
"             same time.\n"
"--trace-offcpu   Generate samples when threads are scheduled off cpu.\n"
"                 Similar to \"-c 1 -e sched:sched_switch\".\n"
"\n"
"Select monitoring options:\n"
"-f freq      Set event sample frequency. It means recording at most [freq]\n"
"             samples every second. For non-tracepoint events, the default\n"
"             option is -f 4000. A -f/-c option affects all event types\n"
"             following it until meeting another -f/-c option. For example,\n"
"             for \"-f 1000 cpu-cycles -c 1 -e sched:sched_switch\", cpu-cycles\n"
"             has sample freq 1000, sched:sched_switch event has sample period 1.\n"
"-c count     Set event sample period. It means recording one sample when\n"
"             [count] events happen. For tracepoint events, the default option\n"
"             is -c 1.\n"
"--call-graph fp | dwarf[,<dump_stack_size>]\n"
"             Enable call graph recording. Use frame pointer or dwarf debug\n"
"             frame as the method to parse call graph in stack.\n"
"             Default is dwarf,65528.\n"
"-g           Same as '--call-graph dwarf'.\n"
"--clockid clock_id      Generate timestamps of samples using selected clock.\n"
"                        Possible values are: realtime, monotonic,\n"
"                        monotonic_raw, boottime, perf. Default is perf.\n"
"--cpu cpu_item1,cpu_item2,...\n"
"             Collect samples only on the selected cpus. cpu_item can be cpu\n"
"             number like 1, or cpu range like 0-3.\n"
"--duration time_in_sec  Monitor for time_in_sec seconds instead of running\n"
"                        [command]. Here time_in_sec may be any positive\n"
"                        floating point number.\n"
"-j branch_filter1,branch_filter2,...\n"
"             Enable taken branch stack sampling. Each sample captures a series\n"
"             of consecutive taken branches.\n"
"             The following filters are defined:\n"
"                any: any type of branch\n"
"                any_call: any function call or system call\n"
"                any_ret: any function return or system call return\n"
"                ind_call: any indirect branch\n"
"                u: only when the branch target is at the user level\n"
"                k: only when the branch target is in the kernel\n"
"             This option requires at least one branch type among any, any_call,\n"
"             any_ret, ind_call.\n"
"-b           Enable taken branch stack sampling. Same as '-j any'.\n"
"-m mmap_pages   Set the size of the buffer used to receiving sample data from\n"
"                the kernel. It should be a power of 2. If not set, the max\n"
"                possible value <= 1024 will be used.\n"
"--no-inherit  Don't record created child threads/processes.\n"
"\n"
"Dwarf unwinding options:\n"
"--post-unwind=(yes|no) If `--call-graph dwarf` option is used, then the user's\n"
"                       stack will be recorded in perf.data and unwound while\n"
"                       recording by default. Use --post-unwind=yes to switch\n"
"                       to unwind after recording.\n"
"--no-unwind   If `--call-graph dwarf` option is used, then the user's stack\n"
"              will be unwound by default. Use this option to disable the\n"
"              unwinding of the user's stack.\n"
"--no-callchain-joiner  If `--call-graph dwarf` option is used, then by default\n"
"                       callchain joiner is used to break the 64k stack limit\n"
"                       and build more complete call graphs. However, the built\n"
"                       call graphs may not be correct in all cases.\n"
"--callchain-joiner-min-matching-nodes count\n"
"               When callchain joiner is used, set the matched nodes needed to join\n"
"               callchains. The count should be >= 1. By default it is 1.\n"
"\n"
"Recording file options:\n"
"--no-dump-kernel-symbols  Don't dump kernel symbols in perf.data. By default\n"
"                          kernel symbols will be dumped when needed.\n"
"--no-dump-symbols       Don't dump symbols in perf.data. By default symbols are\n"
"                        dumped in perf.data, to support reporting in another\n"
"                        environment.\n"
"-o record_file_name    Set record file name, default is perf.data.\n"
"--exit-with-parent            Stop recording when the process starting\n"
"                              simpleperf dies.\n"
"--size-limit SIZE[K|M|G]      Stop recording after SIZE bytes of records.\n"
"                              Default is unlimited.\n"
"--start_profiling_fd fd_no    After starting profiling, write \"STARTED\" to\n"
"                              <fd_no>, then close <fd_no>.\n"
"--symfs <dir>    Look for files with symbols relative to this directory.\n"
"                 This option is used to provide files with symbol table and\n"
"                 debug information, which are used for unwinding and dumping symbols.\n"

其中关键参数:

--call-graph, 是否开启调用栈的录制, 两种堆栈回溯的模式: fp(基于寄存器的stack frame pointer), 默认模式为dwarf(基于so的.debug_frame)
-e, 录制的事件列表, 可以通过 simpleperf list 来查看当前系统支持的event有哪些. u为只监控用户态, k为只监控内核态

关于事件:

硬件事件: 由CPU的PMU模块产生的事件
软件事件: 由内核产生的事件, 例如tick事件
- task-clock:u 监控函数的CPU占用情况

关于堆栈回溯:

使用libunwind库

关于dwarf模式:
dwarf是so里面的 .debug 段, 其中 .debug_frame 可用来做栈回溯, 因为有些CPU不会保存 Stack Frame Pointer (fp) 到寄存器.

关于fp模式:
但是在Android的某些机器上(目前看是Android 6.x, 可能是小于7.0的机器) 会提示不支持 dwarf 模式; 而改用fp模式时, 又会提示 fp 模式无法很好地工作 (fp mode will not work well)

report

将perf.data数据转换为txt数据

simpleperf report

参数列表:

-g, 带函数调用关系, callee是被调用关系, caller是调用关系
-o, output文件名
--symfs,

            "report", "report sampling information in perf.data",
            "Usage: simpleperf report [options]\n"
            "    -b            Use the branch-to addresses in sampled take branches instead of\n"
            "                  the instruction addresses. Only valid for perf.data recorded with\n"
            "                  -b/-j option.\n"
            "    --children    Print the overhead accumulated by appearing in the callchain.\n"
            "    --comms comm1,comm2,...\n"
            "                  Report only for selected comms.\n"
            "    --dsos dso1,dso2,...\n"
            "                  Report only for selected dsos.\n"
            "    -g [callee|caller]\n"
            "                  Print call graph. If callee mode is used, the graph shows how\n"
            "                  functions are called from others. Otherwise, the graph shows how\n"
            "                  functions call others. Default is callee mode.\n"
            "    -i <file>     Specify path of record file, default is perf.data.\n"
            "    -n            Print the sample count for each item.\n"
            "    --no-demangle        Don't demangle symbol names.\n"
            "    -o report_file_name  Set report file name, default is stdout.\n"
            "    --pid pid1,pid2,...\n"
            "                  Report only for selected pids.\n"
            "    --sort key1,key2,...\n"
            "                  Select the keys to sort and print the report. Possible keys\n"
            "                  include pid, tid, comm, dso, symbol, dso_from, dso_to, symbol_from\n"
            "                  symbol_to. dso_from, dso_to, symbol_from, symbol_to can only be\n"
            "                  used with -b option. Default keys are \"comm,pid,tid,dso,symbol\"\n"
            "    --symfs <dir> Look for files with symbols relative to this directory.\n"
            "    --tids tid1,tid2,...\n"
            "                  Report only for selected tids.\n"
            "    --vmlinux <file>\n"
            "                  Parse kernel symbols from <file>.\n"),

Event FD

打开 perf event

linux手册: http://www.man7.org/linux/man-pages/man2/perf_event_open.2.html

// Thin wrapper around the perf_event_open(2) system call, which is invoked
// through syscall() with __NR_perf_event_open.
//
// attr      event configuration; its address is passed to the kernel.
// pid, cpu  target thread/process and cpu, forwarded unchanged.
// group_fd  fd of the group leader event, or -1 for no group (see caller).
// flags     forwarded unchanged.
//
// Returns whatever syscall() returns: per the man page, a new file
// descriptor on success, or -1 with errno set on failure.
static int perf_event_open(const perf_event_attr& attr, pid_t pid, int cpu,
                           int group_fd, unsigned long flags) {  // NOLINT
  return syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, flags);
}






std::unique_ptr EventFd::OpenEventFile(const perf_event_attr& attr,
                                                pid_t tid, int cpu,
                                                EventFd* group_event_fd,
                                                bool report_error) {
  std::string event_name = GetEventNameByAttr(attr);
  int group_fd = -1;
  if (group_event_fd != nullptr) {
    group_fd = group_event_fd->perf_event_fd_;
  }
  perf_event_attr real_attr = attr;
  if (attr.freq) {
    uint64_t max_sample_freq;
    if (GetMaxSampleFrequency(&max_sample_freq) && max_sample_freq < attr.sample_freq) {
      static bool warned = false;
      if (!warned) {
        warned = true;
        LOG(INFO) << "Adjust sample freq to max allowed sample freq " << max_sample_freq;
      }
      real_attr.sample_freq = max_sample_freq;
    }
  }
  int perf_event_fd = perf_event_open(real_attr, tid, cpu, group_fd, 0);
  if (perf_event_fd == -1) {
    if (report_error) {
      PLOG(ERROR) << "open perf_event_file (event " << event_name << ", tid "
                  << tid << ", cpu " << cpu << ", group_fd " << group_fd
                  << ") failed";
    } else {
      PLOG(DEBUG) << "open perf_event_file (event " << event_name << ", tid "
                  << tid << ", cpu " << cpu << ", group_fd " << group_fd
                  << ") failed";
    }
    return nullptr;
  }
  if (fcntl(perf_event_fd, F_SETFD, FD_CLOEXEC) == -1) {
    if (report_error) {
      PLOG(ERROR) << "fcntl(FD_CLOEXEC) for perf_event_file (event "
                  << event_name << ", tid " << tid << ", cpu " << cpu
                  << ", group_fd " << group_fd << ") failed";
    } else {
      PLOG(DEBUG) << "fcntl(FD_CLOEXEC) for perf_event_file (event "
                  << event_name << ", tid " << tid << ", cpu " << cpu
                  << ", group_fd " << group_fd << ") failed";
    }
    return nullptr;
  }
  return std::unique_ptr(
      new EventFd(real_attr, perf_event_fd, event_name, tid, cpu));
}

参考资料

Android内核版本

Android Version    |API Level  |Linux Kernel in AOSP
\----------------------------------------------------
1.5   Cupcake      |3          |2.6.27
1.6   Donut        |4          |2.6.29
2.0/1 Eclair       |5-7        |2.6.29
2.2.x Froyo        |8          |2.6.32
2.3.x Gingerbread  |9, 10      |2.6.35
3.x.x Honeycomb    |11-13      |2.6.36
4.0.x Ice Cream San|14, 15     |3.0.1
4.1.x Jelly Bean   |16         |3.0.31
4.2.x Jelly Bean   |17         |3.4.0
4.3   Jelly Bean   |18         |3.4.39
4.4   Kit Kat      |19, 20     |3.10
5.x   Lollipop     |21, 22     |3.16.1
6.0   Marshmallow  |23         |3.18.10
7.0   Nougat       |24         |4.4.1
7.1   Nougat       |25         |4.4.1 (To be updated)

Why we suggest profiling on Android >= N devices?

1. Running on a device reflects a real running situation, so we suggest
profiling on real devices instead of emulators.
2. To profile Java code, we need ART running in oat mode, which is only
available >= L for rooted devices, and >= N for non-rooted devices.
3. Old Android versions are likely to be shipped with old kernels (< 3.18),
which may not support profiling features like recording dwarf based call graphs.
4. Old Android versions are likely to be shipped with Arm32 chips. In Arm32
mode, recording stack frame based call graphs doesn't work well.

NOTE ATTRIBUTES

Created Date: 2018-08-09 08:45:52
Last Evernote Update Date: 2018-08-14 08:04:49

simpleperf源码阅读-1.C++