tombstone与debuggerd相关流程

2018-11-18 本文已影响0人 weiinter105

tombstone的抓取与debuggerd有关。debuggerd是一个守护进程，用来检测程序的崩溃，将程序崩溃前进程的状态记录下来，保存在/data/tombstones文件夹下，最多10个；本质上是对程序崩溃时某些信号的拦截

相关流程

客户端流程

首先，Android程序的入口有一个linker的操作，大致流程如下：

bionic/linker/arch/arm64/begin.S
31ENTRY(_start)
32  mov x0, sp
33  bl __linker_init
34
35  /* linker init returns the _entry address in the main image */
36  br x0
37END(_start)


bionic/linker/linker.cpp
4442/*
4443 * This is the entry point for the linker, called from begin.S. This
4444 * method is responsible for fixing the linker's own relocations, and
4445 * then calling __linker_init_post_relocation().
4446 *
4447 * Because this method is called before the linker has fixed it's own
4448 * relocations, any attempt to reference an extern variable, extern
4449 * function, or other GOT reference will generate a segfault.
4450 */
4451extern "C" ElfW(Addr) __linker_init(void* raw_args) {
          ...
4522  // We have successfully fixed our own relocations. It's safe to run
4523  // the main part of the linker now.
4524  args.abort_message_ptr = &g_abort_message;
4525  ElfW(Addr) start_address = __linker_init_post_relocation(args, linker_addr);
4526
4527  INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));
4528
4529  // Return the address that the calling assembly stub should jump to.
4530  return start_address;
4531}

4195/*
4196 * This code is called after the linker has linked itself and
4197 * fixed it's own GOT. It is safe to make references to externs
4198 * and other non-local data at this point.
4199 */
4200static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
4201#if TIMING
4202  struct timeval t0, t1;
4203  gettimeofday(&t0, 0);
4204#endif
4205
4206  // Sanitize the environment.
4207  __libc_init_AT_SECURE(args);
4208
4209  // Initialize system properties
4210  __system_properties_init(); // may use 'environ'
4211
4212  debuggerd_init();
4213
4214  // Get a few environment variables.
4215  const char* LD_DEBUG = getenv("LD_DEBUG");
4216  if (LD_DEBUG != nullptr) {
4217    g_ld_debug_verbosity = atoi(LD_DEBUG);
4218  }
           ...
4412}

bionic/linker/debugger.cpp
302__LIBC_HIDDEN__ void debuggerd_init() {
303  struct sigaction action;
304  memset(&action, 0, sizeof(action));
305  sigemptyset(&action.sa_mask);
306  action.sa_sigaction = debuggerd_signal_handler;
307  action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309  // Use the alternate signal stack if available so we can catch stack overflows.
310  action.sa_flags |= SA_ONSTACK;
311
312  sigaction(SIGABRT, &action, nullptr);
313  sigaction(SIGBUS, &action, nullptr);
314  sigaction(SIGFPE, &action, nullptr);
315  sigaction(SIGILL, &action, nullptr);
316  sigaction(SIGSEGV, &action, nullptr);
317#if defined(SIGSTKFLT)
318  sigaction(SIGSTKFLT, &action, nullptr);
319#endif
320  sigaction(SIGTRAP, &action, nullptr);
321}

为上面这几个信号注册信号处理函数，也就是说只有这几个信号会生成tombstone

SIGILL(非法指令异常)

SIGABRT(abort退出异常)

SIGBUS(硬件访问异常)

SIGFPE(浮点运算异常)

SIGSEGV(内存访问异常)

SIGSTKFLT(协处理器栈异常)

SIGTRAP(断点/跟踪陷阱异常，通常由调试断点或__builtin_trap()等触发，平时不常见)

信号处理函数为:

258/*
259 * Catches fatal signals so we can ask debuggerd to ptrace us before
260 * we crash.
261 */
262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
263  // It's possible somebody cleared the SA_SIGINFO flag, which would mean
264  // our "info" arg holds an undefined value.
265  if (!have_siginfo(signal_number)) {
266    info = nullptr;
267  }
268
269  log_signal_summary(signal_number, info);
270
271  send_debuggerd_packet(info); //发送请求 第一次接受到信号是向debuggerd服务端发送请求，等待回应表示链接上了
272
273  // We need to return from the signal handler so that debuggerd can dump the
274  // thread that crashed, but returning here does not guarantee that the signal
275  // will be thrown again, even for SIGSEGV and friends, since the signal could
276  // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to
277  // preserve the SA_SIGINFO contents.
278  signal(signal_number, SIG_DFL); //将信号处理函数置空
279
280  struct siginfo si;
281  if (!info) {
282    memset(&si, 0, sizeof(si));
283    si.si_code = SI_USER;
284    si.si_pid = getpid();
285    si.si_uid = getuid();
286    info = &si;
287  } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
288    // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
289    // that contain commit 66dd34a (3.9+). The manpage claims to only allow
290    // negative si_code values that are not SI_TKILL, but 66dd34a changed the
291    // check to allow all si_code values in calls coming from inside the house.
292  }
293
294  int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info); //给自己的相关线程再发送一次信号
295  if (rc != 0) {
296    __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297                      strerror(errno));
298    _exit(0);
299  }
300}

客户端向debuggerd发送信息，并等待回应，通过socket的write & read

208static void send_debuggerd_packet(siginfo_t* info) {
209  // Mutex to prevent multiple crashing threads from trying to talk
210  // to debuggerd at the same time.
211  static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER;
212  int ret = pthread_mutex_trylock(&crash_mutex);
213  if (ret != 0) {
214    if (ret == EBUSY) {
215      __libc_format_log(ANDROID_LOG_INFO, "libc",
216          "Another thread contacted debuggerd first; not contacting debuggerd.");
217      // This will never complete since the lock is never released.
218      pthread_mutex_lock(&crash_mutex);
219    } else {
220      __libc_format_log(ANDROID_LOG_INFO, "libc",
221                        "pthread_mutex_trylock failed: %s", strerror(ret));
222    }
223    return;
224  }
225
226  int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC);
227  if (s == -1) {
228    __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s",
229                      strerror(errno));
230    return;
231  }
232
233  // debuggerd knows our pid from the credentials on the
234  // local socket but we need to tell it the tid of the crashing thread.
235  // debuggerd will be paranoid and verify that we sent a tid
236  // that's actually in our process.
237  debugger_msg_t msg;
238  msg.action = DEBUGGER_ACTION_CRASH;
239  msg.tid = gettid();
240  msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);
241  msg.original_si_code = (info != nullptr) ? info->si_code : 0;
242  ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg)));
243  if (ret == sizeof(msg)) {
244    char debuggerd_ack;
245    ret = TEMP_FAILURE_RETRY(read(s, &debuggerd_ack, 1));
246    int saved_errno = errno;
247    notify_gdb_of_libraries();
248    errno = saved_errno;
249  } else {
250    // read or write failed -- broken connection?
251    __libc_format_log(ANDROID_LOG_FATAL, "libc", "Failed while talking to debuggerd: %s",
252                      strerror(errno));
253  }
254
255  close(s);
256}

debuggerd服务端启动，dump流程

debuggerd有两种启动方式：带参数执行（如 debuggerd -b <tid>）时，只对指定线程做一次显式dump；不带参数执行时，作为守护进程（服务端）启动。前者我们暂且不去说它，这里只说正常的守护进程启动模式

941int main(int argc, char** argv) {
942  union selinux_callback cb;
943  if (argc == 1) {
944    cb.func_audit = audit_callback;
945    selinux_set_callback(SELINUX_CB_AUDIT, cb);
946    cb.func_log = selinux_log_callback;
947    selinux_set_callback(SELINUX_CB_LOG, cb);
948    return do_server();
949  }
950
951  bool dump_backtrace = false;
952  bool have_tid = false;
953  pid_t tid = 0;
954  for (int i = 1; i < argc; i++) {
955    if (!strcmp(argv[i], "-b")) {
956      dump_backtrace = true;
957    } else if (!have_tid) {
958      tid = atoi(argv[i]);
959      have_tid = true;
960    } else {
961      usage();
962      return 1;
963    }
964  }
965  if (!have_tid) {
966    usage();
967    return 1;
968  }
969  return do_explicit_dump(tid, dump_backtrace);
970}

启动一个debuggerd服务端

849static int do_server() {
850  // debuggerd crashes can't be reported to debuggerd.
851  // Reset all of the crash handlers.
852  signal(SIGABRT, SIG_DFL);
853  signal(SIGBUS, SIG_DFL);
854  signal(SIGFPE, SIG_DFL);
855  signal(SIGILL, SIG_DFL);
856  signal(SIGSEGV, SIG_DFL);
857#ifdef SIGSTKFLT
858  signal(SIGSTKFLT, SIG_DFL);
859#endif
860  signal(SIGTRAP, SIG_DFL);
861
862  // Ignore failed writes to closed sockets
863  signal(SIGPIPE, SIG_IGN); //忽略SIGPIPE：向已关闭的socket写入失败时，不让debuggerd因此被杀死
864
865  // Block SIGCHLD so we can sigtimedwait for it.
866  sigset_t sigchld;
867  sigemptyset(&sigchld);
868  sigaddset(&sigchld, SIGCHLD);
869  sigprocmask(SIG_SETMASK, &sigchld, nullptr);
870
871  int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,
872                              SOCK_STREAM | SOCK_CLOEXEC); //创建一个服务端，等待客户端连接
873  if (s == -1) return 1;
874
875  typedef void (*NativeDebugInit)(void);
876  static NativeDebugInit s_func_ptr = NULL;
877  if(!s_func_ptr) {
878    void* handle = dlopen("libmiuindbg.so",RTLD_NOW);
879    if(handle) {
880      s_func_ptr = (NativeDebugInit)dlsym(handle,"hook_context_do_hook");
881    }
882  }
883
884  if(s_func_ptr) {
885    s_func_ptr();
886  }
887
888  // Fork a process that stays root, and listens on a pipe to pause and resume the target.
889  if (!start_signal_sender()) {
890    ALOGE("debuggerd: failed to fork signal sender");
891    return 1;
892  }
893
894  ALOGI("debuggerd: starting\n");
895
896  for (;;) {
897    sockaddr_storage ss;
898    sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
899    socklen_t alen = sizeof(ss);
900
901    ALOGV("waiting for connection\n");
902    int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
903    if (fd == -1) {
904      ALOGE("accept failed: %s\n", strerror(errno));
905      continue;
906    }
907
908    handle_request(fd); //处理客户端的请求
909  }
910  return 0;
911}

处理客户端发来的请求

808static void handle_request(int fd) {
809  ALOGV("handle_request(%d)\n", fd);
810
811  ScopedFd closer(fd);
812  debugger_request_t request;
813  memset(&request, 0, sizeof(request));
814  int status = read_request(fd, &request); //读取客户端的请求
815  if (status != 0) {
816    return;
817  }
818
819  ALOGW("debuggerd: handling request: pid=%d uid=%d gid=%d tid=%d\n", request.pid, request.uid,
820        request.gid, request.tid);
821
822#if defined(__LP64__)
823  // On 64 bit systems, requests to dump 32 bit and 64 bit tids come
824  // to the 64 bit debuggerd. If the process is a 32 bit executable,
825  // redirect the request to the 32 bit debuggerd.
826  if (is32bit(request.tid)) {
827    // Only dump backtrace and dump tombstone requests can be redirected.
828    if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE ||
829        request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
830      redirect_to_32(fd, &request);
831    } else {
832      ALOGE("debuggerd: Not allowed to redirect action %d to 32 bit debuggerd\n", request.action);
833    }
834    return;
835  }
836#endif
837
838  // Fork a child to handle the rest of the request.
839  pid_t fork_pid = fork();
840  if (fork_pid == -1) {
841    ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
842  } else if (fork_pid == 0) {
843    worker_process(fd, request); //处理request
844  } else {
845    monitor_worker_process(fork_pid, request);
846  }
847}

read客户端发来的信息

197static int read_request(int fd, debugger_request_t* out_request) {
198  ucred cr;
199  socklen_t len = sizeof(cr);
200  int status = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
201  if (status != 0) {
202    ALOGE("cannot get credentials");
203    return -1;
204  }
205
206  ALOGV("reading tid");
207  fcntl(fd, F_SETFL, O_NONBLOCK);
208
209  pollfd pollfds[1];
210  pollfds[0].fd = fd;
211  pollfds[0].events = POLLIN;
212  pollfds[0].revents = 0;
213  status = TEMP_FAILURE_RETRY(poll(pollfds, 1, 3000)); //轮询fd句柄
214  if (status != 1) {
215    ALOGE("timed out reading tid (from pid=%d uid=%d)\n", cr.pid, cr.uid);
216    return -1;
217  }
218
219  debugger_msg_t msg;
220  memset(&msg, 0, sizeof(msg));
221  status = TEMP_FAILURE_RETRY(read(fd, &msg, sizeof(msg))); //读取客户端信息
222  if (status < 0) {
223    ALOGE("read failure? %s (pid=%d uid=%d)\n", strerror(errno), cr.pid, cr.uid);
224    return -1;
225  }
226  if (status != sizeof(debugger_msg_t)) {
227    ALOGE("invalid crash request of size %d (from pid=%d uid=%d)\n", status, cr.pid, cr.uid);
228    return -1;
229  }
230
231  out_request->action = static_cast<debugger_action_t>(msg.action);
232  out_request->tid = msg.tid;
233  out_request->pid = cr.pid;
234  out_request->uid = cr.uid;
235  out_request->gid = cr.gid;
236  out_request->abort_msg_address = msg.abort_msg_address;
237  out_request->original_si_code = msg.original_si_code;
238
239  if (msg.action == DEBUGGER_ACTION_CRASH) {
240    // Ensure that the tid reported by the crashing process is valid.
241    // This check needs to happen again after ptracing the requested thread to prevent a race.
242    if (!pid_contains_tid(out_request->pid, out_request->tid)) {
243      ALOGE("tid %d does not exist in pid %d. ignoring debug request\n", out_request->tid,
244            out_request->pid);
245      return -1;
246    }
247  } else if (cr.uid == 0 || (cr.uid == AID_SYSTEM && msg.action == DEBUGGER_ACTION_DUMP_BACKTRACE)) {
248    // Only root or system can ask us to attach to any process and dump it explicitly.
249    // However, system is only allowed to collect backtraces but cannot dump tombstones.
250    status = get_process_info(out_request->tid, &out_request->pid,
251                              &out_request->uid, &out_request->gid);
252    if (status < 0) {
253      ALOGE("tid %d does not exist. ignoring explicit dump request\n", out_request->tid);
254      return -1;
255    }
256
257    if (!selinux_action_allowed(fd, out_request))
258      return -1;
259  } else {
260    // No one else is allowed to dump arbitrary processes.
261    return -1;
262  }
263  return 0;
264}

整体的dump流程

566static void worker_process(int fd, debugger_request_t& request) {
567  // Open the tombstone file if we need it.
568  std::string tombstone_path;
569  int tombstone_fd = -1;
570  switch (request.action) {
571    case DEBUGGER_ACTION_DUMP_TOMBSTONE:
572    case DEBUGGER_ACTION_CRASH:
573      tombstone_fd = open_tombstone(&tombstone_path); 
574      if (tombstone_fd == -1) {
575        ALOGE("debuggerd: failed to open tombstone file: %s\n", strerror(errno));
576        exit(1);
577      }
578      break;
579
580    case DEBUGGER_ACTION_DUMP_BACKTRACE:
581      break;
582
583    default:
584      ALOGE("debuggerd: unexpected request action: %d", request.action);
585      exit(1);
586  }
587
588  // At this point, the thread that made the request is blocked in
589  // a read() call.  If the thread has crashed, then this gives us
590  // time to PTRACE_ATTACH to it before it has a chance to really fault.
591  //
592  // The PTRACE_ATTACH sends a SIGSTOP to the target process, but it
593  // won't necessarily have stopped by the time ptrace() returns.  (We
594  // currently assume it does.)  We write to the file descriptor to
595  // ensure that it can run as soon as we call PTRACE_CONT below.
596  // See details in bionic/libc/linker/debugger.c, in function
597  // debugger_signal_handler().
598
599  // Attach to the target process.
        //通过ptrace监控子进程(要crash的应用进程)，此时debuggerd变为其父进程，向应用进程发送sigstop；以后应用进程接受到的signal会先发到父进程
600  if (!ptrace_attach_thread(request.pid, request.tid)) {
601    ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
602    exit(1);
603  }
604
605  // DEBUGGER_ACTION_CRASH requests can come from arbitrary processes and the tid field in the
606  // request is sent from the other side. If an attacker can cause a process to be spawned with the
607  // pid of their process, they could trick debuggerd into dumping that process by exiting after
608  // sending the request. Validate the trusted request.uid/gid to defend against this.
609  if (request.action == DEBUGGER_ACTION_CRASH) {
610    pid_t pid;
611    uid_t uid;
612    gid_t gid;
613    if (get_process_info(request.tid, &pid, &uid, &gid) != 0) {
614      ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid);
615      exit(1);
616    }
617
618    if (pid != request.pid || uid != request.uid || gid != request.gid) {
619      ALOGE(
620        "debuggerd: attached task %d does not match request: "
621        "expected pid=%d,uid=%d,gid=%d, actual pid=%d,uid=%d,gid=%d",
622        request.tid, request.pid, request.uid, request.gid, pid, uid, gid);
623      exit(1);
624    }
625  }
626
627  // Don't attach to the sibling threads if we want to attach gdb.
628  // Supposedly, it makes the process less reliable.
629  bool attach_gdb = should_attach_gdb(request);
630  if (attach_gdb) {
631    // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges.
632    if (init_getevent() != 0) {
633      ALOGE("debuggerd: failed to initialize input device, not waiting for gdb");
634      attach_gdb = false;
635    }
636
637  }
638
639  std::set<pid_t> siblings;
640  if (!attach_gdb) {
641    ptrace_siblings(request.pid, request.tid, siblings);
642  }
643
644  // Generate the backtrace map before dropping privileges.
645  std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
646
647  int amfd = -1;
648  std::unique_ptr<std::string> amfd_data;
649  if (request.action == DEBUGGER_ACTION_CRASH) {
650    // Connect to the activity manager before dropping privileges.
651    amfd = activity_manager_connect();
652    amfd_data.reset(new std::string);
653  }
654
655  // Collect the list of open files.
656  OpenFilesList open_files;
657  populate_open_files_list(request.pid, &open_files);
658
659  bool succeeded = false;
660
661  // Now that we've done everything that requires privileges, we can drop them.
662  if (!drop_privileges()) {
663    ALOGE("debuggerd: failed to drop privileges, exiting");
664    _exit(1);
665  }
666
667  int crash_signal = SIGKILL;
668  succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
669                           &crash_signal, &open_files, amfd_data.get());
670  if (succeeded) {
671    if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
672      if (!tombstone_path.empty()) {
673        android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length()); //将生成的tombstone文件路径通过socket写回给请求方
674      }
675    }
676  }
677
678  if (attach_gdb || request.action == DEBUGGER_ACTION_CRASH) {
679    // Before detach we must send SIGSTOP to the target.
680    // Tell the signal process to send SIGSTOP to the target.
681    if (!send_signal(request.pid, 0, SIGSTOP)) {
682      ALOGE("debuggerd: failed to stop process for gdb attach: %s", strerror(errno));
683      attach_gdb = false;
684    }
685  }
686
687  if (!attach_gdb) {
688    // Tell the Activity Manager about the crashing process. If we are
689    // waiting for gdb to attach, do not send this or Activity Manager
690    // might kill the process before anyone can attach.
691    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
692  }
693
694  if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) { //detach客户端
695    ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));
696  }
697
698  for (pid_t sibling : siblings) {
699    ptrace(PTRACE_DETACH, sibling, 0, 0);
700  }
701
702  // Send the signal back to the process if it crashed and we're not waiting for gdb.
703  if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
704    if (!send_signal(request.pid, request.tid, crash_signal)) {
705      ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
706    }
707  }
708
709  // Wait for gdb, if requested.
710  if (attach_gdb) {
711    wait_for_user_action(request);
712
713    // Now tell the activity manager about this process.
714    activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
715
716    // Tell the signal process to send SIGCONT to the target.
717    if (!send_signal(request.pid, 0, SIGCONT)) {
718      ALOGE("debuggerd: failed to resume process %d: %s", request.pid, strerror(errno));
719    }
720
721    uninit_getevent();
722  }
723
724  close(amfd);
725
726  exit(!succeeded);
727}

perform_dump:进行dump的过程

484static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
485                         BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
486                         int* crash_signal, OpenFilesList* open_files, std::string* amfd_data) {
487  if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) { //向应用进程（客户端返回一个值），表示连上了，可以开始dump了
488    ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));
489    return false;
490  }
491
492  int total_sleep_time_usec = 0;
493  while (true) {
494    int signal = wait_for_signal(request.tid, &total_sleep_time_usec); //因为此时已经被ptrace_attach了，所以第二次客户端发给自己的信号会在这里被接收
495    switch (signal) {
496      case -1:
497        ALOGE("debuggerd: timed out waiting for signal");
498        return false;
499
500      case SIGSTOP: //这里是attach时向客户端发送的sigstop信号
501        if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
502          ALOGV("debuggerd: stopped -- dumping to tombstone");
503          engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
504                            request.original_si_code, request.abort_msg_address, open_files, amfd_data); 
505        } else if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE) {
506          ALOGV("debuggerd: stopped -- dumping to fd");
507          dump_backtrace(fd, backtrace_map, request.pid, request.tid, siblings, nullptr);
508        } else {
509          ALOGV("debuggerd: stopped -- continuing");
              //此时通过debuggerd用PTRACE_CONT命令让应用继续执行，
              // 这样应用的read系统调用就可以返回到用户态，继续执行debuggerd_signal_handler()
               // 此时，debuggerd进入下一次循环，block在wait_for_signal，继续等待应用的下一个信号
510          if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {
511            ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));
512            return false;
513          }
514          continue;  // loop again //注意，这里是继续循环，等待客户端的第二次信号
515        }
516        break;
517
518      case SIGABRT:
519      case SIGBUS:
520      case SIGFPE:
521      case SIGILL:
522      case SIGSEGV:
523#ifdef SIGSTKFLT
524      case SIGSTKFLT:
525#endif
526      case SIGSYS:
527      case SIGTRAP:
528        ALOGV("stopped -- fatal signal\n");
529        *crash_signal = signal;
530        engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
531                          request.original_si_code, request.abort_msg_address, open_files, amfd_data); //客户端发的第二次信号被debuggerd接受，开始dump
532        break; //dump完之后跳出循环，执行下面的操作
533
534      default:
535        ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
536        break;
537    }
538    break;
539  }
540
541  return true;
542}

本质上有两次通信；
第一次通信是进程的signal handler通过socket与已启动的debuggerd服务端进行通信：客户端向debuggerd写request，服务端获取request后attach到客户端（成为其ptrace意义上的父进程），attach会向客户端发送一个SIGSTOP信号，随后服务端返回一个值表示收到；debuggerd在wait_for_signal中收到该SIGSTOP后，用PTRACE_CONT命令让应用继续执行，这样应用的read系统调用就可以返回到用户态，继续执行debuggerd_signal_handler；debuggerd则进入下一次循环，block在wait_for_signal，继续等待应用的下一个信号

客户端收到答复之后，将注册的信号处理函数去掉，（这样再接收到信号就可以正常的走kernel流程了），然后再次发送一个信号

这里就是第二次通信，信号被父进程debuggerd拦截，开始dump操作，dump操作完后进行detach操作，不再作为客户端的父进程

此时客户端会进入到默认的信号处理逻辑中

2173int get_signal(struct ksignal *ksig)
2174{
2175    struct sighand_struct *sighand = current->sighand;
2176    struct signal_struct *signal = current->signal;
2177    int signr;
2178
2179    if (unlikely(current->task_works))
2180        task_work_run();
2181
2182    if (unlikely(uprobe_deny_signal()))
2183        return 0;
2184
2185    /*
2186     * Do this once, we can't return to user-mode if freezing() == T.
2187     * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2188     * thus do not need another check after return.
2189     */
2190    try_to_freeze();
2191
2192relock:
2193    spin_lock_irq(&sighand->siglock);
2194    /*
2195     * Every stopped thread goes here after wakeup. Check to see if
2196     * we should notify the parent, prepare_signal(SIGCONT) encodes
2197     * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2198     */
2199    if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2200        int why;
2201
2202        if (signal->flags & SIGNAL_CLD_CONTINUED)
2203            why = CLD_CONTINUED;
2204        else
2205            why = CLD_STOPPED;
2206
2207        signal->flags &= ~SIGNAL_CLD_MASK;
2208
2209        spin_unlock_irq(&sighand->siglock);
2210
2211        /*
2212         * Notify the parent that we're continuing.  This event is
2213         * always per-process and doesn't make whole lot of sense
2214         * for ptracers, who shouldn't consume the state via
2215         * wait(2) either, but, for backward compatibility, notify
2216         * the ptracer of the group leader too unless it's gonna be
2217         * a duplicate.
2218         */
2219        read_lock(&tasklist_lock);
2220        do_notify_parent_cldstop(current, false, why);
2221
2222        if (ptrace_reparented(current->group_leader))
2223            do_notify_parent_cldstop(current->group_leader,
2224                        true, why);
2225        read_unlock(&tasklist_lock);
2226
2227        goto relock;
2228    }
2229
2230    for (;;) {
2231        struct k_sigaction *ka;
2232
2233        if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2234            do_signal_stop(0))
2235            goto relock;
2236
2237        if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2238            do_jobctl_trap();
2239            spin_unlock_irq(&sighand->siglock);
2240            goto relock;
2241        }
2242
2243        signr = dequeue_signal(current, &current->blocked, &ksig->info);
2244
2245        if (!signr)
2246            break; /* will return 0 */
2247
2248        if (unlikely(current->ptrace) && signr != SIGKILL) {
2249            signr = ptrace_signal(signr, &ksig->info);
2250            if (!signr)
2251                continue;
2252        }
2253
2254        ka = &sighand->action[signr-1];
2255
2256        /* Trace actually delivered signals. */
2257        trace_signal_deliver(signr, &ksig->info, ka);
2258
2259        if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
2260            continue;
2261        if (ka->sa.sa_handler != SIG_DFL) {
2262            /* Run the handler.  */
2263            ksig->ka = *ka;
2264
2265            if (ka->sa.sa_flags & SA_ONESHOT)
2266                ka->sa.sa_handler = SIG_DFL;
2267
2268            break; /* will return non-zero "signr" value */
2269        }
2270
2271        /*
2272         * Now we are doing the default action for this signal.
2273         */
2274        if (sig_kernel_ignore(signr)) /* Default is nothing. */
2275            continue;
2276
2277        /*
2278         * Global init gets no signals it doesn't want.
2279         * Container-init gets no signals it doesn't want from same
2280         * container.
2281         *
2282         * Note that if global/container-init sees a sig_kernel_only()
2283         * signal here, the signal must have been generated internally
2284         * or must have come from an ancestor namespace. In either
2285         * case, the signal cannot be dropped.
2286         */
2287        if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
2288                !sig_kernel_only(signr))
2289            continue;
2290
2291        if (sig_kernel_stop(signr)) {
2292            /*
2293             * The default action is to stop all threads in
2294             * the thread group.  The job control signals
2295             * do nothing in an orphaned pgrp, but SIGSTOP
2296             * always works.  Note that siglock needs to be
2297             * dropped during the call to is_orphaned_pgrp()
2298             * because of lock ordering with tasklist_lock.
2299             * This allows an intervening SIGCONT to be posted.
2300             * We need to check for that and bail out if necessary.
2301             */
2302            if (signr != SIGSTOP) {
2303                spin_unlock_irq(&sighand->siglock);
2304
2305                /* signals can be posted during this window */
2306
2307                if (is_current_pgrp_orphaned())
2308                    goto relock;
2309
2310                spin_lock_irq(&sighand->siglock);
2311            }
2312
2313            if (likely(do_signal_stop(ksig->info.si_signo))) {
2314                /* It released the siglock.  */
2315                goto relock;
2316            }
2317
2318            /*
2319             * We didn't actually stop, due to a race
2320             * with SIGCONT or something like that.
2321             */
2322            continue;
2323        }
2324
2325        spin_unlock_irq(&sighand->siglock);
2326
2327        /*
2328         * Anything else is fatal, maybe with a core dump.
2329         */
2330        current->flags |= PF_SIGNALED;
2331
2332        if (sig_kernel_coredump(signr)) {
2333            if (print_fatal_signals)
2334                print_fatal_signal(ksig->info.si_signo);
2335            proc_coredump_connector(current);
2336            /*
2337             * If it was able to dump core, this kills all
2338             * other threads in the group and synchronizes with
2339             * their demise.  If we lost the race with another
2340             * thread getting here, it set group_exit_code
2341             * first and our do_group_exit call below will use
2342             * that value and ignore the one we pass it.
2343             */
2344            do_coredump(&ksig->info);
2345        }
2346
2347        /*
2348         * Death signals, no core dump.
2349         */
2350        do_group_exit(ksig->info.si_signo);
2351        /* NOTREACHED */
2352    }
2353    spin_unlock_irq(&sighand->siglock);
2354
2355    ksig->sig = signr;
2356    return ksig->sig > 0;
2357}

412#define sig_kernel_coredump(sig) \
413 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))

398#define SIG_KERNEL_COREDUMP_MASK (\
399        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
400 rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
401        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
402 rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
403        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
404 SIGEMT_MASK                                    )

可见coredump响应的信号比tombstone多，tombstone响应的信号基本是coredump的子集（SIGSTKFLT除外，它的默认动作是terminate而不是coredump），能响应coredump的信号如下，参考default action列表：

 *      +--------------------+------------------+
 *      |  POSIX signal      |  default action  |
 *      +--------------------+------------------+
 *      |  SIGHUP            |  terminate       |
 *      |  SIGINT            |  terminate       |
 *      |  SIGQUIT           |  coredump        |
 *      |  SIGILL            |  coredump        |
 *      |  SIGTRAP           |  coredump        |
 *      |  SIGABRT/SIGIOT    |  coredump        |
 *      |  SIGBUS            |  coredump        |
 *      |  SIGFPE            |  coredump        |
 *      |  SIGKILL           |  terminate(+)    |
 *      |  SIGUSR1           |  terminate       |
 *      |  SIGSEGV           |  coredump        |
 *      |  SIGUSR2           |  terminate       |
 *      |  SIGPIPE           |  terminate       |
 *      |  SIGALRM           |  terminate       |
 *      |  SIGTERM           |  terminate       |
 *      |  SIGCHLD           |  ignore          |
 *      |  SIGCONT           |  ignore(*)       |
 *      |  SIGSTOP           |  stop(*)(+)      |
 *      |  SIGTSTP           |  stop(*)         |
 *      |  SIGTTIN           |  stop(*)         |
 *      |  SIGTTOU           |  stop(*)         |
 *      |  SIGURG            |  ignore          |
 *      |  SIGXCPU           |  coredump        |
 *      |  SIGXFSZ           |  coredump        |
 *      |  SIGVTALRM         |  terminate       |
 *      |  SIGPROF           |  terminate       |
 *      |  SIGPOLL/SIGIO     |  terminate       |
 *      |  SIGSYS/SIGUNUSED  |  coredump        |
 *      |  SIGSTKFLT         |  terminate       |
 *      |  SIGWINCH          |  ignore          |
 *      |  SIGPWR            |  terminate       |
 *      |  SIGRTMIN-SIGRTMAX |  terminate       |
 *      +--------------------+------------------+
 *      |  non-POSIX signal  |  default action  |
 *      +--------------------+------------------+
 *      |  SIGEMT            |  coredump        |
 *      +--------------------+------------------+

那么如何为tombstone添加一个信号呢？

拓展

debuggerd_init打不出log?

原因：
bionic/linker/linker_main.cpp

/*
211 * This code is called after the linker has linked itself and
212 * fixed it's own GOT. It is safe to make references to externs
213 * and other non-local data at this point.
214 */
215static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args) {
216  ProtectedDataGuard guard;
217
218#if TIMING
219  struct timeval t0, t1;
220  gettimeofday(&t0, 0);
221#endif
222
223  // Sanitize the environment.
224  __libc_init_AT_SECURE(args);
225
226  // Initialize system properties
227  __system_properties_init(); // may use 'environ'
228
229  // Register the debuggerd signal handler.
230#ifdef __ANDROID__
231  debuggerd_callbacks_t callbacks = {
232    .get_abort_message = []() {
233      return g_abort_message;
234    },
235    .post_dump = &notify_gdb_of_libraries,
236  };
237  debuggerd_init(&callbacks); //此时LD_DEBUG还没有初始化
238#endif
239
240  g_linker_logger.ResetState();
241
242  // Get a few environment variables.
243  const char* LD_DEBUG = getenv("LD_DEBUG");
244  if (LD_DEBUG != nullptr) {
245    g_ld_debug_verbosity = atoi(LD_DEBUG);
246  }

bionic/linker/linker_debug.h

63#if LINKER_DEBUG_TO_LOG
64#define _PRINTVF(v, x...) \
65    do { \
66      if (g_ld_debug_verbosity > (v)) async_safe_format_log(5-(v), "linker", x); \
67    } while (0)
68#else /* !LINKER_DEBUG_TO_LOG */
69#define _PRINTVF(v, x...) \
70    do { \
71      if (g_ld_debug_verbosity > (v)) { async_safe_format_fd(1, x); write(1, "\n", 1); } \
72    } while (0)
73#endif /* !LINKER_DEBUG_TO_LOG */
74
75#define PRINT(x...)          _PRINTVF(-1, x)
76#define INFO(x...)           _PRINTVF(0, x)
77#define TRACE(x...)          _PRINTVF(1, x)

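所以在debuggerd_init中使用INFO等宏时，g_ld_debug_verbosity还没有根据LD_DEBUG环境变量赋值，不满足宏里 g_ld_debug_verbosity > (v) 的级别判断，因此打不出log；可以直接用async_safe_format_log进行打印，就一定能打出来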

tombstone与debuggerd相关流程

相关流程

客户端流程

debuggerd服务端启动，dump流程

拓展

猜你喜欢

热点阅读