unit.cc 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. /*
  2. * Copyright [2021] JD.com, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. *
  16. */
  17. #include <sys/wait.h>
  18. #include <errno.h>
  19. #include "config/config.h"
  20. #include "unit.h"
  21. #include "log/log.h"
  22. #include "stat_alarm_reporter.h"
  23. #include <sstream>
  24. #include <dtc_global.h>
  25. WatchDogObject::~WatchDogObject()
  26. {
  27. }
  28. int WatchDogObject::attach_watch_dog(WatchDogUnit *u)
  29. {
  30. if (u && watchdog_object_owner_ == NULL)
  31. watchdog_object_owner_ = u;
  32. return watchdog_object_owner_ ? watchdog_object_owner_->attach_process(this) : -1;
  33. }
  34. void WatchDogObject::exited_notify(int retval)
  35. {
  36. delete this;
  37. }
  38. void WatchDogObject::killed_notify(int signo, int coredumped)
  39. {
  40. report_kill_alarm(signo, coredumped);
  41. delete this;
  42. }
  43. void WatchDogObject::report_kill_alarm(int signo, int coredumped)
  44. {
  45. if (!ALARM_REPORTER->init_alarm_cfg(std::string(ALARM_CONF_FILE), true)) {
  46. log4cplus_error("init alarm conf file fail");
  47. return;
  48. }
  49. ALARM_REPORTER->set_time_out(1);
  50. std::stringstream oss;
  51. oss << "child process[" << watchdog_object_pid_ << "][ " << watchdog_object_name_ << " ]killed by signal " << signo;
  52. if (coredumped)
  53. oss << " core dumped";
  54. ALARM_REPORTER->report_alarm(oss.str());
  55. }
  56. WatchDogUnit::WatchDogUnit()
  57. : pid_count_(0){};
  58. WatchDogUnit::~WatchDogUnit()
  59. {
  60. pidmap_t::iterator i;
  61. for (i = pid_map_watchdog_object_.begin(); i != pid_map_watchdog_object_.end(); i++) {
  62. WatchDogObject *obj = i->second;
  63. delete obj;
  64. }
  65. };
  66. int WatchDogUnit::check_watchdog()
  67. {
  68. while (1) {
  69. int status;
  70. int pid = waitpid(-1, &status, WNOHANG);
  71. int err = errno;
  72. if (pid < 0) {
  73. switch (err) {
  74. case EINTR:
  75. case ECHILD:
  76. break;
  77. default:
  78. log4cplus_info("wait() return pid %d errno %d", pid, err);
  79. break;
  80. }
  81. break;
  82. } else if (pid == 0) {
  83. break;
  84. } else {
  85. pidmap_t::iterator itr = pid_map_watchdog_object_.find(pid);
  86. if (itr == pid_map_watchdog_object_.end()) {
  87. log4cplus_info("wait() return unknown pid %d status 0x%x", pid, status);
  88. } else {
  89. WatchDogObject *obj = itr->second;
  90. const char *const name = obj->Name();
  91. pid_map_watchdog_object_.erase(itr);
  92. /* special exit value return-ed by CrashProtector */
  93. if (WIFEXITED(status) && WEXITSTATUS(status) == 85 && strncmp(name, "main", 5) == 0) {
  94. /* treat it as a fake SIGSEGV */
  95. status = W_STOPCODE(SIGSEGV);
  96. }
  97. if (WIFSIGNALED(status)) {
  98. const int sig = WTERMSIG(status);
  99. const int core = WCOREDUMP(status);
  100. log4cplus_fatal("%s: killed by signal %d", name, sig);
  101. log4cplus_error("child %.16s pid %d killed by signal %d%s",
  102. name, pid, sig,
  103. core ? " (core dumped)" : "");
  104. pid_count_--;
  105. obj->killed_notify(sig, core);
  106. } else if (WIFEXITED(status)) {
  107. const int retval = (signed char)WEXITSTATUS(status);
  108. if (retval == 0)
  109. log4cplus_debug("child %.16s pid %d exit status %d",
  110. name, pid, retval);
  111. else
  112. log4cplus_info("child %.16s pid %d exit status %d",
  113. name, pid, retval);
  114. pid_count_--;
  115. obj->exited_notify(retval);
  116. }
  117. }
  118. }
  119. }
  120. return pid_count_;
  121. }
  122. int WatchDogUnit::attach_process(WatchDogObject *obj)
  123. {
  124. const int pid = obj->watchdog_object_pid_;
  125. pidmap_t::iterator itr = pid_map_watchdog_object_.find(pid);
  126. if (itr != pid_map_watchdog_object_.end() || pid <= 1)
  127. return -1;
  128. pid_map_watchdog_object_[pid] = obj;
  129. pid_count_++;
  130. return 0;
  131. }
  132. int WatchDogUnit::kill_allwatchdog()
  133. {
  134. pidmap_t::iterator i;
  135. int n = 0;
  136. for (i = pid_map_watchdog_object_.begin(); i != pid_map_watchdog_object_.end(); i++) {
  137. WatchDogObject *obj = i->second;
  138. if (obj->get_watchdog_pid() > 1) {
  139. log4cplus_debug("killing child %.16s pid %d SIGTERM",
  140. obj->Name(), obj->get_watchdog_pid());
  141. kill(obj->get_watchdog_pid(), SIGTERM);
  142. n++;
  143. }
  144. }
  145. return n;
  146. }
  147. int WatchDogUnit::force_kill_allwatchdog()
  148. {
  149. pidmap_t::iterator i;
  150. int n = 0;
  151. for (i = pid_map_watchdog_object_.begin(); i != pid_map_watchdog_object_.end(); i++) {
  152. WatchDogObject *obj = i->second;
  153. if (obj->get_watchdog_pid() > 1) {
  154. log4cplus_error("child %.16s pid %d didn't exit in timely, sending SIGKILL.",
  155. obj->Name(), obj->get_watchdog_pid());
  156. kill(obj->get_watchdog_pid(), SIGKILL);
  157. n++;
  158. }
  159. }
  160. return n;
  161. }