123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- /*
- * Copyright [2021] JD.com, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- */
- #include <sys/wait.h>
- #include <errno.h>
- #include "config/config.h"
- #include "unit.h"
- #include "log/log.h"
- #include "stat_alarm_reporter.h"
- #include <sstream>
- #include <dtc_global.h>
- WatchDogObject::~WatchDogObject()
- {
- }
- int WatchDogObject::attach_watch_dog(WatchDogUnit *u)
- {
- if (u && watchdog_object_owner_ == NULL)
- watchdog_object_owner_ = u;
- return watchdog_object_owner_ ? watchdog_object_owner_->attach_process(this) : -1;
- }
- void WatchDogObject::exited_notify(int retval)
- {
- delete this;
- }
- void WatchDogObject::killed_notify(int signo, int coredumped)
- {
- report_kill_alarm(signo, coredumped);
- delete this;
- }
- void WatchDogObject::report_kill_alarm(int signo, int coredumped)
- {
- if (!ALARM_REPORTER->init_alarm_cfg(std::string(ALARM_CONF_FILE), true)) {
- log4cplus_error("init alarm conf file fail");
- return;
- }
- ALARM_REPORTER->set_time_out(1);
- std::stringstream oss;
- oss << "child process[" << watchdog_object_pid_ << "][ " << watchdog_object_name_ << " ]killed by signal " << signo;
- if (coredumped)
- oss << " core dumped";
- ALARM_REPORTER->report_alarm(oss.str());
- }
- WatchDogUnit::WatchDogUnit()
- : pid_count_(0){};
- WatchDogUnit::~WatchDogUnit()
- {
- pidmap_t::iterator i;
- for (i = pid_map_watchdog_object_.begin(); i != pid_map_watchdog_object_.end(); i++) {
- WatchDogObject *obj = i->second;
- delete obj;
- }
- };
- int WatchDogUnit::check_watchdog()
- {
- while (1) {
- int status;
- int pid = waitpid(-1, &status, WNOHANG);
- int err = errno;
- if (pid < 0) {
- switch (err) {
- case EINTR:
- case ECHILD:
- break;
- default:
- log4cplus_info("wait() return pid %d errno %d", pid, err);
- break;
- }
- break;
- } else if (pid == 0) {
- break;
- } else {
- pidmap_t::iterator itr = pid_map_watchdog_object_.find(pid);
- if (itr == pid_map_watchdog_object_.end()) {
- log4cplus_info("wait() return unknown pid %d status 0x%x", pid, status);
- } else {
- WatchDogObject *obj = itr->second;
- const char *const name = obj->Name();
- pid_map_watchdog_object_.erase(itr);
- /* special exit value return-ed by CrashProtector */
- if (WIFEXITED(status) && WEXITSTATUS(status) == 85 && strncmp(name, "main", 5) == 0) {
- /* treat it as a fake SIGSEGV */
- status = W_STOPCODE(SIGSEGV);
- }
- if (WIFSIGNALED(status)) {
- const int sig = WTERMSIG(status);
- const int core = WCOREDUMP(status);
- log4cplus_fatal("%s: killed by signal %d", name, sig);
- log4cplus_error("child %.16s pid %d killed by signal %d%s",
- name, pid, sig,
- core ? " (core dumped)" : "");
- pid_count_--;
- obj->killed_notify(sig, core);
- } else if (WIFEXITED(status)) {
- const int retval = (signed char)WEXITSTATUS(status);
- if (retval == 0)
- log4cplus_debug("child %.16s pid %d exit status %d",
- name, pid, retval);
- else
- log4cplus_info("child %.16s pid %d exit status %d",
- name, pid, retval);
- pid_count_--;
- obj->exited_notify(retval);
- }
- }
- }
- }
- return pid_count_;
- }
- int WatchDogUnit::attach_process(WatchDogObject *obj)
- {
- const int pid = obj->watchdog_object_pid_;
- pidmap_t::iterator itr = pid_map_watchdog_object_.find(pid);
- if (itr != pid_map_watchdog_object_.end() || pid <= 1)
- return -1;
- pid_map_watchdog_object_[pid] = obj;
- pid_count_++;
- return 0;
- }
- int WatchDogUnit::kill_allwatchdog()
- {
- pidmap_t::iterator i;
- int n = 0;
- for (i = pid_map_watchdog_object_.begin(); i != pid_map_watchdog_object_.end(); i++) {
- WatchDogObject *obj = i->second;
- if (obj->get_watchdog_pid() > 1) {
- log4cplus_debug("killing child %.16s pid %d SIGTERM",
- obj->Name(), obj->get_watchdog_pid());
- kill(obj->get_watchdog_pid(), SIGTERM);
- n++;
- }
- }
- return n;
- }
- int WatchDogUnit::force_kill_allwatchdog()
- {
- pidmap_t::iterator i;
- int n = 0;
- for (i = pid_map_watchdog_object_.begin(); i != pid_map_watchdog_object_.end(); i++) {
- WatchDogObject *obj = i->second;
- if (obj->get_watchdog_pid() > 1) {
- log4cplus_error("child %.16s pid %d didn't exit in timely, sending SIGKILL.",
- obj->Name(), obj->get_watchdog_pid());
- kill(obj->get_watchdog_pid(), SIGKILL);
- n++;
- }
- }
- return n;
- }
|