目录
定位监控
代码
分析
备注
camera监控
代码
分析
功能安全监控
代码
分析
CheckSafety函数分析
RunOnce 函数分析
记录功能监控
代码
分析
SmartRecorderStatus proto
状态的上报位置分析
监控信息汇总服务
代码
分析
// Recurrent runner that monitors the localization component.
// The periodic work is implemented in LocalizationMonitor::RunOnce.
class LocalizationMonitor : public RecurrentRunner {
 public:
  LocalizationMonitor();

  // Overrides RecurrentRunner::RunOnce.
  // current_time: time value handed in by the caller (presumably seconds;
  // confirm against RecurrentRunner).
  void RunOnce(const double current_time) override;
};
void LocalizationMonitor::RunOnce(const double current_time) {
auto manager = MonitorManager::Instance();
auto* component = apollo::common::util::FindOrNull(
*manager->GetStatus()->mutable_components(),
FLAGS_localization_component_name);
if (component == nullptr) {
// localization is not monitored in current mode, skip.
return;
}
static auto reader =
manager->CreateReader(FLAGS_localization_msf_status);
reader->Observe();
const auto status = reader->GetLatestObserved();
ComponentStatus* component_status = component->mutable_other_status();
component_status->clear_status();
if (status == nullptr) {
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,
"No LocalizationStatus received",
component_status);
return;
}
// Translate LocalizationStatus to ComponentStatus. Note that ERROR and FATAL
// will trigger safety mode in current settings.
switch (status->fusion_status()) {
case MeasureState::OK:
SummaryMonitor::EscalateStatus(ComponentStatus::OK, "", component_status);
break;
case MeasureState::WARNNING:
SummaryMonitor::EscalateStatus(
ComponentStatus::WARN,
absl::StrCat("WARNNING: ", status->state_message()),
component_status);
break;
case MeasureState::ERROR:
SummaryMonitor::EscalateStatus(
ComponentStatus::WARN,
absl::StrCat("ERROR: ", status->state_message()), component_status);
break;
case MeasureState::CRITICAL_ERROR:
SummaryMonitor::EscalateStatus(
ComponentStatus::ERROR,
absl::StrCat("CRITICAL_ERROR: ", status->state_message()),
component_status);
break;
case MeasureState::FATAL_ERROR:
SummaryMonitor::EscalateStatus(
ComponentStatus::FATAL,
absl::StrCat("FATAL_ERROR: ", status->state_message()),
component_status);
break;
default:
AFATAL << "Unknown fusion_status: " << status->fusion_status();
break;
}
}
## Check MSF Localization Status

We provide a simple way to check lidar localization, GNSS localization and fusion localization status. There are four states {NOT_VALID, NOT_STABLE, OK, VALID} for localization status. You can simply use `rostopic echo /apollo/localization/msf_status` to check localization status. If fusion_status is VALID or OK, the output of msf localization is reliable.
以上是 Apollo MSF 定位状态的判断逻辑,这些故障状态都是由定位业务模块自身设置并发出的。
下面是 modules/localization/rtk/rtk_localization.cc 中状态检测部分的代码(节选):
// Fills `localization_status` based on the INS status message `status`.
// NOTE: this is a truncated excerpt - only the header/time stamping and the
// "pos_type missing" branch are shown; the remainder of the function body is
// not visible here.
void RTKLocalization::FillLocalizationStatusMsg(
const drivers::gnss::InsStat &status,
LocalizationStatus *localization_status) {
// Stamp the outgoing header with the current clock time in seconds.
apollo::common::Header *header = localization_status->mutable_header();
double timestamp = apollo::cyber::Clock::NowInSeconds();
header->set_timestamp_sec(timestamp);
// The measurement time is copied from the INS status header timestamp.
localization_status->set_measurement_time(status.header().timestamp_sec());
// Without a pos_type the fusion status is set to ERROR and the function
// returns early.
if (!status.has_pos_type()) {
localization_status->set_fusion_status(MeasureState::ERROR);
localization_status->set_state_message(
"Error: Current Localization Status Is Missing.");
return;
}
// Recurrent runner that monitors the camera component.
class CameraMonitor : public RecurrentRunner {
 public:
  CameraMonitor();

  // Overrides RecurrentRunner::RunOnce; delegates the actual check to
  // UpdateStatus() when the camera component is monitored.
  void RunOnce(const double current_time) override;

 private:
  // Clears `status` and re-evaluates it from the configured camera topics.
  static void UpdateStatus(ComponentStatus* status);
};
// Looks up the camera component among the monitored components and, when it
// is present, recomputes its "other_status" through UpdateStatus().
void CameraMonitor::RunOnce(const double current_time) {
  auto& monitored_components =
      *MonitorManager::Instance()->GetStatus()->mutable_components();
  auto* camera = apollo::common::util::FindOrNull(monitored_components,
                                                  FLAGS_camera_component_name);
  // A missing entry means the camera is not monitored in the current mode.
  if (camera != nullptr) {
    UpdateStatus(camera->mutable_other_status());
  }
}
除了判断 camera 是否被配置为监控组件之外,核心逻辑都在 UpdateStatus 函数中:
// Recomputes the camera status from the topics listed in camera_topic_set.
//
// A topic counts as a live camera when both its reader and its latest message
// are non-null. The first live camera provides the frame id (if its header
// carries one); any further live camera seen after a frame id has been
// recorded escalates the status to ERROR. After the scan, an empty frame id
// is reported as ERROR, otherwise OK is escalated with the frame id.
void CameraMonitor::UpdateStatus(ComponentStatus* status) {
  status->clear_status();

  std::string detected_frame_id = "";
  for (const auto& camera_topic : camera_topic_set) {
    const auto& reader_and_message =
        CreateReaderAndLatestsMessage(camera_topic);
    const bool is_live = reader_and_message.first != nullptr &&
                         reader_and_message.second != nullptr;
    if (!is_live) {
      continue;
    }

    if (!detected_frame_id.empty()) {
      // A camera was already recorded, so this one is an extra.
      SummaryMonitor::EscalateStatus(
          ComponentStatus::ERROR,
          absl::StrCat("Only one camera is permitted"), status);
      continue;
    }

    const auto& header = reader_and_message.second->header();
    if (header.has_frame_id()) {
      detected_frame_id = header.frame_id();
    }
  }

  if (!detected_frame_id.empty()) {
    SummaryMonitor::EscalateStatus(
        ComponentStatus::OK,
        absl::StrCat("Detected one camera: ", detected_frame_id), status);
    return;
  }
  SummaryMonitor::EscalateStatus(
      ComponentStatus::ERROR, absl::StrCat("No camera is detected"), status);
}
static const auto camera_topic_set = std::set{
FLAGS_image_long_topic, FLAGS_camera_image_long_topic,
FLAGS_camera_image_short_topic, FLAGS_camera_front_6mm_topic,
FLAGS_camera_front_6mm_2_topic, FLAGS_camera_front_12mm_topic,
// Add more cameras here if you want to monitor.
};
absl::StrCat("Only one camera is permitted"), status);
如果 frame_id 为空(即没有检测到任何 camera),就上报 ERROR:
ComponentStatus::ERROR, absl::StrCat("No camera is detected"), status);
// Check if we need to switch to safe mode, and then
// 1. Notify driver to take action.
// 2. Trigger Guardian if no proper action was taken.
class FunctionalSafetyMonitor : public RecurrentRunner {
public:
FunctionalSafetyMonitor();
void RunOnce(const double current_time);
private:
bool CheckSafety();
};
// Drives the safety-mode state stored in the system status:
//   1. If CheckSafety() passes, all safety-mode fields are cleared.
//   2. If an emergency stop is already required, nothing changes.
//   3. Otherwise the passenger message is set; the first such call records the
//      trigger time, and later calls require an emergency stop once
//      FLAGS_safety_mode_seconds_before_estop has elapsed since that time.
void FunctionalSafetyMonitor::RunOnce(const double current_time) {
  auto* sys_status = MonitorManager::Instance()->GetStatus();

  if (CheckSafety()) {
    // Everything looks good or has been handled properly: reset the state.
    sys_status->clear_passenger_msg();
    sys_status->clear_safety_mode_trigger_time();
    sys_status->clear_require_emergency_stop();
    return;
  }

  if (sys_status->require_emergency_stop()) {
    // EStop has already been triggered.
    return;
  }

  // Unsafe and no EStop yet: ask the driver to take over.
  sys_status->set_passenger_msg("Error! Please disengage.");

  if (sys_status->has_safety_mode_trigger_time()) {
    // Safety mode was entered earlier; trigger EStop if no action was taken
    // in time.
    const auto estop_deadline = sys_status->safety_mode_trigger_time() +
                                FLAGS_safety_mode_seconds_before_estop;
    if (estop_deadline < current_time) {
      sys_status->set_require_emergency_stop(true);
    }
  } else {
    // Newly entered safety mode: remember when it started.
    sys_status->set_safety_mode_trigger_time(current_time);
  }
}
// Returns true when no safety action is required.
//
// Outside autonomous mode the check always passes. In autonomous mode, every
// HMI module and every monitored component that the current HMI mode marks as
// required_for_safety must satisfy IsSafe(); the first failure returns false.
bool FunctionalSafetyMonitor::CheckSafety() {
  auto manager = MonitorManager::Instance();

  // Safety is only enforced while self driving.
  if (!manager->IsInAutonomousMode()) {
    return true;
  }

  const auto& mode = manager->GetHMIMode();

  // HMI modules required for safety.
  const auto& hmi_modules = manager->GetStatus()->hmi_modules();
  for (const auto& entry : mode.modules()) {
    if (!entry.second.required_for_safety()) {
      continue;
    }
    const std::string& module_name = entry.first;
    if (!IsSafe(module_name, hmi_modules.at(module_name))) {
      return false;
    }
  }

  // Monitored components required for safety; judged by their summary status.
  const auto& components = manager->GetStatus()->components();
  for (const auto& entry : mode.monitored_components()) {
    if (!entry.second.required_for_safety()) {
      continue;
    }
    const std::string& component_name = entry.first;
    if (!IsSafe(component_name, components.at(component_name).summary())) {
      return false;
    }
  }

  // Nothing required for safety reported a problem.
  return true;
}
recorder monitor 是 Apollo 对记录服务的监控,方法是通过订阅 /apollo/data/recorder/status 这个 topic 获取 Recorder status。
// Recurrent runner that monitors the smart recorder component.
// The periodic work is implemented in RecorderMonitor::RunOnce.
class RecorderMonitor : public RecurrentRunner {
 public:
  RecorderMonitor();

  // Overrides RecurrentRunner::RunOnce.
  void RunOnce(const double current_time) override;
};
void RecorderMonitor::RunOnce(const double current_time) {
auto manager = MonitorManager::Instance();
auto* component = apollo::common::util::FindOrNull(
*manager->GetStatus()->mutable_components(),
FLAGS_smart_recorder_component_name);
if (component == nullptr) {
// SmartRecorder is not monitored in current mode, skip.
return;
}
static auto reader =
manager->CreateReader(FLAGS_recorder_status_topic);
reader->Observe();
const auto status = reader->GetLatestObserved();
ComponentStatus* component_status = component->mutable_other_status();
component_status->clear_status();
if (status == nullptr) {
SummaryMonitor::EscalateStatus(ComponentStatus::ERROR,
"No SmartRecorderStatus received",
component_status);
return;
}
// Translate SmartRecorderStatus to ComponentStatus. Note that ERROR and FATAL
// will trigger safety mode in current settings.
switch (status->recording_state()) {
case RecordingState::RECORDING:
SummaryMonitor::EscalateStatus(ComponentStatus::OK, "", component_status);
break;
case RecordingState::TERMINATING:
SummaryMonitor::EscalateStatus(
ComponentStatus::WARN,
absl::StrCat("WARNNING: ", status->state_message()),
component_status);
break;
case RecordingState::STOPPED:
SummaryMonitor::EscalateStatus(
ComponentStatus::OK,
absl::StrCat("STOPPED: ", status->state_message()), component_status);
break;
default:
AFATAL << "Unknown recording status: " << status->recording_state();
break;
}
}
第一步依旧是判断 recorder 是不是被配置的监控模块,如果不是就直接返回。
然后直接判断 status->recording_state():RECORDING 和 STOPPED 状态上报 OK,TERMINATING(终止)状态上报一个 WARN 级别的故障;如果没有收到任何 SmartRecorderStatus 消息,则上报 ERROR。
// Recording states carried in SmartRecorderStatus.recording_state.
// RecorderMonitor::RunOnce reports RECORDING and STOPPED as OK and
// TERMINATING as WARN.
enum RecordingState {
STOPPED = 0;
RECORDING = 1;
TERMINATING = 2;
}
// Status message of the smart recorder. RecorderMonitor::RunOnce reads
// recording_state and state_message from it.
message SmartRecorderStatus {
// Common message header.
optional apollo.common.Header header = 1;
// Current recording state (see RecordingState).
optional RecordingState recording_state = 2;
// Free-form text; the monitor appends it to the status message it reports.
optional string state_message = 3;
}
modules/data/tools/smart_recorder/realtime_record_processor.cc
我们可以在上述文件中找到recorder状态赋值情况,但是可惜apollo 中目前没有一个模块会主动填写RecordingState::TERMINATING(终止)状态。
// Summarizes the results of the other monitors and publishes the whole system
// status when it has changed (RunOnce also republishes after
// FLAGS_system_status_publish_interval has elapsed).
class SummaryMonitor : public RecurrentRunner {
 public:
  SummaryMonitor();

  // Overrides RecurrentRunner::RunOnce.
  void RunOnce(const double current_time) override;

  // Escalates `current_status` to `new_status` (with `message`) when the new
  // status has higher priority. Priority order:
  //   FATAL > ERROR > WARN > OK > UNKNOWN.
  static void EscalateStatus(const ComponentStatus::Status new_status,
                             const std::string& message,
                             ComponentStatus* current_status);

 private:
  // Fingerprint (hash of the serialized status) at the last broadcast.
  size_t system_status_fp_ = 0;
  // `current_time` value of the last broadcast.
  double last_broadcast_ = 0;
};
// Folds every component's individual statuses into its summary, then
// publishes the whole system status when its fingerprint has changed or when
// more than FLAGS_system_status_publish_interval has passed since the last
// broadcast.
void SummaryMonitor::RunOnce(const double current_time) {
  auto manager = MonitorManager::Instance();
  auto* status = manager->GetStatus();

  // Escalate the summary status to the most severe one.
  for (auto& component : *status->mutable_components()) {
    auto* summary = component.second.mutable_summary();

    const auto& process_status = component.second.process_status();
    EscalateStatus(process_status.status(), process_status.message(), summary);

    const auto& module_status = component.second.module_status();
    EscalateStatus(module_status.status(), module_status.message(), summary);

    const auto& channel_status = component.second.channel_status();
    EscalateStatus(channel_status.status(), channel_status.message(), summary);

    const auto& resource_status = component.second.resource_status();
    EscalateStatus(resource_status.status(), resource_status.message(),
                   summary);

    const auto& other_status = component.second.other_status();
    EscalateStatus(other_status.status(), other_status.message(), summary);
  }

  // Get fingerprint of current status.
  // Don't use DebugString() which has known bug on Map field. The string
  // doesn't change though the value has changed.
  std::string proto_bytes;
  status->SerializeToString(&proto_bytes);
  // std::hash needs its key type spelled out; the serialized bytes are a
  // std::string.
  const size_t new_fp = std::hash<std::string>{}(proto_bytes);

  if (system_status_fp_ != new_fp ||
      current_time - last_broadcast_ > FLAGS_system_status_publish_interval) {
    // NOTE(review): the explicit message-type template argument of
    // CreateWriter appears to be missing in this excerpt (presumably the
    // system status message type) - confirm against the original source.
    static auto writer =
        manager->CreateWriter(FLAGS_system_status_topic);

    // The header is attached only for publishing and removed again so that it
    // does not become part of the stored status (and of the next fingerprint).
    apollo::common::util::FillHeader("SystemMonitor", status);
    writer->Write(*status);
    status->clear_header();

    system_status_fp_ = new_fp;
    last_broadcast_ = current_time;
  }
}
针对前面所有的monitor 上报的故障信息,进行一个整合,然后发送到/apollo/monitor/system_status这个topic 上。