You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
318 lines
10 KiB
318 lines
10 KiB
// Copyright 2008 Google Inc. All Rights Reserved.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// error_diag.cc: Collects device errors for analysis to more accurately
|
|
// pin-point failed component.
|
|
|
|
#include <set>
|
|
#include <list>
|
|
#include <map>
|
|
|
|
// This file must work with autoconf on its public version,
|
|
// so these includes are correct.
|
|
#include "error_diag.h"
|
|
#include "sattypes.h"
|
|
|
|
|
|
// DeviceTree constructor.
|
|
DeviceTree::DeviceTree(string name)
|
|
: parent_(0), name_(name) {
|
|
pthread_mutex_init(&device_tree_mutex_, NULL);
|
|
}
|
|
|
|
// DeviceTree destructor.
|
|
DeviceTree::~DeviceTree() {
|
|
// Deallocate subtree devices.
|
|
for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
|
|
itr != subdevices_.end();
|
|
++itr) {
|
|
delete itr->second;
|
|
}
|
|
// Deallocate device errors.
|
|
for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
|
|
itr != errors_.end();
|
|
++itr) {
|
|
delete (*itr);
|
|
}
|
|
pthread_mutex_destroy(&device_tree_mutex_);
|
|
}
|
|
|
|
// Atomically find named device in sub device tree.
|
|
// Returns 0 if not found
|
|
DeviceTree *DeviceTree::FindInSubTree(string name) {
|
|
DeviceTree *ret;
|
|
pthread_mutex_lock(&device_tree_mutex_);
|
|
ret = UnlockedFindInSubTree(name);
|
|
pthread_mutex_unlock(&device_tree_mutex_);
|
|
return ret;
|
|
}
|
|
|
|
// Find named device in sub device tree (Non-atomic).
|
|
// Returns 0 if not found
|
|
DeviceTree *DeviceTree::UnlockedFindInSubTree(string name) {
|
|
std::map<string, DeviceTree*>::iterator itr = subdevices_.find(name);
|
|
if (itr != subdevices_.end()) {
|
|
return itr->second;
|
|
} else {
|
|
// Search sub-tree.
|
|
for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
|
|
itr != subdevices_.end();
|
|
++itr) {
|
|
DeviceTree *result = itr->second->UnlockedFindInSubTree(name);
|
|
if (result != 0)
|
|
return result;
|
|
}
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// Atomically add error instance to device.
|
|
void DeviceTree::AddErrorInstance(ErrorInstance *error_instance) {
|
|
pthread_mutex_lock(&device_tree_mutex_);
|
|
errors_.push_back(error_instance);
|
|
pthread_mutex_unlock(&device_tree_mutex_);
|
|
}
|
|
|
|
// Find or add queried device as necessary.
|
|
DeviceTree *DeviceTree::FindOrAddDevice(string name) {
|
|
// Assume named device does not exist and try to insert the device anyway.
|
|
// No-op if named device already exists.
|
|
InsertSubDevice(name);
|
|
// Find and return sub device pointer.
|
|
return FindInSubTree(name);
|
|
}
|
|
|
|
// Pretty prints device tree.
|
|
void DeviceTree::PrettyPrint(string spacer) {
|
|
for (std::map<string, DeviceTree*>::iterator itr = subdevices_.begin();
|
|
itr != subdevices_.end();
|
|
++itr) {
|
|
printf("%s%s\n", spacer.c_str(), itr->first.c_str());
|
|
itr->second->PrettyPrint(spacer+spacer);
|
|
}
|
|
}
|
|
|
|
// Atomically add sub device.
|
|
// No-op if named device already exists.
|
|
void DeviceTree::InsertSubDevice(string name) {
|
|
pthread_mutex_lock(&device_tree_mutex_);
|
|
if (UnlockedFindInSubTree(name) != 0) {
|
|
pthread_mutex_unlock(&device_tree_mutex_);
|
|
return;
|
|
}
|
|
subdevices_[name] = new DeviceTree(name);
|
|
subdevices_[name]->parent_ = this;
|
|
pthread_mutex_unlock(&device_tree_mutex_);
|
|
}
|
|
|
|
|
|
// Returns true of any error associated with this device is fatal.
|
|
bool DeviceTree::KnownBad() {
|
|
pthread_mutex_lock(&device_tree_mutex_);
|
|
for (std::list<ErrorInstance*>::iterator itr = errors_.begin();
|
|
itr != errors_.end();
|
|
++itr) {
|
|
if ((*itr)->severity_ == SAT_ERROR_FATAL) {
|
|
pthread_mutex_unlock(&device_tree_mutex_);
|
|
return true;
|
|
}
|
|
}
|
|
pthread_mutex_unlock(&device_tree_mutex_);
|
|
return false;
|
|
}
|
|
|
|
|
|
// ErrorDiag constructor.
|
|
ErrorDiag::ErrorDiag() {
|
|
os_ = 0;
|
|
system_tree_root_ = 0;
|
|
}
|
|
|
|
// ErrorDiag destructor.
|
|
ErrorDiag::~ErrorDiag() {
|
|
if (system_tree_root_)
|
|
delete system_tree_root_;
|
|
}
|
|
|
|
// Set platform specific handle and initialize device tree.
|
|
// Returns false on error. true otherwise.
|
|
bool ErrorDiag::set_os(OsLayer *os) {
|
|
os_ = os;
|
|
return(InitializeDeviceTree());
|
|
}
|
|
|
|
// Create and initialize system device tree.
|
|
// Returns false on error. true otherwise.
|
|
bool ErrorDiag::InitializeDeviceTree() {
|
|
system_tree_root_ = new DeviceTree("system_root");
|
|
if (!system_tree_root_)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
// Logs info about a CECC.
|
|
// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
|
|
int ErrorDiag::AddCeccError(string dimm_string) {
|
|
DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
|
|
ECCErrorInstance *error = new ECCErrorInstance;
|
|
if (!error)
|
|
return -1;
|
|
error->severity_ = SAT_ERROR_CORRECTABLE;
|
|
dimm_device->AddErrorInstance(error);
|
|
return 0;
|
|
}
|
|
|
|
// Logs info about a UECC.
|
|
// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
|
|
int ErrorDiag::AddUeccError(string dimm_string) {
|
|
DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
|
|
ECCErrorInstance *error = new ECCErrorInstance;
|
|
if (!error)
|
|
return -1;
|
|
error->severity_ = SAT_ERROR_FATAL;
|
|
dimm_device->AddErrorInstance(error);
|
|
return 0;
|
|
}
|
|
|
|
// Logs info about a miscompare.
|
|
// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
|
|
int ErrorDiag::AddMiscompareError(string dimm_string, uint64 addr, int count) {
|
|
DeviceTree *dimm_device = system_tree_root_->FindOrAddDevice(dimm_string);
|
|
MiscompareErrorInstance *error = new MiscompareErrorInstance;
|
|
if (!error)
|
|
return -1;
|
|
error->severity_ = SAT_ERROR_FATAL;
|
|
error->addr_ = addr;
|
|
dimm_device->AddErrorInstance(error);
|
|
os_->ErrorReport(dimm_string.c_str(), "miscompare", count);
|
|
return 1;
|
|
}
|
|
|
|
// Utility Function to translate a virtual address to DIMM number.
|
|
// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
|
|
string ErrorDiag::AddressToDimmString(OsLayer *os, void *addr, int offset) {
|
|
char dimm_string[256] = "";
|
|
char *vbyteaddr = reinterpret_cast<char*>(addr) + offset;
|
|
uint64 paddr = os->VirtualToPhysical(vbyteaddr);
|
|
os->FindDimm(paddr, dimm_string, sizeof(dimm_string));
|
|
return string(dimm_string);
|
|
}
|
|
|
|
// Info about a miscompare from a drive.
|
|
// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
|
|
int ErrorDiag::AddHDDMiscompareError(string devicename, int block, int offset,
|
|
void *src_addr, void *dst_addr) {
|
|
bool mask_hdd_error = false;
|
|
|
|
HDDMiscompareErrorInstance *error = new HDDMiscompareErrorInstance;
|
|
if (!error)
|
|
return -1;
|
|
|
|
error->addr_ = reinterpret_cast<uint64>(src_addr);
|
|
error->addr2_ = reinterpret_cast<uint64>(dst_addr);
|
|
error->offset_ = offset;
|
|
error->block_ = block;
|
|
|
|
string src_dimm = AddressToDimmString(os_, src_addr, offset);
|
|
string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
|
|
|
|
// DIMM name look up success
|
|
if (src_dimm.compare("DIMM Unknown")) {
|
|
// Add src DIMM as possible miscompare cause.
|
|
DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
|
|
error->causes_.insert(src_dimm_dev);
|
|
if (src_dimm_dev->KnownBad()) {
|
|
mask_hdd_error = true;
|
|
logprintf(5, "Log: supressed %s miscompare report: "
|
|
"known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
|
|
}
|
|
}
|
|
if (dst_dimm.compare("DIMM Unknown")) {
|
|
// Add dst DIMM as possible miscompare cause.
|
|
DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
|
|
error->causes_.insert(dst_dimm_dev);
|
|
if (dst_dimm_dev->KnownBad()) {
|
|
mask_hdd_error = true;
|
|
logprintf(5, "Log: supressed %s miscompare report: "
|
|
"known bad destination: %s\n", devicename.c_str(),
|
|
dst_dimm.c_str());
|
|
}
|
|
}
|
|
|
|
DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
|
|
hdd_dev->AddErrorInstance(error);
|
|
|
|
// HDD error was not masked by bad DIMMs: report bad HDD.
|
|
if (!mask_hdd_error) {
|
|
os_->ErrorReport(devicename.c_str(), "miscompare", 1);
|
|
error->severity_ = SAT_ERROR_FATAL;
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Info about a sector tag miscompare from a drive.
|
|
// Returns -1 on error, 1 if diagnoser reports error externally; 0 otherwise.
|
|
int ErrorDiag::AddHDDSectorTagError(string devicename, int block, int offset,
|
|
int sector, void *src_addr,
|
|
void *dst_addr) {
|
|
bool mask_hdd_error = false;
|
|
|
|
HDDSectorTagErrorInstance *error = new HDDSectorTagErrorInstance;
|
|
if (!error)
|
|
return -1;
|
|
|
|
error->addr_ = reinterpret_cast<uint64>(src_addr);
|
|
error->addr2_ = reinterpret_cast<uint64>(dst_addr);
|
|
error->sector_ = sector;
|
|
error->block_ = block;
|
|
|
|
string src_dimm = AddressToDimmString(os_, src_addr, offset);
|
|
string dst_dimm = AddressToDimmString(os_, dst_addr, offset);
|
|
|
|
// DIMM name look up success
|
|
if (src_dimm.compare("DIMM Unknown")) {
|
|
// Add src DIMM as possible miscompare cause.
|
|
DeviceTree *src_dimm_dev = system_tree_root_->FindOrAddDevice(src_dimm);
|
|
error->causes_.insert(src_dimm_dev);
|
|
if (src_dimm_dev->KnownBad()) {
|
|
mask_hdd_error = true;
|
|
logprintf(5, "Log: supressed %s sector tag error report: "
|
|
"known bad source: %s\n", devicename.c_str(), src_dimm.c_str());
|
|
}
|
|
}
|
|
if (dst_dimm.compare("DIMM Unknown")) {
|
|
// Add dst DIMM as possible miscompare cause.
|
|
DeviceTree *dst_dimm_dev = system_tree_root_->FindOrAddDevice(dst_dimm);
|
|
error->causes_.insert(dst_dimm_dev);
|
|
if (dst_dimm_dev->KnownBad()) {
|
|
mask_hdd_error = true;
|
|
logprintf(5, "Log: supressed %s sector tag error report: "
|
|
"known bad destination: %s\n", devicename.c_str(),
|
|
dst_dimm.c_str());
|
|
}
|
|
}
|
|
|
|
DeviceTree *hdd_dev = system_tree_root_->FindOrAddDevice(devicename);
|
|
hdd_dev->AddErrorInstance(error);
|
|
|
|
// HDD error was not masked by bad DIMMs: report bad HDD.
|
|
if (!mask_hdd_error) {
|
|
os_->ErrorReport(devicename.c_str(), "sector", 1);
|
|
error->severity_ = SAT_ERROR_FATAL;
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|