Skip to content

Commit

Permalink
Fixed a bug that caused crashes on shutdown
Browse files Browse the repository at this point in the history
I discovered one reason why Derecho processes often end with a
segmentation fault when attempting to shut down "cleanly" (as noted in
issues #135, #192, etc.): When a node marks itself as failed, the
SST will be told to freeze the node's own row (in process_suspicions()),
but SST::freeze() will dereference a null pointer if it is called on the
local row (res_vec has no entry for the node's own row).

The solution is to add a check for row_index == my_index in freeze(),
and also to ensure a node shuts itself down more promptly when it
detects that it has been marked as "failed" by the rest of the group.
  • Loading branch information
etremel committed Oct 1, 2021
1 parent 5e990cf commit a3443bb
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 7 deletions.
7 changes: 6 additions & 1 deletion include/derecho/sst/detail/sst_impl.hpp
Expand Up @@ -244,7 +244,12 @@ void SST<DerivedSST>::put_with_completion(const std::vector<uint32_t> receiver_r
}

template <typename DerivedSST>
void SST<DerivedSST>::freeze(int row_index) {
void SST<DerivedSST>::freeze(uint32_t row_index) {
// If some other node reported this one as failed,
// don't attempt to freeze your own row
if(row_index == my_index) {
return;
}
{
std::lock_guard<std::mutex> lock(freeze_mutex);
if(row_is_frozen[row_index]) {
Expand Down
2 changes: 1 addition & 1 deletion include/derecho/sst/sst.hpp
Expand Up @@ -313,7 +313,7 @@ class SST {

/** Marks a row as frozen, so it will no longer update, and its corresponding
* node will not receive writes. */
void freeze(int row_index);
void freeze(uint32_t row_index);

/** Returns the total number of rows in the table. */
unsigned int get_num_rows() const { return num_members; }
Expand Down
4 changes: 2 additions & 2 deletions src/core/git_version.cpp
Expand Up @@ -13,8 +13,8 @@ namespace derecho {
const int MAJOR_VERSION = 2;
const int MINOR_VERSION = 2;
const int PATCH_VERSION = 1;
const int COMMITS_AHEAD_OF_VERSION = 19;
const int COMMITS_AHEAD_OF_VERSION = 20;
const char* VERSION_STRING = "2.2.1";
const char* VERSION_STRING_PLUS_COMMITS = "2.2.1+19";
const char* VERSION_STRING_PLUS_COMMITS = "2.2.1+20";

}
9 changes: 6 additions & 3 deletions src/core/view_manager.cpp
Expand Up @@ -881,6 +881,10 @@ void ViewManager::propose_changes(DerechoSST& gmsSST) {
//First, check for failures
const std::vector<int> failed_ranks = process_suspicions(gmsSST);
for(const int failed_rank : failed_ranks) {
//If another node reported this one as failed, don't attempt to propose changes (they will be ignored)
if(failed_rank == my_rank) {
throw derecho_exception("Some other node reported that I failed. Node " + std::to_string(curr_view->members[my_rank]) + " terminating.");
}
if(!changes_contains(gmsSST, curr_view->members[failed_rank])) {
const int next_change_index = gmsSST.num_changes[my_rank] - gmsSST.num_installed[my_rank];
if(next_change_index == static_cast<int>(gmsSST.changes.size())) {
Expand Down Expand Up @@ -1186,8 +1190,7 @@ void ViewManager::terminate_epoch(DerechoSST& gmsSST) {
dbg_default_debug("Waiting for pending SST sends to finish");
curr_view->multicast_group->sst_send_trigger(subgroup_id, curr_subgroup_settings, num_shard_members, gmsSST);
gmsSST.put_with_completion();
gmsSST.sync_with_members(
curr_view->multicast_group->get_shard_sst_indices(subgroup_id));
gmsSST.sync_with_members(curr_view->multicast_group->get_shard_sst_indices(subgroup_id));
while(curr_view->multicast_group->receiver_predicate(
curr_subgroup_settings, shard_ranks_by_sender_rank,
num_shard_senders, gmsSST)) {
Expand Down Expand Up @@ -1416,7 +1419,7 @@ void ViewManager::finish_view_change(DerechoSST& gmsSST) {
dbg_default_debug("Sending joining node {} the new view over the joiner socket", proposed_join_sockets.front().first);
send_view(*next_view, proposed_join_sockets.front().second);
std::size_t size_of_vector = mutils::bytes_size(old_shard_leaders_by_id);
dbg_default_debug("Sending node {} the old shard leaders vector on the joiner socket. size_of_vector is {}",proposed_join_sockets.front().first, size_of_vector);
dbg_default_debug("Sending node {} the old shard leaders vector on the joiner socket. size_of_vector is {}", proposed_join_sockets.front().first, size_of_vector);
proposed_join_sockets.front().second.write(size_of_vector);
mutils::post_object([this](const char* bytes, std::size_t size) {
proposed_join_sockets.front().second.write(bytes, size);
Expand Down

0 comments on commit a3443bb

Please sign in to comment.