|
|
@@ -0,0 +1,453 @@
|
|
|
+use crate::beat_ticker::{Beat, SharedBeatTicker};
|
|
|
+use crate::daemon_env::Daemon;
|
|
|
+use crate::{Index, Raft, Term, HEARTBEAT_INTERVAL_MILLIS};
|
|
|
+use parking_lot::{Condvar, Mutex};
|
|
|
+use std::collections::VecDeque;
|
|
|
+use std::future::Future;
|
|
|
+use std::sync::atomic::Ordering;
|
|
|
+use std::sync::Arc;
|
|
|
+use std::time::{Duration, Instant};
|
|
|
+
|
|
|
+/// The result returned to a verify authority request.
|
|
|
+/// This request is not directly exposed to end users. Instead it is used
|
|
|
+/// internally to implement no-commit read-only requests.
|
|
|
+#[derive(Debug)]
|
|
|
+pub enum VerifyAuthorityResult {
|
|
|
+ Success(Index),
|
|
|
+ TermElapsed,
|
|
|
+ TimedOut,
|
|
|
+}
|
|
|
+
|
|
|
+/// Token stored in the internal queue for authority verification. Each token
|
|
|
+/// represents one verification request.
|
|
|
+#[derive(Debug)]
|
|
|
+struct VerifyAuthorityToken {
|
|
|
+ commit_index: Index,
|
|
|
+ beats_moment: Vec<Beat>,
|
|
|
+ rough_time: Instant,
|
|
|
+ sender: tokio::sync::oneshot::Sender<VerifyAuthorityResult>,
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Clone, Copy, Debug, Default, Eq, Ord, PartialOrd, PartialEq)]
|
|
|
+struct QueueIndex(usize);
|
|
|
+
|
|
|
+/// The state of this daemon, should bee protected by a mutex.
|
|
|
+struct VerifyAuthorityState {
|
|
|
+ /// The current term. Might be behind the real term in the cluster.
|
|
|
+ term: Term,
|
|
|
+ /// Pending requests to verify authority.
|
|
|
+ queue: VecDeque<VerifyAuthorityToken>,
|
|
|
+ /// Number of requests that have been processed.
|
|
|
+ start: QueueIndex,
|
|
|
+ /// A vector of queue indexes. Each element in this vector indicates the
|
|
|
+ /// index of the first request that has not been confirmed by the
|
|
|
+ /// corresponding peer.
|
|
|
+ /// These indexes include all processed requests. They will never go down.
|
|
|
+ covered: Vec<QueueIndex>,
|
|
|
+}
|
|
|
+
|
|
|
+impl VerifyAuthorityState {
|
|
|
+ pub fn create(peer_count: usize) -> Self {
|
|
|
+ VerifyAuthorityState {
|
|
|
+ term: Term(0),
|
|
|
+ queue: Default::default(),
|
|
|
+ start: QueueIndex(0),
|
|
|
+ covered: vec![QueueIndex(0); peer_count],
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn reset(&mut self, term: Term) {
|
|
|
+ self.clear_tickets();
|
|
|
+
|
|
|
+ self.term = term;
|
|
|
+ self.start = QueueIndex(0);
|
|
|
+ for item in self.covered.iter_mut() {
|
|
|
+ *item = QueueIndex(0)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn clear_tickets(&mut self) {
|
|
|
+ for token in self.queue.drain(..) {
|
|
|
+ let _ = token.sender.send(VerifyAuthorityResult::TermElapsed);
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Clone)]
|
|
|
+pub(crate) struct DaemonBeatTicker {
|
|
|
+ beat_ticker: SharedBeatTicker,
|
|
|
+ condvar: Arc<Condvar>,
|
|
|
+}
|
|
|
+
|
|
|
+impl DaemonBeatTicker {
|
|
|
+ pub fn next_beat(&self) -> Beat {
|
|
|
+ let beat = self.beat_ticker.next_beat();
|
|
|
+ beat
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn tick(&self, beat: Beat) {
|
|
|
+ self.beat_ticker.tick(beat);
|
|
|
+ self.condvar.notify_one();
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+#[derive(Clone)]
|
|
|
+pub(crate) struct VerifyAuthorityDaemon {
|
|
|
+ state: Arc<Mutex<VerifyAuthorityState>>,
|
|
|
+ beat_tickers: Vec<SharedBeatTicker>,
|
|
|
+ condvar: Arc<Condvar>,
|
|
|
+}
|
|
|
+
|
|
|
+impl VerifyAuthorityDaemon {
|
|
|
+ pub fn create(peer_count: usize) -> Self {
|
|
|
+ Self {
|
|
|
+ state: Arc::new(Mutex::new(VerifyAuthorityState::create(
|
|
|
+ peer_count,
|
|
|
+ ))),
|
|
|
+ beat_tickers: (0..peer_count)
|
|
|
+ .map(|_| SharedBeatTicker::create())
|
|
|
+ .collect(),
|
|
|
+ condvar: Arc::new(Condvar::new()),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn wait_for(&self, timeout: Duration) {
|
|
|
+ let mut guard = self.state.lock();
|
|
|
+ self.condvar.wait_for(&mut guard, timeout);
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn reset_state(&self, term: Term) {
|
|
|
+ self.state.lock().reset(term);
|
|
|
+ // Increase all beats by one to make sure upcoming verify authority
|
|
|
+ // requests wait for beats in the current term. This in fact creates
|
|
|
+ // phantom beats that will never be marked as completed by themselves.
|
|
|
+ // They will be automatically `ticked()` when newer (real) beats are
|
|
|
+ // created, sent and `ticked()`.
|
|
|
+ for beat_ticker in self.beat_tickers.iter() {
|
|
|
+ beat_ticker.next_beat();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Enqueues a verify authority request. Returns a receiver of the
|
|
|
+ /// verification result. Returns None if the term has passed.
|
|
|
+ pub fn verify_authority_async(
|
|
|
+ &self,
|
|
|
+ current_term: Term,
|
|
|
+ commit_index: Index,
|
|
|
+ ) -> Option<tokio::sync::oneshot::Receiver<VerifyAuthorityResult>> {
|
|
|
+ let mut state = self.state.lock();
|
|
|
+ // The inflight beats are sent at least for `current_term`. This is
|
|
|
+ // guaranteed by the fact that we immediately increase beats for all
|
|
|
+ // peers after being elected, before releasing the "elected" message to
|
|
|
+ // the rest of the Raft system. The newest beats we get here are at
|
|
|
+ // least as new as the phantom beats created by `Self::reset_state()`.
|
|
|
+ let beats_moment = self
|
|
|
+ .beat_tickers
|
|
|
+ .iter()
|
|
|
+ .map(|beat_ticker| beat_ticker.current_beat())
|
|
|
+ .collect();
|
|
|
+
|
|
|
+ // The inflight beats could also be for any term after `current_term`.
|
|
|
+ // We must check if the term stored in the daemon is the same as
|
|
|
+ // `current_term`.
|
|
|
+ // `state.term` could be smaller than `current_term`, if a new term is
|
|
|
+ // started by someone else and we lost leadership.
|
|
|
+ // `state.term` could be greater than `current_term`, if we lost
|
|
|
+ // leadership but are elected leader again in a following term.
|
|
|
+ // In both cases, we cannot confirm the leadership at `current_term`.
|
|
|
+ if state.term != current_term {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+
|
|
|
+ let (sender, receiver) = tokio::sync::oneshot::channel();
|
|
|
+ let token = VerifyAuthorityToken {
|
|
|
+ commit_index,
|
|
|
+ beats_moment,
|
|
|
+ rough_time: Instant::now(),
|
|
|
+ sender,
|
|
|
+ };
|
|
|
+ state.queue.push_back(token);
|
|
|
+
|
|
|
+ Some(receiver)
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Run one iteration of the verify authority daemon.
|
|
|
+ pub fn run_verify_authority_iteration(
|
|
|
+ &self,
|
|
|
+ current_term: Term,
|
|
|
+ commit_index: Index,
|
|
|
+ sentinel_commit_index: Index,
|
|
|
+ ) {
|
|
|
+ // Opportunistic check: do nothing if we don't have any requests.
|
|
|
+ if self.state.lock().queue.is_empty() {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ self.clear_committed_requests(current_term, commit_index);
|
|
|
+ // Do not use ticks to clear requests if we have not committed at least
|
|
|
+ // one log entry since the start of the term. At the start of the term,
|
|
|
+ // the leader might not know the commit index of the previous leader.
|
|
|
+ // This holds true even it is guaranteed that all entries committed by
|
|
|
+ // the previous leader will be committed by the current leader.
|
|
|
+ if commit_index >= sentinel_commit_index {
|
|
|
+ self.clear_ticked_requests();
|
|
|
+ }
|
|
|
+ self.removed_expired_requests(current_term);
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Clears all requests that have seen at least one commit.
|
|
|
+ /// This function handles the following scenario: a verify authority request
|
|
|
+ /// was received, when the `commit_index` was at C. Later as the leader we
|
|
|
+ /// moved the commit index to at least C+1. That implies that when the
|
|
|
+ /// request was first received, no other new commits after C could have been
|
|
|
+ /// added to the log, either by this replica or others. It then follows that
|
|
|
+ /// we can claim we had authority at that point.
|
|
|
+ fn clear_committed_requests(
|
|
|
+ &self,
|
|
|
+ current_term: Term,
|
|
|
+ commit_index: Index,
|
|
|
+ ) {
|
|
|
+ let mut state = self.state.lock();
|
|
|
+ // We might skip some requests that could have been cleared, if we did
|
|
|
+ // not react to the commit notification fast enough, and missed a
|
|
|
+ // commit. This is about the case where in the last iteration
|
|
|
+ // `commit_index` was `ci`, but in this iteration it becomes `ci + 2`
|
|
|
+ // (or even larger), skipping `ci + 1`.
|
|
|
+ //
|
|
|
+ // Obviously skipping a commit is a problem if `ci + 2` and `ci + 1` are
|
|
|
+ // both committed by us in this term. The requests that are cleared by
|
|
|
+ // `+1` will be cleared by `+2` anyway. Similarly it is not a problem if
|
|
|
+ // neither are committed by us in this term, since `+1` will not clear
|
|
|
+ // any requests.
|
|
|
+ //
|
|
|
+ // If `+2` is not committed by us, but `+1` is, we lose the opportunity
|
|
|
+ // to use `+1` to clear requests. The chances of losing this opportunity
|
|
|
+ // are slim, because between `+1` and `+2`, there has to be a missed
|
|
|
+ // heartbeat interval, and a new commit (`+2`) from another leader. We
|
|
|
+ // have plenty of time to run this method before `+2` reaches us.
|
|
|
+ //
|
|
|
+ // Overall it is acceptable to simplify the implementation and risk
|
|
|
+ // losing the mentioned opportunity.
|
|
|
+ if current_term != state.term {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Note the commit_index in the queue might not be in increasing order.
|
|
|
+ // We could still have requests that have a smaller commit_index after
|
|
|
+ // this sweep. That is an acceptable tradeoff we are taking.
|
|
|
+ while let Some(head) = state.queue.pop_front() {
|
|
|
+ if head.commit_index >= commit_index {
|
|
|
+ state.queue.push_front(head);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ // At the start of the term, the previous leader might have exposed
|
|
|
+ // all entries before the sentinel commit to clients. If a request
|
|
|
+ // arrived before the sentinel commit is committed, its commit index
|
|
|
+ // (token.commit_index) might be inaccurate. Thus we cannot allow
|
|
|
+ // the client to return any state before the sentinel index.
|
|
|
+ //
|
|
|
+ // We did not choose the sentinel index but opted for a more strict
|
|
|
+ // commit index, because the index is committed anyway. It should be
|
|
|
+ // delivered to the application really quickly. We paid the price
|
|
|
+ // with latency but made the request more fresh.
|
|
|
+ let _ = head
|
|
|
+ .sender
|
|
|
+ .send(VerifyAuthorityResult::Success(commit_index));
|
|
|
+ state.start.0 += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Fetches the newest successful RPC response from peers, and mark verify
|
|
|
+ /// authority requests as complete if they are covered by more than half of
|
|
|
+ /// the replicas.
|
|
|
+ fn clear_ticked_requests(&self) {
|
|
|
+ for (peer_index, beat_ticker) in self.beat_tickers.iter().enumerate() {
|
|
|
+ // Fetches the newest successful RPC response from the current peer.
|
|
|
+ let ticked = beat_ticker.ticked();
|
|
|
+ let mut state = self.state.lock();
|
|
|
+ // Update progress with `ticked`. All requests that came before
|
|
|
+ // `ticked` now have one more votes of leader authority from the
|
|
|
+ // current peer.
|
|
|
+ let first_not_ticked_index = state.queue.partition_point(|token| {
|
|
|
+ token.beats_moment[peer_index] <= ticked
|
|
|
+ });
|
|
|
+ let new_covered = first_not_ticked_index + state.start.0;
|
|
|
+ assert!(new_covered >= state.covered[peer_index].0);
|
|
|
+ state.covered[peer_index].0 = new_covered;
|
|
|
+
|
|
|
+ // Count the requests that has more than N / 2 votes. We always have
|
|
|
+ // the vote from ourselves, but the value is 0 in `covered` array.
|
|
|
+ let mut sorted_covered = state.covered.to_owned();
|
|
|
+ sorted_covered.sort_unstable();
|
|
|
+ let mid = sorted_covered.len() / 2 + 1;
|
|
|
+ let new_start = sorted_covered[mid];
|
|
|
+
|
|
|
+ // `state.start` could have been moved by other means, e.g. by a
|
|
|
+ // subsequent commit of the same term after the beat is issued.
|
|
|
+ // Then the relevant verify authority requests have been processed.
|
|
|
+ // If all ticked requests have been processed, nothing needs to be
|
|
|
+ // done. Skip to the next iteration.
|
|
|
+ if new_start <= state.start {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ // All requests before `new_start` is now verified.
|
|
|
+ let verified = new_start.0 - state.start.0;
|
|
|
+ for token in state.queue.drain(..verified) {
|
|
|
+ let mut cnt = 0;
|
|
|
+ for (index, beat) in token.beats_moment.iter().enumerate() {
|
|
|
+ if self.beat_tickers[index].ticked() >= *beat {
|
|
|
+ cnt += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ assert!(cnt + cnt + 1 >= self.beat_tickers.len());
|
|
|
+ let _ = token
|
|
|
+ .sender
|
|
|
+ .send(VerifyAuthorityResult::Success(token.commit_index));
|
|
|
+ }
|
|
|
+ // Move the queue starting point.
|
|
|
+ state.start = new_start;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ const VERIFY_AUTHORITY_REQUEST_EXPIRATION: Duration =
|
|
|
+ Duration::from_millis(HEARTBEAT_INTERVAL_MILLIS * 2);
|
|
|
+
|
|
|
+ /// Remove expired requests if we are no longer the leader.
|
|
|
+ /// If we have lost leadership, we are unlikely to receive confirmations
|
|
|
+ /// of past leadership state from peers. Requests are expired after two
|
|
|
+ /// heartbeat period have passed. We do not immediately cancel all incoming
|
|
|
+ /// requests, in hope that we could still answer them accurately without
|
|
|
+ /// breaking the consistency guarantee.
|
|
|
+ fn removed_expired_requests(&self, current_term: Term) {
|
|
|
+ let mut state = self.state.lock();
|
|
|
+ // Return if we are still the leader, or we become the leader again.
|
|
|
+ //
|
|
|
+ // Note that we do not hold the main raft state lock, thus the value of
|
|
|
+ // `current_term` might not be up-to-date. We only update `state.term`
|
|
|
+ // after an election. If in a term after `current_term`, we are elected
|
|
|
+ // leader again, `state.term` could be updated and thus greater than the
|
|
|
+ // (now stale) `current_term`. In that case, the queue should have been
|
|
|
+ // reset. There will be no expired request to remove.
|
|
|
+ if state.term >= current_term {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ let expiring_line =
|
|
|
+ Instant::now() - Self::VERIFY_AUTHORITY_REQUEST_EXPIRATION;
|
|
|
+ // Assuming bounded clock skew, otherwise we will lose efficiency.
|
|
|
+ let expired =
|
|
|
+ |head: &VerifyAuthorityToken| head.rough_time < expiring_line;
|
|
|
+ // Note rough_time might not be in increasing order, so we might still
|
|
|
+ // have requests that are expired in the queue after the sweep.
|
|
|
+ while state.queue.front().map_or(false, expired) {
|
|
|
+ state
|
|
|
+ .queue
|
|
|
+ .pop_front()
|
|
|
+ .map(|head| head.sender.send(VerifyAuthorityResult::TimedOut));
|
|
|
+ state.start.0 += 1;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn beat_ticker(&self, peer_index: usize) -> DaemonBeatTicker {
|
|
|
+ DaemonBeatTicker {
|
|
|
+ beat_ticker: self.beat_tickers[peer_index].clone(),
|
|
|
+ condvar: self.condvar.clone(),
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ pub fn kill(&self) {
|
|
|
+ let term = self.state.lock().term;
|
|
|
+ // Fail all inflight verify authority requests. It is important to do
|
|
|
+ // this so that the RPC framework could drop requests served by us and
|
|
|
+ // release all references to the Raft instance.
|
|
|
+ self.reset_state(term);
|
|
|
+ self.condvar.notify_all();
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+impl<Command: 'static + Send> Raft<Command> {
|
|
|
+ const BEAT_RECORDING_MAX_PAUSE: Duration = Duration::from_millis(20);
|
|
|
+
|
|
|
+ /// Create a thread and runs the verify authority daemon.
|
|
|
+ pub(crate) fn run_verify_authority_daemon(&self) {
|
|
|
+ let me = self.me.clone();
|
|
|
+ let keep_running = self.keep_running.clone();
|
|
|
+ let daemon_env = self.daemon_env.clone();
|
|
|
+ let this_daemon = self.verify_authority_daemon.clone();
|
|
|
+ let rf = self.inner_state.clone();
|
|
|
+ let stop_wait_group = self.stop_wait_group.clone();
|
|
|
+
|
|
|
+ let join_handle = std::thread::spawn(move || {
|
|
|
+ // Note: do not change this to `let _ = ...`.
|
|
|
+ let _guard = daemon_env.for_scope();
|
|
|
+
|
|
|
+ log::info!("{:?} verify authority daemon running ...", me);
|
|
|
+ while keep_running.load(Ordering::Acquire) {
|
|
|
+ this_daemon.wait_for(Self::BEAT_RECORDING_MAX_PAUSE);
|
|
|
+ let (current_term, commit_index, sentinel) = {
|
|
|
+ let rf = rf.lock();
|
|
|
+ (rf.current_term, rf.commit_index, rf.sentinel_commit_index)
|
|
|
+ };
|
|
|
+ this_daemon.run_verify_authority_iteration(
|
|
|
+ current_term,
|
|
|
+ commit_index,
|
|
|
+ sentinel,
|
|
|
+ );
|
|
|
+ }
|
|
|
+ log::info!("{:?} verify authority daemon done.", me);
|
|
|
+
|
|
|
+ drop(stop_wait_group);
|
|
|
+ });
|
|
|
+ self.daemon_env
|
|
|
+ .watch_daemon(Daemon::VerifyAuthority, join_handle);
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Create a verify authority request. Returns None if we are not the
|
|
|
+ /// leader.
|
|
|
+ ///
|
|
|
+ /// A successful verification allows the application to respond to read-only
|
|
|
+ /// requests that arrived before this function is called. The answer must
|
|
|
+ /// include all commands at or before a certain index, which is returned to
|
|
|
+ /// the application with the successful verification result. The index is
|
|
|
+ /// in fact the commit index at the moment this function was called. It is
|
|
|
+ /// guaranteed that no other commands could possibly have been committed at
|
|
|
+ /// the moment this function was called.
|
|
|
+ ///
|
|
|
+ /// The application is also free to include any subsequent commits in the
|
|
|
+ /// response. Consistency is still guaranteed, because Raft never rolls back
|
|
|
+ /// committed commands.
|
|
|
+ pub fn verify_authority_async(
|
|
|
+ &self,
|
|
|
+ ) -> Option<impl Future<Output = crate::VerifyAuthorityResult>> {
|
|
|
+ // Fail the request if we have been killed.
|
|
|
+ if !self.keep_running.load(Ordering::Acquire) {
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+
|
|
|
+ let (term, commit_index) = {
|
|
|
+ let rf = self.inner_state.lock();
|
|
|
+ if !rf.is_leader() {
|
|
|
+ // Returning none instead of `Pending::Ready(TermElapsed)`,
|
|
|
+ // because that requires a separate struct that implements
|
|
|
+ // Future, which is tedious to write.
|
|
|
+ return None;
|
|
|
+ }
|
|
|
+
|
|
|
+ (rf.current_term, rf.commit_index)
|
|
|
+ };
|
|
|
+ let receiver = self
|
|
|
+ .verify_authority_daemon
|
|
|
+ .verify_authority_async(term, commit_index);
|
|
|
+ self.heartbeats_daemon.trigger();
|
|
|
+ receiver.map(|receiver| async move {
|
|
|
+ receiver
|
|
|
+ .await
|
|
|
+ .expect("Verify authority daemon never drops senders")
|
|
|
+ })
|
|
|
+ }
|
|
|
+
|
|
|
+ pub(crate) fn beat_ticker(&self, peer_index: usize) -> DaemonBeatTicker {
|
|
|
+ self.verify_authority_daemon.beat_ticker(peer_index)
|
|
|
+ }
|
|
|
+}
|