//! sync_log_entries.rs — leader-side log replication (AppendEntries /
//! InstallSnapshot) daemon for Raft.
  1. use std::sync::atomic::{AtomicUsize, Ordering};
  2. use std::sync::Arc;
  3. use std::time::Duration;
  4. use parking_lot::{Condvar, Mutex};
  5. use crate::check_or_record;
  6. use crate::daemon_env::ErrorKind;
  7. use crate::index_term::IndexTerm;
  8. use crate::utils::{retry_rpc, RPC_DEADLINE};
  9. use crate::{
  10. AppendEntriesArgs, InstallSnapshotArgs, Peer, Raft, RaftState, RpcClient,
  11. Term, HEARTBEAT_INTERVAL_MILLIS,
  12. };
/// Per-peer counter of pending sync requests; non-zero means a sync task is
/// already scheduled for that peer. The 64-byte alignment pads each counter
/// apart from its neighbors — presumably to keep counters for different
/// peers on separate cache lines and avoid false sharing (64 bytes is the
/// typical x86 cache line; confirm for other targets).
#[repr(align(64))]
struct Opening(Arc<AtomicUsize>);
/// The RPC a sync task decides to send to a follower, chosen by
/// `build_sync_log_entries` based on whether the follower's `next_index`
/// still falls inside the leader's in-memory log.
enum SyncLogEntriesOperation<Command> {
    /// Ship log entries starting at the follower's `next_index`.
    AppendEntries(AppendEntriesArgs<Command>),
    /// The needed entries were compacted away; ship the snapshot instead.
    InstallSnapshot(InstallSnapshotArgs),
    /// Nothing to send (e.g. no longer the leader).
    None,
}
/// Outcome of one AppendEntries / InstallSnapshot round trip.
enum SyncLogEntriesResult {
    /// The follower reported a newer term; our leadership has lapsed.
    TermElapsed(Term),
    /// The follower says the referenced index is already committed
    /// ("archived") on its side; carries the follower's commit point.
    Archived(IndexTerm),
    /// The follower's log does not match at `prev_log_index`; carries the
    /// follower's commit point so `next_index` can be rewound safely.
    Diverged(IndexTerm),
    /// The follower accepted the entries / snapshot.
    Success,
}
  26. // Command must be
  27. // 0. 'static: Raft<Command> must be 'static, it is moved to another thread.
  28. // 1. clone: they are copied to the persister.
  29. // 2. send: Arc<Mutex<Vec<LogEntry<Command>>>> must be send, it is moved to another thread.
  30. // 3. serialize: they are converted to bytes to persist.
  31. // 4. default: a default value is used as the first element of log.
  32. impl<Command> Raft<Command>
  33. where
  34. Command: 'static + Clone + Send + serde::Serialize + Default,
  35. {
    /// Starts the log-syncing daemon thread.
    ///
    /// The daemon blocks on a channel of sync requests: `Some(peer)` asks
    /// for a sync to that one peer, `None` asks for a sync to all peers.
    /// For each targeted peer it spawns `sync_log_entries` on the thread
    /// pool, but only when that peer's `Opening` counter was zero — so at
    /// most one task per peer sits in the RPC queue at a time. The daemon
    /// exits when the channel closes or `keep_running` is cleared.
    pub(crate) fn run_log_entry_daemon(&mut self) {
        // Producers clone this sender to request a (re)sync.
        let (tx, rx) = std::sync::mpsc::channel::<Option<Peer>>();
        self.new_log_entry.replace(tx);
        // Clone everything that the thread needs.
        let this = self.clone();
        let join_handle = std::thread::spawn(move || {
            // Note: do not change this to `let _ = ...`.
            let _guard = this.daemon_env.for_scope();
            // One scheduling gate per peer.
            let mut openings = vec![];
            openings.resize_with(this.peers.len(), || {
                Opening(Arc::new(AtomicUsize::new(0)))
            });
            let openings = openings; // Not mutable beyond this point.
            while let Ok(peer) = rx.recv() {
                if !this.keep_running.load(Ordering::SeqCst) {
                    break;
                }
                // Only the leader pushes log entries to followers.
                if !this.inner_state.lock().is_leader() {
                    continue;
                }
                for (i, rpc_client) in this.peers.iter().enumerate() {
                    // Skip ourselves; a `None` request targets every peer.
                    if i != this.me.0 && peer.map(|p| p.0 == i).unwrap_or(true)
                    {
                        // Only schedule a new task if the last task has cleared
                        // the queue of RPC requests.
                        if openings[i].0.fetch_add(1, Ordering::SeqCst) == 0 {
                            this.thread_pool.spawn(Self::sync_log_entries(
                                this.inner_state.clone(),
                                rpc_client.clone(),
                                i,
                                this.new_log_entry.clone().unwrap(),
                                openings[i].0.clone(),
                                this.apply_command_signal.clone(),
                            ));
                        }
                    }
                }
            }
            let stop_wait_group = this.stop_wait_group.clone();
            // Making sure the rest of `this` is dropped before the wait group.
            drop(this);
            drop(stop_wait_group);
        });
        self.daemon_env.watch_daemon(join_handle);
    }
    /// One round of log synchronization with `peer_index`, run on the
    /// thread pool.
    ///
    /// Builds the RPC under the lock, releases the lock while the RPC is
    /// in flight, then re-acquires it to apply the outcome:
    /// * `Success` — advance `next_index` / `match_index`, possibly move
    ///   the commit index and wake the apply thread;
    /// * `Archived` / `Diverged` — rewind `next_index` and ask the daemon
    ///   (via `rerun`) for another round;
    /// * RPC failure — sleep one heartbeat interval, then retry via `rerun`.
    async fn sync_log_entries(
        rf: Arc<Mutex<RaftState<Command>>>,
        rpc_client: Arc<RpcClient>,
        peer_index: usize,
        rerun: std::sync::mpsc::Sender<Option<Peer>>,
        opening: Arc<AtomicUsize>,
        apply_command_signal: Arc<Condvar>,
    ) {
        // Claim every pending request for this peer at once; bail out if a
        // concurrent task already consumed them.
        if opening.swap(0, Ordering::SeqCst) == 0 {
            return;
        }
        let operation = Self::build_sync_log_entries(&rf, peer_index);
        let (term, prev_log_index, match_index, succeeded) = match operation {
            SyncLogEntriesOperation::AppendEntries(args) => {
                let term = args.term;
                let prev_log_index = args.prev_log_index;
                // On success the follower matches us up to the last entry
                // shipped in this request.
                let match_index = args.prev_log_index + args.entries.len();
                let succeeded = Self::append_entries(&rpc_client, args).await;
                (term, prev_log_index, match_index, succeeded)
            }
            SyncLogEntriesOperation::InstallSnapshot(args) => {
                let term = args.term;
                let prev_log_index = args.last_included_index;
                let match_index = args.last_included_index;
                let succeeded = Self::install_snapshot(&rpc_client, args).await;
                (term, prev_log_index, match_index, succeeded)
            }
            SyncLogEntriesOperation::None => return,
        };
        let peer = Peer(peer_index);
        match succeeded {
            Ok(SyncLogEntriesResult::Success) => {
                let mut rf = rf.lock();
                // The reply is stale if the term moved on while the RPC
                // was in flight.
                if rf.current_term != term {
                    return;
                }
                rf.next_index[peer_index] = match_index + 1;
                rf.current_step[peer_index] = 0;
                if match_index > rf.match_index[peer_index] {
                    rf.match_index[peer_index] = match_index;
                    if rf.is_leader() && rf.current_term == term {
                        // Pick the highest index replicated on a quorum:
                        // sort match_index and take the element with
                        // len - (len/2 + 1) entries at or above it.
                        // NOTE(review): this appears to assume the
                        // leader's own match_index slot stays low so the
                        // leader's copy is counted implicitly — confirm.
                        let mut matched = rf.match_index.to_vec();
                        let mid = matched.len() / 2 + 1;
                        matched.sort_unstable();
                        let new_commit_index = matched[mid];
                        // Only entries of the current term are committed
                        // by counting replicas.
                        if new_commit_index > rf.commit_index
                            && rf.log[new_commit_index].term == rf.current_term
                        {
                            // COMMIT_INDEX_INVARIANT, SNAPSHOT_INDEX_INVARIANT:
                            // Index new_commit_index exists in the log array,
                            // which implies new_commit_index is in range
                            // [log.start(), log.end()).
                            rf.commit_index = new_commit_index;
                            apply_command_signal.notify_one();
                        }
                    }
                }
            }
            Ok(SyncLogEntriesResult::Archived(committed)) => {
                let mut rf = rf.lock();
                // A follower may only report "archived" for indexes at or
                // before its commit point; anything else is a protocol
                // violation worth recording.
                check_or_record!(
                    prev_log_index < committed.index,
                    ErrorKind::RefusedSnapshotAfterCommitted(
                        prev_log_index,
                        committed.index
                    ),
                    format!(
                        "Peer {} misbehaves: claimed log index {} is archived, \
                         but commit index is at {:?}) which is before that",
                        peer_index, prev_log_index, committed
                    ),
                    &rf
                );
                Self::check_committed(&rf, peer, committed.clone());
                rf.current_step[peer_index] = 0;
                // Jump next_index straight to the follower's commit point.
                rf.next_index[peer_index] = committed.index;
                // Ignore the error. The log syncing thread must have died.
                let _ = rerun.send(Some(Peer(peer_index)));
            }
            Ok(SyncLogEntriesResult::Diverged(committed)) => {
                let mut rf = rf.lock();
                // Divergence at or before the follower's commit point is
                // impossible if both sides are honest.
                check_or_record!(
                    prev_log_index > committed.index,
                    ErrorKind::DivergedBeforeCommitted(
                        prev_log_index,
                        committed.index
                    ),
                    format!(
                        "Peer {} claimed log index {} does not match, \
                         but commit index is at {:?}) which is after that.",
                        peer_index, prev_log_index, committed
                    ),
                    &rf
                );
                Self::check_committed(&rf, peer, committed.clone());
                // Exponential backoff: each consecutive divergence (step
                // capped at 5) rewinds next_index by 4 << step, clamped to
                // at least 1 and never before the follower's commit point.
                let step = &mut rf.current_step[peer_index];
                if *step < 5 {
                    *step += 1;
                }
                let diff = 4 << *step;
                let next_index = &mut rf.next_index[peer_index];
                if diff >= *next_index {
                    *next_index = 1usize;
                } else {
                    *next_index -= diff;
                }
                if *next_index < committed.index {
                    *next_index = committed.index;
                }
                // Ignore the error. The log syncing thread must have died.
                let _ = rerun.send(Some(Peer(peer_index)));
            }
            // Do nothing, not our term anymore.
            Ok(SyncLogEntriesResult::TermElapsed(_)) => {}
            Err(_) => {
                // Transport failure: wait one heartbeat, then ask for
                // another round.
                tokio::time::sleep(Duration::from_millis(
                    HEARTBEAT_INTERVAL_MILLIS,
                ))
                .await;
                // Ignore the error. The log syncing thread must have died.
                let _ = rerun.send(Some(Peer(peer_index)));
            }
        };
    }
  205. fn check_committed(
  206. rf: &RaftState<Command>,
  207. peer: Peer,
  208. committed: IndexTerm,
  209. ) {
  210. if committed.index < rf.log.start() {
  211. return;
  212. }
  213. let local_term = rf.log.at(committed.index).term;
  214. check_or_record!(
  215. committed.term == local_term,
  216. ErrorKind::DivergedAtCommitted(committed.index),
  217. format!(
  218. "{:?} committed log diverged at {:?}: {:?} v.s. leader {:?}",
  219. peer, committed.index, committed.term, local_term
  220. ),
  221. rf
  222. );
  223. }
  224. fn build_sync_log_entries(
  225. rf: &Mutex<RaftState<Command>>,
  226. peer_index: usize,
  227. ) -> SyncLogEntriesOperation<Command> {
  228. let rf = rf.lock();
  229. if !rf.is_leader() {
  230. return SyncLogEntriesOperation::None;
  231. }
  232. // To send AppendEntries request, next_index must be strictly larger
  233. // than start(). Otherwise we won't be able to know the log term of the
  234. // entry right before next_index.
  235. if rf.next_index[peer_index] > rf.log.start() {
  236. SyncLogEntriesOperation::AppendEntries(Self::build_append_entries(
  237. &rf, peer_index,
  238. ))
  239. } else {
  240. SyncLogEntriesOperation::InstallSnapshot(
  241. Self::build_install_snapshot(&rf),
  242. )
  243. }
  244. }
  245. fn build_append_entries(
  246. rf: &RaftState<Command>,
  247. peer_index: usize,
  248. ) -> AppendEntriesArgs<Command> {
  249. let prev_log_index = rf.next_index[peer_index] - 1;
  250. let prev_log_term = rf.log[prev_log_index].term;
  251. AppendEntriesArgs {
  252. term: rf.current_term,
  253. leader_id: rf.leader_id,
  254. prev_log_index,
  255. prev_log_term,
  256. entries: rf.log.after(rf.next_index[peer_index]).to_vec(),
  257. leader_commit: rf.commit_index,
  258. }
  259. }
  260. const APPEND_ENTRIES_RETRY: usize = 1;
  261. async fn append_entries(
  262. rpc_client: &RpcClient,
  263. args: AppendEntriesArgs<Command>,
  264. ) -> std::io::Result<SyncLogEntriesResult> {
  265. let term = args.term;
  266. let reply = retry_rpc(
  267. Self::APPEND_ENTRIES_RETRY,
  268. RPC_DEADLINE,
  269. move |_round| rpc_client.call_append_entries(args.clone()),
  270. )
  271. .await?;
  272. Ok(if reply.term == term {
  273. if let Some(committed) = reply.committed {
  274. if reply.success {
  275. SyncLogEntriesResult::Archived(committed)
  276. } else {
  277. SyncLogEntriesResult::Diverged(committed)
  278. }
  279. } else {
  280. SyncLogEntriesResult::Success
  281. }
  282. } else {
  283. SyncLogEntriesResult::TermElapsed(reply.term)
  284. })
  285. }
  286. fn build_install_snapshot(rf: &RaftState<Command>) -> InstallSnapshotArgs {
  287. let (last, snapshot) = rf.log.snapshot();
  288. InstallSnapshotArgs {
  289. term: rf.current_term,
  290. leader_id: rf.leader_id,
  291. last_included_index: last.index,
  292. last_included_term: last.term,
  293. data: snapshot.to_owned(),
  294. offset: 0,
  295. done: true,
  296. }
  297. }
  298. const INSTALL_SNAPSHOT_RETRY: usize = 1;
  299. async fn install_snapshot(
  300. rpc_client: &RpcClient,
  301. args: InstallSnapshotArgs,
  302. ) -> std::io::Result<SyncLogEntriesResult> {
  303. let term = args.term;
  304. let reply = retry_rpc(
  305. Self::INSTALL_SNAPSHOT_RETRY,
  306. RPC_DEADLINE,
  307. move |_round| rpc_client.call_install_snapshot(args.clone()),
  308. )
  309. .await?;
  310. Ok(if reply.term == term {
  311. if let Some(committed) = reply.committed {
  312. SyncLogEntriesResult::Archived(committed)
  313. } else {
  314. SyncLogEntriesResult::Success
  315. }
  316. } else {
  317. SyncLogEntriesResult::TermElapsed(reply.term)
  318. })
  319. }
  320. }