lib.rs 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903
  1. extern crate bincode;
  2. extern crate futures;
  3. extern crate labrpc;
  4. extern crate rand;
  5. #[macro_use]
  6. extern crate serde_derive;
  7. extern crate tokio;
  8. use std::convert::TryFrom;
  9. use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
  10. use std::sync::Arc;
  11. use std::time::{Duration, Instant};
  12. use crossbeam_utils::sync::WaitGroup;
  13. use parking_lot::{Condvar, Mutex};
  14. use rand::{thread_rng, Rng};
  15. use crate::persister::PersistedRaftState;
  16. pub use crate::persister::Persister;
  17. pub use crate::rpcs::RpcClient;
  18. use crate::utils::retry_rpc;
  19. mod persister;
  20. pub mod rpcs;
  21. pub mod utils;
  22. #[derive(Debug, Eq, PartialEq)]
  23. enum State {
  24. Follower,
  25. Candidate,
  26. // TODO: add PreVote
  27. Leader,
  28. }
  29. #[derive(
  30. Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize,
  31. )]
  32. pub struct Term(pub usize);
  33. #[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
  34. struct Peer(usize);
  35. pub type Index = usize;
  36. #[derive(Clone, Debug, Serialize, Deserialize)]
  37. pub struct Command(pub i32);
  38. #[derive(Clone, Debug, Serialize, Deserialize)]
  39. struct LogEntry {
  40. term: Term,
  41. index: Index,
  42. // TODO: Allow sending of arbitrary information.
  43. command: Command,
  44. }
  45. struct RaftState {
  46. current_term: Term,
  47. voted_for: Option<Peer>,
  48. log: Vec<LogEntry>,
  49. commit_index: Index,
  50. last_applied: Index,
  51. next_index: Vec<Index>,
  52. match_index: Vec<Index>,
  53. current_step: Vec<i64>,
  54. state: State,
  55. leader_id: Peer,
  56. }
  57. struct ElectionState {
  58. // Timer will be removed upon shutdown or elected.
  59. timer: Mutex<(usize, Option<Instant>)>,
  60. // Wake up the timer thread when the timer is reset or cancelled.
  61. signal: Condvar,
  62. }
  63. #[derive(Clone)]
  64. pub struct Raft {
  65. inner_state: Arc<Mutex<RaftState>>,
  66. peers: Vec<RpcClient>,
  67. me: Peer,
  68. persister: Arc<dyn Persister>,
  69. new_log_entry: Option<std::sync::mpsc::Sender<Option<Peer>>>,
  70. apply_command_signal: Arc<Condvar>,
  71. keep_running: Arc<AtomicBool>,
  72. election: Arc<ElectionState>,
  73. thread_pool: Arc<tokio::runtime::Runtime>,
  74. stop_wait_group: WaitGroup,
  75. }
  76. #[derive(Clone, Debug, Serialize, Deserialize)]
  77. struct RequestVoteArgs {
  78. term: Term,
  79. candidate_id: Peer,
  80. last_log_index: Index,
  81. last_log_term: Term,
  82. }
  83. #[derive(Clone, Debug, Serialize, Deserialize)]
  84. struct RequestVoteReply {
  85. term: Term,
  86. vote_granted: bool,
  87. }
  88. #[derive(Clone, Debug, Serialize, Deserialize)]
  89. struct AppendEntriesArgs {
  90. term: Term,
  91. leader_id: Peer,
  92. prev_log_index: Index,
  93. prev_log_term: Term,
  94. entries: Vec<LogEntry>,
  95. leader_commit: Index,
  96. }
  97. #[derive(Clone, Debug, Serialize, Deserialize)]
  98. struct AppendEntriesReply {
  99. term: Term,
  100. success: bool,
  101. }
  102. impl Raft {
  103. /// Create a new raft instance.
  104. ///
  105. /// Each instance will create at least 3 + (number of peers) threads. The
  106. /// extensive usage of threads is to minimize latency.
  107. pub fn new<Func>(
  108. peers: Vec<RpcClient>,
  109. me: usize,
  110. persister: Arc<dyn Persister>,
  111. apply_command: Func,
  112. ) -> Self
  113. where
  114. Func: 'static + Send + FnMut(Index, Command),
  115. {
  116. let peer_size = peers.len();
  117. let mut state = RaftState {
  118. current_term: Term(0),
  119. voted_for: None,
  120. log: vec![LogEntry {
  121. term: Term(0),
  122. index: 0,
  123. command: Command(0),
  124. }],
  125. commit_index: 0,
  126. last_applied: 0,
  127. next_index: vec![1; peer_size],
  128. match_index: vec![0; peer_size],
  129. current_step: vec![0; peer_size],
  130. state: State::Follower,
  131. leader_id: Peer(me),
  132. };
  133. if let Ok(persisted_state) =
  134. PersistedRaftState::try_from(persister.read_state())
  135. {
  136. state.current_term = persisted_state.current_term;
  137. state.voted_for = persisted_state.voted_for;
  138. state.log = persisted_state.log;
  139. }
  140. let election = ElectionState {
  141. timer: Mutex::new((0, None)),
  142. signal: Condvar::new(),
  143. };
  144. election.reset_election_timer();
  145. let thread_pool = tokio::runtime::Builder::new_multi_thread()
  146. .enable_time()
  147. .thread_name(format!("raft-instance-{}", me))
  148. .worker_threads(peer_size)
  149. .max_threads(peer_size * 2)
  150. .build()
  151. .expect("Creating thread pool should not fail");
  152. let mut this = Raft {
  153. inner_state: Arc::new(Mutex::new(state)),
  154. peers,
  155. me: Peer(me),
  156. persister,
  157. new_log_entry: None,
  158. apply_command_signal: Arc::new(Default::default()),
  159. keep_running: Arc::new(Default::default()),
  160. election: Arc::new(election),
  161. thread_pool: Arc::new(thread_pool),
  162. stop_wait_group: WaitGroup::new(),
  163. };
  164. this.keep_running.store(true, Ordering::SeqCst);
  165. // Running in a standalone thread.
  166. this.run_log_entry_daemon();
  167. // Running in a standalone thread.
  168. this.run_apply_command_daemon(apply_command);
  169. // One off function that schedules many little tasks, running on the
  170. // internal thread pool.
  171. this.schedule_heartbeats(Duration::from_millis(
  172. HEARTBEAT_INTERVAL_MILLIS,
  173. ));
  174. // The last step is to start running election timer.
  175. this.run_election_timer();
  176. this
  177. }
  178. pub(crate) fn process_request_vote(
  179. &self,
  180. args: RequestVoteArgs,
  181. ) -> RequestVoteReply {
  182. let mut rf = self.inner_state.lock();
  183. let term = rf.current_term;
  184. #[allow(clippy::comparison_chain)]
  185. if args.term < term {
  186. return RequestVoteReply {
  187. term,
  188. vote_granted: false,
  189. };
  190. } else if args.term > term {
  191. rf.current_term = args.term;
  192. rf.voted_for = None;
  193. rf.state = State::Follower;
  194. self.election.reset_election_timer();
  195. self.persister.save_state(rf.persisted_state().into());
  196. }
  197. let voted_for = rf.voted_for;
  198. let (last_log_index, last_log_term) = rf.last_log_index_and_term();
  199. if (voted_for.is_none() || voted_for == Some(args.candidate_id))
  200. && (args.last_log_term > last_log_term
  201. || (args.last_log_term == last_log_term
  202. && args.last_log_index >= last_log_index))
  203. {
  204. rf.voted_for = Some(args.candidate_id);
  205. // It is possible that we have set a timer above when updating the
  206. // current term. It does not hurt to update the timer again.
  207. // We do need to persist, though.
  208. self.election.reset_election_timer();
  209. self.persister.save_state(rf.persisted_state().into());
  210. RequestVoteReply {
  211. term: args.term,
  212. vote_granted: true,
  213. }
  214. } else {
  215. RequestVoteReply {
  216. term: args.term,
  217. vote_granted: false,
  218. }
  219. }
  220. }
  221. pub(crate) fn process_append_entries(
  222. &self,
  223. args: AppendEntriesArgs,
  224. ) -> AppendEntriesReply {
  225. let mut rf = self.inner_state.lock();
  226. if rf.current_term > args.term {
  227. return AppendEntriesReply {
  228. term: rf.current_term,
  229. success: false,
  230. };
  231. }
  232. if rf.current_term < args.term {
  233. rf.current_term = args.term;
  234. rf.voted_for = None;
  235. self.persister.save_state(rf.persisted_state().into());
  236. }
  237. rf.state = State::Follower;
  238. rf.leader_id = args.leader_id;
  239. self.election.reset_election_timer();
  240. if rf.log.len() <= args.prev_log_index
  241. || rf.log[args.prev_log_index].term != args.prev_log_term
  242. {
  243. return AppendEntriesReply {
  244. term: args.term,
  245. success: false,
  246. };
  247. }
  248. for (i, entry) in args.entries.iter().enumerate() {
  249. let index = i + args.prev_log_index + 1;
  250. if rf.log.len() > index {
  251. if rf.log[index].term != entry.term {
  252. rf.log.truncate(index);
  253. rf.log.push(entry.clone());
  254. }
  255. } else {
  256. rf.log.push(entry.clone());
  257. }
  258. }
  259. self.persister.save_state(rf.persisted_state().into());
  260. if args.leader_commit > rf.commit_index {
  261. rf.commit_index = if args.leader_commit < rf.log.len() {
  262. args.leader_commit
  263. } else {
  264. rf.log.len() - 1
  265. };
  266. self.apply_command_signal.notify_one();
  267. }
  268. AppendEntriesReply {
  269. term: args.term,
  270. success: true,
  271. }
  272. }
  273. fn run_election_timer(&self) -> std::thread::JoinHandle<()> {
  274. let this = self.clone();
  275. std::thread::spawn(move || {
  276. let election = this.election.clone();
  277. let mut should_run = None;
  278. while this.keep_running.load(Ordering::SeqCst) {
  279. let mut cancel_handle = should_run
  280. .map(|last_timer_count| this.run_election(last_timer_count))
  281. .flatten();
  282. let mut guard = election.timer.lock();
  283. let (timer_count, deadline) = *guard;
  284. if let Some(last_timer_count) = should_run {
  285. // If the timer was changed more than once, we know the
  286. // last scheduled election should have been cancelled.
  287. if timer_count > last_timer_count + 1 {
  288. cancel_handle.take().map(|c| c.send(()));
  289. }
  290. }
  291. // check the running signal before sleeping. We are holding the
  292. // timer lock, so no one can change it. The kill() method will
  293. // not be able to notify this thread before `wait` is called.
  294. if !this.keep_running.load(Ordering::SeqCst) {
  295. break;
  296. }
  297. should_run = match deadline {
  298. Some(timeout) => loop {
  299. let ret =
  300. election.signal.wait_until(&mut guard, timeout);
  301. let fired = ret.timed_out() && Instant::now() > timeout;
  302. // If the timer has been updated, do not schedule,
  303. // break so that we could cancel.
  304. if timer_count != guard.0 {
  305. // Timer has been updated, cancel current
  306. // election, and block on timeout again.
  307. break None;
  308. } else if fired {
  309. // Timer has fired, remove the timer and allow
  310. // running the next election at timer_count.
  311. // If the next election is cancelled before we
  312. // are back on wait, timer_count will be set to
  313. // a different value.
  314. guard.0 += 1;
  315. guard.1.take();
  316. break Some(guard.0);
  317. }
  318. },
  319. None => {
  320. election.signal.wait(&mut guard);
  321. // The timeout has changed, check again.
  322. None
  323. }
  324. };
  325. drop(guard);
  326. // Whenever woken up, cancel the current running election.
  327. // There are 3 cases we could reach here
  328. // 1. We received an AppendEntries, or decided to vote for
  329. // a peer, and thus turned into a follower. In this case we'll
  330. // be notified by the election signal.
  331. // 2. We are a follower but didn't receive a heartbeat on time,
  332. // or we are a candidate but didn't not collect enough vote on
  333. // time. In this case we'll have a timeout.
  334. // 3. When become a leader, or are shutdown. In this case we'll
  335. // be notified by the election signal.
  336. cancel_handle.map(|c| c.send(()));
  337. }
  338. let stop_wait_group = this.stop_wait_group.clone();
  339. // Making sure the rest of `this` is dropped before the wait group.
  340. drop(this);
  341. drop(stop_wait_group);
  342. })
  343. }
  344. fn run_election(
  345. &self,
  346. timer_count: usize,
  347. ) -> Option<futures::channel::oneshot::Sender<()>> {
  348. let me = self.me;
  349. let (term, args) = {
  350. let mut rf = self.inner_state.lock();
  351. // The previous election is faster and reached the critical section
  352. // before us. We should stop and not run this election.
  353. // Or someone else increased the term and the timer is reset.
  354. if !self.election.try_reset_election_timer(timer_count) {
  355. return None;
  356. }
  357. rf.current_term.0 += 1;
  358. rf.voted_for = Some(me);
  359. rf.state = State::Candidate;
  360. self.persister.save_state(rf.persisted_state().into());
  361. let term = rf.current_term;
  362. let (last_log_index, last_log_term) = rf.last_log_index_and_term();
  363. (
  364. term,
  365. RequestVoteArgs {
  366. term,
  367. candidate_id: me,
  368. last_log_index,
  369. last_log_term,
  370. },
  371. )
  372. };
  373. let mut votes = vec![];
  374. for (index, rpc_client) in self.peers.iter().enumerate() {
  375. if index != self.me.0 {
  376. // RpcClient must be cloned to avoid sending its reference
  377. // across threads.
  378. let rpc_client = rpc_client.clone();
  379. // RPCs are started right away.
  380. let one_vote = self
  381. .thread_pool
  382. .spawn(Self::request_vote(rpc_client, args.clone()));
  383. // Futures must be pinned so that they have Unpin, as required
  384. // by futures::future::select.
  385. votes.push(one_vote);
  386. }
  387. }
  388. let (tx, rx) = futures::channel::oneshot::channel();
  389. self.thread_pool.spawn(Self::count_vote_util_cancelled(
  390. me,
  391. term,
  392. self.inner_state.clone(),
  393. self.election.clone(),
  394. votes,
  395. self.peers.len() / 2,
  396. rx,
  397. self.new_log_entry.clone().unwrap(),
  398. ));
  399. Some(tx)
  400. }
  401. const REQUEST_VOTE_RETRY: usize = 1;
  402. async fn request_vote(
  403. rpc_client: RpcClient,
  404. args: RequestVoteArgs,
  405. ) -> Option<bool> {
  406. let term = args.term;
  407. let reply =
  408. retry_rpc(Self::REQUEST_VOTE_RETRY, RPC_DEADLINE, move |_round| {
  409. rpc_client.clone().call_request_vote(args.clone())
  410. })
  411. .await;
  412. if let Ok(reply) = reply {
  413. return Some(reply.vote_granted && reply.term == term);
  414. }
  415. None
  416. }
  417. #[allow(clippy::too_many_arguments)]
  418. async fn count_vote_util_cancelled(
  419. me: Peer,
  420. term: Term,
  421. rf: Arc<Mutex<RaftState>>,
  422. election: Arc<ElectionState>,
  423. votes: Vec<tokio::task::JoinHandle<Option<bool>>>,
  424. majority: usize,
  425. cancel_token: futures::channel::oneshot::Receiver<()>,
  426. new_log_entry: std::sync::mpsc::Sender<Option<Peer>>,
  427. ) {
  428. let mut vote_count = 0;
  429. let mut against_count = 0;
  430. let mut cancel_token = cancel_token;
  431. let mut futures_vec = votes;
  432. while vote_count < majority && against_count <= majority {
  433. // Mixing tokio futures with futures-rs ones. Fingers crossed.
  434. let selected = futures::future::select(
  435. cancel_token,
  436. futures::future::select_all(futures_vec),
  437. )
  438. .await;
  439. let ((one_vote, _, rest), new_token) = match selected {
  440. futures::future::Either::Left(_) => break,
  441. futures::future::Either::Right(tuple) => tuple,
  442. };
  443. futures_vec = rest;
  444. cancel_token = new_token;
  445. if let Ok(Some(vote)) = one_vote {
  446. if vote {
  447. vote_count += 1
  448. } else {
  449. against_count += 1
  450. }
  451. }
  452. }
  453. if vote_count < majority {
  454. return;
  455. }
  456. let mut rf = rf.lock();
  457. if rf.current_term == term && rf.state == State::Candidate {
  458. // We are the leader now. The election timer can be stopped.
  459. election.stop_election_timer();
  460. rf.state = State::Leader;
  461. rf.leader_id = me;
  462. let log_len = rf.log.len();
  463. for item in rf.next_index.iter_mut() {
  464. *item = log_len;
  465. }
  466. for item in rf.match_index.iter_mut() {
  467. *item = 0;
  468. }
  469. for item in rf.current_step.iter_mut() {
  470. *item = 0;
  471. }
  472. // Sync all logs now.
  473. new_log_entry
  474. .send(None)
  475. .expect("Triggering log entry syncing should not fail");
  476. }
  477. }
  478. fn schedule_heartbeats(&self, interval: Duration) {
  479. for (peer_index, rpc_client) in self.peers.iter().enumerate() {
  480. if peer_index != self.me.0 {
  481. // rf is now owned by the outer async function.
  482. let rf = self.inner_state.clone();
  483. // RPC client must be cloned into the outer async function.
  484. let rpc_client = rpc_client.clone();
  485. // Shutdown signal.
  486. let keep_running = self.keep_running.clone();
  487. self.thread_pool.spawn(async move {
  488. let mut interval = tokio::time::interval(interval);
  489. while keep_running.load(Ordering::SeqCst) {
  490. interval.tick().await;
  491. if let Some(args) = Self::build_heartbeat(&rf) {
  492. tokio::spawn(Self::send_heartbeat(
  493. rpc_client.clone(),
  494. args,
  495. ));
  496. }
  497. }
  498. });
  499. }
  500. }
  501. }
  502. fn build_heartbeat(
  503. rf: &Arc<Mutex<RaftState>>,
  504. ) -> Option<AppendEntriesArgs> {
  505. let rf = rf.lock();
  506. if !rf.is_leader() {
  507. return None;
  508. }
  509. let (last_log_index, last_log_term) = rf.last_log_index_and_term();
  510. let args = AppendEntriesArgs {
  511. term: rf.current_term,
  512. leader_id: rf.leader_id,
  513. prev_log_index: last_log_index,
  514. prev_log_term: last_log_term,
  515. entries: vec![],
  516. leader_commit: rf.commit_index,
  517. };
  518. Some(args)
  519. }
  520. const HEARTBEAT_RETRY: usize = 1;
  521. async fn send_heartbeat(
  522. rpc_client: RpcClient,
  523. args: AppendEntriesArgs,
  524. ) -> std::io::Result<()> {
  525. retry_rpc(Self::HEARTBEAT_RETRY, RPC_DEADLINE, move |_round| {
  526. rpc_client.clone().call_append_entries(args.clone())
  527. })
  528. .await?;
  529. Ok(())
  530. }
  531. fn run_log_entry_daemon(&mut self) -> std::thread::JoinHandle<()> {
  532. let (tx, rx) = std::sync::mpsc::channel::<Option<Peer>>();
  533. self.new_log_entry.replace(tx);
  534. // Clone everything that the thread needs.
  535. let this = self.clone();
  536. std::thread::spawn(move || {
  537. let mut openings = vec![];
  538. openings.resize_with(this.peers.len(), || AtomicUsize::new(0));
  539. let openings = Arc::new(openings); // Not mutable beyond this point.
  540. while let Ok(peer) = rx.recv() {
  541. if !this.keep_running.load(Ordering::SeqCst) {
  542. break;
  543. }
  544. if !this.inner_state.lock().is_leader() {
  545. continue;
  546. }
  547. for (i, rpc_client) in this.peers.iter().enumerate() {
  548. if i != this.me.0 && peer.map(|p| p.0 == i).unwrap_or(true)
  549. {
  550. openings[i].fetch_add(1, Ordering::SeqCst);
  551. this.thread_pool.spawn(Self::sync_log_entry(
  552. this.inner_state.clone(),
  553. rpc_client.clone(),
  554. i,
  555. this.new_log_entry.clone().unwrap(),
  556. openings.clone(),
  557. this.apply_command_signal.clone(),
  558. ));
  559. }
  560. }
  561. }
  562. let stop_wait_group = this.stop_wait_group.clone();
  563. // Making sure the rest of `this` is dropped before the wait group.
  564. drop(this);
  565. drop(stop_wait_group);
  566. })
  567. }
  568. async fn sync_log_entry(
  569. rf: Arc<Mutex<RaftState>>,
  570. rpc_client: RpcClient,
  571. peer_index: usize,
  572. rerun: std::sync::mpsc::Sender<Option<Peer>>,
  573. openings: Arc<Vec<AtomicUsize>>,
  574. apply_command_signal: Arc<Condvar>,
  575. ) {
  576. let opening = &openings[peer_index];
  577. if opening.swap(0, Ordering::SeqCst) == 0 {
  578. return;
  579. }
  580. let args = match Self::build_append_entries(&rf, peer_index) {
  581. Some(args) => args,
  582. None => return,
  583. };
  584. let term = args.term;
  585. let match_index = args.prev_log_index + args.entries.len();
  586. let succeeded = Self::append_entries(rpc_client, args).await;
  587. match succeeded {
  588. Ok(Some(succeeded)) => {
  589. if succeeded {
  590. let mut rf = rf.lock();
  591. rf.next_index[peer_index] = match_index + 1;
  592. rf.current_step[peer_index] = 0;
  593. if match_index > rf.match_index[peer_index] {
  594. rf.match_index[peer_index] = match_index;
  595. if rf.is_leader() && rf.current_term == term {
  596. let mut matched = rf.match_index.to_vec();
  597. let mid = matched.len() / 2 + 1;
  598. matched.sort();
  599. let new_commit_index = matched[mid];
  600. if new_commit_index > rf.commit_index
  601. && rf.log[new_commit_index].term
  602. == rf.current_term
  603. {
  604. rf.commit_index = new_commit_index;
  605. apply_command_signal.notify_one();
  606. }
  607. }
  608. }
  609. } else {
  610. let mut rf = rf.lock();
  611. let step = &mut rf.current_step[peer_index];
  612. if *step < 5 {
  613. *step += 1;
  614. }
  615. let diff = (1 << 8) << *step;
  616. let next_index = &mut rf.next_index[peer_index];
  617. if diff >= *next_index {
  618. *next_index = 1usize;
  619. } else {
  620. *next_index -= diff;
  621. }
  622. rerun
  623. .send(Some(Peer(peer_index)))
  624. .expect("Triggering log entry syncing should not fail");
  625. }
  626. }
  627. // Do nothing, not our term anymore.
  628. Ok(None) => {}
  629. Err(_) => {
  630. tokio::time::sleep(Duration::from_millis(
  631. HEARTBEAT_INTERVAL_MILLIS,
  632. ))
  633. .await;
  634. rerun
  635. .send(Some(Peer(peer_index)))
  636. .expect("Triggering log entry syncing should not fail");
  637. }
  638. };
  639. }
  640. fn build_append_entries(
  641. rf: &Arc<Mutex<RaftState>>,
  642. peer_index: usize,
  643. ) -> Option<AppendEntriesArgs> {
  644. let rf = rf.lock();
  645. if !rf.is_leader() {
  646. return None;
  647. }
  648. let prev_log_index = rf.next_index[peer_index] - 1;
  649. let prev_log_term = rf.log[prev_log_index].term;
  650. Some(AppendEntriesArgs {
  651. term: rf.current_term,
  652. leader_id: rf.leader_id,
  653. prev_log_index,
  654. prev_log_term,
  655. entries: rf.log[rf.next_index[peer_index]..].to_vec(),
  656. leader_commit: rf.commit_index,
  657. })
  658. }
  659. const APPEND_ENTRIES_RETRY: usize = 1;
  660. async fn append_entries(
  661. rpc_client: RpcClient,
  662. args: AppendEntriesArgs,
  663. ) -> std::io::Result<Option<bool>> {
  664. let term = args.term;
  665. let reply = retry_rpc(
  666. Self::APPEND_ENTRIES_RETRY,
  667. RPC_DEADLINE,
  668. move |_round| rpc_client.clone().call_append_entries(args.clone()),
  669. )
  670. .await?;
  671. Ok(if reply.term == term {
  672. Some(reply.success)
  673. } else {
  674. None
  675. })
  676. }
  677. fn run_apply_command_daemon<Func>(
  678. &self,
  679. mut apply_command: Func,
  680. ) -> std::thread::JoinHandle<()>
  681. where
  682. Func: 'static + Send + FnMut(Index, Command),
  683. {
  684. let keep_running = self.keep_running.clone();
  685. let rf = self.inner_state.clone();
  686. let condvar = self.apply_command_signal.clone();
  687. let stop_wait_group = self.stop_wait_group.clone();
  688. std::thread::spawn(move || {
  689. while keep_running.load(Ordering::SeqCst) {
  690. let (mut index, commands) = {
  691. let mut rf = rf.lock();
  692. if rf.last_applied >= rf.commit_index {
  693. condvar.wait_for(
  694. &mut rf,
  695. Duration::from_millis(HEARTBEAT_INTERVAL_MILLIS),
  696. );
  697. }
  698. if rf.last_applied < rf.commit_index {
  699. let index = rf.last_applied + 1;
  700. let last_one = rf.commit_index + 1;
  701. let commands: Vec<Command> = rf.log[index..last_one]
  702. .iter()
  703. .map(|entry| entry.command.clone())
  704. .collect();
  705. rf.last_applied = rf.commit_index;
  706. (index, commands)
  707. } else {
  708. continue;
  709. }
  710. };
  711. // Release the lock while calling external functions.
  712. for command in commands {
  713. apply_command(index, command);
  714. index += 1;
  715. }
  716. }
  717. drop(stop_wait_group);
  718. })
  719. }
  720. pub fn start(&self, command: Command) -> Option<(Term, Index)> {
  721. let mut rf = self.inner_state.lock();
  722. let term = rf.current_term;
  723. if !rf.is_leader() {
  724. return None;
  725. }
  726. let index = rf.log.len();
  727. rf.log.push(LogEntry {
  728. term,
  729. index,
  730. command,
  731. });
  732. self.persister.save_state(rf.persisted_state().into());
  733. self.new_log_entry
  734. .clone()
  735. .unwrap()
  736. .send(None)
  737. .expect("Sending to new log entry queue should never fail.");
  738. Some((term, index))
  739. }
  740. pub fn kill(mut self) {
  741. self.keep_running.store(false, Ordering::SeqCst);
  742. self.election.stop_election_timer();
  743. self.new_log_entry.take().map(|n| n.send(None));
  744. self.apply_command_signal.notify_all();
  745. self.stop_wait_group.wait();
  746. std::sync::Arc::try_unwrap(self.thread_pool)
  747. .expect(
  748. "All references to the thread pool should have been dropped.",
  749. )
  750. .shutdown_timeout(Duration::from_millis(
  751. HEARTBEAT_INTERVAL_MILLIS * 2,
  752. ));
  753. }
  754. pub fn get_state(&self) -> (Term, bool) {
  755. let state = self.inner_state.lock();
  756. (state.current_term, state.is_leader())
  757. }
  758. }
  759. impl RaftState {
  760. fn persisted_state(&self) -> PersistedRaftState {
  761. self.into()
  762. }
  763. fn last_log_index_and_term(&self) -> (Index, Term) {
  764. let len = self.log.len();
  765. assert!(len > 0, "There should always be at least one entry in log");
  766. (len - 1, self.log.last().unwrap().term)
  767. }
  768. fn is_leader(&self) -> bool {
  769. self.state == State::Leader
  770. }
  771. }
  772. const HEARTBEAT_INTERVAL_MILLIS: u64 = 150;
  773. const ELECTION_TIMEOUT_BASE_MILLIS: u64 = 150;
  774. const ELECTION_TIMEOUT_VAR_MILLIS: u64 = 250;
  775. const RPC_DEADLINE: Duration = Duration::from_secs(2);
  776. impl ElectionState {
  777. fn reset_election_timer(&self) {
  778. let mut guard = self.timer.lock();
  779. guard.0 += 1;
  780. guard.1.replace(Self::election_timeout());
  781. self.signal.notify_one();
  782. }
  783. fn try_reset_election_timer(&self, timer_count: usize) -> bool {
  784. let mut guard = self.timer.lock();
  785. if guard.0 != timer_count {
  786. return false;
  787. }
  788. guard.0 += 1;
  789. guard.1.replace(Self::election_timeout());
  790. self.signal.notify_one();
  791. true
  792. }
  793. fn election_timeout() -> Instant {
  794. Instant::now()
  795. + Duration::from_millis(
  796. ELECTION_TIMEOUT_BASE_MILLIS
  797. + thread_rng().gen_range(0, ELECTION_TIMEOUT_VAR_MILLIS),
  798. )
  799. }
  800. fn stop_election_timer(&self) {
  801. let mut guard = self.timer.lock();
  802. guard.0 += 1;
  803. guard.1.take();
  804. self.signal.notify_one();
  805. }
  806. }