// agreement_tests.rs — Raft (2B) log-agreement integration tests.
  1. #![allow(clippy::identity_op)]
  2. #[macro_use]
  3. extern crate anyhow;
  4. extern crate bytes;
  5. extern crate labrpc;
  6. extern crate ruaft;
  7. #[macro_use]
  8. extern crate scopeguard;
  9. use rand::{thread_rng, Rng};
  10. // This is to remove the annoying "unused code in config" warnings.
  11. pub mod config;
  12. #[test]
  13. fn basic_agreement() -> config::Result<()> {
  14. const SERVERS: usize = 5;
  15. let cfg = config::make_config(SERVERS, false);
  16. defer!(cfg.cleanup());
  17. cfg.begin("Test (2B): basic agreement");
  18. for index in 1..4 {
  19. let committed = cfg.committed_count(index)?;
  20. assert_eq!(0, committed.0, "some have committed before start()");
  21. let commit_index = cfg.one(index as i32 * 100, SERVERS, false)?;
  22. assert_eq!(
  23. index, commit_index,
  24. "got index {} but expected {}",
  25. commit_index, index
  26. );
  27. }
  28. cfg.end();
  29. Ok(())
  30. }
  31. #[test]
  32. fn fail_agree() -> config::Result<()> {
  33. const SERVERS: usize = 3;
  34. let cfg = config::make_config(SERVERS, false);
  35. defer!(cfg.cleanup());
  36. cfg.begin("Test (2B): agreement despite follower disconnection");
  37. cfg.one(101, SERVERS, false)?;
  38. // follower network disconnection
  39. let leader = cfg.check_one_leader()?;
  40. cfg.disconnect((leader + 1) % SERVERS);
  41. // agree despite one disconnected server?
  42. cfg.one(102, SERVERS - 1, false)?;
  43. cfg.one(103, SERVERS - 1, false)?;
  44. config::sleep_election_timeouts(1);
  45. cfg.one(104, SERVERS - 1, false)?;
  46. cfg.one(105, SERVERS - 1, false)?;
  47. // re-connect
  48. cfg.connect((leader + 1) % SERVERS);
  49. // agree with full set of servers?
  50. cfg.one(106, SERVERS, true)?;
  51. config::sleep_election_timeouts(1);
  52. cfg.one(107, SERVERS, true)?;
  53. cfg.end();
  54. Ok(())
  55. }
  56. #[test]
  57. fn fail_no_agree() -> config::Result<()> {
  58. const SERVERS: usize = 5;
  59. let cfg = config::make_config(SERVERS, false);
  60. defer!(cfg.cleanup());
  61. cfg.begin("Test (2B): no agreement if too many followers disconnect");
  62. cfg.one(10, SERVERS, false)?;
  63. // 3 of 5 followers disconnect
  64. let leader = cfg.check_one_leader()?;
  65. cfg.disconnect((leader + 1) % SERVERS);
  66. cfg.disconnect((leader + 2) % SERVERS);
  67. cfg.disconnect((leader + 3) % SERVERS);
  68. let result = cfg.leader_start(leader, 20);
  69. assert!(result.is_some(), "leader rejected start()");
  70. let index = result.unwrap().1;
  71. assert_eq!(2, index, "expected index 2, got {}", index);
  72. config::sleep_election_timeouts(2);
  73. let (commit_count, _) = cfg.committed_count(index)?;
  74. assert_eq!(
  75. 0, commit_count,
  76. "{} committed but no majority",
  77. commit_count
  78. );
  79. // repair
  80. cfg.connect((leader + 1) % SERVERS);
  81. cfg.connect((leader + 2) % SERVERS);
  82. cfg.connect((leader + 3) % SERVERS);
  83. // the disconnected majority may have chosen a leader from
  84. // among their own ranks, forgetting index 2.
  85. let leader2 = cfg.check_one_leader()?;
  86. let result = cfg.leader_start(leader2, 30);
  87. assert!(result.is_some(), "leader2 rejected start()");
  88. let index = result.unwrap().1;
  89. assert!(index == 2 || index == 3, "unexpected index {}", index);
  90. cfg.one(1000, SERVERS, true)?;
  91. cfg.end();
  92. Ok(())
  93. }
  94. #[test]
  95. fn rejoin() -> config::Result<()> {
  96. const SERVERS: usize = 3;
  97. let cfg = config::make_config(SERVERS, false);
  98. defer!(cfg.cleanup());
  99. cfg.begin("Test (2B): rejoin of partitioned leader");
  100. cfg.one(101, SERVERS, true)?;
  101. // leader network failure
  102. let leader1 = cfg.check_one_leader()?;
  103. cfg.disconnect(leader1);
  104. // make old leader try to agree on some entries
  105. cfg.leader_start(leader1, 102);
  106. cfg.leader_start(leader1, 103);
  107. cfg.leader_start(leader1, 104);
  108. // new leader commits, also for index=2
  109. cfg.one(103, 2, true)?;
  110. // new leader network failure
  111. let leader2 = cfg.check_one_leader()?;
  112. cfg.disconnect(leader2);
  113. // old leader connected again
  114. cfg.connect(leader1);
  115. cfg.one(104, 2, true)?;
  116. // all together now
  117. cfg.connect(leader2);
  118. cfg.one(105, SERVERS, true)?;
  119. cfg.end();
  120. Ok(())
  121. }
  122. #[test]
  123. fn backup() -> config::Result<()> {
  124. const SERVERS: usize = 5;
  125. let cfg = config::make_config(SERVERS, false);
  126. defer!(cfg.cleanup());
  127. cfg.begin(
  128. "Test (2B): leader backs up quickly over incorrect follower logs",
  129. );
  130. cfg.one(thread_rng().gen(), SERVERS, true)?;
  131. // put leader and one follower in a partition
  132. let leader1 = cfg.check_one_leader()?;
  133. cfg.disconnect((leader1 + 2) % SERVERS);
  134. cfg.disconnect((leader1 + 3) % SERVERS);
  135. cfg.disconnect((leader1 + 4) % SERVERS);
  136. // submit lots of commands that won't commit
  137. for _ in 0..SERVERS {
  138. cfg.leader_start(leader1, thread_rng().gen());
  139. }
  140. config::sleep_election_timeouts(2);
  141. cfg.disconnect((leader1 + 0) % SERVERS);
  142. cfg.disconnect((leader1 + 1) % SERVERS);
  143. // allow other partition to recover
  144. cfg.connect((leader1 + 2) % SERVERS);
  145. cfg.connect((leader1 + 3) % SERVERS);
  146. cfg.connect((leader1 + 4) % SERVERS);
  147. // lots of successful commands to new group.
  148. for _ in 0..50 {
  149. cfg.one(thread_rng().gen(), 3, true)?;
  150. }
  151. // now another partitioned leader and one follower
  152. let leader2 = cfg.check_one_leader()?;
  153. let mut other = (leader1 + 2) % SERVERS;
  154. if leader2 == other {
  155. other = (leader2 + 1) % SERVERS;
  156. }
  157. cfg.disconnect(other);
  158. // lots more commands that won't commit
  159. for _ in 0..50 {
  160. cfg.leader_start(leader2, thread_rng().gen());
  161. }
  162. config::sleep_election_timeouts(2);
  163. // bring original leader back to life,
  164. for i in 0..SERVERS {
  165. cfg.disconnect(i);
  166. }
  167. cfg.connect((leader1 + 0) % SERVERS);
  168. cfg.connect((leader1 + 1) % SERVERS);
  169. cfg.connect(other);
  170. // lots of successful commands to new group.
  171. for _ in 0..50 {
  172. cfg.one(thread_rng().gen(), 3, true)?;
  173. }
  174. // now everyone
  175. for i in 0..SERVERS {
  176. cfg.connect(i);
  177. }
  178. cfg.one(thread_rng().gen(), SERVERS, true)?;
  179. cfg.end();
  180. Ok(())
  181. }
  182. #[test]
  183. fn count() -> config::Result<()> {
  184. const SERVERS: usize = 3;
  185. let cfg = config::make_config(SERVERS, false);
  186. defer!(cfg.cleanup());
  187. cfg.begin("Test (2B): RPC counts aren't too high");
  188. cfg.check_one_leader()?;
  189. let total = cfg.total_rpcs();
  190. assert!(
  191. (1..=30).contains(&total),
  192. "too many or few RPCs ({}) to elect initial leader",
  193. total
  194. );
  195. let mut retries = 0;
  196. let (success, total) = loop {
  197. if retries == 5 {
  198. break (false, 0);
  199. }
  200. if retries != 0 {
  201. config::sleep_millis(3000);
  202. }
  203. retries += 1;
  204. let leader = cfg.check_one_leader()?;
  205. let start_total = cfg.total_rpcs();
  206. const ITERS: usize = 10;
  207. let (term, start_index) = match cfg.leader_start(leader, 1) {
  208. Some(pair) => pair,
  209. None => continue,
  210. };
  211. let mut cmds = vec![];
  212. for i in 1..(ITERS + 2) {
  213. let cmd: i32 = thread_rng().gen();
  214. cmds.push(cmd);
  215. let index = match cfg.leader_start(leader, cmd) {
  216. Some((new_term, index)) => {
  217. if new_term == term {
  218. Some(index)
  219. } else {
  220. None
  221. }
  222. }
  223. None => None,
  224. };
  225. if let Some(index) = index {
  226. assert_eq!(start_index + i, index, "start() failed");
  227. } else {
  228. retries = 0;
  229. break;
  230. }
  231. }
  232. if retries == 0 {
  233. continue;
  234. }
  235. for i in 1..(ITERS + 1) {
  236. let cmd = cfg.wait(start_index + i, SERVERS, Some(term))?;
  237. if let Some(cmd) = cmd {
  238. assert_eq!(
  239. cmd,
  240. cmds[i - 1],
  241. "wrong value {} committed for index {}; expected {:?}",
  242. cmd,
  243. start_index + i,
  244. cmds
  245. )
  246. } else {
  247. retries = 0;
  248. break;
  249. }
  250. }
  251. if term != cfg.check_terms()?.expect("terms should be agreed on") {
  252. retries = 0;
  253. }
  254. if retries == 0 {
  255. continue;
  256. }
  257. let diff = cfg.total_rpcs() - start_total;
  258. if diff > (ITERS + 1 + 3) * 3 {
  259. panic!("too many RPCs ({}) for {} entries", diff, ITERS);
  260. }
  261. break (true, cfg.total_rpcs());
  262. };
  263. assert!(success, "term change too often");
  264. config::sleep_election_timeouts(1);
  265. let diff = cfg.total_rpcs() - total;
  266. assert!(
  267. diff < 3 * 20,
  268. "too many RPCs ({}) for 1 second of idleness",
  269. diff
  270. );
  271. cfg.end();
  272. Ok(())
  273. }