agreement_tests.rs

#![allow(clippy::identity_op)]
#![allow(clippy::uninlined_format_args)]

use rand::{thread_rng, Rng};
use scopeguard::defer;
use test_configs::utils::{sleep_election_timeouts, sleep_millis};
use test_configs::{make_config, raft::config};
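
/// Checks that no entry is committed before the first call to start(), and
/// that the first few commands each commit at the expected log index on all
/// servers.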
#[test]
fn basic_agreement() -> config::Result<()> {
    const SERVERS: usize = 5;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());

    cfg.begin("Test (2B): basic agreement");

    for index in 1..4 {
        let committed = cfg.committed_count(index)?;
        assert_eq!(0, committed.0, "some have committed before start()");

        let commit_index = cfg.one(index as i32 * 100, SERVERS, false)?;
        assert_eq!(
            index, commit_index,
            "got index {} but expected {}",
            commit_index, index
        );
    }

    cfg.end();
    Ok(())
}
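
/// Checks that a majority can keep committing entries while one follower is
/// disconnected, and that agreement over the full cluster resumes once the
/// follower reconnects.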
#[test]
fn fail_agree() -> config::Result<()> {
    const SERVERS: usize = 3;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());

    cfg.begin("Test (2B): agreement despite follower disconnection");

    cfg.one(101, SERVERS, false)?;

    // follower network disconnection
    let leader = cfg.check_one_leader()?;
    cfg.disconnect((leader + 1) % SERVERS);

    // agree despite one disconnected server?
    cfg.one(102, SERVERS - 1, false)?;
    cfg.one(103, SERVERS - 1, false)?;
    sleep_election_timeouts(1);
    cfg.one(104, SERVERS - 1, false)?;
    cfg.one(105, SERVERS - 1, false)?;

    // re-connect
    cfg.connect((leader + 1) % SERVERS);

    // agree with full set of servers?
    cfg.one(106, SERVERS, true)?;
    sleep_election_timeouts(1);
    cfg.one(107, SERVERS, true)?;

    cfg.end();
    Ok(())
}
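
/// Checks that an entry handed to the leader does not commit while a majority
/// of servers is disconnected, and that agreement resumes after the partition
/// is repaired.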
#[test]
fn fail_no_agree() -> config::Result<()> {
    const SERVERS: usize = 5;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());

    cfg.begin("Test (2B): no agreement if too many followers disconnect");

    cfg.one(10, SERVERS, false)?;

    // 3 of 5 servers disconnect
    let leader = cfg.check_one_leader()?;
    cfg.disconnect((leader + 1) % SERVERS);
    cfg.disconnect((leader + 2) % SERVERS);
    cfg.disconnect((leader + 3) % SERVERS);

    let result = cfg.leader_start(leader, 20);
    assert!(result.is_some(), "leader rejected start()");
    let index = result.unwrap().1;
    assert_eq!(2, index, "expected index 2, got {}", index);

    sleep_election_timeouts(2);

    let (commit_count, _) = cfg.committed_count(index)?;
    assert_eq!(
        0, commit_count,
        "{} committed but no majority",
        commit_count
    );

    // repair
    cfg.connect((leader + 1) % SERVERS);
    cfg.connect((leader + 2) % SERVERS);
    cfg.connect((leader + 3) % SERVERS);

    // the disconnected majority may have chosen a leader from
    // among their own ranks, forgetting index 2.
    let leader2 = cfg.check_one_leader()?;
    let result = cfg.leader_start(leader2, 30);
    assert!(result.is_some(), "leader2 rejected start()");
    let index = result.unwrap().1;
    assert!((2..=4).contains(&index), "unexpected index {}", index);

    cfg.one(1000, SERVERS, true)?;

    cfg.end();
    Ok(())
}
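
/// Checks that a leader partitioned away with uncommitted entries (102..=104)
/// can rejoin the cluster, discard those conflicting entries, and converge
/// with everyone else.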
#[test]
fn rejoin() -> config::Result<()> {
    const SERVERS: usize = 3;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());

    cfg.begin("Test (2B): rejoin of partitioned leader");

    cfg.one(101, SERVERS, true)?;

    // leader network failure
    let leader1 = cfg.check_one_leader()?;
    cfg.disconnect(leader1);

    // make old leader try to agree on some entries
    cfg.leader_start(leader1, 102);
    cfg.leader_start(leader1, 103);
    cfg.leader_start(leader1, 104);

    // new leader commits, also for index=2
    cfg.one(103, 2, true)?;

    // new leader network failure
    let leader2 = cfg.check_one_leader()?;
    cfg.disconnect(leader2);

    // old leader connected again
    cfg.connect(leader1);
    cfg.one(104, 2, true)?;

    // all together now
    cfg.connect(leader2);
    cfg.one(105, SERVERS, true)?;

    cfg.end();
    Ok(())
}
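
/// Checks that a leader catches a follower up quickly when the follower's log
/// contains a long tail of conflicting, uncommitted entries, rather than
/// backing up one entry per round trip.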
#[test]
fn backup() -> config::Result<()> {
    const SERVERS: usize = 5;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());

    cfg.begin("Test (2B): leader backs up quickly over incorrect follower logs");

    cfg.one(thread_rng().gen(), SERVERS, true)?;

    // put leader and one follower in a partition
    let leader1 = cfg.check_one_leader()?;
    cfg.disconnect((leader1 + 2) % SERVERS);
    cfg.disconnect((leader1 + 3) % SERVERS);
    cfg.disconnect((leader1 + 4) % SERVERS);

    // submit lots of commands that won't commit
    for _ in 0..SERVERS {
        cfg.leader_start(leader1, thread_rng().gen());
    }

    sleep_election_timeouts(2);

    cfg.disconnect((leader1 + 0) % SERVERS);
    cfg.disconnect((leader1 + 1) % SERVERS);

    // allow other partition to recover
    cfg.connect((leader1 + 2) % SERVERS);
    cfg.connect((leader1 + 3) % SERVERS);
    cfg.connect((leader1 + 4) % SERVERS);

    // lots of successful commands to new group.
    for _ in 0..50 {
        cfg.one(thread_rng().gen(), 3, true)?;
    }

    // now another partitioned leader and one follower
    let leader2 = cfg.check_one_leader()?;
    let mut other = (leader1 + 2) % SERVERS;
    if leader2 == other {
        other = (leader2 + 1) % SERVERS;
    }
    cfg.disconnect(other);

    // lots more commands that won't commit
    for _ in 0..50 {
        cfg.leader_start(leader2, thread_rng().gen());
    }

    sleep_election_timeouts(2);

    // bring original leader back to life
    for i in 0..SERVERS {
        cfg.disconnect(i);
    }
    cfg.connect((leader1 + 0) % SERVERS);
    cfg.connect((leader1 + 1) % SERVERS);
    cfg.connect(other);

    // lots of successful commands to new group.
    for _ in 0..50 {
        cfg.one(thread_rng().gen(), 3, true)?;
    }

    // now everyone
    for i in 0..SERVERS {
        cfg.connect(i);
    }
    cfg.one(thread_rng().gen(), SERVERS, true)?;

    cfg.end();
    Ok(())
}
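
/// Checks that electing a leader and committing a batch of entries does not
/// take an unreasonable number of RPCs, and that an idle cluster sends only a
/// handful of RPCs per second.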
#[test]
fn count() -> config::Result<()> {
    const SERVERS: usize = 3;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());

    cfg.begin("Test (2B): RPC counts aren't too high");

    cfg.check_one_leader()?;
    let total = cfg.total_rpcs();
    assert!(
        (1..=10).contains(&total),
        "too many or too few RPCs ({}) to elect initial leader",
        total
    );

    let mut retries = 0;
    let (success, total) = loop {
        if retries == 5 {
            break (false, 0);
        }
        if retries != 0 {
            // give the solution some time to settle
            sleep_millis(3000);
        }
        retries += 1;

        let leader = cfg.check_one_leader()?;
        let start_total = cfg.total_rpcs();

        const ITERS: usize = 10;
        let Some((term, start_index)) = cfg.leader_start(leader, 1) else {
            // leader moved on really quickly
            continue;
        };

        let mut cmds = vec![];
        let mut failed = false;
        for i in 1..(ITERS + 2) {
            let cmd: i32 = thread_rng().gen();
            cmds.push(cmd);
            match cfg.leader_start(leader, cmd) {
                Some((new_term, index)) if new_term == term => {
                    assert_eq!(start_index + i, index, "start() failed");
                }
                _ => {
                    // term changed while starting -- try again
                    failed = true;
                    break;
                }
            }
        }
        if failed {
            continue;
        }

        for i in 1..(ITERS + 1) {
            let cmd = cfg.wait(start_index + i, SERVERS, Some(term))?;
            if let Some(cmd) = cmd {
                assert_eq!(
                    cmd,
                    cmds[i - 1],
                    "wrong value {} committed for index {}; expected {:?}",
                    cmd,
                    start_index + i,
                    cmds
                );
            } else {
                // term changed while waiting -- try again
                failed = true;
                break;
            }
        }
        if failed {
            continue;
        }

        if term != cfg.check_terms()?.expect("terms should be agreed on") {
            // term changed -- try again
            continue;
        }

        let diff = cfg.total_rpcs() - start_total;
        if diff > (ITERS + 1 + 3) * (SERVERS - 1) {
            panic!("too many RPCs ({}) for {} entries", diff, ITERS);
        }
        break (true, cfg.total_rpcs());
    };
    assert!(success, "term changed too often");

    sleep_election_timeouts(1);

    let diff = cfg.total_rpcs() - total;
    assert!(
        diff < 20,
        "too many RPCs ({}) for 1 second of idleness",
        diff
    );

    cfg.end();
    Ok(())
}