// agreement_tests.rs — Raft lab 2B agreement tests.
  1. #![allow(clippy::identity_op)]
  2. use rand::{thread_rng, Rng};
  3. use scopeguard::defer;
  4. use test_configs::{make_config, raft::config};
  5. #[test]
  6. fn basic_agreement() -> config::Result<()> {
  7. const SERVERS: usize = 5;
  8. let cfg = make_config!(SERVERS, false);
  9. defer!(cfg.cleanup());
  10. cfg.begin("Test (2B): basic agreement");
  11. for index in 1..4 {
  12. let committed = cfg.committed_count(index)?;
  13. assert_eq!(0, committed.0, "some have committed before start()");
  14. let commit_index = cfg.one(index as i32 * 100, SERVERS, false)?;
  15. assert_eq!(
  16. index, commit_index,
  17. "got index {} but expected {}",
  18. commit_index, index
  19. );
  20. }
  21. cfg.end();
  22. Ok(())
  23. }
  24. #[test]
  25. fn fail_agree() -> config::Result<()> {
  26. const SERVERS: usize = 3;
  27. let cfg = make_config!(SERVERS, false);
  28. defer!(cfg.cleanup());
  29. cfg.begin("Test (2B): agreement despite follower disconnection");
  30. cfg.one(101, SERVERS, false)?;
  31. // follower network disconnection
  32. let leader = cfg.check_one_leader()?;
  33. cfg.disconnect((leader + 1) % SERVERS);
  34. // agree despite one disconnected server?
  35. cfg.one(102, SERVERS - 1, false)?;
  36. cfg.one(103, SERVERS - 1, false)?;
  37. config::sleep_election_timeouts(1);
  38. cfg.one(104, SERVERS - 1, false)?;
  39. cfg.one(105, SERVERS - 1, false)?;
  40. // re-connect
  41. cfg.connect((leader + 1) % SERVERS);
  42. // agree with full set of servers?
  43. cfg.one(106, SERVERS, true)?;
  44. config::sleep_election_timeouts(1);
  45. cfg.one(107, SERVERS, true)?;
  46. cfg.end();
  47. Ok(())
  48. }
  49. #[test]
  50. fn fail_no_agree() -> config::Result<()> {
  51. const SERVERS: usize = 5;
  52. let cfg = make_config!(SERVERS, false);
  53. defer!(cfg.cleanup());
  54. cfg.begin("Test (2B): no agreement if too many followers disconnect");
  55. cfg.one(10, SERVERS, false)?;
  56. // 3 of 5 followers disconnect
  57. let leader = cfg.check_one_leader()?;
  58. cfg.disconnect((leader + 1) % SERVERS);
  59. cfg.disconnect((leader + 2) % SERVERS);
  60. cfg.disconnect((leader + 3) % SERVERS);
  61. let result = cfg.leader_start(leader, 20);
  62. assert!(result.is_some(), "leader rejected start()");
  63. let index = result.unwrap().1;
  64. assert_eq!(2, index, "expected index 2, got {}", index);
  65. config::sleep_election_timeouts(2);
  66. let (commit_count, _) = cfg.committed_count(index)?;
  67. assert_eq!(
  68. 0, commit_count,
  69. "{} committed but no majority",
  70. commit_count
  71. );
  72. // repair
  73. cfg.connect((leader + 1) % SERVERS);
  74. cfg.connect((leader + 2) % SERVERS);
  75. cfg.connect((leader + 3) % SERVERS);
  76. // the disconnected majority may have chosen a leader from
  77. // among their own ranks, forgetting index 2.
  78. let leader2 = cfg.check_one_leader()?;
  79. let result = cfg.leader_start(leader2, 30);
  80. assert!(result.is_some(), "leader2 rejected start()");
  81. let index = result.unwrap().1;
  82. assert!(index == 2 || index == 3, "unexpected index {}", index);
  83. cfg.one(1000, SERVERS, true)?;
  84. cfg.end();
  85. Ok(())
  86. }
/// Test (2B): rejoin of a partitioned leader.
///
/// An old leader is cut off and fed entries that can never commit
/// (it is alone in its partition); meanwhile a new leader commits at
/// the same indices. When the old leader rejoins, its conflicting
/// uncommitted entries must be superseded by the committed log.
#[test]
fn rejoin() -> config::Result<()> {
    const SERVERS: usize = 3;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());
    cfg.begin("Test (2B): rejoin of partitioned leader");
    cfg.one(101, SERVERS, true)?;
    // leader network failure
    let leader1 = cfg.check_one_leader()?;
    cfg.disconnect(leader1);
    // make old leader try to agree on some entries — these can never
    // commit, since the old leader has no majority
    cfg.leader_start(leader1, 102);
    cfg.leader_start(leader1, 103);
    cfg.leader_start(leader1, 104);
    // new leader commits, also for index=2 — overwriting what the old
    // leader put there
    cfg.one(103, 2, true)?;
    // new leader network failure
    let leader2 = cfg.check_one_leader()?;
    cfg.disconnect(leader2);
    // old leader connected again; it must adopt the committed log
    cfg.connect(leader1);
    cfg.one(104, 2, true)?;
    // all together now
    cfg.connect(leader2);
    cfg.one(105, SERVERS, true)?;
    cfg.end();
    Ok(())
}
  115. #[test]
  116. fn backup() -> config::Result<()> {
  117. const SERVERS: usize = 5;
  118. let cfg = make_config!(SERVERS, false);
  119. defer!(cfg.cleanup());
  120. cfg.begin(
  121. "Test (2B): leader backs up quickly over incorrect follower logs",
  122. );
  123. cfg.one(thread_rng().gen(), SERVERS, true)?;
  124. // put leader and one follower in a partition
  125. let leader1 = cfg.check_one_leader()?;
  126. cfg.disconnect((leader1 + 2) % SERVERS);
  127. cfg.disconnect((leader1 + 3) % SERVERS);
  128. cfg.disconnect((leader1 + 4) % SERVERS);
  129. // submit lots of commands that won't commit
  130. for _ in 0..SERVERS {
  131. cfg.leader_start(leader1, thread_rng().gen());
  132. }
  133. config::sleep_election_timeouts(2);
  134. cfg.disconnect((leader1 + 0) % SERVERS);
  135. cfg.disconnect((leader1 + 1) % SERVERS);
  136. // allow other partition to recover
  137. cfg.connect((leader1 + 2) % SERVERS);
  138. cfg.connect((leader1 + 3) % SERVERS);
  139. cfg.connect((leader1 + 4) % SERVERS);
  140. // lots of successful commands to new group.
  141. for _ in 0..50 {
  142. cfg.one(thread_rng().gen(), 3, true)?;
  143. }
  144. // now another partitioned leader and one follower
  145. let leader2 = cfg.check_one_leader()?;
  146. let mut other = (leader1 + 2) % SERVERS;
  147. if leader2 == other {
  148. other = (leader2 + 1) % SERVERS;
  149. }
  150. cfg.disconnect(other);
  151. // lots more commands that won't commit
  152. for _ in 0..50 {
  153. cfg.leader_start(leader2, thread_rng().gen());
  154. }
  155. config::sleep_election_timeouts(2);
  156. // bring original leader back to life,
  157. for i in 0..SERVERS {
  158. cfg.disconnect(i);
  159. }
  160. cfg.connect((leader1 + 0) % SERVERS);
  161. cfg.connect((leader1 + 1) % SERVERS);
  162. cfg.connect(other);
  163. // lots of successful commands to new group.
  164. for _ in 0..50 {
  165. cfg.one(thread_rng().gen(), 3, true)?;
  166. }
  167. // now everyone
  168. for i in 0..SERVERS {
  169. cfg.connect(i);
  170. }
  171. cfg.one(thread_rng().gen(), SERVERS, true)?;
  172. cfg.end();
  173. Ok(())
  174. }
/// Test (2B): RPC counts aren't too high.
///
/// Checks that electing the initial leader, replicating a batch of
/// entries, and sitting idle each use a bounded number of RPCs.
/// Replication is attempted up to 5 times because the leader may lose
/// leadership mid-batch; `retries = 0` is used as an in-band "attempt
/// failed, start over" sentinel inside the loop.
#[test]
fn count() -> config::Result<()> {
    const SERVERS: usize = 3;
    let cfg = make_config!(SERVERS, false);
    defer!(cfg.cleanup());
    cfg.begin("Test (2B): RPC counts aren't too high");
    cfg.check_one_leader()?;
    // RPC budget for the initial election.
    let total = cfg.total_rpcs();
    assert!(
        (1..=30).contains(&total),
        "too many or few RPCs ({}) to elect initial leader",
        total
    );
    let mut retries = 0;
    // Each loop iteration is one replication attempt. On success it
    // breaks with (true, rpc count); after 5 consecutive attempts it
    // gives up with (false, 0).
    // NOTE(review): the failure paths below reset `retries` to 0, which
    // restarts the 5-attempt budget (and skips the back-off sleep on the
    // next iteration), so constant term churn could retry indefinitely —
    // confirm this deviation from a fixed retry budget is intended.
    let (success, total) = loop {
        if retries == 5 {
            break (false, 0);
        }
        // Back off before every attempt except the first.
        if retries != 0 {
            config::sleep_millis(3000);
        }
        retries += 1;
        let leader = cfg.check_one_leader()?;
        let start_total = cfg.total_rpcs();
        const ITERS: usize = 10;
        // Seed entry; its (term, index) anchors the batch that follows.
        let (term, start_index) = match cfg.leader_start(leader, 1) {
            Some(pair) => pair,
            None => continue,
        };
        let mut cmds = vec![];
        // Submit ITERS + 1 random commands; each must be accepted in the
        // same term at consecutive indices, else the attempt fails.
        for i in 1..(ITERS + 2) {
            let cmd: i32 = thread_rng().gen();
            cmds.push(cmd);
            let index = match cfg.leader_start(leader, cmd) {
                Some((new_term, index)) => {
                    if new_term == term {
                        Some(index)
                    } else {
                        // Term moved on: the leader lost leadership.
                        None
                    }
                }
                None => None,
            };
            if let Some(index) = index {
                assert_eq!(start_index + i, index, "start() failed");
            } else {
                // Leadership changed mid-batch — flag failure and retry.
                retries = 0;
                break;
            }
        }
        if retries == 0 {
            continue;
        }
        // Wait for the first ITERS entries to commit on all servers and
        // carry the values we submitted.
        for i in 1..(ITERS + 1) {
            let cmd = cfg.wait(start_index + i, SERVERS, Some(term))?;
            if let Some(cmd) = cmd {
                assert_eq!(
                    cmd,
                    cmds[i - 1],
                    "wrong value {} committed for index {}; expected {:?}",
                    cmd,
                    start_index + i,
                    cmds
                )
            } else {
                // wait() saw a term change — flag failure and retry.
                retries = 0;
                break;
            }
        }
        if term != cfg.check_terms()?.expect("terms should be agreed on") {
            retries = 0;
        }
        if retries == 0 {
            continue;
        }
        // RPC budget for replicating the batch.
        let diff = cfg.total_rpcs() - start_total;
        if diff > (ITERS + 1 + 3) * 3 {
            panic!("too many RPCs ({}) for {} entries", diff, ITERS);
        }
        break (true, cfg.total_rpcs());
    };
    assert!(success, "term change too often");
    // RPC budget while idle: heartbeats only.
    config::sleep_election_timeouts(1);
    let diff = cfg.total_rpcs() - total;
    assert!(
        diff < 3 * 20,
        "too many RPCs ({}) for 1 second of idleness",
        diff
    );
    cfg.end();
    Ok(())
}
  266. }