//! Agreement tests for the Raft implementation (lab 2B).
// Crate-level dependencies for these integration tests. `anyhow` and
// `scopeguard` are pulled in with `#[macro_use]` for their macros
// (notably `defer!`, used below to guarantee cluster cleanup).
#[macro_use]
extern crate anyhow;
extern crate bytes;
extern crate labrpc;
extern crate ruaft;
#[macro_use]
extern crate scopeguard;

use rand::{thread_rng, Rng};

// This is to remove the annoying "unused code in config" warnings.
pub mod config;
  11. #[test]
  12. fn basic_agreement() -> config::Result<()> {
  13. const SERVERS: usize = 5;
  14. let cfg = config::make_config(SERVERS, false);
  15. defer!(cfg.cleanup());
  16. cfg.begin("Test (2B): basic agreement");
  17. for index in 1..4 {
  18. let committed = cfg.committed_count(index)?;
  19. assert_eq!(0, committed.0, "some have committed before start()");
  20. let commit_index = cfg.one(index as i32 * 100, SERVERS, false)?;
  21. assert_eq!(
  22. index, commit_index,
  23. "got index {} but expected {}",
  24. commit_index, index
  25. );
  26. }
  27. cfg.end();
  28. Ok(())
  29. }
  30. #[test]
  31. fn fail_agree() -> config::Result<()> {
  32. const SERVERS: usize = 3;
  33. let cfg = config::make_config(SERVERS, false);
  34. defer!(cfg.cleanup());
  35. cfg.begin("Test (2B): agreement despite follower disconnection");
  36. cfg.one(101, SERVERS, false)?;
  37. // follower network disconnection
  38. let leader = cfg.check_one_leader()?;
  39. cfg.disconnect((leader + 1) % SERVERS);
  40. // agree despite one disconnected server?
  41. cfg.one(102, SERVERS - 1, false)?;
  42. cfg.one(103, SERVERS - 1, false)?;
  43. config::sleep_election_timeouts(1);
  44. cfg.one(104, SERVERS - 1, false)?;
  45. cfg.one(105, SERVERS - 1, false)?;
  46. // re-connect
  47. cfg.connect((leader + 1) % SERVERS);
  48. // agree with full set of servers?
  49. cfg.one(106, SERVERS, true)?;
  50. config::sleep_election_timeouts(1);
  51. cfg.one(107, SERVERS, true)?;
  52. cfg.end();
  53. Ok(())
  54. }
  55. #[test]
  56. fn fail_no_agree() -> config::Result<()> {
  57. const SERVERS: usize = 5;
  58. let cfg = config::make_config(SERVERS, false);
  59. defer!(cfg.cleanup());
  60. cfg.begin("Test (2B): no agreement if too many followers disconnect");
  61. cfg.one(10, SERVERS, false)?;
  62. // 3 of 5 followers disconnect
  63. let leader = cfg.check_one_leader()?;
  64. cfg.disconnect((leader + 1) % SERVERS);
  65. cfg.disconnect((leader + 2) % SERVERS);
  66. cfg.disconnect((leader + 3) % SERVERS);
  67. let result = cfg.leader_start(leader, 20);
  68. assert!(result.is_some(), "leader rejected start()");
  69. let index = result.unwrap().1;
  70. assert_eq!(2, index, "expected index 2, got {}", index);
  71. config::sleep_election_timeouts(2);
  72. let (commit_count, _) = cfg.committed_count(index)?;
  73. assert_eq!(
  74. 0, commit_count,
  75. "{} committed but no majority",
  76. commit_count
  77. );
  78. // repair
  79. cfg.connect((leader + 1) % SERVERS);
  80. cfg.connect((leader + 2) % SERVERS);
  81. cfg.connect((leader + 3) % SERVERS);
  82. // the disconnected majority may have chosen a leader from
  83. // among their own ranks, forgetting index 2.
  84. let leader2 = cfg.check_one_leader()?;
  85. let result = cfg.leader_start(leader2, 30);
  86. assert!(result.is_some(), "leader2 rejected start()");
  87. let index = result.unwrap().1;
  88. assert!(index == 2 || index == 3, "unexpected index {}", index);
  89. cfg.one(1000, SERVERS, true)?;
  90. cfg.end();
  91. Ok(())
  92. }
  93. #[test]
  94. fn rejoin() -> config::Result<()> {
  95. const SERVERS: usize = 3;
  96. let cfg = config::make_config(SERVERS, false);
  97. defer!(cfg.cleanup());
  98. cfg.begin("Test (2B): rejoin of partitioned leader");
  99. cfg.one(101, SERVERS, true)?;
  100. // leader network failure
  101. let leader1 = cfg.check_one_leader()?;
  102. cfg.disconnect(leader1);
  103. // make old leader try to agree on some entries
  104. cfg.leader_start(leader1, 102);
  105. cfg.leader_start(leader1, 103);
  106. cfg.leader_start(leader1, 104);
  107. // new leader commits, also for index=2
  108. cfg.one(103, 2, true)?;
  109. // new leader network failure
  110. let leader2 = cfg.check_one_leader()?;
  111. cfg.disconnect(leader2);
  112. // old leader connected again
  113. cfg.connect(leader1);
  114. cfg.one(104, 2, true)?;
  115. // all together now
  116. cfg.connect(leader2);
  117. cfg.one(105, SERVERS, true)?;
  118. cfg.end();
  119. Ok(())
  120. }
  121. #[test]
  122. fn backup() -> config::Result<()> {
  123. const SERVERS: usize = 5;
  124. let cfg = config::make_config(SERVERS, false);
  125. defer!(cfg.cleanup());
  126. cfg.begin(
  127. "Test (2B): leader backs up quickly over incorrect follower logs",
  128. );
  129. cfg.one(thread_rng().gen(), SERVERS, true)?;
  130. // put leader and one follower in a partition
  131. let leader1 = cfg.check_one_leader()?;
  132. cfg.disconnect((leader1 + 2) % SERVERS);
  133. cfg.disconnect((leader1 + 3) % SERVERS);
  134. cfg.disconnect((leader1 + 4) % SERVERS);
  135. // submit lots of commands that won't commit
  136. for _ in 0..SERVERS {
  137. cfg.leader_start(leader1, thread_rng().gen());
  138. }
  139. config::sleep_election_timeouts(2);
  140. cfg.disconnect((leader1 + 0) % SERVERS);
  141. cfg.disconnect((leader1 + 1) % SERVERS);
  142. // allow other partition to recover
  143. cfg.connect((leader1 + 2) % SERVERS);
  144. cfg.connect((leader1 + 3) % SERVERS);
  145. cfg.connect((leader1 + 4) % SERVERS);
  146. // lots of successful commands to new group.
  147. for _ in 0..50 {
  148. cfg.one(thread_rng().gen(), 3, true)?;
  149. }
  150. // now another partitioned leader and one follower
  151. let leader2 = cfg.check_one_leader()?;
  152. let mut other = (leader1 + 2) % SERVERS;
  153. if leader2 == other {
  154. other = (leader2 + 1) % SERVERS;
  155. }
  156. cfg.disconnect(other);
  157. // lots more commands that won't commit
  158. for _ in 0..50 {
  159. cfg.leader_start(leader2, thread_rng().gen());
  160. }
  161. config::sleep_election_timeouts(2);
  162. // bring original leader back to life,
  163. for i in 0..SERVERS {
  164. cfg.disconnect(i);
  165. }
  166. cfg.connect((leader1 + 0) % SERVERS);
  167. cfg.connect((leader1 + 1) % SERVERS);
  168. cfg.connect(other);
  169. // lots of successful commands to new group.
  170. for _ in 0..50 {
  171. cfg.one(thread_rng().gen(), 3, true)?;
  172. }
  173. // now everyone
  174. for i in 0..SERVERS {
  175. cfg.connect(i);
  176. }
  177. cfg.one(thread_rng().gen(), SERVERS, true)?;
  178. cfg.end();
  179. Ok(())
  180. }
  181. #[test]
  182. fn count() -> config::Result<()> {
  183. const SERVERS: usize = 3;
  184. let cfg = config::make_config(SERVERS, false);
  185. defer!(cfg.cleanup());
  186. cfg.begin("Test (2B): RPC counts aren't too high");
  187. cfg.check_one_leader()?;
  188. let total = cfg.total_rpcs();
  189. assert!(
  190. total >= 1 && total <= 30,
  191. "too many or few RPCs ({}) to elect initial leader",
  192. total
  193. );
  194. let mut retries = 0;
  195. let (success, total) = loop {
  196. if retries == 5 {
  197. break (false, 0);
  198. }
  199. if retries != 0 {
  200. config::sleep_millis(3000);
  201. }
  202. retries += 1;
  203. let leader = cfg.check_one_leader()?;
  204. let start_total = cfg.total_rpcs();
  205. const ITERS: usize = 10;
  206. let (term, start_index) = match cfg.leader_start(leader, 1) {
  207. Some(pair) => pair,
  208. None => continue,
  209. };
  210. let mut cmds = vec![];
  211. for i in 1..(ITERS + 2) {
  212. let cmd: i32 = thread_rng().gen();
  213. cmds.push(cmd);
  214. let index = match cfg.leader_start(leader, cmd) {
  215. Some((new_term, index)) => {
  216. if new_term == term {
  217. Some(index)
  218. } else {
  219. None
  220. }
  221. }
  222. None => None,
  223. };
  224. if let Some(index) = index {
  225. assert_eq!(start_index + i, index, "start() failed");
  226. } else {
  227. retries = 0;
  228. break;
  229. }
  230. }
  231. if retries == 0 {
  232. continue;
  233. }
  234. for i in 1..(ITERS + 1) {
  235. let cmd = cfg.wait(start_index + i, SERVERS, Some(term))?;
  236. if let Some(cmd) = cmd {
  237. assert_eq!(
  238. cmd,
  239. cmds[i - 1],
  240. "wrong value {} committed for index {}; expected {:?}",
  241. cmd,
  242. start_index + i,
  243. cmds
  244. )
  245. } else {
  246. retries = 0;
  247. break;
  248. }
  249. }
  250. if term != cfg.check_terms()?.expect("terms should be agreed on") {
  251. retries = 0;
  252. }
  253. if retries == 0 {
  254. continue;
  255. }
  256. let diff = cfg.total_rpcs() - start_total;
  257. if diff > (ITERS + 1 + 3) * 3 {
  258. panic!("too many RPCs ({}) for {} entries", diff, ITERS);
  259. }
  260. break (true, cfg.total_rpcs());
  261. };
  262. assert!(success, "term change too often");
  263. config::sleep_election_timeouts(1);
  264. let diff = cfg.total_rpcs() - total;
  265. assert!(
  266. diff < 3 * 20,
  267. "too many RPCs ({}) for 1 second of idleness",
  268. diff
  269. );
  270. cfg.end();
  271. Ok(())
  272. }