server.rs 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. use std::collections::hash_map::Entry;
  2. use std::collections::HashMap;
  3. use std::sync::atomic::{AtomicUsize, Ordering};
  4. use std::sync::mpsc::{channel, Receiver};
  5. use std::sync::Arc;
  6. use std::time::Duration;
  7. use parking_lot::{Condvar, Mutex};
  8. use ruaft::{ApplyCommandMessage, Persister, Raft, RpcClient, Term};
  9. use crate::common::{
  10. ClerkId, GetArgs, GetReply, KVError, PutAppendArgs, PutAppendEnum,
  11. PutAppendReply, UniqueId,
  12. };
  13. use crate::snapshot_holder::SnapshotHolder;
  14. pub struct KVServer {
  15. me: AtomicUsize,
  16. state: Mutex<KVServerState>,
  17. rf: Mutex<Raft<UniqueKVOp>>,
  18. // snapshot
  19. }
  20. #[derive(Clone, Default, Serialize, Deserialize)]
  21. pub struct UniqueKVOp {
  22. op: KVOp,
  23. me: usize,
  24. unique_id: UniqueId,
  25. }
  26. #[derive(Default, Serialize, Deserialize)]
  27. struct KVServerState {
  28. kv: HashMap<String, String>,
  29. debug_kv: HashMap<String, String>,
  30. applied_op: HashMap<ClerkId, (UniqueId, CommitResult)>,
  31. #[serde(skip)]
  32. queries: HashMap<UniqueId, Arc<ResultHolder>>,
  33. }
  34. #[derive(Clone, Serialize, Deserialize)]
  35. enum KVOp {
  36. NoOp,
  37. Get(String),
  38. Put(String, String),
  39. Append(String, String),
  40. }
  41. impl Default for KVOp {
  42. fn default() -> Self {
  43. KVOp::NoOp
  44. }
  45. }
  46. struct ResultHolder {
  47. term: AtomicUsize,
  48. result: Mutex<Result<CommitResult, CommitError>>,
  49. condvar: Condvar,
  50. }
  51. #[derive(Clone, Debug, Serialize, Deserialize)]
  52. enum CommitResult {
  53. Get(Option<String>),
  54. Put,
  55. Append,
  56. }
  57. #[derive(Clone, Debug)]
  58. enum CommitError {
  59. NotLeader,
  60. Expired(UniqueId),
  61. TimedOut,
  62. #[allow(dead_code)]
  63. Conflict,
  64. NotMe(CommitResult),
  65. Duplicate(CommitResult),
  66. }
  67. impl From<CommitError> for KVError {
  68. fn from(err: CommitError) -> Self {
  69. match err {
  70. CommitError::NotLeader => KVError::NotLeader,
  71. CommitError::Expired(_) => KVError::Expired,
  72. CommitError::TimedOut => KVError::TimedOut,
  73. CommitError::Conflict => KVError::Conflict,
  74. CommitError::NotMe(_) => panic!("NotMe is not a KVError"),
  75. CommitError::Duplicate(_) => panic!("Duplicate is not a KVError"),
  76. }
  77. }
  78. }
  79. impl KVServer {
  80. pub fn new(
  81. servers: Vec<RpcClient>,
  82. me: usize,
  83. persister: Arc<dyn Persister>,
  84. max_state_size_bytes: Option<usize>,
  85. ) -> Arc<Self> {
  86. let (tx, rx) = channel();
  87. let apply_command = move |message| {
  88. tx.send(message)
  89. .expect("The receiving end of apply command channel should have not been dropped");
  90. };
  91. let snapshot_holder = Arc::new(SnapshotHolder::default());
  92. let snapshot_holder_clone = snapshot_holder.clone();
  93. let ret = Arc::new(Self {
  94. me: AtomicUsize::new(me),
  95. state: Default::default(),
  96. rf: Mutex::new(Raft::new(
  97. servers,
  98. me,
  99. persister,
  100. apply_command,
  101. max_state_size_bytes,
  102. move |index| snapshot_holder_clone.request_snapshot(index),
  103. )),
  104. });
  105. ret.process_command(snapshot_holder, rx);
  106. ret
  107. }
  108. fn apply_op(&self, unique_id: UniqueId, leader: usize, op: KVOp) {
  109. // The borrow checker does not allow borrowing two fields of an instance
  110. // inside a MutexGuard. But it does allow borrowing two fields of the
  111. // instance itself. Calling deref_mut() on the MutexGuard works, too!
  112. let state = &mut *self.state.lock();
  113. let (applied_op, kv) = (&mut state.applied_op, &mut state.kv);
  114. let entry = applied_op.entry(unique_id.clerk_id);
  115. if let Entry::Occupied(curr) = &entry {
  116. let (applied_unique_id, _) = curr.get();
  117. if *applied_unique_id >= unique_id {
  118. // Redelivered.
  119. // It is guaranteed that we have no pending queries with the
  120. // same unique_id, because
  121. // 1. When inserting into queries, we first check the unique_id
  122. // is strictly larger than the one in applied_op.
  123. // 2. When modifying entries in applied_op, the unique_id can
  124. // only grow larger. And we make sure there is no entries with
  125. // the same unique_id in queries.
  126. // TODO(ditsing): in case 2), make sure there is no entries in
  127. // queries that have a smaller unique_id.
  128. assert!(!state.queries.contains_key(&unique_id));
  129. return;
  130. }
  131. }
  132. let result = match op {
  133. KVOp::NoOp => return,
  134. KVOp::Get(key) => CommitResult::Get(kv.get(&key).cloned()),
  135. KVOp::Put(key, value) => {
  136. kv.insert(key, value);
  137. CommitResult::Put
  138. }
  139. KVOp::Append(key, value) => {
  140. kv.entry(key)
  141. .and_modify(|str| str.push_str(&value))
  142. .or_insert(value);
  143. CommitResult::Append
  144. }
  145. };
  146. match entry {
  147. Entry::Occupied(mut curr) => {
  148. curr.insert((unique_id, result.clone()));
  149. }
  150. Entry::Vacant(vacant) => {
  151. vacant.insert((unique_id, result.clone()));
  152. }
  153. }
  154. if let Some(result_holder) = state.queries.remove(&unique_id) {
  155. // If this KV server might not be the same leader that committed
  156. // this change. We are not sure if it is a duplicate or a conflict.
  157. // To tell the difference, the terms and operations must be stored.
  158. *result_holder.result.lock() = if leader == self.me() {
  159. Ok(result)
  160. } else {
  161. Err(CommitError::NotMe(result))
  162. };
  163. result_holder.condvar.notify_all();
  164. };
  165. }
  166. fn restore_state(&self, mut new_state: KVServerState) {
  167. let mut state = self.state.lock();
  168. std::mem::swap(&mut new_state, &mut *state);
  169. for result_holder in new_state.queries.values() {
  170. *result_holder.result.lock() = Err(CommitError::NotLeader);
  171. result_holder.condvar.notify_all();
  172. }
  173. }
  174. fn process_command(
  175. self: &Arc<Self>,
  176. snapshot_holder: Arc<SnapshotHolder<KVServerState>>,
  177. command_channel: Receiver<ApplyCommandMessage<UniqueKVOp>>,
  178. ) {
  179. let this = Arc::downgrade(self);
  180. std::thread::spawn(move || {
  181. while let Ok(message) = command_channel.recv() {
  182. if let Some(this) = this.upgrade() {
  183. match message {
  184. ApplyCommandMessage::Snapshot(snapshot) => {
  185. let state = snapshot_holder.load_snapshot(snapshot);
  186. this.restore_state(state);
  187. }
  188. ApplyCommandMessage::Command(index, command) => {
  189. this.apply_op(
  190. command.unique_id,
  191. command.me,
  192. command.op,
  193. );
  194. if let Some(snapshot) = snapshot_holder
  195. .take_snapshot(&this.state.lock(), index)
  196. {
  197. this.rf.lock().save_snapshot(snapshot);
  198. snapshot_holder.unblock_response(index);
  199. }
  200. }
  201. }
  202. } else {
  203. break;
  204. }
  205. }
  206. });
  207. }
  208. const UNSEEN_TERM: usize = 0;
  209. const ATTEMPTING_TERM: usize = usize::MAX;
  210. fn block_for_commit(
  211. &self,
  212. unique_id: UniqueId,
  213. op: KVOp,
  214. timeout: Duration,
  215. ) -> Result<CommitResult, CommitError> {
  216. let result_holder = {
  217. let mut state = self.state.lock();
  218. let applied = state.applied_op.get(&unique_id.clerk_id);
  219. if let Some((applied_unique_id, result)) = applied {
  220. if unique_id < *applied_unique_id {
  221. return Err(CommitError::Expired(unique_id));
  222. } else if unique_id == *applied_unique_id {
  223. return Err(CommitError::Duplicate(result.clone()));
  224. }
  225. };
  226. let entry = state.queries.entry(unique_id).or_insert_with(|| {
  227. Arc::new(ResultHolder {
  228. term: AtomicUsize::new(Self::UNSEEN_TERM),
  229. result: Mutex::new(Err(CommitError::TimedOut)),
  230. condvar: Condvar::new(),
  231. })
  232. });
  233. entry.clone()
  234. };
  235. let (Term(hold_term), is_leader) = self.rf.lock().get_state();
  236. if !is_leader {
  237. result_holder.condvar.notify_all();
  238. return Err(CommitError::NotLeader);
  239. }
  240. Self::validate_term(hold_term);
  241. let set = result_holder.term.compare_exchange(
  242. Self::UNSEEN_TERM,
  243. Self::ATTEMPTING_TERM,
  244. Ordering::SeqCst,
  245. Ordering::SeqCst,
  246. );
  247. let start = match set {
  248. // Nobody has attempted start() yet.
  249. Ok(Self::UNSEEN_TERM) => true,
  250. Ok(_) => panic!(
  251. "compare_exchange should always return the current value 0"
  252. ),
  253. // Somebody has attempted, or is attempting, start().
  254. Err(prev_term) => {
  255. let start =
  256. prev_term != Self::ATTEMPTING_TERM && prev_term < hold_term;
  257. if start {
  258. let set = result_holder.term.compare_exchange(
  259. prev_term,
  260. Self::ATTEMPTING_TERM,
  261. Ordering::SeqCst,
  262. Ordering::SeqCst,
  263. );
  264. set.is_ok()
  265. } else {
  266. false
  267. }
  268. }
  269. };
  270. if start {
  271. let op = UniqueKVOp {
  272. op,
  273. me: self.me(),
  274. unique_id,
  275. };
  276. let start = self.rf.lock().start(op);
  277. let start_term =
  278. start.map_or(Self::UNSEEN_TERM, |(Term(term), _)| {
  279. Self::validate_term(term);
  280. term
  281. });
  282. let set = result_holder.term.compare_exchange(
  283. Self::ATTEMPTING_TERM,
  284. start_term,
  285. Ordering::SeqCst,
  286. Ordering::SeqCst,
  287. );
  288. // Setting term must have been successful, and must return the
  289. // value previously set by this attempt.
  290. assert_eq!(set, Ok(Self::ATTEMPTING_TERM));
  291. if start_term == Self::UNSEEN_TERM {
  292. result_holder.condvar.notify_all();
  293. return Err(CommitError::NotLeader);
  294. }
  295. }
  296. let mut guard = result_holder.result.lock();
  297. // Wait for the op to be committed.
  298. result_holder.condvar.wait_for(&mut guard, timeout);
  299. // Copy the result out.
  300. let result = guard.clone();
  301. // If the result is OK, all other requests should see "Duplicate".
  302. if let Ok(result) = guard.clone() {
  303. *guard = Err(CommitError::Duplicate(result))
  304. }
  305. return result;
  306. }
  307. fn validate_term(term: usize) {
  308. assert!(term > Self::UNSEEN_TERM, "Term must be larger than 0.");
  309. assert!(
  310. term < Self::ATTEMPTING_TERM,
  311. "Term must be smaller than usize::MAX."
  312. );
  313. }
  314. const DEFAULT_TIMEOUT: Duration = Duration::from_secs(1);
  315. pub fn get(&self, args: GetArgs) -> GetReply {
  316. let (is_retry, result) = match self.block_for_commit(
  317. args.unique_id,
  318. KVOp::Get(args.key),
  319. Self::DEFAULT_TIMEOUT,
  320. ) {
  321. Ok(result) => (false, result),
  322. Err(CommitError::Duplicate(result)) => (true, result),
  323. Err(CommitError::NotMe(result)) => (true, result),
  324. Err(e) => {
  325. return GetReply {
  326. result: Err(e.into()),
  327. is_retry: false,
  328. }
  329. }
  330. };
  331. let result = match result {
  332. CommitResult::Get(result) => Ok(result),
  333. CommitResult::Put => Err(KVError::Conflict),
  334. CommitResult::Append => Err(KVError::Conflict),
  335. };
  336. GetReply { result, is_retry }
  337. }
  338. pub fn put_append(&self, args: PutAppendArgs) -> PutAppendReply {
  339. let op = match args.op {
  340. PutAppendEnum::Put => KVOp::Put(args.key, args.value),
  341. PutAppendEnum::Append => KVOp::Append(args.key, args.value),
  342. };
  343. let result = match self.block_for_commit(
  344. args.unique_id,
  345. op,
  346. Self::DEFAULT_TIMEOUT,
  347. ) {
  348. Ok(result) => result,
  349. Err(CommitError::Duplicate(result)) => result,
  350. Err(CommitError::NotMe(result)) => result,
  351. Err(e) => {
  352. return PutAppendReply {
  353. result: Err(e.into()),
  354. }
  355. }
  356. };
  357. let result = match result {
  358. CommitResult::Put => {
  359. if args.op == PutAppendEnum::Put {
  360. Ok(())
  361. } else {
  362. Err(KVError::Conflict)
  363. }
  364. }
  365. CommitResult::Append => {
  366. if args.op == PutAppendEnum::Append {
  367. Ok(())
  368. } else {
  369. Err(KVError::Conflict)
  370. }
  371. }
  372. CommitResult::Get(_) => Err(KVError::Conflict),
  373. };
  374. PutAppendReply { result }
  375. }
  376. pub fn me(&self) -> usize {
  377. self.me.load(Ordering::Relaxed)
  378. }
  379. pub fn raft(&self) -> Raft<UniqueKVOp> {
  380. self.rf.lock().clone()
  381. }
  382. pub fn kill(self: Arc<Self>) {
  383. let rf = self.raft();
  384. // We must drop self to remove the only clone of raft, so that
  385. // `rf.kill()` does not block.
  386. drop(self);
  387. rf.kill();
  388. // The process_command thread will exit, after Raft drops the reference
  389. // to the sender.
  390. }
  391. }