server.rs 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. use std::collections::hash_map::Entry;
  2. use std::collections::HashMap;
  3. use std::sync::atomic::{AtomicUsize, Ordering};
  4. use std::sync::mpsc::{channel, Receiver};
  5. use std::sync::Arc;
  6. use std::time::Duration;
  7. use parking_lot::{Condvar, Mutex};
  8. use ruaft::{ApplyCommandMessage, Persister, Raft, RpcClient, Term};
  9. use crate::common::{
  10. ClerkId, GetArgs, GetReply, KVError, PutAppendArgs, PutAppendEnum,
  11. PutAppendReply, UniqueId,
  12. };
  13. use crate::snapshot_holder::SnapshotHolder;
  14. pub struct KVServer {
  15. me: AtomicUsize,
  16. state: Mutex<KVServerState>,
  17. rf: Mutex<Raft<UniqueKVOp>>,
  18. // snapshot
  19. }
  20. #[derive(Clone, Default, Serialize, Deserialize)]
  21. pub struct UniqueKVOp {
  22. op: KVOp,
  23. me: usize,
  24. unique_id: UniqueId,
  25. }
  26. #[derive(Default, Serialize, Deserialize)]
  27. struct KVServerState {
  28. kv: HashMap<String, String>,
  29. debug_kv: HashMap<String, String>,
  30. applied_op: HashMap<ClerkId, (UniqueId, CommitResult)>,
  31. #[serde(skip)]
  32. queries: HashMap<UniqueId, Arc<ResultHolder>>,
  33. }
  34. #[derive(Clone, Serialize, Deserialize)]
  35. enum KVOp {
  36. NoOp,
  37. Get(String),
  38. Put(String, String),
  39. Append(String, String),
  40. }
  41. impl Default for KVOp {
  42. fn default() -> Self {
  43. KVOp::NoOp
  44. }
  45. }
  46. struct ResultHolder {
  47. term: AtomicUsize,
  48. result: Mutex<Result<CommitResult, CommitError>>,
  49. condvar: Condvar,
  50. }
  51. #[derive(Clone, Debug, Serialize, Deserialize)]
  52. enum CommitResult {
  53. Get(Option<String>),
  54. Put,
  55. Append,
  56. }
  57. #[derive(Clone, Debug)]
  58. enum CommitError {
  59. NotLeader,
  60. Expired(UniqueId),
  61. TimedOut,
  62. Conflict,
  63. NotMe(CommitResult),
  64. Duplicate(CommitResult),
  65. }
  66. impl From<CommitError> for KVError {
  67. fn from(err: CommitError) -> Self {
  68. match err {
  69. CommitError::NotLeader => KVError::NotLeader,
  70. CommitError::Expired(_) => KVError::Expired,
  71. CommitError::TimedOut => KVError::TimedOut,
  72. CommitError::Conflict => KVError::Conflict,
  73. CommitError::NotMe(_) => panic!("NotMe is not a KVError"),
  74. CommitError::Duplicate(_) => panic!("Duplicate is not a KVError"),
  75. }
  76. }
  77. }
  78. impl KVServer {
  79. pub fn new(
  80. servers: Vec<RpcClient>,
  81. me: usize,
  82. persister: Arc<dyn Persister>,
  83. max_state_size_bytes: Option<usize>,
  84. ) -> Arc<Self> {
  85. let (tx, rx) = channel();
  86. let apply_command = move |message| {
  87. tx.send(message)
  88. .expect("The receiving end of apply command channel should have not been dropped");
  89. };
  90. let snapshot_holder = Arc::new(SnapshotHolder::default());
  91. let snapshot_holder_clone = snapshot_holder.clone();
  92. let ret = Arc::new(Self {
  93. me: AtomicUsize::new(me),
  94. state: Default::default(),
  95. rf: Mutex::new(Raft::new(
  96. servers,
  97. me,
  98. persister,
  99. apply_command,
  100. max_state_size_bytes,
  101. move |index| snapshot_holder_clone.request_snapshot(index),
  102. )),
  103. });
  104. ret.process_command(snapshot_holder, rx);
  105. ret
  106. }
  107. fn apply_op(&self, unique_id: UniqueId, leader: usize, op: KVOp) {
  108. // The borrow checker does not allow borrowing two fields of an instance
  109. // inside a MutexGuard. But it does allow borrowing two fields of the
  110. // instance itself. Calling deref_mut() on the MutexGuard works, too!
  111. let state = &mut *self.state.lock();
  112. let (applied_op, kv) = (&mut state.applied_op, &mut state.kv);
  113. let entry = applied_op.entry(unique_id.clerk_id);
  114. if let Entry::Occupied(curr) = &entry {
  115. let (applied_unique_id, _) = curr.get();
  116. if *applied_unique_id >= unique_id {
  117. // Redelivered.
  118. // It is guaranteed that we have no pending queries with the
  119. // same unique_id, because
  120. // 1. When inserting into queries, we first check the unique_id
  121. // is strictly larger than the one in applied_op.
  122. // 2. When modifying entries in applied_op, the unique_id can
  123. // only grow larger. And we make sure there is no entries with
  124. // the same unique_id in queries.
  125. // TODO(ditsing): in case 2), make sure there is no entries in
  126. // queries that have a smaller unique_id.
  127. assert!(!state.queries.contains_key(&unique_id));
  128. return;
  129. }
  130. }
  131. let result = match op {
  132. KVOp::NoOp => return,
  133. KVOp::Get(key) => CommitResult::Get(kv.get(&key).cloned()),
  134. KVOp::Put(key, value) => {
  135. kv.insert(key, value);
  136. CommitResult::Put
  137. }
  138. KVOp::Append(key, value) => {
  139. kv.entry(key)
  140. .and_modify(|str| str.push_str(&value))
  141. .or_insert(value);
  142. CommitResult::Append
  143. }
  144. };
  145. match entry {
  146. Entry::Occupied(mut curr) => {
  147. curr.insert((unique_id, result.clone()));
  148. }
  149. Entry::Vacant(vacant) => {
  150. vacant.insert((unique_id, result.clone()));
  151. }
  152. }
  153. if let Some(result_holder) = state.queries.remove(&unique_id) {
  154. // If this KV server might not be the same leader that committed
  155. // this change. We are not sure if it is a duplicate or a conflict.
  156. // To tell the difference, the terms and operations must be stored.
  157. *result_holder.result.lock() = if leader == self.me() {
  158. Ok(result)
  159. } else {
  160. Err(CommitError::NotMe(result))
  161. };
  162. result_holder.condvar.notify_all();
  163. };
  164. }
  165. fn restore_state(&self, mut new_state: KVServerState) {
  166. let mut state = self.state.lock();
  167. std::mem::swap(&mut new_state, &mut *state);
  168. for result_holder in new_state.queries.values() {
  169. *result_holder.result.lock() = Err(CommitError::NotLeader);
  170. result_holder.condvar.notify_all();
  171. }
  172. }
  173. fn process_command(
  174. self: &Arc<Self>,
  175. snapshot_holder: Arc<SnapshotHolder<KVServerState>>,
  176. command_channel: Receiver<ApplyCommandMessage<UniqueKVOp>>,
  177. ) {
  178. let this = Arc::downgrade(self);
  179. std::thread::spawn(move || {
  180. while let Ok(message) = command_channel.recv() {
  181. if let Some(this) = this.upgrade() {
  182. match message {
  183. ApplyCommandMessage::Snapshot(snapshot) => {
  184. let state = snapshot_holder.load_snapshot(snapshot);
  185. this.restore_state(state);
  186. }
  187. ApplyCommandMessage::Command(index, command) => {
  188. this.apply_op(
  189. command.unique_id,
  190. command.me,
  191. command.op,
  192. );
  193. snapshot_holder
  194. .take_snapshot(&this.state.lock(), index);
  195. snapshot_holder.unblock_response();
  196. }
  197. }
  198. } else {
  199. break;
  200. }
  201. }
  202. });
  203. }
  204. const UNSEEN_TERM: usize = 0;
  205. const ATTEMPTING_TERM: usize = usize::MAX;
  206. fn block_for_commit(
  207. &self,
  208. unique_id: UniqueId,
  209. op: KVOp,
  210. timeout: Duration,
  211. ) -> Result<CommitResult, CommitError> {
  212. let result_holder = {
  213. let mut state = self.state.lock();
  214. let applied = state.applied_op.get(&unique_id.clerk_id);
  215. if let Some((applied_unique_id, result)) = applied {
  216. if unique_id < *applied_unique_id {
  217. return Err(CommitError::Expired(unique_id));
  218. } else if unique_id == *applied_unique_id {
  219. return Err(CommitError::Duplicate(result.clone()));
  220. }
  221. };
  222. let entry = state.queries.entry(unique_id).or_insert_with(|| {
  223. Arc::new(ResultHolder {
  224. term: AtomicUsize::new(Self::UNSEEN_TERM),
  225. result: Mutex::new(Err(CommitError::TimedOut)),
  226. condvar: Condvar::new(),
  227. })
  228. });
  229. entry.clone()
  230. };
  231. let (Term(hold_term), is_leader) = self.rf.lock().get_state();
  232. if !is_leader {
  233. result_holder.condvar.notify_all();
  234. return Err(CommitError::NotLeader);
  235. }
  236. Self::validate_term(hold_term);
  237. let set = result_holder.term.compare_exchange(
  238. Self::UNSEEN_TERM,
  239. Self::ATTEMPTING_TERM,
  240. Ordering::SeqCst,
  241. Ordering::SeqCst,
  242. );
  243. let start = match set {
  244. // Nobody has attempted start() yet.
  245. Ok(Self::UNSEEN_TERM) => true,
  246. Ok(_) => panic!(
  247. "compare_exchange should always return the current value 0"
  248. ),
  249. // Somebody has attempted, or is attempting, start().
  250. Err(prev_term) => {
  251. let start =
  252. prev_term != Self::ATTEMPTING_TERM && prev_term < hold_term;
  253. if start {
  254. let set = result_holder.term.compare_exchange(
  255. prev_term,
  256. Self::ATTEMPTING_TERM,
  257. Ordering::SeqCst,
  258. Ordering::SeqCst,
  259. );
  260. set.is_ok()
  261. } else {
  262. false
  263. }
  264. }
  265. };
  266. if start {
  267. let op = UniqueKVOp {
  268. op,
  269. me: self.me(),
  270. unique_id,
  271. };
  272. let start = self.rf.lock().start(op);
  273. let start_term =
  274. start.map_or(Self::UNSEEN_TERM, |(Term(term), _)| {
  275. Self::validate_term(term);
  276. term
  277. });
  278. let set = result_holder.term.compare_exchange(
  279. Self::ATTEMPTING_TERM,
  280. start_term,
  281. Ordering::SeqCst,
  282. Ordering::SeqCst,
  283. );
  284. // Setting term must have been successful, and must return the
  285. // value previously set by this attempt.
  286. assert_eq!(set, Ok(Self::ATTEMPTING_TERM));
  287. if start_term == Self::UNSEEN_TERM {
  288. result_holder.condvar.notify_all();
  289. return Err(CommitError::NotLeader);
  290. }
  291. }
  292. let mut guard = result_holder.result.lock();
  293. // Wait for the op to be committed.
  294. result_holder.condvar.wait_for(&mut guard, timeout);
  295. // Copy the result out.
  296. let result = guard.clone();
  297. // If the result is OK, all other requests should see "Duplicate".
  298. if let Ok(result) = guard.clone() {
  299. *guard = Err(CommitError::Duplicate(result))
  300. }
  301. return result;
  302. }
  303. fn validate_term(term: usize) {
  304. assert!(term > Self::UNSEEN_TERM, "Term must be larger than 0.");
  305. assert!(
  306. term < Self::ATTEMPTING_TERM,
  307. "Term must be smaller than usize::MAX."
  308. );
  309. }
  310. const DEFAULT_TIMEOUT: Duration = Duration::from_secs(1);
  311. pub fn get(&self, args: GetArgs) -> GetReply {
  312. let (is_retry, result) = match self.block_for_commit(
  313. args.unique_id,
  314. KVOp::Get(args.key),
  315. Self::DEFAULT_TIMEOUT,
  316. ) {
  317. Ok(result) => (false, result),
  318. Err(CommitError::Duplicate(result)) => (true, result),
  319. Err(CommitError::NotMe(result)) => (true, result),
  320. Err(e) => {
  321. return GetReply {
  322. result: Err(e.into()),
  323. is_retry: false,
  324. }
  325. }
  326. };
  327. let result = match result {
  328. CommitResult::Get(result) => Ok(result),
  329. CommitResult::Put => Err(KVError::Conflict),
  330. CommitResult::Append => Err(KVError::Conflict),
  331. };
  332. GetReply { result, is_retry }
  333. }
  334. pub fn put_append(&self, args: PutAppendArgs) -> PutAppendReply {
  335. let op = match args.op {
  336. PutAppendEnum::Put => KVOp::Put(args.key, args.value),
  337. PutAppendEnum::Append => KVOp::Append(args.key, args.value),
  338. };
  339. let result = match self.block_for_commit(
  340. args.unique_id,
  341. op,
  342. Self::DEFAULT_TIMEOUT,
  343. ) {
  344. Ok(result) => result,
  345. Err(CommitError::Duplicate(result)) => result,
  346. Err(CommitError::NotMe(result)) => result,
  347. Err(e) => {
  348. return PutAppendReply {
  349. result: Err(e.into()),
  350. }
  351. }
  352. };
  353. let result = match result {
  354. CommitResult::Put => {
  355. if args.op == PutAppendEnum::Put {
  356. Ok(())
  357. } else {
  358. Err(KVError::Conflict)
  359. }
  360. }
  361. CommitResult::Append => {
  362. if args.op == PutAppendEnum::Append {
  363. Ok(())
  364. } else {
  365. Err(KVError::Conflict)
  366. }
  367. }
  368. CommitResult::Get(_) => Err(KVError::Conflict),
  369. };
  370. PutAppendReply { result }
  371. }
  372. pub fn me(&self) -> usize {
  373. self.me.load(Ordering::Relaxed)
  374. }
  375. pub fn raft(&self) -> Raft<UniqueKVOp> {
  376. self.rf.lock().clone()
  377. }
  378. pub fn kill(self: Arc<Self>) {
  379. let rf = self.raft();
  380. // We must drop self to remove the only clone of raft, so that
  381. // `rf.kill()` does not block.
  382. drop(self);
  383. rf.kill();
  384. // The process_command thread will exit, after Raft drops the reference
  385. // to the sender.
  386. }
  387. }