From bd3fdb266faba63d1157d975bdd2a68de54b0b0d Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Thu, 1 Feb 2024 03:02:12 +0100 Subject: [PATCH 01/43] First attempt at storage engine --- Cargo.lock | 10 + Cargo.toml | 3 +- storage_engine/Cargo.toml | 10 + storage_engine/src/lib.rs | 2 + storage_engine/src/main.rs | 54 ++++ storage_engine/src/storage_engine.rs | 436 +++++++++++++++++++++++++++ 6 files changed, 514 insertions(+), 1 deletion(-) create mode 100644 storage_engine/Cargo.toml create mode 100644 storage_engine/src/lib.rs create mode 100644 storage_engine/src/main.rs create mode 100644 storage_engine/src/storage_engine.rs diff --git a/Cargo.lock b/Cargo.lock index 5ea9fdc..2944fae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -599,6 +599,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "storage_engine" +version = "0.1.0" +dependencies = [ + "async-trait", + "bincode", + "thiserror", + "tokio", +] + [[package]] name = "strsim" version = "0.10.0" diff --git a/Cargo.toml b/Cargo.toml index 57d0219..3e6cda3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,5 +5,6 @@ members = [ "proto", "server", "client", - "parser" + "parser", + "storage_engine" ] diff --git a/storage_engine/Cargo.toml b/storage_engine/Cargo.toml new file mode 100644 index 0000000..6658764 --- /dev/null +++ b/storage_engine/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "storage_engine" +version = "0.1.0" +edition = "2021" + +[dependencies] +bincode = "2.0.0-rc.3" +tokio = { version = "1.34.0", features = ["full"] } +async-trait = "0.1.74" +thiserror = "1.0.50" diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs new file mode 100644 index 0000000..0e2eb79 --- /dev/null +++ b/storage_engine/src/lib.rs @@ -0,0 +1,2 @@ + +pub mod storage_engine; diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs new file mode 100644 index 0000000..fffde93 --- /dev/null +++ b/storage_engine/src/main.rs @@ -0,0 +1,54 @@ +use tokio::sync::{Mutex, RwLock}; +use tokio::fs::{File, OpenOptions}; +use tokio::io::{BufReader, BufWriter, AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; +use tokio::fs; + +mod storage_engine; + +use crate::storage_engine::*; + +#[tokio::main] +async fn main() -> Result<(), std::io::Error> { + println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); + + let blob_name = "blob10.minisql"; + + // WARNING: Number of columns is 5????? 
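+    // (The 5 passed to Store::new below is that column count; it matches the
+    // width of the example rows appended in this file, e.g. vec![1, 2, 3, 4, 5].)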
+ + let mut store = Store::new(blob_name, 5).await.map_err(|e| e.to_io_or_panic())?; + + // let store_bytes = store.get_all_bytes().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", store_bytes); + + let mut buff: Vec = vec![0;1]; + let x = store.file.read_exact(&mut buff[..]).await?; + println!("{:?}", buff); + + + // let entry0: Entry = Entry::new_deleted(vec![1, 2, 3, 4, 5]); + // let entry1: Entry = Entry::new_deleted(vec![200,200,5,6,7]); + // let cursor0 = store.append_entry(&entry0).await.map_err(|e| e.to_io_or_panic())?; + // // println!("cursor0 = {}", cursor0); + + // let cursor1 = store.append_entry(&entry1).await.map_err(|e| e.to_io_or_panic())?; + // println!("cursor0 = {}, cursor1 = {}", cursor0, cursor1); + + // let mut store = Store::connect(blob_name).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", store); + + // let x = store.entry_at::(16).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + + // let store_bytes = store.get_all_bytes().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", store_bytes); + + // let mut store = ColumnStore::connect("blob08.minisql").await.map_err(|e| e.to_io_or_panic())?; + // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); + // let cursor2 = store.append_entry(&entry2).await.map_err(|e| e.to_io_or_panic())?; + // println!("cursor2 = {}", cursor2); + // println!("{:?}", store); + + + println!("DONE"); + Ok(()) +} diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs new file mode 100644 index 0000000..802444e --- /dev/null +++ b/storage_engine/src/storage_engine.rs @@ -0,0 +1,436 @@ +use tokio::io::{BufReader, BufWriter, AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; +use tokio::sync::{Mutex, RwLock}; +use tokio::fs::{File, OpenOptions}; +use tokio::fs; + +use bincode; +use bincode::de::Decoder; +use bincode::enc::write::Writer; +use bincode::enc::Encoder; +use bincode::{BorrowDecode, Decode, Encode}; +use bincode::config::{BigEndian, Configuration, Fixint}; + +use std::mem::size_of; + + +// =============Byte encoding/decoding============ +const BIN_CONFIG: Configuration = bincode::config::standard().with_big_endian().with_fixed_int_encoding(); + +fn encode(t: &T) -> Result, bincode::error::EncodeError> { + bincode::encode_to_vec(t, BIN_CONFIG) +} + +fn decode(bytes: &[u8]) -> Result<(T, usize), bincode::error::DecodeError> { + bincode::decode_from_slice(bytes, BIN_CONFIG) +} + +fn encode_vector(ts: &[T]) -> Result, bincode::error::EncodeError> { + let size: usize = ts.len(); + let mut result = encode(&size)?; + for t in ts { + result.append(&mut encode(&t)?); + } + Ok(result) +} + +fn decode_vector(bytes: &[u8]) -> Result, bincode::error::DecodeError> { + let mut offset = size_of::(); + let result_len: usize = decode(&bytes[..offset])?.0; + let mut result: Vec = Vec::with_capacity(result_len); + for _ in 0..result_len { + let (x, bytes_consumed) = decode::(&bytes[offset..])?; + offset += bytes_consumed; + result.push(x); + } + Ok(result) +} + +// We don't care about encoding the length here (since it will be used for a row with known column +// size) +fn encode_sequence(ts: &[T]) -> Result, bincode::error::EncodeError> { + let mut result = vec![]; + for t in ts { + result.append(&mut encode(&t)?); + + } + Ok(result) +} + +fn decode_sequence(len: usize, bytes: &[u8]) -> Result, bincode::error::DecodeError> { + let mut result: Vec = Vec::with_capacity(len); + let mut offset = 0; + for _ in 0..len { + let (x, bytes_consumed) = 
decode::(&bytes[offset..])?; + offset += bytes_consumed; + result.push(x); + } + Ok(result) +} + + +fn example_encoding_decoding() { + let xs: Vec = vec![123, 250, 256, 123, 123, 123]; + let xs: Vec = vec![]; + let xs: Vec = vec![123]; + let xs: Vec = vec![123, 250]; + + let xs: Vec = vec!["foo".to_string(), "bar".to_string()]; + + + println!("original {:?}", xs); + let exs = encode_vector(&xs[..]).unwrap(); + println!("encoded {:?}", exs); + + // WARNING: Don't forget to specify the type here + // let dxs = decode_vector::(&exs[..]).unwrap(); + let dxs = decode_vector::(&exs[..]).unwrap(); + println!("decoded {:?}", dxs); +} + +// ============Column Store=============== + +// ColumnStore +#[derive(Debug)] +pub struct Store { + column_file_name: String, + // TODO: This should be private + pub file: File, + header: StoreHeader + // meta + // location of rows file + // locations of index files + // + // rows file + // list +} + +// These type aliases are here because they make writing decoders easier. +type NumberOfColumns = usize; +type DeletedCount = usize; +#[derive(Debug)] +pub struct StoreHeader { + number_of_columns: NumberOfColumns, + deleted_count: DeletedCount, +} + +#[derive(Debug)] +pub struct Entry { + is_deleted: bool, + // file_position: FilePosition, + data: Vec, +} + +#[derive(Debug)] +pub enum Error { + DecodeError(DecodeErrorKind, bincode::error::DecodeError), + EncodeError(bincode::error::EncodeError), + IoError(std::io::Error), + InvalidStoreHeader, +} + +#[derive(Debug)] +pub enum DecodeErrorKind { + StoreHeaderNumberOfColumns, + StoreHeaderDeletedCount, + EntryData, + EntryIsDeleted, + EntryDataSize +} + +// ===Errors=== +impl Error { + pub fn to_io_or_panic(self) -> std::io::Error { + use Error::*; + match self { + IoError(err) => err, + err => { + println!("{:?}", err); + panic!(); + } + } + } +} + +impl From for Error { + fn from(err: bincode::error::EncodeError) -> Self { + Self::EncodeError(err) + } +} + +impl From for Error { + fn from(err: std::io::Error) -> Self { + Self::IoError(err) + } +} + + +// ====Entry==== +impl Entry { + pub fn new(data: Vec) -> Self { + Self { is_deleted: false, data } + } + + pub fn new_deleted(data: Vec) -> Self { + Self { is_deleted: true, data } + } + + // FORMAT: [HEADER, ..sequence of data] + // HEADER: [Boolean (one byte), number of bytes in the data (not including the boolean)] + fn encode(self: &Entry) -> Result, Error> + where T: Encode + { + let mut result: Vec = encode(&self.is_deleted)?; // bool 1 byte + let mut encoded_data = encode_sequence(&self.data[..])?; + let encoded_data_len = encoded_data.len(); + result.append(&mut encode(&encoded_data_len)?); // usize 8 bytes + println!("enc data len == {}", encoded_data_len); + println!("encoded_data == {:?} ", encoded_data); + result.append(&mut encoded_data); // data variable size + Ok(result) + } + + // in bytes + pub fn header_size() -> usize { + size_of::() + size_of::() + } + + // TODO: Maybe introduce an EntryHeader as a separate type? + pub fn decode_header(header_bytes: Vec) -> Result<(bool, usize), Error> { + let (is_deleted, offset) = + decode::(&header_bytes) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; + + let (data_size, _) = + decode::(&header_bytes[offset..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryDataSize, e))?; + Ok((is_deleted, data_size)) + } +} + +pub type Column = u64; +pub type Cursor = u64; + +impl StoreHeader { + fn encode(&self) -> Result, Error> { + // FORMAT: First Number of Columns, Then Deleted Count. 
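+        // Each count is a usize, which BIN_CONFIG's fixed-int encoding writes as
+        // 8 bytes (big-endian), so the encoded header is:
+        //   [number_of_columns: 8 bytes][deleted_count: 8 bytes]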
+        let mut result = encode(&self.number_of_columns)?;
+        result.append(&mut encode(&self.deleted_count)?);
+        Ok(result)
+    }
+
+    async fn decode(file: &mut File) -> Result<StoreHeader, Error> {
+        let number_of_columns_size = size_of::<NumberOfColumns>();
+        let deleted_count_size = size_of::<DeletedCount>();
+        let header_size: usize = number_of_columns_size + deleted_count_size;
+        let mut header_bytes: Vec<u8> = vec![0; header_size];
+        // TODO: Why do we need to have mutable reference for the file when we are reading it?
+        match file.read_exact(&mut header_bytes).await {
+            Ok(_) => {
+                let offset = 0;
+                let (number_of_columns, offset) =
+                    decode::<NumberOfColumns>(&header_bytes[offset..offset + number_of_columns_size])
+                    .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?;
+                let (deleted_count, _) =
+                    decode::<DeletedCount>(&header_bytes[offset..offset + deleted_count_size])
+                    .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?;
+                let header = StoreHeader {
+                    number_of_columns,
+                    deleted_count,
+                };
+                Ok(header)
+            },
+            Err(err) => {
+                // TODO: When err is of the kind UnexpectedEof, return InvalidStoreHeader
+                println!("ARE WE HERE?");
+                Err(Error::from(err))
+            }
+        }
+    }
+}
+
+
+impl Store {
+    // For debugging
+    pub async fn get_all_bytes(mut self) -> Result<Vec<u8>, Error> {
+        let mut bytes: Vec<u8> = vec![];
+        use std::io::Read;
+        // for byte in self.file.bytes() {
+        // }
+        self.file.read_exact(&mut bytes[..]).await?;
+        Ok(bytes)
+    }
+
+    pub async fn new(column_file_name: &str, number_of_columns: usize) -> Result<Self, Error> {
+        let mut file =
+            OpenOptions::new()
+            .write(true)
+            .read(true)
+            .create_new(true)
+            .open(column_file_name)
+            .await?;
+
+        let header = StoreHeader {
+            number_of_columns,
+            deleted_count: 0,
+        };
+        let encoded_header = header.encode()?;
+        file.write(&encoded_header).await?;
+        println!("is something being encoded? {:?}", encoded_header);
+        let store = Self {
+            column_file_name: column_file_name.to_string(),
+            file,
+            header,
+        };
+        Ok(store)
+    }
+
+    pub async fn connect(column_file_name: &str) -> Result<Self, Error> {
+        let mut file = OpenOptions::new().read(true).write(true).open(column_file_name).await?;
+
+        let header = StoreHeader::decode(&mut file).await?;
+        Ok(Self {
+            column_file_name: column_file_name.to_string(),
+            file,
+            header
+        })
+    }
+
+    pub async fn entry_at<T: Decode>(&mut self, cursor: Cursor) -> Result<Entry<T>, Error> {
+        self.file.seek(SeekFrom::Start(cursor)).await?;
+
+        // 1. read header bytes (fixed number of bytes).
+        // 2. decode header
+        // 3. read entry data bytes.
+        // 4. decode data
+        // That will tell us how much data there is.
+        let entry_header_size = Entry::<T>::header_size();
+        let mut header_bytes: Vec<u8> = vec![0; entry_header_size];
+        self.file.read_exact(&mut header_bytes).await?;
+
+        println!("cursor == {}", cursor);
+        println!("header_bytes == {:?}", header_bytes);
+
+        let (is_deleted, data_size) = Entry::<T>::decode_header(header_bytes)?;
+
+        self.file.seek(SeekFrom::Current(entry_header_size as i64)).await?;
+        let mut data_bytes: Vec<u8> = vec![0; data_size];
+
+        println!("(is_deleted, data_size) = ({}, {})", is_deleted, data_size);
+        let data =
+            decode_sequence::<T>(self.header.number_of_columns, &mut data_bytes)
+            .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?;
+        Ok(Entry {
+            is_deleted,
+            data
+        })
+    }
+
+    pub async fn append_entry<T: Encode>(&mut self, entry: &Entry<T>) -> Result<Cursor, Error> {
+        // On linux when opening a file in append mode, the seek is set to 0
+        // and only updated after a write. That's why we do the cursor gymnastic at the end.
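+        // In other words: write first, then recover the entry's start position as
+        // (stream position after the write) - (number of bytes written).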
+        let encoded_entry: Vec<u8> = entry.encode()?;
+        println!("encoded_entry == {:?}", encoded_entry);
+        println!("bool size == {}", size_of::<bool>());
+        println!("usize size == {}", size_of::<usize>());
+        self.file.write(&encoded_entry).await?;
+        let next_cursor: Cursor = self.file.stream_position().await?;
+        let cursor: Cursor = next_cursor - encoded_entry.len() as u64;
+        Ok(cursor)
+    }
+
+    pub async fn iterate_all(&mut self) -> Result<(), Error> {
+        // Loop through the rows and print them out
+        todo!()
+    }
+}
+
+// impl StorageEngine for ColumnStore {
+//     async fn append(&mut self, id: Index, entry: Row) -> Result
+
+//     async fn get_all(&self) -> ???
+//     async fn get_eq(&self, column: Column, value: T) -> ???
+
+//     async fn delete_all(&mut self)
+//     async fn delete_eq(&mut self, column: Column, value: T) -> ???
+// }
+
+// struct Error {
+// }
+
+
+// Selected(
+//     &'a TableSchema,
+//     ColumnSelection,
+//     TODO: Don't do the Box(dyn Iterator<...>)
+//     you'll have a concrete implementation of Iterator, and that's what you'll use
+//     Box + 'a + Send>,
+// ),
+
+
+
+// #[async_trait]
+// trait StorageEngine
+// where T: Encode + Decode
+// {
+//     async fn append(&mut self, id: Index, entry: Row) -> Result
+
+//     async fn get_all(&self) -> ???
+//     async fn get_eq(&self, column: Column, value: T) -> ???
+
+//     async fn delete_all(&mut self)
+//     async fn delete_eq(&mut self, column: Column, value: T) -> ???
+// }
+
+// #[cfg(test)]
+// mod tests {
+//     #[test]
+//     fn hello_test() {
+//         assert!(true);
+//     }
+// }
+
+// let storage_engine = StorageEngine::new("users")
+// let mut next_position = 0
+
+
+// type FilePosition = usize;
+
+
+// type StoreFile = Vec;
+// type IndexFile = ???
+
+// struct IndexEntry {
+
+// }
+
+
+// #00000 [false, u26, "Arnold", "schwarzenegger", "gettothechoppa@yahoo.com"] #5120000 [true, u27, "Arnold", "Vosloo", "avosloo@aol.com"]
+// #00000 [true, u27, "Arnold", "Vosloo", "avosloo@aol.com"]
+
+
+// at #00000 512 kb deleted,
+// ...
+ + + + +// [(u26, [#00000]), (u27, [#5120000])] +// [("Arnold", [#000000, #5120000]), ("Arnfsdaf", []), ("Adasdsd", []), ("Bdsad", [])] +// // basically always keep indexes in memory and on write always sync on disk + + + + +// CREATE INDEX usersname ON "users" (name); + +// INSERT INTO users (id, name, surname, email) VALUES (u26, "Arnold", "schwarzenegger", "gettothechoppa@yahoo.com"); +// INSERT INTO users (id, name, surname, email) VALUES (u27, "Arnold", "Vosloo", "avosloo@aol.com"); + + +// SELECT * FROM users WHERE id=u26; + +// SELECT * FROM users WHERE name="Arnold"; + + +// SELECT * FROM cars; +// DELETE FROM users WHERE name="Arnold"; From cad4ba82155d89632f2d43b1e7079b2e899ef8d7 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Fri, 2 Feb 2024 13:56:37 +0100 Subject: [PATCH 02/43] Print first n entries --- storage_engine/src/binary_coding.rs | 78 ++++ storage_engine/src/error.rs | 44 ++ storage_engine/src/lib.rs | 3 +- storage_engine/src/main.rs | 94 +++-- storage_engine/src/storage_engine.rs | 578 ++++++++++++++------------- 5 files changed, 481 insertions(+), 316 deletions(-) create mode 100644 storage_engine/src/binary_coding.rs create mode 100644 storage_engine/src/error.rs diff --git a/storage_engine/src/binary_coding.rs b/storage_engine/src/binary_coding.rs new file mode 100644 index 0000000..b6fe132 --- /dev/null +++ b/storage_engine/src/binary_coding.rs @@ -0,0 +1,78 @@ +use bincode; +use bincode::{Decode, Encode}; +use bincode::config::{BigEndian, Configuration, Fixint}; +use std::mem::size_of; + +const BIN_CONFIG: Configuration = bincode::config::standard().with_big_endian().with_fixed_int_encoding(); + +pub fn encode(t: &T) -> Result, bincode::error::EncodeError> { + bincode::encode_to_vec(t, BIN_CONFIG) +} + +pub fn decode(bytes: &[u8]) -> Result<(T, usize), bincode::error::DecodeError> { + bincode::decode_from_slice(bytes, BIN_CONFIG) +} + +pub fn encode_vector(ts: &[T]) -> Result, bincode::error::EncodeError> { + let size: usize = ts.len(); + let mut result = encode(&size)?; + for t in ts { + result.append(&mut encode(&t)?); + } + Ok(result) +} + +pub fn decode_vector(bytes: &[u8]) -> Result, bincode::error::DecodeError> { + let mut offset = size_of::(); + let result_len: usize = decode(&bytes[..offset])?.0; + let mut result: Vec = Vec::with_capacity(result_len); + for _ in 0..result_len { + let (x, bytes_consumed) = decode::(&bytes[offset..])?; + offset += bytes_consumed; + result.push(x); + } + Ok(result) +} + +// We don't care about encoding the length here (since it will be used for a row with known column +// size) +pub fn encode_sequence(ts: &[T]) -> Result, bincode::error::EncodeError> { + let mut result = vec![]; + for t in ts { + result.append(&mut encode(&t)?); + + } + Ok(result) +} + +pub fn decode_sequence(len: usize, bytes: &[u8]) -> Result, bincode::error::DecodeError> { + let mut result: Vec = Vec::with_capacity(len); + let mut offset = 0; + for _ in 0..len { + let (x, bytes_consumed) = decode::(&bytes[offset..])?; + offset += bytes_consumed; + result.push(x); + } + Ok(result) +} + + +fn example_encoding_decoding() { + let xs: Vec = vec![123, 250, 256, 123, 123, 123]; + let xs: Vec = vec![]; + let xs: Vec = vec![123]; + let xs: Vec = vec![123, 250]; + + let xs: Vec = vec!["foo".to_string(), "bar".to_string()]; + + + println!("original {:?}", xs); + let exs = encode_vector(&xs[..]).unwrap(); + println!("encoded {:?}", exs); + + // WARNING: Don't forget to specify the type here + // let dxs = 
decode_vector::(&exs[..]).unwrap(); + let dxs = decode_vector::(&exs[..]).unwrap(); + println!("decoded {:?}", dxs); +} + diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs new file mode 100644 index 0000000..7c213f3 --- /dev/null +++ b/storage_engine/src/error.rs @@ -0,0 +1,44 @@ +#[derive(Debug)] +pub enum Error { + DecodeError(DecodeErrorKind, bincode::error::DecodeError), + EncodeError(bincode::error::EncodeError), + IoError(std::io::Error), + InvalidStoreHeader, +} + +#[derive(Debug)] +pub enum DecodeErrorKind { + StoreHeaderNumberOfColumns, + StoreHeaderDeletedCount, + EntryData, + EntryIsDeleted, + EntryDataSize +} + +// ===Errors=== +impl Error { + pub fn to_io_or_panic(self) -> std::io::Error { + use Error::*; + match self { + IoError(err) => err, + err => { + println!("{:?}", err); + panic!(); + } + } + } +} + +impl From for Error { + fn from(err: bincode::error::EncodeError) -> Self { + Self::EncodeError(err) + } +} + +impl From for Error { + fn from(err: std::io::Error) -> Self { + Self::IoError(err) + } +} + + diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs index 0e2eb79..7ce85ce 100644 --- a/storage_engine/src/lib.rs +++ b/storage_engine/src/lib.rs @@ -1,2 +1,3 @@ - pub mod storage_engine; +mod binary_coding; +mod error; diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index fffde93..ac6fa88 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -2,47 +2,85 @@ use tokio::sync::{Mutex, RwLock}; use tokio::fs::{File, OpenOptions}; use tokio::io::{BufReader, BufWriter, AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; use tokio::fs; +use std::path::Path; mod storage_engine; +mod binary_coding; +mod error; use crate::storage_engine::*; +type Data = u32; + +const TABLE_PATH: &'static str = "test_table"; + +type Result = std::result::Result; + +async fn create_store() -> Result> { + let mut store: Store = Store::new(TABLE_PATH, 5).await.map_err(|e| e.to_io_or_panic())?; + println!("CREATED"); + println!("{:?}", store.read_all_bytes().await?); + + let entry0: Entry = Entry::new_deleted(vec![1, 2, 3, 4, 5]); + append_entry(&mut store, &entry0).await?; + + let entry1: Entry = Entry::new_deleted(vec![200, 200, 5, 6, 7]); + append_entry(&mut store, &entry1).await?; + + println!("{:?}", store.read_all_bytes().await?); + Ok(store) +} +async fn connect_store() -> Result> { + let mut store: Store = Store::connect(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; + println!("CONNECTED"); + println!("{:?}", store.read_all_bytes().await?); + Ok(store) +} + +async fn create_or_connect() -> Result> { + let exists = storage_engine::store_exists(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; + if exists { + connect_store().await + } else { + create_store().await + } +} + + +async fn append_entry(store: &mut Store, entry: &Entry) -> Result{ + println!("APPENDING"); + println!("entry == {:?}", entry); + let cursor: Cursor = store.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; + println!("cursor == {:?}", cursor); + Ok(cursor) +} + +async fn read_entry(store: &mut Store, cursor: Cursor) -> Result>{ + println!("READING ENTRY at cursor={}", cursor); + let entry = store.read_entry_at(cursor).await.map_err(|e| e.to_io_or_panic())?; + println!("ENTRY: {:?}", entry); + Ok(entry) +} + + #[tokio::main] -async fn main() -> Result<(), std::io::Error> { +async fn main() -> Result<()> { println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); - let blob_name = "blob10.minisql"; + let mut store: Store = 
create_or_connect().await?; - // WARNING: Number of columns is 5????? - - let mut store = Store::new(blob_name, 5).await.map_err(|e| e.to_io_or_panic())?; - - // let store_bytes = store.get_all_bytes().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", store_bytes); - - let mut buff: Vec = vec![0;1]; - let x = store.file.read_exact(&mut buff[..]).await?; - println!("{:?}", buff); - - - // let entry0: Entry = Entry::new_deleted(vec![1, 2, 3, 4, 5]); - // let entry1: Entry = Entry::new_deleted(vec![200,200,5,6,7]); - // let cursor0 = store.append_entry(&entry0).await.map_err(|e| e.to_io_or_panic())?; - // // println!("cursor0 = {}", cursor0); - - // let cursor1 = store.append_entry(&entry1).await.map_err(|e| e.to_io_or_panic())?; - // println!("cursor0 = {}, cursor1 = {}", cursor0, cursor1); - - // let mut store = Store::connect(blob_name).await.map_err(|e| e.to_io_or_panic())?; + // let entry0 = read_entry(&mut store, 16).await?; + // let entry1 = read_entry(&mut store, 45).await?; // println!("{:?}", store); + // println!("{:?}", store.read_all_bytes().await?); + + // let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); + // append_entry(&mut store, &entry0).await?; + + store.read_entries(4).await.map_err(|e| e.to_io_or_panic())?; - // let x = store.entry_at::(16).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let store_bytes = store.get_all_bytes().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", store_bytes); - // let mut store = ColumnStore::connect("blob08.minisql").await.map_err(|e| e.to_io_or_panic())?; // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); // let cursor2 = store.append_entry(&entry2).await.map_err(|e| e.to_io_or_panic())?; // println!("cursor2 = {}", cursor2); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 802444e..d947d14 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -1,101 +1,29 @@ -use tokio::io::{BufReader, BufWriter, AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; -use tokio::sync::{Mutex, RwLock}; -use tokio::fs::{File, OpenOptions}; -use tokio::fs; +use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; +use tokio::fs::{File, OpenOptions, DirBuilder}; +use std::path::Path; +use std::marker::PhantomData; use bincode; -use bincode::de::Decoder; -use bincode::enc::write::Writer; -use bincode::enc::Encoder; -use bincode::{BorrowDecode, Decode, Encode}; -use bincode::config::{BigEndian, Configuration, Fixint}; +use bincode::{Decode, Encode}; +use crate::binary_coding::{encode, decode, encode_sequence, decode_sequence}; +use tokio::fs; + +use crate::error::{Error, DecodeErrorKind}; use std::mem::size_of; +type Result = std::result::Result; -// =============Byte encoding/decoding============ -const BIN_CONFIG: Configuration = bincode::config::standard().with_big_endian().with_fixed_int_encoding(); +pub type Column = u64; +pub type Cursor = u64; -fn encode(t: &T) -> Result, bincode::error::EncodeError> { - bincode::encode_to_vec(t, BIN_CONFIG) -} - -fn decode(bytes: &[u8]) -> Result<(T, usize), bincode::error::DecodeError> { - bincode::decode_from_slice(bytes, BIN_CONFIG) -} - -fn encode_vector(ts: &[T]) -> Result, bincode::error::EncodeError> { - let size: usize = ts.len(); - let mut result = encode(&size)?; - for t in ts { - result.append(&mut encode(&t)?); - } - Ok(result) -} - -fn decode_vector(bytes: &[u8]) -> Result, bincode::error::DecodeError> { - let mut offset = size_of::(); 
- let result_len: usize = decode(&bytes[..offset])?.0; - let mut result: Vec = Vec::with_capacity(result_len); - for _ in 0..result_len { - let (x, bytes_consumed) = decode::(&bytes[offset..])?; - offset += bytes_consumed; - result.push(x); - } - Ok(result) -} - -// We don't care about encoding the length here (since it will be used for a row with known column -// size) -fn encode_sequence(ts: &[T]) -> Result, bincode::error::EncodeError> { - let mut result = vec![]; - for t in ts { - result.append(&mut encode(&t)?); - - } - Ok(result) -} - -fn decode_sequence(len: usize, bytes: &[u8]) -> Result, bincode::error::DecodeError> { - let mut result: Vec = Vec::with_capacity(len); - let mut offset = 0; - for _ in 0..len { - let (x, bytes_consumed) = decode::(&bytes[offset..])?; - offset += bytes_consumed; - result.push(x); - } - Ok(result) -} - - -fn example_encoding_decoding() { - let xs: Vec = vec![123, 250, 256, 123, 123, 123]; - let xs: Vec = vec![]; - let xs: Vec = vec![123]; - let xs: Vec = vec![123, 250]; - - let xs: Vec = vec!["foo".to_string(), "bar".to_string()]; - - - println!("original {:?}", xs); - let exs = encode_vector(&xs[..]).unwrap(); - println!("encoded {:?}", exs); - - // WARNING: Don't forget to specify the type here - // let dxs = decode_vector::(&exs[..]).unwrap(); - let dxs = decode_vector::(&exs[..]).unwrap(); - println!("decoded {:?}", dxs); -} - -// ============Column Store=============== - -// ColumnStore +// TODO: Consider introducing a phantom type for the data that's used in the store. #[derive(Debug)] -pub struct Store { - column_file_name: String, - // TODO: This should be private - pub file: File, - header: StoreHeader +pub struct Store { + table_folder: String, + file: File, + header: StoreHeader, + data_type: PhantomData, // meta // location of rows file // locations of index files @@ -104,246 +32,322 @@ pub struct Store { // list } -// These type aliases are here because they make writing decoders easier. 
-type NumberOfColumns = usize; -type DeletedCount = usize; #[derive(Debug)] pub struct StoreHeader { - number_of_columns: NumberOfColumns, - deleted_count: DeletedCount, + number_of_columns: usize, + deleted_count: usize, +} +impl StoreHeader { + const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); + const DELETED_COUNT_SIZE: usize = size_of::(); + const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE; +} + +#[derive(Debug)] +pub struct EntryHeader { + is_deleted: bool, +} +impl EntryHeader { + const IS_DELETED_SIZE: usize = size_of::(); + const HEADER_SIZE: usize = Self::IS_DELETED_SIZE; +} + +#[derive(Debug)] +pub struct EntryHeaderWithDataSize { + is_deleted: bool, + data_size: usize, // in bytes +} +impl EntryHeaderWithDataSize { + const IS_DELETED_SIZE: usize = size_of::(); + const DATA_SIZE_SIZE: usize = size_of::(); + const SIZE: usize = Self::IS_DELETED_SIZE + Self::DATA_SIZE_SIZE; } #[derive(Debug)] pub struct Entry { - is_deleted: bool, - // file_position: FilePosition, + header: EntryHeader, data: Vec, } #[derive(Debug)] -pub enum Error { - DecodeError(DecodeErrorKind, bincode::error::DecodeError), - EncodeError(bincode::error::EncodeError), - IoError(std::io::Error), - InvalidStoreHeader, +pub struct EntryDetailed { + header: EntryHeaderWithDataSize, + data: Vec, } -#[derive(Debug)] -pub enum DecodeErrorKind { - StoreHeaderNumberOfColumns, - StoreHeaderDeletedCount, - EntryData, - EntryIsDeleted, - EntryDataSize +//===Store=== +pub async fn store_exists(table_folder: &str) -> Result { + Ok(fs::metadata(table_folder).await.is_ok()) } -// ===Errors=== -impl Error { - pub fn to_io_or_panic(self) -> std::io::Error { - use Error::*; - match self { - IoError(err) => err, - err => { - println!("{:?}", err); - panic!(); - } - } - } -} - -impl From for Error { - fn from(err: bincode::error::EncodeError) -> Self { - Self::EncodeError(err) - } -} - -impl From for Error { - fn from(err: std::io::Error) -> Self { - Self::IoError(err) - } -} - - -// ====Entry==== -impl Entry { - pub fn new(data: Vec) -> Self { - Self { is_deleted: false, data } +impl Store { + //===primitive file operations=== + // Moves the cursor right. + async fn write_bytes(&mut self, bytes: &[u8]) -> Result { + Ok(self.file.write(bytes).await?) } - pub fn new_deleted(data: Vec) -> Self { - Self { is_deleted: true, data } + // Moves the cursor right. + async fn read_bytes(&mut self, bytes: &mut [u8]) -> Result<()> { + self.file.read_exact(bytes).await?; + Ok(()) } - // FORMAT: [HEADER, ..sequence of data] - // HEADER: [Boolean (one byte), number of bytes in the data (not including the boolean)] - fn encode(self: &Entry) -> Result, Error> - where T: Encode - { - let mut result: Vec = encode(&self.is_deleted)?; // bool 1 byte - let mut encoded_data = encode_sequence(&self.data[..])?; - let encoded_data_len = encoded_data.len(); - result.append(&mut encode(&encoded_data_len)?); // usize 8 bytes - println!("enc data len == {}", encoded_data_len); - println!("encoded_data == {:?} ", encoded_data); - result.append(&mut encoded_data); // data variable size + // Moves the cursor right. + async fn get_bytes(&mut self, count: usize) -> Result> { + let mut result: Vec = Vec::with_capacity(count); + self.read_bytes(&mut result).await?; Ok(result) } - // in bytes - pub fn header_size() -> usize { - size_of::() + size_of::() + async fn seek_to(&mut self, cursor: Cursor) -> Result<()>{ + self.file.seek(SeekFrom::Start(cursor)).await?; + Ok(()) } - // TODO: Maybe introduce an EntryHeader as a separate type? 
- pub fn decode_header(header_bytes: Vec) -> Result<(bool, usize), Error> { - let (is_deleted, offset) = - decode::(&header_bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; + async fn seek_to_start(&mut self) -> Result<()> { + self.file.seek(SeekFrom::Start(0)).await?; + Ok(()) + } - let (data_size, _) = - decode::(&header_bytes[offset..]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryDataSize, e))?; - Ok((is_deleted, data_size)) + async fn seek_to_end(&mut self) -> Result<()> { + self.file.seek(SeekFrom::End(0)).await?; + Ok(()) + } + + async fn seek_to_start_of_data(&mut self) -> Result<()> { + self.seek_to(StoreHeader::SIZE as u64).await + } + + async fn current_cursor(&mut self) -> Result { + let next_cursor: Cursor = self.file.stream_position().await?; + Ok(next_cursor) + } + + // For debugging. + // Moves cursor to the end. + pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error>{ + let mut bytes: Vec = vec![]; + self.seek_to_start().await.map_err(|e| e.to_io_or_panic())?; + self.file.read_to_end(&mut bytes).await?; + Ok(bytes) + } + + const ROWS_FILE_NAME: &'static str = "rows"; + + // ===Creation=== + pub async fn new(table_folder: &str, number_of_columns: usize) -> Result { + let path_to_table = Path::new(table_folder); + let path_to_rows = path_to_table.join(Self::ROWS_FILE_NAME); + DirBuilder::new() + .create(path_to_table).await?; + + let file: File = + OpenOptions::new() + .write(true) + .read(true) + .create_new(true) + .open(path_to_rows) + .await?; + + let header = StoreHeader { + number_of_columns, + deleted_count: 0, + }; + let encoded_header: Vec = header.encode()?; + + let mut store = Self { + table_folder: table_folder.to_string(), + file, + header, + data_type: PhantomData::, + }; + store.write_bytes(&encoded_header).await?; + Ok(store) + } + + pub async fn connect(table_folder: &str) -> Result { + let path_to_table = Path::new(table_folder); + let path_to_rows = path_to_table.join(Self::ROWS_FILE_NAME); + + let mut file: File = + OpenOptions::new() + .read(true) + .write(true) + .open(path_to_rows) + .await?; + + // Unfortunately we can't yet use store.read_bytes, since it can't be created without the + // header. + let mut header_bytes = StoreHeader::decode_buffer(); + file.read_exact(&mut header_bytes).await?; + let header = StoreHeader::decode(&mut header_bytes).await?; + + Ok(Self { + table_folder: table_folder.to_string(), + file, + header, + data_type: PhantomData::, + }) + } + + // ===Append Entry=== + // Moves cursor to the end. + pub async fn append_entry(&mut self, entry: &Entry) -> Result + where T: Encode + { + let encoded_entry: Vec = entry.encode()?; + self.seek_to_end().await?; + let cursor: Cursor = self.current_cursor().await?; + self.write_bytes(&encoded_entry).await?; + Ok(cursor) + } + + + // ===Lookup=== + // WARNING: The cursor has to be at the start of an entry. Otherwise garbage data will be + // decoded as an entry. 
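+    // Expected on-disk layout at `cursor` (see Entry::encode):
+    //   [is_deleted: 1 byte][data_size: 8 bytes][data: data_size bytes]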
+ pub async fn read_entry_header_at(&mut self, cursor: Cursor) -> Result { + self.seek_to(cursor).await?; + self.file.seek(SeekFrom::Start(cursor)).await?; + + let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::SIZE]; + self.read_bytes(&mut header_bytes).await?; + let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..])?; + // TODO: Get rid of the println's + // println!("HEADER_BYTES: {:?}", header_bytes); + // println!("HEADER: {:?}", header); + + Ok(header) + } + + pub async fn read_entry_data(&mut self, header: &EntryHeaderWithDataSize) -> Result> { + let mut data_bytes: Vec = vec![0; header.data_size]; + // TODO: Get rid of the println's + // println!("HEADER_BYTES: {:?}", header_bytes); + // println!("PREPARED_DATA_BYTES: {:?}", data_bytes); + self.read_bytes(&mut data_bytes).await?; + todo!() + } + + pub async fn read_entry_at(&mut self, cursor: Cursor) -> Result> + where T: Decode + { + let header = self.read_entry_header_at(cursor).await?; + + let mut data_bytes: Vec = vec![0; header.data_size]; + // TODO: Get rid of the println's + // println!("PREPARED_DATA_BYTES: {:?}", data_bytes); + self.read_bytes(&mut data_bytes).await?; + // println!("DATA_BYTES: {:?}", data_bytes); + let entry: EntryDetailed = + EntryDetailed::decode(header, self.header.number_of_columns, &mut data_bytes)?; + + Ok(entry) + } + + pub async fn read_entries(&mut self, n: usize) -> Result<()> + where T: Decode + std::fmt::Debug + { + self.seek_to_start_of_data().await?; + let mut cursor: Cursor = self.current_cursor().await?; + for i in 0..n { + let entry = self.read_entry_at(cursor).await?; + println!("({}, {:?})", i, entry); + cursor = self.current_cursor().await?; + } + Ok(()) } } -pub type Column = u64; -pub type Cursor = u64; - +// ===Store Header=== impl StoreHeader { - fn encode(&self) -> Result, Error> { + fn encode(&self) -> Result> { // FORMAT: First Number of Columns, Then Deleted Count. let mut result = encode(&self.number_of_columns)?; result.append(&mut encode(&self.deleted_count)?); Ok(result) } - async fn decode(file: &mut File) -> Result { - let number_of_columns_size = size_of::(); - let deleted_count_size = size_of::(); - let header_size: usize = number_of_columns_size + deleted_count_size; - let mut header_bytes: Vec = vec![0; header_size]; - // TODO: Why do we need to have mutable reference for the file when we are reading it? 
- match file.read_exact(&mut header_bytes).await { - Ok(_) => { - let offset = 0; - let (number_of_columns, offset) = - decode::(&header_bytes[offset..offset + number_of_columns_size]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; - let (deleted_count, _) = - decode::(&header_bytes[offset..offset + deleted_count_size]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; - let header = StoreHeader { - number_of_columns, - deleted_count, - }; - Ok(header) - }, - Err(err) => { - // TODO: When err is of the kind UnexpectedEof, return InvalidStoreHeader - println!("ARE WE HERE?"); - Err(Error::from(err)) - } - } - } -} - - -impl Store { - // For debugging - pub async fn get_all_bytes(mut self) -> Result, Error>{ - let mut bytes: Vec = vec![]; - use std::io::Read; - // for byte in self.file.bytes() { - // } - self.file.read_exact(&mut bytes[..]).await?; - Ok(bytes) + fn decode_buffer() -> [u8; StoreHeader::SIZE] { + [0; StoreHeader::SIZE] } - pub async fn new(column_file_name: &str, number_of_columns: usize) -> Result { - let mut file = - OpenOptions::new() - .write(true) - .read(true) - .create_new(true) - .open(column_file_name) - .await?; - + async fn decode(result: &mut [u8]) -> Result { + let offset = 0; + let (number_of_columns, offset) = + decode::(&result[offset..offset + Self::NUMBER_OF_COLUMNS_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; + let (deleted_count, _) = + decode::(&result[offset..offset + Self::DELETED_COUNT_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; let header = StoreHeader { number_of_columns, - deleted_count: 0, - } ; - let encoded_header = header.encode()?; - file.write(&encoded_header).await?; - println!("is something being encoded? {:?}", encoded_header); - let store = Self { - column_file_name: column_file_name.to_string(), - file, - header, + deleted_count, }; - Ok(store) - } - pub async fn connect(column_file_name: &str) -> Result { - let mut file = OpenOptions::new().read(true).write(true).open(column_file_name).await?; - - let header = StoreHeader::decode(&mut file).await?; - Ok(Self { - column_file_name: column_file_name.to_string(), - file, - header - }) - } - - pub async fn entry_at(&mut self, cursor: Cursor) -> Result, Error> { - self.file.seek(SeekFrom::Start(cursor)).await?; - - // 1. read header bytes (fixed number of bytes). - // 2. decode header - // 3. read entry data bytes. - // 4. decode data - // That will tell us how much data there is. - let entry_header_size = Entry::::header_size(); - let mut header_bytes: Vec = vec![0; entry_header_size]; - self.file.read_exact(&mut header_bytes).await?; - - println!("cursor == {}", cursor); - println!("header_bytes == {:?}", header_bytes); - - let (is_deleted, data_size) = Entry::::decode_header(header_bytes)?; - - self.file.seek(SeekFrom::Current(entry_header_size as i64)).await?; - let mut data_bytes: Vec = vec![0; data_size]; - - println!("(is_delted, data_size) = ({}, {})", is_deleted, data_size); - let data = - decode_sequence::(self.header.number_of_columns, &mut data_bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; - Ok(Entry { - is_deleted, - data - }) - } - - pub async fn append_entry(&mut self, entry: &Entry) -> Result { - // On linux when opening a file in append mode, the seek is set to 0 - // and only updated after a write. That's why we do the cursor gymnastic at the end. 
- let encoded_entry: Vec = entry.encode()?; - println!("encoded_entry == {:?}", encoded_entry); - println!("bool size == {}", size_of::()); - println!("usize size == {}", size_of::()); - self.file.write(&encoded_entry).await?; - let next_cursor: Cursor = self.file.stream_position().await?; - let cursor: Cursor = next_cursor - encoded_entry.len() as u64; - Ok(cursor) - } - - pub async fn iterate_all(&mut self) -> Result { - // Loop through the rows and print them out - todo!() + Ok(header) } } +// ====Entry==== +impl EntryHeader { + fn encode(self: &EntryHeader) -> Result> { + let result: Vec = encode(&self.is_deleted)?; + Ok(result) + } +} + +impl EntryHeaderWithDataSize { + fn decode(bytes: &mut [u8]) -> Result { + let (is_deleted, offset) = + decode::(&bytes) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; + + let (data_size, _) = + decode::(&bytes[offset..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryDataSize, e))?; + Ok(Self { is_deleted, data_size} ) + } +} + +impl Entry { + pub fn new(data: Vec) -> Self { + Self { header: EntryHeader { is_deleted: false }, data } + } + + pub fn new_deleted(data: Vec) -> Self { + Self { header: EntryHeader { is_deleted: true}, data } + } + + // FORMAT: [HEADER, ..sequence of data] + // HEADER: [Boolean (one byte), number of bytes in the data (not including the boolean)] + fn encode(self: &Entry) -> Result> + where T: Encode + { + let mut result: Vec = self.header.encode()?; + let mut encoded_data = encode_sequence(&self.data[..])?; + let encoded_data_len: usize = encoded_data.len(); + result.append(&mut encode(&encoded_data_len)?); // usize 8 bytes + result.append(&mut encoded_data); // data variable size + Ok(result) + } + +} + +impl EntryDetailed { + fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result + where T: Decode + { + let data = decode_sequence::(number_of_columns, bytes) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; + Ok(EntryDetailed { header, data }) + } +} + + + + // impl StorageEngine for ColumnStore { // async fn append(&mut self, id: Index, entry: Row) -> Result From 2f23df1009390c7df12fb22ddd061131cea65441 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Fri, 2 Feb 2024 15:06:17 +0100 Subject: [PATCH 03/43] Introduce total_count --- storage_engine/src/error.rs | 1 + storage_engine/src/main.rs | 6 +-- storage_engine/src/storage_engine.rs | 62 +++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index 7c213f3..898be3a 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -10,6 +10,7 @@ pub enum Error { pub enum DecodeErrorKind { StoreHeaderNumberOfColumns, StoreHeaderDeletedCount, + StoreHeaderTotalCount, EntryData, EntryIsDeleted, EntryDataSize diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index ac6fa88..6518649 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -77,14 +77,14 @@ async fn main() -> Result<()> { // let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); // append_entry(&mut store, &entry0).await?; - store.read_entries(4).await.map_err(|e| e.to_io_or_panic())?; - + store.read_entries(3).await.map_err(|e| e.to_io_or_panic())?; // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); // let cursor2 = store.append_entry(&entry2).await.map_err(|e| e.to_io_or_panic())?; // println!("cursor2 = {}", 
cursor2); - // println!("{:?}", store); + + println!("{:?}", store); println!("DONE"); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index d947d14..9e12041 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -36,11 +36,17 @@ pub struct Store { pub struct StoreHeader { number_of_columns: usize, deleted_count: usize, + total_count: usize, } impl StoreHeader { const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); const DELETED_COUNT_SIZE: usize = size_of::(); - const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE; + const TOTAL_COUNT_SIZE: usize = size_of::(); + const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE; + + const NUMBER_OF_COLUMNS_OFFSET: usize = 0; + const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; + const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; } #[derive(Debug)] @@ -153,6 +159,7 @@ impl Store { let header = StoreHeader { number_of_columns, deleted_count: 0, + total_count: 0, }; let encoded_header: Vec = header.encode()?; @@ -192,17 +199,46 @@ impl Store { } // ===Append Entry=== + async fn increment_total_count(&mut self) -> Result<()> { + self.seek_to_start().await?; + self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; + let new_count = self.header.increment_total_count(); + self.write_bytes(&encode::(&new_count)?).await?; + Ok(()) + } + + async fn increment_deleted_count(&mut self) -> Result<()> { + self.seek_to_start().await?; + self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?; + let new_count = self.header.increment_deleted_count(); + self.write_bytes(&encode::(&new_count)?).await?; + Ok(()) + } + // Moves cursor to the end. pub async fn append_entry(&mut self, entry: &Entry) -> Result where T: Encode { + self.increment_total_count().await?; + let encoded_entry: Vec = entry.encode()?; self.seek_to_end().await?; let cursor: Cursor = self.current_cursor().await?; self.write_bytes(&encoded_entry).await?; + Ok(cursor) } + // ===Deletion=== + pub async fn mark_deleted_at(&mut self, cursor: Cursor) -> Result<()> { + self.increment_deleted_count().await?; + + self.seek_to(cursor).await?; + + // TODO: Now you need to mutate the entry itself + todo!() + } + // ===Lookup=== // WARNING: The cursor has to be at the start of an entry. Otherwise garbage data will be @@ -266,6 +302,7 @@ impl StoreHeader { // FORMAT: First Number of Columns, Then Deleted Count. 
let mut result = encode(&self.number_of_columns)?; result.append(&mut encode(&self.deleted_count)?); + result.append(&mut encode(&self.total_count)?); Ok(result) } @@ -274,20 +311,35 @@ impl StoreHeader { } async fn decode(result: &mut [u8]) -> Result { - let offset = 0; - let (number_of_columns, offset) = - decode::(&result[offset..offset + Self::NUMBER_OF_COLUMNS_SIZE]) + let (number_of_columns, _) = + decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; let (deleted_count, _) = - decode::(&result[offset..offset + Self::DELETED_COUNT_SIZE]) + decode::(&result[Self::DELETED_COUNT_OFFSET..Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; + let (total_count, _offset) = + decode::(&result[Self::TOTAL_COUNT_OFFSET..Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderTotalCount, e))?; let header = StoreHeader { number_of_columns, deleted_count, + total_count, }; Ok(header) } + + // returns new count + fn increment_total_count(&mut self) -> usize { + self.total_count += 1; + self.total_count + } + + // returns new count + fn increment_deleted_count(&mut self) -> usize { + self.deleted_count += 1; + self.deleted_count + } } // ====Entry==== From cac34d95e072e9f241a16c5f41db1ee98900ea48 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Fri, 2 Feb 2024 18:27:32 +0100 Subject: [PATCH 04/43] Change entry header. Expand api --- storage_engine/src/binary_coding.rs | 12 ++++ storage_engine/src/error.rs | 3 +- storage_engine/src/index.rs | 68 ++++++++++++++++++ storage_engine/src/lib.rs | 1 + storage_engine/src/main.rs | 5 +- storage_engine/src/storage_engine.rs | 100 +++++++++++++++++---------- 6 files changed, 150 insertions(+), 39 deletions(-) create mode 100644 storage_engine/src/index.rs diff --git a/storage_engine/src/binary_coding.rs b/storage_engine/src/binary_coding.rs index b6fe132..5e6378d 100644 --- a/storage_engine/src/binary_coding.rs +++ b/storage_engine/src/binary_coding.rs @@ -45,6 +45,18 @@ pub fn encode_sequence(ts: &[T]) -> Result, bincode::error::E Ok(result) } +pub fn encode_sequence_with_sizes(ts: &[T]) -> Result<(Vec, Vec), bincode::error::EncodeError> { + let mut result_bytes = vec![]; + let mut sizes = Vec::with_capacity(ts.len()); + for t in ts { + let mut bytes = encode(&t)?; + sizes.push(bytes.len()); + result_bytes.append(&mut bytes); + + } + Ok((result_bytes, sizes)) +} + pub fn decode_sequence(len: usize, bytes: &[u8]) -> Result, bincode::error::DecodeError> { let mut result: Vec = Vec::with_capacity(len); let mut offset = 0; diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index 898be3a..10a74ae 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -11,9 +11,10 @@ pub enum DecodeErrorKind { StoreHeaderNumberOfColumns, StoreHeaderDeletedCount, StoreHeaderTotalCount, + StoreHeaderPrimaryColumn, EntryData, EntryIsDeleted, - EntryDataSize + EntryHeaderWithDataSizes, } // ===Errors=== diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs new file mode 100644 index 0000000..b7e48ab --- /dev/null +++ b/storage_engine/src/index.rs @@ -0,0 +1,68 @@ +use std::marker::PhantomData; +use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; +use tokio::fs::{File, OpenOptions, 
DirBuilder}; +use std::path::Path; + +use std::collections::{BTreeMap}; + +use bincode; +use bincode::{Decode, Encode}; +use crate::binary_coding::{encode, decode, encode_sequence, decode_sequence}; +use tokio::fs; + +use crate::error::{Error, DecodeErrorKind}; + +use std::mem::size_of; + +type Result = std::result::Result; + +// Implements a persistant self-balancing Binary Search Tree. Nope. +// We need fixed-size nodes. But we want to index Strings which are variable length. + +pub struct Index { + file: File, + // None means index is asleep on disk. + in_memory: Option>, + header: IndexHeader, + key_type: PhantomData, + value_type: PhantomData, +} + +pub struct IndexHeader { +} + +impl Index { + pub async fn new(file_name: &str) -> Result> { + todo!() + } + + pub async fn connect(file_name: &str) -> Result> { + todo!() + } + + // Saves the in-memory index to disk and deallocates. + pub async fn sleep() -> Result> { + todo!() + } + + // Loads the index into memory + pub async fn wake() -> Result> { + todo!() + } + + pub async fn insert() -> Result<()> + where I: Encode, V: Encode + { + todo!() + } + + pub async fn lookup(&mut self, k: I) -> Result> + where I: Encode + Decode, + { + todo!() + } + + pub async fn delete(&mut self, k: I) -> Result> { + todo!() + } +} diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs index 7ce85ce..192f3db 100644 --- a/storage_engine/src/lib.rs +++ b/storage_engine/src/lib.rs @@ -1,3 +1,4 @@ pub mod storage_engine; mod binary_coding; mod error; +mod index; diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 6518649..0bdd82c 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -7,6 +7,7 @@ use std::path::Path; mod storage_engine; mod binary_coding; mod error; +mod index; use crate::storage_engine::*; @@ -17,7 +18,7 @@ const TABLE_PATH: &'static str = "test_table"; type Result = std::result::Result; async fn create_store() -> Result> { - let mut store: Store = Store::new(TABLE_PATH, 5).await.map_err(|e| e.to_io_or_panic())?; + let mut store: Store = Store::new(TABLE_PATH, 5, 0).await.map_err(|e| e.to_io_or_panic())?; println!("CREATED"); println!("{:?}", store.read_all_bytes().await?); @@ -77,7 +78,7 @@ async fn main() -> Result<()> { // let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); // append_entry(&mut store, &entry0).await?; - store.read_entries(3).await.map_err(|e| e.to_io_or_panic())?; + store.read_entries(2).await.map_err(|e| e.to_io_or_panic())?; // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 9e12041..2555939 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -5,7 +5,7 @@ use std::marker::PhantomData; use bincode; use bincode::{Decode, Encode}; -use crate::binary_coding::{encode, decode, encode_sequence, decode_sequence}; +use crate::binary_coding::{encode, decode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; use tokio::fs; use crate::error::{Error, DecodeErrorKind}; @@ -37,16 +37,19 @@ pub struct StoreHeader { number_of_columns: usize, deleted_count: usize, total_count: usize, + primary_column: Column, } impl StoreHeader { const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); const DELETED_COUNT_SIZE: usize = size_of::(); const TOTAL_COUNT_SIZE: usize = size_of::(); - const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE; + const PRIMARY_COLUMN_SIZE: usize = 
size_of::(); + const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; const NUMBER_OF_COLUMNS_OFFSET: usize = 0; const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; + const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; } #[derive(Debug)] @@ -61,12 +64,22 @@ impl EntryHeader { #[derive(Debug)] pub struct EntryHeaderWithDataSize { is_deleted: bool, - data_size: usize, // in bytes + data_sizes: Vec, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6 + // bytes etc } impl EntryHeaderWithDataSize { + const IS_DELETED_OFFSET: usize = 0; const IS_DELETED_SIZE: usize = size_of::(); - const DATA_SIZE_SIZE: usize = size_of::(); - const SIZE: usize = Self::IS_DELETED_SIZE + Self::DATA_SIZE_SIZE; + const DATA_SIZES_OFFSET: usize = Self::IS_DELETED_OFFSET + Self::IS_DELETED_SIZE; + + fn size(number_of_columns: usize) -> usize { + let size_of_data_sizes: usize = number_of_columns*size_of::(); + Self::IS_DELETED_SIZE + size_of_data_sizes + } + + fn size_of_data(&self) -> usize{ + self.data_sizes.iter().sum() + } } #[derive(Debug)] @@ -142,7 +155,7 @@ impl Store { const ROWS_FILE_NAME: &'static str = "rows"; // ===Creation=== - pub async fn new(table_folder: &str, number_of_columns: usize) -> Result { + pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { let path_to_table = Path::new(table_folder); let path_to_rows = path_to_table.join(Self::ROWS_FILE_NAME); DirBuilder::new() @@ -160,9 +173,12 @@ impl Store { number_of_columns, deleted_count: 0, total_count: 0, + primary_column, }; let encoded_header: Vec = header.encode()?; + println!("ENCODED_HEADER: {:?}", encoded_header); + let mut store = Self { table_folder: table_folder.to_string(), file, @@ -173,7 +189,9 @@ impl Store { Ok(store) } - pub async fn connect(table_folder: &str) -> Result { + pub async fn connect(table_folder: &str) -> Result + where T: std::fmt::Debug + { let path_to_table = Path::new(table_folder); let path_to_rows = path_to_table.join(Self::ROWS_FILE_NAME); @@ -190,12 +208,14 @@ impl Store { file.read_exact(&mut header_bytes).await?; let header = StoreHeader::decode(&mut header_bytes).await?; - Ok(Self { + let store = Self { table_folder: table_folder.to_string(), file, header, data_type: PhantomData::, - }) + }; + println!("just connected TOOOOO {:?}", store); + Ok(store) } // ===Append Entry=== @@ -247,9 +267,10 @@ impl Store { self.seek_to(cursor).await?; self.file.seek(SeekFrom::Start(cursor)).await?; - let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::SIZE]; + let number_of_columns: usize = self.header.number_of_columns; + let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; self.read_bytes(&mut header_bytes).await?; - let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..])?; + let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..], number_of_columns)?; // TODO: Get rid of the println's // println!("HEADER_BYTES: {:?}", header_bytes); // println!("HEADER: {:?}", header); @@ -257,21 +278,12 @@ impl Store { Ok(header) } - pub async fn read_entry_data(&mut self, header: &EntryHeaderWithDataSize) -> Result> { - let mut data_bytes: Vec = vec![0; header.data_size]; - // TODO: Get rid of the println's - // println!("HEADER_BYTES: {:?}", header_bytes); - // 
println!("PREPARED_DATA_BYTES: {:?}", data_bytes); - self.read_bytes(&mut data_bytes).await?; - todo!() - } - pub async fn read_entry_at(&mut self, cursor: Cursor) -> Result> where T: Decode { let header = self.read_entry_header_at(cursor).await?; - let mut data_bytes: Vec = vec![0; header.data_size]; + let mut data_bytes: Vec = vec![0; header.size_of_data()]; // TODO: Get rid of the println's // println!("PREPARED_DATA_BYTES: {:?}", data_bytes); self.read_bytes(&mut data_bytes).await?; @@ -294,15 +306,29 @@ impl Store { } Ok(()) } + + pub async fn search_for_entry_with_id(&mut self, id: T) -> Result>> { + // TODO: make call to the primary index + todo!() + } + + // TODO: This needs to be some sort of an iterator + pub async fn get_eq(&self, column: Column, value: T) -> Result>> { + todo!() + } + + pub async fn garbage_collect(&mut self) -> Result<()> { + todo!() + } } // ===Store Header=== impl StoreHeader { fn encode(&self) -> Result> { - // FORMAT: First Number of Columns, Then Deleted Count. let mut result = encode(&self.number_of_columns)?; result.append(&mut encode(&self.deleted_count)?); result.append(&mut encode(&self.total_count)?); + result.append(&mut encode(&self.primary_column)?); Ok(result) } @@ -317,13 +343,17 @@ impl StoreHeader { let (deleted_count, _) = decode::(&result[Self::DELETED_COUNT_OFFSET..Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; - let (total_count, _offset) = + let (total_count, _) = decode::(&result[Self::TOTAL_COUNT_OFFSET..Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderTotalCount, e))?; + let (primary_column, _) = + decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; let header = StoreHeader { number_of_columns, deleted_count, total_count, + primary_column, }; Ok(header) @@ -351,15 +381,15 @@ impl EntryHeader { } impl EntryHeaderWithDataSize { - fn decode(bytes: &mut [u8]) -> Result { - let (is_deleted, offset) = + fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { + let (is_deleted, _) = decode::(&bytes) .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; - let (data_size, _) = - decode::(&bytes[offset..]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryDataSize, e))?; - Ok(Self { is_deleted, data_size} ) + let data_sizes = decode_sequence::(number_of_columns, &bytes[Self::DATA_SIZES_OFFSET..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryHeaderWithDataSizes, e))?; + + Ok(Self { is_deleted, data_sizes } ) } } @@ -372,19 +402,17 @@ impl Entry { Self { header: EntryHeader { is_deleted: true}, data } } - // FORMAT: [HEADER, ..sequence of data] - // HEADER: [Boolean (one byte), number of bytes in the data (not including the boolean)] - fn encode(self: &Entry) -> Result> + // FORMAT: [EntryHeaderWithDataSize, ..sequence of data] + fn encode(&self) -> Result> where T: Encode { let mut result: Vec = self.header.encode()?; - let mut encoded_data = encode_sequence(&self.data[..])?; - let encoded_data_len: usize = encoded_data.len(); - result.append(&mut encode(&encoded_data_len)?); // usize 8 bytes + + let (mut encoded_data, sizes) = encode_sequence_with_sizes(&self.data[..])?; + result.append(&mut encode_sequence(&sizes)?); // sizes of data (fixed by number of columns) result.append(&mut encoded_data); // data variable size 
Ok(result) } - } impl EntryDetailed { From 28741006e73dfe640eacdd308b99d148fac11254 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Fri, 2 Feb 2024 19:04:51 +0100 Subject: [PATCH 05/43] Rename cursor ~> file_position --- storage_engine/src/main.rs | 27 ++++---- storage_engine/src/storage_engine.rs | 100 ++++++++++++++++++--------- 2 files changed, 81 insertions(+), 46 deletions(-) diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 0bdd82c..a286df5 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -20,7 +20,8 @@ type Result = std::result::Result; async fn create_store() -> Result> { let mut store: Store = Store::new(TABLE_PATH, 5, 0).await.map_err(|e| e.to_io_or_panic())?; println!("CREATED"); - println!("{:?}", store.read_all_bytes().await?); + println!("THE STORE: {:?}", store); + println!("THE BYTES: {:?}", store.read_all_bytes().await?); let entry0: Entry = Entry::new_deleted(vec![1, 2, 3, 4, 5]); append_entry(&mut store, &entry0).await?; @@ -31,10 +32,12 @@ async fn create_store() -> Result> { println!("{:?}", store.read_all_bytes().await?); Ok(store) } + async fn connect_store() -> Result> { let mut store: Store = Store::connect(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; println!("CONNECTED"); - println!("{:?}", store.read_all_bytes().await?); + println!("THE STORE: {:?}", store); + println!("THE BYTES: {:?}", store.read_all_bytes().await?); Ok(store) } @@ -48,17 +51,17 @@ async fn create_or_connect() -> Result> { } -async fn append_entry(store: &mut Store, entry: &Entry) -> Result{ +async fn append_entry(store: &mut Store, entry: &Entry) -> Result{ println!("APPENDING"); println!("entry == {:?}", entry); - let cursor: Cursor = store.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; - println!("cursor == {:?}", cursor); - Ok(cursor) + let file_position: FilePosition = store.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; + println!("file_position == {:?}", file_position); + Ok(file_position) } -async fn read_entry(store: &mut Store, cursor: Cursor) -> Result>{ - println!("READING ENTRY at cursor={}", cursor); - let entry = store.read_entry_at(cursor).await.map_err(|e| e.to_io_or_panic())?; +async fn read_entry(store: &mut Store, file_position: FilePosition) -> Result>>{ + println!("READING ENTRY at file_position={}", file_position); + let entry = store.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; println!("ENTRY: {:?}", entry); Ok(entry) } @@ -75,10 +78,10 @@ async fn main() -> Result<()> { // println!("{:?}", store); // println!("{:?}", store.read_all_bytes().await?); - // let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); - // append_entry(&mut store, &entry0).await?; + let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); + append_entry(&mut store, &entry0).await?; - store.read_entries(2).await.map_err(|e| e.to_io_or_panic())?; + store.read_entries().await.map_err(|e| e.to_io_or_panic())?; // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 2555939..113982a 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -15,7 +15,7 @@ use std::mem::size_of; type Result = std::result::Result; pub type Column = u64; -pub type Cursor = u64; +pub type FilePosition = u64; // TODO: Consider introducing a phantom type for the data that's used in the store. 
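// One hypothetical shape for the phantom type mentioned in the TODO above
// (a sketch only; nothing in this patch series commits to it):
// pub struct TypedFilePosition<T> {
//     offset: FilePosition,
//     // fn() -> T keeps the wrapper covariant, Send and Sync without owning a T.
//     entry_type: std::marker::PhantomData<fn() -> T>,
// }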
#[derive(Debug)] @@ -24,6 +24,8 @@ pub struct Store { file: File, header: StoreHeader, data_type: PhantomData, + + eof_file_position: FilePosition, // meta // location of rows file // locations of index files @@ -32,6 +34,10 @@ pub struct Store { // list } +// TODO: Basically a pointer to Store + its own file position +// pub struct Cursor<'a, T> { +// } + #[derive(Debug)] pub struct StoreHeader { number_of_columns: usize, @@ -94,33 +100,41 @@ pub struct EntryDetailed { data: Vec, } + +pub struct EntryIterator<'a> { + file: &'a mut File, + current_file_position: FilePosition +} + //===Store=== pub async fn store_exists(table_folder: &str) -> Result { Ok(fs::metadata(table_folder).await.is_ok()) } impl Store { + const ROWS_FILE_NAME: &'static str = "rows"; + //===primitive file operations=== - // Moves the cursor right. + // Moves the file cursor right. async fn write_bytes(&mut self, bytes: &[u8]) -> Result { Ok(self.file.write(bytes).await?) } - // Moves the cursor right. + // Moves the file cursor right. async fn read_bytes(&mut self, bytes: &mut [u8]) -> Result<()> { self.file.read_exact(bytes).await?; Ok(()) } - // Moves the cursor right. + // Moves the file cursor right. async fn get_bytes(&mut self, count: usize) -> Result> { let mut result: Vec = Vec::with_capacity(count); self.read_bytes(&mut result).await?; Ok(result) } - async fn seek_to(&mut self, cursor: Cursor) -> Result<()>{ - self.file.seek(SeekFrom::Start(cursor)).await?; + async fn seek_to(&mut self, file_position: FilePosition) -> Result<()>{ + self.file.seek(SeekFrom::Start(file_position)).await?; Ok(()) } @@ -138,13 +152,13 @@ impl Store { self.seek_to(StoreHeader::SIZE as u64).await } - async fn current_cursor(&mut self) -> Result { - let next_cursor: Cursor = self.file.stream_position().await?; - Ok(next_cursor) + async fn current_file_position(&mut self) -> Result { + let next_file_position: FilePosition = self.file.stream_position().await?; + Ok(next_file_position) } // For debugging. - // Moves cursor to the end. + // Moves file cursor to the end. pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error>{ let mut bytes: Vec = vec![]; self.seek_to_start().await.map_err(|e| e.to_io_or_panic())?; @@ -152,8 +166,6 @@ impl Store { Ok(bytes) } - const ROWS_FILE_NAME: &'static str = "rows"; - // ===Creation=== pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { let path_to_table = Path::new(table_folder); @@ -177,15 +189,15 @@ impl Store { }; let encoded_header: Vec = header.encode()?; - println!("ENCODED_HEADER: {:?}", encoded_header); - let mut store = Self { table_folder: table_folder.to_string(), file, header, data_type: PhantomData::, + eof_file_position: 0, }; store.write_bytes(&encoded_header).await?; + store.eof_file_position = store.current_file_position().await?; Ok(store) } @@ -208,13 +220,15 @@ impl Store { file.read_exact(&mut header_bytes).await?; let header = StoreHeader::decode(&mut header_bytes).await?; + let eof_file_position = file.seek(SeekFrom::End(0)).await?; + let store = Self { table_folder: table_folder.to_string(), file, header, data_type: PhantomData::, + eof_file_position }; - println!("just connected TOOOOO {:?}", store); Ok(store) } @@ -236,24 +250,27 @@ impl Store { } // Moves cursor to the end. 
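// After a successful append the stored eof_file_position is refreshed, so the
// returned position can be read back immediately; a usage sketch:
//   let pos = store.append_entry(&entry).await?;
//   assert!(store.read_entry_at(pos).await?.is_some());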
- pub async fn append_entry(&mut self, entry: &Entry) -> Result + pub async fn append_entry(&mut self, entry: &Entry) -> Result where T: Encode { self.increment_total_count().await?; let encoded_entry: Vec = entry.encode()?; self.seek_to_end().await?; - let cursor: Cursor = self.current_cursor().await?; + let file_position: FilePosition = self.current_file_position().await?; self.write_bytes(&encoded_entry).await?; - Ok(cursor) + let eof_file_position: FilePosition = self.current_file_position().await?; + self.eof_file_position = eof_file_position; + + Ok(file_position) } // ===Deletion=== - pub async fn mark_deleted_at(&mut self, cursor: Cursor) -> Result<()> { + pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> { self.increment_deleted_count().await?; - self.seek_to(cursor).await?; + self.seek_to(file_position).await?; // TODO: Now you need to mutate the entry itself todo!() @@ -263,9 +280,8 @@ impl Store { // ===Lookup=== // WARNING: The cursor has to be at the start of an entry. Otherwise garbage data will be // decoded as an entry. - pub async fn read_entry_header_at(&mut self, cursor: Cursor) -> Result { - self.seek_to(cursor).await?; - self.file.seek(SeekFrom::Start(cursor)).await?; + pub async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { + self.seek_to(file_position).await?; let number_of_columns: usize = self.header.number_of_columns; let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; self.read_bytes(&mut header_bytes).await?; @@ -278,10 +294,15 @@ impl Store { Ok(header) } - pub async fn read_entry_at(&mut self, cursor: Cursor) -> Result> + // Returns None when file_position == eof_file_position + pub async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> where T: Decode { - let header = self.read_entry_header_at(cursor).await?; + if file_position == self.eof_file_position { + return Ok(None) + } + + let header = self.read_entry_header_at(file_position).await?; let mut data_bytes: Vec = vec![0; header.size_of_data()]; // TODO: Get rid of the println's // println!("PREPARED_DATA_BYTES: {:?}", data_bytes); self.read_bytes(&mut data_bytes).await?; @@ -291,20 +312,31 @@ impl Store { let entry: EntryDetailed = EntryDetailed::decode(header, self.header.number_of_columns, &mut data_bytes)?; - Ok(entry) + Ok(Some(entry)) } - pub async fn read_entries(&mut self, n: usize) -> Result<()> + // TODO: This needs to be some sort of an iterator + // pub async fn entries() -> EntryIterator { + // todo!() + // } + + pub async fn read_entries(&mut self) -> Result<()> where T: Decode + std::fmt::Debug { self.seek_to_start_of_data().await?; - let mut cursor: Cursor = self.current_cursor().await?; - for i in 0..n { - let entry = self.read_entry_at(cursor).await?; - println!("({}, {:?})", i, entry); - cursor = self.current_cursor().await?; + let mut file_position: FilePosition = self.current_file_position().await?; + loop { + match self.read_entry_at(file_position).await?
{ + Some(entry) => { + println!("{:?}", entry); + file_position = self.current_file_position().await?; + }, + None => { + println!("END of entries."); + return Ok(()) + } + } } - Ok(()) } pub async fn search_for_entry_with_id(&mut self, id: T) -> Result>> { @@ -313,7 +345,7 @@ impl Store { } // TODO: This needs to be some sort of an iterator - pub async fn get_eq(&self, column: Column, value: T) -> Result>> { + pub async fn get_all_eq(&self, column: Column, value: T) -> Result>> { todo!() } From 85ef52dfb4ab5e42b4f1e00b45d188311fd063ac Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Fri, 2 Feb 2024 20:45:18 +0100 Subject: [PATCH 06/43] Renaming --- storage_engine/src/index.rs | 18 +++++++++--------- storage_engine/src/storage_engine.rs | 4 ++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index b7e48ab..dde57d3 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -31,38 +31,38 @@ pub struct Index { pub struct IndexHeader { } -impl Index { - pub async fn new(file_name: &str) -> Result> { +impl Index { + pub async fn new(file_name: &str) -> Result> { todo!() } - pub async fn connect(file_name: &str) -> Result> { + pub async fn connect(file_name: &str) -> Result> { todo!() } // Saves the in-memory index to disk and deallocates. - pub async fn sleep() -> Result> { + pub async fn sleep() -> Result> { todo!() } // Loads the index into memory - pub async fn wake() -> Result> { + pub async fn wake() -> Result> { todo!() } pub async fn insert() -> Result<()> - where I: Encode, V: Encode + where K: Encode, V: Encode { todo!() } - pub async fn lookup(&mut self, k: I) -> Result> - where I: Encode + Decode, + pub async fn lookup(&mut self, k: K) -> Result> + where K: Encode + Decode, { todo!() } - pub async fn delete(&mut self, k: I) -> Result> { + pub async fn delete(&mut self, k: K) -> Result> { todo!() } } diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 113982a..6075476 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -22,6 +22,8 @@ pub type FilePosition = u64; pub struct Store { table_folder: String, file: File, + // primary_index: Vec>>, + // indexes: Vec>>>, header: StoreHeader, data_type: PhantomData, @@ -189,6 +191,8 @@ impl Store { }; let encoded_header: Vec = header.encode()?; + // Index::new(format!("rows", primary_column.to_string())) + let mut store = Self { table_folder: table_folder.to_string(), file, From dbd2ba99461e519e70a28ebd41add25b8dcd5e92 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 15:48:44 +0100 Subject: [PATCH 07/43] Thinking about indexes --- storage_engine/src/index.rs | 27 +++++++++++++-- storage_engine/src/storage_engine.rs | 50 ++++++++++++++++++++++++++-- 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index dde57d3..c28c91e 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -4,13 +4,14 @@ use tokio::fs::{File, OpenOptions, DirBuilder}; use std::path::Path; use std::collections::{BTreeMap}; +use async_trait::async_trait; use bincode; use bincode::{Decode, Encode}; use crate::binary_coding::{encode, decode, encode_sequence, decode_sequence}; use tokio::fs; -use crate::error::{Error, DecodeErrorKind}; +use crate::error::Error; use std::mem::size_of; @@ -19,6 +20,7 @@ type 
Result = std::result::Result; // Implements a persistent self-balancing Binary Search Tree. Nope. // We need fixed-size nodes. But we want to index Strings which are variable length. +#[derive(Debug)] pub struct Index { file: File, // None means index is asleep on disk. @@ -28,11 +30,28 @@ pub struct Index { value_type: PhantomData, } +#[derive(Debug)] pub struct IndexHeader { } +use crate::storage_engine::FilePosition; + +#[async_trait] +pub trait SomethingSupportingLeq { + async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> std::result::Result; +} + impl Index { - pub async fn new(file_name: &str) -> Result> { + // TODO: delete + // pub async fn new(file_name: &str, less_than_eq: &F) -> Result> + // where F: Fn(&mut Store, K, K) -> Fut, + // Store: SomethingSupportingLeq, + // Fut: Future>, + // { + // todo!() + // } + pub async fn new(file_name: &str) -> Result> + { todo!() } @@ -56,9 +75,11 @@ impl Index { todo!() } - pub async fn lookup(&mut self, k: K) -> Result> + pub async fn lookup(&mut self, store: &mut Store, k: K) -> Result> where K: Encode + Decode, + Store: SomethingSupportingLeq, { + let x = store.less_than_eq(123, 123).await?; todo!() } diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 6075476..00054b1 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -3,13 +3,20 @@ use tokio::fs::{File, OpenOptions, DirBuilder}; use std::path::Path; use std::marker::PhantomData; +use async_trait::async_trait; + use bincode; use bincode::{Decode, Encode}; use crate::binary_coding::{encode, decode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; use tokio::fs; +use crate::index::SomethingSupportingLeq; use crate::error::{Error, DecodeErrorKind}; +use crate::index::Index; + +use std::cell::RefCell; + use std::mem::size_of; type Result = std::result::Result; @@ -24,6 +31,7 @@ pub struct Store { file: File, // primary_index: Vec>>, // indexes: Vec>>>, + // primary_index: Index, header: StoreHeader, data_type: PhantomData, @@ -36,6 +44,11 @@ pub struct Store { // list } + + +type PositionOfValue = FilePosition; +type PositionOfRow = FilePosition; + // TODO: Basically a pointer to Store + its own file position // pub struct Cursor<'a, T> { // } @@ -113,6 +126,23 @@ pub async fn store_exists(table_folder: &str) -> Result { Ok(fs::metadata(table_folder).await.is_ok()) } +pub async fn less_than_eq(store: &mut Store, file_position0: FilePosition, file_position1: FilePosition) -> Result { + todo!() +} + +// pub trait SomethingSupportingLeq { +// async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> std::result::Result; +// } + +#[async_trait] +impl SomethingSupportingLeq for Store + where T: Send +{ + async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> std::result::Result { + Ok(true) + } +} + impl Store { const ROWS_FILE_NAME: &'static str = "rows"; @@ -168,6 +198,10 @@ impl Store { Ok(bytes) } + pub async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> Result { + todo!() + } + // ===Creation=== pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { let path_to_table = Path::new(table_folder); @@ -190,8 +224,11 @@ impl Store { primary_column, }; let encoded_header: Vec = header.encode()?; + - // Index::new(format!("rows", primary_column.to_string())) + // let index: Index =
Index::new( + // &format!("rows_{}", primary_column.to_string()), + // ).await?; let mut store = Self { table_folder: table_folder.to_string(), @@ -202,6 +239,7 @@ impl Store { }; store.write_bytes(&encoded_header).await?; store.eof_file_position = store.current_file_position().await?; + Ok(store) } @@ -231,7 +269,7 @@ impl Store { file, header, data_type: PhantomData::, - eof_file_position + eof_file_position, }; Ok(store) } @@ -298,6 +336,14 @@ impl Store { Ok(header) } + pub async fn search_for(&mut self, index: T) -> Result<()> + where T: Send + { + // let index = self.primary_index.borrow_mut(); + // let x = index.lookup(self, 123).await?; + todo!() + } + // Returns None when file_positoin == eof_file_position pub async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> where T: Decode From 3e7e8665fd6909f3f0c7a4adc788bac4ada9b2d3 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 16:39:40 +0100 Subject: [PATCH 08/43] Split Store into Store and Cursor --- storage_engine/src/main.rs | 20 +- storage_engine/src/storage_engine.rs | 435 +++++++++++++++------------ 2 files changed, 252 insertions(+), 203 deletions(-) diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index a286df5..c5b47de 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -23,11 +23,12 @@ async fn create_store() -> Result> { println!("THE STORE: {:?}", store); println!("THE BYTES: {:?}", store.read_all_bytes().await?); + let mut cursor = store.cursor(AccessMode::Write).await.map_err(|e| e.to_io_or_panic())?; let entry0: Entry = Entry::new_deleted(vec![1, 2, 3, 4, 5]); - append_entry(&mut store, &entry0).await?; + append_entry(&mut cursor, &entry0).await?; let entry1: Entry = Entry::new_deleted(vec![200, 200, 5, 6, 7]); - append_entry(&mut store, &entry1).await?; + append_entry(&mut cursor, &entry1).await?; println!("{:?}", store.read_all_bytes().await?); Ok(store) @@ -51,17 +52,17 @@ async fn create_or_connect() -> Result> { } -async fn append_entry(store: &mut Store, entry: &Entry) -> Result{ +async fn append_entry(cursor: &mut Cursor, entry: &Entry) -> Result{ println!("APPENDING"); println!("entry == {:?}", entry); - let file_position: FilePosition = store.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; + let file_position: FilePosition = cursor.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; println!("file_position == {:?}", file_position); Ok(file_position) } -async fn read_entry(store: &mut Store, file_position: FilePosition) -> Result>>{ +async fn read_entry(cursor: &mut Cursor, file_position: FilePosition) -> Result>>{ println!("READING ENTRY at file_position={}", file_position); - let entry = store.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; + let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; println!("ENTRY: {:?}", entry); Ok(entry) } @@ -71,17 +72,18 @@ async fn read_entry(store: &mut Store, file_position: FilePosition) -> Res async fn main() -> Result<()> { println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); - let mut store: Store = create_or_connect().await?; + let store: Store = create_or_connect().await?; // let entry0 = read_entry(&mut store, 16).await?; // let entry1 = read_entry(&mut store, 45).await?; // println!("{:?}", store); // println!("{:?}", store.read_all_bytes().await?); + let mut cursor = store.cursor(AccessMode::Write).await.map_err(|e| e.to_io_or_panic())?; let entry0: Entry = Entry::new(vec![99, 
98, 97, 96, 95]); - append_entry(&mut store, &entry0).await?; + append_entry(&mut cursor, &entry0).await?; - store.read_entries().await.map_err(|e| e.to_io_or_panic())?; + cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 00054b1..d96a42d 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -15,8 +15,6 @@ use crate::error::{Error, DecodeErrorKind}; use crate::index::Index; -use std::cell::RefCell; - use std::mem::size_of; type Result = std::result::Result; @@ -27,15 +25,19 @@ pub type FilePosition = u64; // TODO: Consider introducing a phantom type for the data that's used in the store. #[derive(Debug)] pub struct Store { + // TODO: This needs to track how many read-write cursors there are...? + // RWMutex + // {write: 0, read: n} ~> {write:0, read: n + 1} // create read + // {write: 0, read: n + 1} ~> {write:0, read: n} // destroy read + // {write: 0, read: 0} ~> {write: 1, read: 0} // create write + // {write: 1, read: 0} ~> {write: 0, read: 0} // destroy write table_folder: String, - file: File, // primary_index: Vec>>, // indexes: Vec>>>, // primary_index: Index, header: StoreHeader, data_type: PhantomData, - eof_file_position: FilePosition, // meta // location of rows file // locations of index files @@ -44,16 +46,24 @@ pub struct Store { // list } +pub struct Cursor { + header: StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + +pub enum AccessMode { + Read, + Write +} + +pub type PositionOfValue = FilePosition; +pub type PositionOfRow = FilePosition; -type PositionOfValue = FilePosition; -type PositionOfRow = FilePosition; - -// TODO: Basically a pointer to Store + its own file position -// pub struct Cursor<'a, T> { -// } - -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct StoreHeader { number_of_columns: usize, deleted_count: usize, @@ -143,8 +153,225 @@ impl SomethingSupportingLeq for Store } } +const ROWS_FILE_NAME: &'static str = "rows"; + impl Store { - const ROWS_FILE_NAME: &'static str = "rows"; + // For debugging. + // Moves file cursor to the end. 
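// Reads through a fresh read cursor rather than the store's own file handle,
// so dumping bytes does not disturb any position held elsewhere.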
+ pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error>{ + let mut bytes: Vec = vec![]; + let mut cursor = self.cursor(AccessMode::Read).await.map_err(|e| e.to_io_or_panic())?; + cursor.file.read_to_end(&mut bytes).await?; + Ok(bytes) + } + + // ===Creation=== + pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { + let path_to_table = Path::new(table_folder); + let path_to_rows = path_to_table.join(ROWS_FILE_NAME); + DirBuilder::new() + .create(path_to_table).await?; + + let mut file: File = + OpenOptions::new() + .write(true) + .read(true) + .create_new(true) + .open(path_to_rows) + .await?; + + let header = StoreHeader { + number_of_columns, + deleted_count: 0, + total_count: 0, + primary_column, + }; + let encoded_header: Vec = header.encode()?; + file.write(&encoded_header).await?; + + + // TODO: indexes + // let index: Index = Index::new( + // &format!("rows_{}", primary_column.to_string()), + // ).await?; + + let store = Self { + table_folder: table_folder.to_string(), + header, + data_type: PhantomData::, + }; + + Ok(store) + } + + pub async fn connect(table_folder: &str) -> Result + where T: std::fmt::Debug + { + let path_to_table = Path::new(table_folder); + let path_to_rows = path_to_table.join(ROWS_FILE_NAME); + + let mut file: File = + OpenOptions::new() + .read(true) + .write(true) + .open(path_to_rows) + .await?; + + // Unfortunately we can't yet use store.read_bytes, since it can't be created without the + // header. + let mut header_bytes = StoreHeader::decode_buffer(); + file.read_exact(&mut header_bytes).await?; + let header = StoreHeader::decode(&mut header_bytes).await?; + + let store = Self { + table_folder: table_folder.to_string(), + header, + data_type: PhantomData::, + }; + Ok(store) + } + + pub async fn cursor(&self, mode: AccessMode) -> Result> { + Cursor::new(&self, mode).await + } + + pub async fn garbage_collect(&mut self) -> Result<()> { + todo!() + } +} + +// ===Store Header=== +impl StoreHeader { + fn encode(&self) -> Result> { + let mut result = encode(&self.number_of_columns)?; + result.append(&mut encode(&self.deleted_count)?); + result.append(&mut encode(&self.total_count)?); + result.append(&mut encode(&self.primary_column)?); + Ok(result) + } + + fn decode_buffer() -> [u8; StoreHeader::SIZE] { + [0; StoreHeader::SIZE] + } + + async fn decode(result: &mut [u8]) -> Result { + let (number_of_columns, _) = + decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; + let (deleted_count, _) = + decode::(&result[Self::DELETED_COUNT_OFFSET..Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; + let (total_count, _) = + decode::(&result[Self::TOTAL_COUNT_OFFSET..Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderTotalCount, e))?; + let (primary_column, _) = + decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; + let header = StoreHeader { + number_of_columns, + deleted_count, + total_count, + primary_column, + }; + + Ok(header) + } + + // returns new count + fn increment_total_count(&mut self) -> usize { + self.total_count += 1; + self.total_count + } + + // returns new 
count + fn increment_deleted_count(&mut self) -> usize { + self.deleted_count += 1; + self.deleted_count + } +} + +// ====Entry==== +impl EntryHeader { + fn encode(self: &EntryHeader) -> Result> { + let result: Vec = encode(&self.is_deleted)?; + Ok(result) + } +} + +impl EntryHeaderWithDataSize { + fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { + let (is_deleted, _) = + decode::(&bytes) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; + + let data_sizes = decode_sequence::(number_of_columns, &bytes[Self::DATA_SIZES_OFFSET..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryHeaderWithDataSizes, e))?; + + Ok(Self { is_deleted, data_sizes } ) + } +} + +impl Entry { + pub fn new(data: Vec) -> Self { + Self { header: EntryHeader { is_deleted: false }, data } + } + + pub fn new_deleted(data: Vec) -> Self { + Self { header: EntryHeader { is_deleted: true}, data } + } + + // FORMAT: [EntryHeaderWithDataSize, ..sequence of data] + fn encode(&self) -> Result> + where T: Encode + { + let mut result: Vec = self.header.encode()?; + + let (mut encoded_data, sizes) = encode_sequence_with_sizes(&self.data[..])?; + result.append(&mut encode_sequence(&sizes)?); // sizes of data (fixed by number of columns) + result.append(&mut encoded_data); // data variable size + Ok(result) + } +} + +impl EntryDetailed { + fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result + where T: Decode + { + let data = decode_sequence::(number_of_columns, bytes) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; + Ok(EntryDetailed { header, data }) + } +} + + +//=================Cursor================== +impl Cursor { + pub async fn new(store: &Store, mode: AccessMode) -> Result { + let path_to_rows = Path::new(&store.table_folder).join(ROWS_FILE_NAME); + let file: File = match mode { + AccessMode::Read => + OpenOptions::new() + .read(true) + .open(path_to_rows) + .await?, + + AccessMode::Write => + OpenOptions::new() + .read(true) + .write(true) + .open(path_to_rows) + .await?, + }; + + let cursor = Self { + header: store.header.clone(), + file, + data_type: store.data_type, + + eof_file_position: 0, + }; + Ok(cursor) + } //===primitive file operations=== // Moves the file cursor right. 
@@ -165,7 +392,7 @@ impl Store { Ok(result) } - async fn seek_to(&mut self, file_position: FilePosition) -> Result<()>{ + async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { self.file.seek(SeekFrom::Start(file_position)).await?; Ok(()) } @@ -203,76 +430,6 @@ impl Store { } // ===Creation=== - pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { - let path_to_table = Path::new(table_folder); - let path_to_rows = path_to_table.join(Self::ROWS_FILE_NAME); - DirBuilder::new() - .create(path_to_table).await?; - - let file: File = - OpenOptions::new() - .write(true) - .read(true) - .create_new(true) - .open(path_to_rows) - .await?; - - let header = StoreHeader { - number_of_columns, - deleted_count: 0, - total_count: 0, - primary_column, - }; - let encoded_header: Vec = header.encode()?; - - - // let index: Index = Index::new( - // &format!("rows_{}", primary_column.to_string()), - // ).await?; - - let mut store = Self { - table_folder: table_folder.to_string(), - file, - header, - data_type: PhantomData::, - eof_file_position: 0, - }; - store.write_bytes(&encoded_header).await?; - store.eof_file_position = store.current_file_position().await?; - - Ok(store) - } - - pub async fn connect(table_folder: &str) -> Result - where T: std::fmt::Debug - { - let path_to_table = Path::new(table_folder); - let path_to_rows = path_to_table.join(Self::ROWS_FILE_NAME); - - let mut file: File = - OpenOptions::new() - .read(true) - .write(true) - .open(path_to_rows) - .await?; - - // Unfortunately we can't yet use store.read_bytes, since it can't be created without the - // header. - let mut header_bytes = StoreHeader::decode_buffer(); - file.read_exact(&mut header_bytes).await?; - let header = StoreHeader::decode(&mut header_bytes).await?; - - let eof_file_position = file.seek(SeekFrom::End(0)).await?; - - let store = Self { - table_folder: table_folder.to_string(), - file, - header, - data_type: PhantomData::, - eof_file_position, - }; - Ok(store) - } // ===Append Entry=== async fn increment_total_count(&mut self) -> Result<()> { @@ -398,118 +555,8 @@ impl Store { pub async fn get_all_eq(&self, column: Column, value: T) -> Result>> { todo!() } - - pub async fn garbage_collect(&mut self) -> Result<()> { - todo!() - } } -// ===Store Header=== -impl StoreHeader { - fn encode(&self) -> Result> { - let mut result = encode(&self.number_of_columns)?; - result.append(&mut encode(&self.deleted_count)?); - result.append(&mut encode(&self.total_count)?); - result.append(&mut encode(&self.primary_column)?); - Ok(result) - } - - fn decode_buffer() -> [u8; StoreHeader::SIZE] { - [0; StoreHeader::SIZE] - } - - async fn decode(result: &mut [u8]) -> Result { - let (number_of_columns, _) = - decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; - let (deleted_count, _) = - decode::(&result[Self::DELETED_COUNT_OFFSET..Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; - let (total_count, _) = - decode::(&result[Self::TOTAL_COUNT_OFFSET..Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderTotalCount, e))?; - let (primary_column, _) = - decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) - .map_err(|e| 
Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; - let header = StoreHeader { - number_of_columns, - deleted_count, - total_count, - primary_column, - }; - - Ok(header) - } - - // returns new count - fn increment_total_count(&mut self) -> usize { - self.total_count += 1; - self.total_count - } - - // returns new count - fn increment_deleted_count(&mut self) -> usize { - self.deleted_count += 1; - self.deleted_count - } -} - -// ====Entry==== -impl EntryHeader { - fn encode(self: &EntryHeader) -> Result> { - let result: Vec = encode(&self.is_deleted)?; - Ok(result) - } -} - -impl EntryHeaderWithDataSize { - fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { - let (is_deleted, _) = - decode::(&bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; - - let data_sizes = decode_sequence::(number_of_columns, &bytes[Self::DATA_SIZES_OFFSET..]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryHeaderWithDataSizes, e))?; - - Ok(Self { is_deleted, data_sizes } ) - } -} - -impl Entry { - pub fn new(data: Vec) -> Self { - Self { header: EntryHeader { is_deleted: false }, data } - } - - pub fn new_deleted(data: Vec) -> Self { - Self { header: EntryHeader { is_deleted: true}, data } - } - - // FORMAT: [EntryHeaderWithDataSize, ..sequence of data] - fn encode(&self) -> Result> - where T: Encode - { - let mut result: Vec = self.header.encode()?; - - let (mut encoded_data, sizes) = encode_sequence_with_sizes(&self.data[..])?; - result.append(&mut encode_sequence(&sizes)?); // sizes of data (fixed by number of columns) - result.append(&mut encoded_data); // data variable size - Ok(result) - } -} - -impl EntryDetailed { - fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result - where T: Decode - { - let data = decode_sequence::(number_of_columns, bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; - Ok(EntryDetailed { header, data }) - } -} - - - - // impl StorageEngine for ColumnStore { // async fn append(&mut self, id: Index, entry: Row) -> Result From 53aa5a01278a89849c41ccaab967f2a285338d7a Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 17:56:08 +0100 Subject: [PATCH 09/43] Attempt at delete --- storage_engine/src/main.rs | 11 ++- storage_engine/src/storage_engine.rs | 123 ++++++++++++++++++--------- 2 files changed, 93 insertions(+), 41 deletions(-) diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index c5b47de..117ac4a 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -24,10 +24,10 @@ async fn create_store() -> Result> { println!("THE BYTES: {:?}", store.read_all_bytes().await?); let mut cursor = store.cursor(AccessMode::Write).await.map_err(|e| e.to_io_or_panic())?; - let entry0: Entry = Entry::new_deleted(vec![1, 2, 3, 4, 5]); + let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); append_entry(&mut cursor, &entry0).await?; - let entry1: Entry = Entry::new_deleted(vec![200, 200, 5, 6, 7]); + let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); append_entry(&mut cursor, &entry1).await?; println!("{:?}", store.read_all_bytes().await?); @@ -83,6 +83,13 @@ async fn main() -> Result<()> { let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); append_entry(&mut cursor, &entry0).await?; + let entry1: Entry = Entry::new(vec![50,50,50,50,50]); + let file_position = append_entry(&mut cursor, &entry1).await?; + println!("CURRENT FILE_POSITION = {}", file_position); + // Now 
file_position points to entry1. + // cursor.mark_deleted_at(file_position).await.map_err(|e| e.to_io_or_panic())?; + // cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?; + cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index d96a42d..ddc0bb2 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -35,6 +35,10 @@ pub struct Store { // primary_index: Vec>>, // indexes: Vec>>>, // primary_index: Index, + + // TODO: It's not good to have StoreHeader copied to all the cursors, since they may modify it. + // How to sync? + // All header: StoreHeader, data_type: PhantomData, @@ -46,6 +50,9 @@ pub struct Store { // list } +// Read Cursors don't modify the rows nor Store Header. +// Write Cursors can modify both rows and Store Header. +// Probably should split these into two types. But they will have a lot of functionality in common. pub struct Cursor { header: StoreHeader, file: File, @@ -54,6 +61,14 @@ pub struct Cursor { eof_file_position: FilePosition, } +pub struct WriteCursor<'a, T> { + header: &'a mut StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + pub enum AccessMode { Read, Write @@ -94,10 +102,6 @@ pub struct EntryHeader { is_deleted: bool, } -impl EntryHeader { - const IS_DELETED_SIZE: usize = size_of::(); - const HEADER_SIZE: usize = Self::IS_DELETED_SIZE; -} #[derive(Debug)] pub struct EntryHeaderWithDataSize { @@ -363,13 +374,15 @@ impl Cursor { .await?, }; - let cursor = Self { + let mut cursor = Self { header: store.header.clone(), file, data_type: store.data_type, - eof_file_position: 0, + eof_file_position: 0, // This will be overwritten by the seek_to_start_of_data }; + cursor.seek_to_start_of_data().await?; + Ok(cursor) } @@ -392,7 +405,8 @@ impl Cursor { Ok(result) } - async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { + // TODO: make private + pub async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { self.file.seek(SeekFrom::Start(file_position)).await?; Ok(()) } @@ -411,27 +425,41 @@ impl Cursor { self.seek_to(StoreHeader::SIZE as u64).await } - async fn current_file_position(&mut self) -> Result { + // TODO: Make private + pub async fn current_file_position(&mut self) -> Result { let next_file_position: FilePosition = self.file.stream_position().await?; Ok(next_file_position) } + async fn is_at_eof(&mut self) -> Result { + Ok(self.current_file_position().await? == self.eof_file_position) + } + pub async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> Result { todo!() } - // ===Creation=== - - // ===Append Entry=== + // ===Iteration=== + // Assumes that the current file position is at a valid entry or EOF. + pub async fn next(&mut self) -> Result>> + where T: Decode + { + if self.is_at_eof().await?
{ + return Ok(None) + } - // ===Append Entry=== + let header = self.read_entry_header().await?; + + let mut data_bytes: Vec = vec![0; header.size_of_data()]; + self.read_bytes(&mut data_bytes).await?; + let entry: EntryDetailed = + EntryDetailed::decode(header, self.header.number_of_columns, &mut data_bytes)?; + + Ok(Some(entry)) + } + + + // ===Store Header Manipulation=== async fn increment_total_count(&mut self) -> Result<()> { self.seek_to_start().await?; self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; @@ -448,7 +476,18 @@ impl Cursor { Ok(()) } + // ===Entry Header Manipulation=== + // assumes we are at the start of the valid entry. + async fn set_entry_is_deleted_to(&mut self, is_deleted: bool) -> Result<()> { + self.seek_to(EntryHeaderWithDataSize::IS_DELETED_OFFSET as u64).await?; + self.write_bytes(&encode::(&is_deleted)?).await?; + Ok(()) + } + + // ===Append Entry=== + // Moves cursor to the end. + // Returns file position to the start of the new entry. pub async fn append_entry(&mut self, entry: &Entry) -> Result where T: Encode { @@ -467,21 +506,34 @@ impl Cursor { // ===Deletion=== pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> { - self.increment_deleted_count().await?; - self.seek_to(file_position).await?; + let entry_header = self.read_entry_header().await?; + if entry_header.is_deleted { + Ok(()) + } else { + self.increment_deleted_count().await?; + self.seek_to(file_position).await?; + self.set_entry_is_deleted_to(true).await?; - // TODO: Now you need to mutate the entry itself - todo!() + self.attempt_garbage_collection_if_necessary().await?; + Ok(()) + } } + async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> { + // TODO: What should be the policy? Counting size of garbage? Counting how many entries are + // garbage? + if self.header.deleted_count > 100 { + todo!() + } else { + Ok(()) + } + } // ===Lookup=== // WARNING: The cursor has to be at the start of an entry. Otherwise garbage data will be // decoded as an entry. 
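// For instance, a position landing inside a previous entry's payload would be
// decoded as an is_deleted byte plus a size table, yielding bogus column sizes
// and, most likely, a failed or absurdly large read.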
- pub async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { - self.seek_to(file_position).await?; - + async fn read_entry_header(&mut self) -> Result { let number_of_columns: usize = self.header.number_of_columns; let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; self.read_bytes(&mut header_bytes).await?; @@ -493,6 +545,11 @@ impl Cursor { Ok(header) } + pub async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { + self.seek_to(file_position).await?; + self.read_entry_header().await + } + pub async fn search_for(&mut self, index: T) -> Result<()> where T: Send { @@ -501,25 +558,13 @@ impl Cursor { todo!() } + // Returns None when file_position == eof_file_position pub async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> where T: Decode { - if file_position == self.eof_file_position { - return Ok(None) - } - - let header = self.read_entry_header_at(file_position).await?; - - let mut data_bytes: Vec = vec![0; header.size_of_data()]; - // TODO: Get rid of the println's - // println!("PREPARED_DATA_BYTES: {:?}", data_bytes); - self.read_bytes(&mut data_bytes).await?; - // println!("DATA_BYTES: {:?}", data_bytes); - let entry: EntryDetailed = - EntryDetailed::decode(header, self.header.number_of_columns, &mut data_bytes)?; - - Ok(Some(entry)) + self.seek_to(file_position).await?; + self.next().await } // TODO: This needs to be some sort of an iterator From a37c3a5e772f0f3d4bf09f1b44b462162d095499 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 19:00:00 +0100 Subject: [PATCH 10/43] Split Cursor into ReadCursor and WriteCursor --- storage_engine/src/cursor.rs | 302 +++++++++++++++++++++++++++ storage_engine/src/lib.rs | 1 + storage_engine/src/main.rs | 1 + storage_engine/src/storage_engine.rs | 70 +++---- 4 files changed, 334 insertions(+), 40 deletions(-) create mode 100644 storage_engine/src/cursor.rs diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs new file mode 100644 index 0000000..648d964 --- /dev/null +++ b/storage_engine/src/cursor.rs @@ -0,0 +1,302 @@ +use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; +use tokio::fs::{File, OpenOptions, DirBuilder}; +use std::path::Path; +use std::marker::PhantomData; + +use async_trait::async_trait; + +use bincode; +use bincode::{Decode, Encode}; +use crate::binary_coding::{encode, decode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; +use tokio::fs; + +use crate::error::{Error, DecodeErrorKind}; +use crate::storage_engine::{Store, StoreHeader, FilePosition, Result, ROWS_FILE_NAME, EntryDetailed, EntryHeaderWithDataSize, Entry}; + +#[async_trait] trait PrimitiveCursor { fn file(&mut self) -> &mut File; fn eof_file_position(&self) -> FilePosition; + async fn read_bytes(&mut self, bytes: &mut [u8]) -> Result<()> { self.file().read_exact(bytes).await?; Ok(()) } + async fn get_bytes(&mut self, count: usize) -> Result> { let mut result: Vec = Vec::with_capacity(count); self.read_bytes(&mut result).await?; Ok(result) } + async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { self.file().seek(SeekFrom::Start(file_position)).await?; Ok(()) } + async fn seek_to_start(&mut self) -> Result<()> { self.file().seek(SeekFrom::Start(0)).await?; Ok(()) } + async fn seek_to_end(&mut self) -> Result<()> { self.file().seek(SeekFrom::End(0)).await?; Ok(()) } + async fn
seek_to_start_of_data(&mut self) -> Result<()> { + self.seek_to(StoreHeader::SIZE as u64).await + } + + async fn current_file_position(&mut self) -> Result { + let next_file_position: FilePosition = self.file().stream_position().await?; + Ok(next_file_position) + } + + async fn is_at_eof(&mut self) -> Result { + Ok(self.current_file_position().await? == self.eof_file_position()) + } +} + +#[async_trait] +trait CursorWithStoreHeader: PrimitiveCursor { + fn header(&self) -> &StoreHeader; + + async fn read_entry_header(&mut self) -> Result { + let number_of_columns: usize = self.header().number_of_columns; + let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; + self.read_bytes(&mut header_bytes).await?; + let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..], number_of_columns)?; + + Ok(header) + } + + async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { + self.seek_to(file_position).await?; + self.read_entry_header().await + } + + // Returns None when file_position == eof_file_position + async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> + where T: Decode + { + self.seek_to(file_position).await?; + self.next().await + } + + // ===Iteration=== + // Assumes that the current file position is at a valid entry or EOF. + async fn next(&mut self) -> Result>> + where T: Decode + { + if self.is_at_eof().await? { + return Ok(None) + } + + let entry_header = self.read_entry_header().await?; + + let mut data_bytes: Vec = vec![0; entry_header.size_of_data()]; + self.read_bytes(&mut data_bytes).await?; + let entry: EntryDetailed = + EntryDetailed::decode(entry_header, self.header().number_of_columns, &mut data_bytes)?; + + Ok(Some(entry)) + } + + async fn read_entries(&mut self) -> Result<()> + where T: Decode + std::fmt::Debug + { + self.seek_to_start_of_data().await?; + while let Some(entry) = self.next().await? 
{ + println!("{:?}", entry); + } + println!("END of entries."); + Ok(()) + } + + async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> { + let mut bytes: Vec = vec![]; + self.file().read_to_end(&mut bytes).await?; + Ok(bytes) + } +} + + +// ===Concrete Cursors=== +pub struct ReadCursor { + header: StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + +pub struct WriteCursor<'a, T> { + header: &'a mut StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + +// ===PrimitiveCursor=== +impl PrimitiveCursor for ReadCursor { + fn file(&mut self) -> &mut File { + &mut self.file + } + + fn eof_file_position(&self) -> FilePosition { + self.eof_file_position + } +} + +impl PrimitiveCursor for WriteCursor<'_, T> { + fn file(&mut self) -> &mut File { + &mut self.file + } + + fn eof_file_position(&self) -> FilePosition { + self.eof_file_position + } +} + +// ===CursorWithStoreHeader=== +impl CursorWithStoreHeader for ReadCursor { + fn header(&self) -> &StoreHeader { + &self.header + } +} + +impl CursorWithStoreHeader for WriteCursor<'_, T> { + fn header(&self) -> &StoreHeader { + &self.header + } +} + +impl ReadCursor { + pub async fn new(store: &Store) -> Result + where T: Send + { + let path_to_rows = Path::new(&store.table_folder).join(ROWS_FILE_NAME); + let file: File = + OpenOptions::new() + .read(true) + .open(path_to_rows) + .await?; + + let mut cursor = Self { + header: store.header.clone(), + file, + data_type: store.data_type, + + eof_file_position: 0, // This will be overwritten by the seek_to_start_of_data + }; + cursor.seek_to_start_of_data().await?; + + Ok(cursor) + } + + pub async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> Result { + todo!() + } +} + +impl <'cursor, T> WriteCursor<'cursor, T> { + // 'store lives at least as long as 'cursor + pub async fn new<'store: 'cursor>(store: &'store mut Store) -> Result + where T: Send + { + let path_to_rows = Path::new(&store.table_folder).join(ROWS_FILE_NAME); + let file: File = + OpenOptions::new() + .read(true) + .write(true) + .open(path_to_rows) + .await?; + + let mut cursor = Self { + header: &mut store.header, + file, + data_type: store.data_type, + + eof_file_position: 0, // This will be overwritten by the seek_to_start_of_data + }; + cursor.seek_to_start_of_data().await?; + + Ok(cursor) + } + + // ===Primitive Operations=== + async fn write_bytes(&mut self, bytes: &[u8]) -> Result { + Ok(self.file.write(bytes).await?) + } + + // ===Store Header Manipulation=== + async fn increment_total_count(&mut self) -> Result<()> + where T: Send + { + self.seek_to_start().await?; + self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; + let new_count = self.header.increment_total_count(); + self.write_bytes(&encode::(&new_count)?).await?; + Ok(()) + } + + async fn increment_deleted_count(&mut self) -> Result<()> + where T: Send + { + self.seek_to_start().await?; + self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?; + let new_count = self.header.increment_deleted_count(); + self.write_bytes(&encode::(&new_count)?).await?; + Ok(()) + } + + // ===Entry Header Manipulation=== + // assumes we are at the start of a valid entry. + async fn set_entry_is_deleted_to(&mut self, is_deleted: bool) -> Result<()> + where T: Send + { + self.seek_to(EntryHeaderWithDataSize::IS_DELETED_OFFSET as u64).await?; + self.write_bytes(&encode::(&is_deleted)?).await?; + Ok(()) + } + + // ===Append Entry=== + + // Moves cursor to the end. + // Returns file position to the start of the new entry.
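// A usage sketch against the API introduced in this patch (illustrative only):
//   let mut w = WriteCursor::new(&mut store).await?;
//   let pos = w.append_entry(&Entry::new(vec![1, 2, 3, 4, 5])).await?;
//   w.mark_deleted_at(pos).await?;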
+ pub async fn append_entry(&mut self, entry: &Entry) -> Result + where T: Encode + Send + { + self.increment_total_count().await?; + + let encoded_entry: Vec = entry.encode()?; + self.seek_to_end().await?; + let file_position: FilePosition = self.current_file_position().await?; + self.write_bytes(&encoded_entry).await?; + + let eof_file_position: FilePosition = self.current_file_position().await?; + self.eof_file_position = eof_file_position; + + Ok(file_position) + } + + // ===Deletion=== + pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> + where T: Send + { + self.seek_to(file_position).await?; + let entry_header = self.read_entry_header().await?; + if entry_header.is_deleted { + Ok(()) + } else { + self.increment_deleted_count().await?; + self.seek_to(file_position).await?; + self.set_entry_is_deleted_to(true).await?; + + self.attempt_garbage_collection_if_necessary().await?; + Ok(()) + } + } + + async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> { + // TODO: What should be the policy? Counting size of garbage? Counting how many entries are + // garbage? + if self.header.deleted_count > 100 { + todo!() + } else { + Ok(()) + } + } +} diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs index 192f3db..164e164 100644 --- a/storage_engine/src/lib.rs +++ b/storage_engine/src/lib.rs @@ -2,3 +2,4 @@ pub mod storage_engine; mod binary_coding; mod error; mod index; +mod cursor; diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 117ac4a..504e519 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -8,6 +8,7 @@ mod storage_engine; mod binary_coding; mod error; mod index; +mod cursor; use crate::storage_engine::*; diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index ddc0bb2..27e9e41 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -17,7 +17,7 @@ use crate::index::Index; use std::mem::size_of; -type Result = std::result::Result; +pub type Result = std::result::Result; pub type Column = u64; pub type FilePosition = u64; @@ -31,7 +31,7 @@ pub struct Store { // {write: 0, read: n + 1} ~> {write:0, read: n} // destroy read // {write: 0, read: 0} ~> {write: 1, read: 0} // create write // {write: 1, read: 0} ~> {write: 0, read: 0} // destroy write - table_folder: String, + pub table_folder: String, // primary_index: Vec>>, // indexes: Vec>>>, // primary_index: Index, @@ -39,8 +39,8 @@ pub struct Store { // TODO: It's not good to have StoreHeader copied to all the cursors, since they may modify it. // How to sync? 
// All - header: StoreHeader, - data_type: PhantomData, + pub header: StoreHeader, + pub data_type: PhantomData, // meta // location of rows file @@ -61,14 +61,6 @@ pub struct Cursor { eof_file_position: FilePosition, } -pub struct WriteCursor<'a, T> { - header: &'a mut StoreHeader, - file: File, - data_type: PhantomData, - - eof_file_position: FilePosition, -} - pub enum AccessMode { Read, Write @@ -80,22 +72,22 @@ pub type PositionOfRow = FilePosition; #[derive(Debug, Clone)] pub struct StoreHeader { - number_of_columns: usize, - deleted_count: usize, - total_count: usize, - primary_column: Column, + pub number_of_columns: usize, + pub deleted_count: usize, + pub total_count: usize, + pub primary_column: Column, } impl StoreHeader { - const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); - const DELETED_COUNT_SIZE: usize = size_of::(); - const TOTAL_COUNT_SIZE: usize = size_of::(); - const PRIMARY_COLUMN_SIZE: usize = size_of::(); - const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; + pub const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); + pub const DELETED_COUNT_SIZE: usize = size_of::(); + pub const TOTAL_COUNT_SIZE: usize = size_of::(); + pub const PRIMARY_COLUMN_SIZE: usize = size_of::(); + pub const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; - const NUMBER_OF_COLUMNS_OFFSET: usize = 0; - const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; - const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; - const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; + pub const NUMBER_OF_COLUMNS_OFFSET: usize = 0; + pub const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; + pub const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; + pub const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; } #[derive(Debug)] @@ -105,21 +97,21 @@ pub struct EntryHeader { #[derive(Debug)] pub struct EntryHeaderWithDataSize { - is_deleted: bool, - data_sizes: Vec, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6 + pub is_deleted: bool, + pub data_sizes: Vec, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6 // bytes etc } impl EntryHeaderWithDataSize { - const IS_DELETED_OFFSET: usize = 0; - const IS_DELETED_SIZE: usize = size_of::(); - const DATA_SIZES_OFFSET: usize = Self::IS_DELETED_OFFSET + Self::IS_DELETED_SIZE; + pub const IS_DELETED_OFFSET: usize = 0; + pub const IS_DELETED_SIZE: usize = size_of::(); + pub const DATA_SIZES_OFFSET: usize = Self::IS_DELETED_OFFSET + Self::IS_DELETED_SIZE; - fn size(number_of_columns: usize) -> usize { + pub fn size(number_of_columns: usize) -> usize { let size_of_data_sizes: usize = number_of_columns*size_of::(); Self::IS_DELETED_SIZE + size_of_data_sizes } - fn size_of_data(&self) -> usize{ + pub fn size_of_data(&self) -> usize{ self.data_sizes.iter().sum() } } @@ -164,7 +156,7 @@ impl SomethingSupportingLeq for Store } } -const ROWS_FILE_NAME: &'static str = "rows"; +pub const ROWS_FILE_NAME: &'static str = "rows"; impl Store { // For debugging. 
@@ -289,13 +281,13 @@ impl StoreHeader { } // returns new count - fn increment_total_count(&mut self) -> usize { + pub fn increment_total_count(&mut self) -> usize { self.total_count += 1; self.total_count } // returns new count - fn increment_deleted_count(&mut self) -> usize { + pub fn increment_deleted_count(&mut self) -> usize { self.deleted_count += 1; self.deleted_count } @@ -310,7 +302,7 @@ impl EntryHeader { } impl EntryHeaderWithDataSize { - fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { + pub fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { let (is_deleted, _) = decode::(&bytes) .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; @@ -332,7 +324,7 @@ impl Entry { } // FORMAT: [EntryHeaderWithDataSize, ..sequence of data] - fn encode(&self) -> Result> + pub fn encode(&self) -> Result> where T: Encode { let mut result: Vec = self.header.encode()?; @@ -345,7 +337,7 @@ impl Entry { } impl EntryDetailed { - fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result + pub fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result where T: Decode { let data = decode_sequence::(number_of_columns, bytes) @@ -405,7 +397,6 @@ impl Cursor { Ok(result) } - // TODO: make private pub async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { self.file.seek(SeekFrom::Start(file_position)).await?; Ok(()) @@ -425,7 +416,6 @@ impl Cursor { self.seek_to(StoreHeader::SIZE as u64).await } - // TODO: Make private pub async fn current_file_position(&mut self) -> Result { let next_file_position: FilePosition = self.file.stream_position().await?; Ok(next_file_position) From 2261fe39de55197c8a0bd9d557ef87a2e8838680 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 19:17:47 +0100 Subject: [PATCH 11/43] Use new cursors --- storage_engine/src/cursor.rs | 33 +++- storage_engine/src/main.rs | 54 +++--- storage_engine/src/storage_engine.rs | 277 ++------------------------- 3 files changed, 81 insertions(+), 283 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 648d964..c658213 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -59,7 +59,7 @@ trait PrimitiveCursor { } #[async_trait] -trait CursorWithStoreHeader: PrimitiveCursor { +pub trait CursorWithStoreHeader: PrimitiveCursor { fn header(&self) -> &StoreHeader; async fn read_entry_header(&mut self) -> Result { @@ -103,6 +103,7 @@ trait CursorWithStoreHeader: PrimitiveCursor { Ok(Some(entry)) } + // ===Debugging=== async fn read_entries(&mut self) -> Result<()> where T: Decode + std::fmt::Debug { @@ -113,6 +114,12 @@ trait CursorWithStoreHeader: PrimitiveCursor { println!("END of entries."); Ok(()) } + + async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> { + let mut bytes: Vec = vec![]; + self.file().read_to_end(&mut bytes).await?; + Ok(bytes) + } } @@ -189,6 +196,10 @@ impl ReadCursor { Ok(cursor) } + + pub async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> Result { + todo!() + } } impl <'cursor, T> WriteCursor<'cursor, T> { @@ -300,3 +311,23 @@ impl <'cursor, T> WriteCursor<'cursor, T> { } } } + + + +// TODO + // pub async fn search_for(&mut self, index: T) -> Result<()> + // where T: Send + // { + // // let index = self.primary_index.borrow_mut(); + // // let x = index.lookup(self, 123).await?; + // todo!() + // } + // pub async 
fn search_for_entry_with_id(&mut self, id: T) -> Result>> { + // // TODO: make call to the primary index + // todo!() + // } + + // // TODO: This needs to be some sort of an iterator + // pub async fn get_all_eq(&self, column: Column, value: T) -> Result>> { + // todo!() + // } diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 504e519..fd38d58 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -11,6 +11,7 @@ mod index; mod cursor; use crate::storage_engine::*; +use crate::cursor::*; type Data = u32; @@ -24,14 +25,6 @@ async fn create_store() -> Result> { println!("THE STORE: {:?}", store); println!("THE BYTES: {:?}", store.read_all_bytes().await?); - let mut cursor = store.cursor(AccessMode::Write).await.map_err(|e| e.to_io_or_panic())?; - let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); - append_entry(&mut cursor, &entry0).await?; - - let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); - append_entry(&mut cursor, &entry1).await?; - - println!("{:?}", store.read_all_bytes().await?); Ok(store) } @@ -53,7 +46,7 @@ async fn create_or_connect() -> Result> { } -async fn append_entry(cursor: &mut Cursor, entry: &Entry) -> Result{ +async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: &Entry) -> Result{ println!("APPENDING"); println!("entry == {:?}", entry); let file_position: FilePosition = cursor.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; @@ -63,9 +56,10 @@ async fn append_entry(cursor: &mut Cursor, entry: &Entry) -> Result< async fn read_entry(cursor: &mut Cursor, file_position: FilePosition) -> Result>>{ println!("READING ENTRY at file_position={}", file_position); - let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; - println!("ENTRY: {:?}", entry); - Ok(entry) + // let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; + // println!("ENTRY: {:?}", entry); + // Ok(entry) + todo!() } @@ -73,32 +67,44 @@ async fn read_entry(cursor: &mut Cursor, file_position: FilePosition) -> R async fn main() -> Result<()> { println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); - let store: Store = create_or_connect().await?; + let mut store: Store = create_or_connect().await?; + + { + let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; + let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); + append_entry(&mut cursor, &entry0).await?; + + let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); + append_entry(&mut cursor, &entry1).await?; + + // println!("{:?}", store.read_all_bytes().await?); + let entry2: Entry = Entry::new(vec![99, 98, 97, 96, 95]); + append_entry(&mut cursor, &entry2).await?; + + let entry3: Entry = Entry::new(vec![50,50,50,50,50]); + append_entry(&mut cursor, &entry3).await?; + } - // let entry0 = read_entry(&mut store, 16).await?; - // let entry1 = read_entry(&mut store, 45).await?; // println!("{:?}", store); // println!("{:?}", store.read_all_bytes().await?); - let mut cursor = store.cursor(AccessMode::Write).await.map_err(|e| e.to_io_or_panic())?; - let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); - append_entry(&mut cursor, &entry0).await?; + // let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); + // append_entry(&mut cursor, &entry0).await?; + + // let entry1: Entry = Entry::new(vec![50,50,50,50,50]); + // let file_position = append_entry(&mut cursor, &entry1).await?; + // println!("CURRENT FILE_POSITION = {}", file_position); - let entry1: Entry = Entry::new(vec![50,50,50,50,50]); - let 
file_position = append_entry(&mut cursor, &entry1).await?; - println!("CURRENT FILE_POSITION = {}", file_position); // Now file_position point to entry1. // cursor.mark_deleted_at(file_position).await.map_err(|e| e.to_io_or_panic())?; // cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?; - cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; - - // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); // let cursor2 = store.append_entry(&entry2).await.map_err(|e| e.to_io_or_panic())?; // println!("cursor2 = {}", cursor2); println!("{:?}", store); + println!("{:?}", store.read_all_bytes().await?); println!("DONE"); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 27e9e41..091c403 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -12,6 +12,7 @@ use tokio::fs; use crate::index::SomethingSupportingLeq; use crate::error::{Error, DecodeErrorKind}; +use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader}; use crate::index::Index; @@ -159,15 +160,6 @@ impl SomethingSupportingLeq for Store pub const ROWS_FILE_NAME: &'static str = "rows"; impl Store { - // For debugging. - // Moves file cursor to the end. - pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error>{ - let mut bytes: Vec = vec![]; - let mut cursor = self.cursor(AccessMode::Read).await.map_err(|e| e.to_io_or_panic())?; - cursor.file.read_to_end(&mut bytes).await?; - Ok(bytes) - } - // ===Creation=== pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { let path_to_table = Path::new(table_folder); @@ -234,12 +226,26 @@ impl Store { Ok(store) } - pub async fn cursor(&self, mode: AccessMode) -> Result> { - Cursor::new(&self, mode).await + // ===Cursors=== + pub async fn read_cursor(&self) -> Result> + where T: Send + { + ReadCursor::new(self).await } - pub async fn garbage_collect(&mut self) -> Result<()> { - todo!() + pub async fn write_cursor(&mut self) -> Result> + where T: Send + { + WriteCursor::new(self).await + } + + // For debugging. + pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> + where T: Send + { + let mut cursor = self.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let bytes = cursor.read_all_bytes().await?; + Ok(bytes) } } @@ -347,251 +353,6 @@ impl EntryDetailed { } -//=================Cursor================== -impl Cursor { - pub async fn new(store: &Store, mode: AccessMode) -> Result { - let path_to_rows = Path::new(&store.table_folder).join(ROWS_FILE_NAME); - let file: File = match mode { - AccessMode::Read => - OpenOptions::new() - .read(true) - .open(path_to_rows) - .await?, - - AccessMode::Write => - OpenOptions::new() - .read(true) - .write(true) - .open(path_to_rows) - .await?, - }; - - let mut cursor = Self { - header: store.header.clone(), - file, - data_type: store.data_type, - - eof_file_position: 0, // This will be overwriten by the seek_to_start_of_data - }; - cursor.seek_to_start_of_data().await?; - - Ok(cursor) - } - - //===primitive file operations=== - // Moves the file cursor right. - async fn write_bytes(&mut self, bytes: &[u8]) -> Result { - Ok(self.file.write(bytes).await?) - } - - // Moves the file cursor right. - async fn read_bytes(&mut self, bytes: &mut [u8]) -> Result<()> { - self.file.read_exact(bytes).await?; - Ok(()) - } - - // Moves the file cursor right. 
- async fn get_bytes(&mut self, count: usize) -> Result> { - let mut result: Vec = Vec::with_capacity(count); - self.read_bytes(&mut result).await?; - Ok(result) - } - - pub async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { - self.file.seek(SeekFrom::Start(file_position)).await?; - Ok(()) - } - - async fn seek_to_start(&mut self) -> Result<()> { - self.file.seek(SeekFrom::Start(0)).await?; - Ok(()) - } - - async fn seek_to_end(&mut self) -> Result<()> { - self.file.seek(SeekFrom::End(0)).await?; - Ok(()) - } - - async fn seek_to_start_of_data(&mut self) -> Result<()> { - self.seek_to(StoreHeader::SIZE as u64).await - } - - pub async fn current_file_position(&mut self) -> Result { - let next_file_position: FilePosition = self.file.stream_position().await?; - Ok(next_file_position) - } - - async fn is_at_eof(&mut self) -> Result { - Ok(self.current_file_position().await? == self.eof_file_position) - } - - pub async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> Result { - todo!() - } - - // ===Iteration=== - // Assumes that the current file position is at a valid entry or EOF. - pub async fn next(&mut self) -> Result>> - where T: Decode - { - if self.is_at_eof().await? { - return Ok(None) - } - - let header = self.read_entry_header().await?; - - let mut data_bytes: Vec = vec![0; header.size_of_data()]; - self.read_bytes(&mut data_bytes).await?; - let entry: EntryDetailed = - EntryDetailed::decode(header, self.header.number_of_columns, &mut data_bytes)?; - - Ok(Some(entry)) - } - - - // ===Store Header Manipulation=== - async fn increment_total_count(&mut self) -> Result<()> { - self.seek_to_start().await?; - self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; - let new_count = self.header.increment_total_count(); - self.write_bytes(&encode::(&new_count)?).await?; - Ok(()) - } - - async fn increment_deleted_count(&mut self) -> Result<()> { - self.seek_to_start().await?; - self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?; - let new_count = self.header.increment_deleted_count(); - self.write_bytes(&encode::(&new_count)?).await?; - Ok(()) - } - - // ===Entry Header Manipulation=== - // assumes we are at the start of the valid entry. - async fn set_entry_is_deleted_to(&mut self, is_deleted: bool) -> Result<()> { - self.seek_to(EntryHeaderWithDataSize::IS_DELETED_OFFSET as u64).await?; - self.write_bytes(&encode::(&is_deleted)?).await?; - Ok(()) - } - - // ===Append Entry=== - - // Moves cursor to the end. - // Returns file position to the start of the new entry. 
- pub async fn append_entry(&mut self, entry: &Entry) -> Result - where T: Encode - { - self.increment_total_count().await?; - - let encoded_entry: Vec = entry.encode()?; - self.seek_to_end().await?; - let file_position: FilePosition = self.current_file_position().await?; - self.write_bytes(&encoded_entry).await?; - - let eof_file_position: FilePosition = self.current_file_position().await?; - self.eof_file_position = eof_file_position; - - Ok(file_position) - } - - // ===Deletion=== - pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> { - self.seek_to(file_position).await?; - let entry_header = self.read_entry_header().await?; - if entry_header.is_deleted { - Ok(()) - } else { - self.increment_deleted_count().await?; - self.seek_to(file_position).await?; - self.set_entry_is_deleted_to(true).await?; - - self.attempt_garbage_collection_if_necessary().await?; - Ok(()) - } - } - - async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> { - // TODO: What should be the policy? Counting size of garbage? Counting how many entries are - // garbage? - if self.header.deleted_count > 100 { - todo!() - } else { - Ok(()) - } - } - - // ===Lookup=== - // WARNING: The cursor has to be at the start of an entry. Otherwise garbage data will be - // decoded as an entry. - async fn read_entry_header(&mut self) -> Result { - let number_of_columns: usize = self.header.number_of_columns; - let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; - self.read_bytes(&mut header_bytes).await?; - let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..], number_of_columns)?; - // TODO: Get rid of the println's - // println!("HEADER_BYTES: {:?}", header_bytes); - // println!("HEADER: {:?}", header); - - Ok(header) - } - - pub async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { - self.seek_to(file_position).await?; - self.read_entry_header().await - } - - pub async fn search_for(&mut self, index: T) -> Result<()> - where T: Send - { - // let index = self.primary_index.borrow_mut(); - // let x = index.lookup(self, 123).await?; - todo!() - } - - - // Returns None when file_positoin == eof_file_position - pub async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> - where T: Decode - { - self.seek_to(file_position).await?; - self.next().await - } - - // TODO: This needs to be some sort of an iterator - // pub async fn entries() -> EntryIterator { - // todo!() - // } - - pub async fn read_entries(&mut self) -> Result<()> - where T: Decode + std::fmt::Debug - { - self.seek_to_start_of_data().await?; - let mut file_position: FilePosition = self.current_file_position().await?; - loop { - match self.read_entry_at(file_position).await? 
{ - Some(entry) => { - println!("{:?}", entry); - file_position = self.current_file_position().await?; - }, - None => { - println!("END of entries."); - return Ok(()) - } - } - } - } - - pub async fn search_for_entry_with_id(&mut self, id: T) -> Result>> { - // TODO: make call to the primary index - todo!() - } - - // TODO: This needs to be some sort of an iterator - pub async fn get_all_eq(&self, column: Column, value: T) -> Result>> { - todo!() - } -} - // impl StorageEngine for ColumnStore { // async fn append(&mut self, id: Index, entry: Row) -> Result From 3bf04ae2d6d6698f6acf9f001ed03955afd8b047 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 19:32:50 +0100 Subject: [PATCH 12/43] Cleanup --- storage_engine/src/cursor.rs | 11 +- storage_engine/src/entry.rs | 57 ++++++++ storage_engine/src/entry_header.rs | 42 ++++++ storage_engine/src/lib.rs | 3 + storage_engine/src/main.rs | 18 ++- storage_engine/src/storage_engine.rs | 206 +-------------------------- storage_engine/src/store_header.rs | 72 ++++++++++ 7 files changed, 194 insertions(+), 215 deletions(-) create mode 100644 storage_engine/src/entry.rs create mode 100644 storage_engine/src/entry_header.rs create mode 100644 storage_engine/src/store_header.rs diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index c658213..716abf7 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -1,5 +1,5 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; -use tokio::fs::{File, OpenOptions, DirBuilder}; +use tokio::fs::{File, OpenOptions}; use std::path::Path; use std::marker::PhantomData; @@ -7,11 +7,12 @@ use async_trait::async_trait; use bincode; use bincode::{Decode, Encode}; -use crate::binary_coding::{encode, decode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; -use tokio::fs; +use crate::binary_coding::encode; -use crate::error::{Error, DecodeErrorKind}; -use crate::storage_engine::{Store, StoreHeader, FilePosition, Result, ROWS_FILE_NAME, EntryDetailed, EntryHeaderWithDataSize, Entry}; +use crate::entry::{Entry, EntryDetailed}; +use crate::entry_header::EntryHeaderWithDataSize; +use crate::store_header::StoreHeader; +use crate::storage_engine::{Store, FilePosition, Result, ROWS_FILE_NAME}; #[async_trait] trait PrimitiveCursor { diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs new file mode 100644 index 0000000..84d15f9 --- /dev/null +++ b/storage_engine/src/entry.rs @@ -0,0 +1,57 @@ +use bincode::{Decode, Encode}; + +use crate::binary_coding::{encode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; +use crate::storage_engine::Result; +use crate::error::{Error, DecodeErrorKind}; +use crate::entry_header::{EntryHeader, EntryHeaderWithDataSize}; + +#[derive(Debug)] +pub struct Entry { + header: EntryHeader, + data: Vec, +} + +#[derive(Debug)] +pub struct EntryDetailed { + header: EntryHeaderWithDataSize, + data: Vec, +} + +impl EntryHeader { + fn encode(self: &EntryHeader) -> Result> { + let result: Vec = encode(&self.is_deleted)?; + Ok(result) + } +} + +impl Entry { + pub fn new(data: Vec) -> Self { + Self { header: EntryHeader { is_deleted: false }, data } + } + + pub fn new_deleted(data: Vec) -> Self { + Self { header: EntryHeader { is_deleted: true}, data } + } + + // FORMAT: [EntryHeaderWithDataSize, ..sequence of data] + pub fn encode(&self) -> Result> + where T: Encode + { + let mut result: Vec = self.header.encode()?; + + let (mut encoded_data, sizes) = 
encode_sequence_with_sizes(&self.data[..])?;
+        result.append(&mut encode_sequence(&sizes)?); // sizes of data (fixed by number of columns)
+        result.append(&mut encoded_data); // data, variable size
+        Ok(result)
+    }
+}
+
+impl<T> EntryDetailed<T> {
+    pub fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result<Self>
+    where T: Decode
+    {
+        let data = decode_sequence::<T>(number_of_columns, bytes)
+            .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?;
+        Ok(EntryDetailed { header, data })
+    }
+}
diff --git a/storage_engine/src/entry_header.rs b/storage_engine/src/entry_header.rs
new file mode 100644
index 0000000..b1a9f7d
--- /dev/null
+++ b/storage_engine/src/entry_header.rs
@@ -0,0 +1,42 @@
+use crate::binary_coding::{decode, decode_sequence};
+use crate::storage_engine::Result;
+use crate::error::{Error, DecodeErrorKind};
+use std::mem::size_of;
+
+#[derive(Debug)]
+pub struct EntryHeader {
+    pub is_deleted: bool,
+}
+
+#[derive(Debug)]
+pub struct EntryHeaderWithDataSize {
+    pub is_deleted: bool,
+    pub data_sizes: Vec<usize>, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6
+                                // bytes, etc.
+}
+impl EntryHeaderWithDataSize {
+    pub const IS_DELETED_OFFSET: usize = 0;
+    pub const IS_DELETED_SIZE: usize = size_of::<bool>();
+    pub const DATA_SIZES_OFFSET: usize = Self::IS_DELETED_OFFSET + Self::IS_DELETED_SIZE;
+
+    pub fn size(number_of_columns: usize) -> usize {
+        let size_of_data_sizes: usize = number_of_columns * size_of::<usize>();
+        Self::IS_DELETED_SIZE + size_of_data_sizes
+    }
+
+    pub fn size_of_data(&self) -> usize {
+        self.data_sizes.iter().sum()
+    }
+
+    pub fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result<Self> {
+        let (is_deleted, _) =
+            decode::<bool>(&bytes)
+            .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?;
+
+        let data_sizes = decode_sequence::<usize>(number_of_columns, &bytes[Self::DATA_SIZES_OFFSET..])
+            .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryHeaderWithDataSizes, e))?;
+
+        Ok(Self { is_deleted, data_sizes })
+    }
+}
+
diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs
index 164e164..65f1a06 100644
--- a/storage_engine/src/lib.rs
+++ b/storage_engine/src/lib.rs
@@ -3,3 +3,6 @@ mod binary_coding;
 mod error;
 mod index;
 mod cursor;
+mod entry;
+mod entry_header;
+mod store_header;
diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs
index fd38d58..e09e481 100644
--- a/storage_engine/src/main.rs
+++ b/storage_engine/src/main.rs
@@ -1,17 +1,15 @@
-use tokio::sync::{Mutex, RwLock};
-use tokio::fs::{File, OpenOptions};
-use tokio::io::{BufReader, BufWriter, AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom};
-use tokio::fs;
-use std::path::Path;
-
 mod storage_engine;
 mod binary_coding;
 mod error;
 mod index;
 mod cursor;
+mod entry;
+mod entry_header;
+mod store_header;
 
-use crate::storage_engine::*;
-use crate::cursor::*;
+use crate::entry::{Entry, EntryDetailed};
+use crate::storage_engine::{Store, FilePosition};
+use crate::cursor::{ReadCursor, WriteCursor};
 
 type Data = u32;
 
@@ -46,7 +44,7 @@
 }
 
-async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: &Entry<Data>) -> Result<FilePosition>{
+async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: &Entry<Data>) -> Result<FilePosition> {
     println!("APPENDING");
     println!("entry == {:?}", entry);
     let file_position: FilePosition = cursor.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?;
     Ok(file_position)
 }
 
-async fn 
read_entry(cursor: &mut Cursor, file_position: FilePosition) -> Result>>{ +async fn read_entry(cursor: &mut ReadCursor, file_position: FilePosition) -> Result>> { println!("READING ENTRY at file_position={}", file_position); // let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; // println!("ENTRY: {:?}", entry); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 091c403..4284f74 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -1,22 +1,18 @@ -use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::fs::{File, OpenOptions, DirBuilder}; +use tokio::fs; use std::path::Path; use std::marker::PhantomData; - use async_trait::async_trait; -use bincode; -use bincode::{Decode, Encode}; -use crate::binary_coding::{encode, decode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; -use tokio::fs; - use crate::index::SomethingSupportingLeq; -use crate::error::{Error, DecodeErrorKind}; +use crate::error::Error; use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader}; +use crate::store_header::StoreHeader; -use crate::index::Index; +// TODO +// use crate::index::Index; -use std::mem::size_of; pub type Result = std::result::Result; @@ -42,99 +38,12 @@ pub struct Store { // All pub header: StoreHeader, pub data_type: PhantomData, - - // meta - // location of rows file - // locations of index files - // - // rows file - // list -} - -// Read Cursors don't modify the rows nor Store Header. -// Write Cursors can modify both rows and Store Header. -// Probably should split these into two types. But they will have a lot of functionality in common. -pub struct Cursor { - header: StoreHeader, - file: File, - data_type: PhantomData, - - eof_file_position: FilePosition, -} - -pub enum AccessMode { - Read, - Write } pub type PositionOfValue = FilePosition; pub type PositionOfRow = FilePosition; -#[derive(Debug, Clone)] -pub struct StoreHeader { - pub number_of_columns: usize, - pub deleted_count: usize, - pub total_count: usize, - pub primary_column: Column, -} -impl StoreHeader { - pub const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); - pub const DELETED_COUNT_SIZE: usize = size_of::(); - pub const TOTAL_COUNT_SIZE: usize = size_of::(); - pub const PRIMARY_COLUMN_SIZE: usize = size_of::(); - pub const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; - - pub const NUMBER_OF_COLUMNS_OFFSET: usize = 0; - pub const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; - pub const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; - pub const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; -} - -#[derive(Debug)] -pub struct EntryHeader { - is_deleted: bool, -} - -#[derive(Debug)] -pub struct EntryHeaderWithDataSize { - pub is_deleted: bool, - pub data_sizes: Vec, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6 - // bytes etc -} -impl EntryHeaderWithDataSize { - pub const IS_DELETED_OFFSET: usize = 0; - pub const IS_DELETED_SIZE: usize = size_of::(); - pub const DATA_SIZES_OFFSET: usize = Self::IS_DELETED_OFFSET + Self::IS_DELETED_SIZE; - - pub fn size(number_of_columns: usize) -> usize { - let size_of_data_sizes: usize = number_of_columns*size_of::(); - Self::IS_DELETED_SIZE + size_of_data_sizes - } 
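    // A worked example of the header arithmetic defined in size() above, as a
    // sketch assuming a 64-bit target (size_of::<bool>() == 1,
    // size_of::<usize>() == 8) and this crate's fixed-int, big-endian bincode
    // config, which writes each usize as 8 bytes:
    //
    //     byte  0        is_deleted flag          (IS_DELETED_OFFSET == 0)
    //     bytes 1..=8    data_sizes[0] as u64     (DATA_SIZES_OFFSET == 1)
    //     ...
    //     bytes 33..=40  data_sizes[4]
    //
    //     assert_eq!(EntryHeaderWithDataSize::size(5), 1 + 5 * 8); // 41 bytes
    //
    // The row's payload follows immediately afterwards and occupies
    // size_of_data() == data_sizes.iter().sum::<usize>() bytes.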
- - pub fn size_of_data(&self) -> usize{ - self.data_sizes.iter().sum() - } -} - -#[derive(Debug)] -pub struct Entry { - header: EntryHeader, - data: Vec, -} - -#[derive(Debug)] -pub struct EntryDetailed { - header: EntryHeaderWithDataSize, - data: Vec, -} - - -pub struct EntryIterator<'a> { - file: &'a mut File, - current_file_position: FilePosition -} - //===Store=== pub async fn store_exists(table_folder: &str) -> Result { Ok(fs::metadata(table_folder).await.is_ok()) @@ -144,10 +53,6 @@ pub async fn less_than_eq(store: &mut Store, file_position0: FilePosition, todo!() } -// pub trait SomethingSupportingLeq { -// async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> std::result::Result; -// } - #[async_trait] impl SomethingSupportingLeq for Store where T: Send @@ -250,107 +155,8 @@ impl Store { } // ===Store Header=== -impl StoreHeader { - fn encode(&self) -> Result> { - let mut result = encode(&self.number_of_columns)?; - result.append(&mut encode(&self.deleted_count)?); - result.append(&mut encode(&self.total_count)?); - result.append(&mut encode(&self.primary_column)?); - Ok(result) - } - - fn decode_buffer() -> [u8; StoreHeader::SIZE] { - [0; StoreHeader::SIZE] - } - - async fn decode(result: &mut [u8]) -> Result { - let (number_of_columns, _) = - decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; - let (deleted_count, _) = - decode::(&result[Self::DELETED_COUNT_OFFSET..Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; - let (total_count, _) = - decode::(&result[Self::TOTAL_COUNT_OFFSET..Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderTotalCount, e))?; - let (primary_column, _) = - decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; - let header = StoreHeader { - number_of_columns, - deleted_count, - total_count, - primary_column, - }; - - Ok(header) - } - - // returns new count - pub fn increment_total_count(&mut self) -> usize { - self.total_count += 1; - self.total_count - } - - // returns new count - pub fn increment_deleted_count(&mut self) -> usize { - self.deleted_count += 1; - self.deleted_count - } -} // ====Entry==== -impl EntryHeader { - fn encode(self: &EntryHeader) -> Result> { - let result: Vec = encode(&self.is_deleted)?; - Ok(result) - } -} - -impl EntryHeaderWithDataSize { - pub fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { - let (is_deleted, _) = - decode::(&bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; - - let data_sizes = decode_sequence::(number_of_columns, &bytes[Self::DATA_SIZES_OFFSET..]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryHeaderWithDataSizes, e))?; - - Ok(Self { is_deleted, data_sizes } ) - } -} - -impl Entry { - pub fn new(data: Vec) -> Self { - Self { header: EntryHeader { is_deleted: false }, data } - } - - pub fn new_deleted(data: Vec) -> Self { - Self { header: EntryHeader { is_deleted: true}, data } - } - - // FORMAT: [EntryHeaderWithDataSize, ..sequence of data] - pub fn encode(&self) -> Result> - where T: Encode - { - let mut result: Vec = self.header.encode()?; - - let (mut encoded_data, sizes) = 
encode_sequence_with_sizes(&self.data[..])?; - result.append(&mut encode_sequence(&sizes)?); // sizes of data (fixed by number of columns) - result.append(&mut encoded_data); // data variable size - Ok(result) - } -} - -impl EntryDetailed { - pub fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result - where T: Decode - { - let data = decode_sequence::(number_of_columns, bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; - Ok(EntryDetailed { header, data }) - } -} // impl StorageEngine for ColumnStore { diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/store_header.rs new file mode 100644 index 0000000..fa84478 --- /dev/null +++ b/storage_engine/src/store_header.rs @@ -0,0 +1,72 @@ +use crate::binary_coding::{encode, decode}; +use crate::storage_engine::{Result, Column}; +use crate::error::{Error, DecodeErrorKind}; +use std::mem::size_of; + +#[derive(Debug, Clone)] +pub struct StoreHeader { + pub number_of_columns: usize, + pub deleted_count: usize, + pub total_count: usize, + pub primary_column: Column, +} + +impl StoreHeader { + pub const NUMBER_OF_COLUMNS_SIZE: usize = size_of::(); + pub const DELETED_COUNT_SIZE: usize = size_of::(); + pub const TOTAL_COUNT_SIZE: usize = size_of::(); + pub const PRIMARY_COLUMN_SIZE: usize = size_of::(); + pub const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; + + pub const NUMBER_OF_COLUMNS_OFFSET: usize = 0; + pub const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; + pub const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; + pub const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; + + pub fn encode(&self) -> Result> { + let mut result = encode(&self.number_of_columns)?; + result.append(&mut encode(&self.deleted_count)?); + result.append(&mut encode(&self.total_count)?); + result.append(&mut encode(&self.primary_column)?); + Ok(result) + } + + pub fn decode_buffer() -> [u8; StoreHeader::SIZE] { + [0; StoreHeader::SIZE] + } + + pub async fn decode(result: &mut [u8]) -> Result { + let (number_of_columns, _) = + decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; + let (deleted_count, _) = + decode::(&result[Self::DELETED_COUNT_OFFSET..Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderDeletedCount, e))?; + let (total_count, _) = + decode::(&result[Self::TOTAL_COUNT_OFFSET..Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderTotalCount, e))?; + let (primary_column, _) = + decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; + let header = StoreHeader { + number_of_columns, + deleted_count, + total_count, + primary_column, + }; + + Ok(header) + } + + // returns new count + pub fn increment_total_count(&mut self) -> usize { + self.total_count += 1; + self.total_count + } + + // returns new count + pub fn increment_deleted_count(&mut self) -> usize { + self.deleted_count += 1; + self.deleted_count + } +} From a345bf99c6b72db71f67a039675e28cc582d6238 Mon Sep 17 00:00:00 2001 From: Yuriy 
Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 21:34:50 +0100 Subject: [PATCH 13/43] Fix delete bug --- storage_engine/src/cursor.rs | 68 ++++++++++++++++---------- storage_engine/src/entry.rs | 9 +--- storage_engine/src/entry_header.rs | 16 ++++++- storage_engine/src/main.rs | 76 +++++++++++++++++++----------- 4 files changed, 107 insertions(+), 62 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 716abf7..763db68 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -10,12 +10,13 @@ use bincode::{Decode, Encode}; use crate::binary_coding::encode; use crate::entry::{Entry, EntryDetailed}; -use crate::entry_header::EntryHeaderWithDataSize; +use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; use crate::storage_engine::{Store, FilePosition, Result, ROWS_FILE_NAME}; #[async_trait] -trait PrimitiveCursor { +// TODO: Make this private +pub trait PrimitiveCursor { fn file(&mut self) -> &mut File; fn eof_file_position(&self) -> FilePosition; @@ -30,22 +31,23 @@ trait PrimitiveCursor { Ok(result) } - async fn seek_to(&mut self, file_position: FilePosition) -> Result<()> { - self.file().seek(SeekFrom::Start(file_position)).await?; - Ok(()) + async fn seek_to(&mut self, file_position: FilePosition) -> Result { + let file_position = self.file().seek(SeekFrom::Start(file_position)).await?; + Ok(file_position) } - async fn seek_to_start(&mut self) -> Result<()> { - self.file().seek(SeekFrom::Start(0)).await?; - Ok(()) + // Start of the file i.e. the Header, not the entries. + async fn seek_to_start(&mut self) -> Result { + let file_position = self.file().seek(SeekFrom::Start(0)).await?; + Ok(file_position) } - async fn seek_to_end(&mut self) -> Result<()> { - self.file().seek(SeekFrom::End(0)).await?; - Ok(()) + async fn seek_to_end(&mut self) -> Result { + let file_position = self.file().seek(SeekFrom::End(0)).await?; + Ok(file_position) } - async fn seek_to_start_of_data(&mut self) -> Result<()> { + async fn seek_to_start_of_data(&mut self) -> Result { self.seek_to(StoreHeader::SIZE as u64).await } @@ -55,7 +57,10 @@ trait PrimitiveCursor { } async fn is_at_eof(&mut self) -> Result { - Ok(self.current_file_position().await? == self.eof_file_position()) + let current_file_position = self.current_file_position().await?; + let eof_file_position = self.eof_file_position(); + println!("IN EOF: current={}, eof_file_position={}", current_file_position, eof_file_position); + Ok(current_file_position == eof_file_position) } } @@ -90,9 +95,12 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { async fn next(&mut self) -> Result>> where T: Decode { + println!("are we at eof?"); if self.is_at_eof().await? 
{ + println!("YES"); return Ok(None) } + println!("NO"); let entry_header = self.read_entry_header().await?; @@ -118,6 +126,7 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> { let mut bytes: Vec = vec![]; + self.seek_to_start().await.map_err(|e| e.to_io_or_panic())?; self.file().read_to_end(&mut bytes).await?; Ok(bytes) } @@ -191,8 +200,11 @@ impl ReadCursor { file, data_type: store.data_type, - eof_file_position: 0, // This will be overwriten by the seek_to_start_of_data + eof_file_position: 0, }; + let eof_file_position: FilePosition = cursor.seek_to_end().await?; + cursor.eof_file_position = eof_file_position; + cursor.seek_to_start_of_data().await?; Ok(cursor) @@ -221,8 +233,11 @@ impl <'cursor, T> WriteCursor<'cursor, T> { file, data_type: store.data_type, - eof_file_position: 0, // This will be overwriten by the seek_to_start_of_data + eof_file_position: 0, }; + let eof_file_position: FilePosition = cursor.seek_to_end().await?; + cursor.eof_file_position = eof_file_position; + cursor.seek_to_start_of_data().await?; Ok(cursor) @@ -255,12 +270,10 @@ impl <'cursor, T> WriteCursor<'cursor, T> { } // ===Entry Header Manipulation=== - // assumes we are at the start of the valid entry. - async fn set_entry_is_deleted_to(&mut self, is_deleted: bool) -> Result<()> - where T: Send - { - self.seek_to(EntryHeaderWithDataSize::IS_DELETED_OFFSET as u64).await?; - self.write_bytes(&encode::(&is_deleted)?).await?; + // assumes we are at the start of valid entry. + async fn set_new_entry_header(&mut self, entry_header: EntryHeader) -> Result<()> { + let bytes: Vec = entry_header.encode()?; + self.write_bytes(&bytes).await?; Ok(()) } @@ -274,8 +287,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> { self.increment_total_count().await?; let encoded_entry: Vec = entry.encode()?; - self.seek_to_end().await?; - let file_position: FilePosition = self.current_file_position().await?; + let file_position = self.seek_to_end().await?; self.write_bytes(&encoded_entry).await?; let eof_file_position: FilePosition = self.current_file_position().await?; @@ -289,13 +301,19 @@ impl <'cursor, T> WriteCursor<'cursor, T> { where T: Send { self.seek_to(file_position).await?; - let entry_header = self.read_entry_header().await?; + println!("Attempting to read the entry"); + let mut entry_header = self.read_entry_header().await?; + println!("Entry Header == {:?}", entry_header); if entry_header.is_deleted { + println!("Already deleted"); Ok(()) } else { + println!("Marking as deleted"); self.increment_deleted_count().await?; self.seek_to(file_position).await?; - self.set_entry_is_deleted_to(true).await?; + + entry_header.is_deleted = true; + self.set_new_entry_header(entry_header.into()).await?; self.attempt_garbage_collection_if_necessary().await?; Ok(()) diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs index 84d15f9..9d9de38 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/entry.rs @@ -1,6 +1,6 @@ use bincode::{Decode, Encode}; -use crate::binary_coding::{encode, encode_sequence, encode_sequence_with_sizes, decode_sequence}; +use crate::binary_coding::{encode_sequence, encode_sequence_with_sizes, decode_sequence}; use crate::storage_engine::Result; use crate::error::{Error, DecodeErrorKind}; use crate::entry_header::{EntryHeader, EntryHeaderWithDataSize}; @@ -17,13 +17,6 @@ pub struct EntryDetailed { data: Vec, } -impl EntryHeader { - fn encode(self: &EntryHeader) -> Result> { - let result: Vec = 
encode(&self.is_deleted)?; - Ok(result) - } -} - impl Entry { pub fn new(data: Vec) -> Self { Self { header: EntryHeader { is_deleted: false }, data } diff --git a/storage_engine/src/entry_header.rs b/storage_engine/src/entry_header.rs index b1a9f7d..4aa904c 100644 --- a/storage_engine/src/entry_header.rs +++ b/storage_engine/src/entry_header.rs @@ -1,4 +1,4 @@ -use crate::binary_coding::{decode, decode_sequence}; +use crate::binary_coding::{decode, encode, decode_sequence}; use crate::storage_engine::Result; use crate::error::{Error, DecodeErrorKind}; use std::mem::size_of; @@ -14,6 +14,20 @@ pub struct EntryHeaderWithDataSize { pub data_sizes: Vec, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6 // bytes etc } + +impl EntryHeader { + pub fn encode(self: &EntryHeader) -> Result> { + let result: Vec = encode(&self.is_deleted)?; + Ok(result) + } +} + +impl From for EntryHeader { + fn from(entry: EntryHeaderWithDataSize) -> Self { + Self { is_deleted: entry.is_deleted, } + } +} + impl EntryHeaderWithDataSize { pub const IS_DELETED_OFFSET: usize = 0; pub const IS_DELETED_SIZE: usize = size_of::(); diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index e09e481..d530bd1 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -9,7 +9,7 @@ mod store_header; use crate::entry::{Entry, EntryDetailed}; use crate::storage_engine::{Store, FilePosition}; -use crate::cursor::{ReadCursor, WriteCursor}; +use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, PrimitiveCursor}; type Data = u32; @@ -20,8 +20,8 @@ type Result = std::result::Result; async fn create_store() -> Result> { let mut store: Store = Store::new(TABLE_PATH, 5, 0).await.map_err(|e| e.to_io_or_panic())?; println!("CREATED"); - println!("THE STORE: {:?}", store); - println!("THE BYTES: {:?}", store.read_all_bytes().await?); + // println!("THE STORE: {:?}", store); + // println!("THE BYTES: {:?}", store.read_all_bytes().await?); Ok(store) } @@ -29,8 +29,8 @@ async fn create_store() -> Result> { async fn connect_store() -> Result> { let mut store: Store = Store::connect(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; println!("CONNECTED"); - println!("THE STORE: {:?}", store); - println!("THE BYTES: {:?}", store.read_all_bytes().await?); + // println!("THE STORE: {:?}", store); + // println!("THE BYTES: {:?}", store.read_all_bytes().await?); Ok(store) } @@ -60,6 +60,22 @@ async fn read_entry(cursor: &mut ReadCursor, file_position: FilePosition) todo!() } +async fn append_bunch_of_entries(store: &mut Store) -> Result<()> { + let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; + let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); + append_entry(&mut cursor, &entry0).await?; + + let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); + append_entry(&mut cursor, &entry1).await?; + + // println!("{:?}", store.read_all_bytes().await?); + let entry2: Entry = Entry::new(vec![99, 98, 97, 96, 95]); + append_entry(&mut cursor, &entry2).await?; + + let entry3: Entry = Entry::new(vec![50,50,50,50,50]); + append_entry(&mut cursor, &entry3).await?; + Ok(()) +} #[tokio::main] async fn main() -> Result<()> { @@ -67,42 +83,46 @@ async fn main() -> Result<()> { let mut store: Store = create_or_connect().await?; + if store.header.total_count == 0 { + println!("INSERTING!"); + append_bunch_of_entries(&mut store).await?; + } + { let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; - let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 
5]); - append_entry(&mut cursor, &entry0).await?; - let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); - append_entry(&mut cursor, &entry1).await?; + let entry: Entry = Entry::new(vec![60, 50, 40, 30, 20]); + // let file_position = append_entry(&mut cursor, &entry).await?; + // let file_position = 215; + // cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", store.read_all_bytes().await?); - let entry2: Entry = Entry::new(vec![99, 98, 97, 96, 95]); - append_entry(&mut cursor, &entry2).await?; + // let entry_header = cursor.read_entry_header().await.map_err(|e| e.to_io_or_panic())?; + // println!("entry header = {:?}", entry_header); - let entry3: Entry = Entry::new(vec![50,50,50,50,50]); - append_entry(&mut cursor, &entry3).await?; + // println!("FILE POSITION == {}", file_position); + // cursor.mark_deleted_at(file_position).await.map_err(|e| e.to_io_or_panic())?; + // let entry_header = cursor.read_entry_header().await.map_err(|e| e.to_io_or_panic())?; + // println!("entry header after delete = {:?}", entry_header); } // println!("{:?}", store); // println!("{:?}", store.read_all_bytes().await?); - // let entry0: Entry = Entry::new(vec![99, 98, 97, 96, 95]); - // append_entry(&mut cursor, &entry0).await?; + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let entry1: Entry = Entry::new(vec![50,50,50,50,50]); - // let file_position = append_entry(&mut cursor, &entry1).await?; - // println!("CURRENT FILE_POSITION = {}", file_position); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); - // Now file_position point to entry1. 
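    // A minimal round trip through the delete path, sketched with only calls
    // that exist at this point (append_entry from this file; mark_deleted_at,
    // seek_to and read_entry_header from the cursor traits imported above):
    //
    //     let file_position = append_entry(&mut cursor, &entry1).await?;
    //     cursor.mark_deleted_at(file_position).await.map_err(|e| e.to_io_or_panic())?;
    //     cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?;
    //     let header = cursor.read_entry_header().await.map_err(|e| e.to_io_or_panic())?;
    //     assert!(header.is_deleted);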
- // cursor.mark_deleted_at(file_position).await.map_err(|e| e.to_io_or_panic())?; - // cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?; + cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; - // let entry2: StoreEntry = StoreEntry::new_deleted(vec![3, 2, 1]); - // let cursor2 = store.append_entry(&entry2).await.map_err(|e| e.to_io_or_panic())?; - // println!("cursor2 = {}", cursor2); - - println!("{:?}", store); - println!("{:?}", store.read_all_bytes().await?); println!("DONE"); From ff378b1dd83bd06a4168c97685e7d95c3f4b3033 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 22:47:29 +0100 Subject: [PATCH 14/43] Add brute-force search --- storage_engine/src/cursor.rs | 100 ++++++++++++++++++++++++----- storage_engine/src/entry.rs | 4 +- storage_engine/src/entry_header.rs | 14 +++- storage_engine/src/main.rs | 48 ++++++++++---- 4 files changed, 135 insertions(+), 31 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 763db68..42584b7 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -2,17 +2,18 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; use tokio::fs::{File, OpenOptions}; use std::path::Path; use std::marker::PhantomData; +use crate::error::{Error, DecodeErrorKind}; use async_trait::async_trait; use bincode; use bincode::{Decode, Encode}; -use crate::binary_coding::encode; +use crate::binary_coding::{encode, decode}; use crate::entry::{Entry, EntryDetailed}; use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; -use crate::storage_engine::{Store, FilePosition, Result, ROWS_FILE_NAME}; +use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME}; #[async_trait] // TODO: Make this private @@ -51,6 +52,12 @@ pub trait PrimitiveCursor { self.seek_to(StoreHeader::SIZE as u64).await } + // Seeks from current position by offset and returns new file position + async fn seek_by(&mut self, offset: i64) -> Result { + let file_position = self.file().seek(SeekFrom::Current(offset)).await?; + Ok(file_position) + } + async fn current_file_position(&mut self) -> Result { let next_file_position: FilePosition = self.file().stream_position().await?; Ok(next_file_position) @@ -59,7 +66,6 @@ pub trait PrimitiveCursor { async fn is_at_eof(&mut self) -> Result { let current_file_position = self.current_file_position().await?; let eof_file_position = self.eof_file_position(); - println!("IN EOF: current={}, eof_file_position={}", current_file_position, eof_file_position); Ok(current_file_position == eof_file_position) } } @@ -91,18 +97,31 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { } // ===Iteration=== - // Assumes that the current file position is at a valid entry or EOF. + // The following functions assume that the current file position is at a valid entry or EOF. + + + // WARNING: This moves the file_position to start of the data, so you can't just call + // next_entry_header() a bunch of times. You must move the cursor! + async fn next_entry_header(&mut self) -> Result> { + if self.is_at_eof().await? { + return Ok(None) + } + + let entry_header = self.read_entry_header().await?; + + Ok(Some(entry_header)) + } + + // This is meant to be used after next_entry_header() is called. 
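    // The intended calling pattern, as a sketch: a header-only scan that never
    // decodes row data (it assumes the cursor starts at the beginning of the
    // entries, e.g. right after seek_to_start_of_data()):
    //
    //     while let Some(entry_header) = cursor.next_entry_header().await? {
    //         // inspect entry_header here, e.g. entry_header.is_deleted ...
    //         cursor.jump_from_start_of_entry_data_to_next_entry(&entry_header).await?;
    //     }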
+    async fn jump_from_start_of_entry_data_to_next_entry(&mut self, entry_header: &EntryHeaderWithDataSize) -> Result<FilePosition> {
+        let file_position = self.seek_by(entry_header.size_of_data() as i64).await?;
+        Ok(file_position)
+    }
+
     async fn next(&mut self) -> Result<Option<EntryDetailed<T>>>
     where T: Decode
     {
-        println!("are we at eof?");
-        if self.is_at_eof().await? {
-            println!("YES");
-            return Ok(None)
-        }
-        println!("NO");
-
-        let entry_header = self.read_entry_header().await?;
+        let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) };
 
         let mut data_bytes: Vec<u8> = vec![0; entry_header.size_of_data()];
         self.read_bytes(&mut data_bytes).await?;
         let entry: EntryDetailed<T> =
             EntryDetailed::decode(entry_header, self.header().number_of_columns, &mut data_bytes)?;
 
         Ok(Some(entry))
     }
 
+    // Like next, but only reads the column, not the whole entry.
+    async fn next_at_column(&mut self, column: Column) -> Result<Option<(EntryHeaderWithDataSize, T)>>
+    where T: Decode + Send
+    {
+        let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) };
+        let file_position_at_start_of_data = self.current_file_position().await?;
+
+        // figuring out how much to decode
+        let column_offset = entry_header.offset_of_column(column);
+        self.seek_by(column_offset as i64).await?;
+
+        // reading and decoding
+        let mut bytes: Vec<u8> = vec![0; entry_header.data_sizes[column as usize]];
+        self.read_bytes(&mut bytes).await?;
+        let (value, _) =
+            decode::<T>(&bytes[..])
+            .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?;
+
+        // jumping to next entry
+        self.seek_to(file_position_at_start_of_data).await?;
+        self.jump_from_start_of_entry_data_to_next_entry(&entry_header).await?;
+
+        Ok(Some((entry_header, value)))
+    }
+
+    async fn next_alive(&mut self) -> Result<Option<EntryDetailed<T>>>
+    where T: Decode
+    {
+        while let Some(entry) = self.next().await? {
+            if !entry.header.is_deleted {
+                return Ok(Some(entry))
+            }
+        }
+        Ok(None)
+    }
+
+    // ===Search===
+    async fn find_first_eq_bruteforce(&mut self, column: Column, t0: &T) -> Result<Option<EntryDetailed<T>>>
+    where T: Decode + PartialEq + Send + Sync
+    {
+        let mut file_position = self.current_file_position().await?;
+        while let Some((_, t)) = self.next_at_column(column).await? 
{ + if &t == t0 { + // go back and decode the whole entry + self.seek_to(file_position).await?; + return self.next().await + } else { + file_position = self.current_file_position().await?; + } + } + Ok(None) + } + // ===Debugging=== async fn read_entries(&mut self) -> Result<()> where T: Decode + std::fmt::Debug @@ -301,14 +373,10 @@ impl <'cursor, T> WriteCursor<'cursor, T> { where T: Send { self.seek_to(file_position).await?; - println!("Attempting to read the entry"); let mut entry_header = self.read_entry_header().await?; - println!("Entry Header == {:?}", entry_header); if entry_header.is_deleted { - println!("Already deleted"); Ok(()) } else { - println!("Marking as deleted"); self.increment_deleted_count().await?; self.seek_to(file_position).await?; diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs index 9d9de38..90d3dca 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/entry.rs @@ -13,8 +13,8 @@ pub struct Entry { #[derive(Debug)] pub struct EntryDetailed { - header: EntryHeaderWithDataSize, - data: Vec, + pub header: EntryHeaderWithDataSize, + pub data: Vec, } impl Entry { diff --git a/storage_engine/src/entry_header.rs b/storage_engine/src/entry_header.rs index 4aa904c..7c8d626 100644 --- a/storage_engine/src/entry_header.rs +++ b/storage_engine/src/entry_header.rs @@ -1,5 +1,5 @@ use crate::binary_coding::{decode, encode, decode_sequence}; -use crate::storage_engine::Result; +use crate::storage_engine::{Result, Column}; use crate::error::{Error, DecodeErrorKind}; use std::mem::size_of; @@ -42,6 +42,18 @@ impl EntryHeaderWithDataSize { self.data_sizes.iter().sum() } + pub fn offset_of_column(&self, column: Column) -> usize { + let mut sum = 0; + for (i, size) in self.data_sizes.iter().enumerate() { + if i < column as usize { + sum += size; + } else { + break + } + } + sum + } + pub fn decode(bytes: &mut [u8], number_of_columns: usize) -> Result { let (is_deleted, _) = decode::(&bytes) diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index d530bd1..cf32a88 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -108,20 +108,44 @@ async fn main() -> Result<()> { // println!("{:?}", store); // println!("{:?}", store.read_all_bytes().await?); - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + { + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + } - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); + { + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let column = 2; + let x = 
cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + } + + { + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let column = 3; + let t0 = 6; + let x = cursor.find_first_eq_bruteforce(column, &t0).await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + } - cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; From 0f98903759bedd148c1d823e2beb4aef9e16f7cf Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 22:54:55 +0100 Subject: [PATCH 15/43] Add file_position to EntryDetailed --- storage_engine/src/cursor.rs | 3 ++- storage_engine/src/entry.rs | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 42584b7..b0ea017 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -121,12 +121,13 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { async fn next(&mut self) -> Result>> where T: Decode { + let file_position = self.current_file_position().await?; let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) }; let mut data_bytes: Vec = vec![0; entry_header.size_of_data()]; self.read_bytes(&mut data_bytes).await?; let entry: EntryDetailed = - EntryDetailed::decode(entry_header, self.header().number_of_columns, &mut data_bytes)?; + EntryDetailed::decode(entry_header, file_position, self.header().number_of_columns, &mut data_bytes)?; Ok(Some(entry)) } diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs index 90d3dca..c628979 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/entry.rs @@ -1,7 +1,7 @@ use bincode::{Decode, Encode}; use crate::binary_coding::{encode_sequence, encode_sequence_with_sizes, decode_sequence}; -use crate::storage_engine::Result; +use crate::storage_engine::{Result, FilePosition}; use crate::error::{Error, DecodeErrorKind}; use crate::entry_header::{EntryHeader, EntryHeaderWithDataSize}; @@ -14,6 +14,7 @@ pub struct Entry { #[derive(Debug)] pub struct EntryDetailed { pub header: EntryHeaderWithDataSize, + pub file_position: FilePosition, pub data: Vec, } @@ -40,11 +41,11 @@ impl Entry { } impl EntryDetailed { - pub fn decode(header: EntryHeaderWithDataSize, number_of_columns: usize, bytes: &[u8]) -> Result + pub fn decode(header: EntryHeaderWithDataSize, file_position: FilePosition, number_of_columns: usize, bytes: &[u8]) -> Result where T: Decode { let data = decode_sequence::(number_of_columns, bytes) .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; - Ok(EntryDetailed { header, data }) + Ok(EntryDetailed { header, file_position, data }) } } From daa39850f0c54b553c82bcff4439ff25ed7d807c Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sat, 3 Feb 2024 23:45:55 +0100 Subject: [PATCH 16/43] Prepare for garbage collection --- storage_engine/src/cursor.rs | 86 ++++++++++++++++++++++++++-- storage_engine/src/storage_engine.rs | 52 +++++++++-------- storage_engine/src/store_header.rs | 5 +- 3 files 
changed, 115 insertions(+), 28 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index b0ea017..8afe89f 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -13,7 +13,7 @@ use crate::binary_coding::{encode, decode}; use crate::entry::{Entry, EntryDetailed}; use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; -use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME}; +use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; #[async_trait] // TODO: Make this private @@ -261,7 +261,7 @@ impl ReadCursor { pub async fn new(store: &Store) -> Result where T: Send { - let path_to_rows = Path::new(&store.table_folder).join(ROWS_FILE_NAME); + let path_to_rows = Path::new(&store.header.table_folder).join(ROWS_FILE_NAME); let file: File = OpenOptions::new() .read(true) @@ -293,7 +293,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> { pub async fn new<'store: 'cursor>(store: &'store mut Store) -> Result where T: Send { - let path_to_rows = Path::new(&store.table_folder).join(ROWS_FILE_NAME); + let path_to_rows = Path::new(&store.header.table_folder).join(ROWS_FILE_NAME); let file: File = OpenOptions::new() .read(true) @@ -315,7 +315,33 @@ impl <'cursor, T> WriteCursor<'cursor, T> { Ok(cursor) } - + + pub async fn connect<'header: 'cursor>(path_to_rows: &str, header: &'header mut StoreHeader) -> Result + where T: Send + { + let file: File = + OpenOptions::new() + .read(true) + .write(true) + .open(path_to_rows) + .await?; + + let mut cursor = Self { + header, + file, + data_type: PhantomData::, + + eof_file_position: 0, + }; + let eof_file_position: FilePosition = cursor.seek_to_end().await?; + cursor.eof_file_position = eof_file_position; + + cursor.seek_to_start_of_data().await?; + + Ok(cursor) + } + + // ===Primitive Operations=== async fn write_bytes(&mut self, bytes: &[u8]) -> Result { Ok(self.file.write(bytes).await?) @@ -389,6 +415,18 @@ impl <'cursor, T> WriteCursor<'cursor, T> { } } + async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result>> + where T: Decode + PartialEq + Send + Sync + { + let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?; + if let Some(entry) = maybe_entry { + self.mark_deleted_at(entry.file_position).await?; + Ok(Some(entry)) + } else { + Ok(maybe_entry) + } + } + async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> { // TODO: What should be the policy? Counting size of garbage? Counting how many entries are // garbage? @@ -398,6 +436,46 @@ impl <'cursor, T> WriteCursor<'cursor, T> { Ok(()) } } + + async fn initiate_garbage_collection(&mut self) -> Result + where T: Send + { + let table_folder = self.header.table_folder.to_string(); + let path_to_table = Path::new(&table_folder); + let path_to_rows = path_to_table.join(GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME); + + let intermediate_file: File = Store::::create_empty_rows_file(path_to_rows, &self.header).await?; + + let mut intermediate_header: StoreHeader = StoreHeader { + table_folder, + number_of_columns: self.header.number_of_columns, + deleted_count: 0, + total_count: 0, + primary_column: self.header.primary_column + }; + + // Creates a new cursor to the intermediate file in which we'll dump the live entries. 
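    // The compaction scheme this function is building towards, as a numbered
    // sketch; steps 1, 2 and 4 restate the plan in the comments here, while
    // the rename-based swap in step 3 is an assumption (e.g. tokio::fs::rename
    // within the same directory):
    //
    //     1. create `rows_intermediate` with a fresh StoreHeader (both counts 0);
    //     2. scan the current rows file and re-append every entry for which
    //        is_deleted == false (next_alive()), incrementing total_count as we go;
    //     3. swap the files: rename `rows_intermediate` over `rows`;
    //     4. adopt intermediate_header as the store's header and drop the garbage.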
+ // let mut cursor_to_intermediate = Self { + // header: &mut intermediate_header, + // file: intermediate_file, + // data_type: PhantomData::, + + // eof_file_position: 0, + // }; + let mut cursor_to_intermediate: Self = todo!(); + let eof_file_position: FilePosition = cursor_to_intermediate.seek_to_end().await?; + cursor_to_intermediate.eof_file_position = eof_file_position; + + + + // TODO: intermediate_header does not live long enough, so after garbage collection is + // done, we need to use it in the swap. + cursor_to_intermediate.header = todo!(); + + // In it there will be only the alive rows. + // Afterwards we swap the files, and delete the garbage. + todo!() + } } diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 4284f74..4c06c4d 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -1,7 +1,7 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::fs::{File, OpenOptions, DirBuilder}; use tokio::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::marker::PhantomData; use async_trait::async_trait; @@ -28,7 +28,6 @@ pub struct Store { // {write: 0, read: n + 1} ~> {write:0, read: n} // destroy read // {write: 0, read: 0} ~> {write: 1, read: 0} // create write // {write: 1, read: 0} ~> {write: 0, read: 0} // destroy write - pub table_folder: String, // primary_index: Vec>>, // indexes: Vec>>>, // primary_index: Index, @@ -63,6 +62,7 @@ impl SomethingSupportingLeq for Store } pub const ROWS_FILE_NAME: &'static str = "rows"; +pub const GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME: &'static str = "rows_intermediate"; impl Store { // ===Creation=== @@ -72,6 +72,31 @@ impl Store { DirBuilder::new() .create(path_to_table).await?; + let header = StoreHeader { + table_folder: table_folder.to_string(), + number_of_columns, + deleted_count: 0, + total_count: 0, + primary_column, + }; + + // We don't need the file right now. Only cursors will later open it. + Self::create_empty_rows_file(path_to_rows, &header).await?; + + // TODO: indexes + // let index: Index = Index::new( + // &format!("rows_{}", primary_column.to_string()), + // ).await?; + + let store = Self { + header, + data_type: PhantomData::, + }; + + Ok(store) + } + + pub async fn create_empty_rows_file(path_to_rows: PathBuf, header: &StoreHeader) -> Result { let mut file: File = OpenOptions::new() .write(true) @@ -80,28 +105,10 @@ impl Store { .open(path_to_rows) .await?; - let header = StoreHeader { - number_of_columns, - deleted_count: 0, - total_count: 0, - primary_column, - }; let encoded_header: Vec = header.encode()?; file.write(&encoded_header).await?; - - // TODO: indexes - // let index: Index = Index::new( - // &format!("rows_{}", primary_column.to_string()), - // ).await?; - - let store = Self { - table_folder: table_folder.to_string(), - header, - data_type: PhantomData::, - }; - - Ok(store) + Ok(file) } pub async fn connect(table_folder: &str) -> Result @@ -121,10 +128,9 @@ impl Store { // header. 
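 // Note: `table_folder` is carried in StoreHeader only in memory (it is never
 // written to the rows file -- see store_header.rs), which is why decode now
 // takes it as an extra argument.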
let mut header_bytes = StoreHeader::decode_buffer(); file.read_exact(&mut header_bytes).await?; - let header = StoreHeader::decode(&mut header_bytes).await?; + let header = StoreHeader::decode(table_folder, &mut header_bytes).await?; let store = Self { - table_folder: table_folder.to_string(), header, data_type: PhantomData::, }; diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/store_header.rs index fa84478..73cbb55 100644 --- a/storage_engine/src/store_header.rs +++ b/storage_engine/src/store_header.rs @@ -5,6 +5,8 @@ use std::mem::size_of; #[derive(Debug, Clone)] pub struct StoreHeader { + pub table_folder: String, // This one is not encoded into the file + pub number_of_columns: usize, pub deleted_count: usize, pub total_count: usize, @@ -35,7 +37,7 @@ impl StoreHeader { [0; StoreHeader::SIZE] } - pub async fn decode(result: &mut [u8]) -> Result { + pub async fn decode(table_folder: &str, result: &mut [u8]) -> Result { let (number_of_columns, _) = decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; @@ -49,6 +51,7 @@ impl StoreHeader { decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; let header = StoreHeader { + table_folder: table_folder.to_string(), number_of_columns, deleted_count, total_count, From 017f34bafa2049bb53cd5e0809a385ea9fecace2 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 14:07:29 +0100 Subject: [PATCH 17/43] Remove redundant comment --- storage_engine/src/cursor.rs | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 8afe89f..c9913b9 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -477,23 +477,3 @@ impl <'cursor, T> WriteCursor<'cursor, T> { todo!() } } - - - -// TODO - // pub async fn search_for(&mut self, index: T) -> Result<()> - // where T: Send - // { - // // let index = self.primary_index.borrow_mut(); - // // let x = index.lookup(self, 123).await?; - // todo!() - // } - // pub async fn search_for_entry_with_id(&mut self, id: T) -> Result>> { - // // TODO: make call to the primary index - // todo!() - // } - - // // TODO: This needs to be some sort of an iterator - // pub async fn get_all_eq(&self, column: Column, value: T) -> Result>> { - // todo!() - // } From a4a29e632b31abb5f821a961349ff7d40719aba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20Moravec?= Date: Sun, 4 Feb 2024 15:20:26 +0100 Subject: [PATCH 18/43] feat: in-memory index --- storage_engine/src/error.rs | 1 + storage_engine/src/index.rs | 153 +++++++++++++++++++-------- storage_engine/src/storage_engine.rs | 9 -- 3 files changed, 108 insertions(+), 55 deletions(-) diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index 10a74ae..2c7817c 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -15,6 +15,7 @@ pub enum DecodeErrorKind { EntryData, EntryIsDeleted, EntryHeaderWithDataSizes, + CorruptedData, } // ===Errors=== diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index c28c91e..628b136 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -1,18 +1,20 @@ use std::marker::PhantomData; -use tokio::io::{AsyncReadExt, AsyncWriteExt, 
AsyncSeekExt, SeekFrom}; -use tokio::fs::{File, OpenOptions, DirBuilder}; use std::path::Path; +use tokio::fs::{DirBuilder, File, OpenOptions}; +use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, SeekFrom}; -use std::collections::{BTreeMap}; use async_trait::async_trait; +use std::collections::{BTreeMap, HashSet}; +use std::hash::Hash; +use crate::binary_coding::{decode, decode_sequence, encode, encode_sequence}; use bincode; use bincode::{Decode, Encode}; -use crate::binary_coding::{encode, decode, encode_sequence, decode_sequence}; use tokio::fs; -use crate::error::Error; +use crate::error::{DecodeErrorKind, Error}; +use bincode::error::DecodeError; use std::mem::size_of; type Result = std::result::Result; @@ -23,67 +25,126 @@ type Result = std::result::Result; #[derive(Debug)] pub struct Index { file: File, - // None means index is asleep on disk. - in_memory: Option>, - header: IndexHeader, + data: BTreeMap>, key_type: PhantomData, value_type: PhantomData, } #[derive(Debug)] -pub struct IndexHeader { -} +pub struct IndexHeader {} -use crate::storage_engine::FilePosition; +impl Index +where + K: Encode + Decode + Ord, + V: Encode + Decode + Clone + Eq + Hash, +{ + pub async fn new(file_name: &str) -> Result> { + let file: File = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(file_name) + .await?; -#[async_trait] -pub trait SomethingSupportingLeq { - async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> std::result::Result; -} + let data = BTreeMap::new(); -impl Index { - // TODO: delete - // pub async fn new(file_name: &str, less_than_eq: &F) -> Result> - // where F: Fn(&mut Store, K, K) -> Fut, - // Store: SomethingSupportingLeq, - // Fut: Future>, - // { - // todo!() - // } - pub async fn new(file_name: &str) -> Result> - { - todo!() + Ok(Index { + file, + data, + key_type: PhantomData::, + value_type: PhantomData::, + }) } pub async fn connect(file_name: &str) -> Result> { - todo!() + let mut file: File = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(file_name) + .await?; + + let mut bytes = vec![]; + file.read_to_end(&mut bytes).await?; + + let data = Index::decode_tree(&bytes) + .map_err(|e| Error::DecodeError(DecodeErrorKind::CorruptedData, e))?; + + Ok(Index { + file, + data, + key_type: PhantomData::, + value_type: PhantomData::, + }) } - // Saves the in-memory index to disk and deallocates. 
- pub async fn sleep() -> Result> { - todo!() + pub async fn insert(&mut self, k: K, v: V) -> Result<()> { + self.data.entry(k).or_insert_with(HashSet::new).insert(v); + Ok(()) } - // Loads the index into memory - pub async fn wake() -> Result> { - todo!() + pub async fn lookup(&self, k: K) -> Result>> { + let hashset = self.data.get(&k).unwrap(); + Ok(Some(hashset.clone())) } - pub async fn insert() -> Result<()> - where K: Encode, V: Encode - { - todo!() + pub async fn delete(&mut self, k: K, v: V) -> Result> { + Ok(Some( + self.data.entry(k).or_insert_with(HashSet::new).remove(&v), + )) } - pub async fn lookup(&mut self, store: &mut Store, k: K) -> Result> - where K: Encode + Decode, - Store: SomethingSupportingLeq, - { - let x = store.less_than_eq(123, 123).await?; - todo!() + fn encode(&self) -> Result> { + let mut encoded = Vec::new(); + encoded.extend(encode(&self.data)?); + Ok(encoded) } - pub async fn delete(&mut self, k: K) -> Result> { - todo!() + fn decode_tree(data: &[u8]) -> std::result::Result>, DecodeError> { + let data: BTreeMap> = decode(data)?.0; + Ok(data) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn encode_decode() { + let mut index: Index = Index { + file: File::from_std(std::fs::File::create("test").unwrap()), + data: BTreeMap::new(), + key_type: PhantomData::, + value_type: PhantomData::, + }; + + index.insert("foo".to_string(), 123).await.unwrap(); + index.insert("foo".to_string(), 124).await.unwrap(); + index.insert("bar".to_string(), 125).await.unwrap(); + index.insert("bar".to_string(), 126).await.unwrap(); + + let lookup = index.lookup("foo".to_string()).await.unwrap().unwrap(); + assert_eq!(lookup.len(), 2); + assert!(lookup.contains(&123)); + assert!(lookup.contains(&124)); + println!("lookup {:?}", lookup); + + let encoded = index.encode().unwrap(); + let decoded = Index::::decode_tree(&encoded).unwrap(); + let decoded = Index { + file: File::from_std(std::fs::File::create("test").unwrap()), + data: decoded, + key_type: PhantomData::, + value_type: PhantomData::, + }; + + let lookup = decoded.lookup("foo".to_string()).await.unwrap().unwrap(); + assert_eq!(lookup.len(), 2); + assert!(lookup.contains(&123)); + assert!(lookup.contains(&124)); + println!("lookup {:?}", lookup); + + println!("{encoded:?}") } } diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 4284f74..e27c8a4 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -53,15 +53,6 @@ pub async fn less_than_eq(store: &mut Store, file_position0: FilePosition, todo!() } -#[async_trait] -impl SomethingSupportingLeq for Store - where T: Send -{ - async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> std::result::Result { - Ok(true) - } -} - pub const ROWS_FILE_NAME: &'static str = "rows"; impl Store { From 827c25cd698f2989964f416f7af30da66c5ea3e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20Moravec?= Date: Sun, 4 Feb 2024 15:21:37 +0100 Subject: [PATCH 19/43] tests: remove temp file --- storage_engine/src/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index 628b136..f0a1391 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -145,6 +145,6 @@ mod tests { assert!(lookup.contains(&124)); println!("lookup {:?}", lookup); - println!("{encoded:?}") + std::fs::remove_file("test").unwrap(); } } From 
dac888dc51254a6b1115ea63dfc2cd03389fb03b Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 15:46:43 +0100 Subject: [PATCH 20/43] Move concrete Cursor definitions on top --- storage_engine/src/cursor.rs | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index c9913b9..05f14db 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -15,6 +15,25 @@ use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; +// ===Concrete Cursors=== +pub struct ReadCursor { + header: StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + +pub struct WriteCursor<'a, T> { + header: &'a mut StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + + +// ===Traits=== #[async_trait] // TODO: Make this private pub trait PrimitiveCursor { @@ -206,23 +225,6 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { } -// ===Concrete Cursors=== -pub struct ReadCursor { - header: StoreHeader, - file: File, - data_type: PhantomData, - - eof_file_position: FilePosition, -} - -pub struct WriteCursor<'a, T> { - header: &'a mut StoreHeader, - file: File, - data_type: PhantomData, - - eof_file_position: FilePosition, -} - // ===PrimitiveCursor=== impl PrimitiveCursor for ReadCursor { fn file(&mut self) -> &mut File { From 89305b6126b3b61faaa960bcf0ed080855a748d7 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 16:57:19 +0100 Subject: [PATCH 21/43] Sketch out indexes in Store --- storage_engine/src/cursor.rs | 70 +++++++++++++++++++++++----- storage_engine/src/index.rs | 4 +- storage_engine/src/main.rs | 2 +- storage_engine/src/storage_engine.rs | 38 ++++++++------- 4 files changed, 82 insertions(+), 32 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 05f14db..a0c1671 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -14,10 +14,14 @@ use crate::entry::{Entry, EntryDetailed}; use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; +use crate::index::Index; + + // ===Concrete Cursors=== -pub struct ReadCursor { +pub struct ReadCursor<'a, T> { header: StoreHeader, + indexes: Vec>>, file: File, data_type: PhantomData, @@ -26,6 +30,7 @@ pub struct ReadCursor { pub struct WriteCursor<'a, T> { header: &'a mut StoreHeader, + indexes: Vec>>, file: File, data_type: PhantomData, @@ -224,9 +229,20 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { } } +#[async_trait] +pub trait CursorWithAccessToIndex: CursorWithStoreHeader { + fn indexes(&mut self) -> &[Option<&Index>]; + + async fn find_in_index(&mut self, k: &T) -> Result> + where T: Encode + Decode + Ord + Send + Sync + { + // let x = self.primary_index().lookup(k).await?; + todo!() + } +} // ===PrimitiveCursor=== -impl PrimitiveCursor for ReadCursor { +impl PrimitiveCursor for ReadCursor<'_, T> { fn file(&mut self) -> &mut File { &mut self.file } @@ -247,7 +263,7 @@ impl PrimitiveCursor for WriteCursor<'_, T> { } // 
===CursorWithStoreHeader=== -impl CursorWithStoreHeader for ReadCursor { +impl CursorWithStoreHeader for ReadCursor<'_, T> { fn header(&self) -> &StoreHeader { &self.header } @@ -259,9 +275,23 @@ impl CursorWithStoreHeader for WriteCursor<'_, T> { } } +// ===CursorWithAccessToIndex=== +impl CursorWithAccessToIndex for ReadCursor<'_, T> { + fn indexes(&mut self) -> &[Option<&Index>] { + &self.indexes + } +} + +impl CursorWithAccessToIndex for WriteCursor<'_, T> { + fn indexes(&mut self) -> &[Option<&Index>] { + &self.indexes + } +} + + +impl <'cursor, T> ReadCursor<'cursor, T> { + pub async fn new<'store: 'cursor>(store: &'store Store) -> Result + where T: Send + Sync { let path_to_rows = Path::new(&store.header.table_folder).join(ROWS_FILE_NAME); let file: File = @@ -274,6 +304,7 @@ impl ReadCursor { header: store.header.clone(), file, data_type: store.data_type, + indexes: todo!(), eof_file_position: 0, }; @@ -284,13 +315,12 @@ impl ReadCursor { Ok(cursor) } - - pub async fn less_than_eq(&mut self, file_position0: FilePosition, file_position1: FilePosition) -> Result { - todo!() - } } -impl <'cursor, T> WriteCursor<'cursor, T> { +impl <'cursor, T> WriteCursor<'cursor, T> +// TODO: Consider adding this manually to where it is really needed + where T: Sync +{ // 'store lives at least as long as 'cursor pub async fn new<'store: 'cursor>(store: &'store mut Store) -> Result where T: Send @@ -307,6 +337,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> { header: &mut store.header, file, data_type: 
todo!() } + + // ===Indexing=== + async fn insert_to_index(&mut self, t: T, file_position: FilePosition) -> Result> + where T: Encode + Decode + Ord + Send + Sync + { + // let x = self.primary_index.insert(t, file_position).await?; + todo!() + } + + async fn delete_from_index(&mut self, t: T, file_position: FilePosition) -> Result> + where T: Encode + Decode + Ord + Send + Sync + { + // let x = self.primary_index.delete(t, file_position).await?; + todo!() + } } + diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index f0a1391..97da926 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -83,8 +83,8 @@ where Ok(()) } - pub async fn lookup(&self, k: K) -> Result>> { - let hashset = self.data.get(&k).unwrap(); + pub async fn lookup(&self, k: &K) -> Result>> { + let hashset = self.data.get(k).unwrap(); Ok(Some(hashset.clone())) } diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index cf32a88..d9b2562 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -52,7 +52,7 @@ async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: &Entry) - Ok(file_position) } -async fn read_entry(cursor: &mut ReadCursor, file_position: FilePosition) -> Result>> { +async fn read_entry(cursor: &mut ReadCursor<'_, Data>, file_position: FilePosition) -> Result>> { println!("READING ENTRY at file_position={}", file_position); // let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; // println!("ENTRY: {:?}", entry); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 8fdebb0..1ff53ad 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -3,15 +3,12 @@ use tokio::fs::{File, OpenOptions, DirBuilder}; use tokio::fs; use std::path::{Path, PathBuf}; use std::marker::PhantomData; -use async_trait::async_trait; +use bincode::{Decode, Encode}; -use crate::index::SomethingSupportingLeq; use crate::error::Error; use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader}; use crate::store_header::StoreHeader; - -// TODO -// use crate::index::Index; +use crate::index::Index; pub type Result = std::result::Result; @@ -37,6 +34,7 @@ pub struct Store { // All pub header: StoreHeader, pub data_type: PhantomData, + pub primary_index: Index, } pub type PositionOfValue = FilePosition; @@ -48,16 +46,14 @@ pub async fn store_exists(table_folder: &str) -> Result { Ok(fs::metadata(table_folder).await.is_ok()) } -pub async fn less_than_eq(store: &mut Store, file_position0: FilePosition, file_position1: FilePosition) -> Result { - todo!() -} - pub const ROWS_FILE_NAME: &'static str = "rows"; pub const GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME: &'static str = "rows_intermediate"; impl Store { // ===Creation=== - pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result { + pub async fn new(table_folder: &str, number_of_columns: usize, primary_column: Column) -> Result + where T: Encode + Decode + Ord + { let path_to_table = Path::new(table_folder); let path_to_rows = path_to_table.join(ROWS_FILE_NAME); DirBuilder::new() @@ -74,14 +70,14 @@ impl Store { // We don't need the file right now. Only cursors will later open it. 
Self::create_empty_rows_file(path_to_rows, &header).await?; - // TODO: indexes - // let index: Index = Index::new( - // &format!("rows_{}", primary_column.to_string()), - // ).await?; + let primary_index: Index = Index::new( + &format!("rows_{}", primary_column.to_string()), + ).await?; let store = Self { header, data_type: PhantomData::, + primary_index, }; Ok(store) @@ -103,7 +99,7 @@ impl Store { } pub async fn connect(table_folder: &str) -> Result - where T: std::fmt::Debug + where T: std::fmt::Debug + Encode + Decode + Ord { let path_to_table = Path::new(table_folder); let path_to_rows = path_to_table.join(ROWS_FILE_NAME); @@ -121,29 +117,35 @@ impl Store { file.read_exact(&mut header_bytes).await?; let header = StoreHeader::decode(table_folder, &mut header_bytes).await?; + + let primary_index: Index = Index::connect( + &format!("rows_{}", header.primary_column.to_string()), + ).await?; + let store = Self { header, data_type: PhantomData::, + primary_index }; Ok(store) } // ===Cursors=== pub async fn read_cursor(&self) -> Result> - where T: Send + where T: Send + Sync { ReadCursor::new(self).await } pub async fn write_cursor(&mut self) -> Result> - where T: Send + where T: Send + Sync { WriteCursor::new(self).await } // For debugging. pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> - where T: Send + where T: Send + Sync { let mut cursor = self.read_cursor().await.map_err(|e| e.to_io_or_panic())?; let bytes = cursor.read_all_bytes().await?; From 4c0f91ad334f8676a7140aacf93966426e5b2848 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 18:13:05 +0100 Subject: [PATCH 22/43] Fix indexes types --- storage_engine/src/cursor.rs | 14 ++++++------ storage_engine/src/storage_engine.rs | 34 +++++++++++++++++++--------- storage_engine/src/store_header.rs | 6 +++++ 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index a0c1671..8fefaa6 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -21,7 +21,7 @@ use crate::index::Index; // ===Concrete Cursors=== pub struct ReadCursor<'a, T> { header: StoreHeader, - indexes: Vec>>, + indexes: &'a [Option>], file: File, data_type: PhantomData, @@ -30,7 +30,7 @@ pub struct ReadCursor<'a, T> { pub struct WriteCursor<'a, T> { header: &'a mut StoreHeader, - indexes: Vec>>, + indexes: &'a mut [Option>], file: File, data_type: PhantomData, @@ -231,7 +231,7 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { #[async_trait] pub trait CursorWithAccessToIndex: CursorWithStoreHeader { - fn indexes(&mut self) -> &[Option<&Index>]; + fn indexes(&mut self) -> &[Option>]; async fn find_in_index(&mut self, k: &T) -> Result> where T: Encode + Decode + Ord + Send + Sync @@ -277,13 +277,13 @@ impl CursorWithStoreHeader for WriteCursor<'_, T> { // ===CursorWithAccessToIndex=== impl CursorWithAccessToIndex for ReadCursor<'_, T> { - fn indexes(&mut self) -> &[Option<&Index>] { + fn indexes(&mut self) -> &[Option>] { &self.indexes } } impl CursorWithAccessToIndex for WriteCursor<'_, T> { - fn indexes(&mut self) -> &[Option<&Index>] { + fn indexes(&mut self) -> &[Option>] { &self.indexes } } @@ -304,7 +304,7 @@ impl <'cursor, T> ReadCursor<'cursor, T> { header: store.header.clone(), file, data_type: store.data_type, - indexes: todo!(), + indexes: &store.indexes, eof_file_position: 0, }; @@ -337,7 +337,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> header: &mut store.header, file, data_type: 
store.data_type, - indexes: todo!(), + indexes: &mut store.indexes, eof_file_position: 0, }; diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 1ff53ad..a753a3d 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -34,7 +34,7 @@ pub struct Store { // All pub header: StoreHeader, pub data_type: PhantomData, - pub primary_index: Index, + pub indexes: Vec>>, } pub type PositionOfValue = FilePosition; @@ -70,14 +70,18 @@ impl Store { // We don't need the file right now. Only cursors will later open it. Self::create_empty_rows_file(path_to_rows, &header).await?; - let primary_index: Index = Index::new( - &format!("rows_{}", primary_column.to_string()), - ).await?; + // TODO: I need to construct indexes + // let primary_index: Index = Index::new( + // &format!("rows_{}", primary_column.to_string()), + // ).await?; + + // TODO + let indexes = vec![]; let store = Self { header, data_type: PhantomData::, - primary_index, + indexes, }; Ok(store) @@ -118,31 +122,39 @@ impl Store { let header = StoreHeader::decode(table_folder, &mut header_bytes).await?; - let primary_index: Index = Index::connect( - &format!("rows_{}", header.primary_column.to_string()), - ).await?; + // let primary_index: Index = Index::connect( + // &format!("rows_{}", header.primary_column.to_string()), + // ).await?; + + // TODO + let indexes = vec![]; let store = Self { header, data_type: PhantomData::, - primary_index + indexes, }; Ok(store) } // ===Cursors=== - pub async fn read_cursor(&self) -> Result> + pub async fn read_cursor(&self) -> Result> where T: Send + Sync { ReadCursor::new(self).await } - pub async fn write_cursor(&mut self) -> Result> + pub async fn write_cursor(&mut self) -> Result> where T: Send + Sync { WriteCursor::new(self).await } + pub async fn make_indexable(&mut self, column: Column) -> Result<()> { + // Creates an index from scratch at above column + todo!() + } + // For debugging. 
pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> where T: Send + Sync diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/store_header.rs index 73cbb55..fc571a4 100644 --- a/storage_engine/src/store_header.rs +++ b/storage_engine/src/store_header.rs @@ -11,6 +11,8 @@ pub struct StoreHeader { pub deleted_count: usize, pub total_count: usize, pub primary_column: Column, + // TODO + // pub indexed_columns: Vec, } impl StoreHeader { @@ -25,6 +27,10 @@ impl StoreHeader { pub const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; pub const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; + fn indexed_columns_size(&self) -> usize { + size_of::() * self.number_of_columns + } + pub fn encode(&self) -> Result> { let mut result = encode(&self.number_of_columns)?; result.append(&mut encode(&self.deleted_count)?); From f2c17d2e661837b3556824c9f3e42c1b397bf1af Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 19:00:50 +0100 Subject: [PATCH 23/43] Add which columns are indexable to store header --- storage_engine/src/cursor.rs | 15 ++++---- storage_engine/src/error.rs | 1 + storage_engine/src/main.rs | 43 +++++++++++---------- storage_engine/src/storage_engine.rs | 30 ++++++++++----- storage_engine/src/store_header.rs | 57 +++++++++++++++++++++++----- 5 files changed, 100 insertions(+), 46 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 8fefaa6..0c3a5df 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -72,10 +72,6 @@ pub trait PrimitiveCursor { Ok(file_position) } - async fn seek_to_start_of_data(&mut self) -> Result { - self.seek_to(StoreHeader::SIZE as u64).await - } - // Seeks from current position by offset and returns new file position async fn seek_by(&mut self, offset: i64) -> Result { let file_position = self.file().seek(SeekFrom::Current(offset)).await?; @@ -98,6 +94,10 @@ pub trait PrimitiveCursor { pub trait CursorWithStoreHeader: PrimitiveCursor { fn header(&self) -> &StoreHeader; + async fn seek_to_start_of_data(&mut self) -> Result { + self.seek_to(StoreHeader::size(self.header().number_of_columns) as u64).await + } + async fn read_entry_header(&mut self) -> Result { let number_of_columns: usize = self.header().number_of_columns; let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; @@ -349,7 +349,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> Ok(cursor) } - pub async fn connect<'header: 'cursor>(path_to_rows: &str, header: &'header mut StoreHeader) -> Result + pub async fn connect<'header: 'cursor, 'indexes: 'cursor>(path_to_rows: &str, header: &'header mut StoreHeader, indexes: &'indexes mut Vec>>) -> Result where T: Send { let file: File = @@ -363,7 +363,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> header, file, data_type: PhantomData::, - indexes: todo!(), + indexes, eof_file_position: 0, }; @@ -485,7 +485,8 @@ impl <'cursor, T> WriteCursor<'cursor, T> number_of_columns: self.header.number_of_columns, deleted_count: 0, total_count: 0, - primary_column: self.header.primary_column + primary_column: self.header.primary_column, + indexed_columns: todo!() }; // Creates a new cursor to the intermediate file in which we'll dump the live entries. 
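A sizing note for the variable-length header this patch introduces (the constants appear in the store_header.rs hunk below): with the usual 8-byte fix-int encoding of usize and Column, plus one size_of::<bool>() flag per column, a hypothetical 3-column table would satisfy

    // fixed part: number_of_columns, deleted_count, total_count, primary_column
    assert_eq!(StoreHeader::FIXED_SIZE, 4 * 8);
    // plus one bool flag per column
    assert_eq!(StoreHeader::size(3), StoreHeader::FIXED_SIZE + 3);

which is why seek_to_start_of_data now asks the header for its column count instead of using a fixed constant.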
diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index 2c7817c..951f167 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -12,6 +12,7 @@ pub enum DecodeErrorKind { StoreHeaderDeletedCount, StoreHeaderTotalCount, StoreHeaderPrimaryColumn, + StoreHeaderIndexedColumns, EntryData, EntryIsDeleted, EntryHeaderWithDataSizes, diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index d9b2562..5f4bb69 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -89,9 +89,9 @@ async fn main() -> Result<()> { } { - let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; + // let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; - let entry: Entry = Entry::new(vec![60, 50, 40, 30, 20]); + // let entry: Entry = Entry::new(vec![60, 50, 40, 30, 20]); // let file_position = append_entry(&mut cursor, &entry).await?; // let file_position = 215; // cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?; @@ -107,20 +107,23 @@ async fn main() -> Result<()> { // println!("{:?}", store); // println!("{:?}", store.read_all_bytes().await?); - { let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); + } + + { + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); + let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + println!("{:?}", x); } { @@ -138,13 +141,13 @@ async fn main() -> Result<()> { println!("{:?}", x); } - { - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let column = 3; - let t0 = 6; - let x = cursor.find_first_eq_bruteforce(column, &t0).await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - } + // { + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + // let column = 3; + // let t0 = 6; + // let x = cursor.find_first_eq_bruteforce(column, &t0).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // } diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index a753a3d..d696e38 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -59,12 +59,17 @@ impl Store { DirBuilder::new() .create(path_to_table).await?; - let header = StoreHeader { - table_folder: table_folder.to_string(), - number_of_columns, - deleted_count: 0, - total_count: 0, - primary_column, + let header = { + let mut indexed_columns = vec![false; number_of_columns]; + indexed_columns[primary_column as usize] = true; + StoreHeader { + table_folder: table_folder.to_string(), + number_of_columns, + deleted_count: 0, + 
total_count: 0, + primary_column, + indexed_columns, + } }; // We don't need the file right now. Only cursors will later open it. @@ -117,9 +122,16 @@ impl Store { // Unfortunately we can't yet use store.read_bytes, since it can't be created without the // header. - let mut header_bytes = StoreHeader::decode_buffer(); - file.read_exact(&mut header_bytes).await?; - let header = StoreHeader::decode(table_folder, &mut header_bytes).await?; + let header = { + let mut fixed_header_bytes = StoreHeader::buffer_for_fixed_decoding(); + file.read_exact(&mut fixed_header_bytes).await?; + let fixed_header = StoreHeader::decode_fixed(table_folder, &fixed_header_bytes).await?; + + // decode the indexes + let mut rest_bytes: Vec = StoreHeader::buffer_for_rest_decoding(&fixed_header); + file.read_exact(&mut rest_bytes).await?; + StoreHeader::decode_rest(fixed_header, &rest_bytes).await? + }; // let primary_index: Index = Index::connect( diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/store_header.rs index fc571a4..31b23f0 100644 --- a/storage_engine/src/store_header.rs +++ b/storage_engine/src/store_header.rs @@ -1,4 +1,4 @@ -use crate::binary_coding::{encode, decode}; +use crate::binary_coding::{encode, encode_sequence, decode, decode_sequence}; use crate::storage_engine::{Result, Column}; use crate::error::{Error, DecodeErrorKind}; use std::mem::size_of; @@ -11,8 +11,17 @@ pub struct StoreHeader { pub deleted_count: usize, pub total_count: usize, pub primary_column: Column, - // TODO - // pub indexed_columns: Vec, + pub indexed_columns: Vec, +} + +#[derive(Debug, Clone)] +pub struct StoreHeaderFixedPart { + pub table_folder: String, // This one is not encoded into the file + + pub number_of_columns: usize, + pub deleted_count: usize, + pub total_count: usize, + pub primary_column: Column, } impl StoreHeader { @@ -20,15 +29,20 @@ impl StoreHeader { pub const DELETED_COUNT_SIZE: usize = size_of::(); pub const TOTAL_COUNT_SIZE: usize = size_of::(); pub const PRIMARY_COLUMN_SIZE: usize = size_of::(); - pub const SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; + pub const FIXED_SIZE: usize = Self::NUMBER_OF_COLUMNS_SIZE + Self::DELETED_COUNT_SIZE + Self::TOTAL_COUNT_SIZE + Self::PRIMARY_COLUMN_SIZE; pub const NUMBER_OF_COLUMNS_OFFSET: usize = 0; pub const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; pub const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; pub const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; + pub const INDEXED_COLUMNS_OFFSET: usize = Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE; - fn indexed_columns_size(&self) -> usize { - size_of::() * self.number_of_columns + fn indexed_columns_size(number_of_columns: usize) -> usize { + size_of::() * number_of_columns + } + + pub fn size(number_of_columns: usize) -> usize { + Self::FIXED_SIZE + Self::indexed_columns_size(number_of_columns) } pub fn encode(&self) -> Result> { @@ -36,14 +50,19 @@ impl StoreHeader { result.append(&mut encode(&self.deleted_count)?); result.append(&mut encode(&self.total_count)?); result.append(&mut encode(&self.primary_column)?); + result.append(&mut encode_sequence(&self.indexed_columns)?); Ok(result) } - pub fn decode_buffer() -> [u8; StoreHeader::SIZE] { - [0; StoreHeader::SIZE] + pub fn buffer_for_fixed_decoding() -> [u8; Self::FIXED_SIZE] { + [0; Self::FIXED_SIZE] } - pub async fn 
decode(table_folder: &str, result: &mut [u8]) -> Result { + pub fn buffer_for_rest_decoding(header: &StoreHeaderFixedPart) -> Vec { + vec![0; Self::indexed_columns_size(header.number_of_columns)] + } + + pub async fn decode_fixed(table_folder: &str, result: &[u8]) -> Result { let (number_of_columns, _) = decode::(&result[Self::NUMBER_OF_COLUMNS_OFFSET..Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderNumberOfColumns, e))?; @@ -56,7 +75,7 @@ impl StoreHeader { let (primary_column, _) = decode::(&result[Self::PRIMARY_COLUMN_OFFSET..Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE]) .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderPrimaryColumn, e))?; - let header = StoreHeader { + let header = StoreHeaderFixedPart { table_folder: table_folder.to_string(), number_of_columns, deleted_count, @@ -67,6 +86,24 @@ impl StoreHeader { Ok(header) } + pub async fn decode_rest(header: StoreHeaderFixedPart, result: &[u8]) -> Result { + let indexed_columns: Vec = + decode_sequence::(header.number_of_columns, result) + .map_err(|e| Error::DecodeError(DecodeErrorKind::StoreHeaderIndexedColumns, e))?; + + Ok(StoreHeader { + table_folder: header.table_folder, + number_of_columns: header.number_of_columns, + deleted_count: header.deleted_count, + total_count: header.total_count, + primary_column: header.primary_column, + + indexed_columns, + }) + } + + + // returns new count pub fn increment_total_count(&mut self) -> usize { self.total_count += 1; From 8fd2d4ebf392d72a7e85a3a858c8c4562b87d627 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 19:45:38 +0100 Subject: [PATCH 24/43] Connect store to indexes --- storage_engine/src/cursor.rs | 4 +- storage_engine/src/index.rs | 12 ++-- storage_engine/src/storage_engine.rs | 84 ++++++++++++++++++---------- 3 files changed, 64 insertions(+), 36 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 0c3a5df..05f9d05 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -13,7 +13,7 @@ use crate::binary_coding::{encode, decode}; use crate::entry::{Entry, EntryDetailed}; use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; -use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; +use crate::storage_engine::{Store, FilePosition, Column, Result, StoreIndexes, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; use crate::index::Index; @@ -349,7 +349,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> Ok(cursor) } - pub async fn connect<'header: 'cursor, 'indexes: 'cursor>(path_to_rows: &str, header: &'header mut StoreHeader, indexes: &'indexes mut Vec>>) -> Result + pub async fn connect<'header: 'cursor, 'indexes: 'cursor>(path_to_rows: &str, header: &'header mut StoreHeader, indexes: &'indexes mut StoreIndexes) -> Result where T: Send { let file: File = diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index 97da926..0cc01de 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -1,6 +1,6 @@ use std::marker::PhantomData; -use std::path::Path; -use tokio::fs::{DirBuilder, File, OpenOptions}; +use std::path::PathBuf; +use tokio::fs::{File, OpenOptions}; use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, SeekFrom}; use async_trait::async_trait; @@ -38,7 +38,7 @@ where K: Encode + 
Decode + Ord, V: Encode + Decode + Clone + Eq + Hash, { - pub async fn new(file_name: &str) -> Result> { + pub async fn new(file_name: PathBuf) -> Result> { let file: File = OpenOptions::new() .read(true) .write(true) @@ -56,7 +56,7 @@ where }) } - pub async fn connect(file_name: &str) -> Result> { + pub async fn connect(file_name: PathBuf) -> Result> { let mut file: File = OpenOptions::new() .read(true) .write(true) @@ -124,7 +124,7 @@ mod tests { index.insert("bar".to_string(), 125).await.unwrap(); index.insert("bar".to_string(), 126).await.unwrap(); - let lookup = index.lookup("foo".to_string()).await.unwrap().unwrap(); + let lookup = index.lookup(&"foo".to_string()).await.unwrap().unwrap(); assert_eq!(lookup.len(), 2); assert!(lookup.contains(&123)); assert!(lookup.contains(&124)); @@ -139,7 +139,7 @@ mod tests { value_type: PhantomData::, }; - let lookup = decoded.lookup("foo".to_string()).await.unwrap().unwrap(); + let lookup = decoded.lookup(&"foo".to_string()).await.unwrap().unwrap(); assert_eq!(lookup.len(), 2); assert!(lookup.contains(&123)); assert!(lookup.contains(&124)); diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index d696e38..e2df22e 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -16,27 +16,16 @@ pub type Result = std::result::Result; pub type Column = u64; pub type FilePosition = u64; -// TODO: Consider introducing a phantom type for the data that's used in the store. +// TODO: Consider adding another type parameter for indexable values #[derive(Debug)] pub struct Store { - // TODO: This needs to track how many read-write cursors there are...? - // RWMutex - // {write: 0, read: n} ~> {write:0, read: n + 1} // create read - // {write: 0, read: n + 1} ~> {write:0, read: n} // destroy read - // {write: 0, read: 0} ~> {write: 1, read: 0} // create write - // {write: 1, read: 0} ~> {write: 0, read: 0} // destroy write - // primary_index: Vec>>, - // indexes: Vec>>>, - // primary_index: Index, - - // TODO: It's not good to have StoreHeader copied to all the cursors, since they may modify it. - // How to sync? - // All pub header: StoreHeader, pub data_type: PhantomData, - pub indexes: Vec>>, + pub indexes: StoreIndexes, } +pub type StoreIndexes = Vec>>; + pub type PositionOfValue = FilePosition; pub type PositionOfRow = FilePosition; @@ -75,13 +64,7 @@ impl Store { // We don't need the file right now. Only cursors will later open it. 
Self::create_empty_rows_file(path_to_rows, &header).await?; - // TODO: I need to construct indexes - // let primary_index: Index = Index::new( - // &format!("rows_{}", primary_column.to_string()), - // ).await?; - - // TODO - let indexes = vec![]; + let indexes: StoreIndexes = Self::create_initial_indexes(&header).await?; let store = Self { header, @@ -92,6 +75,42 @@ impl Store { Ok(store) } + pub fn path_to_index_file(header: &StoreHeader, column: Column) -> PathBuf { + let path_to_table = Path::new(&header.table_folder); + let path_to_index = path_to_table.join(&format!("{}_{}", ROWS_FILE_NAME, column.to_string())); + path_to_index + } + + pub async fn create_empty_index_at(header: &StoreHeader, column: Column) -> Result> + where T: Encode + Decode + Ord + { + let path_to_index = Self::path_to_index_file(&header, column); + let index = Index::new(path_to_index).await?; + + Ok(index) + } + + pub async fn create_initial_indexes(header: &StoreHeader) -> Result> + where T: Encode + Decode + Ord + { + let mut result: StoreIndexes = Vec::with_capacity(header.number_of_columns); + for _ in 0..header.number_of_columns { + result.push(None) + } + + result[header.primary_column as usize] = Some(Self::create_empty_index_at(&header, header.primary_column).await?); + + Ok(result) + } + + pub async fn connect_index_at(header: &StoreHeader, column: Column) -> Result> + where T: Encode + Decode + Ord + { + let path_to_index = Self::path_to_index_file(&header, column); + let index: Index = Index::connect(path_to_index).await?; + Ok(index) + } + pub async fn create_empty_rows_file(path_to_rows: PathBuf, header: &StoreHeader) -> Result { let mut file: File = OpenOptions::new() @@ -134,17 +153,26 @@ impl Store { }; - // let primary_index: Index = Index::connect( - // &format!("rows_{}", header.primary_column.to_string()), - // ).await?; + let indexes: StoreIndexes = { + let mut result = Vec::with_capacity(header.number_of_columns); + for (column, &is_indexed) in header.indexed_columns.iter().enumerate() { + if is_indexed { + result.push(None) + // TODO: Once index connect is working, uncomment this line (and remove the + // above .push line + // result.push(Some(Self::connect_index_at(&header, column as Column).await?)) + } else { + result.push(None) + } + } - // TODO - let indexes = vec![]; + result + }; let store = Self { header, data_type: PhantomData::, - indexes, + indexes }; Ok(store) } From 6db62c42d76b497146f678f677492b8de864f86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20Moravec?= Date: Sun, 4 Feb 2024 20:20:20 +0100 Subject: [PATCH 25/43] feat: index append encoding --- storage_engine/src/index.rs | 153 ++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 78 deletions(-) diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index 0cc01de..3cb84d2 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -1,33 +1,22 @@ -use std::marker::PhantomData; use std::path::PathBuf; use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, SeekFrom}; +use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, BufWriter}; -use async_trait::async_trait; use std::collections::{BTreeMap, HashSet}; use std::hash::Hash; -use crate::binary_coding::{decode, decode_sequence, encode, encode_sequence}; +use crate::binary_coding::{decode, encode}; use bincode; use bincode::{Decode, Encode}; -use tokio::fs; use crate::error::{DecodeErrorKind, Error}; -use bincode::error::DecodeError; -use std::mem::size_of; - type 
Result = std::result::Result; -// Implements a persistant self-balancing Binary Search Tree. Nope. -// We need fixed-size nodes. But we want to index Strings which are variable length. - #[derive(Debug)] pub struct Index { file: File, data: BTreeMap>, - key_type: PhantomData, - value_type: PhantomData, } #[derive(Debug)] @@ -48,61 +37,102 @@ where let data = BTreeMap::new(); - Ok(Index { - file, - data, - key_type: PhantomData::, - value_type: PhantomData::, - }) + Ok(Index { file, data }) } pub async fn connect(file_name: PathBuf) -> Result> { - let mut file: File = OpenOptions::new() + let file: File = OpenOptions::new() .read(true) .write(true) - .create(true) .open(file_name) .await?; - let mut bytes = vec![]; - file.read_to_end(&mut bytes).await?; - - let data = Index::decode_tree(&bytes) - .map_err(|e| Error::DecodeError(DecodeErrorKind::CorruptedData, e))?; - - Ok(Index { + let mut index = Index { file, - data, - key_type: PhantomData::, - value_type: PhantomData::, - }) + data: BTreeMap::new(), + }; + + index.load_from_file().await?; + Ok(index) } pub async fn insert(&mut self, k: K, v: V) -> Result<()> { + self.append_to_file(&k, &v).await?; self.data.entry(k).or_insert_with(HashSet::new).insert(v); Ok(()) } + pub fn insert_desynced(&mut self, k: K, v: V) -> () { + self.data.entry(k).or_insert_with(HashSet::new).insert(v); + } + pub async fn lookup(&self, k: &K) -> Result>> { - let hashset = self.data.get(k).unwrap(); - Ok(Some(hashset.clone())) + let hashset = self.data.get(k).cloned(); + Ok(hashset) } - pub async fn delete(&mut self, k: K, v: V) -> Result> { - Ok(Some( - self.data.entry(k).or_insert_with(HashSet::new).remove(&v), - )) + pub async fn delete(&mut self, k: K, v: V) -> Result<()> { + self.data.entry(k).and_modify(|values| { + values.remove(&v); + }); + self.dump_to_file().await } - fn encode(&self) -> Result> { + pub async fn sync_to_disk(&mut self) -> Result<()> { + self.dump_to_file().await + } + + async fn append_to_file(&mut self, key: &K, value: &V) -> Result<()> { let mut encoded = Vec::new(); - encoded.extend(encode(&self.data)?); - Ok(encoded) + encoded.extend(encode(key)?); + encoded.extend(encode(value)?); + + self.file.seek(std::io::SeekFrom::End(0)).await?; + self.file.write(&encoded).await?; + + Ok(()) } - fn decode_tree(data: &[u8]) -> std::result::Result>, DecodeError> { - let data: BTreeMap> = decode(data)?.0; - Ok(data) + async fn dump_to_file(&mut self) -> Result<()> { + let mut writer = BufWriter::new(&mut self.file); + writer.seek(std::io::SeekFrom::Start(0)).await?; + + let mut written: u64 = 0; + let mut encoded = Vec::new(); + for (key, value) in &self.data { + for v in value { + encoded.clear(); + encoded.extend(encode(key)?); + encoded.extend(encode(v)?); + writer.write(&encoded).await?; + written += encoded.len() as u64; + } + } + + writer.flush().await?; + self.file.set_len(written).await?; + Ok(()) + } + + async fn load_from_file(&mut self) -> Result<()> { + let mut bytes = vec![]; + + self.file.seek(std::io::SeekFrom::Start(0)).await?; + self.file.read_to_end(&mut bytes).await?; + + let mut cursor = 0; + while cursor < bytes.len() { + let (key, len) = decode(&bytes[cursor..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::CorruptedData, e))?; + cursor += len; + let (value, len) = decode(&bytes[cursor..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::CorruptedData, e))?; + cursor += len; + + self.insert_desynced(key, value); + } + + Ok(()) } } @@ -112,39 +142,6 @@ mod tests { #[tokio::test] async fn encode_decode() { - let mut 
index: Index = Index { - file: File::from_std(std::fs::File::create("test").unwrap()), - data: BTreeMap::new(), - key_type: PhantomData::, - value_type: PhantomData::, - }; - - index.insert("foo".to_string(), 123).await.unwrap(); - index.insert("foo".to_string(), 124).await.unwrap(); - index.insert("bar".to_string(), 125).await.unwrap(); - index.insert("bar".to_string(), 126).await.unwrap(); - - let lookup = index.lookup(&"foo".to_string()).await.unwrap().unwrap(); - assert_eq!(lookup.len(), 2); - assert!(lookup.contains(&123)); - assert!(lookup.contains(&124)); - println!("lookup {:?}", lookup); - - let encoded = index.encode().unwrap(); - let decoded = Index::::decode_tree(&encoded).unwrap(); - let decoded = Index { - file: File::from_std(std::fs::File::create("test").unwrap()), - data: decoded, - key_type: PhantomData::, - value_type: PhantomData::, - }; - - let lookup = decoded.lookup(&"foo".to_string()).await.unwrap().unwrap(); - assert_eq!(lookup.len(), 2); - assert!(lookup.contains(&123)); - assert!(lookup.contains(&124)); - println!("lookup {:?}", lookup); - - std::fs::remove_file("test").unwrap(); + todo!(); } } From 82300039fc27c59094c4740a5a1d251b0d507f76 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 20:45:57 +0100 Subject: [PATCH 26/43] Split cursor functionality further into traits. Prep for garbage collection. --- storage_engine/src/cursor.rs | 215 +++++++++++++++++++++-------------- storage_engine/src/main.rs | 2 +- 2 files changed, 133 insertions(+), 84 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 05f9d05..cf01190 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -37,6 +37,15 @@ pub struct WriteCursor<'a, T> { eof_file_position: FilePosition, } +// This is used as a cursor to temporary file during Garbage Collection +pub struct AppendOnlyCursor { + header: StoreHeader, + file: File, + data_type: PhantomData, + + eof_file_position: FilePosition, +} + // ===Traits=== #[async_trait] @@ -90,6 +99,14 @@ pub trait PrimitiveCursor { } } +#[async_trait] +pub trait PrimitiveWriteCursor: PrimitiveCursor { + async fn write_bytes(&mut self, bytes: &[u8]) -> Result { + Ok(self.file().write(bytes).await?) + } + +} + #[async_trait] pub trait CursorWithStoreHeader: PrimitiveCursor { fn header(&self) -> &StoreHeader; @@ -241,6 +258,53 @@ pub trait CursorWithAccessToIndex: CursorWithStoreHeader { } } +#[async_trait] +pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWriteCursor { + fn header_mut(&mut self) -> &mut StoreHeader; + fn set_eof_file_position(&mut self, new_file_position: FilePosition); + + // ===Store Header Manipulation=== + async fn increment_total_count(&mut self) -> Result<()> + where T: Send + { + self.seek_to_start().await?; + self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; + let new_count = self.header_mut().increment_total_count(); + self.write_bytes(&encode::(&new_count)?).await?; + Ok(()) + } + + async fn increment_deleted_count(&mut self) -> Result<()> + where T: Send + { + self.seek_to_start().await?; + self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?; + let new_count = self.header_mut().increment_deleted_count(); + self.write_bytes(&encode::(&new_count)?).await?; + Ok(()) + } + + // ===Append Entry=== + + // Moves cursor to the end. + // Returns file position to the start of the new entry. 
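+ // (On disk, as Entry::encode implies, each appended record is an entry
+ // header -- deletion flag plus the byte size of every column -- followed by
+ // the bincode-encoded column values; the returned FilePosition points at
+ // the first header byte.)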
+ async fn append_entry(&mut self, entry: &Entry) -> Result + where T: Encode + Send + Sync + { + self.increment_total_count().await?; + + let encoded_entry: Vec = entry.encode()?; + let file_position = self.seek_to_end().await?; + self.write_bytes(&encoded_entry).await?; + + let eof_file_position: FilePosition = self.current_file_position().await?; + self.set_eof_file_position(eof_file_position); + + Ok(file_position) + } +} + +// ===========Implementations============= // ===PrimitiveCursor=== impl PrimitiveCursor for ReadCursor<'_, T> { fn file(&mut self) -> &mut File { @@ -262,33 +326,56 @@ impl PrimitiveCursor for WriteCursor<'_, T> { } } -// ===CursorWithStoreHeader=== -impl CursorWithStoreHeader for ReadCursor<'_, T> { - fn header(&self) -> &StoreHeader { - &self.header +impl PrimitiveCursor for AppendOnlyCursor { + fn file(&mut self) -> &mut File { + &mut self.file + } + + fn eof_file_position(&self) -> FilePosition { + self.eof_file_position } } +// ===PrimitiveWriteCursor=== +impl PrimitiveWriteCursor for WriteCursor<'_, T> {} +impl PrimitiveWriteCursor for AppendOnlyCursor {} + + +// ===CursorWithStoreHeader=== +impl CursorWithStoreHeader for ReadCursor<'_, T> { fn header(&self) -> &StoreHeader { &self.header } +} + +impl CursorWithStoreHeader for WriteCursor<'_, T> { - fn header(&self) -> &StoreHeader { - &self.header - } + fn header(&self) -> &StoreHeader { &self.header } +} + +impl CursorWithStoreHeader for AppendOnlyCursor { + fn header(&self) -> &StoreHeader { &self.header } +} + +// ===CursorWithWriteStoreHeader=== +impl CursorWithWriteStoreHeader for WriteCursor<'_, T> { + fn header_mut(&mut self) -> &mut StoreHeader { self.header } + fn set_eof_file_position(&mut self, new_file_position: FilePosition) { self.eof_file_position = new_file_position } +} + +impl CursorWithWriteStoreHeader for AppendOnlyCursor { + fn header_mut(&mut self) -> &mut StoreHeader { &mut self.header } + fn set_eof_file_position(&mut self, new_file_position: FilePosition) { self.eof_file_position = new_file_position } +} // ===CursorWithAccessToIndex=== impl CursorWithAccessToIndex for ReadCursor<'_, T> { - fn indexes(&mut self) -> &[Option>] { - &self.indexes - } + fn indexes(&mut self) -> &[Option>] { &self.indexes } } impl CursorWithAccessToIndex for WriteCursor<'_, T> { - fn indexes(&mut self) -> &[Option>] { - &self.indexes - } + fn indexes(&mut self) -> &[Option>] { &self.indexes } } + impl <'cursor, T> ReadCursor<'cursor, T> { pub async fn new<'store: 'cursor>(store: &'store Store) -> Result where T: Send + Sync @@ -317,6 +404,8 @@ impl <'cursor, T> ReadCursor<'cursor, T> { } } + + impl <'cursor, T> WriteCursor<'cursor, T> // TODO: Consider adding this manually to where it is really needed where T: Sync @@ -375,60 +464,16 @@ impl <'cursor, T> WriteCursor<'cursor, T> Ok(cursor) } - - // ===Primitive Operations=== - async fn write_bytes(&mut self, bytes: &[u8]) -> Result { - Ok(self.file().write(bytes).await?) 
- } - - // ===Store Header Manipulation=== - async fn increment_total_count(&mut self) -> Result<()> - where T: Send - { - self.seek_to_start().await?; - self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; - let new_count = self.header.increment_total_count(); - self.write_bytes(&encode::(&new_count)?).await?; - Ok(()) - } - - async fn increment_deleted_count(&mut self) -> Result<()> - where T: Send - { - self.seek_to_start().await?; - self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?; - let new_count = self.header.increment_deleted_count(); - self.write_bytes(&encode::(&new_count)?).await?; - Ok(()) - } - // ===Entry Header Manipulation=== // assumes we are at the start of valid entry. - async fn set_new_entry_header(&mut self, entry_header: EntryHeader) -> Result<()> { + async fn set_new_entry_header(&mut self, entry_header: EntryHeader) -> Result<()> + where T: Send + { let bytes: Vec = entry_header.encode()?; self.write_bytes(&bytes).await?; Ok(()) } - // ===Append Entry=== - - // Moves cursor to the end. - // Returns file position to the start of the new entry. - pub async fn append_entry(&mut self, entry: &Entry) -> Result - where T: Encode + Send - { - self.increment_total_count().await?; - - let encoded_entry: Vec = entry.encode()?; - let file_position = self.seek_to_end().await?; - self.write_bytes(&encoded_entry).await?; - - let eof_file_position: FilePosition = self.current_file_position().await?; - self.eof_file_position = eof_file_position; - - Ok(file_position) - } - // ===Deletion=== pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> where T: Send @@ -461,18 +506,31 @@ impl <'cursor, T> WriteCursor<'cursor, T> } } - async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> { + // ===Garbage Collection=== + async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> + where T: Send + { // TODO: What should be the policy? Counting size of garbage? Counting how many entries are // garbage? if self.header.deleted_count > 100 { - todo!() - } else { - Ok(()) + self.initiate_garbage_collection().await?; } + Ok(()) } async fn initiate_garbage_collection(&mut self) -> Result where T: Send + { + // We'll dump all alive entries into a new file. + let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; + + // In it there will be only the alive rows. + // Afterwards we swap the files, and delete the garbage. + todo!() + } + + async fn spawn_cursor_to_intermediate_file(&self) -> Result> + where T: Send { let table_folder = self.header.table_folder.to_string(); let path_to_table = Path::new(&table_folder); @@ -480,36 +538,27 @@ impl <'cursor, T> WriteCursor<'cursor, T> let intermediate_file: File = Store::::create_empty_rows_file(path_to_rows, &self.header).await?; - let mut intermediate_header: StoreHeader = StoreHeader { + let intermediate_header: StoreHeader = StoreHeader { table_folder, number_of_columns: self.header.number_of_columns, deleted_count: 0, total_count: 0, primary_column: self.header.primary_column, - indexed_columns: todo!() + indexed_columns: self.header.indexed_columns.clone(), }; - // Creates a new cursor to the intermediate file in which we'll dump the live entries. - // let mut cursor_to_intermediate = Self { - // header: &mut intermediate_header, - // file: intermediate_file, - // data_type: PhantomData::, + // Creates a new (append) cursor to the intermediate file in which we'll dump the live entries. 
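+ // (The swap left as todo!() in initiate_garbage_collection could presumably
+ // be a single fs::rename of this intermediate file over the old rows file,
+ // followed by reopening self.file -- a sketch of the plan, not implemented
+ // here.)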
+ let mut cursor_to_intermediate = AppendOnlyCursor { + header: intermediate_header, + file: intermediate_file, + data_type: PhantomData::, - // eof_file_position: 0, - // }; - let mut cursor_to_intermediate: Self = todo!(); + eof_file_position: 0, + }; let eof_file_position: FilePosition = cursor_to_intermediate.seek_to_end().await?; cursor_to_intermediate.eof_file_position = eof_file_position; - - - // TODO: intermediate_header does not live long enough, so after garbage collection is - // done, we need to use it in the swap. - cursor_to_intermediate.header = todo!(); - - // In it there will be only the alive rows. - // Afterwards we swap the files, and delete the garbage. - todo!() + Ok(cursor_to_intermediate) } // ===Indexing=== diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 5f4bb69..d8df150 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -9,7 +9,7 @@ mod store_header; use crate::entry::{Entry, EntryDetailed}; use crate::storage_engine::{Store, FilePosition}; -use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, PrimitiveCursor}; +use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteStoreHeader}; type Data = u32; From a5c7306b90f3055861c0728c385d3292a4268d22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20Moravec?= Date: Sun, 4 Feb 2024 20:48:10 +0100 Subject: [PATCH 27/43] tests: index testing --- storage_engine/src/index.rs | 172 ++++++++++++++++++++++++++++++++---- 1 file changed, 154 insertions(+), 18 deletions(-) diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index 3cb84d2..5d2187a 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -75,25 +75,10 @@ where self.data.entry(k).and_modify(|values| { values.remove(&v); }); - self.dump_to_file().await + self.sync_to_disk().await } pub async fn sync_to_disk(&mut self) -> Result<()> { - self.dump_to_file().await - } - - async fn append_to_file(&mut self, key: &K, value: &V) -> Result<()> { - let mut encoded = Vec::new(); - encoded.extend(encode(key)?); - encoded.extend(encode(value)?); - - self.file.seek(std::io::SeekFrom::End(0)).await?; - self.file.write(&encoded).await?; - - Ok(()) - } - - async fn dump_to_file(&mut self) -> Result<()> { let mut writer = BufWriter::new(&mut self.file); writer.seek(std::io::SeekFrom::Start(0)).await?; @@ -114,6 +99,17 @@ where Ok(()) } + async fn append_to_file(&mut self, key: &K, value: &V) -> Result<()> { + let mut encoded = Vec::new(); + encoded.extend(encode(key)?); + encoded.extend(encode(value)?); + + self.file.seek(std::io::SeekFrom::End(0)).await?; + self.file.write(&encoded).await?; + + Ok(()) + } + async fn load_from_file(&mut self) -> Result<()> { let mut bytes = vec![]; @@ -141,7 +137,147 @@ mod tests { use super::*; #[tokio::test] - async fn encode_decode() { - todo!(); + async fn connect_to_new() { + let file_name = PathBuf::from("connect_to_new"); + if file_name.exists() { + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + { + let index = Index::::new(file_name.clone()).await.unwrap(); + assert_eq!(index.data.len(), 0); + } + + { + let index = Index::::connect(file_name.clone()).await.unwrap(); + assert_eq!(index.data.len(), 0); + } + + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + #[tokio::test] + async fn inserting() { + let file_name = PathBuf::from("inserting"); + if file_name.exists() { + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + { + let mut index = 
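// A note on the two write paths reordered above: `insert` appears to get
// away with appending a single encoded (key, value) pair per call (that is
// what `append_to_file` provides), because replaying pairs into a
// BTreeMap<K, HashSet<V>> is idempotent and order-insensitive. `delete`
// has no tombstone record it could append, so it falls back to
// `sync_to_disk`, which seeks to offset 0 and rewrites the entire map,
// making each delete O(index size) in disk traffic.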
Index::::new(file_name.clone()).await.unwrap(); + index.insert(1, 2).await.unwrap(); + index.insert(1, 3).await.unwrap(); + index.insert(1, 4).await.unwrap(); + index.insert(2, 3).await.unwrap(); + index.insert(2, 4).await.unwrap(); + index.insert(2, 5).await.unwrap(); + + assert_eq!(index.data.len(), 2); + assert_eq!(index.data.get(&1).unwrap().len(), 3); + assert_eq!(index.data.get(&2).unwrap().len(), 3); + } + + { + let index = Index::::connect(file_name.clone()).await.unwrap(); + assert_eq!(index.data.len(), 2); + assert_eq!(index.data.get(&1).unwrap().len(), 3); + assert_eq!(index.data.get(&2).unwrap().len(), 3); + } + + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + #[tokio::test] + async fn lookuping() { + let file_name = PathBuf::from("lookuping"); + if file_name.exists() { + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + { + let mut index = Index::::new(file_name.clone()).await.unwrap(); + index.insert(1, 2).await.unwrap(); + index.insert(1, 3).await.unwrap(); + index.insert(1, 4).await.unwrap(); + index.insert(2, 3).await.unwrap(); + index.insert(2, 4).await.unwrap(); + index.insert(2, 5).await.unwrap(); + + assert_eq!(index.lookup(&1).await.unwrap().unwrap().len(), 3); + assert_eq!(index.lookup(&2).await.unwrap().unwrap().len(), 3); + assert_eq!(index.lookup(&3).await.unwrap(), None); + + let first = index.lookup(&1).await.unwrap().unwrap(); + assert!(first.contains(&2)); + assert!(first.contains(&3)); + assert!(first.contains(&4)); + + let second = index.lookup(&2).await.unwrap().unwrap(); + assert!(second.contains(&3)); + assert!(second.contains(&4)); + assert!(second.contains(&5)); + } + + { + let index = Index::::connect(file_name.clone()).await.unwrap(); + assert_eq!(index.lookup(&1).await.unwrap().unwrap().len(), 3); + assert_eq!(index.lookup(&2).await.unwrap().unwrap().len(), 3); + assert_eq!(index.lookup(&3).await.unwrap(), None); + + let first = index.lookup(&1).await.unwrap().unwrap(); + assert!(first.contains(&2)); + assert!(first.contains(&3)); + assert!(first.contains(&4)); + + let second = index.lookup(&2).await.unwrap().unwrap(); + assert!(second.contains(&3)); + assert!(second.contains(&4)); + assert!(second.contains(&5)); + } + + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + #[tokio::test] + async fn deleting() { + let file_name = PathBuf::from("deleting"); + if file_name.exists() { + tokio::fs::remove_file(&file_name).await.unwrap(); + } + + { + let mut index = Index::::new(file_name.clone()).await.unwrap(); + index.insert(1, 2).await.unwrap(); + index.insert(1, 3).await.unwrap(); + index.insert(1, 4).await.unwrap(); + index.insert(2, 3).await.unwrap(); + index.insert(2, 4).await.unwrap(); + index.insert(2, 5).await.unwrap(); + + assert!(index.lookup(&1).await.unwrap().unwrap().contains(&2)); + index.delete(1, 2).await.unwrap(); + assert!(!index.lookup(&1).await.unwrap().unwrap().contains(&2)); + + assert!(index.lookup(&2).await.unwrap().unwrap().contains(&3)); + index.delete(2, 3).await.unwrap(); + assert!(!index.lookup(&2).await.unwrap().unwrap().contains(&3)); + } + + { + let mut index = Index::::connect(file_name.clone()).await.unwrap(); + + assert!(!index.lookup(&1).await.unwrap().unwrap().contains(&2)); + assert!(!index.lookup(&2).await.unwrap().unwrap().contains(&3)); + + assert!(index.lookup(&1).await.unwrap().unwrap().contains(&3)); + index.delete(1, 3).await.unwrap(); + assert!(!index.lookup(&1).await.unwrap().unwrap().contains(&3)); + + assert!(index.lookup(&1).await.unwrap().unwrap().contains(&4)); + 
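// These reconnect blocks exercise the recovery path: `connect` replays the
// index file back into the in-memory map. A minimal model of that replay,
// with concrete u32/u64 standing in for the generic key and FilePosition
// types (an assumption; the tests' real type parameters are elided above):
fn replay_pairs(pairs: &[(u32, u64)]) -> std::collections::BTreeMap<u32, std::collections::HashSet<u64>> {
    let mut data = std::collections::BTreeMap::new();
    for &(k, v) in pairs {
        // Duplicate pairs collapse harmlessly into the HashSet.
        data.entry(k).or_insert_with(std::collections::HashSet::new).insert(v);
    }
    data
}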
assert!(index.lookup(&2).await.unwrap().unwrap().contains(&4)); + assert!(index.lookup(&2).await.unwrap().unwrap().contains(&5)); + } + + tokio::fs::remove_file(&file_name).await.unwrap(); } } From 28f182c4b197a9bfc6e24e17ca8de2eb9d9b645f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20Moravec?= Date: Sun, 4 Feb 2024 20:52:17 +0100 Subject: [PATCH 28/43] refactor: remove long uses --- storage_engine/src/index.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index 5d2187a..b5c42c1 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -4,6 +4,7 @@ use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, BufWriter}; use std::collections::{BTreeMap, HashSet}; use std::hash::Hash; +use std::io::SeekFrom; use crate::binary_coding::{decode, encode}; use bincode; @@ -80,7 +81,7 @@ where pub async fn sync_to_disk(&mut self) -> Result<()> { let mut writer = BufWriter::new(&mut self.file); - writer.seek(std::io::SeekFrom::Start(0)).await?; + writer.seek(SeekFrom::Start(0)).await?; let mut written: u64 = 0; let mut encoded = Vec::new(); @@ -104,7 +105,7 @@ where encoded.extend(encode(key)?); encoded.extend(encode(value)?); - self.file.seek(std::io::SeekFrom::End(0)).await?; + self.file.seek(SeekFrom::End(0)).await?; self.file.write(&encoded).await?; Ok(()) @@ -113,7 +114,7 @@ where async fn load_from_file(&mut self) -> Result<()> { let mut bytes = vec![]; - self.file.seek(std::io::SeekFrom::Start(0)).await?; + self.file.seek(SeekFrom::Start(0)).await?; self.file.read_to_end(&mut bytes).await?; let mut cursor = 0; @@ -135,12 +136,13 @@ where #[cfg(test)] mod tests { use super::*; + use tokio::fs::remove_file; #[tokio::test] async fn connect_to_new() { let file_name = PathBuf::from("connect_to_new"); if file_name.exists() { - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } { @@ -153,14 +155,14 @@ mod tests { assert_eq!(index.data.len(), 0); } - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } #[tokio::test] async fn inserting() { let file_name = PathBuf::from("inserting"); if file_name.exists() { - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } { @@ -184,14 +186,14 @@ mod tests { assert_eq!(index.data.get(&2).unwrap().len(), 3); } - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } #[tokio::test] async fn lookuping() { let file_name = PathBuf::from("lookuping"); if file_name.exists() { - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } { @@ -235,14 +237,14 @@ mod tests { assert!(second.contains(&5)); } - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } #[tokio::test] async fn deleting() { let file_name = PathBuf::from("deleting"); if file_name.exists() { - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } { @@ -278,6 +280,6 @@ mod tests { assert!(index.lookup(&2).await.unwrap().unwrap().contains(&5)); } - tokio::fs::remove_file(&file_name).await.unwrap(); + remove_file(&file_name).await.unwrap(); } } From b64819f28bc48cb8047744e9bd56d8b75b069fcd Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 21:09:46 +0100 Subject: [PATCH 29/43] Remove unnecessar PhantomData --- 
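A note on why the marker was removable: `PhantomData<T>` is only needed while no real field mentions `T`. Once the cursors gained their `indexes` slices, which mention `T` through `Index<T, FilePosition>`, the parameter is anchored by a live field and the marker is dead weight. A sketch of the rule, with illustrative names rather than the crate's:

use std::marker::PhantomData;

// Without any field using T, the compiler rejects the parameter unless a
// marker anchors it:
struct CursorBefore<T> {
    _data_type: PhantomData<T>,
}

// A field that mentions T anchors the parameter on its own:
struct CursorAfter<'a, T> {
    indexes: &'a [Option<T>],
}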
storage_engine/src/cursor.rs | 25 ++++++++++++++----------- storage_engine/src/entry.rs | 7 +++++++ storage_engine/src/storage_engine.rs | 3 --- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index cf01190..8e0efe0 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -2,7 +2,7 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; use tokio::fs::{File, OpenOptions}; use std::path::Path; use std::marker::PhantomData; -use crate::error::{Error, DecodeErrorKind}; +use std::collections::{BTreeMap, HashSet}; use async_trait::async_trait; @@ -10,6 +10,7 @@ use bincode; use bincode::{Decode, Encode}; use crate::binary_coding::{encode, decode}; +use crate::error::{Error, DecodeErrorKind}; use crate::entry::{Entry, EntryDetailed}; use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; use crate::store_header::StoreHeader; @@ -23,7 +24,6 @@ pub struct ReadCursor<'a, T> { header: StoreHeader, indexes: &'a [Option>], file: File, - data_type: PhantomData, eof_file_position: FilePosition, } @@ -32,7 +32,6 @@ pub struct WriteCursor<'a, T> { header: &'a mut StoreHeader, indexes: &'a mut [Option>], file: File, - data_type: PhantomData, eof_file_position: FilePosition, } @@ -390,7 +389,6 @@ impl <'cursor, T> ReadCursor<'cursor, T> { let mut cursor = Self { header: store.header.clone(), file, - data_type: store.data_type, indexes: &store.indexes, eof_file_position: 0, @@ -425,7 +423,6 @@ impl <'cursor, T> WriteCursor<'cursor, T> let mut cursor = Self { header: &mut store.header, file, - data_type: store.data_type, indexes: &mut store.indexes, eof_file_position: 0, @@ -451,7 +448,6 @@ impl <'cursor, T> WriteCursor<'cursor, T> let mut cursor = Self { header, file, - data_type: PhantomData::, indexes, eof_file_position: 0, @@ -476,7 +472,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> // ===Deletion=== pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> - where T: Send + where T: Send + Decode + Encode { self.seek_to(file_position).await?; let mut entry_header = self.read_entry_header().await?; @@ -495,7 +491,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> } async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result>> - where T: Decode + PartialEq + Send + Sync + where T: Decode + Encode + PartialEq + Send + Sync { let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?; if let Some(entry) = maybe_entry { @@ -508,7 +504,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> // ===Garbage Collection=== async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> - where T: Send + where T: Send + Decode + Encode { // TODO: What should be the policy? Counting size of garbage? Counting how many entries are // garbage? @@ -519,11 +515,18 @@ impl <'cursor, T> WriteCursor<'cursor, T> } async fn initiate_garbage_collection(&mut self) -> Result - where T: Send + where T: Send + Decode + Encode { - // We'll dump all alive entries into a new file. let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; + let in_memory_index: BTreeMap> = BTreeMap::new(); + + // We'll dump all alive entries into a new file. + while let Some(live_entry) = self.next_alive().await? { + let file_position = cursor_to_intermediate.append_entry(&live_entry.forget()).await?; + // TODO: Start indexing all of the indexable columns from scratch. + } + // In it there will be only the alive rows. 
// Afterwards we swap the files, and delete the garbage. todo!() diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs index c628979..b292051 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/entry.rs @@ -48,4 +48,11 @@ impl EntryDetailed { .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryData, e))?; Ok(EntryDetailed { header, file_position, data }) } + + pub fn forget(self) -> Entry { + Entry { + header: self.header.into(), + data: self.data, + } + } } diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index e2df22e..6113c88 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -20,7 +20,6 @@ pub type FilePosition = u64; #[derive(Debug)] pub struct Store { pub header: StoreHeader, - pub data_type: PhantomData, pub indexes: StoreIndexes, } @@ -68,7 +67,6 @@ impl Store { let store = Self { header, - data_type: PhantomData::, indexes, }; @@ -171,7 +169,6 @@ impl Store { let store = Self { header, - data_type: PhantomData::, indexes }; Ok(store) From 2357ea8230e4a9f1f69f3fdeeb333bc49c9aec5c Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 21:52:48 +0100 Subject: [PATCH 30/43] Implement garbage collection without indexes --- storage_engine/src/cursor.rs | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 8e0efe0..72a6a4c 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -518,18 +518,37 @@ impl <'cursor, T> WriteCursor<'cursor, T> where T: Send + Decode + Encode { let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; - + // This will be a vector of such BTree maps... let in_memory_index: BTreeMap> = BTreeMap::new(); // We'll dump all alive entries into a new file. - while let Some(live_entry) = self.next_alive().await? { - let file_position = cursor_to_intermediate.append_entry(&live_entry.forget()).await?; - // TODO: Start indexing all of the indexable columns from scratch. + let mut entries_deleted = 0; + { + while let Some(live_entry) = self.next_alive().await? { + entries_deleted += 1; + let file_position = cursor_to_intermediate.append_entry(&live_entry.forget()).await?; + // TODO: Start indexing all of the indexable columns from scratch. + } } - // In it there will be only the alive rows. + // TODO: Create a new indexes from in_memory_index. + // Afterwards we swap the files, and delete the garbage. - todo!() + // TODO: + // What needs to be done? + // 1. We take self cursor and mutate it + + // swapping headers + self.header.deleted_count = 0; + self.header.total_count = cursor_to_intermediate.header.total_count; + + // TODO: We'll actually have to iterate through all the indexes and swap each of them. 
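The reason the index swap below is left as `todo!()` rather than a plain assignment: the cursor's `indexes` field is a mutable borrow of a slice the `Store` owns, while the indexes rebuilt during collection are locals of this function, so no `&'a mut` pointing at them can legally escape. The only way to publish them through the existing borrow is element by element, which is how patch 34 later resolves it via `Index::reset`. A compiling model of the distinction, with `u32` standing in for `Index<T, FilePosition>`:

// The cursor's view: a mutable borrow of storage owned by the Store.
fn publish(view: &mut [Option<u32>], rebuilt: Vec<u32>) {
    for (slot, fresh) in view.iter_mut().zip(rebuilt) {
        *slot = Some(fresh); // element-wise assignment flows through the borrow
    }
    // By contrast, `view = &mut rebuilt_storage[..]` could never escape this
    // function: the rebuilt storage is local and dropped at the closing brace.
}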
+ self.indexes = todo!(); + self.file = cursor_to_intermediate.file; + + self.eof_file_position = cursor_to_intermediate.eof_file_position; + + Ok(entries_deleted) } async fn spawn_cursor_to_intermediate_file(&self) -> Result> From 813911293431a0078ee42726aa22a6639837706e Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Sun, 4 Feb 2024 23:54:22 +0100 Subject: [PATCH 31/43] Make use of indexes --- storage_engine/src/cursor.rs | 108 +++++++++++++++++++++++---- storage_engine/src/entry.rs | 4 +- storage_engine/src/error.rs | 4 + storage_engine/src/main.rs | 26 +++++-- storage_engine/src/storage_engine.rs | 5 +- 5 files changed, 120 insertions(+), 27 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 72a6a4c..5587d6b 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -245,18 +245,6 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { } } -#[async_trait] -pub trait CursorWithAccessToIndex: CursorWithStoreHeader { - fn indexes(&mut self) -> &[Option>]; - - async fn find_in_index(&mut self, k: &T) -> Result> - where T: Encode + Decode + Ord + Send + Sync - { - // let x = self.primary_index().lookup(k).await?; - todo!() - } -} - #[async_trait] pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWriteCursor { fn header_mut(&mut self) -> &mut StoreHeader; @@ -287,7 +275,7 @@ pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWri // Moves cursor to the end. // Returns file position to the start of the new entry. - async fn append_entry(&mut self, entry: &Entry) -> Result + async fn append_entry_no_indexing(&mut self, entry: &Entry) -> Result where T: Encode + Send + Sync { self.increment_total_count().await?; @@ -303,6 +291,94 @@ pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWri } } +#[async_trait] +pub trait CursorWithAccessToIndex: CursorWithStoreHeader { + fn indexes(&mut self) -> &[Option>]; + + async fn index_lookup(&mut self, column: Column, k: &T) -> Result>> + where T: Encode + Decode + Ord + Send + Sync + { + match &self.indexes()[column as usize] { + Some(index) => { + let file_positions = index.lookup(k).await?.unwrap_or_else(|| HashSet::new()); + let mut entries: Vec> = vec![]; + for &file_position in file_positions.iter() { + match self.read_entry_at(file_position).await? { + Some(entry) => { + entries.push(entry) + }, + None => { + return Err(Error::IndexIsStoringEofFilePosition(column)) + } + } + } + + Ok(entries) + }, + None => + Err(Error::AttemptToIndexNonIndexableColumn(column)) + } + } + + // TODO: I also need the global find + async fn lookup(&mut self, column: Column, k: &T) -> Result>> { + todo!() + } +} + +#[async_trait] +pub trait CursorWithWriteAccessToIndex: CursorWithAccessToIndex + CursorWithWriteStoreHeader { + fn indexes_mut(&mut self) -> &mut [Option>]; + + // Assumes that the column is indexable. + fn mut_index_at(&mut self, column: Column) -> &mut Index { + match &mut self.indexes_mut()[column as usize] { + Some(index) => { + index + }, + None => { + unreachable!() + } + } + } + + // Assumes that the column is indexable. + async fn insert_into_index(&mut self, column: Column, value: T, file_position: FilePosition) -> Result<()> + where T: Encode + Decode + Ord + Send + Sync + 'async_trait + { + let index = self.mut_index_at(column as Column); + index.insert(value, file_position).await?; + Ok(()) + } + + // Assumes that the column is indexable. 
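`insert_into_index` above and `delete_from_index` just below are the per-column primitives; `insert_entry`, later in this hunk, drives them by zipping the row's values against the header's `indexed_columns` flags, so only flagged columns pay for index maintenance. A self-contained model of that walk, with `u32`/`u64` standing in for the generic `T` and `FilePosition`:

use std::collections::{BTreeMap, HashSet};

fn index_row(
    row: &[u32],
    indexed_columns: &[bool],
    indexes: &mut [Option<BTreeMap<u32, HashSet<u64>>>],
    file_position: u64,
) {
    for (column, (&value, &should_index)) in row.iter().zip(indexed_columns).enumerate() {
        if should_index {
            // Invariant mirrored from the patch: a flagged column always
            // carries Some(index).
            if let Some(index) = indexes[column].as_mut() {
                index.entry(value).or_insert_with(HashSet::new).insert(file_position);
            }
        }
    }
}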
+ async fn delete_from_index(&mut self, column: Column, value: T, file_position: FilePosition) -> Result<()> + where T: Encode + Decode + Ord + Send + Sync + 'async_trait + { + let index = self.mut_index_at(column as Column); + index.delete(value, file_position).await?; + Ok(()) + } + + async fn insert_entry(&mut self, entry: Entry) -> Result + // TODO: Why is 'async_trait necessary? + where T: Encode + Decode + Ord + Send + Sync + 'async_trait + { + let file_position = self.append_entry_no_indexing(&entry).await?; + + // insert the indexable values of the entry into corresponding indexes. + for (column, (value, should_index)) in entry.data.into_iter().zip(self.header().indexed_columns.clone()).enumerate() { + if should_index { + // SAFETY: If should_index is true, then the column is indexable. + self.insert_into_index(column as Column, value, file_position).await? + } + } + + Ok(file_position) + } +} + + // ===========Implementations============= // ===PrimitiveCursor=== impl PrimitiveCursor for ReadCursor<'_, T> { @@ -373,6 +449,10 @@ impl CursorWithAccessToIndex for WriteCursor<'_, T> { fn indexes(&mut self) -> &[Option>] { &self.indexes } } +// ===CursorWithWriteAccessToIndex=== +impl CursorWithWriteAccessToIndex for WriteCursor<'_, T> { + fn indexes_mut(&mut self) -> &mut [Option>] { self.indexes } +} impl <'cursor, T> ReadCursor<'cursor, T> { @@ -526,7 +606,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> { while let Some(live_entry) = self.next_alive().await? { entries_deleted += 1; - let file_position = cursor_to_intermediate.append_entry(&live_entry.forget()).await?; + let file_position = cursor_to_intermediate.append_entry_no_indexing(&live_entry.forget()).await?; // TODO: Start indexing all of the indexable columns from scratch. } } diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs index b292051..ed0c1fa 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/entry.rs @@ -7,8 +7,8 @@ use crate::entry_header::{EntryHeader, EntryHeaderWithDataSize}; #[derive(Debug)] pub struct Entry { - header: EntryHeader, - data: Vec, + pub header: EntryHeader, + pub data: Vec, } #[derive(Debug)] diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index 951f167..a80937b 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -1,7 +1,11 @@ +use crate::storage_engine::Column; + #[derive(Debug)] pub enum Error { DecodeError(DecodeErrorKind, bincode::error::DecodeError), EncodeError(bincode::error::EncodeError), + AttemptToIndexNonIndexableColumn(Column), + IndexIsStoringEofFilePosition(Column), IoError(std::io::Error), InvalidStoreHeader, } diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index d8df150..21d560d 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -9,7 +9,7 @@ mod store_header; use crate::entry::{Entry, EntryDetailed}; use crate::storage_engine::{Store, FilePosition}; -use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteStoreHeader}; +use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteStoreHeader, CursorWithWriteAccessToIndex, CursorWithAccessToIndex}; type Data = u32; @@ -44,10 +44,12 @@ async fn create_or_connect() -> Result> { } -async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: &Entry) -> Result { +async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: Entry) -> Result { println!("APPENDING"); println!("entry == {:?}", entry); - let file_position: FilePosition = 
cursor.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; + + // let file_position: FilePosition = cursor.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; + let file_position: FilePosition = cursor.insert_entry(entry).await.map_err(|e| e.to_io_or_panic())?; println!("file_position == {:?}", file_position); Ok(file_position) } @@ -63,17 +65,20 @@ async fn read_entry(cursor: &mut ReadCursor<'_, Data>, file_position: FilePositi async fn append_bunch_of_entries(store: &mut Store) -> Result<()> { let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); - append_entry(&mut cursor, &entry0).await?; + append_entry(&mut cursor, entry0).await?; let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); - append_entry(&mut cursor, &entry1).await?; + append_entry(&mut cursor, entry1).await?; // println!("{:?}", store.read_all_bytes().await?); let entry2: Entry = Entry::new(vec![99, 98, 97, 96, 95]); - append_entry(&mut cursor, &entry2).await?; + append_entry(&mut cursor, entry2).await?; let entry3: Entry = Entry::new(vec![50,50,50,50,50]); - append_entry(&mut cursor, &entry3).await?; + append_entry(&mut cursor, entry3).await?; + + let entry4: Entry = Entry::new(vec![1,50,50,50,50]); // same 0-th column as entry0 + append_entry(&mut cursor, entry4).await?; Ok(()) } @@ -141,6 +146,13 @@ async fn main() -> Result<()> { println!("{:?}", x); } + { + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let entries = cursor.index_lookup(0, &1).await.map_err(|e| e.to_io_or_panic())?; + println!("ARE INDEXES WORKING???"); + println!("{:?}", entries); + } + // { // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; // let column = 3; diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 6113c88..c631228 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -155,10 +155,7 @@ impl Store { let mut result = Vec::with_capacity(header.number_of_columns); for (column, &is_indexed) in header.indexed_columns.iter().enumerate() { if is_indexed { - result.push(None) - // TODO: Once index connect is working, uncomment this line (and remove the - // above .push line - // result.push(Some(Self::connect_index_at(&header, column as Column).await?)) + result.push(Some(Self::connect_index_at(&header, column as Column).await?)) } else { result.push(None) } From b0f05f36f2f7051530722026b9f10ca5187f20b6 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 00:44:14 +0100 Subject: [PATCH 32/43] Can build indexes on new columns --- storage_engine/src/cursor.rs | 73 ++++++++++++++++++++++++---- storage_engine/src/error.rs | 1 + storage_engine/src/main.rs | 29 ++++++++++- storage_engine/src/storage_engine.rs | 13 +++-- storage_engine/src/store_header.rs | 8 +++ 5 files changed, 110 insertions(+), 14 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 5587d6b..f15a80a 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -173,9 +173,10 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { } // Like next, but only reads the column, not the whole entry. 
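The hunk below threads each entry's starting offset out of `next_at_column`, and the order of operations is the point: the offset that identifies an entry is the one captured before the header read advances the file, because that start position is exactly what an index has to store. A toy, synchronous model of the capture order (all names and sizes here are illustrative):

struct FakeCursor { pos: u64 }

impl FakeCursor {
    fn read_header(&mut self) -> bool { self.pos += 2; true } // advances past the header
    fn read_value(&mut self) -> u32 { self.pos += 4; 7 }      // advances past one column

    fn next_at_column(&mut self) -> (bool, u64, u32) {
        let entry_start = self.pos; // capture first: this is what the index stores
        let is_alive = self.read_header();
        let value = self.read_value();
        (is_alive, entry_start, value)
    }
}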
- async fn next_at_column(&mut self, column: Column) -> Result> + async fn next_at_column(&mut self, column: Column) -> Result> where T: Decode + Send { + let file_position = self.current_file_position().await?; let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) }; let file_position_at_start_of_data = self.current_file_position().await?; @@ -194,9 +195,20 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { self.seek_to(file_position_at_start_of_data).await?; self.jump_from_start_of_entry_data_to_next_entry(&entry_header).await?; - Ok(Some((entry_header, value))) + Ok(Some((entry_header, file_position, value))) } + async fn next_alive_at_column(&mut self, column: Column) -> Result> + where T: Decode + Send + { + while let Some((header, file_position, t)) = self.next_at_column(column).await? { + if !header.is_deleted { + return Ok(Some((header, file_position, t))) + } + } + Ok(None) + } + async fn next_alive(&mut self) -> Result>> where T: Decode { @@ -213,7 +225,7 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { where T: Decode + PartialEq + Send + Sync { let mut file_position = self.current_file_position().await?; - while let Some((_, t)) = self.next_at_column(column).await? { + while let Some((_, _, t)) = self.next_alive_at_column(column).await? { if &t == t0 { // go back and decode the whole entry self.seek_to(file_position).await?; @@ -225,6 +237,16 @@ pub trait CursorWithStoreHeader: PrimitiveCursor { Ok(None) } + async fn find_all_eq_bruteforce(&mut self, column: Column, t0: &T) -> Result>> + where T: Decode + PartialEq + Send + Sync + { + let mut entries = vec![]; + while let Some(entry) = self.find_first_eq_bruteforce(column, t0).await? { + entries.push(entry) + } + Ok(entries) + } + // ===Debugging=== async fn read_entries(&mut self) -> Result<()> where T: Decode + std::fmt::Debug @@ -271,6 +293,16 @@ pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWri Ok(()) } + async fn set_header(&mut self, header: &StoreHeader) -> Result<()> + where T: Send + { + self.seek_to_start().await?; + let encoded_header: Vec = header.encode()?; + self.write_bytes(&encoded_header).await?; + + Ok(()) + } + // ===Append Entry=== // Moves cursor to the end. @@ -320,9 +352,16 @@ pub trait CursorWithAccessToIndex: CursorWithStoreHeader { } } - // TODO: I also need the global find - async fn lookup(&mut self, column: Column, k: &T) -> Result>> { - todo!() + async fn select_entries_where_eq(&mut self, column: Column, value: &T) -> Result>> + where T: Encode + Decode + Ord + Send + Sync + { + if self.header().is_column_indexed(column) { + println!("INDEXED LOOKUP"); + self.index_lookup(column, value).await + } else { + println!("BRUTE-FORCE LOOKUP"); + self.find_all_eq_bruteforce(column, value).await + } } } @@ -664,11 +703,25 @@ impl <'cursor, T> WriteCursor<'cursor, T> } // ===Indexing=== - async fn insert_to_index(&mut self, t: T, file_position: FilePosition) -> Result> - where T: Encode + Decode + Ord + Send + Sync + // WARNING: Assumes the column is NOT indexable. 
+ pub async fn attach_index(&mut self, column: Column) -> Result<()> + where T: Ord + Decode + Encode + Send + Sync { - // let x = self.primary_index.insert(t, file_position).await?; - todo!() + // New Index + let index = Store::create_empty_index_at(&self.header, column).await?; + self.indexes[column as usize] = Some(index); + + // Mark column as indexed + self.header.make_column_indexed(column); + self.set_header(&self.header.clone()).await?; + + // Build index + self.seek_to_start_of_data().await?; + while let Some((_, file_position, value)) = self.next_alive_at_column(column).await? { + self.insert_into_index(column, value, file_position).await? + } + + Ok(()) } async fn delete_from_index(&mut self, t: T, file_position: FilePosition) -> Result> diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index a80937b..4adf004 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -6,6 +6,7 @@ pub enum Error { EncodeError(bincode::error::EncodeError), AttemptToIndexNonIndexableColumn(Column), IndexIsStoringEofFilePosition(Column), + ColumnAlreadyIndexed(Column), IoError(std::io::Error), InvalidStoreHeader, } diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 21d560d..55bf693 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -148,11 +148,38 @@ async fn main() -> Result<()> { { let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let entries = cursor.index_lookup(0, &1).await.map_err(|e| e.to_io_or_panic())?; + let column = 0; + let value = 1; + let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; println!("ARE INDEXES WORKING???"); println!("{:?}", entries); } + { + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let column = 1; + let value = 2; + let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; + println!("ARE INDEXES WORKING???"); + println!("{:?}", entries); + } + + { + let column = 1; + println!("BUILDING AN INDEX"); + store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; + println!("INDEX BUILT!"); + + let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + let value = 2; + let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; + println!("ARE INDEXES WORKING???"); + println!("{:?}", entries); + } + + + + // { // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; // let column = 3; diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index c631228..94ac5bd 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -184,9 +184,16 @@ impl Store { WriteCursor::new(self).await } - pub async fn make_indexable(&mut self, column: Column) -> Result<()> { - // Creates an index from scratch at above column - todo!() + // ===Indexes=== + pub async fn attach_index(&mut self, column: Column) -> Result<()> + where T: Ord + Decode + Encode + Send + Sync + { + if self.header.is_column_indexed(column) { + Err(Error::ColumnAlreadyIndexed(column)) + } else { + let mut cursor = self.write_cursor().await?; + cursor.attach_index(column).await + } } // For debugging. 
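Taken together with `select_entries_where_eq` from earlier in this patch, the flow is: `attach_index` creates an empty index file, flips the header flag, persists the header, then backfills by scanning the alive rows; from then on, equality lookups on that column take the indexed path instead of brute force. Hypothetical usage mirroring the main.rs driver (column and value numbers are illustrative, error plumbing elided):

// let mut store: Store<u32> = Store::connect(TABLE_PATH).await?;
// store.attach_index(1).await?;                    // scan + backfill column 1
// store.attach_index(1).await?;                    // -> Err(ColumnAlreadyIndexed(1))
// let mut cursor = store.read_cursor().await?;
// let rows = cursor.select_entries_where_eq(1, &2).await?; // indexed path now

One consequence of the ordering worth noting: the header flag is persisted before the backfill scan finishes, so a crash in between would seem to leave a column marked as indexed while its index file is still incomplete.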
diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/store_header.rs index 31b23f0..004e26a 100644 --- a/storage_engine/src/store_header.rs +++ b/storage_engine/src/store_header.rs @@ -115,4 +115,12 @@ impl StoreHeader { self.deleted_count += 1; self.deleted_count } + + pub fn is_column_indexed(&self, column: Column) -> bool { + self.indexed_columns[column as usize] + } + + pub fn make_column_indexed(&mut self, column: Column) { + self.indexed_columns[column as usize] = true + } } From 1086b2fc5e21a121019dded812f1c56909f1fc14 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 01:04:10 +0100 Subject: [PATCH 33/43] Add indexing to deletion --- storage_engine/src/cursor.rs | 81 ++++++++++++++++++++++-------------- storage_engine/src/main.rs | 8 ++-- 2 files changed, 52 insertions(+), 37 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index f15a80a..13054da 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -400,7 +400,6 @@ pub trait CursorWithWriteAccessToIndex: CursorWithAccessToIndex + CursorWi } async fn insert_entry(&mut self, entry: Entry) -> Result - // TODO: Why is 'async_trait necessary? where T: Encode + Decode + Ord + Send + Sync + 'async_trait { let file_position = self.append_entry_no_indexing(&entry).await?; @@ -415,6 +414,18 @@ pub trait CursorWithWriteAccessToIndex: CursorWithAccessToIndex + CursorWi Ok(file_position) } + + async fn delete_entry_values_from_indexes(&mut self, entry: EntryDetailed) -> Result<()> + where T: Encode + Decode + Ord + Send + Sync + 'async_trait + { + for (column, (value, should_index)) in entry.data.into_iter().zip(self.header().indexed_columns.clone()).enumerate() { + if should_index { + // SAFETY: If should_index is true, then the column is indexable. + self.delete_from_index(column as Column, value, entry.file_position).await? + } + } + Ok(()) + } } @@ -591,26 +602,39 @@ impl <'cursor, T> WriteCursor<'cursor, T> // ===Deletion=== pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> - where T: Send + Decode + Encode + where T: Encode + Decode + Ord + Send + Sync { self.seek_to(file_position).await?; let mut entry_header = self.read_entry_header().await?; if entry_header.is_deleted { Ok(()) } else { + // Update store and entry headers self.increment_deleted_count().await?; self.seek_to(file_position).await?; entry_header.is_deleted = true; self.set_new_entry_header(entry_header.into()).await?; + // Update index + self.seek_to(file_position).await?; + match self.next().await? { + Some(entry) => { + self.delete_entry_values_from_indexes(entry).await? + }, + None => { + // SAFETY: We just modified its header, so it must exist. + unreachable!() + } + } + self.attempt_garbage_collection_if_necessary().await?; Ok(()) } } async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result>> - where T: Decode + Encode + PartialEq + Send + Sync + where T: Encode + Decode + Ord + Send + Sync { let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?; if let Some(entry) = maybe_entry { @@ -621,6 +645,28 @@ impl <'cursor, T> WriteCursor<'cursor, T> } } + // ===Indexing=== + // WARNING: Assumes the column is NOT indexable. 
+ pub async fn attach_index(&mut self, column: Column) -> Result<()> + where T: Ord + Decode + Encode + Send + Sync + { + // New Index + let index = Store::create_empty_index_at(&self.header, column).await?; + self.indexes[column as usize] = Some(index); + + // Mark column as indexed + self.header.make_column_indexed(column); + self.set_header(&self.header.clone()).await?; + + // Build index + self.seek_to_start_of_data().await?; + while let Some((_, file_position, value)) = self.next_alive_at_column(column).await? { + self.insert_into_index(column, value, file_position).await? + } + + Ok(()) + } + // ===Garbage Collection=== async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> where T: Send + Decode + Encode @@ -701,34 +747,5 @@ impl <'cursor, T> WriteCursor<'cursor, T> Ok(cursor_to_intermediate) } - - // ===Indexing=== - // WARNING: Assumes the column is NOT indexable. - pub async fn attach_index(&mut self, column: Column) -> Result<()> - where T: Ord + Decode + Encode + Send + Sync - { - // New Index - let index = Store::create_empty_index_at(&self.header, column).await?; - self.indexes[column as usize] = Some(index); - - // Mark column as indexed - self.header.make_column_indexed(column); - self.set_header(&self.header.clone()).await?; - - // Build index - self.seek_to_start_of_data().await?; - while let Some((_, file_position, value)) = self.next_alive_at_column(column).await? { - self.insert_into_index(column, value, file_position).await? - } - - Ok(()) - } - - async fn delete_from_index(&mut self, t: T, file_position: FilePosition) -> Result> - where T: Encode + Decode + Ord + Send + Sync - { - // let x = self.primary_index.delete(t, file_position).await?; - todo!() - } } diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 55bf693..46e6552 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -166,9 +166,9 @@ async fn main() -> Result<()> { { let column = 1; - println!("BUILDING AN INDEX"); - store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; - println!("INDEX BUILT!"); + // println!("BUILDING AN INDEX"); + // store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; + // println!("INDEX BUILT!"); let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; let value = 2; @@ -178,8 +178,6 @@ async fn main() -> Result<()> { } - - // { // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; // let column = 3; From f3fc67cbbc6a2d3d1fc71317a9d0189a08287dea Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 01:34:16 +0100 Subject: [PATCH 34/43] Implement Garbage Collection --- storage_engine/src/cursor.rs | 48 ++++++++++++++++++++---------- storage_engine/src/entry.rs | 8 +++-- storage_engine/src/entry_header.rs | 2 +- storage_engine/src/index.rs | 5 ++++ 4 files changed, 43 insertions(+), 20 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 13054da..45b37fc 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -602,7 +602,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> // ===Deletion=== pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> - where T: Encode + Decode + Ord + Send + Sync + where T: Encode + Decode + Ord + Send + Sync + Clone + Ord { self.seek_to(file_position).await?; let mut entry_header = self.read_entry_header().await?; @@ -634,7 +634,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> } async 
fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result>> - where T: Encode + Decode + Ord + Send + Sync + where T: Encode + Decode + Ord + Send + Sync + Clone + Ord { let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?; if let Some(entry) = maybe_entry { @@ -669,7 +669,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> // ===Garbage Collection=== async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> - where T: Send + Decode + Encode + where T: Send + Decode + Encode + Clone + Ord { // TODO: What should be the policy? Counting size of garbage? Counting how many entries are // garbage? @@ -680,11 +680,21 @@ impl <'cursor, T> WriteCursor<'cursor, T> } async fn initiate_garbage_collection(&mut self) -> Result - where T: Send + Decode + Encode + where T: Send + Decode + Encode + Clone + Ord { let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; - // This will be a vector of such BTree maps... - let in_memory_index: BTreeMap> = BTreeMap::new(); + // Since garbage collection changes FilePositions of live entries, we need to update the + // indexes too. + + let mut in_memory_indexes: Vec>>> = Vec::with_capacity(self.header.number_of_columns); + for column in 0..self.header.number_of_columns { + if self.header.is_column_indexed(column as Column) { + let in_memory_index = BTreeMap::new(); + in_memory_indexes.push(Some(in_memory_index)) + } else { + in_memory_indexes.push(None) + } + } // We'll dump all alive entries into a new file. let mut entries_deleted = 0; @@ -692,25 +702,31 @@ impl <'cursor, T> WriteCursor<'cursor, T> while let Some(live_entry) = self.next_alive().await? { entries_deleted += 1; let file_position = cursor_to_intermediate.append_entry_no_indexing(&live_entry.forget()).await?; - // TODO: Start indexing all of the indexable columns from scratch. + + // Update index. (Wouldn't it be nice if we had `for let ...`?) + for (maybe_in_memory_index, value) in in_memory_indexes.iter_mut().zip(&live_entry.data) { + if let Some(in_memory_index) = maybe_in_memory_index { + in_memory_index.entry(value.clone()).or_insert_with(HashSet::new).insert(file_position); + } + } } } - // TODO: Create a new indexes from in_memory_index. - - // Afterwards we swap the files, and delete the garbage. - // TODO: - // What needs to be done? - // 1. We take self cursor and mutate it + // ===swap=== + // swapping indexes + // Update indexes on disk. + for (column, maybe_in_memory_index) in in_memory_indexes.into_iter().enumerate() { + if let Some(in_memory_index) = maybe_in_memory_index { + let index = self.mut_index_at(column as Column); + index.reset(in_memory_index).await?; + } + } // swapping headers self.header.deleted_count = 0; self.header.total_count = cursor_to_intermediate.header.total_count; - // TODO: We'll actually have to iterate through all the indexes and swap each of them. 
- self.indexes = todo!(); self.file = cursor_to_intermediate.file; - self.eof_file_position = cursor_to_intermediate.eof_file_position; Ok(entries_deleted) diff --git a/storage_engine/src/entry.rs b/storage_engine/src/entry.rs index ed0c1fa..d22a46f 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/entry.rs @@ -49,10 +49,12 @@ impl EntryDetailed { Ok(EntryDetailed { header, file_position, data }) } - pub fn forget(self) -> Entry { + pub fn forget(&self) -> Entry + where T: Clone + { Entry { - header: self.header.into(), - data: self.data, + header: self.header.clone().into(), + data: self.data.clone(), } } } diff --git a/storage_engine/src/entry_header.rs b/storage_engine/src/entry_header.rs index 7c8d626..cee5496 100644 --- a/storage_engine/src/entry_header.rs +++ b/storage_engine/src/entry_header.rs @@ -8,7 +8,7 @@ pub struct EntryHeader { pub is_deleted: bool, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct EntryHeaderWithDataSize { pub is_deleted: bool, pub data_sizes: Vec, // vec![5, 6, 20] means that column 0 stores 5 bytes, column 1 stores 6 diff --git a/storage_engine/src/index.rs b/storage_engine/src/index.rs index b5c42c1..6f69b76 100644 --- a/storage_engine/src/index.rs +++ b/storage_engine/src/index.rs @@ -100,6 +100,11 @@ where Ok(()) } + pub async fn reset(&mut self, data: BTreeMap>) -> Result<()> { + self.data = data; + self.sync_to_disk().await + } + async fn append_to_file(&mut self, key: &K, value: &V) -> Result<()> { let mut encoded = Vec::new(); encoded.extend(encode(key)?); From c0a3ee08b8d53da4fd7c1a74563b038d359214dc Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 02:26:54 +0100 Subject: [PATCH 35/43] Forgot to seek before garbage collection --- storage_engine/src/cursor.rs | 70 ++++++++++++++++---- storage_engine/src/main.rs | 124 +++++++++++++++++++++-------------- 2 files changed, 131 insertions(+), 63 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 45b37fc..8b6bfbb 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -1,5 +1,6 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; use tokio::fs::{File, OpenOptions}; +use tokio::fs; use std::path::Path; use std::marker::PhantomData; use std::collections::{BTreeMap, HashSet}; @@ -17,7 +18,7 @@ use crate::store_header::StoreHeader; use crate::storage_engine::{Store, FilePosition, Column, Result, StoreIndexes, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; use crate::index::Index; - +const GARBAGE_COLLECTION_TRIGGER: usize = 100; // ===Concrete Cursors=== pub struct ReadCursor<'a, T> { @@ -327,12 +328,12 @@ pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWri pub trait CursorWithAccessToIndex: CursorWithStoreHeader { fn indexes(&mut self) -> &[Option>]; - async fn index_lookup(&mut self, column: Column, k: &T) -> Result>> + async fn index_lookup(&mut self, column: Column, value: &T) -> Result>> where T: Encode + Decode + Ord + Send + Sync { match &self.indexes()[column as usize] { Some(index) => { - let file_positions = index.lookup(k).await?.unwrap_or_else(|| HashSet::new()); + let file_positions = index.lookup(value).await?.unwrap_or_else(|| HashSet::new()); let mut entries: Vec> = vec![]; for &file_position in file_positions.iter() { match self.read_entry_at(file_position).await? 
{ @@ -601,9 +602,10 @@ impl <'cursor, T> WriteCursor<'cursor, T> } // ===Deletion=== - pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> + pub async fn mark_deleted_at(&mut self, file_position: FilePosition, enable_garbage_collector: bool) -> Result<()> where T: Encode + Decode + Ord + Send + Sync + Clone + Ord { + println!("MARKING {} as DELETED", file_position); self.seek_to(file_position).await?; let mut entry_header = self.read_entry_header().await?; if entry_header.is_deleted { @@ -628,23 +630,59 @@ impl <'cursor, T> WriteCursor<'cursor, T> } } - self.attempt_garbage_collection_if_necessary().await?; + if enable_garbage_collector { + self.attempt_garbage_collection_if_necessary().await?; + } Ok(()) } } - async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result>> - where T: Encode + Decode + Ord + Send + Sync + Clone + Ord + async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T, enable_garbage_collector: bool) -> Result>> + where T: Encode + Decode + Ord + Send + Sync + Clone { let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?; if let Some(entry) = maybe_entry { - self.mark_deleted_at(entry.file_position).await?; + self.mark_deleted_at(entry.file_position, enable_garbage_collector).await?; Ok(Some(entry)) } else { Ok(maybe_entry) } } + // Doesn't update indexes. + async fn find_all_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result + where T: Encode + Decode + Ord + Send + Sync + Clone + { + let mut count = 0; + while let Some(_) = self.find_first_eq_bruteforce_and_delete(column, t0, false).await? { + count += 1; + } + Ok(count) + } + + pub async fn delete_entries_where_eq(&mut self, column: Column, value: &T, enable_garbage_collector: bool) -> Result + where T: Encode + Decode + Ord + Send + Sync + Clone + { + let count = + if self.header().is_column_indexed(column) { + println!("DELETION: INDEXED LOOKUP"); + let entries = self.index_lookup(column, value).await?; + let count = entries.len(); + for entry in entries { + self.mark_deleted_at(entry.file_position, false).await? + } + count + } else { + println!("DELETION: BRUTE-FORCE LOOKUP"); + let count = self.find_all_eq_bruteforce_and_delete(column, value).await?; + count + }; + if enable_garbage_collector { + self.attempt_garbage_collection_if_necessary().await?; + } + Ok(count) + } + // ===Indexing=== // WARNING: Assumes the column is NOT indexable. pub async fn attach_index(&mut self, column: Column) -> Result<()> @@ -671,15 +709,14 @@ impl <'cursor, T> WriteCursor<'cursor, T> async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> where T: Send + Decode + Encode + Clone + Ord { - // TODO: What should be the policy? Counting size of garbage? Counting how many entries are - // garbage? - if self.header.deleted_count > 100 { + if self.header.deleted_count > GARBAGE_COLLECTION_TRIGGER { + println!("=======START GARBAGE COLLETOR===="); self.initiate_garbage_collection().await?; } Ok(()) } - async fn initiate_garbage_collection(&mut self) -> Result + pub async fn initiate_garbage_collection(&mut self) -> Result where T: Send + Decode + Encode + Clone + Ord { let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; @@ -698,6 +735,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> // We'll dump all alive entries into a new file. let mut entries_deleted = 0; + self.seek_to_start_of_data().await?; { while let Some(live_entry) = self.next_alive().await? 
{ entries_deleted += 1; @@ -729,6 +767,14 @@ impl <'cursor, T> WriteCursor<'cursor, T> self.file = cursor_to_intermediate.file; self.eof_file_position = cursor_to_intermediate.eof_file_position; + // swap files on disk + // current file + let path_to_table = Path::new(&self.header.table_folder); + let path_to_rows = path_to_table.join(ROWS_FILE_NAME); + let path_to_intermediate_rows = path_to_table.join(GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME); + fs::remove_file(path_to_rows.clone()).await?; + fs::rename(path_to_intermediate_rows, path_to_rows).await?; + Ok(entries_deleted) } diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index 46e6552..f3a32e0 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -82,6 +82,20 @@ async fn append_bunch_of_entries(store: &mut Store) -> Result<()> { Ok(()) } +async fn test_garbage_collection(store: &mut Store) -> Result<()> { + let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; + // cursor.delete_entries_where_eq() + // 1. mark a bunch of entries as deleted + let column = 0; + let value = 1; + // cursor.delete_entries_where_eq(column, &value, true).await.map_err(|e| e.to_io_or_panic())?; + // let value = 50; + // cursor.delete_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; + + // cursor.initiate_garbage_collection().await.map_err(|e| e.to_io_or_panic())?; + Ok(()) +} + #[tokio::main] async fn main() -> Result<()> { println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); @@ -117,65 +131,73 @@ async fn main() -> Result<()> { cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; } - { - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - } + test_garbage_collection(&mut store).await?; { let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let column = 2; - let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); - let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - println!("{:?}", x); + cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; } - { - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let column = 0; - let value = 1; - let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - println!("ARE INDEXES WORKING???"); - println!("{:?}", entries); - } - { - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let column = 1; - let value = 2; - let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - println!("ARE INDEXES WORKING???"); - println!("{:?}", entries); - } + // { + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + 
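Two observations on the compaction pass above. First, `entries_deleted` increments once per live entry copied, so the value handed back is really the number of survivors; a name like `entries_kept` would match what it counts. Second, the swap ends with `remove_file` followed by `rename`, which leaves a window in which no rows file exists on disk; on POSIX systems `rename` already replaces an existing destination atomically, so a single rename would close that window. A sketch under that assumption, reusing the crate's path constants:

use std::path::Path;
use tokio::fs;
use crate::storage_engine::{GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME, ROWS_FILE_NAME};

async fn swap_in_compacted(table_folder: &Path) -> std::io::Result<()> {
    let rows = table_folder.join(ROWS_FILE_NAME);
    let intermediate = table_folder.join(GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME);
    // One atomic step: installs the compacted file and drops the old one.
    fs::rename(&intermediate, &rows).await
}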
// let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // } - { - let column = 1; - // println!("BUILDING AN INDEX"); - // store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; - // println!("INDEX BUILT!"); + // { + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + // let column = 2; + // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; + // println!("{:?}", x); + // } - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - let value = 2; - let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - println!("ARE INDEXES WORKING???"); - println!("{:?}", entries); - } + // { + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + // let column = 0; + // let value = 1; + // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; + // println!("ARE INDEXES WORKING???"); + // println!("{:?}", entries); + // } + + // { + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + // let column = 1; + // let value = 2; + // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; + // println!("ARE INDEXES WORKING???"); + // println!("{:?}", entries); + // } + + // { + // let column = 1; + // // println!("BUILDING AN INDEX"); + // // store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; + // // println!("INDEX BUILT!"); + + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + // let value = 2; + // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; + // println!("ARE INDEXES WORKING???"); + // println!("{:?}", entries); + // } // { From 1618bffb85d1a802a1ac762d1643911a4a03ea1b Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 03:29:42 +0100 Subject: [PATCH 36/43] Cleanup --- storage_engine/src/binary_coding.rs | 10 +-- storage_engine/src/cursor.rs | 36 ++-------- storage_engine/src/error.rs | 3 - storage_engine/src/main.rs | 23 +++--- storage_engine/src/storage_engine.rs | 101 +-------------------------- storage_engine/src/store_header.rs | 1 + 6 files changed, 22 insertions(+), 152 deletions(-) diff --git a/storage_engine/src/binary_coding.rs b/storage_engine/src/binary_coding.rs index 5e6378d..4e66623 100644 --- a/storage_engine/src/binary_coding.rs +++ b/storage_engine/src/binary_coding.rs @@ -69,11 +69,12 @@ pub fn decode_sequence(len: usize, bytes: &[u8]) -> Result, bi } +#[allow(dead_code)] fn example_encoding_decoding() { - let xs: 
Vec = vec![123, 250, 256, 123, 123, 123]; - let xs: Vec = vec![]; - let xs: Vec = vec![123]; - let xs: Vec = vec![123, 250]; + let _xs: Vec = vec![123, 250, 256, 123, 123, 123]; + let _xs: Vec = vec![]; + let _xs: Vec = vec![123]; + let _xs: Vec = vec![123, 250]; let xs: Vec = vec!["foo".to_string(), "bar".to_string()]; @@ -87,4 +88,3 @@ fn example_encoding_decoding() { let dxs = decode_vector::(&exs[..]).unwrap(); println!("decoded {:?}", dxs); } - diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 8b6bfbb..5acbfbe 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -49,8 +49,7 @@ pub struct AppendOnlyCursor { // ===Traits=== #[async_trait] -// TODO: Make this private -pub trait PrimitiveCursor { +pub(crate) trait PrimitiveCursor { fn file(&mut self) -> &mut File; fn eof_file_position(&self) -> FilePosition; @@ -100,7 +99,7 @@ pub trait PrimitiveCursor { } #[async_trait] -pub trait PrimitiveWriteCursor: PrimitiveCursor { +pub(crate) trait PrimitiveWriteCursor: PrimitiveCursor { async fn write_bytes(&mut self, bytes: &[u8]) -> Result { Ok(self.file().write(bytes).await?) } @@ -536,8 +535,6 @@ impl <'cursor, T> ReadCursor<'cursor, T> { impl <'cursor, T> WriteCursor<'cursor, T> -// TODO: Consider adding this manually to wher eit is really needed - where T: Sync { // 'store lives atleast as long as 'cursor pub async fn new<'store: 'cursor>(store: &'store mut Store) -> Result @@ -566,31 +563,6 @@ impl <'cursor, T> WriteCursor<'cursor, T> Ok(cursor) } - pub async fn connect<'header: 'cursor, 'indexes: 'cursor>(path_to_rows: &str, header: &'header mut StoreHeader, indexes: &'indexes mut StoreIndexes) -> Result - where T: Send - { - let file: File = - OpenOptions::new() - .read(true) - .write(true) - .open(path_to_rows) - .await?; - - let mut cursor = Self { - header, - file, - indexes, - - eof_file_position: 0, - }; - let eof_file_position: FilePosition = cursor.seek_to_end().await?; - cursor.eof_file_position = eof_file_position; - - cursor.seek_to_start_of_data().await?; - - Ok(cursor) - } - // ===Entry Header Manipulation=== // assumes we are at the start of valid entry. 
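    // (callers such as mark_deleted_at seek to the entry's file position first)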
async fn set_new_entry_header(&mut self, entry_header: EntryHeader) -> Result<()> @@ -707,7 +679,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> // ===Garbage Collection=== async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> - where T: Send + Decode + Encode + Clone + Ord + where T: Send + Sync + Decode + Encode + Clone + Ord { if self.header.deleted_count > GARBAGE_COLLECTION_TRIGGER { println!("=======START GARBAGE COLLETOR===="); @@ -717,7 +689,7 @@ impl <'cursor, T> WriteCursor<'cursor, T> } pub async fn initiate_garbage_collection(&mut self) -> Result - where T: Send + Decode + Encode + Clone + Ord + where T: Send + Sync + Decode + Encode + Clone + Ord { let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; // Since garbage collection changes FilePositions of live entries, we need to update the diff --git a/storage_engine/src/error.rs b/storage_engine/src/error.rs index 4adf004..47bf5cc 100644 --- a/storage_engine/src/error.rs +++ b/storage_engine/src/error.rs @@ -8,7 +8,6 @@ pub enum Error { IndexIsStoringEofFilePosition(Column), ColumnAlreadyIndexed(Column), IoError(std::io::Error), - InvalidStoreHeader, } #[derive(Debug)] @@ -49,5 +48,3 @@ impl From for Error { Self::IoError(err) } } - - diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs index f3a32e0..4681cb2 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -9,7 +9,7 @@ mod store_header; use crate::entry::{Entry, EntryDetailed}; use crate::storage_engine::{Store, FilePosition}; -use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteStoreHeader, CursorWithWriteAccessToIndex, CursorWithAccessToIndex}; +use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteAccessToIndex}; type Data = u32; @@ -18,7 +18,7 @@ const TABLE_PATH: &'static str = "test_table"; type Result = std::result::Result; async fn create_store() -> Result> { - let mut store: Store = Store::new(TABLE_PATH, 5, 0).await.map_err(|e| e.to_io_or_panic())?; + let store: Store = Store::new(TABLE_PATH, 5, 0).await.map_err(|e| e.to_io_or_panic())?; println!("CREATED"); // println!("THE STORE: {:?}", store); // println!("THE BYTES: {:?}", store.read_all_bytes().await?); @@ -27,7 +27,7 @@ async fn create_store() -> Result> { } async fn connect_store() -> Result> { - let mut store: Store = Store::connect(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; + let store: Store = Store::connect(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; println!("CONNECTED"); // println!("THE STORE: {:?}", store); // println!("THE BYTES: {:?}", store.read_all_bytes().await?); @@ -56,10 +56,9 @@ async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: Entry) -> async fn read_entry(cursor: &mut ReadCursor<'_, Data>, file_position: FilePosition) -> Result>> { println!("READING ENTRY at file_position={}", file_position); - // let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; - // println!("ENTRY: {:?}", entry); - // Ok(entry) - todo!() + let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; + println!("ENTRY: {:?}", entry); + Ok(entry) } async fn append_bunch_of_entries(store: &mut Store) -> Result<()> { @@ -88,7 +87,7 @@ async fn test_garbage_collection(store: &mut Store) -> Result<()> { // 1. 
mark a bunch of entries as deleted let column = 0; let value = 1; - // cursor.delete_entries_where_eq(column, &value, true).await.map_err(|e| e.to_io_or_panic())?; + cursor.delete_entries_where_eq(column, &value, true).await.map_err(|e| e.to_io_or_panic())?; // let value = 50; // cursor.delete_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; @@ -133,10 +132,10 @@ async fn main() -> Result<()> { test_garbage_collection(&mut store).await?; - { - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; - } + // { + // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; + // cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; + // } // { diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 94ac5bd..74ba9ad 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -2,7 +2,6 @@ use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::fs::{File, OpenOptions, DirBuilder}; use tokio::fs; use std::path::{Path, PathBuf}; -use std::marker::PhantomData; use bincode::{Decode, Encode}; use crate::error::Error; @@ -25,9 +24,6 @@ pub struct Store { pub type StoreIndexes = Vec>>; -pub type PositionOfValue = FilePosition; -pub type PositionOfRow = FilePosition; - //===Store=== pub async fn store_exists(table_folder: &str) -> Result { @@ -197,6 +193,7 @@ impl Store { } // For debugging. + #[allow(dead_code)] pub async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> where T: Send + Sync { @@ -205,99 +202,3 @@ impl Store { Ok(bytes) } } - -// ===Store Header=== - -// ====Entry==== - - -// impl StorageEngine for ColumnStore { -// async fn append(&mut self, id: Index, entry: Row) -> Result - -// async fn get_all(&self) -> ??? -// async fn get_eq(&self, column: Column, value: T) -> ??? - -// async fn delete_all(&mut self) -// async fn delete_eq(&mut self, column: Column, value: T) -> ??? -// } - -// struct Error { -// } - - -// Selected( -// &'a TableSchema, -// ColumnSelection, -// TODO: Don't do the Box(dyn Iterator<...>) -// you'll have a concrete implementation of Iterator, and that's what you'll use -// Box + 'a + Send>, -// ), - - - -// #[async_trait] -// trait StorageEngine -// where T: Encode + Decode -// { -// async fn append(&mut self, id: Index, entry: Row) -> Result - -// async fn get_all(&self) -> ??? -// async fn get_eq(&self, column: Column, value: T) -> ??? - -// async fn delete_all(&mut self) -// async fn delete_eq(&mut self, column: Column, value: T) -> ??? -// } - -// #[cfg(test)] -// mod tests { -// #[test] -// fn hello_test() { -// assert!(true); -// } -// } - -// let sroage_engine = STorageEngine::new("users") -// let mut next_position = 0 - - -// type FilePosition = usize; - - -// type StoreFile = Vec; -// type IndexFile = ??? - -// struct IndexEntry { - -// } - - -// #00000 [false, u26, "Arnold", "schwarzenegger", "gettothechoppa@yahoo.com"] #5120000 [true, u27, "Arnold", "Vosloo", "avosloo@aol.com"] -// #00000 [true, u27, "Arnold", "Vosloo", "avosloo@aol.com"] - - -// at #00000 512 kb deleted, -// ... 
- - - - -// [(u26, [#00000]), (u27, [#5120000])] -// [("Arnold", [#000000, #5120000]), ("Arnfsdaf", []), ("Adasdsd", []), ("Bdsad", [])] -// // basically always keep indexes in memory and on write always sync on disk - - - - -// CREATE INDEX usersname ON "users" (name); - -// INSERT INTO users (id, name, surname, email) VALUES (u26, "Arnold", "schwarzenegger", "gettothechoppa@yahoo.com"); -// INSERT INTO users (id, name, surname, email) VALUES (u27, "Arnold", "Vosloo", "avosloo@aol.com"); - - -// SELECT * FROM users WHERE id=u26; - -// SELECT * FROM users WHERE name="Arnold"; - - -// SELECT * FROM cars; -// DELETE FROM users WHERE name="Arnold"; diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/store_header.rs index 004e26a..890b198 100644 --- a/storage_engine/src/store_header.rs +++ b/storage_engine/src/store_header.rs @@ -35,6 +35,7 @@ impl StoreHeader { pub const DELETED_COUNT_OFFSET: usize = Self::NUMBER_OF_COLUMNS_OFFSET + Self::NUMBER_OF_COLUMNS_SIZE; pub const TOTAL_COUNT_OFFSET: usize = Self::DELETED_COUNT_OFFSET + Self::DELETED_COUNT_SIZE; pub const PRIMARY_COLUMN_OFFSET: usize = Self::TOTAL_COUNT_OFFSET + Self::TOTAL_COUNT_SIZE; + #[allow(dead_code)] pub const INDEXED_COLUMNS_OFFSET: usize = Self::PRIMARY_COLUMN_OFFSET + Self::PRIMARY_COLUMN_SIZE; fn indexed_columns_size(number_of_columns: usize) -> usize { From b13d2f04cd24b8343aee7d9089e261c9b1656486 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 03:35:43 +0100 Subject: [PATCH 37/43] Introduce segments module --- storage_engine/src/cursor.rs | 8 ++++---- storage_engine/src/lib.rs | 4 +--- storage_engine/src/main.rs | 6 ++---- storage_engine/src/{ => segments}/entry.rs | 2 +- storage_engine/src/{ => segments}/entry_header.rs | 0 storage_engine/src/segments/mod.rs | 3 +++ storage_engine/src/{ => segments}/store_header.rs | 0 storage_engine/src/storage_engine.rs | 2 +- 8 files changed, 12 insertions(+), 13 deletions(-) rename storage_engine/src/{ => segments}/entry.rs (95%) rename storage_engine/src/{ => segments}/entry_header.rs (100%) create mode 100644 storage_engine/src/segments/mod.rs rename storage_engine/src/{ => segments}/store_header.rs (100%) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 5acbfbe..84077fa 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -12,10 +12,10 @@ use bincode::{Decode, Encode}; use crate::binary_coding::{encode, decode}; use crate::error::{Error, DecodeErrorKind}; -use crate::entry::{Entry, EntryDetailed}; -use crate::entry_header::{EntryHeaderWithDataSize, EntryHeader}; -use crate::store_header::StoreHeader; -use crate::storage_engine::{Store, FilePosition, Column, Result, StoreIndexes, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; +use crate::segments::entry::{Entry, EntryDetailed}; +use crate::segments::entry_header::{EntryHeaderWithDataSize, EntryHeader}; +use crate::segments::store_header::StoreHeader; +use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; use crate::index::Index; const GARBAGE_COLLECTION_TRIGGER: usize = 100; diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs index 65f1a06..cae01f5 100644 --- a/storage_engine/src/lib.rs +++ b/storage_engine/src/lib.rs @@ -3,6 +3,4 @@ mod binary_coding; mod error; mod index; mod cursor; -mod entry; -mod entry_header; -mod store_header; +mod segments; diff --git a/storage_engine/src/main.rs 
b/storage_engine/src/main.rs index 4681cb2..b9f8b9b 100644 --- a/storage_engine/src/main.rs +++ b/storage_engine/src/main.rs @@ -3,11 +3,9 @@ mod binary_coding; mod error; mod index; mod cursor; -mod entry; -mod entry_header; -mod store_header; +mod segments; -use crate::entry::{Entry, EntryDetailed}; +use crate::segments::entry::{Entry, EntryDetailed}; use crate::storage_engine::{Store, FilePosition}; use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteAccessToIndex}; diff --git a/storage_engine/src/entry.rs b/storage_engine/src/segments/entry.rs similarity index 95% rename from storage_engine/src/entry.rs rename to storage_engine/src/segments/entry.rs index d22a46f..b42c490 100644 --- a/storage_engine/src/entry.rs +++ b/storage_engine/src/segments/entry.rs @@ -3,7 +3,7 @@ use bincode::{Decode, Encode}; use crate::binary_coding::{encode_sequence, encode_sequence_with_sizes, decode_sequence}; use crate::storage_engine::{Result, FilePosition}; use crate::error::{Error, DecodeErrorKind}; -use crate::entry_header::{EntryHeader, EntryHeaderWithDataSize}; +use crate::segments::entry_header::{EntryHeader, EntryHeaderWithDataSize}; #[derive(Debug)] pub struct Entry { diff --git a/storage_engine/src/entry_header.rs b/storage_engine/src/segments/entry_header.rs similarity index 100% rename from storage_engine/src/entry_header.rs rename to storage_engine/src/segments/entry_header.rs diff --git a/storage_engine/src/segments/mod.rs b/storage_engine/src/segments/mod.rs new file mode 100644 index 0000000..b78a646 --- /dev/null +++ b/storage_engine/src/segments/mod.rs @@ -0,0 +1,3 @@ +pub mod entry; +pub mod entry_header; +pub mod store_header; diff --git a/storage_engine/src/store_header.rs b/storage_engine/src/segments/store_header.rs similarity index 100% rename from storage_engine/src/store_header.rs rename to storage_engine/src/segments/store_header.rs diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 74ba9ad..26398f3 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -6,7 +6,7 @@ use bincode::{Decode, Encode}; use crate::error::Error; use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader}; -use crate::store_header::StoreHeader; +use crate::segments::store_header::StoreHeader; use crate::index::Index; From 61de195658443324cdd7bd343bdb013583a5118c Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:27:42 +0100 Subject: [PATCH 38/43] tests --- storage_engine/src/storage_engine.rs | 338 +++++++++++++++++++++++++++ 1 file changed, 338 insertions(+) diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs index 26398f3..01f87d0 100644 --- a/storage_engine/src/storage_engine.rs +++ b/storage_engine/src/storage_engine.rs @@ -202,3 +202,341 @@ impl Store { Ok(bytes) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::segments::entry::{Entry, EntryDetailed}; + use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteAccessToIndex, CursorWithAccessToIndex}; + + impl Drop for Store { + fn drop(&mut self) { + println!("DROPPING TEST FOLDER"); + let table_folder = self.header.table_folder.clone(); + // Seems no one has figured out how to do AsyncDrop yet. 
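+            // Drop can't be async, so this falls back to the blocking std::fs
+            // call below; fine in tests, but it would stall a runtime worker
+            // thread if a Store were ever dropped inside production async code.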
+ std::fs::remove_dir_all(table_folder).unwrap(); + } + } + + + #[tokio::test] + async fn test_create() { + type Data = u32; + + let table_path = "test_table_0"; + let number_of_columns = 5; + let primary_column = 0; + let store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + assert!(store.header.number_of_columns == number_of_columns); + assert!(store.header.total_count == 0); + assert!(store.header.deleted_count == 0); + assert!(store.header.primary_column == primary_column); + } + + #[tokio::test] + async fn test_insert() { + type Data = u32; + + let table_path = "test_table_1"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); + cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + cursor.insert_entry(entry1).await.unwrap(); + + assert!(store.header.total_count == 2); + } + } + + #[tokio::test] + async fn test_select_next() { + type Data = u32; + + let table_path = "test_table_2"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); + cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + cursor.insert_entry(entry1).await.unwrap(); + + assert!(store.header.total_count == 2); + } + + { + let mut cursor = store.read_cursor().await.unwrap(); + + let entry0 = cursor.next().await.unwrap().unwrap(); + let entry1 = cursor.next().await.unwrap().unwrap(); + + assert!(entry0.data == vec![1,2,3,4,5]); + assert!(entry1.data == vec![6,7,8,9,10]); + } + } + + #[tokio::test] + async fn test_select_all() { + type Data = u32; + + let table_path = "test_table_3"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); + cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + cursor.insert_entry(entry1).await.unwrap(); + + assert!(store.header.total_count == 2); + } + + { + let mut cursor = store.read_cursor().await.unwrap(); + + let mut entries = vec![]; + while let Some(entry) = cursor.next().await.unwrap() { + entries.push(entry) + } + + assert!(entries.len() == 2); + assert!(entries[0].data == vec![1,2,3,4,5]); + assert!(entries[1].data == vec![6,7,8,9,10]); + } + } + + #[tokio::test] + async fn test_select_eq() { + type Data = u32; + + let table_path = "test_table_4"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + let value = 200; + { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, value, 3, 4, 5]); + cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + cursor.insert_entry(entry1).await.unwrap(); + + let entry2: Entry = Entry::new(vec![11, 2, 10, 10, 10]); + cursor.insert_entry(entry2).await.unwrap(); + + let entry3: Entry = Entry::new(vec![1, value, 100, 50, 
40]); + cursor.insert_entry(entry3).await.unwrap(); + + assert!(store.header.total_count == 4); + } + + { + let mut cursor = store.read_cursor().await.unwrap(); + let column = 1; + + let entries = cursor.select_entries_where_eq(column, &value).await.unwrap(); + + assert!(entries.len() == 2); + assert!(entries[0].data == vec![1, value, 3, 4, 5]); + assert!(entries[1].data == vec![1, value, 100, 50, 40]); + } + } + + #[tokio::test] + async fn test_select_eq_indexed() { + type Data = u32; + + let table_path = "test_table_5"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + let column: Column = 1; + + assert!(store.indexes[column as usize].is_none()); + store.attach_index(column).await.unwrap(); + assert!(store.indexes[column as usize].is_some()); + + let value = 200; + { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, value, 3, 4, 5]); + cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + cursor.insert_entry(entry1).await.unwrap(); + + let entry2: Entry = Entry::new(vec![11, 2, 10, 10, 10]); + cursor.insert_entry(entry2).await.unwrap(); + + let entry3: Entry = Entry::new(vec![1, value, 100, 50, 40]); + cursor.insert_entry(entry3).await.unwrap(); + + assert!(store.header.total_count == 4); + } + + { + let mut cursor = store.read_cursor().await.unwrap(); + let column = 1; + + let entries = cursor.select_entries_where_eq(column, &value).await.unwrap(); + assert!(entries.len() == 2); + // Order may be non-deterministic. + assert!(entries[0].data[column as usize] == value); + assert!(entries[1].data[column as usize] == value); + } + } + + #[tokio::test] + async fn test_delete_entry() { + type Data = u32; + + let table_path = "test_table_6"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + let value = 200; + let (_file_position0, file_position1, _file_position2, _file_position3) = { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, value, 3, 4, 5]); + let file_position0 = cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + let file_position1 = cursor.insert_entry(entry1).await.unwrap(); + + let entry2: Entry = Entry::new(vec![11, 2, 10, 10, 10]); + let file_position2 = cursor.insert_entry(entry2).await.unwrap(); + + let entry3: Entry = Entry::new(vec![1, value, 100, 50, 40]); + let file_position3 = cursor.insert_entry(entry3).await.unwrap(); + + assert!(store.header.total_count == 4); + (file_position0, file_position1, file_position2, file_position3) + }; + + { + assert!(store.header.deleted_count == 0); + let mut cursor = store.write_cursor().await.unwrap(); + cursor.mark_deleted_at(file_position1, false).await.unwrap(); + assert!(store.header.deleted_count == 1); + } + } + + #[tokio::test] + async fn test_delete_where_eq() { + type Data = u32; + + let table_path = "test_table_7"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + let column: Column = 1; + + assert!(store.indexes[column as usize].is_none()); + store.attach_index(column).await.unwrap(); + assert!(store.indexes[column as usize].is_some()); + + let value = 200; + + let (_file_position0, _file_position1, 
_file_position2, _file_position3) = { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, value, 3, 4, 5]); + let file_position0 = cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + let file_position1 = cursor.insert_entry(entry1).await.unwrap(); + + let entry2: Entry = Entry::new(vec![11, 2, 10, 10, 10]); + let file_position2 = cursor.insert_entry(entry2).await.unwrap(); + + let entry3: Entry = Entry::new(vec![1, value, 100, 50, 40]); + let file_position3 = cursor.insert_entry(entry3).await.unwrap(); + + assert!(store.header.total_count == 4); + (file_position0, file_position1, file_position2, file_position3) + }; + + { + assert!(store.header.deleted_count == 0); + let mut cursor = store.write_cursor().await.unwrap(); + cursor.delete_entries_where_eq(column, &value, false).await.unwrap(); + assert!(store.header.deleted_count == 2); + } + } + + #[tokio::test] + async fn test_garbage_collection() { + type Data = u32; + + let table_path = "test_table_8"; + let number_of_columns = 5; + let primary_column = 0; + let mut store: Store = Store::new(table_path, number_of_columns, primary_column).await.unwrap(); + + let column: Column = 1; + + assert!(store.indexes[column as usize].is_none()); + store.attach_index(column).await.unwrap(); + assert!(store.indexes[column as usize].is_some()); + + let value = 200; + + let (_file_position0, _file_position1, _file_position2, _file_position3) = { + let mut cursor = store.write_cursor().await.unwrap(); + + let entry0: Entry = Entry::new(vec![1, value, 3, 4, 5]); + let file_position0 = cursor.insert_entry(entry0).await.unwrap(); + + let entry1: Entry = Entry::new(vec![6, 7, 8, 9, 10]); + let file_position1 = cursor.insert_entry(entry1).await.unwrap(); + + let entry2: Entry = Entry::new(vec![11, 2, 10, 10, 10]); + let file_position2 = cursor.insert_entry(entry2).await.unwrap(); + + let entry3: Entry = Entry::new(vec![1, value, 100, 50, 40]); + let file_position3 = cursor.insert_entry(entry3).await.unwrap(); + + assert!(store.header.total_count == 4); + (file_position0, file_position1, file_position2, file_position3) + }; + + { + assert!(store.header.deleted_count == 0); + let mut cursor = store.write_cursor().await.unwrap(); + cursor.delete_entries_where_eq(column, &value, false).await.unwrap(); + assert!(cursor.header().deleted_count == 2); + assert!(cursor.header().total_count == 4); + + cursor.initiate_garbage_collection().await.unwrap(); + assert!(cursor.header().deleted_count == 0); + assert!(cursor.header().total_count == 2); + } + } + +} From 62d4720e5414b93020207c79e856a9b4c6551f6f Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:30:39 +0100 Subject: [PATCH 39/43] Remove unnecessary println! 
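The start/finish markers around garbage collection stay; the per-lookup
and per-deletion printlns go. For reference, a minimal standalone sketch
of the compaction pattern those markers bracket (the trigger constant
matches cursor.rs; the file names and the entry-copying step are
simplified placeholders, not the crate's actual API):

    use std::path::Path;
    use tokio::fs;

    const GARBAGE_COLLECTION_TRIGGER: usize = 100;
    const ROWS_FILE_NAME: &str = "rows";                  // placeholder
    const INTERMEDIATE_ROWS_FILE_NAME: &str = "rows.gc";  // placeholder

    // Once enough entries are tombstoned, rewrite the rows file: live
    // entries are first copied into an intermediate file (elided here),
    // which is then swapped over the old rows file on disk.
    async fn compact_if_necessary(table_folder: &str, deleted_count: usize) -> std::io::Result<()> {
        if deleted_count <= GARBAGE_COLLECTION_TRIGGER {
            return Ok(());
        }
        // ... copy live entries into the intermediate file and update the
        // in-memory indexes with their new FilePositions ...
        let table = Path::new(table_folder);
        let rows = table.join(ROWS_FILE_NAME);
        fs::remove_file(&rows).await?;
        fs::rename(table.join(INTERMEDIATE_ROWS_FILE_NAME), &rows).await?;
        Ok(())
    }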
--- storage_engine/src/cursor.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 84077fa..0811bfe 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -356,10 +356,8 @@ pub trait CursorWithAccessToIndex: CursorWithStoreHeader { where T: Encode + Decode + Ord + Send + Sync { if self.header().is_column_indexed(column) { - println!("INDEXED LOOKUP"); self.index_lookup(column, value).await } else { - println!("BRUTE-FORCE LOOKUP"); self.find_all_eq_bruteforce(column, value).await } } @@ -577,7 +575,6 @@ impl <'cursor, T> WriteCursor<'cursor, T> pub async fn mark_deleted_at(&mut self, file_position: FilePosition, enable_garbage_collector: bool) -> Result<()> where T: Encode + Decode + Ord + Send + Sync + Clone + Ord { - println!("MARKING {} as DELETED", file_position); self.seek_to(file_position).await?; let mut entry_header = self.read_entry_header().await?; if entry_header.is_deleted { @@ -637,7 +634,6 @@ impl <'cursor, T> WriteCursor<'cursor, T> { let count = if self.header().is_column_indexed(column) { - println!("DELETION: INDEXED LOOKUP"); let entries = self.index_lookup(column, value).await?; let count = entries.len(); for entry in entries { @@ -645,7 +641,6 @@ impl <'cursor, T> WriteCursor<'cursor, T> } count } else { - println!("DELETION: BRUTE-FORCE LOOKUP"); let count = self.find_all_eq_bruteforce_and_delete(column, value).await?; count }; @@ -682,8 +677,9 @@ impl <'cursor, T> WriteCursor<'cursor, T> where T: Send + Sync + Decode + Encode + Clone + Ord { if self.header.deleted_count > GARBAGE_COLLECTION_TRIGGER { - println!("=======START GARBAGE COLLETOR===="); + println!("=======START GARBAGE COLLECTOR===="); self.initiate_garbage_collection().await?; + println!("=======GARBAGE COLLECTOR FINISHED===="); } Ok(()) } From f18fd3a7966c049cbec4f74f7db8d4df3acdce35 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:31:37 +0100 Subject: [PATCH 40/43] Remove main.rs --- storage_engine/src/main.rs | 213 ------------------------------------- 1 file changed, 213 deletions(-) delete mode 100644 storage_engine/src/main.rs diff --git a/storage_engine/src/main.rs b/storage_engine/src/main.rs deleted file mode 100644 index b9f8b9b..0000000 --- a/storage_engine/src/main.rs +++ /dev/null @@ -1,213 +0,0 @@ -mod storage_engine; -mod binary_coding; -mod error; -mod index; -mod cursor; -mod segments; - -use crate::segments::entry::{Entry, EntryDetailed}; -use crate::storage_engine::{Store, FilePosition}; -use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteAccessToIndex}; - -type Data = u32; - -const TABLE_PATH: &'static str = "test_table"; - -type Result = std::result::Result; - -async fn create_store() -> Result> { - let store: Store = Store::new(TABLE_PATH, 5, 0).await.map_err(|e| e.to_io_or_panic())?; - println!("CREATED"); - // println!("THE STORE: {:?}", store); - // println!("THE BYTES: {:?}", store.read_all_bytes().await?); - - Ok(store) -} - -async fn connect_store() -> Result> { - let store: Store = Store::connect(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; - println!("CONNECTED"); - // println!("THE STORE: {:?}", store); - // println!("THE BYTES: {:?}", store.read_all_bytes().await?); - Ok(store) -} - -async fn create_or_connect() -> Result> { - let exists = storage_engine::store_exists(TABLE_PATH).await.map_err(|e| e.to_io_or_panic())?; - if exists { - connect_store().await - } 
else { - create_store().await - } -} - - -async fn append_entry(cursor: &mut WriteCursor<'_, Data>, entry: Entry) -> Result { - println!("APPENDING"); - println!("entry == {:?}", entry); - - // let file_position: FilePosition = cursor.append_entry(&entry).await.map_err(|e| e.to_io_or_panic())?; - let file_position: FilePosition = cursor.insert_entry(entry).await.map_err(|e| e.to_io_or_panic())?; - println!("file_position == {:?}", file_position); - Ok(file_position) -} - -async fn read_entry(cursor: &mut ReadCursor<'_, Data>, file_position: FilePosition) -> Result>> { - println!("READING ENTRY at file_position={}", file_position); - let entry = cursor.read_entry_at(file_position).await.map_err(|e| e.to_io_or_panic())?; - println!("ENTRY: {:?}", entry); - Ok(entry) -} - -async fn append_bunch_of_entries(store: &mut Store) -> Result<()> { - let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; - let entry0: Entry = Entry::new(vec![1, 2, 3, 4, 5]); - append_entry(&mut cursor, entry0).await?; - - let entry1: Entry = Entry::new(vec![200, 200, 5, 6, 7]); - append_entry(&mut cursor, entry1).await?; - - // println!("{:?}", store.read_all_bytes().await?); - let entry2: Entry = Entry::new(vec![99, 98, 97, 96, 95]); - append_entry(&mut cursor, entry2).await?; - - let entry3: Entry = Entry::new(vec![50,50,50,50,50]); - append_entry(&mut cursor, entry3).await?; - - let entry4: Entry = Entry::new(vec![1,50,50,50,50]); // same 0-th column as entry0 - append_entry(&mut cursor, entry4).await?; - Ok(()) -} - -async fn test_garbage_collection(store: &mut Store) -> Result<()> { - let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; - // cursor.delete_entries_where_eq() - // 1. mark a bunch of entries as deleted - let column = 0; - let value = 1; - cursor.delete_entries_where_eq(column, &value, true).await.map_err(|e| e.to_io_or_panic())?; - // let value = 50; - // cursor.delete_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - - // cursor.initiate_garbage_collection().await.map_err(|e| e.to_io_or_panic())?; - Ok(()) -} - -#[tokio::main] -async fn main() -> Result<()> { - println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); - - let mut store: Store = create_or_connect().await?; - - if store.header.total_count == 0 { - println!("INSERTING!"); - append_bunch_of_entries(&mut store).await?; - } - - { - // let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?; - - // let entry: Entry = Entry::new(vec![60, 50, 40, 30, 20]); - // let file_position = append_entry(&mut cursor, &entry).await?; - // let file_position = 215; - // cursor.seek_to(file_position).await.map_err(|e| e.to_io_or_panic())?; - - // let entry_header = cursor.read_entry_header().await.map_err(|e| e.to_io_or_panic())?; - // println!("entry header = {:?}", entry_header); - - // println!("FILE POSITION == {}", file_position); - // cursor.mark_deleted_at(file_position).await.map_err(|e| e.to_io_or_panic())?; - // let entry_header = cursor.read_entry_header().await.map_err(|e| e.to_io_or_panic())?; - // println!("entry header after delete = {:?}", entry_header); - } - - // println!("{:?}", store); - // println!("{:?}", store.read_all_bytes().await?); - { - let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; - } - - test_garbage_collection(&mut store).await?; - - // { - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // 
cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; - // } - - - // { - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // } - - // { - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let column = 2; - // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // } - - // { - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let column = 0; - // let value = 1; - // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - // println!("ARE INDEXES WORKING???"); - // println!("{:?}", entries); - // } - - // { - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let column = 1; - // let value = 2; - // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - // println!("ARE INDEXES WORKING???"); - // println!("{:?}", entries); - // } - - // { - // let column = 1; - // // println!("BUILDING AN INDEX"); - // // store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; - // // println!("INDEX BUILT!"); - - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let value = 2; - // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; - // println!("ARE INDEXES WORKING???"); - // println!("{:?}", entries); - // } - - - // { - // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; - // let column = 3; - // let t0 = 6; - // let x = cursor.find_first_eq_bruteforce(column, &t0).await.map_err(|e| e.to_io_or_panic())?; - // println!("{:?}", x); - // } - - - - - println!("DONE"); - Ok(()) -} From edfecfa8d676d40d56b29406c6489039077de203 Mon Sep 17 00:00:00 2001 From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:03:19 +0100 Subject: [PATCH 41/43] Split cursor capabilities --- storage_engine/src/cursor.rs | 431 +----------------- .../src/cursor_capabilities/header_access.rs | 230 ++++++++++ .../src/cursor_capabilities/index_access.rs | 115 +++++ storage_engine/src/cursor_capabilities/mod.rs | 3 + .../src/cursor_capabilities/primitive.rs | 62 +++ storage_engine/src/lib.rs | 1 + storage_engine/src/storage_engine.rs | 7 +- 7 files changed, 438 insertions(+), 411 deletions(-) create mode 100644 storage_engine/src/cursor_capabilities/header_access.rs create mode 100644 storage_engine/src/cursor_capabilities/index_access.rs create mode 100644 storage_engine/src/cursor_capabilities/mod.rs create 
mode 100644 storage_engine/src/cursor_capabilities/primitive.rs diff --git a/storage_engine/src/cursor.rs b/storage_engine/src/cursor.rs index 0811bfe..3f68560 100644 --- a/storage_engine/src/cursor.rs +++ b/storage_engine/src/cursor.rs @@ -1,22 +1,20 @@ -use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; use tokio::fs::{File, OpenOptions}; use tokio::fs; use std::path::Path; use std::marker::PhantomData; use std::collections::{BTreeMap, HashSet}; -use async_trait::async_trait; - use bincode; use bincode::{Decode, Encode}; -use crate::binary_coding::{encode, decode}; -use crate::error::{Error, DecodeErrorKind}; -use crate::segments::entry::{Entry, EntryDetailed}; -use crate::segments::entry_header::{EntryHeaderWithDataSize, EntryHeader}; +use crate::segments::entry::EntryDetailed; +use crate::segments::entry_header::EntryHeader; use crate::segments::store_header::StoreHeader; use crate::storage_engine::{Store, FilePosition, Column, Result, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; use crate::index::Index; +use crate::cursor_capabilities::primitive::{CursorCanRead, CursorCanWrite}; +use crate::cursor_capabilities::header_access::{CursorCanReadHeader, CursorCanWriteHeader}; +use crate::cursor_capabilities::index_access::{CursorCanWriteToIndex, CursorCanReadIndex}; const GARBAGE_COLLECTION_TRIGGER: usize = 100; @@ -47,389 +45,9 @@ pub struct AppendOnlyCursor { } -// ===Traits=== -#[async_trait] -pub(crate) trait PrimitiveCursor { - fn file(&mut self) -> &mut File; - fn eof_file_position(&self) -> FilePosition; - - async fn read_bytes(&mut self, bytes: &mut [u8]) -> Result<()> { - self.file().read_exact(bytes).await?; - Ok(()) - } - - async fn get_bytes(&mut self, count: usize) -> Result> { - let mut result: Vec = Vec::with_capacity(count); - self.read_bytes(&mut result).await?; - Ok(result) - } - - async fn seek_to(&mut self, file_position: FilePosition) -> Result { - let file_position = self.file().seek(SeekFrom::Start(file_position)).await?; - Ok(file_position) - } - - // Start of the file i.e. the Header, not the entries. - async fn seek_to_start(&mut self) -> Result { - let file_position = self.file().seek(SeekFrom::Start(0)).await?; - Ok(file_position) - } - - async fn seek_to_end(&mut self) -> Result { - let file_position = self.file().seek(SeekFrom::End(0)).await?; - Ok(file_position) - } - - // Seeks from current position by offset and returns new file position - async fn seek_by(&mut self, offset: i64) -> Result { - let file_position = self.file().seek(SeekFrom::Current(offset)).await?; - Ok(file_position) - } - - async fn current_file_position(&mut self) -> Result { - let next_file_position: FilePosition = self.file().stream_position().await?; - Ok(next_file_position) - } - - async fn is_at_eof(&mut self) -> Result { - let current_file_position = self.current_file_position().await?; - let eof_file_position = self.eof_file_position(); - Ok(current_file_position == eof_file_position) - } -} - -#[async_trait] -pub(crate) trait PrimitiveWriteCursor: PrimitiveCursor { - async fn write_bytes(&mut self, bytes: &[u8]) -> Result { - Ok(self.file().write(bytes).await?) 
- } - -} - -#[async_trait] -pub trait CursorWithStoreHeader: PrimitiveCursor { - fn header(&self) -> &StoreHeader; - - async fn seek_to_start_of_data(&mut self) -> Result { - self.seek_to(StoreHeader::size(self.header().number_of_columns) as u64).await - } - - async fn read_entry_header(&mut self) -> Result { - let number_of_columns: usize = self.header().number_of_columns; - let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; - self.read_bytes(&mut header_bytes).await?; - let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..], number_of_columns)?; - - Ok(header) - } - - async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { - self.seek_to(file_position).await?; - self.read_entry_header().await - } - - // Returns None when file_position == eof_file_position - async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> - where T: Decode - { - self.seek_to(file_position).await?; - self.next().await - } - - // ===Iteration=== - // The following functions assume that the current file position is at a valid entry or EOF. - - - // WARNING: This moves the file_position to start of the data, so you can't just call - // next_entry_header() a bunch of times. You must move the cursor! - async fn next_entry_header(&mut self) -> Result> { - if self.is_at_eof().await? { - return Ok(None) - } - - let entry_header = self.read_entry_header().await?; - - Ok(Some(entry_header)) - } - - // This is meant to be used after next_entry_header() is called. - async fn jump_from_start_of_entry_data_to_next_entry(&mut self, entry_header: &EntryHeaderWithDataSize) -> Result{ - let file_position = self.seek_by(entry_header.size_of_data() as i64).await?; - Ok(file_position) - } - - async fn next(&mut self) -> Result>> - where T: Decode - { - let file_position = self.current_file_position().await?; - let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) }; - - let mut data_bytes: Vec = vec![0; entry_header.size_of_data()]; - self.read_bytes(&mut data_bytes).await?; - let entry: EntryDetailed = - EntryDetailed::decode(entry_header, file_position, self.header().number_of_columns, &mut data_bytes)?; - - Ok(Some(entry)) - } - - // Like next, but only reads the column, not the whole entry. - async fn next_at_column(&mut self, column: Column) -> Result> - where T: Decode + Send - { - let file_position = self.current_file_position().await?; - let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) }; - let file_position_at_start_of_data = self.current_file_position().await?; - - // figuring out how much to decode - let column_offset = entry_header.offset_of_column(column); - self.seek_by(column_offset as i64).await?; - - // reading and decoding - let mut bytes: Vec = vec![0; entry_header.data_sizes[column as usize]]; - self.read_bytes(&mut bytes).await?; - let (value, _) = - decode::(&bytes[..]) - .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; - - // jumping to next entry - self.seek_to(file_position_at_start_of_data).await?; - self.jump_from_start_of_entry_data_to_next_entry(&entry_header).await?; - - Ok(Some((entry_header, file_position, value))) - } - - async fn next_alive_at_column(&mut self, column: Column) -> Result> - where T: Decode + Send - { - while let Some((header, file_position, t)) = self.next_at_column(column).await? 
{ - if !header.is_deleted { - return Ok(Some((header, file_position, t))) - } - } - Ok(None) - } - - async fn next_alive(&mut self) -> Result>> - where T: Decode - { - while let Some(entry) = self.next().await? { - if !entry.header.is_deleted { - return Ok(Some(entry)) - } - } - Ok(None) - } - - // ===Search=== - async fn find_first_eq_bruteforce(&mut self, column: Column, t0: &T) -> Result>> - where T: Decode + PartialEq + Send + Sync - { - let mut file_position = self.current_file_position().await?; - while let Some((_, _, t)) = self.next_alive_at_column(column).await? { - if &t == t0 { - // go back and decode the whole entry - self.seek_to(file_position).await?; - return self.next().await - } else { - file_position = self.current_file_position().await?; - } - } - Ok(None) - } - - async fn find_all_eq_bruteforce(&mut self, column: Column, t0: &T) -> Result>> - where T: Decode + PartialEq + Send + Sync - { - let mut entries = vec![]; - while let Some(entry) = self.find_first_eq_bruteforce(column, t0).await? { - entries.push(entry) - } - Ok(entries) - } - - // ===Debugging=== - async fn read_entries(&mut self) -> Result<()> - where T: Decode + std::fmt::Debug - { - self.seek_to_start_of_data().await?; - while let Some(entry) = self.next().await? { - println!("{:?}", entry); - } - println!("END of entries."); - Ok(()) - } - - async fn read_all_bytes(&mut self) -> std::result::Result, std::io::Error> { - let mut bytes: Vec = vec![]; - self.seek_to_start().await.map_err(|e| e.to_io_or_panic())?; - self.file().read_to_end(&mut bytes).await?; - Ok(bytes) - } -} - -#[async_trait] -pub trait CursorWithWriteStoreHeader: CursorWithStoreHeader + PrimitiveWriteCursor { - fn header_mut(&mut self) -> &mut StoreHeader; - fn set_eof_file_position(&mut self, new_file_position: FilePosition); - - // ===Store Header Manipulation=== - async fn increment_total_count(&mut self) -> Result<()> - where T: Send - { - self.seek_to_start().await?; - self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?; - let new_count = self.header_mut().increment_total_count(); - self.write_bytes(&encode::(&new_count)?).await?; - Ok(()) - } - - async fn increment_deleted_count(&mut self) -> Result<()> - where T: Send - { - self.seek_to_start().await?; - self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?; - let new_count = self.header_mut().increment_deleted_count(); - self.write_bytes(&encode::(&new_count)?).await?; - Ok(()) - } - - async fn set_header(&mut self, header: &StoreHeader) -> Result<()> - where T: Send - { - self.seek_to_start().await?; - let encoded_header: Vec = header.encode()?; - self.write_bytes(&encoded_header).await?; - - Ok(()) - } - - // ===Append Entry=== - - // Moves cursor to the end. - // Returns file position to the start of the new entry. 
- async fn append_entry_no_indexing(&mut self, entry: &Entry) -> Result - where T: Encode + Send + Sync - { - self.increment_total_count().await?; - - let encoded_entry: Vec = entry.encode()?; - let file_position = self.seek_to_end().await?; - self.write_bytes(&encoded_entry).await?; - - let eof_file_position: FilePosition = self.current_file_position().await?; - self.set_eof_file_position(eof_file_position); - - Ok(file_position) - } -} - -#[async_trait] -pub trait CursorWithAccessToIndex: CursorWithStoreHeader { - fn indexes(&mut self) -> &[Option>]; - - async fn index_lookup(&mut self, column: Column, value: &T) -> Result>> - where T: Encode + Decode + Ord + Send + Sync - { - match &self.indexes()[column as usize] { - Some(index) => { - let file_positions = index.lookup(value).await?.unwrap_or_else(|| HashSet::new()); - let mut entries: Vec> = vec![]; - for &file_position in file_positions.iter() { - match self.read_entry_at(file_position).await? { - Some(entry) => { - entries.push(entry) - }, - None => { - return Err(Error::IndexIsStoringEofFilePosition(column)) - } - } - } - - Ok(entries) - }, - None => - Err(Error::AttemptToIndexNonIndexableColumn(column)) - } - } - - async fn select_entries_where_eq(&mut self, column: Column, value: &T) -> Result>> - where T: Encode + Decode + Ord + Send + Sync - { - if self.header().is_column_indexed(column) { - self.index_lookup(column, value).await - } else { - self.find_all_eq_bruteforce(column, value).await - } - } -} - -#[async_trait] -pub trait CursorWithWriteAccessToIndex: CursorWithAccessToIndex + CursorWithWriteStoreHeader { - fn indexes_mut(&mut self) -> &mut [Option>]; - - // Assumes that the column is indexable. - fn mut_index_at(&mut self, column: Column) -> &mut Index { - match &mut self.indexes_mut()[column as usize] { - Some(index) => { - index - }, - None => { - unreachable!() - } - } - } - - // Assumes that the column is indexable. - async fn insert_into_index(&mut self, column: Column, value: T, file_position: FilePosition) -> Result<()> - where T: Encode + Decode + Ord + Send + Sync + 'async_trait - { - let index = self.mut_index_at(column as Column); - index.insert(value, file_position).await?; - Ok(()) - } - - // Assumes that the column is indexable. - async fn delete_from_index(&mut self, column: Column, value: T, file_position: FilePosition) -> Result<()> - where T: Encode + Decode + Ord + Send + Sync + 'async_trait - { - let index = self.mut_index_at(column as Column); - index.delete(value, file_position).await?; - Ok(()) - } - - async fn insert_entry(&mut self, entry: Entry) -> Result - where T: Encode + Decode + Ord + Send + Sync + 'async_trait - { - let file_position = self.append_entry_no_indexing(&entry).await?; - - // insert the indexable values of the entry into corresponding indexes. - for (column, (value, should_index)) in entry.data.into_iter().zip(self.header().indexed_columns.clone()).enumerate() { - if should_index { - // SAFETY: If should_index is true, then the column is indexable. - self.insert_into_index(column as Column, value, file_position).await? - } - } - - Ok(file_position) - } - - async fn delete_entry_values_from_indexes(&mut self, entry: EntryDetailed) -> Result<()> - where T: Encode + Decode + Ord + Send + Sync + 'async_trait - { - for (column, (value, should_index)) in entry.data.into_iter().zip(self.header().indexed_columns.clone()).enumerate() { - if should_index { - // SAFETY: If should_index is true, then the column is indexable. 
- self.delete_from_index(column as Column, value, entry.file_position).await? - } - } - Ok(()) - } -} - - // ===========Implementations============= -// ===PrimitiveCursor=== -impl PrimitiveCursor for ReadCursor<'_, T> { +// ===primitive capabilities=== +impl CursorCanRead for ReadCursor<'_, T> { fn file(&mut self) -> &mut File { &mut self.file } @@ -439,7 +57,7 @@ impl PrimitiveCursor for ReadCursor<'_, T> { } } -impl PrimitiveCursor for WriteCursor<'_, T> { +impl CursorCanRead for WriteCursor<'_, T> { fn file(&mut self) -> &mut File { &mut self.file } @@ -449,7 +67,7 @@ impl PrimitiveCursor for WriteCursor<'_, T> { } } -impl PrimitiveCursor for AppendOnlyCursor { +impl CursorCanRead for AppendOnlyCursor { fn file(&mut self) -> &mut File { &mut self.file } @@ -459,50 +77,48 @@ impl PrimitiveCursor for AppendOnlyCursor { } } -// ===PrimitiveCursor=== -impl PrimitiveWriteCursor for WriteCursor<'_, T> {} -impl PrimitiveWriteCursor for AppendOnlyCursor {} +impl CursorCanWrite for WriteCursor<'_, T> {} +impl CursorCanWrite for AppendOnlyCursor {} -// ===CursorWithStoreHeader=== -impl CursorWithStoreHeader for ReadCursor<'_, T> { +// ===capability to access header=== +impl CursorCanReadHeader for ReadCursor<'_, T> { fn header(&self) -> &StoreHeader { &self.header } } -impl CursorWithStoreHeader for WriteCursor<'_, T> { +impl CursorCanReadHeader for WriteCursor<'_, T> { fn header(&self) -> &StoreHeader { &self.header } } -impl CursorWithStoreHeader for AppendOnlyCursor { +impl CursorCanReadHeader for AppendOnlyCursor { fn header(&self) -> &StoreHeader { &self.header } } -// ===CursorWithWriteStoreHeader=== -impl CursorWithWriteStoreHeader for WriteCursor<'_, T> { +impl CursorCanWriteHeader for WriteCursor<'_, T> { fn header_mut(&mut self) -> &mut StoreHeader { self.header } fn set_eof_file_position(&mut self, new_file_position: FilePosition) { self.eof_file_position = new_file_position } } -impl CursorWithWriteStoreHeader for AppendOnlyCursor { +impl CursorCanWriteHeader for AppendOnlyCursor { fn header_mut(&mut self) -> &mut StoreHeader { &mut self.header } fn set_eof_file_position(&mut self, new_file_position: FilePosition) { self.eof_file_position = new_file_position } } -// ===CursorWithAccessToIndex=== -impl CursorWithAccessToIndex for ReadCursor<'_, T> { +// ===capability to access index=== +impl CursorCanReadIndex for ReadCursor<'_, T> { fn indexes(&mut self) -> &[Option>] { &self.indexes } } -impl CursorWithAccessToIndex for WriteCursor<'_, T> { +impl CursorCanReadIndex for WriteCursor<'_, T> { fn indexes(&mut self) -> &[Option>] { &self.indexes } } -// ===CursorWithWriteAccessToIndex=== -impl CursorWithWriteAccessToIndex for WriteCursor<'_, T> { +impl CursorCanWriteToIndex for WriteCursor<'_, T> { fn indexes_mut(&mut self) -> &mut [Option>] { self.indexes } } +// ===Specifics=== impl <'cursor, T> ReadCursor<'cursor, T> { pub async fn new<'store: 'cursor>(store: &'store Store) -> Result where T: Send + Sync @@ -530,8 +146,6 @@ impl <'cursor, T> ReadCursor<'cursor, T> { } } - - impl <'cursor, T> WriteCursor<'cursor, T> { // 'store lives atleast as long as 'cursor @@ -778,4 +392,3 @@ impl <'cursor, T> WriteCursor<'cursor, T> Ok(cursor_to_intermediate) } } - diff --git a/storage_engine/src/cursor_capabilities/header_access.rs b/storage_engine/src/cursor_capabilities/header_access.rs new file mode 100644 index 0000000..57ccabb --- /dev/null +++ b/storage_engine/src/cursor_capabilities/header_access.rs @@ -0,0 +1,230 @@ +use tokio::io::AsyncReadExt; +use async_trait::async_trait; + +use 
bincode; +use bincode::{Decode, Encode}; +use crate::binary_coding::{encode, decode}; + +use crate::error::{Error, DecodeErrorKind}; +use crate::segments::entry::{Entry, EntryDetailed}; +use crate::segments::entry_header::EntryHeaderWithDataSize; +use crate::segments::store_header::StoreHeader; +use crate::storage_engine::{FilePosition, Column, Result}; +use crate::cursor_capabilities::primitive::{CursorCanRead, CursorCanWrite}; + +#[async_trait] +pub trait CursorCanReadHeader: CursorCanRead { + fn header(&self) -> &StoreHeader; + + async fn seek_to_start_of_data(&mut self) -> Result { + self.seek_to(StoreHeader::size(self.header().number_of_columns) as u64).await + } + + async fn read_entry_header(&mut self) -> Result { + let number_of_columns: usize = self.header().number_of_columns; + let mut header_bytes: Vec = vec![0; EntryHeaderWithDataSize::size(number_of_columns)]; + self.read_bytes(&mut header_bytes).await?; + let header = EntryHeaderWithDataSize::decode(&mut header_bytes[..], number_of_columns)?; + + Ok(header) + } + + async fn read_entry_header_at(&mut self, file_position: FilePosition) -> Result { + self.seek_to(file_position).await?; + self.read_entry_header().await + } + + // Returns None when file_position == eof_file_position + async fn read_entry_at(&mut self, file_position: FilePosition) -> Result>> + where T: Decode + { + self.seek_to(file_position).await?; + self.next().await + } + + // ===Iteration=== + // The following functions assume that the current file position is at a valid entry or EOF. + + + // WARNING: This moves the file_position to start of the data, so you can't just call + // next_entry_header() a bunch of times. You must move the cursor! + async fn next_entry_header(&mut self) -> Result> { + if self.is_at_eof().await? { + return Ok(None) + } + + let entry_header = self.read_entry_header().await?; + + Ok(Some(entry_header)) + } + + // This is meant to be used after next_entry_header() is called. + async fn jump_from_start_of_entry_data_to_next_entry(&mut self, entry_header: &EntryHeaderWithDataSize) -> Result{ + let file_position = self.seek_by(entry_header.size_of_data() as i64).await?; + Ok(file_position) + } + + async fn next(&mut self) -> Result>> + where T: Decode + { + let file_position = self.current_file_position().await?; + let Some(entry_header) = self.next_entry_header().await? else { return Ok(None) }; + + let mut data_bytes: Vec = vec![0; entry_header.size_of_data()]; + self.read_bytes(&mut data_bytes).await?; + let entry: EntryDetailed = + EntryDetailed::decode(entry_header, file_position, self.header().number_of_columns, &mut data_bytes)?; + + Ok(Some(entry)) + } + + // Like next, but only reads the column, not the whole entry. + async fn next_at_column(&mut self, column: Column) -> Result> + where T: Decode + Send + { + let file_position = self.current_file_position().await?; + let Some(entry_header) = self.next_entry_header().await? 
else { return Ok(None) }; + let file_position_at_start_of_data = self.current_file_position().await?; + + // figuring out how much to decode + let column_offset = entry_header.offset_of_column(column); + self.seek_by(column_offset as i64).await?; + + // reading and decoding + let mut bytes: Vec = vec![0; entry_header.data_sizes[column as usize]]; + self.read_bytes(&mut bytes).await?; + let (value, _) = + decode::(&bytes[..]) + .map_err(|e| Error::DecodeError(DecodeErrorKind::EntryIsDeleted, e))?; + + // jumping to next entry + self.seek_to(file_position_at_start_of_data).await?; + self.jump_from_start_of_entry_data_to_next_entry(&entry_header).await?; + + Ok(Some((entry_header, file_position, value))) + } + + async fn next_alive_at_column(&mut self, column: Column) -> Result> + where T: Decode + Send + { + while let Some((header, file_position, t)) = self.next_at_column(column).await? { + if !header.is_deleted { + return Ok(Some((header, file_position, t))) + } + } + Ok(None) + } + + async fn next_alive(&mut self) -> Result>> + where T: Decode + { + while let Some(entry) = self.next().await? { + if !entry.header.is_deleted { + return Ok(Some(entry)) + } + } + Ok(None) + } + + // ===Search=== + async fn find_first_eq_bruteforce(&mut self, column: Column, t0: &T) -> Result>> + where T: Decode + PartialEq + Send + Sync + { + let mut file_position = self.current_file_position().await?; + while let Some((_, _, t)) = self.next_alive_at_column(column).await? { + if &t == t0 { + // go back and decode the whole entry + self.seek_to(file_position).await?; + return self.next().await + } else { + file_position = self.current_file_position().await?; + } + } + Ok(None) + } + + async fn find_all_eq_bruteforce(&mut self, column: Column, t0: &T) -> Result>> + where T: Decode + PartialEq + Send + Sync + { + let mut entries = vec![]; + while let Some(entry) = self.find_first_eq_bruteforce(column, t0).await? { + entries.push(entry) + } + Ok(entries) + } + + // ===Debugging=== + async fn read_entries(&mut self) -> Result<()> + where T: Decode + std::fmt::Debug + { + self.seek_to_start_of_data().await?; + while let Some(entry) = self.next().await? 
+
+    // ===Debugging===
+    async fn read_entries(&mut self) -> Result<()>
+    where T: Decode + std::fmt::Debug
+    {
+        self.seek_to_start_of_data().await?;
+        while let Some(entry) = self.next().await? {
+            println!("{:?}", entry);
+        }
+        println!("END of entries.");
+        Ok(())
+    }
+
+    async fn read_all_bytes(&mut self) -> std::result::Result<Vec<u8>, std::io::Error> {
+        let mut bytes: Vec<u8> = vec![];
+        self.seek_to_start().await.map_err(|e| e.to_io_or_panic())?;
+        self.file().read_to_end(&mut bytes).await?;
+        Ok(bytes)
+    }
+}
+
+#[async_trait]
+pub trait CursorCanWriteHeader<T>: CursorCanReadHeader<T> + CursorCanWrite {
+    fn header_mut(&mut self) -> &mut StoreHeader;
+    fn set_eof_file_position(&mut self, new_file_position: FilePosition);
+
+    // ===Store Header Manipulation===
+    async fn increment_total_count(&mut self) -> Result<()>
+    where T: Send
+    {
+        self.seek_to_start().await?;
+        self.seek_to(StoreHeader::TOTAL_COUNT_OFFSET as u64).await?;
+        let new_count = self.header_mut().increment_total_count();
+        self.write_bytes(&encode::<usize>(&new_count)?).await?;
+        Ok(())
+    }
+
+    async fn increment_deleted_count(&mut self) -> Result<()>
+    where T: Send
+    {
+        self.seek_to_start().await?;
+        self.seek_to(StoreHeader::DELETED_COUNT_OFFSET as u64).await?;
+        let new_count = self.header_mut().increment_deleted_count();
+        self.write_bytes(&encode::<usize>(&new_count)?).await?;
+        Ok(())
+    }
+
+    async fn set_header(&mut self, header: &StoreHeader) -> Result<()>
+    where T: Send
+    {
+        self.seek_to_start().await?;
+        let encoded_header: Vec<u8> = header.encode()?;
+        self.write_bytes(&encoded_header).await?;
+
+        Ok(())
+    }
+
+    // ===Append Entry===
+
+    // Moves the cursor to the end of the file.
+    // Returns the file position of the start of the new entry.
+    async fn append_entry_no_indexing(&mut self, entry: &Entry<T>) -> Result<FilePosition>
+    where T: Encode + Send + Sync
+    {
+        self.increment_total_count().await?;
+
+        let encoded_entry: Vec<u8> = entry.encode()?;
+        let file_position = self.seek_to_end().await?;
+        self.write_bytes(&encoded_entry).await?;
+
+        let eof_file_position: FilePosition = self.current_file_position().await?;
+        self.set_eof_file_position(eof_file_position);
+
+        Ok(file_position)
+    }
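+
+    // Append sketch (hypothetical caller): append_entry_no_indexing() already bumps
+    // total_count in the store header, so a plain append is just
+    //
+    //     let pos = cursor.append_entry_no_indexing(&entry).await?;
+    //
+    // Stores with indexed columns should go through CursorCanWriteToIndex::insert_entry()
+    // instead, so that the per-column indexes stay in sync with the data file.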
+}
diff --git a/storage_engine/src/cursor_capabilities/index_access.rs b/storage_engine/src/cursor_capabilities/index_access.rs
new file mode 100644
index 0000000..d9d7fc3
--- /dev/null
+++ b/storage_engine/src/cursor_capabilities/index_access.rs
@@ -0,0 +1,115 @@
+use std::collections::HashSet;
+
+use async_trait::async_trait;
+
+use bincode;
+use bincode::{Decode, Encode};
+
+use crate::error::Error;
+use crate::segments::entry::{Entry, EntryDetailed};
+use crate::storage_engine::{FilePosition, Column, Result};
+use crate::index::Index;
+use crate::cursor_capabilities::header_access::{CursorCanReadHeader, CursorCanWriteHeader};
+
+#[async_trait]
+pub trait CursorCanReadIndex<T>: CursorCanReadHeader<T> {
+    fn indexes(&mut self) -> &[Option<Index<T>>];
+
+    async fn index_lookup(&mut self, column: Column, value: &T) -> Result<Vec<EntryDetailed<T>>>
+    where T: Encode + Decode + Ord + Send + Sync
+    {
+        match &self.indexes()[column as usize] {
+            Some(index) => {
+                let file_positions = index.lookup(value).await?.unwrap_or_else(|| HashSet::new());
+                let mut entries: Vec<EntryDetailed<T>> = vec![];
+                for &file_position in file_positions.iter() {
+                    match self.read_entry_at(file_position).await? {
+                        Some(entry) => {
+                            entries.push(entry)
+                        },
+                        None => {
+                            return Err(Error::IndexIsStoringEofFilePosition(column))
+                        },
+                    }
+                }
+
+                Ok(entries)
+            },
+            None =>
+                Err(Error::AttemptToIndexNonIndexableColumn(column)),
+        }
+    }
+
+    async fn select_entries_where_eq(&mut self, column: Column, value: &T) -> Result<Vec<EntryDetailed<T>>>
+    where T: Encode + Decode + Ord + Send + Sync
+    {
+        if self.header().is_column_indexed(column) {
+            self.index_lookup(column, value).await
+        } else {
+            self.find_all_eq_bruteforce(column, value).await
+        }
+    }
+}
+
+#[async_trait]
+pub trait CursorCanWriteToIndex<T>: CursorCanReadIndex<T> + CursorCanWriteHeader<T> {
+    fn indexes_mut(&mut self) -> &mut [Option<Index<T>>];
+
+    // Assumes that the column is indexable.
+    fn mut_index_at(&mut self, column: Column) -> &mut Index<T> {
+        match &mut self.indexes_mut()[column as usize] {
+            Some(index) => {
+                index
+            },
+            None => {
+                unreachable!()
+            },
+        }
+    }
+
+    // Assumes that the column is indexable.
+    async fn insert_into_index(&mut self, column: Column, value: T, file_position: FilePosition) -> Result<()>
+    where T: Encode + Decode + Ord + Send + Sync + 'async_trait
+    {
+        let index = self.mut_index_at(column as Column);
+        index.insert(value, file_position).await?;
+        Ok(())
+    }
+
+    // Assumes that the column is indexable.
+    async fn delete_from_index(&mut self, column: Column, value: T, file_position: FilePosition) -> Result<()>
+    where T: Encode + Decode + Ord + Send + Sync + 'async_trait
+    {
+        let index = self.mut_index_at(column as Column);
+        index.delete(value, file_position).await?;
+        Ok(())
+    }
+
+    async fn insert_entry(&mut self, entry: Entry<T>) -> Result<FilePosition>
+    where T: Encode + Decode + Ord + Send + Sync + 'async_trait
+    {
+        let file_position = self.append_entry_no_indexing(&entry).await?;
+
+        // insert the indexable values of the entry into the corresponding indexes
+        for (column, (value, should_index)) in entry.data.into_iter().zip(self.header().indexed_columns.clone()).enumerate() {
+            if should_index {
+                // SAFETY: If should_index is true, then the column is indexable.
+                self.insert_into_index(column as Column, value, file_position).await?
+            }
+        }
+
+        Ok(file_position)
+    }
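+
+    // End-to-end sketch (hypothetical caller and values): write an entry, then query an
+    // indexed column. select_entries_where_eq() consults the index when the store header
+    // marks the column as indexed, and falls back to the brute-force scan otherwise:
+    //
+    //     let pos = cursor.insert_entry(entry).await?;
+    //     let hits = cursor.select_entries_where_eq(0, &key).await?;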
+
+    async fn delete_entry_values_from_indexes(&mut self, entry: EntryDetailed<T>) -> Result<()>
+    where T: Encode + Decode + Ord + Send + Sync + 'async_trait
+    {
+        for (column, (value, should_index)) in entry.data.into_iter().zip(self.header().indexed_columns.clone()).enumerate() {
+            if should_index {
+                // SAFETY: If should_index is true, then the column is indexable.
+                self.delete_from_index(column as Column, value, entry.file_position).await?
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/storage_engine/src/cursor_capabilities/mod.rs b/storage_engine/src/cursor_capabilities/mod.rs
new file mode 100644
index 0000000..6d301cb
--- /dev/null
+++ b/storage_engine/src/cursor_capabilities/mod.rs
@@ -0,0 +1,3 @@
+pub(crate) mod primitive;
+pub(crate) mod header_access;
+pub(crate) mod index_access;
diff --git a/storage_engine/src/cursor_capabilities/primitive.rs b/storage_engine/src/cursor_capabilities/primitive.rs
new file mode 100644
index 0000000..d11b79b
--- /dev/null
+++ b/storage_engine/src/cursor_capabilities/primitive.rs
@@ -0,0 +1,62 @@
+use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom};
+use tokio::fs::File;
+use async_trait::async_trait;
+
+use crate::storage_engine::{FilePosition, Result};
+
+#[async_trait]
+pub(crate) trait CursorCanRead {
+    fn file(&mut self) -> &mut File;
+    fn eof_file_position(&self) -> FilePosition;
+
+    async fn read_bytes(&mut self, bytes: &mut [u8]) -> Result<()> {
+        self.file().read_exact(bytes).await?;
+        Ok(())
+    }
+
+    async fn get_bytes(&mut self, count: usize) -> Result<Vec<u8>> {
+        // The buffer must be zero-initialized to `count` bytes; Vec::with_capacity() alone
+        // would yield an empty slice, and read_exact() would then read nothing.
+        let mut result: Vec<u8> = vec![0; count];
+        self.read_bytes(&mut result).await?;
+        Ok(result)
+    }
+
+    async fn seek_to(&mut self, file_position: FilePosition) -> Result<FilePosition> {
+        let file_position = self.file().seek(SeekFrom::Start(file_position)).await?;
+        Ok(file_position)
+    }
+
+    // Start of the file, i.e. the header, not the entries.
+    async fn seek_to_start(&mut self) -> Result<FilePosition> {
+        let file_position = self.file().seek(SeekFrom::Start(0)).await?;
+        Ok(file_position)
+    }
+
+    async fn seek_to_end(&mut self) -> Result<FilePosition> {
+        let file_position = self.file().seek(SeekFrom::End(0)).await?;
+        Ok(file_position)
+    }
+
+    // Seeks from the current position by offset and returns the new file position.
+    async fn seek_by(&mut self, offset: i64) -> Result<FilePosition> {
+        let file_position = self.file().seek(SeekFrom::Current(offset)).await?;
+        Ok(file_position)
+    }
+
+    async fn current_file_position(&mut self) -> Result<FilePosition> {
+        let next_file_position: FilePosition = self.file().stream_position().await?;
+        Ok(next_file_position)
+    }
+
+    async fn is_at_eof(&mut self) -> Result<bool> {
+        let current_file_position = self.current_file_position().await?;
+        let eof_file_position = self.eof_file_position();
+        Ok(current_file_position == eof_file_position)
+    }
+}
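+
+// Implementation sketch (hypothetical; `MyCursor` and its fields are illustrative): a
+// concrete cursor only has to provide the two accessors, everything else comes from
+// the default methods:
+//
+//     struct MyCursor { file: File, eof: FilePosition }
+//
+//     #[async_trait]
+//     impl CursorCanRead for MyCursor {
+//         fn file(&mut self) -> &mut File { &mut self.file }
+//         fn eof_file_position(&self) -> FilePosition { self.eof }
+//     }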
+
+#[async_trait]
+pub(crate) trait CursorCanWrite: CursorCanRead {
+    async fn write_bytes(&mut self, bytes: &[u8]) -> Result<usize> {
+        // write_all() is used rather than write(), which is allowed to perform a short write.
+        self.file().write_all(bytes).await?;
+        Ok(bytes.len())
+    }
+}
diff --git a/storage_engine/src/lib.rs b/storage_engine/src/lib.rs
index cae01f5..e7920dd 100644
--- a/storage_engine/src/lib.rs
+++ b/storage_engine/src/lib.rs
@@ -4,3 +4,4 @@ mod error;
 mod index;
 mod cursor;
 mod segments;
+mod cursor_capabilities;
diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs
index 01f87d0..d31327c 100644
--- a/storage_engine/src/storage_engine.rs
+++ b/storage_engine/src/storage_engine.rs
@@ -5,7 +5,8 @@ use std::path::{Path, PathBuf};
 use bincode::{Decode, Encode};
 
 use crate::error::Error;
-use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader};
+use crate::cursor::{ReadCursor, WriteCursor};
+use crate::cursor_capabilities::header_access::CursorCanReadHeader;
 use crate::segments::store_header::StoreHeader;
 use crate::index::Index;
@@ -207,7 +208,9 @@ impl Store {
 mod tests {
     use super::*;
     use crate::segments::entry::{Entry, EntryDetailed};
-    use crate::cursor::{ReadCursor, WriteCursor, CursorWithStoreHeader, CursorWithWriteAccessToIndex, CursorWithAccessToIndex};
+    use crate::cursor::{ReadCursor, WriteCursor};
+    use crate::cursor_capabilities::header_access::CursorCanReadHeader;
+    use crate::cursor_capabilities::index_access::{CursorCanWriteToIndex, CursorCanReadIndex};
 
     impl Drop for Store {
         fn drop(&mut self) {

From fbfb069bbf859907d68ec3c309c35fa4c7d206a4 Mon Sep 17 00:00:00 2001
From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com>
Date: Mon, 5 Feb 2024 16:04:14 +0100
Subject: [PATCH 42/43] Remove unused imports

---
 storage_engine/src/storage_engine.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/storage_engine/src/storage_engine.rs b/storage_engine/src/storage_engine.rs
index d31327c..7798e10 100644
--- a/storage_engine/src/storage_engine.rs
+++ b/storage_engine/src/storage_engine.rs
@@ -207,8 +207,7 @@ impl Store {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::segments::entry::{Entry, EntryDetailed};
-    use crate::cursor::{ReadCursor, WriteCursor};
+    use crate::segments::entry::Entry;
     use crate::cursor_capabilities::header_access::CursorCanReadHeader;
     use crate::cursor_capabilities::index_access::{CursorCanWriteToIndex, CursorCanReadIndex};

From 0666c0d24c24a09791e702f4755fba3ba715a2a2 Mon Sep 17 00:00:00 2001
From: Yuriy Dupyn <2153100+omedusyo@users.noreply.github.com>
Date: Mon, 5 Feb 2024 16:06:35 +0100
Subject: [PATCH 43/43] Test for encoding/decoding

---
 storage_engine/src/binary_coding.rs | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/storage_engine/src/binary_coding.rs b/storage_engine/src/binary_coding.rs
index 4e66623..1b2a475 100644
--- a/storage_engine/src/binary_coding.rs
+++ b/storage_engine/src/binary_coding.rs
@@ -69,22 +69,19 @@ pub fn decode_sequence<T: Decode>(len: usize, bytes: &[u8]) -> Result<Vec<T>, bincode::error::DecodeError> {
 }
 
-#[allow(dead_code)]
-fn example_encoding_decoding() {
-    let _xs: Vec<u16> = vec![123, 250, 256, 123, 123, 123];
-    let _xs: Vec<u16> = vec![];
-    let _xs: Vec<u16> = vec![123];
-    let _xs: Vec<u16> = vec![123, 250];
+#[cfg(test)]
+mod tests {
+    use super::*;
 
-    let xs: Vec<String> = vec!["foo".to_string(), "bar".to_string()];
+    #[test]
+    fn test_encoding_decoding() {
+        let xs: Vec<String> = vec!["foo".to_string(), "bar".to_string()];
+        let exs = encode_vector(&xs[..]).unwrap();
 
-    println!("original {:?}", xs);
-    let exs = encode_vector(&xs[..]).unwrap();
-    println!("encoded {:?}", exs);
+        // WARNING: Don't forget to specify the type here
+        let dxs = decode_vector::<String>(&exs[..]).unwrap();
 
-    // WARNING: Don't forget to specify the type here
-    // let dxs = decode_vector::<u16>(&exs[..]).unwrap();
-    let dxs = decode_vector::<String>(&exs[..]).unwrap();
-    println!("decoded {:?}", dxs);
+
+        assert_eq!(dxs, xs);
+    }
 }