Forgot to seek before garbage collection

This commit is contained in:
Yuriy Dupyn 2024-02-05 02:26:54 +01:00
parent f3fc67cbbc
commit c0a3ee08b8
2 changed files with 131 additions and 63 deletions

View file

@ -1,5 +1,6 @@
use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom}; use tokio::io::{AsyncReadExt, AsyncWriteExt, AsyncSeekExt, SeekFrom};
use tokio::fs::{File, OpenOptions}; use tokio::fs::{File, OpenOptions};
use tokio::fs;
use std::path::Path; use std::path::Path;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::collections::{BTreeMap, HashSet}; use std::collections::{BTreeMap, HashSet};
@ -17,7 +18,7 @@ use crate::store_header::StoreHeader;
use crate::storage_engine::{Store, FilePosition, Column, Result, StoreIndexes, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME}; use crate::storage_engine::{Store, FilePosition, Column, Result, StoreIndexes, ROWS_FILE_NAME, GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME};
use crate::index::Index; use crate::index::Index;
const GARBAGE_COLLECTION_TRIGGER: usize = 100;
// ===Concrete Cursors=== // ===Concrete Cursors===
pub struct ReadCursor<'a, T> { pub struct ReadCursor<'a, T> {
@ -327,12 +328,12 @@ pub trait CursorWithWriteStoreHeader<T>: CursorWithStoreHeader<T> + PrimitiveWri
pub trait CursorWithAccessToIndex<T>: CursorWithStoreHeader<T> { pub trait CursorWithAccessToIndex<T>: CursorWithStoreHeader<T> {
fn indexes(&mut self) -> &[Option<Index<T, FilePosition>>]; fn indexes(&mut self) -> &[Option<Index<T, FilePosition>>];
async fn index_lookup(&mut self, column: Column, k: &T) -> Result<Vec<EntryDetailed<T>>> async fn index_lookup(&mut self, column: Column, value: &T) -> Result<Vec<EntryDetailed<T>>>
where T: Encode + Decode + Ord + Send + Sync where T: Encode + Decode + Ord + Send + Sync
{ {
match &self.indexes()[column as usize] { match &self.indexes()[column as usize] {
Some(index) => { Some(index) => {
let file_positions = index.lookup(k).await?.unwrap_or_else(|| HashSet::new()); let file_positions = index.lookup(value).await?.unwrap_or_else(|| HashSet::new());
let mut entries: Vec<EntryDetailed<T>> = vec![]; let mut entries: Vec<EntryDetailed<T>> = vec![];
for &file_position in file_positions.iter() { for &file_position in file_positions.iter() {
match self.read_entry_at(file_position).await? { match self.read_entry_at(file_position).await? {
@ -601,9 +602,10 @@ impl <'cursor, T> WriteCursor<'cursor, T>
} }
// ===Deletion=== // ===Deletion===
pub async fn mark_deleted_at(&mut self, file_position: FilePosition) -> Result<()> pub async fn mark_deleted_at(&mut self, file_position: FilePosition, enable_garbage_collector: bool) -> Result<()>
where T: Encode + Decode + Ord + Send + Sync + Clone + Ord where T: Encode + Decode + Ord + Send + Sync + Clone + Ord
{ {
println!("MARKING {} as DELETED", file_position);
self.seek_to(file_position).await?; self.seek_to(file_position).await?;
let mut entry_header = self.read_entry_header().await?; let mut entry_header = self.read_entry_header().await?;
if entry_header.is_deleted { if entry_header.is_deleted {
@ -628,23 +630,59 @@ impl <'cursor, T> WriteCursor<'cursor, T>
} }
} }
if enable_garbage_collector {
self.attempt_garbage_collection_if_necessary().await?; self.attempt_garbage_collection_if_necessary().await?;
}
Ok(()) Ok(())
} }
} }
async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result<Option<EntryDetailed<T>>> async fn find_first_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T, enable_garbage_collector: bool) -> Result<Option<EntryDetailed<T>>>
where T: Encode + Decode + Ord + Send + Sync + Clone + Ord where T: Encode + Decode + Ord + Send + Sync + Clone
{ {
let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?; let maybe_entry = self.find_first_eq_bruteforce(column, t0).await?;
if let Some(entry) = maybe_entry { if let Some(entry) = maybe_entry {
self.mark_deleted_at(entry.file_position).await?; self.mark_deleted_at(entry.file_position, enable_garbage_collector).await?;
Ok(Some(entry)) Ok(Some(entry))
} else { } else {
Ok(maybe_entry) Ok(maybe_entry)
} }
} }
// Doesn't update indexes.
/// Deletes, one at a time, every entry whose `column` equals `t0`.
///
/// Each round delegates to `find_first_eq_bruteforce_and_delete` with the
/// garbage collector disabled, so this method never starts a collection
/// itself. The loop stops at the first round that reports no match.
///
/// Returns the number of rounds that found (and deleted) an entry.
///
/// NOTE(review): termination presumably relies on the brute-force search
/// skipping entries that are already marked as deleted — confirm.
async fn find_all_eq_bruteforce_and_delete(&mut self, column: Column, t0: &T) -> Result<usize>
where
    T: Encode + Decode + Ord + Send + Sync + Clone,
{
    let mut deleted = 0;
    loop {
        let removed = self.find_first_eq_bruteforce_and_delete(column, t0, false).await?;
        if removed.is_none() {
            break Ok(deleted);
        }
        deleted += 1;
    }
}
/// Deletes the entries whose `column` equals `value`.
///
/// If the header reports `column` as indexed, the candidates come from
/// `index_lookup` and each one is passed to `mark_deleted_at`; otherwise the
/// work is delegated to `find_all_eq_bruteforce_and_delete`. Individual
/// deletions always run with the garbage collector disabled. When
/// `enable_garbage_collector` is `true`, a single
/// `attempt_garbage_collection_if_necessary` call is made after all of them.
///
/// Returns the number of entries processed: the size of the index lookup
/// result on the indexed path, or the brute-force deletion count otherwise.
pub async fn delete_entries_where_eq(&mut self, column: Column, value: &T, enable_garbage_collector: bool) -> Result<usize>
where
    T: Encode + Decode + Ord + Send + Sync + Clone,
{
    let deleted = if !self.header().is_column_indexed(column) {
        println!("DELETION: BRUTE-FORCE LOOKUP");
        self.find_all_eq_bruteforce_and_delete(column, value).await?
    } else {
        println!("DELETION: INDEXED LOOKUP");
        let matches = self.index_lookup(column, value).await?;
        let total = matches.len();
        for matched in matches {
            // Collection is deferred until every match has been handled.
            self.mark_deleted_at(matched.file_position, false).await?;
        }
        total
    };

    if enable_garbage_collector {
        self.attempt_garbage_collection_if_necessary().await?;
    }
    Ok(deleted)
}
// ===Indexing=== // ===Indexing===
// WARNING: Assumes the column is NOT indexable. // WARNING: Assumes the column is NOT indexable.
pub async fn attach_index(&mut self, column: Column) -> Result<()> pub async fn attach_index(&mut self, column: Column) -> Result<()>
@ -671,15 +709,14 @@ impl <'cursor, T> WriteCursor<'cursor, T>
async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()> async fn attempt_garbage_collection_if_necessary(&mut self) -> Result<()>
where T: Send + Decode + Encode + Clone + Ord where T: Send + Decode + Encode + Clone + Ord
{ {
// TODO: What should be the policy? Counting size of garbage? Counting how many entries are if self.header.deleted_count > GARBAGE_COLLECTION_TRIGGER {
// garbage? println!("=======START GARBAGE COLLETOR====");
if self.header.deleted_count > 100 {
self.initiate_garbage_collection().await?; self.initiate_garbage_collection().await?;
} }
Ok(()) Ok(())
} }
async fn initiate_garbage_collection(&mut self) -> Result<usize> pub async fn initiate_garbage_collection(&mut self) -> Result<usize>
where T: Send + Decode + Encode + Clone + Ord where T: Send + Decode + Encode + Clone + Ord
{ {
let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?; let mut cursor_to_intermediate = self.spawn_cursor_to_intermediate_file().await?;
@ -698,6 +735,7 @@ impl <'cursor, T> WriteCursor<'cursor, T>
// We'll dump all alive entries into a new file. // We'll dump all alive entries into a new file.
let mut entries_deleted = 0; let mut entries_deleted = 0;
self.seek_to_start_of_data().await?;
{ {
while let Some(live_entry) = self.next_alive().await? { while let Some(live_entry) = self.next_alive().await? {
entries_deleted += 1; entries_deleted += 1;
@ -729,6 +767,14 @@ impl <'cursor, T> WriteCursor<'cursor, T>
self.file = cursor_to_intermediate.file; self.file = cursor_to_intermediate.file;
self.eof_file_position = cursor_to_intermediate.eof_file_position; self.eof_file_position = cursor_to_intermediate.eof_file_position;
// swap files on disk
// current file
let path_to_table = Path::new(&self.header.table_folder);
let path_to_rows = path_to_table.join(ROWS_FILE_NAME);
let path_to_intermediate_rows = path_to_table.join(GARBAGE_COLLECTION_INTERMEDIATE_ROWS_FILE_NAME);
fs::remove_file(path_to_rows.clone()).await?;
fs::rename(path_to_intermediate_rows, path_to_rows).await?;
Ok(entries_deleted) Ok(entries_deleted)
} }

View file

@ -82,6 +82,20 @@ async fn append_bunch_of_entries(store: &mut Store<Data>) -> Result<()> {
Ok(()) Ok(())
} }
// Scratch driver for experimenting with deletion and garbage collection on `store`.
// Currently it only opens a write cursor: every deletion / collection call below is
// commented out, so `column` and `value` are unused and nothing is deleted or
// collected by this function. A failure to open the cursor is converted with
// `to_io_or_panic` and propagated to the caller.
async fn test_garbage_collection(store: &mut Store<Data>) -> Result<()> {
let mut cursor = store.write_cursor().await.map_err(|e| e.to_io_or_panic())?;
// Planned steps (all disabled for now):
// 1. mark a bunch of entries as deleted
let column = 0;
let value = 1;
// cursor.delete_entries_where_eq(column, &value, true).await.map_err(|e| e.to_io_or_panic())?;
// let value = 50;
// NOTE(review): the call below omits the `enable_garbage_collector` argument that
// `delete_entries_where_eq` takes (the call above passes it) — add it before re-enabling.
// cursor.delete_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?;
// 2. run the garbage collector explicitly
// cursor.initiate_garbage_collection().await.map_err(|e| e.to_io_or_panic())?;
Ok(())
}
#[tokio::main] #[tokio::main]
async fn main() -> Result<()> { async fn main() -> Result<()> {
println!("STOOOOOOOOOOOORAAAAAAAAAAAGE"); println!("STOOOOOOOOOOOORAAAAAAAAAAAGE");
@ -117,65 +131,73 @@ async fn main() -> Result<()> {
cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?; cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?;
} }
{ test_garbage_collection(&mut store).await?;
let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
}
{ {
let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
let column = 2; cursor.read_entries().await.map_err(|e| e.to_io_or_panic())?;
let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
println!("{:?}", x);
} }
{
let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
let column = 0;
let value = 1;
let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?;
println!("ARE INDEXES WORKING???");
println!("{:?}", entries);
}
{ // {
let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
let column = 1; // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
let value = 2; // println!("{:?}", x);
let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
println!("ARE INDEXES WORKING???"); // println!("{:?}", x);
println!("{:?}", entries); // let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
} // println!("{:?}", x);
// let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
// println!("{:?}", x);
// let x = cursor.next().await.map_err(|e| e.to_io_or_panic())?;
// println!("{:?}", x);
// }
{ // {
let column = 1; // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
// println!("BUILDING AN INDEX"); // let column = 2;
// store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?; // let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
// println!("INDEX BUILT!"); // println!("{:?}", x);
// let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
// println!("{:?}", x);
// let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
// println!("{:?}", x);
// let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
// println!("{:?}", x);
// let x = cursor.next_at_column(column).await.map_err(|e| e.to_io_or_panic())?;
// println!("{:?}", x);
// }
let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?; // {
let value = 2; // let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?; // let column = 0;
println!("ARE INDEXES WORKING???"); // let value = 1;
println!("{:?}", entries); // let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?;
} // println!("ARE INDEXES WORKING???");
// println!("{:?}", entries);
// }
// {
// let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
// let column = 1;
// let value = 2;
// let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?;
// println!("ARE INDEXES WORKING???");
// println!("{:?}", entries);
// }
// {
// let column = 1;
// // println!("BUILDING AN INDEX");
// // store.attach_index(column).await.map_err(|e| e.to_io_or_panic())?;
// // println!("INDEX BUILT!");
// let mut cursor = store.read_cursor().await.map_err(|e| e.to_io_or_panic())?;
// let value = 2;
// let entries = cursor.select_entries_where_eq(column, &value).await.map_err(|e| e.to_io_or_panic())?;
// println!("ARE INDEXES WORKING???");
// println!("{:?}", entries);
// }
// { // {