Skip to content

Commit

Permalink
Start refactoring file data reader (#85)
Browse files Browse the repository at this point in the history
  • Loading branch information
koolkdev authored Nov 23, 2024
1 parent 6a9e85a commit 1ceaac2
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 106 deletions.
233 changes: 127 additions & 106 deletions src/file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
#include "file.h"

#include <cassert>
#include <ranges>
#include "block.h"
#include "quota_area.h"
#include "structs.h"

struct FileDataChunkInfo {
struct FileDataRef {
std::shared_ptr<Block> data_block;
size_t offset_in_block;
size_t size;
Expand All @@ -30,38 +31,48 @@ class File::DataCategoryReader {
public:
DataCategoryReader(const std::shared_ptr<File>& file) : file_(file) {}
virtual ~DataCategoryReader() {}
virtual size_t GetAttributesMetadataSize() const = 0;
virtual FileDataChunkInfo GetFileDataChunkInfo(size_t offset, size_t size) = 0;
size_t Read(std::byte* data, size_t offset, size_t size) {
auto chunk_info = GetFileDataChunkInfo(offset, size);
auto data_begin = chunk_info.data_block->data().begin();
std::copy(data_begin + chunk_info.offset_in_block, data_begin + chunk_info.offset_in_block + chunk_info.size, data);
return chunk_info.size;

virtual size_t GetMetadataSize() const = 0;
virtual size_t GetMetadataItemsCount() const = 0;

virtual std::span<const std::byte> GetData(size_t offset, size_t size) = 0;
virtual std::span<std::byte> GetMutableData(size_t offset, size_t size) = 0;

size_t Read(std::byte* output, size_t offset, size_t size) {
auto data = GetData(offset, size);
std::copy(data.begin(), data.end(), output);
return data.size();
}
size_t Write(const std::byte* data, size_t offset, size_t size) {
auto chunk_info = GetFileDataChunkInfo(offset, size);
auto data_begin = chunk_info.data_block->mutable_data().begin();
std::copy(data, data + chunk_info.size, data_begin + chunk_info.offset_in_block);
chunk_info.data_block->Flush();
return chunk_info.size;

size_t Write(const std::byte* input, size_t offset, size_t size) {
auto data = GetMutableData(offset, size);
std::copy(input, input + data.size(), data.begin());
return data.size();
}

virtual void Resize(size_t new_size) = 0;

protected:
std::shared_ptr<File> file_;

size_t GetAttributesMetadataOffset() const {
return file_->metadata_block()->to_offset(file_->metadata()) + file_->metadata()->size();
}
size_t GetAttributesMetadataEndOffset() const {
return file_->metadata_block()->to_offset(file_->metadata()) +
align_to_power_of_2(file_->metadata()->size() + GetAttributesMetadataSize());
}
const std::byte* GetAttributesMetadataEnd() const {
// We can't do get_object<std::byte> because it might point to data.end(), so in debug it will cause an
// error
return file_->metadata_block()->data().data() + GetAttributesMetadataEndOffset();
template <typename T, bool AlignToEnd = false>
auto GetMetadata() const {
const auto count = GetMetadataItemsCount();
const auto file_metadata_size = sizeof(T) * count;
const auto base_metadata_size = file_->metadata()->size();
auto* metadata = [&]() {
if constexpr (std::is_const_v<T>) {
return reinterpret_cast<const std::byte*>(file_->metadata());
} else {
return reinterpret_cast<std::byte*>(file_->mutable_metadata());
}
}();
if constexpr (AlignToEnd) {
auto* end = metadata + align_to_power_of_2(base_metadata_size + file_metadata_size);
return std::span<T>{reinterpret_cast<T*>(end - file_metadata_size), count} | std::views::reverse;
} else {
return std::span<T>{reinterpret_cast<T*>(metadata + base_metadata_size), count};
}
}
};

Expand All @@ -70,13 +81,17 @@ class File::DataCategory0Reader : public File::DataCategoryReader {
public:
DataCategory0Reader(const std::shared_ptr<File>& file) : DataCategoryReader(file) {}

virtual size_t GetAttributesMetadataSize() const { return file_->metadata()->size_on_disk.value(); }
size_t GetMetadataSize() const override { return GetMetadataItemsCount() * sizeof(std::byte); }
size_t GetMetadataItemsCount() const override { return file_->metadata()->size_on_disk.value(); }

virtual FileDataChunkInfo GetFileDataChunkInfo(size_t offset, size_t size) {
return FileDataChunkInfo{file_->metadata_block(), GetAttributesMetadataOffset() + offset, size};
std::span<const std::byte> GetData(size_t offset, size_t size) override {
return GetMetadata<const std::byte>().subspan(offset, size);
}
std::span<std::byte> GetMutableData(size_t offset, size_t size) override {
return GetMetadata<std::byte>().subspan(offset, size);
}

virtual void Resize(size_t new_size) {
void Resize(size_t new_size) override {
// Just update the attribute, the data in the metadata block
file_->mutable_metadata()->file_size = static_cast<uint32_t>(new_size);
}
Expand All @@ -86,28 +101,44 @@ class File::RegularDataCategoryReader : public File::DataCategoryReader {
public:
RegularDataCategoryReader(const std::shared_ptr<File>& file) : DataCategoryReader(file) {}

virtual size_t GetAttributesMetadataSize() const {
// round up dividation
size_t data_blocks_count = ((file_->metadata()->size_on_disk.value() - 1) >> GetDataBlockSize()) + 1;
return sizeof(DataBlockMetadata) * data_blocks_count;
size_t GetMetadataSize() const override { return GetMetadataItemsCount() * sizeof(DataBlockMetadata); }
size_t GetMetadataItemsCount() const override {
return div_ceil_pow2(file_->metadata()->size_on_disk.value(), GetDataBlockSize());
}

std::span<const std::byte> GetData(size_t offset, size_t size) override {
auto data_ref = GetFileDataRef(offset, size);
return data_ref.data_block->data().subspan(data_ref.offset_in_block, data_ref.size);
}

std::span<std::byte> GetMutableData(size_t offset, size_t size) override {
auto data_ref = GetFileDataRef(offset, size);
return data_ref.data_block->mutable_data().subspan(data_ref.offset_in_block, data_ref.size);
}

virtual FileDataChunkInfo GetFileDataChunkInfo(size_t offset, size_t size) {
auto blocks_list = reinterpret_cast<const DataBlockMetadata*>(GetAttributesMetadataEnd());
int64_t block_index = offset >> GetDataBlockSize();
size_t offset_in_block = offset & ((1 << GetDataBlockSize()) - 1);
auto hash_block = file_->metadata_block();
uint32_t block_offset = static_cast<uint32_t>((offset >> GetDataBlockSize()) << GetDataBlockSize());

LoadDataBlock(blocks_list[-block_index - 1].block_number.value(),
static_cast<uint32_t>(
std::min(1U << GetDataBlockSize(), file_->metadata()->file_size.value() - block_offset)),
{hash_block, hash_block->to_offset(&blocks_list[-block_index - 1].hash)});
FileDataRef GetDataFromBlock(uint32_t block_number,
size_t block_offset,
size_t offset_in_block,
size_t size,
const std::shared_ptr<Block>& hash_block,
const uint8_be_t* hash) {
LoadDataBlock(block_number,
std::min(uint32_t{1} << GetDataBlockSize(),
static_cast<uint32_t>(file_->metadata()->file_size.value() - block_offset)),
{hash_block, hash_block->to_offset(hash)});
size = std::min(size, current_data_block->size() - offset_in_block);
return FileDataChunkInfo{current_data_block, offset_in_block, size};
return {current_data_block, offset_in_block, size};
}

virtual void Resize(size_t new_size) {
virtual FileDataRef GetFileDataRef(size_t offset, size_t size) {
auto blocks_list = GetMetadata<const DataBlockMetadata, true>();
auto [block_index, offset_in_block] = div_pow2(offset, GetDataBlockSize());
auto block_offset = floor_pow2(offset, GetDataBlockSize());
return GetDataFromBlock(blocks_list[block_index].block_number.value(), block_offset, offset_in_block, size,
file_->metadata_block(), blocks_list[block_index].hash);
}

void Resize(size_t new_size) override {
size_t old_size = file_->metadata()->file_size.value();
while (old_size != new_size) {
std::shared_ptr<Block> current_block;
Expand All @@ -116,26 +147,26 @@ class File::RegularDataCategoryReader : public File::DataCategoryReader {
// Just update last block
if (new_size > 0) {
// Minus 1 because if it is right at the end of the block, we will get the next block
auto chunk_info = GetFileDataChunkInfo(new_size - 1, 1);
auto chunk_info = GetFileDataRef(new_size - 1, 1);
current_block = chunk_info.data_block;
new_block_size = std::min(chunk_info.offset_in_block + 1, static_cast<size_t>(1U) << GetDataBlockSize());
new_block_size = std::min(chunk_info.offset_in_block + 1, size_t{1} << GetDataBlockSize());
}
old_size = new_size;
} else {
if (old_size & ((1 << GetDataBlockSize()) - 1)) {
// We need to incrase the size of the last block
// Minus 1 because if it is right at the end of the block, we will get the next block
auto chunk_info = GetFileDataChunkInfo(old_size - 1, 1);
auto chunk_info = GetFileDataRef(old_size - 1, 1);
current_block = chunk_info.data_block;
new_block_size = std::min(chunk_info.offset_in_block + 1 + (new_size - old_size), static_cast<size_t>(1U)
<< GetDataBlockSize());
new_block_size =
std::min(chunk_info.offset_in_block + 1 + (new_size - old_size), size_t{1} << GetDataBlockSize());
old_size += new_block_size - (chunk_info.offset_in_block + 1);
} else {
// Open new block, the size of the loaded block will be 0
auto chunk_info = GetFileDataChunkInfo(old_size, 0);
auto chunk_info = GetFileDataRef(old_size, 0);
current_block = chunk_info.data_block;
assert(chunk_info.offset_in_block == 0);
new_block_size = std::min(new_size - old_size, static_cast<size_t>(1U) << GetDataBlockSize());
new_block_size = std::min(new_size - old_size, size_t{1} << GetDataBlockSize());
old_size += new_block_size;
}
}
Expand All @@ -147,18 +178,16 @@ class File::RegularDataCategoryReader : public File::DataCategoryReader {
}

protected:
virtual BlockType GetBlocksLog2CountInDataBlock() const = 0;
virtual size_t GetDataBlockSize() const {
return file_->quota()->block_size_log2() + log2_size(GetBlocksLog2CountInDataBlock());
}
virtual BlockType GetDataBlockType() const = 0;
virtual size_t GetDataBlockSize() const { return file_->quota()->block_size_log2() + log2_size(GetDataBlockType()); }
std::shared_ptr<Block> current_data_block;

void LoadDataBlock(uint32_t block_number, uint32_t data_size, Block::HashRef data_hash) {
if (current_data_block &&
file_->quota()->to_area_block_number(current_data_block->physical_block_number()) == block_number)
return;
auto block = file_->quota()->LoadDataBlock(block_number, static_cast<BlockSize>(file_->quota()->block_size_log2()),
GetBlocksLog2CountInDataBlock(), data_size, std::move(data_hash),
GetDataBlockType(), data_size, std::move(data_hash),
!(file_->metadata()->flags.value() & EntryMetadata::UNENCRYPTED_FILE));
if (!block.has_value())
throw WfsException(WfsError::kFileDataCorrupted);
Expand All @@ -173,7 +202,7 @@ class File::DataCategory1Reader : public File::RegularDataCategoryReader {
DataCategory1Reader(const std::shared_ptr<File>& file) : RegularDataCategoryReader(file) {}

protected:
virtual BlockType GetBlocksLog2CountInDataBlock() const { return BlockType::Single; }
BlockType GetDataBlockType() const override { return BlockType::Single; }
};

// Category 2 - File data in large block (8 regular blocks), in the attribute metadata there is a reversed list of block
Expand All @@ -183,7 +212,7 @@ class File::DataCategory2Reader : public File::RegularDataCategoryReader {
DataCategory2Reader(const std::shared_ptr<File>& file) : RegularDataCategoryReader(file) {}

protected:
virtual BlockType GetBlocksLog2CountInDataBlock() const { return BlockType::Large; }
BlockType GetDataBlockType() const override { return BlockType::Large; }
};

// Category 3 - File data in clusters of large block (8 large blocksblocks), in the attribute metadata there is a
Expand All @@ -193,45 +222,35 @@ class File::DataCategory3Reader : public File::DataCategory2Reader {
public:
DataCategory3Reader(const std::shared_ptr<File>& file) : DataCategory2Reader(file) {}

virtual size_t GetAttributesMetadataSize() const {
size_t data_blocks_clusters_count = ((file_->metadata()->size_on_disk.value() - 1) >> ClusterDataLog2Size()) + 1;
return sizeof(DataBlocksClusterMetadata) * data_blocks_clusters_count;
size_t GetMetadataSize() const override { return GetMetadataItemsCount() * sizeof(DataBlocksClusterMetadata); }
size_t GetMetadataItemsCount() const override {
return div_ceil_pow2(file_->metadata()->size_on_disk.value(), ClusterDataLog2Size());
}

virtual FileDataChunkInfo GetFileDataChunkInfo(size_t offset, size_t size) {
return GetFileDataChunkInfoFromClustersList(
offset, offset, size, file_->metadata_block(),
reinterpret_cast<const DataBlocksClusterMetadata*>(GetAttributesMetadataEnd()), true);
FileDataRef GetFileDataRef(size_t offset, size_t size) override {
return GetFileDataRefFromClustersList(
/*cluster_list_start=*/0, offset, size, file_->metadata_block(),
GetMetadata<const DataBlocksClusterMetadata, true>());
}

protected:
FileDataChunkInfo GetFileDataChunkInfoFromClustersList(size_t offset,
size_t original_offset,
size_t size,
const std::shared_ptr<Block>& metadata_block,
const DataBlocksClusterMetadata* clusters_list,
bool reverse) {
int64_t cluster_index = offset >> ClusterDataLog2Size();
size_t offset_in_cluster = offset & ((1 << ClusterDataLog2Size()) - 1);
size_t block_index = offset_in_cluster >> GetDataBlockSize();
size_t offset_in_block = offset_in_cluster & ((1 << GetDataBlockSize()) - 1);
uint32_t block_offset = static_cast<uint32_t>((original_offset >> GetDataBlockSize()) << GetDataBlockSize());

const DataBlocksClusterMetadata* cluster = NULL;
if (reverse)
cluster = &clusters_list[-cluster_index - 1];
else
cluster = &clusters_list[cluster_index];
LoadDataBlock(cluster->block_number.value() +
static_cast<uint32_t>(block_index << log2_size(GetBlocksLog2CountInDataBlock())),
static_cast<uint32_t>(
std::min(1U << GetDataBlockSize(), file_->metadata()->file_size.value() - block_offset)),
{metadata_block, metadata_block->to_offset(&cluster->hash[block_index])});
size = std::min(size, current_data_block->size() - offset_in_block);
return FileDataChunkInfo{current_data_block, offset_in_block, size};
template <typename ClusterArray>
FileDataRef GetFileDataRefFromClustersList(size_t cluster_list_start,
size_t offset,
size_t size,
const std::shared_ptr<Block>& metadata_block,
ClusterArray&& clusters_list) {
auto offset_in_cluster_list = offset - (cluster_list_start << ClusterDataLog2Size());
auto [cluster_index, offset_in_cluster] = div_pow2(offset_in_cluster_list, ClusterDataLog2Size());
auto [block_index, offset_in_block] = div_pow2(offset_in_cluster, GetDataBlockSize());
auto block_offset = floor_pow2(offset, GetDataBlockSize());
return GetDataFromBlock(clusters_list[cluster_index].block_number.value() +
static_cast<uint32_t>(block_index << log2_size(GetDataBlockType())),
block_offset, offset_in_block, size, metadata_block,
clusters_list[cluster_index].hash[block_index]);
}

size_t ClusterDataLog2Size() const { return GetDataBlockSize() + 3; }
size_t ClusterDataLog2Size() const { return file_->quota()->block_size_log2() + log2_size(BlockType::Cluster); }
};

// Category 4 - File data in clusters of large block (8 large blocksblocks), in the attribute metadata there is list of
Expand All @@ -241,20 +260,22 @@ class File::DataCategory4Reader : public File::DataCategory3Reader {
public:
DataCategory4Reader(const std::shared_ptr<File>& file) : DataCategory3Reader(file) {}

virtual size_t GetAttributesMetadataSize() const {
size_t data_blocks_clusters_count = ((file_->metadata()->size_on_disk.value() - 1) >> ClusterDataLog2Size()) + 1;
size_t blocks_count = ((data_blocks_clusters_count - 1) / ClustersInBlock()) + 1;
return sizeof(uint32_be_t) * blocks_count;
size_t GetMetadataSize() const override { return GetMetadataItemsCount() * sizeof(uint32_be_t); }
size_t GetMetadataItemsCount() const override {
size_t data_blocks_clusters_count = div_ceil_pow2(file_->metadata()->size_on_disk.value(), ClusterDataLog2Size());
return div_ceil(data_blocks_clusters_count, ClustersInBlock());
}

virtual FileDataChunkInfo GetFileDataChunkInfo(size_t offset, size_t size) {
auto blocks_list = reinterpret_cast<const uint32_be_t*>(GetAttributesMetadataEnd());
int64_t block_index = offset / (ClustersInBlock() << ClusterDataLog2Size());
size_t offset_in_block = offset % (ClustersInBlock() << ClusterDataLog2Size());
LoadMetadataBlock(blocks_list[-block_index - 1].value());
return GetFileDataChunkInfoFromClustersList(
offset_in_block, offset, size, current_metadata_block,
current_metadata_block->get_object<DataBlocksClusterMetadata>(sizeof(MetadataBlockHeader)), false);
FileDataRef GetFileDataRef(size_t offset, size_t size) override {
auto blocks_list = GetMetadata<const uint32_be_t, true>();
auto cluster_index = offset >> ClusterDataLog2Size();
int64_t block_index = cluster_index / ClustersInBlock();
LoadMetadataBlock(blocks_list[block_index].value());
return GetFileDataRefFromClustersList(
block_index * ClustersInBlock(), offset, size, current_metadata_block,
std::span<const DataBlocksClusterMetadata>{
current_metadata_block->get_object<DataBlocksClusterMetadata>(sizeof(MetadataBlockHeader)),
ClustersInBlock()});
}

protected:
Expand All @@ -273,7 +294,7 @@ class File::DataCategory4Reader : public File::DataCategory3Reader {
size_t ClustersInBlock() const {
size_t clusters_in_block =
(file_->quota()->block_size() - sizeof(MetadataBlockHeader)) / sizeof(DataBlocksClusterMetadata);
clusters_in_block = std::min(clusters_in_block, static_cast<size_t>(48));
clusters_in_block = std::min(clusters_in_block, size_t{48});
return clusters_in_block;
}
};
Expand Down Expand Up @@ -375,5 +396,5 @@ boost::iostreams::stream_offset File::file_device::seek(boost::iostreams::stream
std::streamsize File::file_device::optimal_buffer_size() const {
// Max block size. TODO: By category
// TODO: The pback_buffer_size, which is actually used, is 0x10004, fix it
return size_t{1} << (log2_size(BlockSize::Logical) + log2_size(BlockType::Cluster));
return std::streamsize{1} << (log2_size(BlockSize::Logical) + log2_size(BlockType::Cluster));
}
12 changes: 12 additions & 0 deletions src/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,15 @@ inline size_t align_to_power_of_2(size_t size) {
inline size_t div_ceil(size_t n, size_t div) {
return (n + div - 1) / div;
}

inline size_t div_ceil_pow2(size_t n, size_t pow) {
return (n + ((1ULL << pow) - 1)) >> pow;
}

inline std::pair<size_t, size_t> div_pow2(size_t n, size_t pow) {
return {n >> pow, n & ((1 << pow) - 1)};
}

inline size_t floor_pow2(size_t n, size_t pow) {
return (n >> pow) << pow;
}

0 comments on commit 1ceaac2

Please sign in to comment.