Store the name of a mounted bucket in block's metadata (#1208)
The `x-amz-meta-source-bucket-name` field of a cache block was intended to store the name of the mounted bucket (the source bucket), but it currently stores the name of the cache bucket instead.
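
As an illustration (this is not code from this commit), here is a minimal sketch of the check this metadata is meant to enable: a cached block should only be trusted if its `x-amz-meta-source-bucket-name` entry matches the mounted bucket. The function name `block_matches_source` and the plain `HashMap` of headers are hypothetical stand-ins for the real client types.

```rust
use std::collections::HashMap;

/// Hypothetical check mirroring what the block metadata is for: a cached block is
/// only usable if it was written for the same mounted (source) bucket.
fn block_matches_source(headers: &HashMap<String, String>, expected_source_bucket: &str) -> bool {
    headers
        .get("x-amz-meta-source-bucket-name")
        .map(|name| name.as_str() == expected_source_bucket)
        .unwrap_or(false)
}

fn main() {
    let mut headers = HashMap::new();
    // After this fix the entry carries the mounted bucket's name; before, it held the
    // cache bucket's name, so a check like this could not tell the two apart.
    headers.insert(
        "x-amz-meta-source-bucket-name".to_string(),
        "my-mounted-bucket".to_string(),
    );
    assert!(block_matches_source(&headers, "my-mounted-bucket"));
    assert!(!block_matches_source(&headers, "my-express-cache-bucket"));
}
```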

### Does this change impact existing behavior?

Yes. This change bumps the version of the block schema, so blocks written by previous versions of Mountpoint will no longer be accessible from the cache (reads of those blocks become cache misses).
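
To illustrate why the bump makes older blocks unreachable rather than erroring: the cache key prefix is derived from the source bucket name and block size and, assuming the schema version is folded in as well, `V1` and `V2` blocks end up under different prefixes, so a `V2` reader never looks up `V1` keys at all. A rough sketch of that idea using the `sha2` and `hex` crates (`build_prefix_sketch` is hypothetical; the real `build_prefix` may hash different inputs or encode them differently):

```rust
use sha2::{Digest, Sha256};

/// Sketch: a cache key prefix derived from the schema version, the mounted (source)
/// bucket name, and the block size. Changing any of these inputs changes every key,
/// so blocks written under "V1" are simply never looked up by a "V2" reader.
fn build_prefix_sketch(cache_version: &str, source_bucket_name: &str, block_size: u64) -> String {
    let hash = Sha256::new()
        .chain_update(cache_version.as_bytes())
        .chain_update(source_bucket_name.as_bytes())
        .chain_update(block_size.to_be_bytes())
        .finalize();
    hex::encode(hash)
}

fn main() {
    let v1 = build_prefix_sketch("V1", "my-mounted-bucket", 1024 * 1024);
    let v2 = build_prefix_sketch("V2", "my-mounted-bucket", 1024 * 1024);
    assert_ne!(v1, v2); // old V1 blocks sit under a different prefix: a cache miss for V2
    println!("V1 prefix: {v1}");
    println!("V2 prefix: {v2}");
}
```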

### Does this change need a changelog entry?

Yes.

---

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license and I agree to the terms of
the [Developer Certificate of Origin
(DCO)](https://developercertificate.org/).

---------

Signed-off-by: Vlad Volodkin <vlaad@amazon.com>
Signed-off-by: Volodkin Vladislav <vladvolodkin@gmail.com>
Co-authored-by: Vlad Volodkin <vlaad@amazon.com>
Co-authored-by: Alessandro Passaro <alessandro.passaro@gmail.com>
3 people authored Jan 3, 2025
1 parent 641f613 commit 4284e64
Showing 2 changed files with 46 additions and 7 deletions.
1 change: 1 addition & 0 deletions mountpoint-s3/CHANGELOG.md
@@ -10,6 +10,7 @@
### Other changes

* Fix an issue where an interrupt during `readdir` syscall leads to an error. ([#965](https://github.com/awslabs/mountpoint-s3/pull/965))
* Fix an issue where the source bucket of a shared cache block was not correctly validated ([#1208](https://github.com/awslabs/mountpoint-s3/pull/1208))

## v1.13.0 (December 2, 2024)

52 changes: 45 additions & 7 deletions mountpoint-s3/src/data_cache/express_data_cache.rs
@@ -19,7 +19,7 @@ use tracing::Instrument;

use mountpoint_s3_client::checksums::crc32c_from_base64;

const CACHE_VERSION: &str = "V1";
const CACHE_VERSION: &str = "V2";

/// Configuration for a [ExpressDataCache].
#[derive(Debug)]
@@ -46,6 +46,8 @@ pub struct ExpressDataCache<Client: ObjectClient> {
config: ExpressDataCacheConfig,
/// Name of the S3 Express bucket to store the blocks.
bucket_name: String,
/// Name of the mounted bucket.
source_bucket_name: String,
}

impl<S, C> From<ObjectClientError<S, C>> for DataCacheError
@@ -69,6 +71,7 @@ where
prefix: build_prefix(source_bucket_name, config.block_size),
config,
bucket_name: bucket_name.to_owned(),
source_bucket_name: source_bucket_name.to_owned(),
}
}

@@ -80,7 +83,7 @@
// calculates the prefix.
let data = format!(
"source_bucket={}\nblock_size={}",
self.bucket_name, self.config.block_size
self.source_bucket_name, self.config.block_size
);

// put_object is sufficient for validating cache, as S3 Directory buckets only support
@@ -185,7 +188,7 @@
.ok_or_else(|| DataCacheError::InvalidBlockChecksum)?;
let crc32c = crc32c_from_base64(&crc32c_b64).map_err(|_| DataCacheError::InvalidBlockChecksum)?;

let block_metadata = BlockMetadata::new(block_idx, block_offset, cache_key, &self.bucket_name, crc32c);
let block_metadata = BlockMetadata::new(block_idx, block_offset, cache_key, &self.source_bucket_name, crc32c);
block_metadata.validate_object_metadata(&object_metadata)?;

Ok(Some(ChecksummedBytes::new_from_inner_data(buffer, crc32c)))
@@ -211,7 +214,8 @@ where
let object_key = get_s3_key(&self.prefix, &cache_key, block_idx);

let (data, checksum) = bytes.into_inner().map_err(|_| DataCacheError::InvalidBlockContent)?;
let block_metadata = BlockMetadata::new(block_idx, block_offset, &cache_key, &self.bucket_name, checksum);
let block_metadata =
BlockMetadata::new(block_idx, block_offset, &cache_key, &self.source_bucket_name, checksum);

self.client
.put_object_single(
@@ -300,7 +304,7 @@
/// wanting to get (and avoid collisions with the key).
/// On miss, bypass the cache and go to the main data source.
#[cfg_attr(test, derive(proptest_derive::Arbitrary))]
#[derive(Debug, PartialEq, Eq)]
#[derive(Clone, Debug, PartialEq, Eq)]
struct BlockMetadata {
block_idx: BlockIndex,
block_offset: u64,
@@ -600,13 +604,28 @@ mod tests {
);

let (data, checksum) = data.into_inner().unwrap();
let block_metadata = BlockMetadata::new(0, 0, &cache_key, bucket, checksum);
let block_metadata = BlockMetadata::new(0, 0, &cache_key, source_bucket, checksum);
let put_params = block_metadata.to_put_object_params();

let (data_2, checksum_2) = data_2.into_inner().unwrap();
let block_metadata_2 = BlockMetadata::new(0, 0, &cache_key, bucket, checksum_2);
let block_metadata_2 = BlockMetadata::new(0, 0, &cache_key, source_bucket, checksum_2);
let put_params_2 = block_metadata_2.to_put_object_params();

// Store with correct metadata and expect a successful get_block
client
.put_object_single(bucket, &object_key, &put_params, data.clone())
.in_current_span()
.await
.unwrap();
let (received_data, _) = cache
.get_block(&cache_key, 0, 0, data.len())
.await
.expect("get should succeed with intact metadata")
.expect("block should be non-empty")
.into_inner()
.expect("block should be valid");
assert_eq!(received_data, data);

// Remove the checksum when writing.
client
.put_object_single(bucket, &object_key, &put_params.clone().checksum(None), data.clone())
@@ -660,6 +679,25 @@
.expect_err("cache should return error if object metadata doesn't match data");
assert!(matches!(err, DataCacheError::InvalidBlockHeader(_)));

// Write data with object metadata header for object from a different bucket
let mut corrupted_metadata = block_metadata.clone();
corrupted_metadata.source_bucket_name = bucket.to_owned();
client
.put_object_single(
bucket,
&object_key,
&corrupted_metadata.to_put_object_params(),
data.clone(),
)
.in_current_span()
.await
.unwrap();
let err = cache
.get_block(&cache_key, 0, 0, data.len())
.await
.expect_err("cache should return error if source bucket does not match");
assert!(matches!(err, DataCacheError::InvalidBlockHeader(_)));

// Get data that's not been written yet
let result = cache
.get_block(&cache_key_non_existent, 0, 0, data.len())
