Skip to content

Commit

Permalink
Improve handling out-of-RAM errors during Qdrant startup (qdrant#1777)
Browse files Browse the repository at this point in the history
* WIP: Start working on out-of-RAM errors handling [skip ci]

* Implement basic handling of out-of-RAM errors during Qdrant startup

* Try to fix CI fail by allowing both V1 and V2 cgroups

* Try to fix CI fail by improving cgroups handling

* Fix cgroups path detection/handling (+ some minor stylistic changes)

* fixup! Fix cgroups path detection/handling (+ some minor stylistic changes)

* Add test

* Enable low RAM test

* fixup! Add test

* free memory checks

* rm unused function

* Oom fallback script (qdrant#1809)

* add recover mode in qdrant + script for handelling OOM

* fix clippy

* reformat entrypoint.sh

* fix test

* add logging to test

* fix test

* fix test

---------

Co-authored-by: Andrey Vasnetsov <andrey@vasnetsov.com>
  • Loading branch information
ffuugoo and generall committed May 17, 2023
1 parent 8687d28 commit 93d784f
Show file tree
Hide file tree
Showing 27 changed files with 649 additions and 63 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
tests/storage-compat/storage.tar.bz2 filter=lfs diff=lfs merge=lfs -text
tests/storage-compat/collection.snapshot.gz filter=lfs diff=lfs merge=lfs -text
tests/low-ram/storage.tar.xz filter=lfs diff=lfs merge=lfs -text
21 changes: 21 additions & 0 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,24 @@ jobs:
run: ./tests/grpc_consistency_check.sh
- name: OpenAPI file consistency check
run: ./tests/openapi_consistency_check.sh


test-low-ram:
runs-on: ubuntu-latest

steps:
- name: Install dependencies
run: sudo apt-get install clang protobuf-compiler git-lfs jq
- name: Setup git-lfs
run: git lfs install
- name: Install minimal stable
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
- uses: Swatinem/rust-cache@v2
- uses: actions/checkout@v3
- name: Run low RAM test
working-directory: ./tests/low-ram
shell: bash
run: ./low-ram.sh
93 changes: 92 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ RUN mkdir -p ${APP}

COPY --from=builder /qdrant/qdrant ${APP}/qdrant
COPY --from=builder /qdrant/config ${APP}/config
COPY --from=builder /qdrant/tools/entrypoint.sh ${APP}/entrypoint.sh

WORKDIR ${APP}

CMD ["./qdrant"]
CMD ["./entrypoint.sh"]
6 changes: 5 additions & 1 deletion lib/collection/src/operations/shared_storage_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ const DEFAULT_UPDATE_QUEUE_SIZE_LISTENER: usize = 10_000;
/// Storage configuration shared between all collections.
/// Represents a per-node configuration, which might be changes with restart.
/// Vales of this struct are not persisted.
#[derive(Copy, Clone, Debug)]
#[derive(Clone, Debug)]
pub struct SharedStorageConfig {
pub update_queue_size: usize,
pub node_type: NodeType,
pub handle_collection_load_errors: bool,
pub recovery_mode: Option<String>,
}

impl Default for SharedStorageConfig {
Expand All @@ -19,6 +20,7 @@ impl Default for SharedStorageConfig {
update_queue_size: DEFAULT_UPDATE_QUEUE_SIZE,
node_type: Default::default(),
handle_collection_load_errors: false,
recovery_mode: None,
}
}
}
Expand All @@ -28,6 +30,7 @@ impl SharedStorageConfig {
update_queue_size: Option<usize>,
node_type: NodeType,
handle_collection_load_errors: bool,
recovery_mode: Option<String>,
) -> Self {
let update_queue_size = update_queue_size.unwrap_or(match node_type {
NodeType::Normal => DEFAULT_UPDATE_QUEUE_SIZE,
Expand All @@ -38,6 +41,7 @@ impl SharedStorageConfig {
update_queue_size,
node_type,
handle_collection_load_errors,
recovery_mode,
}
}
}
5 changes: 5 additions & 0 deletions lib/collection/src/operations/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ pub enum CollectionError {
},
#[error("Remote shard on {peer_id} failed during forward proxy operation: {error}")]
ForwardProxyError { peer_id: PeerId, error: Box<Self> },
#[error("Out of memory, free: {free}, {description}")]
OutOfMemory { description: String, free: u64 },
}

impl CollectionError {
Expand Down Expand Up @@ -475,6 +477,9 @@ impl From<OperationError> for CollectionError {
OperationError::TypeInferenceError { .. } => Self::BadInput {
description: format!("{err}"),
},
OperationError::OutOfMemory { description, free } => {
Self::OutOfMemory { description, free }
}
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion lib/collection/src/shards/local_shard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,10 @@ impl LocalShard {

return Err(err.clone());
}

Err(err @ CollectionError::OutOfMemory { .. }) => {
log::error!("{err}");
return Err(err.clone());
}
Err(err) => log::error!("{err}"),
Ok(_) => (),
}
Expand Down
49 changes: 27 additions & 22 deletions lib/collection/src/shards/replica_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,8 @@ impl ShardReplicaSet {
/// Recovers shard from disk.
///
/// WARN: This method intended to be used only on the initial start of the node.
/// It does not implement any logic to recover from a failure. Will panic if there is a failure.
/// It does not implement any logic to recover from a failure.
/// Will panic or load partial state if there is a failure.
#[allow(clippy::too_many_arguments)]
pub async fn load(
shard_id: ShardId,
Expand Down Expand Up @@ -480,32 +481,36 @@ impl ShardReplicaSet {
);

let local = if replica_state.read().is_local {
let res = LocalShard::load(
shard_id,
collection_id.clone(),
shard_path,
collection_config.clone(),
shared_storage_config.clone(),
update_runtime.clone(),
)
.await;
let shard = if let Some(recovery_reason) = &shared_storage_config.recovery_mode {
Dummy(DummyShard::new(recovery_reason))
} else {
let res = LocalShard::load(
shard_id,
collection_id.clone(),
shard_path,
collection_config.clone(),
shared_storage_config.clone(),
update_runtime.clone(),
)
.await;

let shard = match res {
Ok(shard) => Local(shard),
Err(err) => {
if !shared_storage_config.handle_collection_load_errors {
panic!("Failed to load local shard {shard_path:?}: {err}")
}
match res {
Ok(shard) => Local(shard),
Err(err) => {
if !shared_storage_config.handle_collection_load_errors {
panic!("Failed to load local shard {shard_path:?}: {err}")
}

log::error!(
"Failed to load local shard {shard_path:?}, \
log::error!(
"Failed to load local shard {shard_path:?}, \
initializing \"dummy\" shard instead: \
{err}"
);
);

Dummy(DummyShard::new(format!(
"Failed to load local shard {shard_path:?}: {err}"
)))
Dummy(DummyShard::new(format!(
"Failed to load local shard {shard_path:?}: {err}"
)))
}
}
};

Expand Down
3 changes: 3 additions & 0 deletions lib/collection/src/shards/shard_holder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ impl ShardHolder {
let mut require_migration = true;
match shard_type {
ShardType::Local => {
// deprecated
let local_shard = LocalShard::load(
shard_id,
collection_id.clone(),
Expand All @@ -239,12 +240,14 @@ impl ShardHolder {
.unwrap();
}
ShardType::Remote { peer_id } => {
// deprecated
replica_set
.add_remote(peer_id, ReplicaState::Active)
.await
.unwrap();
}
ShardType::Temporary => {
// deprecated
let temp_shard = LocalShard::load(
shard_id,
collection_id.clone(),
Expand Down
6 changes: 6 additions & 0 deletions lib/segment/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,12 @@ quantization = { git = "https://github.com/qdrant/quantization.git" }
validator = { version = "0.16", features = ["derive"] }
chrono = { version = "0.4.24", features = ["serde"] }

sysinfo = "0.28"

[target.'cfg(target_os = "linux")'.dependencies]
cgroups-rs = "0.3"
procfs = { version = "0.15", default-features = false }

[[bench]]
name = "vector_search"
harness = false
Expand Down
Loading

0 comments on commit 93d784f

Please sign in to comment.