Skip to content

Commit

Permalink
Merge pull request #31 from theseus-rs/optimize-variable-sequence-mat…
Browse files Browse the repository at this point in the history
…ching

refactor: optimize variable byte sequence matching by performing BOF/EOF matches first
  • Loading branch information
brianheineman authored Jan 7, 2025
2 parents fddfe12 + 3d37f85 commit 0c11243
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 28 deletions.
24 changes: 12 additions & 12 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ quick-xml = "0.37.2"
rayon = "1.5.1"
reqwest = "0.12.12"
serde = "1.0.217"
serde_json = "1.0.134"
serde_json = "1.0.135"
serde_yaml = "0.9.34"
thiserror = "2.0.9"
tokio = { version = "1.42.0", default-features = false, features = ["macros", "rt", "sync"] }
Expand Down
13 changes: 1 addition & 12 deletions file_type/src/file_types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@ static MEDIA_TYPE_MAP: LazyLock<HashMap<&'static str, Vec<&'static FileType>>> =
static DATA_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/data/");
const EMPTY_EXTENSIONS: &Vec<&'static FileType> = &Vec::new();
const EMPTY_MEDIA_TYPES: &Vec<&'static FileType> = &Vec::new();
// The following file types are slow to process and should be skipped when determining the file type
const SLOW_FILE_TYPES: [&str; 20] = [
"fmt/63", "fmt/64", "fmt/65", "fmt/66", "fmt/67", "fmt/68", "fmt/69", "fmt/70", "fmt/71",
"fmt/72", "fmt/73", "fmt/74", "fmt/75", "fmt/76", "fmt/77", "fmt/78", "fmt/79", "fmt/433",
"fmt/435", "fmt/1389",
];

/// Deserialize the PRONOM XML file format data into a map of puid to `FileType`.
fn initialize_file_formats() -> HashMap<String, FileType> {
Expand Down Expand Up @@ -60,15 +54,10 @@ fn initialize_signature_map() -> HashMap<&'static str, &'static FileType> {
let file_types = &*FILE_TYPES;

for file_type in file_types.values() {
let id = file_type.id();
if SLOW_FILE_TYPES.contains(&id) {
continue;
}

let file_format = file_type.file_format();
let internal_signatures = file_format.internal_signatures();
if !internal_signatures.is_empty() {
signatures.insert(id, file_type);
signatures.insert(file_type.id(), file_type);
}
}

Expand Down
27 changes: 24 additions & 3 deletions file_type/src/format/internal_signature.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::format::external_signature::ExternalSignature;
use crate::format::ByteSequence;
use serde::{Deserialize, Serialize};
use crate::format::{ByteSequence, PositionType};
use serde::{Deserialize, Deserializer, Serialize};

/// An internal signature.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
Expand All @@ -12,7 +12,10 @@ pub struct InternalSignature {
name: String,
#[serde(skip_serializing_if = "String::is_empty", rename = "SignatureNote")]
note: String,
#[serde(rename = "ByteSequence")]
#[serde(
rename = "ByteSequence",
deserialize_with = "deserialize_and_sort_byte_sequences"
)]
byte_sequences: Vec<ByteSequence>,
}

Expand Down Expand Up @@ -66,6 +69,24 @@ impl InternalSignature {
}
}

fn deserialize_and_sort_byte_sequences<'de, D>(
deserializer: D,
) -> Result<Vec<ByteSequence>, D::Error>
where
D: Deserializer<'de>,
{
let mut byte_sequences: Vec<ByteSequence> = Vec::deserialize(deserializer)?;
// Sort byte sequences by position type such that Variable comes last. All current uses of
// Variable byte sequences also include a BOF/EOF sequence. This is an optimization
// to check BOF/EOF sequences first to avoid checking Variable sequences when possible.
byte_sequences.sort_by_key(|byte_sequence| match byte_sequence.position_type() {
PositionType::AbsoluteFromBOF => 0,
PositionType::AbsoluteFromEOF => 1,
PositionType::Variable => 2,
});
Ok(byte_sequences)
}

#[cfg(test)]
mod test {
use super::*;
Expand Down
28 changes: 28 additions & 0 deletions file_type/tests/file_format.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use anyhow::Result;
use file_type::format::FileFormat;
use std::path::PathBuf;
use tokio::time::Instant;

const CRATE_DIR: &str = env!("CARGO_MANIFEST_DIR");

async fn file_format(format_type: &str, file_name: &str) -> Result<FileFormat> {
let path = PathBuf::from(CRATE_DIR)
.join("data")
.join(format_type)
.join(file_name);
let xml = tokio::fs::read_to_string(path).await?;
let file_format = quick_xml::de::from_str(xml.as_str())?;
Ok(file_format)
}

#[tokio::test]
async fn test_variable_format() -> Result<()> {
let file_format = file_format("pronom", "fmt-63.xml").await?;
let length = 1 << 31;
let bytes = vec![0; length];
let start = Instant::now();
let _ = file_format.is_match(&bytes);
let elapsed = start.elapsed();
assert!(elapsed.as_millis() < 10);
Ok(())
}

0 comments on commit 0c11243

Please sign in to comment.