Skip to content

Commit

Permalink
ref(sourcebundle): Check UTF-8 validity memory efficiently (#890)
Browse files Browse the repository at this point in the history
The current check to ensure a sourcebundle is valid UTF-8 reads the entire sourcebundle file into memory. This is inefficient for large files.

This PR introduces a UTF8Reader which wraps any reader. The UTF8Reader ensures that the stream is valid UTF8 as it is being read, while only requiring a small amount of memory (currently 8 KiB) to be allocated as a buffer.

Co-authored-by: Sebastian Zivota <loewenheim@users.noreply.github.com>
  • Loading branch information
szokeasaurusrex and loewenheim authored Jan 20, 2025
1 parent 2865146 commit 38c5a16
Show file tree
Hide file tree
Showing 6 changed files with 455 additions and 13 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Unreleased

**Improvements**

- Check UTF-8 validity memory efficiently ([#890](https://github.com/getsentry/symbolic/pull/890))

## 12.13.2

**Fixes**
Expand Down
115 changes: 115 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ once_cell = "1.17.1"
parking_lot = "0.12.1"
pdb-addr2line = "0.10.4"
proguard = { version = "5.4.0", features = ["uuid"] }
proptest = "1.6.0"
regex = "1.7.1"
rustc-demangle = "0.1.21"
# keep this in sync with whatever version `goblin` uses
Expand Down
1 change: 1 addition & 0 deletions symbolic-debuginfo/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ zstd = { workspace = true, optional = true }
[dev-dependencies]
criterion = { workspace = true }
insta = { workspace = true }
proptest = {workspace = true }
tempfile = { workspace = true }
similar-asserts = { workspace = true }
symbolic-testutils = { path = "../symbolic-testutils" }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@
//! bundle a file entry has a `url` and might carry `headers` or individual debug IDs
//! per source file.
mod utf8_reader;

use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet, HashMap};
use std::error::Error;
use std::fmt;
use std::fmt::{Display, Formatter};
use std::fs::{File, OpenOptions};
use std::io::{BufReader, BufWriter, Read, Seek, Write};
use std::io::{BufReader, BufWriter, ErrorKind, Read, Seek, Write};
use std::path::Path;
use std::sync::Arc;
use std::{fmt, io};

use parking_lot::Mutex;
use regex::Regex;
Expand All @@ -60,6 +62,7 @@ use zip::{write::SimpleFileOptions, ZipWriter};

use symbolic_common::{Arch, AsSelf, CodeId, DebugId, SourceLinkMappings};

use self::utf8_reader::Utf8Reader;
use crate::base::*;
use crate::js::{
discover_debug_id, discover_sourcemap_embedded_debug_id, discover_sourcemaps_location,
Expand Down Expand Up @@ -1217,31 +1220,41 @@ where
pub fn add_file<S, R>(
&mut self,
path: S,
mut file: R,
file: R,
info: SourceFileInfo,
) -> Result<(), SourceBundleError>
where
S: AsRef<str>,
R: Read,
{
let mut buf = String::new();

if let Err(e) = file.read_to_string(&mut buf) {
return Err(SourceBundleError::new(SourceBundleErrorKind::ReadFailed, e));
}
let mut file_reader = Utf8Reader::new(file);

let full_path = self.file_path(path.as_ref());
let unique_path = self.unique_path(full_path);

self.writer
.start_file(unique_path.clone(), default_file_options())
.map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;
self.writer
.write_all(buf.as_bytes())
.map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;

self.manifest.files.insert(unique_path, info);
Ok(())
match io::copy(&mut file_reader, &mut self.writer) {
Err(e) => {
self.writer
.abort_file()
.map_err(|e| SourceBundleError::new(SourceBundleErrorKind::WriteFailed, e))?;

// ErrorKind::InvalidData is returned by Utf8Reader when the file is not valid UTF-8.
let error_kind = match e.kind() {
ErrorKind::InvalidData => SourceBundleErrorKind::ReadFailed,
_ => SourceBundleErrorKind::WriteFailed,
};

Err(SourceBundleError::new(error_kind, e))
}
Ok(_) => {
self.manifest.files.insert(unique_path, info);
Ok(())
}
}
}

/// Calls add_file, and handles any ReadFailed errors by calling the skipped_file_callback.
Expand Down
Loading

0 comments on commit 38c5a16

Please sign in to comment.