diff --git a/README.md b/README.md index 67d0ce0..05586c6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Synopsis -The `LineReader` struct is a byte-delimiter-focused buffered reader meant as a +`LineReader` is a byte-delimiter-focused buffered reader for Rust, meant as a faster, less error-prone alternative to `BufRead::read_until`. It provides two main functions: @@ -13,7 +13,8 @@ It provides two main functions: Returns `Option<io::Result<&[u8]>>` - `None` on end-of-file, an IO error from the wrapped reader, or an immutable byte slice ending on and including any delimiter. -Line length is limited to the size of the internal buffer. +Line length is limited to the size of the internal buffer - longer lines will be +spread across multiple reads. In contrast with `read_until`, detecting end-of-file is more natural with the use of `Option`; line length is naturally limited to some sensible value without @@ -23,75 +24,42 @@ slices; you'll never forget to call `buf.clear()`. ### `next_batch()` -Behaves identically to `next_line()`, except it returns a slice of *all* the complete -lines in the buffer. +Behaves identically to `next_line()`, except it returns a slice of *all* the +complete lines in the buffer. ## Example - extern crate linereader; - use linereader::LineReader; +```rust +extern crate linereader; +use linereader::LineReader; - let mut file = File::open(myfile).expect("open"); +let mut file = File::open(myfile).expect("open"); - // Defaults to a 1 MiB buffer and b'\n' delimiter; change with one of: - // * LineReader::with_capacity(usize); - // * LineReader::with_delimiter(u8); - // * LineReader::with_delimiter_and_capacity(u8, usize) - let mut reader = LineReader::new(file); - - while let Some(line) = reader.next_line() { - let line = line.expect("read error"); - // line is a &[u8] owned by reader. - } - -Lines can also be read in batches for group processing - e.g. 
in threads: - - while let Some(lines) = reader.next_batch() { - send(&chan, lines.expect("read error").to_vec()); - } - -This should be more efficient than finding each intermediate delimiter in the main -thread, and allocating and sending each individual line. Any line fragments at -the end of the internal buffer will be copied to the start in the next call. +// Defaults to a 64 KiB buffer and b'\n' delimiter; change with one of: +// * LineReader::with_capacity(usize); +// * LineReader::with_delimiter(u8); +// * LineReader::with_delimiter_and_capacity(u8, usize) +let mut reader = LineReader::new(file); +while let Some(line) = reader.next_line() { + let line = line.expect("read error"); + // line is a &[u8] owned by reader. +} +``` ## Performance Tests performed using ['Dickens_Charles_Pickwick_Papers.xml'](http://hur.st/Dickens_Charles_Pickwick_Papers.xml.xz), concatinated to itself 480 times. The resulting file is 976 MB and 10.3 million lines long. -Buffers in each test are set to 1 MiB. - ### Westmere Xeon 2.1GHz, FreeBSD/ZFS. | Method | Time | Lines/sec | Bandwidth | |------------------|--------:|------------:|--------------:| -| read() | 1.82s | 5,674,452/s | 535.21 MB/s | -| LR::next_batch() | 1.83s | 5,650,387/s | 532.94 MB/s | -| LR::next_line() | 3.10s | 3,341,796/s | 315.20 MB/s | -| read_until() | 3.62s | 2,861,864/s | 269.93 MB/s | -| read_line() | 4.25s | 2,432,505/s | 229.43 MB/s | -| lines() | 4.88s | 2,119,837/s | 199.94 MB/s | - -### Haswell Xeon 3.4GHz, Windows 10 Subystem for Linux. 
- -| Method | Time | Lines/sec | Bandwidth | -|------------------|--------:|------------:|--------------:| -| read() | 0.26s | 39,253,494/s | 3702.36 MB/s | -| LR::next_batch() | 0.26s | 39,477,365/s | 3723.47 MB/s | -| LR::next_line() | 0.50s | 20,672,784/s | 1949.84 MB/s | -| read_until() | 0.60s | 17,303,147/s | 1632.02 MB/s | -| read_line() | 0.84s | 12,293,247/s | 1159.49 MB/s | -| lines() | 1.53s | 6,783,849/s | 639.85 MB/s | - -It's also surprisingly fast on debug builds (or stdlib is surprisingly slow): - -| Method | Time | Lines/sec | Bandwidth | -|------------------|--------:|------------:|--------------:| -| read() | 0.27s | 38,258,105/s | 3608.47 MB/s | -| LR::next_batch() | 0.28s | 36,896,353/s | 3480.04 MB/s | -| LR::next_line() | 2.99s | 3,463,911/s | 326.71 MB/s | -| read_until() | 57.01s | 181,505/s | 17.12 MB/s | -| read_line() | 58.36s | 177,322/s | 16.72 MB/s | -| lines() | 21.06s | 491,320/s | 46.34 MB/s | +| read() | 0.25s | 41429738/s | 3907.62 MB/s | +| LR::next_batch() | 0.27s | 38258946/s | 3608.55 MB/s | +| LR::next_line() | 1.51s | 6874006/s | 648.35 MB/s | +| read_until() | 1.94s | 5327387/s | 502.47 MB/s | +| read_line() | 2.54s | 4081562/s | 384.97 MB/s | +| lines() | 3.23s | 3199491/s | 301.77 MB/s | diff --git a/src/lib.rs b/src/lib.rs index fe92bd0..de25c9c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,7 +23,7 @@ extern crate memchr; use memchr::{memchr, memrchr}; const NEWLINE: u8 = b'\n'; -const DEFAULT_CAPACITY: usize = 1024 * 1024; +const DEFAULT_CAPACITY: usize = 1024 * 64; /// The `LineReader` struct adds buffered, byte-delimited (default: `\n`) /// reading to any io::Reader. @@ -50,7 +50,7 @@ impl fmt::Debug for LineReader { impl LineReader { /// Create a new `LineReader` around the reader with a default capacity of - /// 1 MiB and delimiter of `\n`. + /// 64 KiB and delimiter of `\n`. 
/// /// ```no_run /// # use linereader::LineReader; @@ -73,7 +73,7 @@ impl LineReader { /// # use std::fs::File; /// # use std::io; /// # fn x() -> io::Result<()> { - /// let mut reader = LineReader::with_capacity(1024*64, File::open("myfile.txt")?); + /// let mut reader = LineReader::with_capacity(1024*512, File::open("myfile.txt")?); /// # Ok(()) /// # } /// ``` @@ -105,7 +105,7 @@ impl LineReader { /// # use std::fs::File; /// # use std::io; /// # fn x() -> io::Result<()> { - /// let mut reader = LineReader::with_delimiter_and_capacity(b'\t', 1024*64, File::open("myfile.txt")?); + /// let mut reader = LineReader::with_delimiter_and_capacity(b'\t', 1024*512, File::open("myfile.txt")?); /// # Ok(()) /// # } /// ``` diff --git a/src/main.rs b/src/main.rs index e0e8624..98b64fd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,7 +10,7 @@ extern crate memchr; use memchr::Memchr; -const BUFFER_SIZE: usize = 1024 * 1024; +const BUFFER_SIZE: usize = 1024 * 64; struct Report { lines: u64, @@ -81,7 +81,7 @@ fn try_baseline(report: &Report, filename: &str) { bytes += r as u64; } - report.report("read() 1 MiB", Some(bytes), None, start.elapsed()); + report.report("read()", Some(bytes), None, start.elapsed()); } fn try_linereader_batch(report: &Report, filename: &str) {