From a3f01c19fda400196897e07a1b7f6747e17562c7 Mon Sep 17 00:00:00 2001 From: Xiaoya Wei Date: Thu, 28 Sep 2023 22:45:42 +0800 Subject: [PATCH] perf: Improve throughput when vectored IO is not enabled (#712) As discussed in https://github.com/hyperium/h2/issues/711, the current implementation of sending data is suboptimal when vectored I/O is not enabled: data frame's head is likely to be sent in a separate TCP segment, whose payload is of only 9 bytes. This PR adds some specialized implementaton for non-vectored I/O case. In short, it sets a larget chain threhold, and also makes sure a data frame's head is sent along with the beginning part of the real data payload. All existing unit tests passed. Also I take a look at the e2e https://github.com/hyperium/hyper/blob/0.14.x/benches/end_to_end.rs but realize that all the benchmarks there are for the case of vectored I/O if the OS supports vectored I/O. There isn't a specific case for non-vectored I/O so I am not sure how to proceed with benchmark for performance evaluations. --- Cargo.toml | 2 +- src/codec/framed_write.rs | 84 +++++++++++++++++---------------------- 2 files changed, 37 insertions(+), 49 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index dd351bfa..b11981d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ members = [ futures-core = { version = "0.3", default-features = false } futures-sink = { version = "0.3", default-features = false } futures-util = { version = "0.3", default-features = false } -tokio-util = { version = "0.7.1", features = ["codec"] } +tokio-util = { version = "0.7.1", features = ["codec", "io"] } tokio = { version = "1", features = ["io-util"] } bytes = "1" http = "0.2" diff --git a/src/codec/framed_write.rs b/src/codec/framed_write.rs index 4b1b4acc..c88af02d 100644 --- a/src/codec/framed_write.rs +++ b/src/codec/framed_write.rs @@ -7,8 +7,9 @@ use bytes::{Buf, BufMut, BytesMut}; use std::pin::Pin; use std::task::{Context, Poll}; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tokio_util::io::poll_write_buf; -use std::io::{self, Cursor, IoSlice}; +use std::io::{self, Cursor}; // A macro to get around a method needing to borrow &mut self macro_rules! limited_write_buf { @@ -45,8 +46,11 @@ struct Encoder { /// Max frame size, this is specified by the peer max_frame_size: FrameSize, - /// Whether or not the wrapped `AsyncWrite` supports vectored IO. - is_write_vectored: bool, + /// Chain payloads bigger than this. + chain_threshold: usize, + + /// Min buffer required to attempt to write a frame + min_buffer_capacity: usize, } #[derive(Debug)] @@ -61,14 +65,16 @@ enum Next { /// frame that big. const DEFAULT_BUFFER_CAPACITY: usize = 16 * 1_024; -/// Min buffer required to attempt to write a frame -const MIN_BUFFER_CAPACITY: usize = frame::HEADER_LEN + CHAIN_THRESHOLD; - -/// Chain payloads bigger than this. The remote will never advertise a max frame -/// size less than this (well, the spec says the max frame size can't be less -/// than 16kb, so not even close). +/// Chain payloads bigger than this when vectored I/O is enabled. The remote +/// will never advertise a max frame size less than this (well, the spec says +/// the max frame size can't be less than 16kb, so not even close). const CHAIN_THRESHOLD: usize = 256; +/// Chain payloads bigger than this when vectored I/O is **not** enabled. +/// A larger value in this scenario will reduce the number of small and +/// fragmented data being sent, and hereby improve the throughput. +const CHAIN_THRESHOLD_WITHOUT_VECTORED_IO: usize = 1024; + // TODO: Make generic impl FramedWrite where @@ -76,7 +82,11 @@ where B: Buf, { pub fn new(inner: T) -> FramedWrite { - let is_write_vectored = inner.is_write_vectored(); + let chain_threshold = if inner.is_write_vectored() { + CHAIN_THRESHOLD + } else { + CHAIN_THRESHOLD_WITHOUT_VECTORED_IO + }; FramedWrite { inner, encoder: Encoder { @@ -85,7 +95,8 @@ where next: None, last_data_frame: None, max_frame_size: frame::DEFAULT_MAX_FRAME_SIZE, - is_write_vectored, + chain_threshold, + min_buffer_capacity: chain_threshold + frame::HEADER_LEN, }, } } @@ -126,23 +137,17 @@ where Some(Next::Data(ref mut frame)) => { tracing::trace!(queued_data_frame = true); let mut buf = (&mut self.encoder.buf).chain(frame.payload_mut()); - ready!(write( - &mut self.inner, - self.encoder.is_write_vectored, - &mut buf, - cx, - ))? + ready!(poll_write_buf(Pin::new(&mut self.inner), cx, &mut buf))? } _ => { tracing::trace!(queued_data_frame = false); - ready!(write( - &mut self.inner, - self.encoder.is_write_vectored, - &mut self.encoder.buf, + ready!(poll_write_buf( + Pin::new(&mut self.inner), cx, + &mut self.encoder.buf ))? } - } + }; } match self.encoder.unset_frame() { @@ -165,30 +170,6 @@ where } } -fn write( - writer: &mut T, - is_write_vectored: bool, - buf: &mut B, - cx: &mut Context<'_>, -) -> Poll> -where - T: AsyncWrite + Unpin, - B: Buf, -{ - // TODO(eliza): when tokio-util 0.5.1 is released, this - // could just use `poll_write_buf`... - const MAX_IOVS: usize = 64; - let n = if is_write_vectored { - let mut bufs = [IoSlice::new(&[]); MAX_IOVS]; - let cnt = buf.chunks_vectored(&mut bufs); - ready!(Pin::new(writer).poll_write_vectored(cx, &bufs[..cnt]))? - } else { - ready!(Pin::new(writer).poll_write(cx, buf.chunk()))? - }; - buf.advance(n); - Ok(()).into() -} - #[must_use] enum ControlFlow { Continue, @@ -240,12 +221,17 @@ where return Err(PayloadTooBig); } - if len >= CHAIN_THRESHOLD { + if len >= self.chain_threshold { let head = v.head(); // Encode the frame head to the buffer head.encode(len, self.buf.get_mut()); + if self.buf.get_ref().remaining() < self.chain_threshold { + let extra_bytes = self.chain_threshold - self.buf.remaining(); + self.buf.get_mut().put(v.payload_mut().take(extra_bytes)); + } + // Save the data frame self.next = Some(Next::Data(v)); } else { @@ -305,7 +291,9 @@ where } fn has_capacity(&self) -> bool { - self.next.is_none() && self.buf.get_ref().remaining_mut() >= MIN_BUFFER_CAPACITY + self.next.is_none() + && (self.buf.get_ref().capacity() - self.buf.get_ref().len() + >= self.min_buffer_capacity) } fn is_empty(&self) -> bool {