Skip to content

Commit

Permalink
Subsequence versions of ByteBufUtil#writeUtf8(...) methods (netty#9224)
Browse files Browse the repository at this point in the history
Motivation

It would be useful to be able to write a UTF-8 encoded subsequence of a
CharSequence to a ByteBuf without needing to create a
temporary object via CharSequence#subSequence().

Modification

Add overloads of ByteBufUtil writeUtf8, reserveAndWriteUtf8 and
utf8Bytes methods which take explicit subsequence bounds.

Result

More efficient writing of substrings to byte buffers is now possible.
  • Loading branch information
njhill authored and normanmaurer committed Jun 21, 2019
1 parent 9dd1aab commit 2af769f
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 12 deletions.
75 changes: 63 additions & 12 deletions buffer/src/main/java/io/netty/buffer/ByteBufUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import io.netty.util.Recycler;
import io.netty.util.Recycler.Handle;
import io.netty.util.concurrent.FastThreadLocal;
import io.netty.util.internal.MathUtil;
import io.netty.util.internal.PlatformDependent;
import io.netty.util.internal.StringUtil;
import io.netty.util.internal.SystemPropertyUtil;
Expand Down Expand Up @@ -472,6 +473,14 @@ private static int lastIndexOf(ByteBuf buffer, int fromIndex, int toIndex, byte
return buffer.forEachByteDesc(toIndex, fromIndex - toIndex, new ByteProcessor.IndexOfProcessor(value));
}

/**
 * Verifies that {@code start} and {@code end} describe a range that {@link MathUtil#isOutOfBounds}
 * accepts for a sequence of length {@code seq.length()}.
 *
 * @return {@code seq} itself, so the check can be used inline as a call argument
 * @throws IndexOutOfBoundsException if the range is rejected
 */
private static CharSequence checkCharSequenceBounds(CharSequence seq, int start, int end) {
    final int rangeLength = end - start;
    if (!MathUtil.isOutOfBounds(start, rangeLength, seq.length())) {
        return seq;
    }
    throw new IndexOutOfBoundsException("expected: 0 <= start(" + start + ") <= end (" + end
            + ") <= seq.length(" + seq.length() + ')');
}

/**
* Encode a {@link CharSequence} in <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a> and write
* it to a {@link ByteBuf} allocated with {@code alloc}.
Expand All @@ -496,7 +505,17 @@ public static ByteBuf writeUtf8(ByteBufAllocator alloc, CharSequence seq) {
* This method returns the actual number of bytes written.
*/
public static int writeUtf8(ByteBuf buf, CharSequence seq) {
// NOTE(review): this view appears to show the pre-change statement directly followed by its replacement;
// only the last two statements use the ranged helper -- confirm against the actual file.
return reserveAndWriteUtf8(buf, seq, utf8MaxBytes(seq));
// Whole-sequence case: the length is read once and used both as the end of the range [0, seqLength)
// and as the input to the max-bytes estimate passed as the reservation size.
int seqLength = seq.length();
return reserveAndWriteUtf8Seq(buf, seq, 0, seqLength, utf8MaxBytes(seqLength));
}

/**
 * Writes the UTF-8 encoding of the characters of {@code seq} in the range {@code [start, end)} to {@code buf}.
 * <p>
 * Equivalent to <code>{@link #writeUtf8(ByteBuf, CharSequence) writeUtf8(buf, seq.subSequence(start, end))}</code>
 * but the caller does not have to create the subsequence object.
 *
 * @return the number of bytes written, as reported by the internal write routine
 * @throws IndexOutOfBoundsException if {@code start} and {@code end} fail the bounds check against {@code seq}
 */
public static int writeUtf8(ByteBuf buf, CharSequence seq, int start, int end) {
    checkCharSequenceBounds(seq, start, end);
    // The reservation is sized from the number of chars in the range, not from the whole sequence.
    final int charCount = end - start;
    final int maxBytes = utf8MaxBytes(charCount);
    return reserveAndWriteUtf8Seq(buf, seq, start, end, maxBytes);
}

/**
Expand All @@ -509,34 +528,53 @@ public static int writeUtf8(ByteBuf buf, CharSequence seq) {
* This method returns the actual number of bytes written.
*/
public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int reserveBytes) {
    // The whole sequence is simply the range [0, seq.length()).
    final int seqLength = seq.length();
    return reserveAndWriteUtf8Seq(buf, seq, 0, seqLength, reserveBytes);
}

/**
 * Range variant of {@link #reserveAndWriteUtf8(ByteBuf, CharSequence, int)}: behaves like
 * <code>reserveAndWriteUtf8(buf, seq.subSequence(start, end), reserveBytes)</code> but avoids
 * subsequence object allocation if possible.
 *
 * @return actual number of bytes written
 * @throws IndexOutOfBoundsException if {@code start} and {@code end} fail the bounds check against {@code seq}
 */
public static int reserveAndWriteUtf8(ByteBuf buf, CharSequence seq, int start, int end, int reserveBytes) {
    // Validate before delegating; the shared helper does no range checking of its own.
    checkCharSequenceBounds(seq, start, end);
    return reserveAndWriteUtf8Seq(buf, seq, start, end, reserveBytes);
}

// Shared implementation behind the public writeUtf8/reserveAndWriteUtf8 overloads: encodes the characters of
// seq in [start, end) as UTF-8 into buf and returns the number of bytes written. This method does not validate
// start/end itself.
// NOTE(review): this view appears to interleave pre-change statements with their replacements (the two
// "int written" and the two "byte[] bytes" declarations) -- confirm against the actual file.
private static int reserveAndWriteUtf8Seq(ByteBuf buf, CharSequence seq, int start, int end, int reserveBytes) {
// Loop so wrapper buffers can be unwrapped one layer per iteration until a branch below returns.
for (;;) {
if (buf instanceof WrappedCompositeByteBuf) {
// WrappedCompositeByteBuf is a sub-class of AbstractByteBuf so it needs special handling.
buf = buf.unwrap();
} else if (buf instanceof AbstractByteBuf) {
// Fast path: write directly at the current writerIndex and advance it by the bytes produced.
AbstractByteBuf byteBuf = (AbstractByteBuf) buf;
// Presumably makes room for reserveBytes up front so the fast-path write can skip per-byte checks -- confirm.
byteBuf.ensureWritable0(reserveBytes);
int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, seq.length());
int written = writeUtf8(byteBuf, byteBuf.writerIndex, seq, start, end);
byteBuf.writerIndex += written;
return written;
} else if (buf instanceof WrappedByteBuf) {
// Unwrap as the wrapped buffer may be an AbstractByteBuf and so we can use fast-path.
buf = buf.unwrap();
} else {
// Fallback for any other ByteBuf: encode into a temporary byte[] and write it through the public API.
// The ranged form does create a subsequence and a String here.
byte[] bytes = seq.toString().getBytes(CharsetUtil.UTF_8);
byte[] bytes = seq.subSequence(start, end).toString().getBytes(CharsetUtil.UTF_8);
buf.writeBytes(bytes);
return bytes.length;
}
}
}

// Fast-Path implementation for the leading len characters of seq; forwards to the ranged variant.
static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq, int len) {
    final int start = 0;
    return writeUtf8(buffer, writerIndex, seq, start, len);
}

// Fast-Path implementation
static int writeUtf8(AbstractByteBuf buffer, int writerIndex, CharSequence seq, int start, int end) {
int oldWriterIndex = writerIndex;

// We can use the _set methods as these not need to do any index checks and reference checks.
// This is possible as we called ensureWritable(...) before.
for (int i = 0; i < len; i++) {
for (int i = start; i < end; i++) {
char c = seq.charAt(i);
if (c < 0x80) {
buffer._setByte(writerIndex++, (byte) c);
Expand Down Expand Up @@ -606,22 +644,35 @@ public static int utf8MaxBytes(CharSequence seq) {
* This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence)}.
*/
public static int utf8Bytes(final CharSequence seq) {
    // Count over the full range [0, seq.length()).
    final int seqLength = seq.length();
    return utf8ByteCount(seq, 0, seqLength);
}

/**
 * Returns the UTF-8 encoded length of the characters of {@code seq} in the range {@code [start, end)}.
 * <p>
 * Equivalent to <code>{@link #utf8Bytes(CharSequence) utf8Bytes(seq.subSequence(start, end))}</code>
 * but avoids subsequence object allocation.
 * <p>
 * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, CharSequence, int, int)}.
 *
 * @throws IndexOutOfBoundsException if {@code start} and {@code end} fail the bounds check against {@code seq}
 */
public static int utf8Bytes(final CharSequence seq, int start, int end) {
    checkCharSequenceBounds(seq, start, end);
    return utf8ByteCount(seq, start, end);
}

// Counts the bytes needed to UTF-8 encode the characters of seq in [start, end). No bounds validation is done here.
// NOTE(review): this view appears to interleave pre-change statements with their replacements (paired returns,
// paired "int i" declarations, paired while headers) -- confirm against the actual file.
private static int utf8ByteCount(final CharSequence seq, int start, int end) {
if (seq instanceof AsciiString) {
// Presumably every AsciiString char encodes to a single byte, so the count equals the number of chars -- confirm.
return seq.length();
return end - start;
}
int seqLength = seq.length();
int i = 0;
int i = start;
// ASCII fast path
while (i < seqLength && seq.charAt(i) < 0x80) {
while (i < end && seq.charAt(i) < 0x80) {
++i;
}
// !ASCII is packed in a separate method to let the ASCII case be smaller
// In the ranged form, (i - start) is the one-byte-per-char ASCII prefix; the helper counts the rest from i.
return i < seqLength ? i + utf8Bytes(seq, i, seqLength) : i;
return i < end ? (i - start) + utf8BytesNonAscii(seq, i, end) : i - start;
}

private static int utf8Bytes(final CharSequence seq, final int start, final int length) {
private static int utf8BytesNonAscii(final CharSequence seq, final int start, final int end) {
int encodedLength = 0;
for (int i = start; i < length; i++) {
for (int i = start; i < end; i++) {
final char c = seq.charAt(i);
// making it 100% branchless isn't rewarding due to the many bit operations necessary!
if (c < 0x800) {
Expand Down
91 changes: 91 additions & 0 deletions buffer/src/test/java/io/netty/buffer/ByteBufUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,97 @@ private static void assertWrapped(ByteBuf buf) {
assertTrue(buf instanceof WrappedByteBuf);
}

/**
 * Writing the range [5, 18) through {@code ByteBufUtil.writeUtf8} must produce the same bytes as encoding
 * {@code substring(5, 18)} with the JDK, and the returned count must match the bytes written.
 */
@Test
public void testWriteUtf8Subsequence() {
    // The range [5, 18) of this text covers ASCII characters followed by two non-ASCII ones.
    String text = "Some UTF-8 like äÄ∏ŒŒ";
    ByteBuf expected = Unpooled.buffer(16);
    ByteBuf actual = Unpooled.buffer(16);
    try {
        expected.writeBytes(text.substring(5, 18).getBytes(CharsetUtil.UTF_8));
        int written = ByteBufUtil.writeUtf8(actual, text, 5, 18);

        assertEquals(expected, actual);
        assertEquals(expected.readableBytes(), written);
    } finally {
        // Release in finally so a failed assertion does not leave the buffers unreleased.
        expected.release();
        actual.release();
    }
}

/**
 * Writing the range [5, 18) through {@code ByteBufUtil.reserveAndWriteUtf8} must produce the same bytes as
 * encoding {@code substring(5, 18)} with the JDK, and the returned count must match the bytes written.
 */
@Test
public void testReserveAndWriteUtf8Subsequence() {
    // The range [5, 18) of this text covers ASCII characters followed by two non-ASCII ones.
    String text = "Some UTF-8 like äÄ∏ŒŒ";
    ByteBuf expected = Unpooled.buffer(16);
    ByteBuf actual = Unpooled.buffer(16);
    try {
        expected.writeBytes(text.substring(5, 18).getBytes(CharsetUtil.UTF_8));
        int count = ByteBufUtil.reserveAndWriteUtf8(actual, text, 5, 18, 16);

        assertEquals(expected, actual);
        assertEquals(expected.readableBytes(), count);
    } finally {
        // Release in finally so a failed assertion does not leave the buffers unreleased.
        expected.release();
        actual.release();
    }
}

/**
 * The ranged {@code utf8Bytes} must report the same length as encoding {@code substring(5, 18)} with the JDK.
 */
@Test
public void testUtf8BytesSubsequence() {
    final String text = "Some UTF-8 like äÄ∏ŒŒ";
    final int expectedLength = text.substring(5, 18).getBytes(CharsetUtil.UTF_8).length;
    final int actualLength = ByteBufUtil.utf8Bytes(text, 5, 18);
    assertEquals(expectedLength, actualLength);
}

// {start, end} pairs that must be rejected for the 21-character sample string used by the invalid-range tests:
// a negative start, an end past the sequence length, and a start greater than the end.
private static final int[][] INVALID_RANGES = {
    { -1, 5 }, { 5, 30 }, { 10, 5 }
};

/**
 * Adapter that lets {@code testInvalidSubsequences} drive different {@code ByteBufUtil} methods through
 * one call shape. The helper passes the arguments as: target buffer, character sequence, start, end.
 */
interface TestMethod {
// Implementations cast the positional arguments themselves and return the int result of the method under test.
int invoke(Object... args);
}

/**
 * Invokes {@code method} once per entry of {@code INVALID_RANGES} and expects each call to throw
 * {@link IndexOutOfBoundsException} without leaving any readable bytes in the target buffer.
 *
 * @param method adapter around the method under test; receives (buffer, sequence, start, end)
 */
private void testInvalidSubsequences(TestMethod method) {
    for (int[] range : INVALID_RANGES) {
        ByteBuf buf = Unpooled.buffer(16);
        try {
            try {
                method.invoke(buf, "Some UTF-8 like äÄ∏ŒŒ", range[0], range[1]);
                fail("Did not throw IndexOutOfBoundsException for range (" + range[0] + ", " + range[1] + ")");
            } catch (IndexOutOfBoundsException expected) {
                // expected
            }
            // Checked outside of finally so it cannot mask an earlier failure or skip the release below.
            assertFalse(buf.isReadable());
        } finally {
            buf.release();
        }
    }
}

/**
 * Runs the invalid-range checks against {@code ByteBufUtil.writeUtf8(ByteBuf, CharSequence, int, int)}.
 */
@Test
public void testWriteUtf8InvalidSubsequences() {
    TestMethod writeUtf8 = new TestMethod() {
        @Override
        public int invoke(Object... args) {
            ByteBuf target = (ByteBuf) args[0];
            String text = (String) args[1];
            Integer start = (Integer) args[2];
            Integer end = (Integer) args[3];
            return ByteBufUtil.writeUtf8(target, text, start, end);
        }
    };
    testInvalidSubsequences(writeUtf8);
}

/**
 * Runs the invalid-range checks against the ranged {@code ByteBufUtil.reserveAndWriteUtf8} with a
 * reservation of 32 bytes.
 */
@Test
public void testReserveAndWriteUtf8InvalidSubsequences() {
    TestMethod reserveAndWrite = new TestMethod() {
        @Override
        public int invoke(Object... args) {
            ByteBuf target = (ByteBuf) args[0];
            String text = (String) args[1];
            Integer start = (Integer) args[2];
            Integer end = (Integer) args[3];
            return ByteBufUtil.reserveAndWriteUtf8(target, text, start, end, 32);
        }
    };
    testInvalidSubsequences(reserveAndWrite);
}

/**
 * Runs the invalid-range checks against {@code ByteBufUtil.utf8Bytes(CharSequence, int, int)}.
 * The buffer argument supplied by the helper is not used by this method.
 */
@Test
public void testUtf8BytesInvalidSubsequences() {
    TestMethod utf8Bytes = new TestMethod() {
        @Override
        public int invoke(Object... args) {
            String text = (String) args[1];
            Integer start = (Integer) args[2];
            Integer end = (Integer) args[3];
            return ByteBufUtil.utf8Bytes(text, start, end);
        }
    };
    testInvalidSubsequences(utf8Bytes);
}

@Test
public void testDecodeUsAscii() {
testDecodeString("This is a test", CharsetUtil.US_ASCII);
Expand Down

0 comments on commit 2af769f

Please sign in to comment.