Skip to content

Commit

Permalink
Optimize performance of character_length function (apache#13696)
Browse files Browse the repository at this point in the history
* Optimize performance of character_length function

Signed-off-by: Tai Le Manh <manhtai.lmt@gmail.com>

* Add a pre-check for whether the array contains nulls

* Fix clippy warnings

---------

Signed-off-by: Tai Le Manh <manhtai.lmt@gmail.com>
  • Loading branch information
tlm365 authored Dec 10, 2024
1 parent 6fae5a0 commit 437cbf8
Showing 1 changed file with 39 additions and 18 deletions.
57 changes: 39 additions & 18 deletions datafusion/functions/src/unicode/character_length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
use crate::strings::StringArrayType;
use crate::utils::{make_scalar_function, utf8_to_int_type};
use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray,
Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveBuilder,
};
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
use datafusion_common::Result;
Expand Down Expand Up @@ -136,31 +136,52 @@ fn character_length(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}

// NOTE(review): this span appears to be a diff rendering with the +/- markers
// stripped, so the removed (iterator + `collect`) and added (`PrimitiveBuilder`)
// versions of the function are interleaved — confirm against the actual file.
/// Computes the character length of every element of a string array.
///
/// For each non-null string the result holds its number of `char`s; null
/// inputs produce null outputs. The values are converted to `T::Native` via
/// `usize_as`, and the finished array is returned as an `ArrayRef`.
///
/// When the whole array is ASCII-only, the byte length (`str::len`) is used
/// instead of `chars().count()`; the two are equal for ASCII text, and the
/// byte length needs no UTF-8 decoding.
fn character_length_general<'a, T: ArrowPrimitiveType, V: StringArrayType<'a>>(
array: V,
) -> Result<ArrayRef>
fn character_length_general<'a, T, V>(array: V) -> Result<ArrayRef>
where
T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
V: StringArrayType<'a>,
{
// Reserve one output slot per input element up front.
let mut builder = PrimitiveBuilder::<T>::with_capacity(array.len());

// String characters are variable length encoded in UTF-8, counting the
// number of chars requires expensive decoding, however checking if the
// string is ASCII only is relatively cheap.
// If strings are ASCII only, count bytes instead.
let is_array_ascii_only = array.is_ascii();
let iter = array.iter();
let result = iter
.map(|string| {
string.map(|string: &str| {
if is_array_ascii_only {
T::Native::usize_as(string.len())
} else {
T::Native::usize_as(string.chars().count())
}
})
})
.collect::<PrimitiveArray<T>>();

Ok(Arc::new(result) as ArrayRef)
// No nulls in the array: every slot is read directly, with no per-element
// `is_null` check. The ASCII test is decided once, outside the loops.
if array.null_count() == 0 {
if is_array_ascii_only {
for i in 0..array.len() {
let value = array.value(i);
builder.append_value(T::Native::usize_as(value.len()));
}
} else {
for i in 0..array.len() {
let value = array.value(i);
builder.append_value(T::Native::usize_as(value.chars().count()));
}
}
// Nulls present, ASCII-only: propagate nulls, otherwise use the byte length.
} else if is_array_ascii_only {
for i in 0..array.len() {
if array.is_null(i) {
builder.append_null();
} else {
let value = array.value(i);
builder.append_value(T::Native::usize_as(value.len()));
}
}
// Nulls present, non-ASCII data: propagate nulls, otherwise count `char`s.
} else {
for i in 0..array.len() {
if array.is_null(i) {
builder.append_null();
} else {
let value = array.value(i);
builder.append_value(T::Native::usize_as(value.chars().count()));
}
}
}

Ok(Arc::new(builder.finish()) as ArrayRef)
}

#[cfg(test)]
Expand Down

0 comments on commit 437cbf8

Please sign in to comment.