Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better Timestamp Parsing #2713

Merged
merged 2 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 197 additions & 14 deletions cpp/perspective/src/cpp/arrow_csv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// ┃ of the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). ┃
// ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛

#include <chrono>
#include <perspective/base.h>
#include <perspective/arrow_csv.h>
#include <arrow/util/value_parsing.h>
Expand Down Expand Up @@ -223,24 +224,67 @@ ParseSSS(const char* s, std::chrono::milliseconds* out) {
}

static inline bool
ParseTZ(const char* s, std::chrono::hours* out) {
uint8_t hours = 0;
ParseSSSSSS(const char* s, std::chrono::microseconds* out) {
uint32_t nanos = 0;
if (ARROW_PREDICT_FALSE(s[0] != '.')) {
return false;
}
if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 1, 6, &nanos)
)) {
return false;
}

if (ARROW_PREDICT_FALSE(s[0] != '+') && ARROW_PREDICT_FALSE(s[0] != '-')) {
if (ARROW_PREDICT_FALSE(nanos >= 999999)) {
return false;
}
*out = std::chrono::microseconds(nanos);
return true;
}

// Parses a fractional-seconds suffix of the form ".sssssssss": a literal '.'
// followed by nine characters handed to arrow::internal::ParseUnsigned.
//
// `s`   points at the '.'. No length check is performed here: s[0]..s[9] are
//       read unconditionally, so the caller must guarantee they are readable.
// `out` receives the parsed value as nanoseconds; it is only written on
//       success.
//
// Returns false when the leading '.' is missing or the digits fail to parse.
static inline bool
ParseSSSSSSSSS(const char* s, std::chrono::nanoseconds* out) {
    uint32_t nanos = 0;
    if (ARROW_PREDICT_FALSE(s[0] != '.')) {
        return false;
    }
    if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 1, 9, &nanos)
        )) {
        return false;
    }

    // 999999999 is itself a valid fraction (".999999999"), so only values
    // strictly above it are rejected. The previous `>= 999999999` bound
    // refused it.
    if (ARROW_PREDICT_FALSE(nanos > 999999999)) {
        return false;
    }
    *out = std::chrono::nanoseconds(nanos);
    return true;
}

// Parses a zone offset of the form "[+-]HH:MM" starting at `s`.
//
// `s`   points at the sign character. No length check is performed here:
//       s[0]..s[5] are read unconditionally, so the caller must guarantee
//       they are readable.
// `out` receives the correction, in minutes, that must be ADDED to the local
//       wall-clock time to obtain UTC. A "+05:00" suffix therefore yields
//       -300 minutes and "-05:00" yields +300 minutes. It is only written on
//       success.
//
// Returns false when the sign or ':' separator is missing, when either
// two-character field fails to parse, or when the fields are out of range
// (hours > 23 or minutes > 59).
static inline bool
ParseTZ(const char* s, std::chrono::minutes* out) {
    uint8_t hours = 0;
    uint8_t minutes = 0;
    if ((ARROW_PREDICT_FALSE(s[0] != '+') && ARROW_PREDICT_FALSE(s[0] != '-'))
        || ARROW_PREDICT_FALSE(s[3] != ':')) {
        return false;
    }
    if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 1, 2, &hours)
        )) {
        return false;
    }
    if (ARROW_PREDICT_FALSE(!arrow::internal::ParseUnsigned(s + 4, 2, &minutes)
        )) {
        return false;
    }

    // Valid fields are 00..23 hours and 00..59 minutes. The previous bounds
    // (`hours >= 12`, `minutes >= 59`) rejected ":59" as well as real-world
    // offsets such as +12:00, +13:00 and +14:00.
    if (ARROW_PREDICT_FALSE(hours >= 24)
        || ARROW_PREDICT_FALSE(minutes >= 60)) {
        return false;
    }

    // Accumulate in a signed type: negating the uint8_t fields directly would
    // wrap around instead of producing a negative offset.
    int32_t total = hours * 60 + minutes;
    if (s[0] == '+') {
        // Local time is ahead of UTC, so the offset has to be subtracted.
        total = -total;
    }
    *out = std::chrono::minutes(total);
    return true;
}

Expand All @@ -254,13 +298,20 @@ class CustomISO8601Parser : public arrow::TimestampParser {
int64_t* out,
bool* out_zone_offset_present = NULLPTR
) const override {
// If we are asked to parse this with second precision, fail on purpose:
// it will then be parsed again, but as nanoseconds, so the fractional
// digits won't be truncated.
if (unit == arrow::TimeUnit::SECOND) {
return false;
}

if (!arrow::internal::ParseTimestampISO8601(s, length, unit, out)) {
if (s[length - 1] == 'Z') {
--length;
}
if (length == 23) {
// "YYYY-MM-DD[ T]hh:mm:ss.sss"
// "YYYY-MM-DD[ T]hh:mm:ss.sss" -- millis

arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
Expand All @@ -281,8 +332,9 @@ class CustomISO8601Parser : public arrow::TimestampParser {
);
return true;
}

if (length == 25) {
// "2008-09-15[ T]15:53:00+05:00"
// "2008-09-15[ T]15:53:00+05:00" -- seconds with TZ
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
Expand All @@ -293,7 +345,7 @@ class CustomISO8601Parser : public arrow::TimestampParser {
))) {
return false;
}
std::chrono::hours tz;
std::chrono::minutes tz;
if (ARROW_PREDICT_FALSE(!ParseTZ(s + 19, &tz))) {
return false;
}
Expand All @@ -303,6 +355,137 @@ class CustomISO8601Parser : public arrow::TimestampParser {
);
return true;
}
if (length == 26) {
    // "YYYY-MM-DD[ T]hh:mm:ss.ssssss" -- microsecond fraction, no zone.
    arrow_vendored::date::year_month_day date;
    std::chrono::seconds time_of_day;
    std::chrono::microseconds fraction;

    // Parse date, time of day and fraction in order; stop at the first
    // component that fails.
    const bool parsed = ParseYYYY_MM_DD(s, &date)
        && arrow::internal::detail::ParseHH_MM_SS(s + 11, &time_of_day)
        && ParseSSSSSS(s + 19, &fraction);
    if (ARROW_PREDICT_FALSE(!parsed)) {
        return false;
    }

    // Only millisecond precision is kept. duration_cast truncates toward
    // zero, so the sub-millisecond digits are dropped, not rounded.
    const auto whole_millis =
        std::chrono::duration_cast<std::chrono::milliseconds>(fraction);

    *out = ConvertTimePoint(
        arrow_vendored::date::sys_days(date) + time_of_day + whole_millis,
        unit
    );
    return true;
}

if (length == 29) {
    // "YYYY-MM-DD[ T]hh:mm:ss.sssssssss" -- nanosecond fraction, no zone.
    //
    // The other 29-character layout, "YYYY-MM-DD[ T]hh:mm:ss.sss[+-]HH:MM",
    // is noted by the original author as being handled by Arrow's own
    // parser, so only the nanosecond form is attempted here.
    arrow_vendored::date::year_month_day ymd;
    if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
        return false;
    }
    std::chrono::seconds seconds;
    if (ARROW_PREDICT_FALSE(!arrow::internal::detail::ParseHH_MM_SS(
            s + 11, &seconds
        ))) {
        return false;
    }
    std::chrono::nanoseconds nanos;
    if (ARROW_PREDICT_FALSE(!ParseSSSSSSSSS(s + 19, &nanos))) {
        return false;
    }
    // Only millisecond precision is kept. duration_cast truncates toward
    // zero, so the sub-millisecond digits are dropped, not rounded.
    auto millis =
        std::chrono::duration_cast<std::chrono::milliseconds>(nanos);

    *out = ConvertTimePoint(
        arrow_vendored::date::sys_days(ymd) + seconds + millis, unit
    );

    return true;
}
if (length == 32) {
    // "YYYY-MM-DD[ T]hh:mm:ss.ssssss[+-]HH:MM" -- microsecond fraction
    // followed by a zone offset.
    arrow_vendored::date::year_month_day date;
    std::chrono::seconds time_of_day;
    std::chrono::microseconds fraction;
    std::chrono::minutes zone_shift;

    // Parse date, time of day, fraction and zone in order; stop at the
    // first component that fails.
    const bool parsed = ParseYYYY_MM_DD(s, &date)
        && arrow::internal::detail::ParseHH_MM_SS(s + 11, &time_of_day)
        && ParseSSSSSS(s + 19, &fraction)
        && ParseTZ(s + 26, &zone_shift);
    if (ARROW_PREDICT_FALSE(!parsed)) {
        return false;
    }

    // Only millisecond precision is kept. duration_cast truncates toward
    // zero, so the sub-millisecond digits are dropped, not rounded.
    const auto whole_millis =
        std::chrono::duration_cast<std::chrono::milliseconds>(fraction);

    *out = ConvertTimePoint(
        arrow_vendored::date::sys_days(date) + time_of_day + whole_millis
            + zone_shift,
        unit
    );
    return true;
}

if (length == 35) {
    // "YYYY-MM-DD[ T]hh:mm:ss.sssssssss[+-]HH:MM" -- nanosecond fraction
    // followed by a zone offset.
    arrow_vendored::date::year_month_day date;
    std::chrono::seconds time_of_day;
    std::chrono::nanoseconds fraction;
    std::chrono::minutes zone_shift;

    // Parse date, time of day, fraction and zone in order; stop at the
    // first component that fails.
    const bool parsed = ParseYYYY_MM_DD(s, &date)
        && arrow::internal::detail::ParseHH_MM_SS(s + 11, &time_of_day)
        && ParseSSSSSSSSS(s + 19, &fraction)
        && ParseTZ(s + 29, &zone_shift);
    if (ARROW_PREDICT_FALSE(!parsed)) {
        return false;
    }

    // Only millisecond precision is kept. duration_cast truncates toward
    // zero, so the sub-millisecond digits are dropped, not rounded.
    const auto whole_millis =
        std::chrono::duration_cast<std::chrono::milliseconds>(fraction);

    *out = ConvertTimePoint(
        arrow_vendored::date::sys_days(date) + time_of_day + whole_millis
            + zone_shift,
        unit
    );
    return true;
}
return false;
}
return true;
Expand Down Expand Up @@ -387,8 +570,8 @@ std::vector<std::shared_ptr<arrow::TimestampParser>> DATE_PARSERS{
std::make_shared<CustomISO8601Parser>(),
std::make_shared<USTimestampParser>(),
arrow::TimestampParser::MakeStrptime("%Y-%m-%d\\D%H:%M:%S.%f"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"
), // US locale string
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"),
// US locale string
arrow::TimestampParser::MakeStrptime("%m-%d-%Y"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y"),
arrow::TimestampParser::MakeStrptime("%d %m %Y"),
Expand All @@ -401,8 +584,8 @@ std::vector<std::shared_ptr<arrow::TimestampParser>> DATE_READERS{
std::make_shared<CustomISO8601Parser>(),
std::make_shared<USTimestampParser>(),
arrow::TimestampParser::MakeStrptime("%Y-%m-%d\\D%H:%M:%S.%f"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"
), // US locale string
arrow::TimestampParser::MakeStrptime("%m/%d/%Y, %I:%M:%S %p"),
// US locale
arrow::TimestampParser::MakeStrptime("%m-%d-%Y"),
arrow::TimestampParser::MakeStrptime("%m/%d/%Y"),
arrow::TimestampParser::MakeStrptime("%d %m %Y"),
Expand Down
2 changes: 1 addition & 1 deletion rust/perspective-js/src/ts/emscripten_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ export async function compile_perspective(
const str = Error().stack || "";
const textEncoder = new TextEncoder();
const bytes = textEncoder.encode(str);
const ptr = module._psp_js_alloc(bytes.byteLength + 1);
const ptr = module._psp_alloc(bytes.byteLength + 1);
module.HEAPU8.set(bytes, ptr);
module.HEAPU8[ptr + bytes.byteLength] = 0;
return ptr;
Expand Down
21 changes: 18 additions & 3 deletions rust/perspective-js/test/js/clear.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,29 @@ import type * as psp_types from "@finos/perspective";
{ x: 1, y: 2 },
{ x: 3, y: 4 },
]);
table.replace([{ x: 5, y: 6 }]);
await table.replace([{ x: 5, y: 6 }]);
json = await view.to_json();
expect(json).toHaveLength(1);
expect(json).toEqual([{ x: 5, y: 6 }]);
view.delete();
table.delete();
});

// Loads a CSV whose timestamp has millisecond precision, then replaces the
// table contents with a CSV whose timestamp has nine fractional digits. The
// CSV output is expected to show the second value cut to milliseconds
// (".876667543" -> ".876").
test("Replaces CSV Table with high precision datetimes", async function () {
    const millisCsv = '"start"\n2024-08-14T14:06:07.826Z';
    const nanosCsv = '"start"\n2024-08-14T14:06:09.876667543Z';

    const table = await perspective.table(millisCsv);
    const view = await table.view();

    // Initial load: the millisecond value is reported unchanged.
    const beforeReplace = await view.to_csv();
    expect(beforeReplace).toEqual('"start"\n2024-08-14 14:06:07.826\n');

    // After replace: only the first three fractional digits remain.
    await table.replace(nanosCsv);
    const afterReplace = await view.to_csv();
    expect(afterReplace).toEqual('"start"\n2024-08-14 14:06:09.876\n');

    view.delete();
    table.delete();
});

test("replaces the rows in the table with the input data and fires an on_update", async function () {
const table = await perspective.table([
{ x: 1, y: 2 },
Expand Down Expand Up @@ -95,7 +110,7 @@ import type * as psp_types from "@finos/perspective";
{ x: 3, y: 4 },
]);

table.replace([{ x: 5, y: 6 }]);
await table.replace([{ x: 5, y: 6 }]);
await result;
});

Expand Down Expand Up @@ -139,7 +154,7 @@ import type * as psp_types from "@finos/perspective";
{ x: 3, y: 4 },
]);

table.replace([{ x: 5, y: 6 }]);
await table.replace([{ x: 5, y: 6 }]);
await result;
});

Expand Down
Loading
Loading