From a73d229bdbfa1aa7e4c782d329c301035e88d0b5 Mon Sep 17 00:00:00 2001 From: Kari Lavikka Date: Mon, 5 Aug 2024 10:30:07 +0300 Subject: [PATCH] feat(core): add aggregate functions to `"aggregate"` transform In addition to "count", there are now "valid", "sum", "mean", "average", "median", "min", "max", and "variance". --- docs/grammar/transform/aggregate.md | 77 ++++++++-- .../core/src/data/transforms/aggregate.js | 84 ++++++++--- .../src/data/transforms/aggregate.test.js | 134 ++++++++++++++++++ .../core/src/data/transforms/aggregateOps.js | 17 +++ packages/core/src/spec/transform.d.ts | 34 ++++- 5 files changed, 315 insertions(+), 31 deletions(-) create mode 100644 packages/core/src/data/transforms/aggregate.test.js create mode 100644 packages/core/src/data/transforms/aggregateOps.js diff --git a/docs/grammar/transform/aggregate.md b/docs/grammar/transform/aggregate.md index 834e805a..40c74be8 100644 --- a/docs/grammar/transform/aggregate.md +++ b/docs/grammar/transform/aggregate.md @@ -1,18 +1,28 @@ # Aggregate -The `"aggregate"` transform is currently minimal – it adds a new `count` field -that contains the number of data items in a group. More aggregate operations -will be added later. - -!!! warning - - The parameterization will change in the future to support other aggregate - operations. +The `"aggregate"` transform summarizes data fields using aggregate functions, +such as `"sum"` or `"max"`. The data can be grouped by one or more fields, +which results in a list of objects with the grouped fields and the aggregate +values. ## Parameters SCHEMA AggregateParams +### Available aggregate functions + +Aggregate functions are applied to the data fields in each group. + +- `"count"`: Count the number of records in each group. +- `"valid"`: Count the number of non-null and non-NaN values. +- `"sum"`: Sum the values. +- `"mean"`: Calculate the mean value. +- `"average"`: A synonym for `"mean"`. +- `"median"`: Calculate the median value. 
+- `"min"`: Find the minimum value. +- `"max"`: Find the maximum value. +- `"variance"`: Calculate the variance. + ## Example Given the following data: @@ -38,3 +48,54 @@ A new list of data objects is created: | ------ | ----- | | first | 2 | | second | 1 | + +### Calculating min and max + +
+ +```json +{ + "data": { + "values": [ + { "Category": "A", "Value": 5 }, + { "Category": "A", "Value": 9 }, + { "Category": "A", "Value": 9.5 }, + { "Category": "B", "Value": 3 }, + { "Category": "B", "Value": 5 }, + { "Category": "B", "Value": 7.5 }, + { "Category": "B", "Value": 8 } + ] + }, + + "encoding": { + "y": { "field": "Category", "type": "nominal" } + }, + + "layer": [ + { + "encoding": { + "x": { "field": "Value", "type": "quantitative" } + }, + "mark": "point" + }, + { + "transform": [ + { + "type": "aggregate", + "groupby": ["Category"], + "fields": ["Value", "Value"], + "ops": ["min", "max"], + "as": ["minValue", "maxValue"] + } + ], + "encoding": { + "x": { "field": "minValue", "type": "quantitative" }, + "x2": { "field": "maxValue" } + }, + "mark": "rule" + } + ] +} +``` + +
diff --git a/packages/core/src/data/transforms/aggregate.js b/packages/core/src/data/transforms/aggregate.js index e3cd8f96..9b15aebf 100644 --- a/packages/core/src/data/transforms/aggregate.js +++ b/packages/core/src/data/transforms/aggregate.js @@ -2,14 +2,8 @@ import { group as d3group } from "d3-array"; import FlowNode, { BEHAVIOR_CLONES } from "../flowNode.js"; import { field } from "../../utils/field.js"; import iterateNestedMaps from "../../utils/iterateNestedMaps.js"; +import AGGREGATE_OPS from "./aggregateOps.js"; -/** - * A minimal aggregate transform that just counts grouped (by a single field) data items. - * Work in progress. - * - * Eventually this will implement the most of Vega's aggregate transform: - * https://vega.github.io/vega/docs/transforms/aggregate/ - */ export default class AggregateTransform extends FlowNode { get behavior() { return BEHAVIOR_CLONES; @@ -17,6 +11,8 @@ export default class AggregateTransform extends FlowNode { /** * @param {import("../../spec/transform.js").AggregateParams} params + * + * @typedef {import("../flowNode.js").Datum} Datum */ constructor(params) { super(); @@ -24,9 +20,45 @@ export default class AggregateTransform extends FlowNode { /** @type {any[]} */ this.buffer = []; + + /** + * @type {((arr: Datum[]) => number)[]} + */ + this.ops = []; + /** + * @type {string[]} + */ + this.as = []; + + if (params.fields) { + if (params.fields.length != params.ops.length) { + throw new Error("Fields and ops must have the same length!"); + } + + if (params.as && params.as.length != params.ops.length) { + throw new Error( + 'If "as" is defined, "fields" and "as" must have the same length!' + ); + } + + params.fields.forEach((fieldName, i) => { + const accessor = field(fieldName); + const op = AGGREGATE_OPS[params.ops[i]]; + this.ops.push((arr) => op(arr, accessor)); + this.as.push( + params.as + ? 
params.as[i] + : `${params.ops[i]}_${params.fields[i]}` + ); + }); + } else { + this.ops.push((arr) => AGGREGATE_OPS.count(arr)); + this.as.push("count"); + } } reset() { + super.reset(); this.buffer = []; } @@ -41,26 +73,36 @@ export default class AggregateTransform extends FlowNode { complete() { const params = this.params; - const groupby = params.groupby; + const groupby = params?.groupby; + + if (groupby?.length > 0) { + const groupFieldAccessors = groupby.map((f) => field(f)); - const groupFieldAccessors = groupby.map((f) => field(f)); + // There's something strange in d3-array's typings + const groups = /** @type {Map} */ /** @type {any} */ ( + d3group(this.buffer, ...groupFieldAccessors) + ); - // TODO: Fix case where no group fields are specified + for (const [group, data] of iterateNestedMaps(groups)) { + /** @type {any} */ + const datum = {}; - // There's something strange in d3-array's typings - const groups = /** @type {Map} */ /** @type {any} */ ( - d3group(this.buffer, ...groupFieldAccessors) - ); + for (let i = 0; i < groupby.length; i++) { + datum[groupby[i]] = group[i]; + } - for (const [group, data] of iterateNestedMaps(groups)) { - /** @type {any} */ - const datum = { - count: data.length, - }; + this.ops.forEach((op, i) => { + datum[this.as[i]] = op(data); + }); - for (let i = 0; i < groupby.length; i++) { - datum[groupby[i]] = group[i]; + this._propagate(datum); } + } else { + /** @type {Datum} */ + const datum = {}; + this.ops.forEach((op, i) => { + datum[this.as[i]] = op(this.buffer); + }); this._propagate(datum); } diff --git a/packages/core/src/data/transforms/aggregate.test.js b/packages/core/src/data/transforms/aggregate.test.js new file mode 100644 index 00000000..c288482c --- /dev/null +++ b/packages/core/src/data/transforms/aggregate.test.js @@ -0,0 +1,134 @@ +import { describe, expect, test } from "vitest"; +import { processData } from "../flowTestUtils.js"; +import AggregateTransform from "./aggregate.js"; + +/** + * @param 
{import("../../spec/transform.js").AggregateParams} params + * @param {any[]} data + */ +function transform(params, data) { + return processData(new AggregateTransform(params), data); +} + +describe("Aggregate transform", () => { + test("Default to count when no data fields or group-by fields are specified", () => { + const input = [ + { name: "alpha", data: 123 }, + { name: "beta", data: 456 }, + { name: "beta", data: 789 }, + ]; + + expect(transform({ type: "aggregate" }, input)).toEqual([{ count: 3 }]); + }); + + test("Default to count when no data fields fields are specified", () => { + const input = [ + { name: "alpha", data: 123 }, + { name: "beta", data: 456 }, + { name: "beta", data: 789 }, + ]; + + expect( + transform({ type: "aggregate", groupby: ["name"] }, input) + ).toEqual([ + { name: "alpha", count: 1 }, + { name: "beta", count: 2 }, + ]); + }); + + test("Compute count, sum, min, max, and mean for groups. Use default output field names.", () => { + const input = [ + { name: "alpha", data: 123 }, + { name: "beta", data: 456 }, + { name: "beta", data: 789 }, + ]; + + expect( + transform( + { + type: "aggregate", + groupby: ["name"], + fields: ["data", "data", "data", "data", "data"], + ops: ["count", "sum", "min", "max", "mean"], + }, + input + ) + ).toEqual([ + { + name: "alpha", + count_data: 1, + sum_data: 123, + min_data: 123, + max_data: 123, + mean_data: 123, + }, + { + name: "beta", + count_data: 2, + sum_data: 1245, + min_data: 456, + max_data: 789, + mean_data: 622.5, + }, + ]); + }); + + test("Allow custom output field names", () => { + const input = [ + { name: "alpha", data: 123 }, + { name: "beta", data: 456 }, + { name: "beta", data: 789 }, + ]; + + expect( + transform( + { + type: "aggregate", + fields: ["data", "data", "data", "data", "data"], + ops: ["count", "sum", "min", "max", "mean"], + as: ["count", "total", "min", "max", "average"], + }, + input + ) + ).toEqual([ + { + count: 3, + total: 1368, + min: 123, + max: 789, + average: 
456, + }, + ]); + }); + + test("Throw if the length of fields and ops does not match", () => { + const input = [{ name: "beta", data: 789 }]; + + expect(() => + transform( + { + type: "aggregate", + fields: ["data", "data", "data", "data"], + ops: ["count", "sum", "min", "max", "mean"], + }, + input + ) + ).toThrow(); + }); + + test("Throw if the length of fields and as does not match", () => { + const input = [{ name: "beta", data: 789 }]; + + expect(() => + transform( + { + type: "aggregate", + fields: ["data"], + ops: ["count"], + as: ["count", "total"], + }, + input + ) + ).toThrow(); + }); +}); diff --git a/packages/core/src/data/transforms/aggregateOps.js b/packages/core/src/data/transforms/aggregateOps.js new file mode 100644 index 00000000..6b4039b3 --- /dev/null +++ b/packages/core/src/data/transforms/aggregateOps.js @@ -0,0 +1,17 @@ +import { count, max, mean, median, min, sum, variance } from "d3-array"; + +/** + * @type {Record number) => number>} + */ +const AGGREGATE_OPS = { + count: (arr) => arr.length, + valid: count, + sum, + min, + max, + mean, + median, + variance, +}; + +export default AGGREGATE_OPS; diff --git a/packages/core/src/spec/transform.d.ts b/packages/core/src/spec/transform.d.ts index 526b4abb..6fc291cd 100644 --- a/packages/core/src/spec/transform.d.ts +++ b/packages/core/src/spec/transform.d.ts @@ -188,14 +188,44 @@ export interface StackParams extends TransformParamsBase { baseField?: Field; } +export type AggregateOp = + | "count" + | "valid" + | "sum" + | "min" + | "max" + | "mean" + | "median" + | "variance"; + export interface AggregateParams extends TransformParamsBase { type: "aggregate"; /** - * Which fields to use for grouping. Missing `groupby` results in a single - * group that includes all the data items. + * The fields by which to group the data. If these are not defined, all data + * objects will be grouped into a single category. */ groupby?: Field[]; + + /** + * The data fields to apply aggregate functions to. 
This array should + * correspond with the `ops` and `as` arrays. If no fields or operations + * are specified, a count aggregation will be applied by default. + */ + fields?: Field[]; + + /** + * The aggregation operations to be performed on the fields, such as `"sum"`, + * `"mean"`, or `"count"`. + */ + ops?: AggregateOp[]; + + /** + * The names for the output fields corresponding to each aggregated field. + * If not provided, names will be automatically created using the operation + * and field names (e.g., `sum_field`, `mean_field`). + */ + as?: string[]; } export interface FlattenParams extends TransformParamsBase {