Skip to content

Commit

Permalink
feat(core): add aggregate functions to "aggregate" transform
Browse files Browse the repository at this point in the history
In addition to "count", there are now "valid", "sum", "mean", "average",
"median", "min", "max", and "variance".
  • Loading branch information
tuner committed Aug 5, 2024
1 parent 1394985 commit a73d229
Show file tree
Hide file tree
Showing 5 changed files with 315 additions and 31 deletions.
77 changes: 69 additions & 8 deletions docs/grammar/transform/aggregate.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
# Aggregate

The `"aggregate"` transform is currently minimal – it adds a new `count` field
that contains the number of data items in a group. More aggregate operations
will be added later.

!!! warning

The parameterization will change in the future to support other aggregate
operations.
The `"aggregate"` transform summarizes data fields using aggregate functions,
such as `"sum"` or `"max"`. The data can be grouped by one or more fields,
which results in a list of objects with the grouped fields and the aggregate
values.

## Parameters

SCHEMA AggregateParams

### Available aggregate functions

Aggregate functions are applied to the data fields in each group.

- `"count"`: Count the number of records in each group.
- `"valid"`: Count the number of non-null and non-NaN values.
- `"sum"`: Sum the values.
- `"mean"`: Calculate the mean value.
- `"average"`: A synonym for `"mean"`.
- `"median"`: Calculate the median value.
- `"min"`: Find the minimum value.
- `"max"`: Find the maximum value.
- `"variance"`: Calculate the variance.

## Example

Given the following data:
Expand All @@ -38,3 +48,54 @@ A new list of data objects is created:
| ------ | ----- |
| first | 2 |
| second | 1 |

### Calculating min and max

<div><genome-spy-doc-embed height="152">

```json
{
"data": {
"values": [
{ "Category": "A", "Value": 5 },
{ "Category": "A", "Value": 9 },
{ "Category": "A", "Value": 9.5 },
{ "Category": "B", "Value": 3 },
{ "Category": "B", "Value": 5 },
{ "Category": "B", "Value": 7.5 },
{ "Category": "B", "Value": 8 }
]
},

"encoding": {
"y": { "field": "Category", "type": "nominal" }
},

"layer": [
{
"encoding": {
"x": { "field": "Value", "type": "quantitative" }
},
"mark": "point"
},
{
"transform": [
{
"type": "aggregate",
"groupby": ["Category"],
"fields": ["Value", "Value"],
"ops": ["min", "max"],
"as": ["minValue", "maxValue"]
}
],
"encoding": {
"x": { "field": "minValue", "type": "quantitative" },
"x2": { "field": "maxValue" }
},
"mark": "rule"
}
]
}
```

</genome-spy-doc-embed></div>
84 changes: 63 additions & 21 deletions packages/core/src/data/transforms/aggregate.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,63 @@ import { group as d3group } from "d3-array";
import FlowNode, { BEHAVIOR_CLONES } from "../flowNode.js";
import { field } from "../../utils/field.js";
import iterateNestedMaps from "../../utils/iterateNestedMaps.js";
import AGGREGATE_OPS from "./aggregateOps.js";

/**
* A minimal aggregate transform that just counts grouped (by a single field) data items.
* Work in progress.
*
* Eventually this will implement most of Vega's aggregate transform:
* https://vega.github.io/vega/docs/transforms/aggregate/
*/
export default class AggregateTransform extends FlowNode {
/**
 * Reports this node's behavior flag, which is always BEHAVIOR_CLONES.
 *
 * NOTE(review): presumably this tells the flow that the node emits new
 * datum objects instead of passing its inputs through — confirm against
 * flowNode.js.
 */
get behavior() {
return BEHAVIOR_CLONES;
}

/**
* @param {import("../../spec/transform.js").AggregateParams} params
*
* @typedef {import("../flowNode.js").Datum} Datum
*/
constructor(params) {
super();
this.params = params;

/** @type {any[]} */
this.buffer = [];

/**
* @type {((arr: Datum[]) => number)[]}
*/
this.ops = [];
/**
* @type {string[]}
*/
this.as = [];

if (params.fields) {
if (params.fields.length != params.ops.length) {
throw new Error("Fields and ops must have the same length!");
}

if (params.as && params.as.length != params.ops.length) {
throw new Error(
'If "as" is defined, "fields" and "as" must have the same length!'
);
}

params.fields.forEach((fieldName, i) => {
const accessor = field(fieldName);
const op = AGGREGATE_OPS[params.ops[i]];
this.ops.push((arr) => op(arr, accessor));
this.as.push(
params.as
? params.as[i]
: `${params.ops[i]}_${params.fields[i]}`
);
});
} else {
this.ops.push((arr) => AGGREGATE_OPS.count(arr));
this.as.push("count");
}
}

/**
 * Resets the node: delegates to the superclass reset and then replaces
 * the buffer with a new empty array.
 */
reset() {
super.reset();
this.buffer = [];
}

Expand All @@ -41,26 +73,36 @@ export default class AggregateTransform extends FlowNode {
complete() {
const params = this.params;

const groupby = params.groupby;
const groupby = params?.groupby;

if (groupby?.length > 0) {
const groupFieldAccessors = groupby.map((f) => field(f));

const groupFieldAccessors = groupby.map((f) => field(f));
// There's something strange in d3-array's typings
const groups = /** @type {Map<any, any>} */ /** @type {any} */ (
d3group(this.buffer, ...groupFieldAccessors)
);

// TODO: Fix case where no group fields are specified
for (const [group, data] of iterateNestedMaps(groups)) {
/** @type {any} */
const datum = {};

// There's something strange in d3-array's typings
const groups = /** @type {Map<any, any>} */ /** @type {any} */ (
d3group(this.buffer, ...groupFieldAccessors)
);
for (let i = 0; i < groupby.length; i++) {
datum[groupby[i]] = group[i];
}

for (const [group, data] of iterateNestedMaps(groups)) {
/** @type {any} */
const datum = {
count: data.length,
};
this.ops.forEach((op, i) => {
datum[this.as[i]] = op(data);
});

for (let i = 0; i < groupby.length; i++) {
datum[groupby[i]] = group[i];
this._propagate(datum);
}
} else {
/** @type {Datum} */
const datum = {};
this.ops.forEach((op, i) => {
datum[this.as[i]] = op(this.buffer);
});

this._propagate(datum);
}
Expand Down
134 changes: 134 additions & 0 deletions packages/core/src/data/transforms/aggregate.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import { describe, expect, test } from "vitest";
import { processData } from "../flowTestUtils.js";
import AggregateTransform from "./aggregate.js";

/**
 * Feeds `data` through a newly constructed AggregateTransform and returns
 * whatever `processData` returns for it.
 *
 * @param {import("../../spec/transform.js").AggregateParams} params
 * @param {any[]} data
 */
function transform(params, data) {
    const node = new AggregateTransform(params);
    return processData(node, data);
}

describe("Aggregate transform", () => {
    /**
     * Returns a fresh copy of the three-row fixture used by most tests, so
     * that no test depends on another one leaving its input untouched.
     */
    const makeInput = () => [
        { name: "alpha", data: 123 },
        { name: "beta", data: 456 },
        { name: "beta", data: 789 },
    ];

    test("Default to count when no data fields or group-by fields are specified", () => {
        expect(transform({ type: "aggregate" }, makeInput())).toEqual([
            { count: 3 },
        ]);
    });

    test("Default to count when no data fields are specified", () => {
        expect(
            transform({ type: "aggregate", groupby: ["name"] }, makeInput())
        ).toEqual([
            { name: "alpha", count: 1 },
            { name: "beta", count: 2 },
        ]);
    });

    test("Compute count, sum, min, max, and mean for groups. Use default output field names.", () => {
        expect(
            transform(
                {
                    type: "aggregate",
                    groupby: ["name"],
                    fields: ["data", "data", "data", "data", "data"],
                    ops: ["count", "sum", "min", "max", "mean"],
                },
                makeInput()
            )
        ).toEqual([
            {
                name: "alpha",
                count_data: 1,
                sum_data: 123,
                min_data: 123,
                max_data: 123,
                mean_data: 123,
            },
            {
                name: "beta",
                count_data: 2,
                sum_data: 1245,
                min_data: 456,
                max_data: 789,
                mean_data: 622.5,
            },
        ]);
    });

    test("Allow custom output field names", () => {
        // No groupby: all three rows are aggregated into a single object.
        expect(
            transform(
                {
                    type: "aggregate",
                    fields: ["data", "data", "data", "data", "data"],
                    ops: ["count", "sum", "min", "max", "mean"],
                    as: ["count", "total", "min", "max", "average"],
                },
                makeInput()
            )
        ).toEqual([
            {
                count: 3,
                total: 1368,
                min: 123,
                max: 789,
                average: 456,
            },
        ]);
    });

    test("Throw if the length of fields and ops does not match", () => {
        const input = [{ name: "beta", data: 789 }];

        // Four fields versus five ops
        expect(() =>
            transform(
                {
                    type: "aggregate",
                    fields: ["data", "data", "data", "data"],
                    ops: ["count", "sum", "min", "max", "mean"],
                },
                input
            )
        ).toThrow();
    });

    test("Throw if the length of fields and as does not match", () => {
        const input = [{ name: "beta", data: 789 }];

        // One field/op versus two output names
        expect(() =>
            transform(
                {
                    type: "aggregate",
                    fields: ["data"],
                    ops: ["count"],
                    as: ["count", "total"],
                },
                input
            )
        ).toThrow();
    });
});
17 changes: 17 additions & 0 deletions packages/core/src/data/transforms/aggregateOps.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import { count, max, mean, median, min, sum, variance } from "d3-array";

/**
 * Lookup table from aggregate op name to its implementation.
 *
 * Every implementation receives the array of data objects to summarize and
 * an optional accessor that extracts the value from each object.
 *
 * @type {Record<import("../../spec/transform.js").AggregateOp, (arr: any[], accessor?: (datum: any) => number) => number>}
 */
const AGGREGATE_OPS = {
    // Number of records; the accessor is ignored.
    count: (arr) => arr.length,
    // d3's count skips null and NaN values, i.e. it counts the valid ones.
    valid: count,
    sum,
    min,
    max,
    mean,
    // Documented synonym for "mean".
    average: mean,
    median,
    variance,
};

export default AGGREGATE_OPS;
Loading

0 comments on commit a73d229

Please sign in to comment.