Skip to content

Derived columns #367

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
May 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 76 additions & 12 deletions src/table.js
Original file line number Diff line number Diff line change
Expand Up @@ -627,14 +627,13 @@ export function getSchema(source) {
return {schema, inferred: false};
}

// This function applies table cell operations to an in-memory table (array of
// objects); it should be equivalent to the corresponding SQL query. TODO Use
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
// function to do table operations on in-memory data?
export function __table(source, operations) {
// This function infers a schema from the source data, if one doesn't already
// exist, and merges type assertions into that schema. If the schema was
// inferred or if there are type assertions, it then coerces the rows in the
// source data to the types specified in the schema.
function applyTypes(source, operations) {
const input = source;
let {schema, inferred} = getSchema(source);
// Combine column types from schema with user-selected types in operations
const types = new Map(schema.map(({name, type}) => [name, type]));
if (operations.types) {
for (const {name, type} of operations.types) {
Expand All @@ -650,6 +649,66 @@ export function __table(source, operations) {
// operations.types, above.
source = source.map(d => coerceRow(d, types, schema));
}
return {source, schema};
}

// This function returns a copy of the source rows in which each column key is
// replaced by the user-specified name from operations.names, if any. Columns
// without an override (or whose override has no name) keep their original
// key, and key order within each row is preserved. If there are no name
// overrides at all, the source is returned as-is (not copied).
function applyNames(source, operations) {
  const {names} = operations;
  if (!names) return source;
  // Index the overrides by original column name for constant-time lookup.
  const renames = new Map(names.map((rename) => [rename.column, rename]));
  const renameKey = (key) => {
    const rename = renames.get(key);
    return rename?.name ?? key;
  };
  return source.map((row) => {
    const entries = [];
    for (const key of Object.keys(row)) {
      entries.push([renameKey(key), row[key]]);
    }
    return Object.fromEntries(entries);
  });
}

// This function applies table cell operations to an in-memory table (array of
// objects); it should be equivalent to the corresponding SQL query. TODO Use
// DuckDBClient for data arrays, too, and then we wouldn’t need our own __table
// function to do table operations on in-memory data?
export function __table(source, operations) {
const errors = new Map();
const input = source;
const typed = applyTypes(source, operations);
source = typed.source;
let schema = typed.schema;
if (operations.derive) {
// Derived columns may depend on coerced values from the original data source,
// so we must evaluate derivations after the initial inference and coercion
// step.
const derivedSource = [];
operations.derive.map(({name, value}) => {
let columnErrors = [];
// Derived column formulas may reference renamed columns, so we must
// compute derivations on the renamed source. However, we don't modify the
// source itself with renamed names until after the other operations are
// applied, because operations like filter and sort reference original
// column names.
// TODO Allow derived columns to reference other derived columns.
applyNames(source, operations).map((row, index, rows) => {
let resolved;
try {
resolved = value(row, index, rows);
} catch (error) {
columnErrors.push({index, error});
resolved = undefined;
}
if (derivedSource[index]) {
derivedSource[index] = {...derivedSource[index], [name]: resolved};
} else {
derivedSource.push({[name]: resolved});
}
});
if (columnErrors.length) errors.set(name, columnErrors);
});
// Since derived columns are untyped by default, we do a pass of type
// inference and coercion after computing the derived values.
const typedDerived = applyTypes(derivedSource, operations);
// Merge derived source and schema with the source dataset.
source = source.map((row, i) => ({...row, ...typedDerived.source[i]}));
schema = [...schema, ...typedDerived.schema];
}
for (const {type, operands} of operations.filter) {
const [{value: column}] = operands;
const values = operands.slice(1).map(({value}) => value);
Expand Down Expand Up @@ -750,6 +809,8 @@ export function __table(source, operations) {
if (from > 0 || to < Infinity) {
source = source.slice(Math.max(0, from), Math.max(0, to));
}
// Preserve the schema for all columns.
let fullSchema = schema.slice();
if (operations.select.columns) {
if (schema) {
const schemaByName = new Map(schema.map((s) => [s.name, s]));
Expand All @@ -767,16 +828,19 @@ export function __table(source, operations) {
return ({...s, ...(override ? {name: override.name} : null)});
});
}
source = source.map((d) =>
Object.fromEntries(Object.keys(d).map((k) => {
const override = overridesByName.get(k);
return [override?.name ?? k, d[k]];
}))
);
if (fullSchema) {
fullSchema = fullSchema.map((s) => {
const override = overridesByName.get(s.name);
return ({...s, ...(override ? {name: override.name} : null)});
});
}
source = applyNames(source, operations);
}
if (source !== input) {
if (schema) source.schema = schema;
}
source.fullSchema = fullSchema;
source.errors = errors;
return source;
}

Expand Down
96 changes: 86 additions & 10 deletions test/table-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,8 @@ describe("__table", () => {
};
const expectedEmpty = [{}, {}, {}];
expectedEmpty.schema = [];
expectedEmpty.fullSchema = source.schema;
expectedEmpty.errors = new Map();
assert.deepStrictEqual(
__table(source, operationsEmptyColumns),
expectedEmpty
Expand All @@ -514,6 +516,8 @@ describe("__table", () => {
};
const expectedSelected = [{a: 1}, {a: 2}, {a: 3}];
expectedSelected.schema = [{name: "a", type: "integer", inferred: "integer"}];
expectedSelected.fullSchema = source.schema;
expectedSelected.errors = new Map();
assert.deepStrictEqual(
__table(source, operationsSelectedColumns),
expectedSelected
Expand Down Expand Up @@ -546,6 +550,8 @@ describe("__table", () => {
};
const expectedEq = [{a: 1, b: 2, c: 3}];
expectedEq.schema = source.schema;
expectedEq.fullSchema = source.schema;
expectedEq.errors = new Map();
assert.deepStrictEqual(__table(source, operationsEquals), expectedEq);
const operationsComparison = {
...EMPTY_TABLE_DATA.operations,
Expand All @@ -568,6 +574,8 @@ describe("__table", () => {
};
const expectedLtGt = [{a: 2, b: 4, c: 6}];
expectedLtGt.schema = source.schema;
expectedLtGt.fullSchema = source.schema;
expectedLtGt.errors = new Map();
assert.deepStrictEqual(__table(source, operationsComparison), expectedLtGt);
});

Expand All @@ -586,6 +594,8 @@ describe("__table", () => {
};
const expectedEq = [{a: 1, b: 2, c: 3}];
expectedEq.schema = source.schema;
expectedEq.fullSchema = source.schema;
expectedEq.errors = new Map();
assert.deepStrictEqual(__table(source, operationsEquals), expectedEq);
const operationsComparison = {
...EMPTY_TABLE_DATA.operations,
Expand All @@ -608,6 +618,8 @@ describe("__table", () => {
};
const expectedLteGte = [{a: 2, b: 4, c: 6}];
expectedLteGte.schema = source.schema;
expectedLteGte.fullSchema = source.schema;
expectedLteGte.errors = new Map();
assert.deepStrictEqual(
__table(source, operationsComparison),
expectedLteGte
Expand All @@ -634,6 +646,8 @@ describe("__table", () => {
];
const expected = [{a: new Date("2021-01-02")}];
expected.schema = [{name: "a", type: "date", inferred: "date"}];
expected.fullSchema = expected.schema;
expected.errors = new Map();
assert.deepStrictEqual(__table(source, operationsEquals), expected);
});

Expand All @@ -648,6 +662,8 @@ describe("__table", () => {
{a: 1, b: 2, c: 3}
];
expectedDesc.schema = source.schema;
expectedDesc.fullSchema = source.schema;
expectedDesc.errors = new Map();
assert.deepStrictEqual(__table(source, operationsDesc), expectedDesc);
const operationsAsc = {
...EMPTY_TABLE_DATA.operations,
Expand All @@ -659,6 +675,8 @@ describe("__table", () => {
{a: 3, b: 6, c: 9}
];
expectedAsc.schema = source.schema;
expectedAsc.fullSchema = source.schema;
expectedAsc.errors = new Map();
assert.deepStrictEqual(__table(source, operationsAsc), expectedAsc);
const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}];
const operationsMulti = {
Expand All @@ -676,6 +694,8 @@ describe("__table", () => {
{a: 1, b: 2, c: 3}
];
expectedExtended.schema = source.schema;
expectedExtended.fullSchema = source.schema;
expectedExtended.errors = new Map();
assert.deepStrictEqual(
__table(sourceExtended, operationsMulti),
expectedExtended
Expand All @@ -694,6 +714,8 @@ describe("__table", () => {
{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: NaN}, {a: NaN}, {a: NaN}, {a: NaN}
];
expectedDesc.schema = [{name: "a", type: "number", inferred: "number"}];
expectedDesc.fullSchema = expectedDesc.schema;
expectedDesc.errors = new Map();
assert.deepStrictEqual(
__table(sourceWithMissing, operationsDesc),
expectedDesc
Expand All @@ -706,6 +728,8 @@ describe("__table", () => {
{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: NaN}, {a: NaN}, {a: NaN}, {a: NaN}
];
expectedAsc.schema = [{name: "a", type: "number", inferred: "number"}];
expectedAsc.fullSchema = expectedAsc.schema;
expectedAsc.errors = new Map();
assert.deepStrictEqual(
__table(sourceWithMissing, operationsAsc),
expectedAsc
Expand All @@ -723,6 +747,8 @@ describe("__table", () => {
{a: 1, b: 2, c: 3}
];
sorted.schema = source.schema;
sorted.fullSchema = source.schema;
sorted.errors = new Map();
assert.deepStrictEqual(__table(source, operations), sorted);
const originalOrder = [
{a: 1, b: 2, c: 3},
Expand All @@ -743,13 +769,17 @@ describe("__table", () => {
{a: 3, b: 6, c: 9}
];
expectedToNull.schema = source.schema;
expectedToNull.fullSchema = source.schema;
expectedToNull.errors = new Map();
assert.deepStrictEqual(__table(source, operationsToNull), expectedToNull);
const operationsFromNull = {
...EMPTY_TABLE_DATA.operations,
slice: {from: null, to: 1}
};
const expectedFromNull = [{a: 1, b: 2, c: 3}];
expectedFromNull.schema = source.schema;
expectedFromNull.fullSchema = source.schema;
expectedFromNull.errors = new Map();
assert.deepStrictEqual(
__table(source, operationsFromNull),
expectedFromNull
Expand All @@ -760,6 +790,8 @@ describe("__table", () => {
};
const expectedSlice = [{a: 2, b: 4, c: 6}];
expectedSlice.schema = source.schema;
expectedSlice.fullSchema = source.schema;
expectedSlice.errors = new Map();
assert.deepStrictEqual(__table(source, operations), expectedSlice);
});

Expand Down Expand Up @@ -794,18 +826,16 @@ describe("__table", () => {
{nameA: 2, b: 4, c: 6},
{nameA: 3, b: 6, c: 9}
];
expected.schema = [
const schema = [
{name: "nameA", type: "integer", inferred: "integer"},
{name: "b", type: "integer", inferred: "integer"},
{name: "c", type: "integer", inferred: "integer"}
];
expected.schema = schema;
expected.fullSchema = schema;
expected.errors = new Map();
assert.deepStrictEqual(__table(source, operations), expected);
source.columns = ["a", "b", "c"];
assert.deepStrictEqual(__table(source, operations).schema, [
{name: "nameA", type: "integer", inferred: "integer"},
{name: "b", type: "integer", inferred: "integer"},
{name: "c", type: "integer", inferred: "integer"}
]);
});

it("__table type assertions", () => {
Expand All @@ -823,13 +853,59 @@ describe("__table", () => {
{name: "b", type: "integer", inferred: "integer"},
{name: "c", type: "integer", inferred: "integer"}
];
expected.fullSchema = expected.schema;
expected.errors = new Map();
assert.deepStrictEqual(__table(source, operations), expected);
source.columns = ["a", "b", "c"];
assert.deepStrictEqual(__table(source, operations).schema, [
{name: "a", type: "string", inferred: "integer"},
});

it("__table derived columns", () => {
const operations = {
...EMPTY_TABLE_DATA.operations,
derive: [{name: "d", value: (row) => row.a ** 2}]
};
const expected = [
{a: 1, b: 2, c: 3, d: 1},
{a: 2, b: 4, c: 6, d: 4},
{a: 3, b: 6, c: 9, d: 9}
];
expected.schema = [
{name: "a", type: "integer", inferred: "integer"},
{name: "b", type: "integer", inferred: "integer"},
{name: "c", type: "integer", inferred: "integer"}
]);
{name: "c", type: "integer", inferred: "integer"},
{name: "d", type: "integer", inferred: "integer"}
];
expected.fullSchema = expected.schema;
expected.errors = new Map();
assert.deepStrictEqual(__table(source, operations), expected);
});

it("__table derived columns with errors", () => {
const functionWithError = (row) => row.a.b.c;
const operations = {
...EMPTY_TABLE_DATA.operations,
derive: [{name: "d", value: functionWithError}]
};
let error;
try {
functionWithError(source[0]);
} catch (e) {
error = e;
}
const expected = [
{a: 1, b: 2, c: 3, d: undefined},
{a: 2, b: 4, c: 6, d: undefined},
{a: 3, b: 6, c: 9, d: undefined}
];
expected.schema = [
{name: "a", type: "integer", inferred: "integer"},
{name: "b", type: "integer", inferred: "integer"},
{name: "c", type: "integer", inferred: "integer"},
{name: "d", type: "other", inferred: "other"}
];
expected.fullSchema = expected.schema;
expected.errors = new Map([["d", [{index: 0, error}, {index: 1, error}, {index: 2, error}]]]);
assert.deepStrictEqual(__table(source, operations), expected);
});
});

Expand Down