From ea741feb6028b80e9dbd7ad1aa930c603b620c9f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 19 Jan 2023 15:12:53 -0800 Subject: [PATCH 01/90] Stop using {typed: true} for csv and tsv --- src/table.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 05f30469..33f87dc2 100644 --- a/src/table.js +++ b/src/table.js @@ -191,8 +191,8 @@ function sourceCache(loadSource) { const loadTableDataSource = sourceCache(async (source, name) => { if (source instanceof FileAttachment) { switch (source.mimeType) { - case "text/csv": return source.csv({typed: true}); - case "text/tab-separated-values": return source.tsv({typed: true}); + case "text/csv": return source.csv(); + case "text/tab-separated-values": return source.tsv(); case "application/json": return source.json(); case "application/x-sqlite3": return source.sqlite(); } From 558a6a6bdfa8ea5f33ad636abfde9a7001c4dbac Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 19 Jan 2023 15:14:14 -0800 Subject: [PATCH 02/90] Infer schema if none exists --- src/table.js | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 33f87dc2..7f180da3 100644 --- a/src/table.js +++ b/src/table.js @@ -543,8 +543,9 @@ export function getTypeValidator(colType) { // DuckDBClient for data arrays, too, and then we wouldn’t need our own __table // function to do table operations on in-memory data? export function __table(source, operations) { - const input = source; let {schema, columns} = source; + if (!schema) source.schema = inferSchema(source); + const input = source; let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); for (const {type, operands} of operations.filter) { @@ -666,3 +667,79 @@ export function __table(source, operations) { } return source; } + +function initKey() { + return { + other: 0, + boolean: 0, + integer: 0, + number: 0, + date: 0, + string: 0, + array: 0, + object: 0, + bigint: 0, // TODO for csv, tsv? + buffer: 0 + }; +} + +function inferSchema(source) { + const schema = []; + const sampleSize = 100; + const sample = source.slice(0, sampleSize); + const typeCounts = {}; + sample.map((d) => { + for (const key in d) { + if (!typeCounts[key]) typeCounts[key] = initKey(); + // for json and sqlite, we already have some types, but for csv and tsv, all + // columns are strings here. + const type = typeof d[key]; + const value = type === "string" ? d[key]?.trim() : d[key]; + if (value === null || value === undefined || value.length === 0) + typeCounts[key]["other"]++; + else if (type !== "string") { + if (Array.isArray(value)) typeCounts[key]["array"]++; + else if (value instanceof Date) typeCounts[key]["date"]++; + else if (value instanceof ArrayBuffer) typeCounts[key]["buffer"]++; + else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object + } else { + if (value === "true" || value === "false") + typeCounts[key]["boolean"]++; + else if (!isNaN(+value) && /^-?[0-9]+$/.test(value)) + typeCounts[key]["integer"]++; + else if (!isNaN(+value)) typeCounts[key]["number"]++; + else if ( + value.match( + /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ + ) + ) + typeCounts[key]["date"]++; + else if (value.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})/)) + typeCounts[key]["date"]++; + else if (value.match(/(\d{4})-(\d{1,2})-(\d{1,2})/)) + typeCounts[key]["date"]++; + else typeCounts[key]["string"]++; + } + } + }); + const columns = Object.keys(typeCounts); + for (const col of columns) { + // sort descending so most commonly encoutered type is first + const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) { + return typeCounts[col][b] - typeCounts[col][a]; + }); + let type = typesSorted[0]; + if (type === "other") { + // take the next-most-encountered type if most are "other", but only if + // its tally is greater than the next one in the list + if (typeCounts[typesSorted[1]] > typeCounts[typesSorted[2]]) + type = typesSorted[1]; + // else we could iterate over the sample and use the first encountered type + } + schema.push({ + name: col, + type: type + }); + } + return schema; +} \ No newline at end of file From ba09d45f71a847d19ed4d00e9cdc438dc3b47253 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Fri, 20 Jan 2023 09:55:13 -0800 Subject: [PATCH 03/90] Add schema validity check to address #9673 --- src/table.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 7f180da3..1dedda1f 100644 --- a/src/table.js +++ b/src/table.js @@ -544,7 +544,7 @@ export function getTypeValidator(colType) { // function to do table operations on in-memory data? export function __table(source, operations) { let {schema, columns} = source; - if (!schema) source.schema = inferSchema(source); + if (!schema || !isValidSchema(schema)) source.schema = inferSchema(source); const input = source; let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); @@ -668,6 +668,13 @@ export function __table(source, operations) { return source; } +function isValidSchema(schema) { + if (!schema || !Array.isArray(schema)) return; + return schema.every((s) => { + s && typeof s.name === "string" && typeof s.type === "string"; + }); +} + function initKey() { return { other: 0, From 3a3f5a15cad0ea022a939f794c5cf2c0ff3e37ee Mon Sep 17 00:00:00 2001 From: Libbey White Date: Fri, 20 Jan 2023 11:19:56 -0800 Subject: [PATCH 04/90] Update tests --- test/table-test.js | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index 0ef3e53d..e04d7a42 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -454,9 +454,10 @@ describe("__table", () => { const operationsNullColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: null}}; assert.deepStrictEqual(__table(source, operationsNullColumns), source); const operationsEmptyColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: []}}; - assert.deepStrictEqual(__table(source, operationsEmptyColumns), [{}, {}, {}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsEmptyColumns).slice(), [{}, {}, {}]); const operationsSelectedColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: ["a"]}}; - assert.deepStrictEqual(__table(source, operationsSelectedColumns), [{a: 1}, {a: 2}, {a: 3}]); + assert.deepStrictEqual(__table(source, operationsSelectedColumns).slice(), [{a: 1}, {a: 2}, {a: 3}]); }); it("__table unknown filter", () => { @@ -480,7 +481,8 @@ describe("__table", () => { {type: "gt", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2}]} ] }; - assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]); }); it("__table filter lte + gte", () => { @@ -496,7 +498,8 @@ describe("__table", () => { {type: "gte", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2.5}]} ] }; - assert.deepStrictEqual(__table(source, operationsComparison), [{a: 2, b: 4, c: 6}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]); }); it("__table filter primitive lte + gte", () => { @@ -526,8 +529,9 @@ describe("__table", () => { [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}] ); const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]}; + // comparing the result of .slice() removes schema from the comparison assert.deepStrictEqual( - __table(source, operationsAsc), + __table(source, operationsAsc).slice(), [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}] ); const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}]; @@ -549,8 +553,9 @@ describe("__table", () => { [{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null}] ); const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]}; + // comparing the result of .slice() removes schema from the comparison assert.deepStrictEqual( - __table(sourceWithMissing, operationsAsc), + __table(sourceWithMissing, operationsAsc).slice(), [{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null}] ); }); @@ -561,8 +566,9 @@ describe("__table", () => { __table(source, operations), [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}] ); + // comparing the result of .slice() removes schema from the comparison assert.deepStrictEqual( - source, + source.slice(), [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}] ); }); @@ -571,9 +577,10 @@ describe("__table", () => { const operationsToNull = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: null}}; assert.deepStrictEqual(__table(source, operationsToNull), [{a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]); const operationsFromNull = {...EMPTY_TABLE_DATA.operations, slice: {from: null, to: 1}}; - assert.deepStrictEqual(__table(source, operationsFromNull), [{a: 1, b: 2, c: 3}]); + // comparing the result of .slice() removes schema from the comparison + assert.deepStrictEqual(__table(source, operationsFromNull).slice(), [{a: 1, b: 2, c: 3}]); const operations = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: 2}}; - assert.deepStrictEqual(__table(source, operations), [{a: 2, b: 4, c: 6}]); + assert.deepStrictEqual(__table(source, operations).slice(), [{a: 2, b: 4, c: 6}]); }); it("__table retains schema and columns info", () => { @@ -585,6 +592,13 @@ describe("__table", () => { [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] ); }); + + it("__table infers schema", () => { + assert.deepStrictEqual( + __table(source, EMPTY_TABLE_DATA.operations).schema, + [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] + ); + }); }); describe("getTypeValidator filters accurately", () => { From e151efd096df9585a1380c6d0d6333e322743342 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Fri, 20 Jan 2023 11:49:42 -0800 Subject: [PATCH 05/90] Handle sources that are arrays of primitives --- src/table.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 1dedda1f..cc5731f9 100644 --- a/src/table.js +++ b/src/table.js @@ -693,7 +693,10 @@ function initKey() { function inferSchema(source) { const schema = []; const sampleSize = 100; - const sample = source.slice(0, sampleSize); + let sample = source.slice(0, sampleSize); + if (arrayIsPrimitive(sample)) { + sample = sample.map(d => {return {value: d};}); + } const typeCounts = {}; sample.map((d) => { for (const key in d) { From ec38311aa6cb8e82e364ced7dc8967372c883c4f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Sat, 21 Jan 2023 08:33:35 -0800 Subject: [PATCH 06/90] Formatting --- src/table.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index cc5731f9..141ecc4e 100644 --- a/src/table.js +++ b/src/table.js @@ -695,7 +695,9 @@ function inferSchema(source) { const sampleSize = 100; let sample = source.slice(0, sampleSize); if (arrayIsPrimitive(sample)) { - sample = sample.map(d => {return {value: d};}); + sample = sample.map((d) => { + return {value: d}; + }); } const typeCounts = {}; sample.map((d) => { @@ -713,8 +715,7 @@ function inferSchema(source) { else if (value instanceof ArrayBuffer) typeCounts[key]["buffer"]++; else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object } else { - if (value === "true" || value === "false") - typeCounts[key]["boolean"]++; + if (value === "true" || value === "false") typeCounts[key]["boolean"]++; else if (!isNaN(+value) && /^-?[0-9]+$/.test(value)) typeCounts[key]["integer"]++; else if (!isNaN(+value)) typeCounts[key]["number"]++; From 6e9d64e12cd0aa3988c165b36f4075c03439ef1f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 23 Jan 2023 13:13:16 -0800 Subject: [PATCH 07/90] Quick updates based on feedback --- src/table.js | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/table.js b/src/table.js index 141ecc4e..63a3d01a 100644 --- a/src/table.js +++ b/src/table.js @@ -700,41 +700,40 @@ function inferSchema(source) { }); } const typeCounts = {}; - sample.map((d) => { + for (const d of sample) { for (const key in d) { if (!typeCounts[key]) typeCounts[key] = initKey(); // for json and sqlite, we already have some types, but for csv and tsv, all // columns are strings here. const type = typeof d[key]; - const value = type === "string" ? d[key]?.trim() : d[key]; + const value = type === "string" ? d[key].trim() : d[key]; if (value === null || value === undefined || value.length === 0) - typeCounts[key]["other"]++; + typeCounts[key].other++; else if (type !== "string") { - if (Array.isArray(value)) typeCounts[key]["array"]++; - else if (value instanceof Date) typeCounts[key]["date"]++; - else if (value instanceof ArrayBuffer) typeCounts[key]["buffer"]++; + if (Array.isArray(value)) typeCounts[key].array++; + else if (value instanceof Date) typeCounts[key].date++; + else if (value instanceof ArrayBuffer) typeCounts[key].buffer++; else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object } else { - if (value === "true" || value === "false") typeCounts[key]["boolean"]++; - else if (!isNaN(+value) && /^-?[0-9]+$/.test(value)) - typeCounts[key]["integer"]++; - else if (!isNaN(+value)) typeCounts[key]["number"]++; - else if ( + if (value === "true" || value === "false") typeCounts[key].boolean++; + else if (!isNaN(value)) { + if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++; + else typeCounts[key].number++; + } else if ( value.match( /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ ) ) - typeCounts[key]["date"]++; + typeCounts[key].date++; else if (value.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})/)) - typeCounts[key]["date"]++; + typeCounts[key].date++; else if (value.match(/(\d{4})-(\d{1,2})-(\d{1,2})/)) - typeCounts[key]["date"]++; - else typeCounts[key]["string"]++; + typeCounts[key].date++; + else typeCounts[key].string++; } } - }); - const columns = Object.keys(typeCounts); - for (const col of columns) { + } + for (const col in typeCounts) { // sort descending so most commonly encoutered type is first const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) { return typeCounts[col][b] - typeCounts[col][a]; From 799398f3567b8636acd185d3d0232ffac2631cb0 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 23 Jan 2023 15:03:30 -0800 Subject: [PATCH 08/90] With Mike F's coercion --- src/table.js | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 63a3d01a..cf7bfe50 100644 --- a/src/table.js +++ b/src/table.js @@ -538,16 +538,60 @@ export function getTypeValidator(colType) { } } +// Function to get the correct validity checking function based on type +export function coerceToType(value, colType) { + let m; + switch (colType) { + case "string": + return `${value}`; + case "bigint": + return isNaN(Number(value)) ? null : BigInt(value); + case "boolean": + return (value === true || value === "true") ? true : + value === false || value === "false" ? false : null; + case "number": + return isNaN(Number(value)) ? value : Number(value); + case "date": + if (m = value.match(/^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/)) { + if (fixtz && !!m[4] && !m[7]) value = value.replace(/-/g, "/").replace(/T/, " "); + return new Date(value); + } + return null; + default: + return value; + // case "buffer": + // return isValidBuffer; + // case "array": + // return isValidArray; + // case "object": + // return isValidObject; + // case "other": + // default: + // return isValidOther; + } + } + + // This function applies table cell operations to an in-memory table (array of // objects); it should be equivalent to the corresponding SQL query. TODO Use // DuckDBClient for data arrays, too, and then we wouldn’t need our own __table // function to do table operations on in-memory data? export function __table(source, operations) { let {schema, columns} = source; - if (!schema || !isValidSchema(schema)) source.schema = inferSchema(source); + let newlyInferred = false; + if (!schema || !isValidSchema(schema)) { + source.schema = inferSchema(source); + newlyInferred = true; + } const input = source; let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); + // Combine column types from schema with user selected types in operations + const types = new Map(source.schema.map(({name, type}) => [name, type])); + if (operations.type || newlyInferred) { + operations.type?.forEach(({column, type}) => types.set(column, type)); + source = source.map(d => coerceRow(d, types)); + } for (const {type, operands} of operations.filter) { const [{value: column}] = operands; const values = operands.slice(1).map(({value}) => value); @@ -675,6 +719,18 @@ function isValidSchema(schema) { }); } +export default function coerceRow(object, types) { + for (var key in object) { + const type = types.get(key); + const value = object[key]; + object[key] = coerceToType(value, type); + } + return object; +} + +// https://github.com/d3/d3-dsv/issues/45 +const fixtz = new Date("2019-01-01T00:00").getHours() || new Date("2019-07-01T00:00").getHours(); + function initKey() { return { other: 0, @@ -690,7 +746,7 @@ function initKey() { }; } -function inferSchema(source) { +export function inferSchema(source) { const schema = []; const sampleSize = 100; let sample = source.slice(0, sampleSize); From 5f30887fd1c6863abd87db2fe69227395e51db17 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 11:21:34 -0800 Subject: [PATCH 09/90] Remove new validity check fn and use existing --- src/table.js | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/table.js b/src/table.js index cf7bfe50..5a2b281b 100644 --- a/src/table.js +++ b/src/table.js @@ -66,7 +66,7 @@ function objectHasEnumerableKeys(value) { } function isQueryResultSetSchema(schemas) { - return (Array.isArray(schemas) && schemas.every((s) => s && typeof s.name === "string")); + return (Array.isArray(schemas) && schemas.every((s) => s && typeof s.name === "string" && typeof s.type === "string")); } function isQueryResultSetColumns(columns) { @@ -579,8 +579,8 @@ export function coerceToType(value, colType) { export function __table(source, operations) { let {schema, columns} = source; let newlyInferred = false; - if (!schema || !isValidSchema(schema)) { source.schema = inferSchema(source); + if (!schema || !isQueryResultSetSchema(schema)) { newlyInferred = true; } const input = source; @@ -712,13 +712,6 @@ export function __table(source, operations) { return source; } -function isValidSchema(schema) { - if (!schema || !Array.isArray(schema)) return; - return schema.every((s) => { - s && typeof s.name === "string" && typeof s.type === "string"; - }); -} - export default function coerceRow(object, types) { for (var key in object) { const type = types.get(key); From eb7008ab73481a2b8063a0be3d29169cc78a68f7 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 11:22:03 -0800 Subject: [PATCH 10/90] Don't mutate source --- src/table.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 5a2b281b..d1028155 100644 --- a/src/table.js +++ b/src/table.js @@ -577,17 +577,17 @@ export function coerceToType(value, colType) { // DuckDBClient for data arrays, too, and then we wouldn’t need our own __table // function to do table operations on in-memory data? export function __table(source, operations) { + const input = source; let {schema, columns} = source; let newlyInferred = false; - source.schema = inferSchema(source); if (!schema || !isQueryResultSetSchema(schema)) { + schema = inferSchema(source); newlyInferred = true; } - const input = source; let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); // Combine column types from schema with user selected types in operations - const types = new Map(source.schema.map(({name, type}) => [name, type])); + const types = new Map(schema.map(({name, type}) => [name, type])); if (operations.type || newlyInferred) { operations.type?.forEach(({column, type}) => types.set(column, type)); source = source.map(d => coerceRow(d, types)); From 37868373bf1daf7d9d03012b34e3ac8e7b7bfaf1 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 11:22:33 -0800 Subject: [PATCH 11/90] Don't mutate row --- src/table.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index d1028155..2c1f1f1a 100644 --- a/src/table.js +++ b/src/table.js @@ -713,12 +713,13 @@ export function __table(source, operations) { } export default function coerceRow(object, types) { + let coerced = {}; for (var key in object) { const type = types.get(key); const value = object[key]; - object[key] = coerceToType(value, type); + coerced[key] = coerceToType(value, type); } - return object; + return coerced; } // https://github.com/d3/d3-dsv/issues/45 From 81018d47452186a612b14665d4dcf76a50241626 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 12:01:46 -0800 Subject: [PATCH 12/90] Add exported fn to index.js --- src/index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/index.js b/src/index.js index 3afaf70c..9ae35311 100644 --- a/src/index.js +++ b/src/index.js @@ -9,5 +9,6 @@ export { isDataArray, isDatabaseClient, __table as applyDataTableOperations, - getTypeValidator + getTypeValidator, + inferSchema } from "./table.js"; From 3ec692b404c6531085e2b351391f58442c9fe12d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 13:59:58 -0800 Subject: [PATCH 13/90] Apply user-selected types and update schema --- src/table.js | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 2c1f1f1a..ab717146 100644 --- a/src/table.js +++ b/src/table.js @@ -586,10 +586,19 @@ export function __table(source, operations) { } let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); - // Combine column types from schema with user selected types in operations + // Combine column types from schema with user-selected types in operations const types = new Map(schema.map(({name, type}) => [name, type])); - if (operations.type || newlyInferred) { - operations.type?.forEach(({column, type}) => types.set(column, type)); + if (operations.type) { + for (const {name, type} of operations.type) { + types.set(name, type); + source = source.map(d => coerceRow(d, types)); + // update schema with user-selected type + const colIndex = schema.findIndex((col) => col.name === name); + if (colIndex > -1) schema[colIndex] = {name, type}; + } + } + // Coerce data according to new schema, unless we already did + if (newlyInferred && !operations.type) { source = source.map(d => coerceRow(d, types)); } for (const {type, operands} of operations.filter) { From 2aaaad094885a3a5e5b464e6574c39467df39ec1 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 14:28:21 -0800 Subject: [PATCH 14/90] Combine into one regex --- src/table.js | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/table.js b/src/table.js index ab717146..a7e6a931 100644 --- a/src/table.js +++ b/src/table.js @@ -780,14 +780,10 @@ export function inferSchema(source) { else typeCounts[key].number++; } else if ( value.match( - /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ + /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ ) ) typeCounts[key].date++; - else if (value.match(/(\d{1,2})\/(\d{1,2})\/(\d{2,4}) (\d{2}):(\d{2})/)) - typeCounts[key].date++; - else if (value.match(/(\d{4})-(\d{1,2})-(\d{1,2})/)) - typeCounts[key].date++; else typeCounts[key].string++; } } From 4bd58bf30f09d59aa786b6e730674cca37007d95 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 14:52:05 -0800 Subject: [PATCH 15/90] Update handling of "other" and use d3.greatest --- src/table.js | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/table.js b/src/table.js index a7e6a931..2a9db1af 100644 --- a/src/table.js +++ b/src/table.js @@ -1,4 +1,4 @@ -import {reverse} from "d3-array"; +import {greatest, reverse} from "d3-array"; import {FileAttachment} from "./fileAttachment.js"; import {isArqueroTable} from "./arquero.js"; import {isArrowTable, loadArrow} from "./arrow.js"; @@ -766,13 +766,13 @@ export function inferSchema(source) { // columns are strings here. const type = typeof d[key]; const value = type === "string" ? d[key].trim() : d[key]; - if (value === null || value === undefined || value.length === 0) - typeCounts[key].other++; - else if (type !== "string") { + if (type !== "string") { if (Array.isArray(value)) typeCounts[key].array++; else if (value instanceof Date) typeCounts[key].date++; else if (value instanceof ArrayBuffer) typeCounts[key].buffer++; - else if (type in typeCounts[key]) typeCounts[key][type]++; // number, bigint, boolean, or object + // number, bigint, boolean, or object + else if (type in typeCounts[key]) typeCounts[key][type]++; + else if (value !== null && value !== undefined) typeCounts[key].other++; } else { if (value === "true" || value === "false") typeCounts[key].boolean++; else if (!isNaN(value)) { @@ -789,22 +789,10 @@ export function inferSchema(source) { } } for (const col in typeCounts) { - // sort descending so most commonly encoutered type is first - const typesSorted = Object.keys(typeCounts[col]).sort(function (a, b) { - return typeCounts[col][b] - typeCounts[col][a]; - }); - let type = typesSorted[0]; - if (type === "other") { - // take the next-most-encountered type if most are "other", but only if - // its tally is greater than the next one in the list - if (typeCounts[typesSorted[1]] > typeCounts[typesSorted[2]]) - type = typesSorted[1]; - // else we could iterate over the sample and use the first encountered type - } schema.push({ name: col, - type: type + type: greatest(Object.keys(typeCounts[col]), (d) => typeCounts[col][d]) }); } return schema; -} \ No newline at end of file +} From bf9771390923dbbde741518c1fff44f22fb26c68 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 15:48:51 -0800 Subject: [PATCH 16/90] Fix tests --- test/table-test.js | 378 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 304 insertions(+), 74 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index e04d7a42..bc26f474 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -443,21 +443,51 @@ describe("__table", () => { let source; beforeEach(() => { - source = [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]; + source = [ + {a: 1, b: 2, c: 3}, + {a: 2, b: 4, c: 6}, + {a: 3, b: 6, c: 9} + ]; + source.schema = [ + {name: "a", type: "number"}, + {name: "b", type: "number"}, + {name: "c", type: "number"} + ]; }); it("__table no operations", () => { - assert.deepStrictEqual(__table(source, EMPTY_TABLE_DATA.operations), source); + assert.deepStrictEqual( + __table(source, EMPTY_TABLE_DATA.operations), + source + ); }); it("__table columns", () => { - const operationsNullColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: null}}; + const operationsNullColumns = { + ...EMPTY_TABLE_DATA.operations, + select: {columns: null} + }; assert.deepStrictEqual(__table(source, operationsNullColumns), source); - const operationsEmptyColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: []}}; - // comparing the result of .slice() removes schema from the comparison - assert.deepStrictEqual(__table(source, operationsEmptyColumns).slice(), [{}, {}, {}]); - const operationsSelectedColumns = {...EMPTY_TABLE_DATA.operations, select: {columns: ["a"]}}; - assert.deepStrictEqual(__table(source, operationsSelectedColumns).slice(), [{a: 1}, {a: 2}, {a: 3}]); + const operationsEmptyColumns = { + ...EMPTY_TABLE_DATA.operations, + select: {columns: []} + }; + const expectedEmpty = [{}, {}, {}]; + expectedEmpty.schema = []; + assert.deepStrictEqual( + __table(source, operationsEmptyColumns), + expectedEmpty + ); + const operationsSelectedColumns = { + ...EMPTY_TABLE_DATA.operations, + select: {columns: ["a"]} + }; + const expectedSelected = [{a: 1}, {a: 2}, {a: 3}]; + expectedSelected.schema = [{name: "a", type: "number"}]; + assert.deepStrictEqual( + __table(source, operationsSelectedColumns), + expectedSelected + ); }); it("__table unknown filter", () => { @@ -465,138 +495,338 @@ describe("__table", () => { ...EMPTY_TABLE_DATA.operations, filter: [{type: "xyz", operands: [{type: "column", value: "a"}]}] }; - assert.throws(() => __table(source, operations), /unknown filter type: xyz/); + assert.throws( + () => __table(source, operations), + /unknown filter type: xyz/ + ); }); it("__table filter lt + gt", () => { const operationsEquals = { ...EMPTY_TABLE_DATA.operations, - filter: [{type: "eq", operands: [{type: "column", value: "a"}, {type: "resolved", value: 1}]}] + filter: [ + { + type: "eq", + operands: [ + {type: "column", value: "a"}, + {type: "resolved", value: 1} + ] + } + ] }; - assert.deepStrictEqual(__table(source, operationsEquals), [{a: 1, b: 2, c: 3}]); + const expectedEq = [{a: 1, b: 2, c: 3}]; + expectedEq.schema = source.schema; + assert.deepStrictEqual(__table(source, operationsEquals), expectedEq); const operationsComparison = { ...EMPTY_TABLE_DATA.operations, filter: [ - {type: "lt", operands: [{type: "column", value: "a"}, {type: "resolved", value: 3}]}, - {type: "gt", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2}]} + { + type: "lt", + operands: [ + {type: "column", value: "a"}, + {type: "resolved", value: 3} + ] + }, + { + type: "gt", + operands: [ + {type: "column", value: "b"}, + {type: "resolved", value: 2} + ] + } ] }; - // comparing the result of .slice() removes schema from the comparison - assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]); + const expectedLtGt = [{a: 2, b: 4, c: 6}]; + expectedLtGt.schema = source.schema; + assert.deepStrictEqual(__table(source, operationsComparison), expectedLtGt); }); it("__table filter lte + gte", () => { const operationsEquals = { ...EMPTY_TABLE_DATA.operations, - filter: [{type: "eq", operands: [{type: "column", value: "a"}, {type: "resolved", value: 1}]}] + filter: [ + { + type: "eq", + operands: [ + {type: "column", value: "a"}, + {type: "resolved", value: 1} + ] + } + ] }; - assert.deepStrictEqual(__table(source, operationsEquals), [{a: 1, b: 2, c: 3}]); + const expectedEq = [{a: 1, b: 2, c: 3}]; + expectedEq.schema = source.schema; + assert.deepStrictEqual(__table(source, operationsEquals), expectedEq); const operationsComparison = { ...EMPTY_TABLE_DATA.operations, filter: [ - {type: "lte", operands: [{type: "column", value: "a"}, {type: "resolved", value: 2.5}]}, - {type: "gte", operands: [{type: "column", value: "b"}, {type: "resolved", value: 2.5}]} + { + type: "lte", + operands: [ + {type: "column", value: "a"}, + {type: "resolved", value: 2.5} + ] + }, + { + type: "gte", + operands: [ + {type: "column", value: "b"}, + {type: "resolved", value: 2.5} + ] + } ] }; - // comparing the result of .slice() removes schema from the comparison - assert.deepStrictEqual(__table(source, operationsComparison).slice(), [{a: 2, b: 4, c: 6}]); + const expectedLteGte = [{a: 2, b: 4, c: 6}]; + expectedLteGte.schema = source.schema; + assert.deepStrictEqual( + __table(source, operationsComparison), + expectedLteGte + ); }); it("__table filter primitive lte + gte", () => { - assert.deepStrictEqual(__table([1, 2, 3], { - ...EMPTY_TABLE_DATA.operations, - filter: [{type: "eq", operands: [{type: "column", value: "value"}, {type: "resolved", value: 1}]}] - }), [1]); - assert.deepStrictEqual(__table(Uint32Array.of(1, 2, 3), { - ...EMPTY_TABLE_DATA.operations, - filter: [{type: "eq", operands: [{type: "column", value: "value"}, {type: "resolved", value: 1}]}] - }), [1]); + const expectedPrimitive = [1]; + expectedPrimitive.schema = [{name: "value", type: "number"}]; + assert.deepStrictEqual( + __table([1, 2, 3], { + ...EMPTY_TABLE_DATA.operations, + filter: [ + { + type: "eq", + operands: [ + {type: "column", value: "value"}, + {type: "resolved", value: 1} + ] + } + ] + }), + expectedPrimitive + ); + const expectedUint32Array = [1]; + expectedUint32Array.schema = []; + assert.deepStrictEqual( + __table(Uint32Array.of(1, 2, 3), { + ...EMPTY_TABLE_DATA.operations, + filter: [ + { + type: "eq", + operands: [ + {type: "column", value: "value"}, + {type: "resolved", value: 1} + ] + } + ] + }), + expectedUint32Array + ); }); it("__table filter eq date", () => { const operationsEquals = { ...EMPTY_TABLE_DATA.operations, - filter: [{type: "eq", operands: [{type: "column", value: "a"}, {type: "resolved", value: new Date("2021-01-02")}]}] + filter: [ + { + type: "eq", + operands: [ + {type: "column", value: "a"}, + {type: "resolved", value: new Date("2021-01-02")} + ] + } + ] }; - const source = [{a: new Date("2021-01-01")}, {a: new Date("2021-01-02")}, {a: new Date("2021-01-03")}]; - assert.deepStrictEqual(__table(source, operationsEquals), [{a: new Date("2021-01-02")}]); + const source = [ + {a: new Date("2021-01-01")}, + {a: new Date("2021-01-02")}, + {a: new Date("2021-01-03")} + ]; + const expected = [{a: new Date("2021-01-02")}]; + expected.schema = [{name: "a", type: "date"}]; + assert.deepStrictEqual(__table(source, operationsEquals), expected); }); it("__table sort", () => { - const operationsDesc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "desc"}]}; - assert.deepStrictEqual( - __table(source, operationsDesc), - [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}] - ); - const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]}; - // comparing the result of .slice() removes schema from the comparison - assert.deepStrictEqual( - __table(source, operationsAsc).slice(), - [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}] - ); + const operationsDesc = { + ...EMPTY_TABLE_DATA.operations, + sort: [{column: "a", direction: "desc"}] + }; + const expectedDesc = [ + {a: 3, b: 6, c: 9}, + {a: 2, b: 4, c: 6}, + {a: 1, b: 2, c: 3} + ]; + expectedDesc.schema = source.schema; + assert.deepStrictEqual(__table(source, operationsDesc), expectedDesc); + const operationsAsc = { + ...EMPTY_TABLE_DATA.operations, + sort: [{column: "a", direction: "asc"}] + }; + const expectedAsc = [ + {a: 1, b: 2, c: 3}, + {a: 2, b: 4, c: 6}, + {a: 3, b: 6, c: 9} + ]; + expectedAsc.schema = source.schema; + assert.deepStrictEqual(__table(source, operationsAsc), expectedAsc); const sourceExtended = [...source, {a: 1, b: 3, c: 3}, {a: 1, b: 5, c: 3}]; const operationsMulti = { ...EMPTY_TABLE_DATA.operations, - sort: [{column: "a", direction: "desc"}, {column: "b", direction: "desc"}] + sort: [ + {column: "a", direction: "desc"}, + {column: "b", direction: "desc"} + ] }; + const expectedExtended = [ + {a: 3, b: 6, c: 9}, + {a: 2, b: 4, c: 6}, + {a: 1, b: 5, c: 3}, + {a: 1, b: 3, c: 3}, + {a: 1, b: 2, c: 3} + ]; + expectedExtended.schema = source.schema; assert.deepStrictEqual( __table(sourceExtended, operationsMulti), - [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 5, c: 3}, {a: 1, b: 3, c: 3}, {a: 1, b: 2, c: 3}] + expectedExtended ); }); it("__table sort missing values", () => { - const sourceWithMissing = [{a: 1}, {a: null}, {a: undefined}, {a: 10}, {a: 5}, {a: NaN}, {a: null}, {a: 20}]; - const operationsDesc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "desc"}]}; + const sourceWithMissing = [ + {a: 1}, + {a: null}, + {a: undefined}, + {a: 10}, + {a: 5}, + {a: NaN}, + {a: null}, + {a: 20} + ]; + const operationsDesc = { + ...EMPTY_TABLE_DATA.operations, + sort: [{column: "a", direction: "desc"}] + }; + const expectedDesc = [ + {a: 20}, + {a: 10}, + {a: 5}, + {a: 1}, + {a: 0}, + {a: 0}, + {a: undefined}, + {a: NaN} + ]; + expectedDesc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsDesc), - [{a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null}] + expectedDesc ); - const operationsAsc = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "asc"}]}; - // comparing the result of .slice() removes schema from the comparison + const operationsAsc = { + ...EMPTY_TABLE_DATA.operations, + sort: [{column: "a", direction: "asc"}] + }; + const expectedAsc = [ + {a: 0}, + {a: 0}, + {a: 1}, + {a: 5}, + {a: 10}, + {a: 20}, + {a: undefined}, + {a: NaN} + ]; + expectedAsc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( - __table(sourceWithMissing, operationsAsc).slice(), - [{a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null}] + __table(sourceWithMissing, operationsAsc), + expectedAsc ); }); it("__table sort does not mutate input", () => { - const operations = {...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "desc"}]}; - assert.deepStrictEqual( - __table(source, operations), - [{a: 3, b: 6, c: 9}, {a: 2, b: 4, c: 6}, {a: 1, b: 2, c: 3}] - ); - // comparing the result of .slice() removes schema from the comparison - assert.deepStrictEqual( - source.slice(), - [{a: 1, b: 2, c: 3}, {a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}] - ); + const operations = { + ...EMPTY_TABLE_DATA.operations, + sort: [{column: "a", direction: "desc"}] + }; + const sorted = [ + {a: 3, b: 6, c: 9}, + {a: 2, b: 4, c: 6}, + {a: 1, b: 2, c: 3} + ]; + sorted.schema = source.schema; + assert.deepStrictEqual(__table(source, operations), sorted); + const originalOrder = [ + {a: 1, b: 2, c: 3}, + {a: 2, b: 4, c: 6}, + {a: 3, b: 6, c: 9} + ]; + originalOrder.schema = source.schema; + assert.deepStrictEqual(source, originalOrder); }); it("__table slice", () => { - const operationsToNull = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: null}}; - assert.deepStrictEqual(__table(source, operationsToNull), [{a: 2, b: 4, c: 6}, {a: 3, b: 6, c: 9}]); - const operationsFromNull = {...EMPTY_TABLE_DATA.operations, slice: {from: null, to: 1}}; - // comparing the result of .slice() removes schema from the comparison - assert.deepStrictEqual(__table(source, operationsFromNull).slice(), [{a: 1, b: 2, c: 3}]); - const operations = {...EMPTY_TABLE_DATA.operations, slice: {from: 1, to: 2}}; - assert.deepStrictEqual(__table(source, operations).slice(), [{a: 2, b: 4, c: 6}]); + const operationsToNull = { + ...EMPTY_TABLE_DATA.operations, + slice: {from: 1, to: null} + }; + const expectedToNull = [ + {a: 2, b: 4, c: 6}, + {a: 3, b: 6, c: 9} + ]; + expectedToNull.schema = source.schema; + assert.deepStrictEqual(__table(source, operationsToNull), expectedToNull); + const operationsFromNull = { + ...EMPTY_TABLE_DATA.operations, + slice: {from: null, to: 1} + }; + const expectedFromNull = [{a: 1, b: 2, c: 3}]; + expectedFromNull.schema = source.schema; + assert.deepStrictEqual( + __table(source, operationsFromNull), + expectedFromNull + ); + const operations = { + ...EMPTY_TABLE_DATA.operations, + slice: {from: 1, to: 2} + }; + const expectedSlice = [{a: 2, b: 4, c: 6}]; + expectedSlice.schema = source.schema; + assert.deepStrictEqual(__table(source, operations), expectedSlice); }); it("__table retains schema and columns info", () => { source.columns = ["a", "b", "c"]; - assert.deepStrictEqual(__table(source, EMPTY_TABLE_DATA.operations).columns, ["a", "b", "c"]); - source.schema = [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}]; + assert.deepStrictEqual( + __table(source, EMPTY_TABLE_DATA.operations).columns, + ["a", "b", "c"] + ); + source.schema = [ + {name: "a", type: "number"}, + {name: "b", type: "number"}, + {name: "c", type: "number"} + ]; assert.deepStrictEqual( __table(source, EMPTY_TABLE_DATA.operations).schema, - [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] + [ + {name: "a", type: "number"}, + {name: "b", type: "number"}, + {name: "c", type: "number"} + ] ); }); it("__table infers schema", () => { assert.deepStrictEqual( - __table(source, EMPTY_TABLE_DATA.operations).schema, - [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] + __table( + [ + {a: 1, b: 2, c: 3}, + {a: 2, b: 4, c: 6}, + {a: 3, b: 6, c: 9} + ], + EMPTY_TABLE_DATA.operations + ).schema, + [ + {name: "a", type: "number"}, + {name: "b", type: "number"}, + {name: "c", type: "number"} + ] ); }); }); From afe52416e9b505e91619b6d1a593aea29ad2e68e Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 24 Jan 2023 15:57:15 -0800 Subject: [PATCH 17/90] Small fixes --- src/table.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 2a9db1af..2e30a814 100644 --- a/src/table.js +++ b/src/table.js @@ -545,6 +545,7 @@ export function coerceToType(value, colType) { case "string": return `${value}`; case "bigint": + // eslint-disable-next-line no-undef return isNaN(Number(value)) ? null : BigInt(value); case "boolean": return (value === true || value === "true") ? true : @@ -552,6 +553,7 @@ export function coerceToType(value, colType) { case "number": return isNaN(Number(value)) ? value : Number(value); case "date": + if (value instanceof Date) return value; if (m = value.match(/^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/)) { if (fixtz && !!m[4] && !m[7]) value = value.replace(/-/g, "/").replace(/T/, " "); return new Date(value); @@ -589,7 +591,7 @@ export function __table(source, operations) { // Combine column types from schema with user-selected types in operations const types = new Map(schema.map(({name, type}) => [name, type])); if (operations.type) { - for (const {name, type} of operations.type) { + for (const {name, type} of operations.type) { types.set(name, type); source = source.map(d => coerceRow(d, types)); // update schema with user-selected type From f5c648bbdcae65ba150f9e5a0267c0bf5382cd4c Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 09:32:04 -0800 Subject: [PATCH 18/90] More coercion --- src/table.js | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/table.js b/src/table.js index 2e30a814..47dd3c29 100644 --- a/src/table.js +++ b/src/table.js @@ -538,41 +538,46 @@ export function getTypeValidator(colType) { } } -// Function to get the correct validity checking function based on type -export function coerceToType(value, colType) { - let m; - switch (colType) { +export function coerceToType(value, type) { + switch (type) { case "string": - return `${value}`; + return value === "string" ? value.trim() : `${value}`; + case "boolean": + return value === true || value === "true" + ? true + : value === false || value === "false" + ? false + : null; + case "integer": + return isNaN(parseInt(value)) ? null : parseInt(value); case "bigint": // eslint-disable-next-line no-undef return isNaN(Number(value)) ? null : BigInt(value); - case "boolean": - return (value === true || value === "true") ? true : - value === false || value === "false" ? false : null; - case "number": - return isNaN(Number(value)) ? value : Number(value); - case "date": - if (value instanceof Date) return value; - if (m = value.match(/^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/)) { - if (fixtz && !!m[4] && !m[7]) value = value.replace(/-/g, "/").replace(/T/, " "); - return new Date(value); - } - return null; - default: - return value; - // case "buffer": - // return isValidBuffer; - // case "array": - // return isValidArray; - // case "object": - // return isValidObject; - // case "other": - // default: - // return isValidOther; + case "number": + return isNaN(Number(value)) ? null : Number(value); + case "date": { + if (value instanceof Date) return value; + let match; + if ( + (match = value.match( + /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ + )) + ) { + if (fixTz && !!match[4] && !match[7]) + value = value.replace(/-/g, "/").replace(/T/, " "); + const date = new Date(value); + return date instanceof Date ? date : null; + } + return null; } + case "array": + case "buffer": + case "object": + case "other": + default: + return value ? value : null; } - +} // This function applies table cell operations to an in-memory table (array of // objects); it should be equivalent to the corresponding SQL query. TODO Use @@ -734,7 +739,7 @@ export default function coerceRow(object, types) { } // https://github.com/d3/d3-dsv/issues/45 -const fixtz = new Date("2019-01-01T00:00").getHours() || new Date("2019-07-01T00:00").getHours(); +const fixTz = new Date("2019-01-01T00:00").getHours() || new Date("2019-07-01T00:00").getHours(); function initKey() { return { @@ -786,6 +791,8 @@ export function inferSchema(source) { ) ) typeCounts[key].date++; + // the long regex accepts dates in the form of ISOString and + // LocaleDateString, with or without times else typeCounts[key].string++; } } From 740d8606f0d9fdee2eb5b3a72221eb264c7132a6 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 09:55:37 -0800 Subject: [PATCH 19/90] Fix test --- test/table-test.js | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index bc26f474..4b3f7c44 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -691,28 +691,14 @@ describe("__table", () => { it("__table sort missing values", () => { const sourceWithMissing = [ - {a: 1}, - {a: null}, - {a: undefined}, - {a: 10}, - {a: 5}, - {a: NaN}, - {a: null}, - {a: 20} + {a: 1}, {a: null}, {a: undefined}, {a: 10}, {a: 5}, {a: NaN}, {a: null}, {a: 20} ]; const operationsDesc = { ...EMPTY_TABLE_DATA.operations, sort: [{column: "a", direction: "desc"}] }; const expectedDesc = [ - {a: 20}, - {a: 10}, - {a: 5}, - {a: 1}, - {a: 0}, - {a: 0}, - {a: undefined}, - {a: NaN} + {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: 0}, {a: 0}, {a: null}, {a: null} ]; expectedDesc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( @@ -724,14 +710,7 @@ describe("__table", () => { sort: [{column: "a", direction: "asc"}] }; const expectedAsc = [ - {a: 0}, - {a: 0}, - {a: 1}, - {a: 5}, - {a: 10}, - {a: 20}, - {a: undefined}, - {a: NaN} + {a: 0}, {a: 0}, {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: null} ]; expectedAsc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( From 9ddd3520c4cdb6ab95cf0b261e2a68ab0565f74c Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 10:43:37 -0800 Subject: [PATCH 20/90] Try supporting number coercion into dates --- src/table.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/table.js b/src/table.js index 47dd3c29..0fe25e20 100644 --- a/src/table.js +++ b/src/table.js @@ -557,6 +557,7 @@ export function coerceToType(value, type) { return isNaN(Number(value)) ? null : Number(value); case "date": { if (value instanceof Date) return value; + if (typeof value === "number") return new Date(value); let match; if ( (match = value.match( From bfd138c359a963afa594daddbb42b5e59cb49e73 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 12:01:37 -0800 Subject: [PATCH 21/90] Try with value.toString --- src/table.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 0fe25e20..9367221a 100644 --- a/src/table.js +++ b/src/table.js @@ -541,7 +541,7 @@ export function getTypeValidator(colType) { export function coerceToType(value, type) { switch (type) { case "string": - return value === "string" ? value.trim() : `${value}`; + return value === "string" ? value.trim() : value.toString(); case "boolean": return value === true || value === "true" ? true From 1e820ec869060fdec199e1de140797bf79e465f8 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 15:13:34 -0800 Subject: [PATCH 22/90] Fixes and allowing for soft coercion --- src/table.js | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/table.js b/src/table.js index 9367221a..d131b661 100644 --- a/src/table.js +++ b/src/table.js @@ -538,45 +538,43 @@ export function getTypeValidator(colType) { } } -export function coerceToType(value, type) { +export function coerceToType(value, type, options={}) { + const defaultValue = options.soft ? value : null; switch (type) { case "string": - return value === "string" ? value.trim() : value.toString(); + return value === "string" ? value.trim() : value ? value.toString() : defaultValue; case "boolean": return value === true || value === "true" ? true : value === false || value === "false" ? false - : null; + : defaultValue; case "integer": - return isNaN(parseInt(value)) ? null : parseInt(value); + return isNaN(parseInt(value)) ? defaultValue : parseInt(value); case "bigint": // eslint-disable-next-line no-undef - return isNaN(Number(value)) ? null : BigInt(value); + return isNaN(Number(value)) ? defaultValue : BigInt(value); case "number": - return isNaN(Number(value)) ? null : Number(value); + return isNaN(Number(value)) ? defaultValue : Number(value); case "date": { if (value instanceof Date) return value; - if (typeof value === "number") return new Date(value); - let match; - if ( - (match = value.match( - /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ - )) - ) { - if (fixTz && !!match[4] && !match[7]) - value = value.replace(/-/g, "/").replace(/T/, " "); - const date = new Date(value); - return date instanceof Date ? date : null; + if (typeof value === "string") { + let match; + if (match = value.match(/^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/)) { + if (fixTz && !!match[4] && !match[7]) value = value.replace(/-/g, "/").replace(/T/, " "); + } } - return null; + const date = new Date(value); + return date instanceof Date ? date : defaultValue; } case "array": + if (Array.isArray(value)) return value; + return Array.isArray(Array.from(value)) ? Array.from(value) : defaultValue; case "buffer": case "object": case "other": default: - return value ? value : null; + return value || value === 0 ? value : defaultValue; } } @@ -605,9 +603,11 @@ export function __table(source, operations) { if (colIndex > -1) schema[colIndex] = {name, type}; } } - // Coerce data according to new schema, unless we already did + // Coerce data according to new schema, unless that happened due to + // operations.type, above. If coercing for the first time here, perform with + // option {soft: true}, so that original values remain visible. if (newlyInferred && !operations.type) { - source = source.map(d => coerceRow(d, types)); + source = source.map(d => coerceRow(d, types, {soft: true})); } for (const {type, operands} of operations.filter) { const [{value: column}] = operands; @@ -729,12 +729,12 @@ export function __table(source, operations) { return source; } -export default function coerceRow(object, types) { +export default function coerceRow(object, types, options) { let coerced = {}; for (var key in object) { const type = types.get(key); const value = object[key]; - coerced[key] = coerceToType(value, type); + coerced[key] = coerceToType(value, type, options); } return coerced; } From 28a6aaf245530bdd6fde3dc01458454bb286dc0f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 15:16:36 -0800 Subject: [PATCH 23/90] Fix test --- test/table-test.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index 4b3f7c44..5c9d7317 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -698,7 +698,7 @@ describe("__table", () => { sort: [{column: "a", direction: "desc"}] }; const expectedDesc = [ - {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: 0}, {a: 0}, {a: null}, {a: null} + {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: 0}, {a: 0}, {a: undefined}, {a: NaN} ]; expectedDesc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( @@ -710,7 +710,7 @@ describe("__table", () => { sort: [{column: "a", direction: "asc"}] }; const expectedAsc = [ - {a: 0}, {a: 0}, {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: null} + {a: 0}, {a: 0}, {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: undefined}, {a: NaN} ]; expectedAsc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( From 0c2a3ca188d5199caca58e31c23aa97806e5f55f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 25 Jan 2023 15:29:55 -0800 Subject: [PATCH 24/90] Formatting --- src/table.js | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/table.js b/src/table.js index d131b661..b8ec3e8d 100644 --- a/src/table.js +++ b/src/table.js @@ -538,11 +538,15 @@ export function getTypeValidator(colType) { } } -export function coerceToType(value, type, options={}) { +export function coerceToType(value, type, options = {}) { const defaultValue = options.soft ? value : null; switch (type) { case "string": - return value === "string" ? value.trim() : value ? value.toString() : defaultValue; + return value === "string" + ? value.trim() + : value + ? value.toString() + : defaultValue; case "boolean": return value === true || value === "true" ? true @@ -560,8 +564,13 @@ export function coerceToType(value, type, options={}) { if (value instanceof Date) return value; if (typeof value === "string") { let match; - if (match = value.match(/^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/)) { - if (fixTz && !!match[4] && !match[7]) value = value.replace(/-/g, "/").replace(/T/, " "); + if ( + (match = value.match( + /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ + )) + ) { + if (fixTz && !!match[4] && !match[7]) + value = value.replace(/-/g, "/").replace(/T/, " "); } } const date = new Date(value); @@ -569,7 +578,9 @@ export function coerceToType(value, type, options={}) { } case "array": if (Array.isArray(value)) return value; - return Array.isArray(Array.from(value)) ? Array.from(value) : defaultValue; + return Array.isArray(Array.from(value)) + ? Array.from(value) + : defaultValue; case "buffer": case "object": case "other": @@ -601,7 +612,7 @@ export function __table(source, operations) { // update schema with user-selected type const colIndex = schema.findIndex((col) => col.name === name); if (colIndex > -1) schema[colIndex] = {name, type}; - } + } } // Coerce data according to new schema, unless that happened due to // operations.type, above. If coercing for the first time here, perform with @@ -740,7 +751,9 @@ export default function coerceRow(object, types, options) { } // https://github.com/d3/d3-dsv/issues/45 -const fixTz = new Date("2019-01-01T00:00").getHours() || new Date("2019-07-01T00:00").getHours(); +const fixTz = + new Date("2019-01-01T00:00").getHours() || + new Date("2019-07-01T00:00").getHours(); function initKey() { return { From 8884734efec0aa3725a7f5ea84e30511197ed657 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 08:24:58 -0800 Subject: [PATCH 25/90] Remove export --- src/table.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index b8ec3e8d..5e539127 100644 --- a/src/table.js +++ b/src/table.js @@ -740,7 +740,7 @@ export function __table(source, operations) { return source; } -export default function coerceRow(object, types, options) { +function coerceRow(object, types, options) { let coerced = {}; for (var key in object) { const type = types.get(key); From a95a1bb7438117fdccf75dd2b1e99b52cd42f579 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 11:20:30 -0800 Subject: [PATCH 26/90] Update number and date coercion --- src/table.js | 19 +++++++++++++------ test/table-test.js | 22 ++-------------------- 2 files changed, 15 insertions(+), 26 deletions(-) diff --git a/src/table.js b/src/table.js index 5e539127..0c03e7ab 100644 --- a/src/table.js +++ b/src/table.js @@ -538,8 +538,9 @@ export function getTypeValidator(colType) { } } -export function coerceToType(value, type, options = {}) { +function coerceToType(value, type, options = {}) { const defaultValue = options.soft ? value : null; + const numberDefault = defaultValue === null ? NaN : defaultValue; switch (type) { case "string": return value === "string" @@ -554,12 +555,13 @@ export function coerceToType(value, type, options = {}) { ? false : defaultValue; case "integer": - return isNaN(parseInt(value)) ? defaultValue : parseInt(value); + return !value || isNaN(parseInt(value)) ? numberDefault : parseInt(value); case "bigint": // eslint-disable-next-line no-undef - return isNaN(Number(value)) ? defaultValue : BigInt(value); - case "number": - return isNaN(Number(value)) ? defaultValue : Number(value); + return !value || isNaN(value) ? numberDefault : BigInt(value); + case "number": { + return !value || isNaN(value) ? numberDefault : Number(value); + } case "date": { if (value instanceof Date) return value; if (typeof value === "string") { @@ -573,8 +575,13 @@ export function coerceToType(value, type, options = {}) { value = value.replace(/-/g, "/").replace(/T/, " "); } } + // Invalid Date objects are still instances of Date, but they return true + // from isNaN(). If we are "soft-coercing," we want to return the original + // value for invalid dates. Otherwise, if a date is invalid, return an + // Invalid Date object. const date = new Date(value); - return date instanceof Date ? date : defaultValue; + const dateDefault = options.soft ? value : date; + return isNaN(date) ? dateDefault : date; } case "array": if (Array.isArray(value)) return value; diff --git a/test/table-test.js b/test/table-test.js index 5c9d7317..8d73a3f5 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -698,7 +698,7 @@ describe("__table", () => { sort: [{column: "a", direction: "desc"}] }; const expectedDesc = [ - {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: 0}, {a: 0}, {a: undefined}, {a: NaN} + {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: NaN}, {a: undefined}, {a: NaN}, {a: NaN} ]; expectedDesc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( @@ -710,7 +710,7 @@ describe("__table", () => { sort: [{column: "a", direction: "asc"}] }; const expectedAsc = [ - {a: 0}, {a: 0}, {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: undefined}, {a: NaN} + {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: NaN}, {a: undefined}, {a: NaN}, {a: NaN} ]; expectedAsc.schema = [{name: "a", type: "number"}]; assert.deepStrictEqual( @@ -790,24 +790,6 @@ describe("__table", () => { ] ); }); - - it("__table infers schema", () => { - assert.deepStrictEqual( - __table( - [ - {a: 1, b: 2, c: 3}, - {a: 2, b: 4, c: 6}, - {a: 3, b: 6, c: 9} - ], - EMPTY_TABLE_DATA.operations - ).schema, - [ - {name: "a", type: "number"}, - {name: "b", type: "number"}, - {name: "c", type: "number"} - ] - ); - }); }); describe("getTypeValidator filters accurately", () => { From c7583f7cda164d6c6e8ce0d0984653435648d3c1 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 14:15:44 -0800 Subject: [PATCH 27/90] Infer integers even if type is number --- src/table.js | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 0c03e7ab..ff21dde1 100644 --- a/src/table.js +++ b/src/table.js @@ -798,7 +798,11 @@ export function inferSchema(source) { if (Array.isArray(value)) typeCounts[key].array++; else if (value instanceof Date) typeCounts[key].date++; else if (value instanceof ArrayBuffer) typeCounts[key].buffer++; - // number, bigint, boolean, or object + else if (type === "number") { + if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++; + else typeCounts[key].number++; + } + // bigint, boolean, or object else if (type in typeCounts[key]) typeCounts[key][type]++; else if (value !== null && value !== undefined) typeCounts[key].other++; } else { From b9aceae434af62f86da2387bf2045222526d4f23 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 14:16:03 -0800 Subject: [PATCH 28/90] Coercion improvements --- src/table.js | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/table.js b/src/table.js index ff21dde1..7f601e5b 100644 --- a/src/table.js +++ b/src/table.js @@ -538,14 +538,14 @@ export function getTypeValidator(colType) { } } -function coerceToType(value, type, options = {}) { +export function coerceToType(value, type, options = {}) { const defaultValue = options.soft ? value : null; const numberDefault = defaultValue === null ? NaN : defaultValue; switch (type) { case "string": return value === "string" ? value.trim() - : value + : value || value === 0 ? value.toString() : defaultValue; case "boolean": @@ -557,8 +557,12 @@ function coerceToType(value, type, options = {}) { case "integer": return !value || isNaN(parseInt(value)) ? numberDefault : parseInt(value); case "bigint": - // eslint-disable-next-line no-undef - return !value || isNaN(value) ? numberDefault : BigInt(value); + return typeof value === "bigint" + ? value + : !value || isNaN(value) + ? numberDefault + // eslint-disable-next-line no-undef + : BigInt(value); case "number": { return !value || isNaN(value) ? numberDefault : Number(value); } @@ -585,7 +589,7 @@ function coerceToType(value, type, options = {}) { } case "array": if (Array.isArray(value)) return value; - return Array.isArray(Array.from(value)) + return value && Array.isArray(Array.from(value)) ? Array.from(value) : defaultValue; case "buffer": From 41941c55a90248bb3aea728a963d0e9c9597d9a2 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 14:16:23 -0800 Subject: [PATCH 29/90] Add unit tests --- test/table-test.js | 252 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 244 insertions(+), 8 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index 8d73a3f5..67044ca0 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1,4 +1,10 @@ -import {getTypeValidator, makeQueryTemplate, __table} from "../src/table.js"; +import { + coerceToType, + getTypeValidator, + inferSchema, + makeQueryTemplate, + __table +} from "../src/table.js"; import assert from "assert"; export const EMPTY_TABLE_DATA = { @@ -449,9 +455,9 @@ describe("__table", () => { {a: 3, b: 6, c: 9} ]; source.schema = [ - {name: "a", type: "number"}, - {name: "b", type: "number"}, - {name: "c", type: "number"} + {name: "a", type: "integer"}, + {name: "b", type: "integer"}, + {name: "c", type: "integer"} ]; }); @@ -483,7 +489,7 @@ describe("__table", () => { select: {columns: ["a"]} }; const expectedSelected = [{a: 1}, {a: 2}, {a: 3}]; - expectedSelected.schema = [{name: "a", type: "number"}]; + expectedSelected.schema = [{name: "a", type: "integer"}]; assert.deepStrictEqual( __table(source, operationsSelectedColumns), expectedSelected @@ -586,7 +592,7 @@ describe("__table", () => { it("__table filter primitive lte + gte", () => { const expectedPrimitive = [1]; - expectedPrimitive.schema = [{name: "value", type: "number"}]; + expectedPrimitive.schema = [{name: "value", type: "integer"}]; assert.deepStrictEqual( __table([1, 2, 3], { ...EMPTY_TABLE_DATA.operations, @@ -700,7 +706,7 @@ describe("__table", () => { const expectedDesc = [ {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: NaN}, {a: undefined}, {a: NaN}, {a: NaN} ]; - expectedDesc.schema = [{name: "a", type: "number"}]; + expectedDesc.schema = [{name: "a", type: "integer"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsDesc), expectedDesc @@ -712,7 +718,7 @@ describe("__table", () => { const expectedAsc = [ {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: NaN}, {a: undefined}, {a: NaN}, {a: NaN} ]; - expectedAsc.schema = [{name: "a", type: "number"}]; + expectedAsc.schema = [{name: "a", type: "integer"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsAsc), expectedAsc @@ -873,3 +879,233 @@ describe("getTypeValidator filters accurately", () => { ); }); }); + +describe("inferSchema", () => { + it("infers schema", () => { + assert.deepStrictEqual( + inferSchema( + [ + {a: 1, b: 2, c: 3}, + {a: 2, b: 4, c: 6}, + {a: 3, b: 6, c: 9} + ] + ), + [ + {name: "a", type: "integer"}, + {name: "b", type: "integer"}, + {name: "c", type: "integer"} + ] + ); + }); + + it("infers numbers", () => { + assert.deepStrictEqual( + inferSchema([{a: 1.2}, {a: 3.4}, {a: 5.67}]), + [{name: "a", type: "number"}] + ); + }); + + it("infers booleans", () => { + assert.deepStrictEqual( + inferSchema([{a: "true"}, {a: false}, {a: "false"}, {a: null}]), + [{name: "a", type: "boolean"}] + ); + }); + + it("infers dates", () => { + assert.deepStrictEqual( + inferSchema( + [{a: "1/2/20"}, {a: "2020-11-12 12:23:00"}, {a: new Date()}, {a: null}] + ), + [{name: "a", type: "date"}] + ); + }); + + it("infers strings", () => { + assert.deepStrictEqual( + inferSchema([{a: "cat"}, {a: "dog"}, {a: "1,000"}, {a: null}]), + [{name: "a", type: "string"}] + ); + }); + + it("infers arrays", () => { + assert.deepStrictEqual( + inferSchema([{a: ["cat"]}, {a: ["dog"]}, {a: []}, {a: null}]), + [{name: "a", type: "array"}] + ); + }); + + it("infers objects", () => { + assert.deepStrictEqual( + inferSchema([{a: {d: ["cat"]}}, {a: {d: "dog"}}, {a: {d: 12}}, {a: null}]), + [{name: "a", type: "object"}] + ); + }); + + it("infers bigints", () => { + assert.deepStrictEqual( + inferSchema([{a: 10n}, {a: 22n}, {a: 22}, {a: null}]), + [{name: "a", type: "bigint"}] + ); + }); + + it("infers buffers", () => { + assert.deepStrictEqual( + inferSchema([{a: new ArrayBuffer()}, {a: new ArrayBuffer()}, {a: null}]), + [{name: "a", type: "buffer"}] + ); + }); + + it("infers other", () => { + assert.deepStrictEqual( + inferSchema([{a: Symbol("a")}, {a: Symbol("b")}, {a: null}]), + [{name: "a", type: "other"}] + ); + }); +}); + +describe("coerceToType", () => { + it("coerces to number", () => { + assert.deepStrictEqual(coerceToType("1.2", "number"), 1.2); + assert.deepStrictEqual(coerceToType("A", "number"), NaN); + }); + + it("soft coerces to number", () => { + assert.deepStrictEqual(coerceToType("1.2", "number", {soft: true}), 1.2); + assert.deepStrictEqual(coerceToType("a", "number", {soft: true}), "a"); + }); + + it("coerces to boolean", () => { + assert.deepStrictEqual(coerceToType("true", "boolean"), true); + assert.deepStrictEqual(coerceToType(true, "boolean"), true); + assert.deepStrictEqual(coerceToType("A", "boolean"), null); + }); + + it("soft coerces to boolean", () => { + assert.deepStrictEqual(coerceToType("false", "boolean", {soft: true}), false); + assert.deepStrictEqual(coerceToType("a", "boolean", {soft: true}), "a"); + }); + + it("coerces to date", () => { + const invalidDate = new Date("a"); + assert.deepStrictEqual( + coerceToType("12/12/2020", "date"), + new Date("12/12/2020") + ); + assert.deepStrictEqual( + coerceToType("2022-01-01T12:34:00Z", "date"), + new Date("2022-01-01T12:34:00Z") + ); + assert.deepStrictEqual( + coerceToType("B", "date").toString(), + invalidDate.toString() + ); + assert.deepStrictEqual( + coerceToType({a: 1}, "date").toString(), + invalidDate.toString() + ); + }); + + it("soft coerces to date", () => { + assert.deepStrictEqual( + coerceToType("12/12/2020", "date", {soft: true}), + new Date("12/12/2020") + ); + assert.deepStrictEqual(coerceToType("B", "date", {soft: true}), "B"); + assert.deepStrictEqual( + coerceToType({a: 1}, "date", {soft: true}).toString(), + "[object Object]" + ); + }); + + it("coerces to string", () => { + assert.deepStrictEqual(coerceToType(true, "string"), "true"); + assert.deepStrictEqual(coerceToType(10, "string"), "10"); + assert.deepStrictEqual(coerceToType({a: 1}, "string"), "[object Object]"); + assert.deepStrictEqual(coerceToType(0, "string"), "0"); + assert.deepStrictEqual(coerceToType(null, "string"), null); + assert.deepStrictEqual(coerceToType(undefined, "string"), null); + }); + + it("soft coerces to string", () => { + assert.deepStrictEqual(coerceToType(true, "string", {soft: true}), "true"); + assert.deepStrictEqual(coerceToType(null, "string", {soft: true}), null); + assert.deepStrictEqual( + coerceToType(undefined, "string", {soft: true}), + undefined + ); + }); + + it("coerces to array", () => { + assert.deepStrictEqual(coerceToType("true", "array"), ["t", "r", "u", "e"]); + assert.deepStrictEqual(coerceToType([1,2,3], "array"), [1,2,3]); + assert.deepStrictEqual(coerceToType(null, "array"), null); + assert.deepStrictEqual(coerceToType(undefined, "array"), null); + }); + + it("soft coerces to array", () => { + assert.deepStrictEqual(coerceToType([1,2,3], "array", {soft: true}), [1,2,3]); + assert.deepStrictEqual( + coerceToType(undefined, "array", {soft: true}), + undefined + ); + }); + + it("coerces to object", () => { + assert.deepStrictEqual(coerceToType("true", "object"), "true"); + assert.deepStrictEqual(coerceToType({a: 1, b: 2}, "object"), {a: 1, b: 2}); + assert.deepStrictEqual(coerceToType(null, "object"), null); + assert.deepStrictEqual(coerceToType(undefined, "object"), null); + }); + + it("soft coerces to object", () => { + assert.deepStrictEqual(coerceToType("true", "object", {soft: true}), "true"); + assert.deepStrictEqual(coerceToType(null, "object", {soft: true}), null); + assert.deepStrictEqual( + coerceToType(undefined, "object", {soft: true}), + undefined + ); + }); + + it("coerces to bigint", () => { + assert.deepStrictEqual(coerceToType("32", "bigint"), 32n); + assert.deepStrictEqual(coerceToType(32n, "bigint"), 32n); + assert.deepStrictEqual(coerceToType("A", "bigint"), NaN); + }); + + it("soft coerces to bigint", () => { + assert.deepStrictEqual(coerceToType("32", "bigint", {soft: true}), 32n); + assert.deepStrictEqual(coerceToType("A", "bigint", {soft: true}), "A"); + }); + + it("coerces to buffer", () => { + assert.deepStrictEqual( + coerceToType(new ArrayBuffer(), "buffer"), + new ArrayBuffer() + ); + assert.deepStrictEqual(coerceToType("A", "buffer"), "A"); + assert.deepStrictEqual(coerceToType(undefined, "buffer"), null); + }); + + it("soft coerces to buffer", () => { + assert.deepStrictEqual(coerceToType("A", "buffer"), "A"); + assert.deepStrictEqual( + coerceToType(undefined, "buffer", {soft: true}), + undefined + ); + }); + + it("coerces to other", () => { + assert.deepStrictEqual(coerceToType(0, "other"), 0); + assert.deepStrictEqual(coerceToType("a", "other"), "a"); + assert.deepStrictEqual(coerceToType(undefined, "other"), null); + }); + + it("soft coerces to other", () => { + assert.deepStrictEqual(coerceToType("a", "other", {soft: true}), "a"); + assert.deepStrictEqual( + coerceToType(undefined, "other", {soft: true}), + undefined + ); + }); +}); From 7ee04567f3136241206c308aaee3928af7eccf61 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 14:21:46 -0800 Subject: [PATCH 30/90] Formatting --- src/table.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 7f601e5b..986aec33 100644 --- a/src/table.js +++ b/src/table.js @@ -66,7 +66,12 @@ function objectHasEnumerableKeys(value) { } function isQueryResultSetSchema(schemas) { - return (Array.isArray(schemas) && schemas.every((s) => s && typeof s.name === "string" && typeof s.type === "string")); + return ( + Array.isArray(schemas) && + schemas.every( + (s) => s && typeof s.name === "string" && typeof s.type === "string" + ) + ); } function isQueryResultSetColumns(columns) { From fa60ecc30a5c5e9fccc7aeb05a9498151961c7c6 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 26 Jan 2023 14:55:39 -0800 Subject: [PATCH 31/90] Fix bug --- src/table.js | 5 +++-- test/table-test.js | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/table.js b/src/table.js index 986aec33..b799cbc8 100644 --- a/src/table.js +++ b/src/table.js @@ -816,10 +816,11 @@ export function inferSchema(source) { else if (value !== null && value !== undefined) typeCounts[key].other++; } else { if (value === "true" || value === "false") typeCounts[key].boolean++; - else if (!isNaN(value)) { + else if (value && !isNaN(value)) { if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++; else typeCounts[key].number++; } else if ( + value && value.match( /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ ) @@ -827,7 +828,7 @@ export function inferSchema(source) { typeCounts[key].date++; // the long regex accepts dates in the form of ISOString and // LocaleDateString, with or without times - else typeCounts[key].string++; + else if (value) typeCounts[key].string++; } } } diff --git a/test/table-test.js b/test/table-test.js index 67044ca0..a9b7f0ba 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -886,8 +886,8 @@ describe("inferSchema", () => { inferSchema( [ {a: 1, b: 2, c: 3}, - {a: 2, b: 4, c: 6}, - {a: 3, b: 6, c: 9} + {a: "", b: 4, c: 6}, + {a: "", b: 6, c: 9} ] ), [ From 1382c0b87e1c65c3c605a78160333f2c4e4c5c8b Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 08:24:04 -0800 Subject: [PATCH 32/90] Move coercion outside of loop --- src/table.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index b799cbc8..275b8a23 100644 --- a/src/table.js +++ b/src/table.js @@ -585,7 +585,7 @@ export function coerceToType(value, type, options = {}) { } } // Invalid Date objects are still instances of Date, but they return true - // from isNaN(). If we are "soft-coercing," we want to return the original + // from isNaN(). If we are soft-coercing, we want to return the original // value for invalid dates. Otherwise, if a date is invalid, return an // Invalid Date object. const date = new Date(value); @@ -624,11 +624,11 @@ export function __table(source, operations) { if (operations.type) { for (const {name, type} of operations.type) { types.set(name, type); - source = source.map(d => coerceRow(d, types)); // update schema with user-selected type const colIndex = schema.findIndex((col) => col.name === name); if (colIndex > -1) schema[colIndex] = {name, type}; } + source = source.map(d => coerceRow(d, types)); } // Coerce data according to new schema, unless that happened due to // operations.type, above. If coercing for the first time here, perform with From ac1219e86e295692f320b4e358e25649ec1645e3 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 11:30:20 -0800 Subject: [PATCH 33/90] Perform intended check --- src/table.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 275b8a23..0db86322 100644 --- a/src/table.js +++ b/src/table.js @@ -547,8 +547,8 @@ export function coerceToType(value, type, options = {}) { const defaultValue = options.soft ? value : null; const numberDefault = defaultValue === null ? NaN : defaultValue; switch (type) { - case "string": - return value === "string" + case "string": + return typeof value === "string" ? value.trim() : value || value === 0 ? value.toString() From 9d2c39fb44a6ab14d801af6fcd42b5da183cee49 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 11:42:53 -0800 Subject: [PATCH 34/90] Update BigInt coercion and tests --- src/table.js | 6 +++--- test/table-test.js | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 0db86322..9a47253a 100644 --- a/src/table.js +++ b/src/table.js @@ -564,7 +564,7 @@ export function coerceToType(value, type, options = {}) { case "bigint": return typeof value === "bigint" ? value - : !value || isNaN(value) + : !value || isNaN(value) || !Number.isInteger(+value) ? numberDefault // eslint-disable-next-line no-undef : BigInt(value); @@ -808,7 +808,7 @@ export function inferSchema(source) { else if (value instanceof Date) typeCounts[key].date++; else if (value instanceof ArrayBuffer) typeCounts[key].buffer++; else if (type === "number") { - if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++; + if (Number.isInteger(+value)) typeCounts[key].integer++; else typeCounts[key].number++; } // bigint, boolean, or object @@ -817,7 +817,7 @@ export function inferSchema(source) { } else { if (value === "true" || value === "false") typeCounts[key].boolean++; else if (value && !isNaN(value)) { - if (/^-?[0-9]+$/.test(value)) typeCounts[key].integer++; + if (Number.isInteger(+value)) typeCounts[key].integer++; else typeCounts[key].number++; } else if ( value && diff --git a/test/table-test.js b/test/table-test.js index a9b7f0ba..0a0e9dc4 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1070,11 +1070,13 @@ describe("coerceToType", () => { it("coerces to bigint", () => { assert.deepStrictEqual(coerceToType("32", "bigint"), 32n); assert.deepStrictEqual(coerceToType(32n, "bigint"), 32n); + assert.deepStrictEqual(coerceToType(1.1, "bigint"), NaN); assert.deepStrictEqual(coerceToType("A", "bigint"), NaN); }); it("soft coerces to bigint", () => { assert.deepStrictEqual(coerceToType("32", "bigint", {soft: true}), 32n); + assert.deepStrictEqual(coerceToType(1.1, "bigint", {soft: true}), 1.1); assert.deepStrictEqual(coerceToType("A", "bigint", {soft: true}), "A"); }); From c22c7817b65fd18e95818e9ecbe050ba6b0c60f6 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 11:53:52 -0800 Subject: [PATCH 35/90] Update handling of whitespace --- src/table.js | 11 ++++++----- test/table-test.js | 12 ++++++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/table.js b/src/table.js index 9a47253a..16195bf8 100644 --- a/src/table.js +++ b/src/table.js @@ -546,17 +546,18 @@ export function getTypeValidator(colType) { export function coerceToType(value, type, options = {}) { const defaultValue = options.soft ? value : null; const numberDefault = defaultValue === null ? NaN : defaultValue; + const stringValue = typeof value === "string" && !options.soft ? value.trim() : value; switch (type) { case "string": return typeof value === "string" - ? value.trim() + ? stringValue : value || value === 0 ? value.toString() : defaultValue; case "boolean": - return value === true || value === "true" + return value === true || stringValue === "true" ? true - : value === false || value === "false" + : value === false || stringValue === "false" ? false : defaultValue; case "integer": @@ -576,12 +577,12 @@ export function coerceToType(value, type, options = {}) { if (typeof value === "string") { let match; if ( - (match = value.match( + (match = stringValue.match( /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ )) ) { if (fixTz && !!match[4] && !match[7]) - value = value.replace(/-/g, "/").replace(/T/, " "); + value = stringValue.replace(/-/g, "/").replace(/T/, " "); } } // Invalid Date objects are still instances of Date, but they return true diff --git a/test/table-test.js b/test/table-test.js index 0a0e9dc4..30fd0daa 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -977,6 +977,7 @@ describe("coerceToType", () => { it("coerces to boolean", () => { assert.deepStrictEqual(coerceToType("true", "boolean"), true); + assert.deepStrictEqual(coerceToType("true ", "boolean"), true); assert.deepStrictEqual(coerceToType(true, "boolean"), true); assert.deepStrictEqual(coerceToType("A", "boolean"), null); }); @@ -984,6 +985,7 @@ describe("coerceToType", () => { it("soft coerces to boolean", () => { assert.deepStrictEqual(coerceToType("false", "boolean", {soft: true}), false); assert.deepStrictEqual(coerceToType("a", "boolean", {soft: true}), "a"); + assert.deepStrictEqual(coerceToType("true ", "boolean", {soft: true}), "true "); }); it("coerces to date", () => { @@ -992,6 +994,11 @@ describe("coerceToType", () => { coerceToType("12/12/2020", "date"), new Date("12/12/2020") ); + // with whitespace + assert.deepStrictEqual( + coerceToType("12/12/2020 ", "date", {soft: true}), + new Date("12/12/2020") + ); assert.deepStrictEqual( coerceToType("2022-01-01T12:34:00Z", "date"), new Date("2022-01-01T12:34:00Z") @@ -1011,6 +1018,11 @@ describe("coerceToType", () => { coerceToType("12/12/2020", "date", {soft: true}), new Date("12/12/2020") ); + // with whitespace + assert.deepStrictEqual( + coerceToType("12/12/2020 ", "date", {soft: true}), + new Date("12/12/2020") + ); assert.deepStrictEqual(coerceToType("B", "date", {soft: true}), "B"); assert.deepStrictEqual( coerceToType({a: 1}, "date", {soft: true}).toString(), From f44e3effabbb4b228bdc2a327314174bd13f0f48 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 13:17:55 -0800 Subject: [PATCH 36/90] Improve handling of ints, BigInts, and numbers --- src/table.js | 24 ++++++++++++++++++------ test/table-test.js | 14 ++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/src/table.js b/src/table.js index 16195bf8..e3106fb7 100644 --- a/src/table.js +++ b/src/table.js @@ -546,9 +546,10 @@ export function getTypeValidator(colType) { export function coerceToType(value, type, options = {}) { const defaultValue = options.soft ? value : null; const numberDefault = defaultValue === null ? NaN : defaultValue; - const stringValue = typeof value === "string" && !options.soft ? value.trim() : value; + const stringValue = + typeof value === "string" && !options.soft ? value.trim() : value; switch (type) { - case "string": + case "string": return typeof value === "string" ? stringValue : value || value === 0 @@ -561,16 +562,27 @@ export function coerceToType(value, type, options = {}) { ? false : defaultValue; case "integer": - return !value || isNaN(parseInt(value)) ? numberDefault : parseInt(value); + return value === 0 + ? value + : !value || isNaN(parseInt(value)) + ? numberDefault + : parseInt(value); case "bigint": return typeof value === "bigint" ? value + : value === 0 || value === true || value === false + ? // eslint-disable-next-line no-undef + BigInt(value) : !value || isNaN(value) || !Number.isInteger(+value) ? numberDefault - // eslint-disable-next-line no-undef - : BigInt(value); + : // eslint-disable-next-line no-undef + BigInt(value); case "number": { - return !value || isNaN(value) ? numberDefault : Number(value); + return value === 0 + ? value + : !value || isNaN(value) + ? numberDefault + : Number(value); } case "date": { if (value instanceof Date) return value; diff --git a/test/table-test.js b/test/table-test.js index 30fd0daa..1bdd9014 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -965,13 +965,22 @@ describe("inferSchema", () => { }); describe("coerceToType", () => { + it("coerces to integer", () => { + assert.deepStrictEqual(coerceToType("1.2", "integer"), 1); + assert.deepStrictEqual(coerceToType("10", "integer"), 10); + assert.deepStrictEqual(coerceToType(0, "integer"), 0); + assert.deepStrictEqual(coerceToType("A", "integer"), NaN); + }); + it("coerces to number", () => { assert.deepStrictEqual(coerceToType("1.2", "number"), 1.2); + assert.deepStrictEqual(coerceToType(0, "number"), 0); assert.deepStrictEqual(coerceToType("A", "number"), NaN); }); it("soft coerces to number", () => { assert.deepStrictEqual(coerceToType("1.2", "number", {soft: true}), 1.2); + assert.deepStrictEqual(coerceToType(0, "number", {soft: true}), 0); assert.deepStrictEqual(coerceToType("a", "number", {soft: true}), "a"); }); @@ -1082,6 +1091,11 @@ describe("coerceToType", () => { it("coerces to bigint", () => { assert.deepStrictEqual(coerceToType("32", "bigint"), 32n); assert.deepStrictEqual(coerceToType(32n, "bigint"), 32n); + assert.deepStrictEqual(coerceToType(0, "bigint"), 0n); + assert.deepStrictEqual(coerceToType(false, "bigint"), 0n); + assert.deepStrictEqual(coerceToType(true, "bigint"), 1n); + assert.deepStrictEqual(coerceToType(null, "bigint"), NaN); + assert.deepStrictEqual(coerceToType(undefined, "bigint"), NaN); assert.deepStrictEqual(coerceToType(1.1, "bigint"), NaN); assert.deepStrictEqual(coerceToType("A", "bigint"), NaN); }); From 060f21d929513d3d5b7524d7df23c81f88bb162e Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 13:36:32 -0800 Subject: [PATCH 37/90] Update coercion to arrays and objects --- src/table.js | 10 ++++++---- test/table-test.js | 16 ++++++++-------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/table.js b/src/table.js index e3106fb7..67b60762 100644 --- a/src/table.js +++ b/src/table.js @@ -607,11 +607,13 @@ export function coerceToType(value, type, options = {}) { } case "array": if (Array.isArray(value)) return value; - return value && Array.isArray(Array.from(value)) - ? Array.from(value) - : defaultValue; - case "buffer": + return [value]; case "object": + // this will return true for everything except null, undefined, strings, + // numbers, boolean, and symbols, so may yield unexpected results. + if (typeof value === "object") return value; + return {value: value}; + case "buffer": case "other": default: return value || value === 0 ? value : defaultValue; diff --git a/test/table-test.js b/test/table-test.js index 1bdd9014..afa16ab0 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1058,33 +1058,33 @@ describe("coerceToType", () => { }); it("coerces to array", () => { - assert.deepStrictEqual(coerceToType("true", "array"), ["t", "r", "u", "e"]); + assert.deepStrictEqual(coerceToType("true", "array"), ["true"]); assert.deepStrictEqual(coerceToType([1,2,3], "array"), [1,2,3]); - assert.deepStrictEqual(coerceToType(null, "array"), null); - assert.deepStrictEqual(coerceToType(undefined, "array"), null); + assert.deepStrictEqual(coerceToType(null, "array"), [null]); + assert.deepStrictEqual(coerceToType(undefined, "array"), [undefined]); }); it("soft coerces to array", () => { assert.deepStrictEqual(coerceToType([1,2,3], "array", {soft: true}), [1,2,3]); assert.deepStrictEqual( coerceToType(undefined, "array", {soft: true}), - undefined + [undefined] ); }); it("coerces to object", () => { - assert.deepStrictEqual(coerceToType("true", "object"), "true"); + assert.deepStrictEqual(coerceToType("true", "object"), {value: "true"}); assert.deepStrictEqual(coerceToType({a: 1, b: 2}, "object"), {a: 1, b: 2}); assert.deepStrictEqual(coerceToType(null, "object"), null); - assert.deepStrictEqual(coerceToType(undefined, "object"), null); + assert.deepStrictEqual(coerceToType(undefined, "object"), {value: undefined}); }); it("soft coerces to object", () => { - assert.deepStrictEqual(coerceToType("true", "object", {soft: true}), "true"); + assert.deepStrictEqual(coerceToType("true", "object", {soft: true}), {value: "true"}); assert.deepStrictEqual(coerceToType(null, "object", {soft: true}), null); assert.deepStrictEqual( coerceToType(undefined, "object", {soft: true}), - undefined + {value: undefined} ); }); From ac1d36571d56a0a84c56fdf588c34b70cd5cda06 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 14:27:09 -0800 Subject: [PATCH 38/90] Infer bigints from strings --- src/table.js | 5 +++-- test/table-test.js | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 67b60762..b777a8de 100644 --- a/src/table.js +++ b/src/table.js @@ -796,7 +796,7 @@ function initKey() { string: 0, array: 0, object: 0, - bigint: 0, // TODO for csv, tsv? + bigint: 0, buffer: 0 }; } @@ -834,7 +834,8 @@ export function inferSchema(source) { else if (value && !isNaN(value)) { if (Number.isInteger(+value)) typeCounts[key].integer++; else typeCounts[key].number++; - } else if ( + } else if (/^\d+n$/.test(value)) typeCounts[key].bigint++; + else if ( value && value.match( /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ diff --git a/test/table-test.js b/test/table-test.js index afa16ab0..966d85c3 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -947,6 +947,10 @@ describe("inferSchema", () => { inferSchema([{a: 10n}, {a: 22n}, {a: 22}, {a: null}]), [{name: "a", type: "bigint"}] ); + assert.deepStrictEqual( + inferSchema([{a: "10n"}, {a: "22n"}, {a: "0n"}, {a: null}]), + [{name: "a", type: "bigint"}] + ); }); it("infers buffers", () => { From fc2b128637cbec309b6e49a2fb587d164d674fa2 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 15:23:09 -0800 Subject: [PATCH 39/90] Check percentage of values conforming to inferred type --- src/table.js | 9 ++++++++- test/table-test.js | 28 ++++++++++++++-------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/table.js b/src/table.js index b777a8de..d828e94e 100644 --- a/src/table.js +++ b/src/table.js @@ -849,9 +849,16 @@ export function inferSchema(source) { } } for (const col in typeCounts) { + let type = greatest(Object.keys(typeCounts[col]), (d) => typeCounts[col][d]); + // If over 90% of the sampled data counted as this type, use it. Otherwise, + // use "other." + type = + typeCounts[col][type] / Math.min(source.length, sampleSize) >= 0.9 + ? type + : "other"; schema.push({ name: col, - type: greatest(Object.keys(typeCounts[col]), (d) => typeCounts[col][d]) + type: type }); } return schema; diff --git a/test/table-test.js b/test/table-test.js index 966d85c3..3d694cbd 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -704,9 +704,9 @@ describe("__table", () => { sort: [{column: "a", direction: "desc"}] }; const expectedDesc = [ - {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: NaN}, {a: undefined}, {a: NaN}, {a: NaN} + {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: null}, {a: null}, {a: null} ]; - expectedDesc.schema = [{name: "a", type: "integer"}]; + expectedDesc.schema = [{name: "a", type: "other"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsDesc), expectedDesc @@ -716,9 +716,9 @@ describe("__table", () => { sort: [{column: "a", direction: "asc"}] }; const expectedAsc = [ - {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: NaN}, {a: undefined}, {a: NaN}, {a: NaN} + {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: null}, {a: null}, {a: null} ]; - expectedAsc.schema = [{name: "a", type: "integer"}]; + expectedAsc.schema = [{name: "a", type: "other"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsAsc), expectedAsc @@ -891,7 +891,7 @@ describe("inferSchema", () => { ] ), [ - {name: "a", type: "integer"}, + {name: "a", type: "other"}, {name: "b", type: "integer"}, {name: "c", type: "integer"} ] @@ -907,7 +907,7 @@ describe("inferSchema", () => { it("infers booleans", () => { assert.deepStrictEqual( - inferSchema([{a: "true"}, {a: false}, {a: "false"}, {a: null}]), + inferSchema([{a: "true"}, {a: false}, {a: "false"}]), [{name: "a", type: "boolean"}] ); }); @@ -915,7 +915,7 @@ describe("inferSchema", () => { it("infers dates", () => { assert.deepStrictEqual( inferSchema( - [{a: "1/2/20"}, {a: "2020-11-12 12:23:00"}, {a: new Date()}, {a: null}] + [{a: "1/2/20"}, {a: "2020-11-12 12:23:00"}, {a: new Date()}, {a: "2020-1-12"}] ), [{name: "a", type: "date"}] ); @@ -923,46 +923,46 @@ describe("inferSchema", () => { it("infers strings", () => { assert.deepStrictEqual( - inferSchema([{a: "cat"}, {a: "dog"}, {a: "1,000"}, {a: null}]), + inferSchema([{a: "cat"}, {a: "dog"}, {a: "1,000"}, {a: "null"}]), [{name: "a", type: "string"}] ); }); it("infers arrays", () => { assert.deepStrictEqual( - inferSchema([{a: ["cat"]}, {a: ["dog"]}, {a: []}, {a: null}]), + inferSchema([{a: ["cat"]}, {a: ["dog"]}, {a: []}]), [{name: "a", type: "array"}] ); }); it("infers objects", () => { assert.deepStrictEqual( - inferSchema([{a: {d: ["cat"]}}, {a: {d: "dog"}}, {a: {d: 12}}, {a: null}]), + inferSchema([{a: {d: ["cat"]}}, {a: {d: "dog"}}, {a: {d: 12}}]), [{name: "a", type: "object"}] ); }); it("infers bigints", () => { assert.deepStrictEqual( - inferSchema([{a: 10n}, {a: 22n}, {a: 22}, {a: null}]), + inferSchema([{a: 10n}, {a: 22n}, {a: 1n}]), [{name: "a", type: "bigint"}] ); assert.deepStrictEqual( - inferSchema([{a: "10n"}, {a: "22n"}, {a: "0n"}, {a: null}]), + inferSchema([{a: "10n"}, {a: "22n"}, {a: "0n"}]), [{name: "a", type: "bigint"}] ); }); it("infers buffers", () => { assert.deepStrictEqual( - inferSchema([{a: new ArrayBuffer()}, {a: new ArrayBuffer()}, {a: null}]), + inferSchema([{a: new ArrayBuffer()}, {a: new ArrayBuffer()}]), [{name: "a", type: "buffer"}] ); }); it("infers other", () => { assert.deepStrictEqual( - inferSchema([{a: Symbol("a")}, {a: Symbol("b")}, {a: null}]), + inferSchema([{a: Symbol("a")}, {a: Symbol("b")}]), [{name: "a", type: "other"}] ); }); From 9e53e618d830be201a3c947132994dacb720a971 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 15:40:34 -0800 Subject: [PATCH 40/90] Remove soft coercion option --- src/table.js | 36 +++++++------------ test/table-test.js | 87 +++++++--------------------------------------- 2 files changed, 25 insertions(+), 98 deletions(-) diff --git a/src/table.js b/src/table.js index d828e94e..8ca3d71a 100644 --- a/src/table.js +++ b/src/table.js @@ -543,29 +543,26 @@ export function getTypeValidator(colType) { } } -export function coerceToType(value, type, options = {}) { - const defaultValue = options.soft ? value : null; - const numberDefault = defaultValue === null ? NaN : defaultValue; - const stringValue = - typeof value === "string" && !options.soft ? value.trim() : value; +export function coerceToType(value, type) { + const stringValue = typeof value === "string" ? value.trim() : value; switch (type) { case "string": return typeof value === "string" ? stringValue : value || value === 0 ? value.toString() - : defaultValue; + : null; case "boolean": return value === true || stringValue === "true" ? true : value === false || stringValue === "false" ? false - : defaultValue; + : null; case "integer": return value === 0 ? value : !value || isNaN(parseInt(value)) - ? numberDefault + ? NaN : parseInt(value); case "bigint": return typeof value === "bigint" @@ -574,14 +571,14 @@ export function coerceToType(value, type, options = {}) { ? // eslint-disable-next-line no-undef BigInt(value) : !value || isNaN(value) || !Number.isInteger(+value) - ? numberDefault + ? NaN : // eslint-disable-next-line no-undef BigInt(value); case "number": { return value === 0 ? value : !value || isNaN(value) - ? numberDefault + ? NaN : Number(value); } case "date": { @@ -597,13 +594,7 @@ export function coerceToType(value, type, options = {}) { value = stringValue.replace(/-/g, "/").replace(/T/, " "); } } - // Invalid Date objects are still instances of Date, but they return true - // from isNaN(). If we are soft-coercing, we want to return the original - // value for invalid dates. Otherwise, if a date is invalid, return an - // Invalid Date object. - const date = new Date(value); - const dateDefault = options.soft ? value : date; - return isNaN(date) ? dateDefault : date; + return new Date(value); } case "array": if (Array.isArray(value)) return value; @@ -616,7 +607,7 @@ export function coerceToType(value, type, options = {}) { case "buffer": case "other": default: - return value || value === 0 ? value : defaultValue; + return value || value === 0 ? value : null; } } @@ -646,10 +637,9 @@ export function __table(source, operations) { source = source.map(d => coerceRow(d, types)); } // Coerce data according to new schema, unless that happened due to - // operations.type, above. If coercing for the first time here, perform with - // option {soft: true}, so that original values remain visible. + // operations.type, above. if (newlyInferred && !operations.type) { - source = source.map(d => coerceRow(d, types, {soft: true})); + source = source.map(d => coerceRow(d, types)); } for (const {type, operands} of operations.filter) { const [{value: column}] = operands; @@ -771,12 +761,12 @@ export function __table(source, operations) { return source; } -function coerceRow(object, types, options) { +function coerceRow(object, types) { let coerced = {}; for (var key in object) { const type = types.get(key); const value = object[key]; - coerced[key] = coerceToType(value, type, options); + coerced[key] = coerceToType(value, type); } return coerced; } diff --git a/test/table-test.js b/test/table-test.js index 3d694cbd..f7b61618 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -974,18 +974,16 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType("10", "integer"), 10); assert.deepStrictEqual(coerceToType(0, "integer"), 0); assert.deepStrictEqual(coerceToType("A", "integer"), NaN); + assert.deepStrictEqual(coerceToType(null, "integer"), NaN); }); it("coerces to number", () => { assert.deepStrictEqual(coerceToType("1.2", "number"), 1.2); assert.deepStrictEqual(coerceToType(0, "number"), 0); assert.deepStrictEqual(coerceToType("A", "number"), NaN); - }); - - it("soft coerces to number", () => { - assert.deepStrictEqual(coerceToType("1.2", "number", {soft: true}), 1.2); - assert.deepStrictEqual(coerceToType(0, "number", {soft: true}), 0); - assert.deepStrictEqual(coerceToType("a", "number", {soft: true}), "a"); + assert.deepStrictEqual(coerceToType(null, "number"), NaN); + assert.deepStrictEqual(coerceToType(undefined, "number"), NaN); + assert.deepStrictEqual(coerceToType({a: 1}, "number"), NaN); }); it("coerces to boolean", () => { @@ -993,12 +991,8 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType("true ", "boolean"), true); assert.deepStrictEqual(coerceToType(true, "boolean"), true); assert.deepStrictEqual(coerceToType("A", "boolean"), null); - }); - - it("soft coerces to boolean", () => { - assert.deepStrictEqual(coerceToType("false", "boolean", {soft: true}), false); - assert.deepStrictEqual(coerceToType("a", "boolean", {soft: true}), "a"); - assert.deepStrictEqual(coerceToType("true ", "boolean", {soft: true}), "true "); + assert.deepStrictEqual(coerceToType(null, "boolean"), null); + assert.deepStrictEqual(coerceToType(undefined, "boolean"), null); }); it("coerces to date", () => { @@ -1024,22 +1018,9 @@ describe("coerceToType", () => { coerceToType({a: 1}, "date").toString(), invalidDate.toString() ); - }); - - it("soft coerces to date", () => { assert.deepStrictEqual( - coerceToType("12/12/2020", "date", {soft: true}), - new Date("12/12/2020") - ); - // with whitespace - assert.deepStrictEqual( - coerceToType("12/12/2020 ", "date", {soft: true}), - new Date("12/12/2020") - ); - assert.deepStrictEqual(coerceToType("B", "date", {soft: true}), "B"); - assert.deepStrictEqual( - coerceToType({a: 1}, "date", {soft: true}).toString(), - "[object Object]" + coerceToType(null, "date").toString(), + new Date(null).toString() ); }); @@ -1050,15 +1031,7 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(0, "string"), "0"); assert.deepStrictEqual(coerceToType(null, "string"), null); assert.deepStrictEqual(coerceToType(undefined, "string"), null); - }); - - it("soft coerces to string", () => { - assert.deepStrictEqual(coerceToType(true, "string", {soft: true}), "true"); - assert.deepStrictEqual(coerceToType(null, "string", {soft: true}), null); - assert.deepStrictEqual( - coerceToType(undefined, "string", {soft: true}), - undefined - ); + assert.deepStrictEqual(coerceToType(NaN, "string"), null); }); it("coerces to array", () => { @@ -1068,14 +1041,6 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(undefined, "array"), [undefined]); }); - it("soft coerces to array", () => { - assert.deepStrictEqual(coerceToType([1,2,3], "array", {soft: true}), [1,2,3]); - assert.deepStrictEqual( - coerceToType(undefined, "array", {soft: true}), - [undefined] - ); - }); - it("coerces to object", () => { assert.deepStrictEqual(coerceToType("true", "object"), {value: "true"}); assert.deepStrictEqual(coerceToType({a: 1, b: 2}, "object"), {a: 1, b: 2}); @@ -1083,15 +1048,6 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(undefined, "object"), {value: undefined}); }); - it("soft coerces to object", () => { - assert.deepStrictEqual(coerceToType("true", "object", {soft: true}), {value: "true"}); - assert.deepStrictEqual(coerceToType(null, "object", {soft: true}), null); - assert.deepStrictEqual( - coerceToType(undefined, "object", {soft: true}), - {value: undefined} - ); - }); - it("coerces to bigint", () => { assert.deepStrictEqual(coerceToType("32", "bigint"), 32n); assert.deepStrictEqual(coerceToType(32n, "bigint"), 32n); @@ -1102,12 +1058,7 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(undefined, "bigint"), NaN); assert.deepStrictEqual(coerceToType(1.1, "bigint"), NaN); assert.deepStrictEqual(coerceToType("A", "bigint"), NaN); - }); - - it("soft coerces to bigint", () => { - assert.deepStrictEqual(coerceToType("32", "bigint", {soft: true}), 32n); - assert.deepStrictEqual(coerceToType(1.1, "bigint", {soft: true}), 1.1); - assert.deepStrictEqual(coerceToType("A", "bigint", {soft: true}), "A"); + assert.deepStrictEqual(coerceToType(NaN, "bigint"), NaN); }); it("coerces to buffer", () => { @@ -1116,28 +1067,14 @@ describe("coerceToType", () => { new ArrayBuffer() ); assert.deepStrictEqual(coerceToType("A", "buffer"), "A"); + assert.deepStrictEqual(coerceToType(null, "buffer"), null); assert.deepStrictEqual(coerceToType(undefined, "buffer"), null); }); - it("soft coerces to buffer", () => { - assert.deepStrictEqual(coerceToType("A", "buffer"), "A"); - assert.deepStrictEqual( - coerceToType(undefined, "buffer", {soft: true}), - undefined - ); - }); - it("coerces to other", () => { assert.deepStrictEqual(coerceToType(0, "other"), 0); assert.deepStrictEqual(coerceToType("a", "other"), "a"); + assert.deepStrictEqual(coerceToType(null, "other"), null); assert.deepStrictEqual(coerceToType(undefined, "other"), null); }); - - it("soft coerces to other", () => { - assert.deepStrictEqual(coerceToType("a", "other", {soft: true}), "a"); - assert.deepStrictEqual( - coerceToType(undefined, "other", {soft: true}), - undefined - ); - }); }); From 43b073bb950ba42ec27768565417a7affc40b59d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Mon, 30 Jan 2023 16:05:30 -0800 Subject: [PATCH 41/90] Work with all keys present in data source --- src/table.js | 23 ++++++++++++++++++++++- test/table-test.js | 4 +++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 8ca3d71a..fcf26ff9 100644 --- a/src/table.js +++ b/src/table.js @@ -791,7 +791,27 @@ function initKey() { }; } +// We need to show *all* keys present in the array of Objects +function getAllKeys(rows) { + const keys = new Set(); + for (const row of rows) { + // avoid crash if row is null or undefined + if (row) { + // only enumerable properties + for (const key in row) { + // only own properties + if (Object.prototype.hasOwnProperty.call(row, key)) { + // unique properties, in the order they appear + keys.add(key); + } + } + } + } + return Array.from(keys); +} + export function inferSchema(source) { + const allKeys = getAllKeys(source); const schema = []; const sampleSize = 100; let sample = source.slice(0, sampleSize); @@ -799,10 +819,11 @@ export function inferSchema(source) { sample = sample.map((d) => { return {value: d}; }); + allKeys.push("value"); } const typeCounts = {}; for (const d of sample) { - for (const key in d) { + for (const key of allKeys) { if (!typeCounts[key]) typeCounts[key] = initKey(); // for json and sqlite, we already have some types, but for csv and tsv, all // columns are strings here. diff --git a/test/table-test.js b/test/table-test.js index f7b61618..3c66fca1 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -609,7 +609,7 @@ describe("__table", () => { expectedPrimitive ); const expectedUint32Array = [1]; - expectedUint32Array.schema = []; + expectedUint32Array.schema = [{name: "value", type: "other"}]; assert.deepStrictEqual( __table(Uint32Array.of(1, 2, 3), { ...EMPTY_TABLE_DATA.operations, @@ -990,6 +990,8 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType("true", "boolean"), true); assert.deepStrictEqual(coerceToType("true ", "boolean"), true); assert.deepStrictEqual(coerceToType(true, "boolean"), true); + assert.deepStrictEqual(coerceToType("false", "boolean"), false); + assert.deepStrictEqual(coerceToType(false, "boolean"), false); assert.deepStrictEqual(coerceToType("A", "boolean"), null); assert.deepStrictEqual(coerceToType(null, "boolean"), null); assert.deepStrictEqual(coerceToType(undefined, "boolean"), null); From 744412384249364b37f6990157d50c7598082c1a Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 08:17:31 -0800 Subject: [PATCH 42/90] Add inferred property to schema elements --- src/table.js | 3 ++- test/table-test.js | 56 +++++++++++++++++++++++----------------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/table.js b/src/table.js index fcf26ff9..53fdaa36 100644 --- a/src/table.js +++ b/src/table.js @@ -869,7 +869,8 @@ export function inferSchema(source) { : "other"; schema.push({ name: col, - type: type + type: type, + inferred: type }); } return schema; diff --git a/test/table-test.js b/test/table-test.js index 3c66fca1..5387e20b 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -455,9 +455,9 @@ describe("__table", () => { {a: 3, b: 6, c: 9} ]; source.schema = [ - {name: "a", type: "integer"}, - {name: "b", type: "integer"}, - {name: "c", type: "integer"} + {name: "a", type: "integer", inferred: "integer"}, + {name: "b", type: "integer", inferred: "integer"}, + {name: "c", type: "integer", inferred: "integer"} ]; }); @@ -489,7 +489,7 @@ describe("__table", () => { select: {columns: ["a"]} }; const expectedSelected = [{a: 1}, {a: 2}, {a: 3}]; - expectedSelected.schema = [{name: "a", type: "integer"}]; + expectedSelected.schema = [{name: "a", type: "integer", inferred: "integer"}]; assert.deepStrictEqual( __table(source, operationsSelectedColumns), expectedSelected @@ -592,7 +592,7 @@ describe("__table", () => { it("__table filter primitive lte + gte", () => { const expectedPrimitive = [1]; - expectedPrimitive.schema = [{name: "value", type: "integer"}]; + expectedPrimitive.schema = [{name: "value", type: "integer", inferred: "integer"}]; assert.deepStrictEqual( __table([1, 2, 3], { ...EMPTY_TABLE_DATA.operations, @@ -609,7 +609,7 @@ describe("__table", () => { expectedPrimitive ); const expectedUint32Array = [1]; - expectedUint32Array.schema = [{name: "value", type: "other"}]; + expectedUint32Array.schema = [{name: "value", type: "other", inferred: "other"}]; assert.deepStrictEqual( __table(Uint32Array.of(1, 2, 3), { ...EMPTY_TABLE_DATA.operations, @@ -646,7 +646,7 @@ describe("__table", () => { {a: new Date("2021-01-03")} ]; const expected = [{a: new Date("2021-01-02")}]; - expected.schema = [{name: "a", type: "date"}]; + expected.schema = [{name: "a", type: "date", inferred: "date"}]; assert.deepStrictEqual(__table(source, operationsEquals), expected); }); @@ -706,7 +706,7 @@ describe("__table", () => { const expectedDesc = [ {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: null}, {a: null}, {a: null} ]; - expectedDesc.schema = [{name: "a", type: "other"}]; + expectedDesc.schema = [{name: "a", type: "other", inferred: "other"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsDesc), expectedDesc @@ -718,7 +718,7 @@ describe("__table", () => { const expectedAsc = [ {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: null}, {a: null}, {a: null} ]; - expectedAsc.schema = [{name: "a", type: "other"}]; + expectedAsc.schema = [{name: "a", type: "other", inferred: "other"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsAsc), expectedAsc @@ -783,16 +783,16 @@ describe("__table", () => { ["a", "b", "c"] ); source.schema = [ - {name: "a", type: "number"}, - {name: "b", type: "number"}, - {name: "c", type: "number"} + {name: "a", type: "number", inferred: "number"}, + {name: "b", type: "number", inferred: "number"}, + {name: "c", type: "number", inferred: "number"} ]; assert.deepStrictEqual( __table(source, EMPTY_TABLE_DATA.operations).schema, [ - {name: "a", type: "number"}, - {name: "b", type: "number"}, - {name: "c", type: "number"} + {name: "a", type: "number", inferred: "number"}, + {name: "b", type: "number", inferred: "number"}, + {name: "c", type: "number", inferred: "number"} ] ); }); @@ -891,9 +891,9 @@ describe("inferSchema", () => { ] ), [ - {name: "a", type: "other"}, - {name: "b", type: "integer"}, - {name: "c", type: "integer"} + {name: "a", type: "other", inferred: "other"}, + {name: "b", type: "integer", inferred: "integer"}, + {name: "c", type: "integer", inferred: "integer"} ] ); }); @@ -901,14 +901,14 @@ describe("inferSchema", () => { it("infers numbers", () => { assert.deepStrictEqual( inferSchema([{a: 1.2}, {a: 3.4}, {a: 5.67}]), - [{name: "a", type: "number"}] + [{name: "a", type: "number", inferred: "number"}] ); }); it("infers booleans", () => { assert.deepStrictEqual( inferSchema([{a: "true"}, {a: false}, {a: "false"}]), - [{name: "a", type: "boolean"}] + [{name: "a", type: "boolean", inferred: "boolean"}] ); }); @@ -917,53 +917,53 @@ describe("inferSchema", () => { inferSchema( [{a: "1/2/20"}, {a: "2020-11-12 12:23:00"}, {a: new Date()}, {a: "2020-1-12"}] ), - [{name: "a", type: "date"}] + [{name: "a", type: "date", inferred: "date"}] ); }); it("infers strings", () => { assert.deepStrictEqual( inferSchema([{a: "cat"}, {a: "dog"}, {a: "1,000"}, {a: "null"}]), - [{name: "a", type: "string"}] + [{name: "a", type: "string", inferred: "string"}] ); }); it("infers arrays", () => { assert.deepStrictEqual( inferSchema([{a: ["cat"]}, {a: ["dog"]}, {a: []}]), - [{name: "a", type: "array"}] + [{name: "a", type: "array", inferred: "array"}] ); }); it("infers objects", () => { assert.deepStrictEqual( inferSchema([{a: {d: ["cat"]}}, {a: {d: "dog"}}, {a: {d: 12}}]), - [{name: "a", type: "object"}] + [{name: "a", type: "object", inferred: "object"}] ); }); it("infers bigints", () => { assert.deepStrictEqual( inferSchema([{a: 10n}, {a: 22n}, {a: 1n}]), - [{name: "a", type: "bigint"}] + [{name: "a", type: "bigint", inferred: "bigint"}] ); assert.deepStrictEqual( inferSchema([{a: "10n"}, {a: "22n"}, {a: "0n"}]), - [{name: "a", type: "bigint"}] + [{name: "a", type: "bigint", inferred: "bigint"}] ); }); it("infers buffers", () => { assert.deepStrictEqual( inferSchema([{a: new ArrayBuffer()}, {a: new ArrayBuffer()}]), - [{name: "a", type: "buffer"}] + [{name: "a", type: "buffer", inferred: "buffer"}] ); }); it("infers other", () => { assert.deepStrictEqual( inferSchema([{a: Symbol("a")}, {a: Symbol("b")}]), - [{name: "a", type: "other"}] + [{name: "a", type: "other", inferred: "other"}] ); }); }); From 99e985e8e00bf2c719535f5ecb7489f86093654e Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 10:01:11 -0800 Subject: [PATCH 43/90] Support raw type --- src/table.js | 2 ++ test/table-test.js | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/src/table.js b/src/table.js index 53fdaa36..221328c2 100644 --- a/src/table.js +++ b/src/table.js @@ -546,6 +546,8 @@ export function getTypeValidator(colType) { export function coerceToType(value, type) { const stringValue = typeof value === "string" ? value.trim() : value; switch (type) { + case "raw": + return value; case "string": return typeof value === "string" ? stringValue diff --git a/test/table-test.js b/test/table-test.js index 5387e20b..ee87aedb 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1079,4 +1079,21 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(null, "other"), null); assert.deepStrictEqual(coerceToType(undefined, "other"), null); }); + + it("coerces to raw", () => { + assert.deepStrictEqual(coerceToType(0, "raw"), 0); + assert.deepStrictEqual(coerceToType("a", "raw"), "a"); + assert.deepStrictEqual(coerceToType(32n, "raw"), 32n); + assert.deepStrictEqual( + coerceToType(new ArrayBuffer(), "raw"), + new ArrayBuffer() + ); + assert.deepStrictEqual(coerceToType([1,2,3], "raw"), [1,2,3]); + assert.deepStrictEqual( + coerceToType("12/12/2020 ", "raw", {soft: true}), "12/12/2020 " + ); + assert.deepStrictEqual(coerceToType(null, "raw"), null); + assert.deepStrictEqual(coerceToType(NaN, "raw"), NaN); + assert.deepStrictEqual(coerceToType(undefined, "raw"), undefined); + }); }); From 31048d6fe9bd7d2878ddb0c05a3a2464971e7c74 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 10:03:16 -0800 Subject: [PATCH 44/90] Remove stray options --- test/table-test.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index ee87aedb..339867b5 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1005,7 +1005,7 @@ describe("coerceToType", () => { ); // with whitespace assert.deepStrictEqual( - coerceToType("12/12/2020 ", "date", {soft: true}), + coerceToType("12/12/2020 ", "date"), new Date("12/12/2020") ); assert.deepStrictEqual( @@ -1090,7 +1090,7 @@ describe("coerceToType", () => { ); assert.deepStrictEqual(coerceToType([1,2,3], "raw"), [1,2,3]); assert.deepStrictEqual( - coerceToType("12/12/2020 ", "raw", {soft: true}), "12/12/2020 " + coerceToType("12/12/2020 ", "raw"), "12/12/2020 " ); assert.deepStrictEqual(coerceToType(null, "raw"), null); assert.deepStrictEqual(coerceToType(NaN, "raw"), NaN); From 6acab1d6b1f350b338b774adcdcb0c91b3e8655d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 10:28:22 -0800 Subject: [PATCH 45/90] Don't mutate schema --- src/table.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 221328c2..70181557 100644 --- a/src/table.js +++ b/src/table.js @@ -633,8 +633,9 @@ export function __table(source, operations) { for (const {name, type} of operations.type) { types.set(name, type); // update schema with user-selected type + if (schema === input.schema) schema = schema.slice(); // copy on write const colIndex = schema.findIndex((col) => col.name === name); - if (colIndex > -1) schema[colIndex] = {name, type}; + if (colIndex > -1) schema[colIndex] = {...schema[colIndex], type}; } source = source.map(d => coerceRow(d, types)); } From bbe7cf3c26aa421e843051e9761ddcd72c218e25 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 10:29:11 -0800 Subject: [PATCH 46/90] Rename variable --- src/table.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 70181557..a4ea40e9 100644 --- a/src/table.js +++ b/src/table.js @@ -620,10 +620,10 @@ export function coerceToType(value, type) { export function __table(source, operations) { const input = source; let {schema, columns} = source; - let newlyInferred = false; + let inferredSchema = false; if (!schema || !isQueryResultSetSchema(schema)) { schema = inferSchema(source); - newlyInferred = true; + inferredSchema = true; } let primitive = arrayIsPrimitive(source); if (primitive) source = Array.from(source, (value) => ({value})); @@ -641,7 +641,7 @@ export function __table(source, operations) { } // Coerce data according to new schema, unless that happened due to // operations.type, above. - if (newlyInferred && !operations.type) { + if (inferredSchema && !operations.type) { source = source.map(d => coerceRow(d, types)); } for (const {type, operands} of operations.filter) { From 19e3eefcc8337a7648953999d3a7beb9a922403a Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 10:31:49 -0800 Subject: [PATCH 47/90] Remove unnecessary check --- src/table.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index a4ea40e9..bfb299ca 100644 --- a/src/table.js +++ b/src/table.js @@ -621,7 +621,7 @@ export function __table(source, operations) { const input = source; let {schema, columns} = source; let inferredSchema = false; - if (!schema || !isQueryResultSetSchema(schema)) { + if (!isQueryResultSetSchema(schema)) { schema = inferSchema(source); inferredSchema = true; } From 72f681bc5a6a6bffa6bdca55adea0013aad6aeda Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 11:16:57 -0800 Subject: [PATCH 48/90] Use schema rather than object keys --- src/table.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/table.js b/src/table.js index bfb299ca..2c8ffafe 100644 --- a/src/table.js +++ b/src/table.js @@ -764,12 +764,12 @@ export function __table(source, operations) { return source; } -function coerceRow(object, types) { - let coerced = {}; - for (var key in object) { - const type = types.get(key); - const value = object[key]; - coerced[key] = coerceToType(value, type); +function coerceRow(object, types, schema) { + const coerced = {}; + for (const col of schema) { + const type = types.get(col.name); + const value = object[col.name]; + coerced[col.name] = type === "raw" ? value : coerceToType(value, type); } return coerced; } From fc4023715ae435386ca31a1f2db1e2654901a34c Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 11:17:24 -0800 Subject: [PATCH 49/90] Updates based on feedback --- src/table.js | 13 +++++-------- test/table-test.js | 20 ++++---------------- 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/src/table.js b/src/table.js index 2c8ffafe..a823b649 100644 --- a/src/table.js +++ b/src/table.js @@ -546,8 +546,6 @@ export function getTypeValidator(colType) { export function coerceToType(value, type) { const stringValue = typeof value === "string" ? value.trim() : value; switch (type) { - case "raw": - return value; case "string": return typeof value === "string" ? stringValue @@ -637,12 +635,11 @@ export function __table(source, operations) { const colIndex = schema.findIndex((col) => col.name === name); if (colIndex > -1) schema[colIndex] = {...schema[colIndex], type}; } - source = source.map(d => coerceRow(d, types)); - } - // Coerce data according to new schema, unless that happened due to - // operations.type, above. - if (inferredSchema && !operations.type) { - source = source.map(d => coerceRow(d, types)); + source = source.map(d => coerceRow(d, types, schema)); + } else if (inferredSchema) { + // Coerce data according to new schema, unless that happened due to + // operations.type, above. + source = source.map(d => coerceRow(d, types, schema)); } for (const {type, operands} of operations.filter) { const [{value: column}] = operands; diff --git a/test/table-test.js b/test/table-test.js index 339867b5..239aad37 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1080,20 +1080,8 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(undefined, "other"), null); }); - it("coerces to raw", () => { - assert.deepStrictEqual(coerceToType(0, "raw"), 0); - assert.deepStrictEqual(coerceToType("a", "raw"), "a"); - assert.deepStrictEqual(coerceToType(32n, "raw"), 32n); - assert.deepStrictEqual( - coerceToType(new ArrayBuffer(), "raw"), - new ArrayBuffer() - ); - assert.deepStrictEqual(coerceToType([1,2,3], "raw"), [1,2,3]); - assert.deepStrictEqual( - coerceToType("12/12/2020 ", "raw"), "12/12/2020 " - ); - assert.deepStrictEqual(coerceToType(null, "raw"), null); - assert.deepStrictEqual(coerceToType(NaN, "raw"), NaN); - assert.deepStrictEqual(coerceToType(undefined, "raw"), undefined); - }); + // Note: if type is "raw", coerceToType() will not be called. Instead, values + // will be returned from coerceRow(). + // it("coerces to raw", () => { + // }); }); From c5e070c303bb33bc24a848f6db429547961981e7 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 11:20:27 -0800 Subject: [PATCH 50/90] Don't getAllKeys if we have columns --- src/table.js | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/table.js b/src/table.js index a823b649..41c944a6 100644 --- a/src/table.js +++ b/src/table.js @@ -620,7 +620,7 @@ export function __table(source, operations) { let {schema, columns} = source; let inferredSchema = false; if (!isQueryResultSetSchema(schema)) { - schema = inferSchema(source); + schema = inferSchema(source, columns); inferredSchema = true; } let primitive = arrayIsPrimitive(source); @@ -810,8 +810,7 @@ function getAllKeys(rows) { return Array.from(keys); } -export function inferSchema(source) { - const allKeys = getAllKeys(source); +export function inferSchema(source, columns = getAllKeys(source)) { const schema = []; const sampleSize = 100; let sample = source.slice(0, sampleSize); @@ -819,43 +818,43 @@ export function inferSchema(source) { sample = sample.map((d) => { return {value: d}; }); - allKeys.push("value"); + columns.push("value"); } const typeCounts = {}; for (const d of sample) { - for (const key of allKeys) { - if (!typeCounts[key]) typeCounts[key] = initKey(); + for (const col of columns) { + if (!typeCounts[col]) typeCounts[col] = initKey(); // for json and sqlite, we already have some types, but for csv and tsv, all // columns are strings here. - const type = typeof d[key]; - const value = type === "string" ? d[key].trim() : d[key]; + const type = typeof d[col]; + const value = type === "string" ? d[col].trim() : d[col]; if (type !== "string") { - if (Array.isArray(value)) typeCounts[key].array++; - else if (value instanceof Date) typeCounts[key].date++; - else if (value instanceof ArrayBuffer) typeCounts[key].buffer++; + if (Array.isArray(value)) typeCounts[col].array++; + else if (value instanceof Date) typeCounts[col].date++; + else if (value instanceof ArrayBuffer) typeCounts[col].buffer++; else if (type === "number") { - if (Number.isInteger(+value)) typeCounts[key].integer++; - else typeCounts[key].number++; + if (Number.isInteger(+value)) typeCounts[col].integer++; + else typeCounts[col].number++; } // bigint, boolean, or object - else if (type in typeCounts[key]) typeCounts[key][type]++; - else if (value !== null && value !== undefined) typeCounts[key].other++; + else if (type in typeCounts[col]) typeCounts[col][type]++; + else if (value !== null && value !== undefined) typeCounts[col].other++; } else { - if (value === "true" || value === "false") typeCounts[key].boolean++; + if (value === "true" || value === "false") typeCounts[col].boolean++; else if (value && !isNaN(value)) { - if (Number.isInteger(+value)) typeCounts[key].integer++; - else typeCounts[key].number++; - } else if (/^\d+n$/.test(value)) typeCounts[key].bigint++; + if (Number.isInteger(+value)) typeCounts[col].integer++; + else typeCounts[col].number++; + } else if (/^\d+n$/.test(value)) typeCounts[col].bigint++; else if ( value && value.match( /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ ) ) - typeCounts[key].date++; + typeCounts[col].date++; // the long regex accepts dates in the form of ISOString and // LocaleDateString, with or without times - else if (value) typeCounts[key].string++; + else if (value) typeCounts[col].string++; } } } From a4443e37f11955a18e7e2627493f099bc38ac3c8 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 11:33:09 -0800 Subject: [PATCH 51/90] Don't export for now --- src/index.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/index.js b/src/index.js index 9ae35311..3afaf70c 100644 --- a/src/index.js +++ b/src/index.js @@ -9,6 +9,5 @@ export { isDataArray, isDatabaseClient, __table as applyDataTableOperations, - getTypeValidator, - inferSchema + getTypeValidator } from "./table.js"; From 834aa7156ce1b16dd59a671da6a915b3b030c2cc Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 15:05:43 -0800 Subject: [PATCH 52/90] Don't mutate columns --- src/table.js | 17 ++++++++++------- test/table-test.js | 12 ++++++++++++ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/table.js b/src/table.js index 41c944a6..536b42a8 100644 --- a/src/table.js +++ b/src/table.js @@ -620,7 +620,8 @@ export function __table(source, operations) { let {schema, columns} = source; let inferredSchema = false; if (!isQueryResultSetSchema(schema)) { - schema = inferSchema(source, columns); + if (arrayIsPrimitive(source)) schema = inferFromPrimitive(source, columns); + else schema = inferSchema(source, columns); inferredSchema = true; } let primitive = arrayIsPrimitive(source); @@ -810,16 +811,18 @@ function getAllKeys(rows) { return Array.from(keys); } +export function inferFromPrimitive(source) { + const primitiveSource = source.map((d) => { + return {value: d}; + }); + return inferSchema(primitiveSource, ["value"]); +} + + export function inferSchema(source, columns = getAllKeys(source)) { const schema = []; const sampleSize = 100; let sample = source.slice(0, sampleSize); - if (arrayIsPrimitive(sample)) { - sample = sample.map((d) => { - return {value: d}; - }); - columns.push("value"); - } const typeCounts = {}; for (const d of sample) { for (const col of columns) { diff --git a/test/table-test.js b/test/table-test.js index 239aad37..30d8cc24 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1,6 +1,7 @@ import { coerceToType, getTypeValidator, + inferFromPrimitive, inferSchema, makeQueryTemplate, __table @@ -966,6 +967,17 @@ describe("inferSchema", () => { [{name: "a", type: "other", inferred: "other"}] ); }); + + it("infers from arrays of primitives", () => { + assert.deepStrictEqual( + inferFromPrimitive(["true", "false"]), + [{name: "value", type: "boolean", inferred: "boolean"}] + ); + assert.deepStrictEqual( + inferFromPrimitive([1, 2, 3]), + [{name: "value", type: "integer", inferred: "integer"}] + ); + }); }); describe("coerceToType", () => { From 9b5637169e7cb3d8d3b966c1c0e733eed3f3651c Mon Sep 17 00:00:00 2001 From: Libbey White Date: Tue, 31 Jan 2023 15:12:26 -0800 Subject: [PATCH 53/90] Formatting --- src/table.js | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 536b42a8..963c1d20 100644 --- a/src/table.js +++ b/src/table.js @@ -815,10 +815,9 @@ export function inferFromPrimitive(source) { const primitiveSource = source.map((d) => { return {value: d}; }); - return inferSchema(primitiveSource, ["value"]); + return inferSchema(primitiveSource, ["value"]); } - export function inferSchema(source, columns = getAllKeys(source)) { const schema = []; const sampleSize = 100; @@ -862,7 +861,10 @@ export function inferSchema(source, columns = getAllKeys(source)) { } } for (const col in typeCounts) { - let type = greatest(Object.keys(typeCounts[col]), (d) => typeCounts[col][d]); + let type = greatest( + Object.keys(typeCounts[col]), + (d) => typeCounts[col][d] + ); // If over 90% of the sampled data counted as this type, use it. Otherwise, // use "other." type = From 9dec278815b75e6a80a527f03b27f5528eac034e Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 08:49:12 -0800 Subject: [PATCH 54/90] Remove unnecessary inferFromPrimitive function --- src/table.js | 13 +++---------- test/table-test.js | 14 +------------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/src/table.js b/src/table.js index 963c1d20..2bed2e8e 100644 --- a/src/table.js +++ b/src/table.js @@ -618,14 +618,14 @@ export function coerceToType(value, type) { export function __table(source, operations) { const input = source; let {schema, columns} = source; + let primitive = arrayIsPrimitive(source); + if (primitive) source = Array.from(source, (value) => ({value})); let inferredSchema = false; if (!isQueryResultSetSchema(schema)) { - if (arrayIsPrimitive(source)) schema = inferFromPrimitive(source, columns); + if (primitive) schema = inferSchema(source, ["value"]); else schema = inferSchema(source, columns); inferredSchema = true; } - let primitive = arrayIsPrimitive(source); - if (primitive) source = Array.from(source, (value) => ({value})); // Combine column types from schema with user-selected types in operations const types = new Map(schema.map(({name, type}) => [name, type])); if (operations.type) { @@ -811,13 +811,6 @@ function getAllKeys(rows) { return Array.from(keys); } -export function inferFromPrimitive(source) { - const primitiveSource = source.map((d) => { - return {value: d}; - }); - return inferSchema(primitiveSource, ["value"]); -} - export function inferSchema(source, columns = getAllKeys(source)) { const schema = []; const sampleSize = 100; diff --git a/test/table-test.js b/test/table-test.js index 30d8cc24..46db0cad 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1,7 +1,6 @@ import { coerceToType, getTypeValidator, - inferFromPrimitive, inferSchema, makeQueryTemplate, __table @@ -610,7 +609,7 @@ describe("__table", () => { expectedPrimitive ); const expectedUint32Array = [1]; - expectedUint32Array.schema = [{name: "value", type: "other", inferred: "other"}]; + expectedUint32Array.schema = [{name: "value", type: "integer", inferred: "integer"}]; assert.deepStrictEqual( __table(Uint32Array.of(1, 2, 3), { ...EMPTY_TABLE_DATA.operations, @@ -967,17 +966,6 @@ describe("inferSchema", () => { [{name: "a", type: "other", inferred: "other"}] ); }); - - it("infers from arrays of primitives", () => { - assert.deepStrictEqual( - inferFromPrimitive(["true", "false"]), - [{name: "value", type: "boolean", inferred: "boolean"}] - ); - assert.deepStrictEqual( - inferFromPrimitive([1, 2, 3]), - [{name: "value", type: "integer", inferred: "integer"}] - ); - }); }); describe("coerceToType", () => { From c2a20db25b777e63389ed8acec44e65ad3892708 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 09:09:39 -0800 Subject: [PATCH 55/90] Update string coercion --- src/table.js | 16 ++++++++-------- test/table-test.js | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/table.js b/src/table.js index 2bed2e8e..ffc9ef35 100644 --- a/src/table.js +++ b/src/table.js @@ -68,9 +68,7 @@ function objectHasEnumerableKeys(value) { function isQueryResultSetSchema(schemas) { return ( Array.isArray(schemas) && - schemas.every( - (s) => s && typeof s.name === "string" && typeof s.type === "string" - ) + schemas.every(isColumnSchema) ); } @@ -78,6 +76,10 @@ function isQueryResultSetColumns(columns) { return (Array.isArray(columns) && columns.every((name) => typeof name === "string")); } +function isColumnSchema(schema) { + return schema && typeof schema.name === "string" && typeof schema.type === "string"; +} + // Returns true if the value represents an array of primitives (i.e., a // single-column table). This should only be passed values for which // isDataArray returns true. @@ -547,11 +549,9 @@ export function coerceToType(value, type) { const stringValue = typeof value === "string" ? value.trim() : value; switch (type) { case "string": - return typeof value === "string" - ? stringValue - : value || value === 0 - ? value.toString() - : null; + return typeof value === "string" || value == null + ? value + : String(value); case "boolean": return value === true || stringValue === "true" ? true diff --git a/test/table-test.js b/test/table-test.js index 46db0cad..210bede8 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1028,12 +1028,13 @@ describe("coerceToType", () => { it("coerces to string", () => { assert.deepStrictEqual(coerceToType(true, "string"), "true"); + assert.deepStrictEqual(coerceToType(false, "string"), "false"); assert.deepStrictEqual(coerceToType(10, "string"), "10"); assert.deepStrictEqual(coerceToType({a: 1}, "string"), "[object Object]"); assert.deepStrictEqual(coerceToType(0, "string"), "0"); assert.deepStrictEqual(coerceToType(null, "string"), null); - assert.deepStrictEqual(coerceToType(undefined, "string"), null); - assert.deepStrictEqual(coerceToType(NaN, "string"), null); + assert.deepStrictEqual(coerceToType(undefined, "string"), undefined); + assert.deepStrictEqual(coerceToType(NaN, "string"), "NaN"); }); it("coerces to array", () => { From 2956af90059f67fb6c48995259fcf2c99dd564fa Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 09:13:08 -0800 Subject: [PATCH 56/90] Update boolean coercion --- src/table.js | 12 +++++++----- test/table-test.js | 4 +++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/table.js b/src/table.js index ffc9ef35..f6a0d0ad 100644 --- a/src/table.js +++ b/src/table.js @@ -553,11 +553,13 @@ export function coerceToType(value, type) { ? value : String(value); case "boolean": - return value === true || stringValue === "true" - ? true - : value === false || stringValue === "false" - ? false - : null; + if (typeof value === "string") { + const trimValue = value.trim(); + return trimValue === "true" ? true : trimValue === "false" ? false : null; + } + return typeof value === "boolean" || value == null + ? value + : Boolean(value); case "integer": return value === 0 ? value diff --git a/test/table-test.js b/test/table-test.js index 210bede8..a6aa5d64 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -992,9 +992,11 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(true, "boolean"), true); assert.deepStrictEqual(coerceToType("false", "boolean"), false); assert.deepStrictEqual(coerceToType(false, "boolean"), false); + assert.deepStrictEqual(coerceToType(1, "boolean"), true); + assert.deepStrictEqual(coerceToType(0, "boolean"), false); assert.deepStrictEqual(coerceToType("A", "boolean"), null); assert.deepStrictEqual(coerceToType(null, "boolean"), null); - assert.deepStrictEqual(coerceToType(undefined, "boolean"), null); + assert.deepStrictEqual(coerceToType(undefined, "boolean"), undefined); }); it("coerces to date", () => { From e92247ba443217e7278450facf670e95365ae95f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 09:24:15 -0800 Subject: [PATCH 57/90] Remove stringValue --- src/table.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index f6a0d0ad..959ebb2d 100644 --- a/src/table.js +++ b/src/table.js @@ -546,7 +546,6 @@ export function getTypeValidator(colType) { } export function coerceToType(value, type) { - const stringValue = typeof value === "string" ? value.trim() : value; switch (type) { case "string": return typeof value === "string" || value == null @@ -586,14 +585,15 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date) return value; if (typeof value === "string") { + const trimValue = value.trim(); let match; if ( - (match = stringValue.match( + (match = trimValue.match( /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ )) ) { if (fixTz && !!match[4] && !match[7]) - value = stringValue.replace(/-/g, "/").replace(/T/, " "); + value = trimValue.replace(/-/g, "/").replace(/T/, " "); } } return new Date(value); From e45a130bb49423eccabc0af74722d14a45a74527 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 10:06:22 -0800 Subject: [PATCH 58/90] Remove coercion for some types --- src/table.js | 16 +++------------- test/table-test.js | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/table.js b/src/table.js index 959ebb2d..a9c9ce95 100644 --- a/src/table.js +++ b/src/table.js @@ -559,12 +559,6 @@ export function coerceToType(value, type) { return typeof value === "boolean" || value == null ? value : Boolean(value); - case "integer": - return value === 0 - ? value - : !value || isNaN(parseInt(value)) - ? NaN - : parseInt(value); case "bigint": return typeof value === "bigint" ? value @@ -575,6 +569,7 @@ export function coerceToType(value, type) { ? NaN : // eslint-disable-next-line no-undef BigInt(value); + case "integer": // not a target type for coercion, but can be inferred case "number": { return value === 0 ? value @@ -599,17 +594,12 @@ export function coerceToType(value, type) { return new Date(value); } case "array": - if (Array.isArray(value)) return value; - return [value]; case "object": - // this will return true for everything except null, undefined, strings, - // numbers, boolean, and symbols, so may yield unexpected results. - if (typeof value === "object") return value; - return {value: value}; case "buffer": case "other": - default: return value || value === 0 ? value : null; + default: + throw new Error(`Unable to coerce to type: ${type}`); } } diff --git a/test/table-test.js b/test/table-test.js index a6aa5d64..acfaf3c5 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -970,7 +970,10 @@ describe("inferSchema", () => { describe("coerceToType", () => { it("coerces to integer", () => { - assert.deepStrictEqual(coerceToType("1.2", "integer"), 1); + // "integer" is not a target type for coercion, but can be inferred. So it + // will be handled as an alias for "number". + assert.deepStrictEqual(coerceToType("1.2", "integer"), 1.2); + assert.deepStrictEqual(coerceToType(1.2, "integer"), 1.2); assert.deepStrictEqual(coerceToType("10", "integer"), 10); assert.deepStrictEqual(coerceToType(0, "integer"), 0); assert.deepStrictEqual(coerceToType("A", "integer"), NaN); @@ -1040,17 +1043,17 @@ describe("coerceToType", () => { }); it("coerces to array", () => { - assert.deepStrictEqual(coerceToType("true", "array"), ["true"]); + // "array" is not a target type for coercion, but can be inferred. assert.deepStrictEqual(coerceToType([1,2,3], "array"), [1,2,3]); - assert.deepStrictEqual(coerceToType(null, "array"), [null]); - assert.deepStrictEqual(coerceToType(undefined, "array"), [undefined]); + assert.deepStrictEqual(coerceToType(null, "array"), null); + assert.deepStrictEqual(coerceToType(undefined, "array"), null); }); it("coerces to object", () => { - assert.deepStrictEqual(coerceToType("true", "object"), {value: "true"}); + // "object" is not a target type for coercion, but can be inferred. assert.deepStrictEqual(coerceToType({a: 1, b: 2}, "object"), {a: 1, b: 2}); assert.deepStrictEqual(coerceToType(null, "object"), null); - assert.deepStrictEqual(coerceToType(undefined, "object"), {value: undefined}); + assert.deepStrictEqual(coerceToType(undefined, "object"), null); }); it("coerces to bigint", () => { @@ -1067,6 +1070,7 @@ describe("coerceToType", () => { }); it("coerces to buffer", () => { + // "buffer" is not a target type for coercion, but can be inferred. assert.deepStrictEqual( coerceToType(new ArrayBuffer(), "buffer"), new ArrayBuffer() @@ -1077,6 +1081,7 @@ describe("coerceToType", () => { }); it("coerces to other", () => { + // "other" is not a target type for coercion, but can be inferred. assert.deepStrictEqual(coerceToType(0, "other"), 0); assert.deepStrictEqual(coerceToType("a", "other"), "a"); assert.deepStrictEqual(coerceToType(null, "other"), null); @@ -1085,6 +1090,4 @@ describe("coerceToType", () => { // Note: if type is "raw", coerceToType() will not be called. Instead, values // will be returned from coerceRow(). - // it("coerces to raw", () => { - // }); }); From 35579ddf3852787367f99d6e602e3e0da04e7432 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 10:30:47 -0800 Subject: [PATCH 59/90] Move promotion of arrays of primitives into loadTableDataSource --- src/table.js | 7 ++----- test/table-test.js | 37 ------------------------------------- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/src/table.js b/src/table.js index a9c9ce95..06eae616 100644 --- a/src/table.js +++ b/src/table.js @@ -207,6 +207,7 @@ const loadTableDataSource = sourceCache(async (source, name) => { throw new Error(`unsupported file type: ${source.mimeType}`); } if (isArrowTable(source) || isArqueroTable(source)) return loadDuckDBClient(source, name); + if (arrayIsPrimitive(source)) return Array.from(source, (value) => ({value})); return source; }); @@ -610,12 +611,9 @@ export function coerceToType(value, type) { export function __table(source, operations) { const input = source; let {schema, columns} = source; - let primitive = arrayIsPrimitive(source); - if (primitive) source = Array.from(source, (value) => ({value})); let inferredSchema = false; if (!isQueryResultSetSchema(schema)) { - if (primitive) schema = inferSchema(source, ["value"]); - else schema = inferSchema(source, columns); + schema = inferSchema(source, columns); inferredSchema = true; } // Combine column types from schema with user-selected types in operations @@ -746,7 +744,6 @@ export function __table(source, operations) { Object.fromEntries(operations.select.columns.map((c) => [c, d[c]])) ); } - if (primitive) source = source.map((d) => d.value); if (source !== input) { if (schema) source.schema = schema; if (columns) source.columns = columns; diff --git a/test/table-test.js b/test/table-test.js index acfaf3c5..411ed4e0 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -590,43 +590,6 @@ describe("__table", () => { ); }); - it("__table filter primitive lte + gte", () => { - const expectedPrimitive = [1]; - expectedPrimitive.schema = [{name: "value", type: "integer", inferred: "integer"}]; - assert.deepStrictEqual( - __table([1, 2, 3], { - ...EMPTY_TABLE_DATA.operations, - filter: [ - { - type: "eq", - operands: [ - {type: "column", value: "value"}, - {type: "resolved", value: 1} - ] - } - ] - }), - expectedPrimitive - ); - const expectedUint32Array = [1]; - expectedUint32Array.schema = [{name: "value", type: "integer", inferred: "integer"}]; - assert.deepStrictEqual( - __table(Uint32Array.of(1, 2, 3), { - ...EMPTY_TABLE_DATA.operations, - filter: [ - { - type: "eq", - operands: [ - {type: "column", value: "value"}, - {type: "resolved", value: 1} - ] - } - ] - }), - expectedUint32Array - ); - }); - it("__table filter eq date", () => { const operationsEquals = { ...EMPTY_TABLE_DATA.operations, From c3384d344286f19782cc4647ac4549a11b13042e Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 12:35:03 -0800 Subject: [PATCH 60/90] Fix names test --- test/table-test.js | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/test/table-test.js b/test/table-test.js index 31f48c97..8e113067 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -788,17 +788,33 @@ describe("__table", () => { ...EMPTY_TABLE_DATA.operations, names: [{column: "a", name: "nameA"}] }; - assert.deepStrictEqual(__table(source, operations), [{nameA: 1, b: 2, c: 3}, {nameA: 2, b: 4, c: 6}, {nameA: 3, b: 6, c: 9}]); + const expected = [ + {nameA: 1, b: 2, c: 3}, + {nameA: 2, b: 4, c: 6}, + {nameA: 3, b: 6, c: 9} + ]; + expected.schema = [ + {name: "nameA", type: "integer", inferred: "integer"}, + {name: "b", type: "integer", inferred: "integer"}, + {name: "c", type: "integer", inferred: "integer"} + ]; + assert.deepStrictEqual(__table(source, operations), expected); source.columns = ["a", "b", "c"]; - assert.deepStrictEqual( - __table(source, operations).columns, - ["nameA", "b", "c"] - ); - source.schema = [{name: "a", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}]; - assert.deepStrictEqual( - __table(source, operations).schema, - [{name: "nameA", type: "number"}, {name: "b", type: "number"}, {name: "c", type: "number"}] - ); + assert.deepStrictEqual(__table(source, operations).columns, [ + "nameA", + "b", + "c" + ]); + source.schema = [ + {name: "a", type: "integer", inferred: "integer"}, + {name: "b", type: "integer", inferred: "integer"}, + {name: "c", type: "integer", inferred: "integer"} + ]; + assert.deepStrictEqual(__table(source, operations).schema, [ + {name: "nameA", type: "integer", inferred: "integer"}, + {name: "b", type: "integer", inferred: "integer"}, + {name: "c", type: "integer", inferred: "integer"} + ]); }); }); From b5e10c7041141d70fc4dfb48b3725c058451c31d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 12:43:45 -0800 Subject: [PATCH 61/90] Update coercion of numbers and dates --- src/table.js | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/src/table.js b/src/table.js index 66473d30..a5e46baf 100644 --- a/src/table.js +++ b/src/table.js @@ -576,27 +576,16 @@ export function coerceToType(value, type) { BigInt(value); case "integer": // not a target type for coercion, but can be inferred case "number": { - return value === 0 + return typeof value === "number" ? value - : !value || isNaN(value) + : value == null || value === "" ? NaN : Number(value); } case "date": { if (value instanceof Date) return value; - if (typeof value === "string") { - const trimValue = value.trim(); - let match; - if ( - (match = trimValue.match( - /^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ - )) - ) { - if (fixTz && !!match[4] && !match[7]) - value = trimValue.replace(/-/g, "/").replace(/T/, " "); - } - } - return new Date(value); + const trimValue = typeof value === "string" ? value.trim() : value; + return new Date(trimValue); } case "array": case "object": @@ -786,11 +775,6 @@ function coerceRow(object, types, schema) { return coerced; } -// https://github.com/d3/d3-dsv/issues/45 -const fixTz = - new Date("2019-01-01T00:00").getHours() || - new Date("2019-07-01T00:00").getHours(); - function initKey() { return { other: 0, From 6bee09b37b3b0a32213203699a82bf306d7e17ae Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 12:44:09 -0800 Subject: [PATCH 62/90] Don't coerce when type is array, object, buffer, or other --- src/table.js | 2 +- test/table-test.js | 37 +++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/table.js b/src/table.js index a5e46baf..b3a3ea50 100644 --- a/src/table.js +++ b/src/table.js @@ -591,7 +591,7 @@ export function coerceToType(value, type) { case "object": case "buffer": case "other": - return value || value === 0 ? value : null; + return value; default: throw new Error(`Unable to coerce to type: ${type}`); } diff --git a/test/table-test.js b/test/table-test.js index 8e113067..d1ed5df9 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -690,7 +690,7 @@ describe("__table", () => { sort: [{column: "a", direction: "desc"}] }; const expectedDesc = [ - {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: null}, {a: null}, {a: null} + {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null} ]; expectedDesc.schema = [{name: "a", type: "other", inferred: "other"}]; assert.deepStrictEqual( @@ -702,7 +702,7 @@ describe("__table", () => { sort: [{column: "a", direction: "asc"}] }; const expectedAsc = [ - {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: null}, {a: null}, {a: null} + {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null} ]; expectedAsc.schema = [{name: "a", type: "other", inferred: "other"}]; assert.deepStrictEqual( @@ -1003,6 +1003,7 @@ describe("coerceToType", () => { it("coerces to number", () => { assert.deepStrictEqual(coerceToType("1.2", "number"), 1.2); assert.deepStrictEqual(coerceToType(0, "number"), 0); + assert.deepStrictEqual(coerceToType("", "number"), NaN); assert.deepStrictEqual(coerceToType("A", "number"), NaN); assert.deepStrictEqual(coerceToType(null, "number"), NaN); assert.deepStrictEqual(coerceToType(undefined, "number"), NaN); @@ -1062,20 +1063,6 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(NaN, "string"), "NaN"); }); - it("coerces to array", () => { - // "array" is not a target type for coercion, but can be inferred. - assert.deepStrictEqual(coerceToType([1,2,3], "array"), [1,2,3]); - assert.deepStrictEqual(coerceToType(null, "array"), null); - assert.deepStrictEqual(coerceToType(undefined, "array"), null); - }); - - it("coerces to object", () => { - // "object" is not a target type for coercion, but can be inferred. - assert.deepStrictEqual(coerceToType({a: 1, b: 2}, "object"), {a: 1, b: 2}); - assert.deepStrictEqual(coerceToType(null, "object"), null); - assert.deepStrictEqual(coerceToType(undefined, "object"), null); - }); - it("coerces to bigint", () => { assert.deepStrictEqual(coerceToType("32", "bigint"), 32n); assert.deepStrictEqual(coerceToType(32n, "bigint"), 32n); @@ -1089,6 +1076,20 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(NaN, "bigint"), NaN); }); + it("coerces to array", () => { + // "array" is not a target type for coercion, but can be inferred. + assert.deepStrictEqual(coerceToType([1,2,3], "array"), [1,2,3]); + assert.deepStrictEqual(coerceToType(null, "array"), null); + assert.deepStrictEqual(coerceToType(undefined, "array"), undefined); + }); + + it("coerces to object", () => { + // "object" is not a target type for coercion, but can be inferred. + assert.deepStrictEqual(coerceToType({a: 1, b: 2}, "object"), {a: 1, b: 2}); + assert.deepStrictEqual(coerceToType(null, "object"), null); + assert.deepStrictEqual(coerceToType(undefined, "object"), undefined); + }); + it("coerces to buffer", () => { // "buffer" is not a target type for coercion, but can be inferred. assert.deepStrictEqual( @@ -1097,7 +1098,7 @@ describe("coerceToType", () => { ); assert.deepStrictEqual(coerceToType("A", "buffer"), "A"); assert.deepStrictEqual(coerceToType(null, "buffer"), null); - assert.deepStrictEqual(coerceToType(undefined, "buffer"), null); + assert.deepStrictEqual(coerceToType(undefined, "buffer"), undefined); }); it("coerces to other", () => { @@ -1105,7 +1106,7 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(0, "other"), 0); assert.deepStrictEqual(coerceToType("a", "other"), "a"); assert.deepStrictEqual(coerceToType(null, "other"), null); - assert.deepStrictEqual(coerceToType(undefined, "other"), null); + assert.deepStrictEqual(coerceToType(undefined, "other"), undefined); }); // Note: if type is "raw", coerceToType() will not be called. Instead, values From caba589e37f895d57abf22beff8c3387249d0a84 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 15:03:02 -0800 Subject: [PATCH 63/90] Add isDataArray check before arrayIsPrimitive --- src/table.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index b3a3ea50..dff6baaa 100644 --- a/src/table.js +++ b/src/table.js @@ -207,7 +207,8 @@ const loadTableDataSource = sourceCache(async (source, name) => { throw new Error(`unsupported file type: ${source.mimeType}`); } if (isArrowTable(source) || isArqueroTable(source)) return loadDuckDBClient(source, name); - if (arrayIsPrimitive(source)) return Array.from(source, (value) => ({value})); + if (isDataArray(source) && arrayIsPrimitive(source)) + return Array.from(source, (value) => ({value})); return source; }); From f8a2544355009e9c651de54f1dc419af26e8403c Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 15:04:36 -0800 Subject: [PATCH 64/90] Handle whitespace-only strings as well --- src/table.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index dff6baaa..33abaf93 100644 --- a/src/table.js +++ b/src/table.js @@ -579,7 +579,7 @@ export function coerceToType(value, type) { case "number": { return typeof value === "number" ? value - : value == null || value === "" + : value == null || (typeof value === "string" && !value) ? NaN : Number(value); } From b771cc033b901e01c5f53a77cacfe0f26863b3d7 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 15:22:04 -0800 Subject: [PATCH 65/90] Tighten up date regex and use test instead of match --- src/table.js | 5 +---- test/table-test.js | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/table.js b/src/table.js index 33abaf93..27549721 100644 --- a/src/table.js +++ b/src/table.js @@ -840,10 +840,7 @@ export function inferSchema(source, columns = getAllKeys(source)) { else typeCounts[col].number++; } else if (/^\d+n$/.test(value)) typeCounts[col].bigint++; else if ( - value && - value.match( - /^(([-+]\d{2})?\d{4}(-\d{1,2}(-\d{1,2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/ - ) + /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) ) typeCounts[col].date++; // the long regex accepts dates in the form of ISOString and diff --git a/test/table-test.js b/test/table-test.js index d1ed5df9..d26d40ac 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -935,7 +935,7 @@ describe("inferSchema", () => { it("infers dates", () => { assert.deepStrictEqual( inferSchema( - [{a: "1/2/20"}, {a: "2020-11-12 12:23:00"}, {a: new Date()}, {a: "2020-1-12"}] + [{a: "1/2/20"}, {a: "2020-11-12 12:23:00"}, {a: new Date()}, {a: "2020-01-12"}] ), [{name: "a", type: "date", inferred: "date"}] ); From 3055ebb8f94083490882c79ab4df530f4cc598d4 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 15:40:13 -0800 Subject: [PATCH 66/90] Repeat date regex when coercing --- src/table.js | 4 +++- test/table-test.js | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 27549721..cb2ad949 100644 --- a/src/table.js +++ b/src/table.js @@ -586,7 +586,9 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date) return value; const trimValue = typeof value === "string" ? value.trim() : value; - return new Date(trimValue); + return value && /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(trimValue) + ? new Date(trimValue) + : new Date(""); } case "array": case "object": diff --git a/test/table-test.js b/test/table-test.js index d26d40ac..e5cb8930 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1024,7 +1024,7 @@ describe("coerceToType", () => { }); it("coerces to date", () => { - const invalidDate = new Date("a"); + const invalidDate = new Date(""); assert.deepStrictEqual( coerceToType("12/12/2020", "date"), new Date("12/12/2020") @@ -1048,7 +1048,11 @@ describe("coerceToType", () => { ); assert.deepStrictEqual( coerceToType(null, "date").toString(), - new Date(null).toString() + invalidDate.toString() + ); + assert.deepStrictEqual( + coerceToType("2020-1-12", "date").toString(), + invalidDate.toString() ); }); From 750ef2a1881a4f73d0b227ef343cd3b04fae10aa Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 16:10:36 -0800 Subject: [PATCH 67/90] Move bulk of inference to new inferType function --- src/table.js | 58 ++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/src/table.js b/src/table.js index cb2ad949..727825c5 100644 --- a/src/table.js +++ b/src/table.js @@ -820,35 +820,9 @@ export function inferSchema(source, columns = getAllKeys(source)) { for (const d of sample) { for (const col of columns) { if (!typeCounts[col]) typeCounts[col] = initKey(); - // for json and sqlite, we already have some types, but for csv and tsv, all - // columns are strings here. const type = typeof d[col]; const value = type === "string" ? d[col].trim() : d[col]; - if (type !== "string") { - if (Array.isArray(value)) typeCounts[col].array++; - else if (value instanceof Date) typeCounts[col].date++; - else if (value instanceof ArrayBuffer) typeCounts[col].buffer++; - else if (type === "number") { - if (Number.isInteger(+value)) typeCounts[col].integer++; - else typeCounts[col].number++; - } - // bigint, boolean, or object - else if (type in typeCounts[col]) typeCounts[col][type]++; - else if (value !== null && value !== undefined) typeCounts[col].other++; - } else { - if (value === "true" || value === "false") typeCounts[col].boolean++; - else if (value && !isNaN(value)) { - if (Number.isInteger(+value)) typeCounts[col].integer++; - else typeCounts[col].number++; - } else if (/^\d+n$/.test(value)) typeCounts[col].bigint++; - else if ( - /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) - ) - typeCounts[col].date++; - // the long regex accepts dates in the form of ISOString and - // LocaleDateString, with or without times - else if (value) typeCounts[col].string++; - } + typeCounts[col][inferType(type, value)]++; } } for (const col in typeCounts) { @@ -870,3 +844,33 @@ export function inferSchema(source, columns = getAllKeys(source)) { } return schema; } + +function inferType(type, value) { + // for json and sqlite, we already have some types, but for csv and tsv, all + // columns are strings here. + const typedNonStrings = ["bigint", "boolean", "object"]; + if (type !== "string") { + if (Array.isArray(value)) return "array"; + else if (value instanceof Date) return "date"; + else if (value instanceof ArrayBuffer) return "buffer"; + else if (type === "number") { + if (Number.isInteger(+value)) return "integer"; + else return "number"; + } + else if (typedNonStrings.includes(type)) return type; + else if (value !== null && value !== undefined) return "other"; + } else { + if (value === "true" || value === "false") return "boolean"; + else if (value && !isNaN(value)) { + if (Number.isInteger(+value)) return "integer"; + else return "number"; + } else if (/^\d+n$/.test(value)) return "bigint"; + else if ( + /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) + ) + return "date"; + // the long regex accepts dates in the form of ISOString and + // LocaleDateString, with or without times + else if (value) return "string"; + } +} \ No newline at end of file From 3131fdb6ace75de2a080d3ef301f9bba731b92a6 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 17:08:28 -0800 Subject: [PATCH 68/90] Only use defined values in the denominator for 90% check --- src/table.js | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/table.js b/src/table.js index 727825c5..e3586b50 100644 --- a/src/table.js +++ b/src/table.js @@ -826,16 +826,11 @@ export function inferSchema(source, columns = getAllKeys(source)) { } } for (const col in typeCounts) { - let type = greatest( - Object.keys(typeCounts[col]), - (d) => typeCounts[col][d] - ); + let type = greatest(Object.keys(typeCounts[col]), d => typeCounts[col][d]); + const numDefined = sample.filter(d => !(d[col] == null || d[col] === "")).length; // If over 90% of the sampled data counted as this type, use it. Otherwise, // use "other." - type = - typeCounts[col][type] / Math.min(source.length, sampleSize) >= 0.9 - ? type - : "other"; + type = typeCounts[col][type] / numDefined >= 0.9 ? type : "other"; schema.push({ name: col, type: type, From b77df67c8b959731437629d3ec25057be99061aa Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 17:09:38 -0800 Subject: [PATCH 69/90] Default to "other" rather than getting undefined as key on typeCounts --- src/table.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index e3586b50..e9b51356 100644 --- a/src/table.js +++ b/src/table.js @@ -853,7 +853,7 @@ function inferType(type, value) { else return "number"; } else if (typedNonStrings.includes(type)) return type; - else if (value !== null && value !== undefined) return "other"; + else return "other"; } else { if (value === "true" || value === "false") return "boolean"; else if (value && !isNaN(value)) { @@ -867,5 +867,6 @@ function inferType(type, value) { // the long regex accepts dates in the form of ISOString and // LocaleDateString, with or without times else if (value) return "string"; + else return "other"; } } \ No newline at end of file From 9c32bd3b504a14c462a4eacc2255b86653b6dc6d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 17:09:59 -0800 Subject: [PATCH 70/90] Add value check back and tighten up date regex a bit --- src/table.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index e9b51356..8147ef94 100644 --- a/src/table.js +++ b/src/table.js @@ -861,7 +861,8 @@ function inferType(type, value) { else return "number"; } else if (/^\d+n$/.test(value)) return "bigint"; else if ( - /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) + value && + /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) ) return "date"; // the long regex accepts dates in the form of ISOString and From 329465e652978c8dbf7d8950ccca74aa5ee3c9fd Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 17:16:35 -0800 Subject: [PATCH 71/90] Update coercion to BigInt --- src/table.js | 14 +++++--------- test/table-test.js | 10 +++++----- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/src/table.js b/src/table.js index 8147ef94..f2d91c84 100644 --- a/src/table.js +++ b/src/table.js @@ -566,15 +566,11 @@ export function coerceToType(value, type) { ? value : Boolean(value); case "bigint": - return typeof value === "bigint" + return typeof value === "bigint" || value == null ? value - : value === 0 || value === true || value === false - ? // eslint-disable-next-line no-undef - BigInt(value) - : !value || isNaN(value) || !Number.isInteger(+value) - ? NaN - : // eslint-disable-next-line no-undef - BigInt(value); + : Number.isInteger(typeof value === "string" && !value ? NaN : +value) + ? BigInt(value) // eslint-disable-line no-undef + : undefined; case "integer": // not a target type for coercion, but can be inferred case "number": { return typeof value === "number" @@ -586,7 +582,7 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date) return value; const trimValue = typeof value === "string" ? value.trim() : value; - return value && /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(trimValue) + return value && /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(trimValue) ? new Date(trimValue) : new Date(""); } diff --git a/test/table-test.js b/test/table-test.js index e5cb8930..2de93587 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1073,11 +1073,11 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(0, "bigint"), 0n); assert.deepStrictEqual(coerceToType(false, "bigint"), 0n); assert.deepStrictEqual(coerceToType(true, "bigint"), 1n); - assert.deepStrictEqual(coerceToType(null, "bigint"), NaN); - assert.deepStrictEqual(coerceToType(undefined, "bigint"), NaN); - assert.deepStrictEqual(coerceToType(1.1, "bigint"), NaN); - assert.deepStrictEqual(coerceToType("A", "bigint"), NaN); - assert.deepStrictEqual(coerceToType(NaN, "bigint"), NaN); + assert.deepStrictEqual(coerceToType(null, "bigint"), null); + assert.deepStrictEqual(coerceToType(undefined, "bigint"), undefined); + assert.deepStrictEqual(coerceToType(1.1, "bigint"), undefined); + assert.deepStrictEqual(coerceToType("A", "bigint"), undefined); + assert.deepStrictEqual(coerceToType(NaN, "bigint"), undefined); }); it("coerces to array", () => { From 4600d6ea8232eeeec8b4a9757824e544e29cb893 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 1 Feb 2023 17:39:01 -0800 Subject: [PATCH 72/90] Fix date regex --- src/table.js | 5 ++--- test/table-test.js | 10 +++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/table.js b/src/table.js index f2d91c84..714162bb 100644 --- a/src/table.js +++ b/src/table.js @@ -582,7 +582,7 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date) return value; const trimValue = typeof value === "string" ? value.trim() : value; - return value && /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(trimValue) + return /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(trimValue) ? new Date(trimValue) : new Date(""); } @@ -857,8 +857,7 @@ function inferType(type, value) { else return "number"; } else if (/^\d+n$/.test(value)) return "bigint"; else if ( - value && - /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))?([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) + /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) ) return "date"; // the long regex accepts dates in the form of ISOString and diff --git a/test/table-test.js b/test/table-test.js index 2de93587..17701bfe 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1039,17 +1039,25 @@ describe("coerceToType", () => { new Date("2022-01-01T12:34:00Z") ); assert.deepStrictEqual( - coerceToType("B", "date").toString(), + coerceToType("", "date").toString(), invalidDate.toString() ); assert.deepStrictEqual( coerceToType({a: 1}, "date").toString(), invalidDate.toString() ); + assert.deepStrictEqual( + coerceToType(undefined, "date").toString(), + invalidDate.toString() + ); assert.deepStrictEqual( coerceToType(null, "date").toString(), invalidDate.toString() ); + assert.deepStrictEqual( + coerceToType(true, "date").toString(), + invalidDate.toString() + ); assert.deepStrictEqual( coerceToType("2020-1-12", "date").toString(), invalidDate.toString() From 8eccbd8e3b932ca97d170a688eac7447db5579c8 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 08:40:15 -0800 Subject: [PATCH 73/90] Move date regex to constant --- src/table.js | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/table.js b/src/table.js index 714162bb..63bdb817 100644 --- a/src/table.js +++ b/src/table.js @@ -551,6 +551,9 @@ export function getTypeValidator(colType) { } } +// Accepts dates in the form of ISOString and LocaleDateString, with or without time +const DATE_TEST = /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/; + export function coerceToType(value, type) { switch (type) { case "string": @@ -582,7 +585,7 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date) return value; const trimValue = typeof value === "string" ? value.trim() : value; - return /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(trimValue) + return DATE_TEST.test(trimValue) ? new Date(trimValue) : new Date(""); } @@ -856,12 +859,7 @@ function inferType(type, value) { if (Number.isInteger(+value)) return "integer"; else return "number"; } else if (/^\d+n$/.test(value)) return "bigint"; - else if ( - /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d{2,4}))([T ]\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/.test(value) - ) - return "date"; - // the long regex accepts dates in the form of ISOString and - // LocaleDateString, with or without times + else if (DATE_TEST.test(value)) return "date"; else if (value) return "string"; else return "other"; } From 154e20e4c88f7a1a6846ba051760e3167bfb1d3d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 08:46:20 -0800 Subject: [PATCH 74/90] Move trim to inferType function --- src/table.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/table.js b/src/table.js index 63bdb817..47dd3dda 100644 --- a/src/table.js +++ b/src/table.js @@ -819,9 +819,7 @@ export function inferSchema(source, columns = getAllKeys(source)) { for (const d of sample) { for (const col of columns) { if (!typeCounts[col]) typeCounts[col] = initKey(); - const type = typeof d[col]; - const value = type === "string" ? d[col].trim() : d[col]; - typeCounts[col][inferType(type, value)]++; + typeCounts[col][inferType(d[col])]++; } } for (const col in typeCounts) { @@ -839,7 +837,9 @@ export function inferSchema(source, columns = getAllKeys(source)) { return schema; } -function inferType(type, value) { +function inferType(colValue) { + const type = typeof colValue; + const value = type === "string" ? colValue.trim() : colValue; // for json and sqlite, we already have some types, but for csv and tsv, all // columns are strings here. const typedNonStrings = ["bigint", "boolean", "object"]; From d82b2f80d994f1e4e22efa29b2189d753ef5c302 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 09:29:21 -0800 Subject: [PATCH 75/90] Update coercion of dates --- src/table.js | 9 ++++----- test/table-test.js | 16 +++++++--------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/table.js b/src/table.js index 47dd3dda..9b1aec87 100644 --- a/src/table.js +++ b/src/table.js @@ -583,11 +583,10 @@ export function coerceToType(value, type) { : Number(value); } case "date": { - if (value instanceof Date) return value; - const trimValue = typeof value === "string" ? value.trim() : value; - return DATE_TEST.test(trimValue) - ? new Date(trimValue) - : new Date(""); + if (value instanceof Date || value == null) return value; + if (typeof value === "number") return new Date(value); + const trimValue = String(value).trim(); + return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN); } case "array": case "object": diff --git a/test/table-test.js b/test/table-test.js index 17701bfe..ed57b48e 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1024,7 +1024,7 @@ describe("coerceToType", () => { }); it("coerces to date", () => { - const invalidDate = new Date(""); + const invalidDate = new Date(NaN); assert.deepStrictEqual( coerceToType("12/12/2020", "date"), new Date("12/12/2020") @@ -1046,14 +1046,6 @@ describe("coerceToType", () => { coerceToType({a: 1}, "date").toString(), invalidDate.toString() ); - assert.deepStrictEqual( - coerceToType(undefined, "date").toString(), - invalidDate.toString() - ); - assert.deepStrictEqual( - coerceToType(null, "date").toString(), - invalidDate.toString() - ); assert.deepStrictEqual( coerceToType(true, "date").toString(), invalidDate.toString() @@ -1062,6 +1054,12 @@ describe("coerceToType", () => { coerceToType("2020-1-12", "date").toString(), invalidDate.toString() ); + assert.deepStrictEqual( + coerceToType(1675356739000, "date"), + new Date(1675356739000) + ); + assert.deepStrictEqual(coerceToType(undefined, "date"), undefined); + assert.deepStrictEqual(coerceToType(null, "date"), null); }); it("coerces to string", () => { From 2f2ee5e9acb954f0614b0095a153eeffe8e105b5 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 09:35:13 -0800 Subject: [PATCH 76/90] Don't have inferType fall back to "other" --- src/table.js | 3 +-- test/table-test.js | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 9b1aec87..ef66a5f6 100644 --- a/src/table.js +++ b/src/table.js @@ -851,7 +851,7 @@ function inferType(colValue) { else return "number"; } else if (typedNonStrings.includes(type)) return type; - else return "other"; + else if (value) return "other"; } else { if (value === "true" || value === "false") return "boolean"; else if (value && !isNaN(value)) { @@ -860,6 +860,5 @@ function inferType(colValue) { } else if (/^\d+n$/.test(value)) return "bigint"; else if (DATE_TEST.test(value)) return "date"; else if (value) return "string"; - else return "other"; } } \ No newline at end of file diff --git a/test/table-test.js b/test/table-test.js index ed57b48e..6ae1e129 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -911,7 +911,7 @@ describe("inferSchema", () => { ] ), [ - {name: "a", type: "other", inferred: "other"}, + {name: "a", type: "integer", inferred: "integer"}, {name: "b", type: "integer", inferred: "integer"}, {name: "c", type: "integer", inferred: "integer"} ] From 6eae9d361b193fc4029aef1912ca68430c8fae95 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 09:39:22 -0800 Subject: [PATCH 77/90] Coerce empty strings to null when type is "date" --- src/table.js | 1 + test/table-test.js | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/table.js b/src/table.js index ef66a5f6..01df494a 100644 --- a/src/table.js +++ b/src/table.js @@ -585,6 +585,7 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date || value == null) return value; if (typeof value === "number") return new Date(value); + if (typeof value === "string" && !value) return null; const trimValue = String(value).trim(); return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN); } diff --git a/test/table-test.js b/test/table-test.js index 6ae1e129..0866f303 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1038,10 +1038,6 @@ describe("coerceToType", () => { coerceToType("2022-01-01T12:34:00Z", "date"), new Date("2022-01-01T12:34:00Z") ); - assert.deepStrictEqual( - coerceToType("", "date").toString(), - invalidDate.toString() - ); assert.deepStrictEqual( coerceToType({a: 1}, "date").toString(), invalidDate.toString() @@ -1060,6 +1056,7 @@ describe("coerceToType", () => { ); assert.deepStrictEqual(coerceToType(undefined, "date"), undefined); assert.deepStrictEqual(coerceToType(null, "date"), null); + assert.deepStrictEqual(coerceToType("", "date"), null); }); it("coerces to string", () => { From 30ba6e5b329e6ccd05233ebad36d72149ca136ec Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 10:09:32 -0800 Subject: [PATCH 78/90] Case-insensitive boolean inference/coercion --- src/table.js | 16 +++++++++------- test/table-test.js | 7 +++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/table.js b/src/table.js index 01df494a..94e6a358 100644 --- a/src/table.js +++ b/src/table.js @@ -557,13 +557,15 @@ const DATE_TEST = /^(([-+]\d{2})?\d{4}(-\d{2}(-\d{2}))|(\d{1,2})\/(\d{1,2})\/(\d export function coerceToType(value, type) { switch (type) { case "string": - return typeof value === "string" || value == null - ? value - : String(value); + return typeof value === "string" || value == null ? value : String(value); case "boolean": if (typeof value === "string") { const trimValue = value.trim(); - return trimValue === "true" ? true : trimValue === "false" ? false : null; + return trimValue.toLowerCase() === "true" + ? true + : trimValue.toLowerCase() === "false" + ? false + : null; } return typeof value === "boolean" || value == null ? value @@ -850,11 +852,11 @@ function inferType(colValue) { else if (type === "number") { if (Number.isInteger(+value)) return "integer"; else return "number"; - } - else if (typedNonStrings.includes(type)) return type; + } else if (typedNonStrings.includes(type)) return type; else if (value) return "other"; } else { - if (value === "true" || value === "false") return "boolean"; + if (value.toLowerCase() === "true" || value.toLowerCase() === "false") + return "boolean"; else if (value && !isNaN(value)) { if (Number.isInteger(+value)) return "integer"; else return "number"; diff --git a/test/table-test.js b/test/table-test.js index 0866f303..9648086b 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1012,12 +1012,15 @@ describe("coerceToType", () => { it("coerces to boolean", () => { assert.deepStrictEqual(coerceToType("true", "boolean"), true); - assert.deepStrictEqual(coerceToType("true ", "boolean"), true); + assert.deepStrictEqual(coerceToType("True ", "boolean"), true); assert.deepStrictEqual(coerceToType(true, "boolean"), true); - assert.deepStrictEqual(coerceToType("false", "boolean"), false); + assert.deepStrictEqual(coerceToType("False", "boolean"), false); assert.deepStrictEqual(coerceToType(false, "boolean"), false); assert.deepStrictEqual(coerceToType(1, "boolean"), true); + assert.deepStrictEqual(coerceToType(2, "boolean"), true); assert.deepStrictEqual(coerceToType(0, "boolean"), false); + assert.deepStrictEqual(coerceToType({}, "boolean"), true); + assert.deepStrictEqual(coerceToType(new Date(), "boolean"), true); assert.deepStrictEqual(coerceToType("A", "boolean"), null); assert.deepStrictEqual(coerceToType(null, "boolean"), null); assert.deepStrictEqual(coerceToType(undefined, "boolean"), undefined); From 7d0f114f54b69f4308f4992716d6411be34ee2bf Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 12:27:23 -0800 Subject: [PATCH 79/90] Allow multiple types to be counted during inference --- src/table.js | 52 +++++++++++++++++++++------------------------- test/table-test.js | 29 ++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 32 deletions(-) diff --git a/src/table.js b/src/table.js index 94e6a358..09b5e87c 100644 --- a/src/table.js +++ b/src/table.js @@ -821,7 +821,30 @@ export function inferSchema(source, columns = getAllKeys(source)) { for (const d of sample) { for (const col of columns) { if (!typeCounts[col]) typeCounts[col] = initKey(); - typeCounts[col][inferType(d[col])]++; + const type = typeof d[col]; + const value = type === "string" ? d[col].trim() : d[col]; + if (type !== "string") { + if (Array.isArray(value)) ++typeCounts[col].array; + else if (value instanceof Date) ++typeCounts[col].date; + else if (value instanceof ArrayBuffer) ++typeCounts[col].buffer; + else if (type === "number") { + ++typeCounts[col].number; + if (Number.isInteger(+value)) ++typeCounts[col].integer; + } + // bigint, boolean, or object + else if (type in typeCounts[col]) ++typeCounts[col][type]; + else if (value !== null && value !== undefined) ++typeCounts[col].other; + } else { + if (value.toLowerCase() === "true" || value.toLowerCase() === "false") { + ++typeCounts[col].string; + ++typeCounts[col].boolean; + } else if (value && !isNaN(value)) { + ++typeCounts[col].number; + if (Number.isInteger(+value)) ++typeCounts[col].integer; + } else if (/^\d+n$/.test(value)) ++typeCounts[col].bigint; + else if (DATE_TEST.test(value)) ++typeCounts[col].date; + else if (value) ++typeCounts[col].string; + } } } for (const col in typeCounts) { @@ -838,30 +861,3 @@ export function inferSchema(source, columns = getAllKeys(source)) { } return schema; } - -function inferType(colValue) { - const type = typeof colValue; - const value = type === "string" ? colValue.trim() : colValue; - // for json and sqlite, we already have some types, but for csv and tsv, all - // columns are strings here. - const typedNonStrings = ["bigint", "boolean", "object"]; - if (type !== "string") { - if (Array.isArray(value)) return "array"; - else if (value instanceof Date) return "date"; - else if (value instanceof ArrayBuffer) return "buffer"; - else if (type === "number") { - if (Number.isInteger(+value)) return "integer"; - else return "number"; - } else if (typedNonStrings.includes(type)) return type; - else if (value) return "other"; - } else { - if (value.toLowerCase() === "true" || value.toLowerCase() === "false") - return "boolean"; - else if (value && !isNaN(value)) { - if (Number.isInteger(+value)) return "integer"; - else return "number"; - } else if (/^\d+n$/.test(value)) return "bigint"; - else if (DATE_TEST.test(value)) return "date"; - else if (value) return "string"; - } -} \ No newline at end of file diff --git a/test/table-test.js b/test/table-test.js index 9648086b..02cf41d7 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -690,9 +690,9 @@ describe("__table", () => { sort: [{column: "a", direction: "desc"}] }; const expectedDesc = [ - {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: null}, {a: undefined}, {a: NaN}, {a: null} + {a: 20}, {a: 10}, {a: 5}, {a: 1}, {a: NaN}, {a: NaN}, {a: NaN}, {a: NaN} ]; - expectedDesc.schema = [{name: "a", type: "other", inferred: "other"}]; + expectedDesc.schema = [{name: "a", type: "number", inferred: "number"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsDesc), expectedDesc @@ -702,9 +702,9 @@ describe("__table", () => { sort: [{column: "a", direction: "asc"}] }; const expectedAsc = [ - {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: null}, {a: undefined}, {a: NaN}, {a: null} + {a: 1}, {a: 5}, {a: 10}, {a: 20}, {a: NaN}, {a: NaN}, {a: NaN}, {a: NaN} ]; - expectedAsc.schema = [{name: "a", type: "other", inferred: "other"}]; + expectedAsc.schema = [{name: "a", type: "number", inferred: "number"}]; assert.deepStrictEqual( __table(sourceWithMissing, operationsAsc), expectedAsc @@ -986,6 +986,27 @@ describe("inferSchema", () => { [{name: "a", type: "other", inferred: "other"}] ); }); + + it("infers mixed integers and numbers as numbers", () => { + assert.deepStrictEqual( + inferSchema([0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5].map((x) => ({x}))), + [{name: "x", type: "number", inferred: "number"}] + ); + }); + + it("infers mixed integers and NaNs as numbers", () => { + assert.deepStrictEqual( + inferSchema([NaN, NaN, NaN, 1, 2, 3, 4, 5].map((x) => ({x}))), + [{name: "x", type: "number", inferred: "number"}] + ); + }); + + it("infers boolean-ish strings and strings as strings", () => { + assert.deepStrictEqual( + inferSchema(["true", "false", "pants on fire"].map((x) => ({x}))), + [{name: "x", type: "string", inferred: "string"}] + ); + }); }); describe("coerceToType", () => { From 9bff937fe0664122b7db086c52ba4da3a53bd890 Mon Sep 17 00:00:00 2001 From: Libbey White <111310561+libbey-observable@users.noreply.github.com> Date: Thu, 2 Feb 2023 12:53:03 -0800 Subject: [PATCH 80/90] Update src/table.js Co-authored-by: Mike Bostock --- src/table.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/table.js b/src/table.js index 09b5e87c..59dc745c 100644 --- a/src/table.js +++ b/src/table.js @@ -560,10 +560,10 @@ export function coerceToType(value, type) { return typeof value === "string" || value == null ? value : String(value); case "boolean": if (typeof value === "string") { - const trimValue = value.trim(); - return trimValue.toLowerCase() === "true" + const trimValue = value.trim().toLowerCase(); + return trimValue === "true" ? true - : trimValue.toLowerCase() === "false" + : trimValue === "false" ? false : null; } From 5c4bc45e8f3c6ae65a3eef315ec3ec9cfbbbdb78 Mon Sep 17 00:00:00 2001 From: Libbey White <111310561+libbey-observable@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:01:16 -0800 Subject: [PATCH 81/90] Update src/table.js Co-authored-by: Mike Bostock --- src/table.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 59dc745c..6b8c0182 100644 --- a/src/table.js +++ b/src/table.js @@ -829,7 +829,7 @@ export function inferSchema(source, columns = getAllKeys(source)) { else if (value instanceof ArrayBuffer) ++typeCounts[col].buffer; else if (type === "number") { ++typeCounts[col].number; - if (Number.isInteger(+value)) ++typeCounts[col].integer; + if (Number.isInteger(value)) ++typeCounts[col].integer; } // bigint, boolean, or object else if (type in typeCounts[col]) ++typeCounts[col][type]; From 8344ef6bf407f05535ee9e94dd6d9d733eeeb2f7 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 13:13:23 -0800 Subject: [PATCH 82/90] Clean up trim and lower casing --- src/table.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/table.js b/src/table.js index 6b8c0182..deb21b78 100644 --- a/src/table.js +++ b/src/table.js @@ -822,7 +822,7 @@ export function inferSchema(source, columns = getAllKeys(source)) { for (const col of columns) { if (!typeCounts[col]) typeCounts[col] = initKey(); const type = typeof d[col]; - const value = type === "string" ? d[col].trim() : d[col]; + let value = d[col]; if (type !== "string") { if (Array.isArray(value)) ++typeCounts[col].array; else if (value instanceof Date) ++typeCounts[col].date; @@ -835,7 +835,8 @@ export function inferSchema(source, columns = getAllKeys(source)) { else if (type in typeCounts[col]) ++typeCounts[col][type]; else if (value !== null && value !== undefined) ++typeCounts[col].other; } else { - if (value.toLowerCase() === "true" || value.toLowerCase() === "false") { + value = value.trim(); + if (/^(true|false)$/i.test(value)) { ++typeCounts[col].string; ++typeCounts[col].boolean; } else if (value && !isNaN(value)) { From 0b21fac4b472cfd21107eaec62e0fb34cc461c4d Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 13:27:47 -0800 Subject: [PATCH 83/90] Use trimmed string in filter --- src/table.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index deb21b78..41e426bc 100644 --- a/src/table.js +++ b/src/table.js @@ -850,7 +850,13 @@ export function inferSchema(source, columns = getAllKeys(source)) { } for (const col in typeCounts) { let type = greatest(Object.keys(typeCounts[col]), d => typeCounts[col][d]); - const numDefined = sample.filter(d => !(d[col] == null || d[col] === "")).length; + const numDefined = sample.filter( + (d) => + !( + d[col] == null || + (typeof d[col] === "string" && d[col].trim() === "") + ) + ).length; // If over 90% of the sampled data counted as this type, use it. Otherwise, // use "other." type = typeCounts[col][type] / numDefined >= 0.9 ? type : "other"; From 9f9a3f2b77b5df3d34fde4d47bca1771511fea2e Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Thu, 2 Feb 2023 14:53:11 -0800 Subject: [PATCH 84/90] checkpoint --- src/table.js | 79 ++++++++++++++++++++++++++++------------------ test/table-test.js | 7 ++++ 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/src/table.js b/src/table.js index 41e426bc..a0624f1d 100644 --- a/src/table.js +++ b/src/table.js @@ -626,7 +626,7 @@ export function __table(source, operations) { source = source.map(d => coerceRow(d, types, schema)); } else if (inferredSchema) { // Coerce data according to new schema, unless that happened due to - // operations.type, above. + // operations.type, above. source = source.map(d => coerceRow(d, types, schema)); } for (const {type, operands} of operations.filter) { @@ -779,9 +779,8 @@ function coerceRow(object, types, schema) { return coerced; } -function initKey() { +function createTypeCount() { return { - other: 0, boolean: 0, integer: 0, number: 0, @@ -790,10 +789,27 @@ function initKey() { array: 0, object: 0, bigint: 0, - buffer: 0 + buffer: 0, + defined: 0 }; } +// Caution: the order below matters! 🌶️ The first one that passes the ≥90% test +// should be the one that we chose, and therefore these types should be listed +// from most specific to least specific. +const types = [ + "boolean", + "integer", + "number", + "date", + "bigint", + "array", + "object", + "buffer", + "string" // should probably always be last since it’s least specific + // Note: "other" is intentionally omitted; it is handed specially! +]; + // We need to show *all* keys present in the array of Objects function getAllKeys(rows) { const keys = new Set(); @@ -818,48 +834,49 @@ export function inferSchema(source, columns = getAllKeys(source)) { const sampleSize = 100; let sample = source.slice(0, sampleSize); const typeCounts = {}; + for (const col of columns) typeCounts[col] = createTypeCount(); + // TODO invert order of these loops? for (const d of sample) { for (const col of columns) { - if (!typeCounts[col]) typeCounts[col] = initKey(); - const type = typeof d[col]; let value = d[col]; + if (value == null) continue; + const colCount = typeCounts[col]; + const type = typeof value; if (type !== "string") { - if (Array.isArray(value)) ++typeCounts[col].array; - else if (value instanceof Date) ++typeCounts[col].date; - else if (value instanceof ArrayBuffer) ++typeCounts[col].buffer; + ++colCount.defined; + if (Array.isArray(value)) ++colCount.array; + else if (value instanceof Date) ++colCount.date; + else if (value instanceof ArrayBuffer) ++colCount.buffer; else if (type === "number") { - ++typeCounts[col].number; - if (Number.isInteger(value)) ++typeCounts[col].integer; + ++colCount.number; + if (Number.isInteger(value)) ++colCount.integer; } // bigint, boolean, or object - else if (type in typeCounts[col]) ++typeCounts[col][type]; - else if (value !== null && value !== undefined) ++typeCounts[col].other; + else if (type in colCount) ++colCount[type]; } else { value = value.trim(); + if (!value) continue; + ++colCount.defined; + ++colCount.string; if (/^(true|false)$/i.test(value)) { - ++typeCounts[col].string; - ++typeCounts[col].boolean; + ++colCount.boolean; } else if (value && !isNaN(value)) { - ++typeCounts[col].number; - if (Number.isInteger(+value)) ++typeCounts[col].integer; - } else if (/^\d+n$/.test(value)) ++typeCounts[col].bigint; - else if (DATE_TEST.test(value)) ++typeCounts[col].date; - else if (value) ++typeCounts[col].string; + ++colCount.number; + if (Number.isInteger(+value)) ++colCount.integer; + } else if (/^\d+n$/.test(value)) ++colCount.bigint; + else if (DATE_TEST.test(value)) ++colCount.date; } } } for (const col in typeCounts) { - let type = greatest(Object.keys(typeCounts[col]), d => typeCounts[col][d]); - const numDefined = sample.filter( - (d) => - !( - d[col] == null || - (typeof d[col] === "string" && d[col].trim() === "") - ) - ).length; - // If over 90% of the sampled data counted as this type, use it. Otherwise, - // use "other." - type = typeCounts[col][type] / numDefined >= 0.9 ? type : "other"; + const colCount = typeCounts[col]; + // Chose the type with the greatest count that is also ≥90%; or, if no type + // meets that criterion, fallback to other. + const minCount = colCount.defined * 0.9; + const type = + greatest(types, (type) => + colCount[type] >= minCount ? colCount[type] : NaN + ) ?? "other"; schema.push({ name: col, type: type, diff --git a/test/table-test.js b/test/table-test.js index 02cf41d7..34ec1de0 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1001,6 +1001,13 @@ describe("inferSchema", () => { ); }); + it("infers mixed integers and strings as integers", () => { + assert.deepStrictEqual( + inferSchema(["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "x"].map((x) => ({x}))), + [{name: "x", type: "integer", inferred: "integer"}] + ); + }); + it("infers boolean-ish strings and strings as strings", () => { assert.deepStrictEqual( inferSchema(["true", "false", "pants on fire"].map((x) => ({x}))), From 543d55b6326af7903489ac1e1e863bab8318f93f Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Thu, 2 Feb 2023 15:02:13 -0800 Subject: [PATCH 85/90] tweaks to inferSchema --- src/table.js | 22 ++++++++++------------ test/table-test.js | 28 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/src/table.js b/src/table.js index a0624f1d..a8464948 100644 --- a/src/table.js +++ b/src/table.js @@ -805,9 +805,8 @@ const types = [ "bigint", "array", "object", - "buffer", - "string" // should probably always be last since it’s least specific - // Note: "other" is intentionally omitted; it is handed specially! + "buffer" + // Note: "other" and "string" are intentionally omitted; see below! ]; // We need to show *all* keys present in the array of Objects @@ -832,15 +831,13 @@ function getAllKeys(rows) { export function inferSchema(source, columns = getAllKeys(source)) { const schema = []; const sampleSize = 100; - let sample = source.slice(0, sampleSize); + const sample = source.slice(0, sampleSize); const typeCounts = {}; - for (const col of columns) typeCounts[col] = createTypeCount(); - // TODO invert order of these loops? - for (const d of sample) { - for (const col of columns) { + for (const col of columns) { + const colCount = typeCounts[col] = createTypeCount(); + for (const d of sample) { let value = d[col]; if (value == null) continue; - const colCount = typeCounts[col]; const type = typeof value; if (type !== "string") { ++colCount.defined; @@ -870,13 +867,14 @@ export function inferSchema(source, columns = getAllKeys(source)) { } for (const col in typeCounts) { const colCount = typeCounts[col]; - // Chose the type with the greatest count that is also ≥90%; or, if no type - // meets that criterion, fallback to other. + // Chose the non-string, non-other type with the greatest count that is also + // ≥90%; or if no such type meets that criterion, fallback to string if + // ≥90%; and lastly fallback to other. const minCount = colCount.defined * 0.9; const type = greatest(types, (type) => colCount[type] >= minCount ? colCount[type] : NaN - ) ?? "other"; + ) ?? (colCount.string >= minCount ? "string" : "other"); schema.push({ name: col, type: type, diff --git a/test/table-test.js b/test/table-test.js index 34ec1de0..43472673 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1014,6 +1014,34 @@ describe("inferSchema", () => { [{name: "x", type: "string", inferred: "string"}] ); }); + + it("infers boolean-ish strings and strings as booleans", () => { + assert.deepStrictEqual( + inferSchema(["true", "false", "true", "false", "true", "false", "true", "false", "true", "false", "pants on fire"].map((x) => ({x}))), + [{name: "x", type: "boolean", inferred: "boolean"}] + ); + }); + + it("infers booleans and strings as booleans", () => { + assert.deepStrictEqual( + inferSchema([true, false, true, false, true, false, true, false, true, false, "pants on fire"].map((x) => ({x}))), + [{name: "x", type: "boolean", inferred: "boolean"}] + ); + }); + + it("infers numbers and strings as numbers", () => { + assert.deepStrictEqual( + inferSchema([0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2, "x"].map((x) => ({x}))), + [{name: "x", type: "number", inferred: "number"}] + ); + }); + + it("infers number-ish strings and strings as numbers", () => { + assert.deepStrictEqual( + inferSchema(["0.1", "0.2", "0.1", "0.2", "0.1", "0.2", "0.1", "0.2", "0.1", "0.2", "x"].map((x) => ({x}))), + [{name: "x", type: "number", inferred: "number"}] + ); + }); }); describe("coerceToType", () => { From 63ea07990cdd37100e65acfcb9725c9f538057d9 Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Thu, 2 Feb 2023 15:03:26 -0800 Subject: [PATCH 86/90] combine loops! --- src/table.js | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/table.js b/src/table.js index a8464948..cda6f4da 100644 --- a/src/table.js +++ b/src/table.js @@ -864,9 +864,6 @@ export function inferSchema(source, columns = getAllKeys(source)) { else if (DATE_TEST.test(value)) ++colCount.date; } } - } - for (const col in typeCounts) { - const colCount = typeCounts[col]; // Chose the non-string, non-other type with the greatest count that is also // ≥90%; or if no such type meets that criterion, fallback to string if // ≥90%; and lastly fallback to other. From 89e62c6ff81347c5a3e7fdc3ca33e34426d4c405 Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Thu, 2 Feb 2023 15:18:02 -0800 Subject: [PATCH 87/90] whitespace, bigint fixes --- src/table.js | 9 ++++----- test/table-test.js | 29 ++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/table.js b/src/table.js index cda6f4da..895c5519 100644 --- a/src/table.js +++ b/src/table.js @@ -573,21 +573,21 @@ export function coerceToType(value, type) { case "bigint": return typeof value === "bigint" || value == null ? value - : Number.isInteger(typeof value === "string" && !value ? NaN : +value) + : Number.isInteger(typeof value === "string" && !value.trim() ? NaN : +value) ? BigInt(value) // eslint-disable-line no-undef : undefined; case "integer": // not a target type for coercion, but can be inferred case "number": { return typeof value === "number" ? value - : value == null || (typeof value === "string" && !value) + : value == null || (typeof value === "string" && !value.trim()) ? NaN : Number(value); } case "date": { if (value instanceof Date || value == null) return value; if (typeof value === "number") return new Date(value); - if (typeof value === "string" && !value) return null; + if (typeof value === "string" && !value.trim()) return null; const trimValue = String(value).trim(); return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN); } @@ -860,8 +860,7 @@ export function inferSchema(source, columns = getAllKeys(source)) { } else if (value && !isNaN(value)) { ++colCount.number; if (Number.isInteger(+value)) ++colCount.integer; - } else if (/^\d+n$/.test(value)) ++colCount.bigint; - else if (DATE_TEST.test(value)) ++colCount.date; + } else if (DATE_TEST.test(value)) ++colCount.date; } } // Chose the non-string, non-other type with the greatest count that is also diff --git a/test/table-test.js b/test/table-test.js index 43472673..a748283d 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -946,6 +946,10 @@ describe("inferSchema", () => { inferSchema([{a: "cat"}, {a: "dog"}, {a: "1,000"}, {a: "null"}]), [{name: "a", type: "string", inferred: "string"}] ); + assert.deepStrictEqual( + inferSchema([{a: "10n"}, {a: "22n"}, {a: "0n"}]), // not considered bigints + [{name: "a", type: "string", inferred: "string"}] + ); }); it("infers arrays", () => { @@ -967,10 +971,6 @@ describe("inferSchema", () => { inferSchema([{a: 10n}, {a: 22n}, {a: 1n}]), [{name: "a", type: "bigint", inferred: "bigint"}] ); - assert.deepStrictEqual( - inferSchema([{a: "10n"}, {a: "22n"}, {a: "0n"}]), - [{name: "a", type: "bigint", inferred: "bigint"}] - ); }); it("infers buffers", () => { @@ -1049,17 +1049,24 @@ describe("coerceToType", () => { // "integer" is not a target type for coercion, but can be inferred. So it // will be handled as an alias for "number". assert.deepStrictEqual(coerceToType("1.2", "integer"), 1.2); + assert.deepStrictEqual(coerceToType(" 1.2", "integer"), 1.2); + assert.deepStrictEqual(coerceToType(" 1.2 ", "integer"), 1.2); assert.deepStrictEqual(coerceToType(1.2, "integer"), 1.2); assert.deepStrictEqual(coerceToType("10", "integer"), 10); assert.deepStrictEqual(coerceToType(0, "integer"), 0); assert.deepStrictEqual(coerceToType("A", "integer"), NaN); + assert.deepStrictEqual(coerceToType("", "integer"), NaN); + assert.deepStrictEqual(coerceToType(" ", "integer"), NaN); assert.deepStrictEqual(coerceToType(null, "integer"), NaN); }); it("coerces to number", () => { assert.deepStrictEqual(coerceToType("1.2", "number"), 1.2); + assert.deepStrictEqual(coerceToType(" 1.2", "number"), 1.2); + assert.deepStrictEqual(coerceToType(" 1.2 ", "number"), 1.2); assert.deepStrictEqual(coerceToType(0, "number"), 0); assert.deepStrictEqual(coerceToType("", "number"), NaN); + assert.deepStrictEqual(coerceToType(" ", "number"), NaN); assert.deepStrictEqual(coerceToType("A", "number"), NaN); assert.deepStrictEqual(coerceToType(null, "number"), NaN); assert.deepStrictEqual(coerceToType(undefined, "number"), NaN); @@ -1078,6 +1085,8 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType({}, "boolean"), true); assert.deepStrictEqual(coerceToType(new Date(), "boolean"), true); assert.deepStrictEqual(coerceToType("A", "boolean"), null); + assert.deepStrictEqual(coerceToType("", "boolean"), null); + assert.deepStrictEqual(coerceToType(" ", "boolean"), null); assert.deepStrictEqual(coerceToType(null, "boolean"), null); assert.deepStrictEqual(coerceToType(undefined, "boolean"), undefined); }); @@ -1116,6 +1125,7 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(undefined, "date"), undefined); assert.deepStrictEqual(coerceToType(null, "date"), null); assert.deepStrictEqual(coerceToType("", "date"), null); + assert.deepStrictEqual(coerceToType(" ", "date"), null); }); it("coerces to string", () => { @@ -1124,6 +1134,10 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(10, "string"), "10"); assert.deepStrictEqual(coerceToType({a: 1}, "string"), "[object Object]"); assert.deepStrictEqual(coerceToType(0, "string"), "0"); + assert.deepStrictEqual(coerceToType("", "string"), ""); + assert.deepStrictEqual(coerceToType(" ", "string"), " "); + assert.deepStrictEqual(coerceToType(" foo", "string"), " foo"); + assert.deepStrictEqual(coerceToType(" foo ", "string"), " foo "); assert.deepStrictEqual(coerceToType(null, "string"), null); assert.deepStrictEqual(coerceToType(undefined, "string"), undefined); assert.deepStrictEqual(coerceToType(NaN, "string"), "NaN"); @@ -1131,6 +1145,7 @@ describe("coerceToType", () => { it("coerces to bigint", () => { assert.deepStrictEqual(coerceToType("32", "bigint"), 32n); + assert.deepStrictEqual(coerceToType(" 32", "bigint"), 32n); assert.deepStrictEqual(coerceToType(32n, "bigint"), 32n); assert.deepStrictEqual(coerceToType(0, "bigint"), 0n); assert.deepStrictEqual(coerceToType(false, "bigint"), 0n); @@ -1138,13 +1153,17 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(null, "bigint"), null); assert.deepStrictEqual(coerceToType(undefined, "bigint"), undefined); assert.deepStrictEqual(coerceToType(1.1, "bigint"), undefined); + assert.deepStrictEqual(coerceToType("1.1", "bigint"), undefined); + assert.deepStrictEqual(coerceToType(" 32n", "bigint"), undefined); assert.deepStrictEqual(coerceToType("A", "bigint"), undefined); + assert.deepStrictEqual(coerceToType("", "bigint"), undefined); + assert.deepStrictEqual(coerceToType(" ", "bigint"), undefined); assert.deepStrictEqual(coerceToType(NaN, "bigint"), undefined); }); it("coerces to array", () => { // "array" is not a target type for coercion, but can be inferred. - assert.deepStrictEqual(coerceToType([1,2,3], "array"), [1,2,3]); + assert.deepStrictEqual(coerceToType([1, 2, 3], "array"), [1,2,3]); assert.deepStrictEqual(coerceToType(null, "array"), null); assert.deepStrictEqual(coerceToType(undefined, "array"), undefined); }); From f3a4ad8956bea8243defc9eea36af7fbd72d5fcc Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Thu, 2 Feb 2023 15:18:27 -0800 Subject: [PATCH 88/90] prEtTieR --- test/table-test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/table-test.js b/test/table-test.js index a748283d..258032ee 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1163,7 +1163,7 @@ describe("coerceToType", () => { it("coerces to array", () => { // "array" is not a target type for coercion, but can be inferred. - assert.deepStrictEqual(coerceToType([1, 2, 3], "array"), [1,2,3]); + assert.deepStrictEqual(coerceToType([1, 2, 3], "array"), [1, 2, 3]); assert.deepStrictEqual(coerceToType(null, "array"), null); assert.deepStrictEqual(coerceToType(undefined, "array"), undefined); }); From 9a56e6715d179f7cb255dd24f46087aa73b9c81e Mon Sep 17 00:00:00 2001 From: Mike Bostock Date: Thu, 2 Feb 2023 15:24:12 -0800 Subject: [PATCH 89/90] stricter string coercion --- src/table.js | 2 +- test/table-test.js | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 895c5519..c3d4fd53 100644 --- a/src/table.js +++ b/src/table.js @@ -587,8 +587,8 @@ export function coerceToType(value, type) { case "date": { if (value instanceof Date || value == null) return value; if (typeof value === "number") return new Date(value); - if (typeof value === "string" && !value.trim()) return null; const trimValue = String(value).trim(); + if (typeof value === "string" && !trimValue) return null; return new Date(DATE_TEST.test(trimValue) ? trimValue : NaN); } case "array": diff --git a/test/table-test.js b/test/table-test.js index 258032ee..5d996c2a 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -1126,6 +1126,8 @@ describe("coerceToType", () => { assert.deepStrictEqual(coerceToType(null, "date"), null); assert.deepStrictEqual(coerceToType("", "date"), null); assert.deepStrictEqual(coerceToType(" ", "date"), null); + assert.deepStrictEqual(coerceToType({toString: () => " "}, "date").toString(), invalidDate.toString()); + assert.deepStrictEqual(coerceToType({toString: () => "2020-01-01"}, "date"), new Date("2020-01-01")); }); it("coerces to string", () => { From 326f542003aba079323580739ec3750d77a23b39 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 2 Feb 2023 15:47:24 -0800 Subject: [PATCH 90/90] Handle column of nulls --- src/table.js | 2 +- test/table-test.js | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index c3d4fd53..62911b32 100644 --- a/src/table.js +++ b/src/table.js @@ -866,7 +866,7 @@ export function inferSchema(source, columns = getAllKeys(source)) { // Chose the non-string, non-other type with the greatest count that is also // ≥90%; or if no such type meets that criterion, fallback to string if // ≥90%; and lastly fallback to other. - const minCount = colCount.defined * 0.9; + const minCount = Math.max(1, colCount.defined * 0.9); const type = greatest(types, (type) => colCount[type] >= minCount ? colCount[type] : NaN diff --git a/test/table-test.js b/test/table-test.js index 5d996c2a..2eb76e87 100644 --- a/test/table-test.js +++ b/test/table-test.js @@ -985,6 +985,10 @@ describe("inferSchema", () => { inferSchema([{a: Symbol("a")}, {a: Symbol("b")}]), [{name: "a", type: "other", inferred: "other"}] ); + assert.deepStrictEqual( + inferSchema([{a: null}, {a: null}]), + [{name: "a", type: "other", inferred: "other"}] + ); }); it("infers mixed integers and numbers as numbers", () => {