From 1bdf08642eef18441b7bbab7cff69fbcc350054f Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 11 Jan 2023 16:32:11 -0800 Subject: [PATCH 1/9] Add insertUntypedCSV function to duckdb --- src/duckdb.js | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/duckdb.js b/src/duckdb.js index 698fadc4..f172f716 100644 --- a/src/duckdb.js +++ b/src/duckdb.js @@ -130,7 +130,9 @@ export class DuckDBClient { await Promise.all( Object.entries(sources).map(async ([name, source]) => { if (source instanceof FileAttachment) { // bare file - await insertFile(db, name, source); + config.untyped + ? await insertFile(db, name, source, {}, config.untyped) + : await insertFile(db, name, source); } else if (isArrowTable(source)) { // bare arrow table await insertArrowTable(db, name, source); } else if (Array.isArray(source)) { // bare array of objects @@ -160,7 +162,7 @@ Object.defineProperty(DuckDBClient.prototype, "dialect", { value: "duckdb" }); -async function insertFile(database, name, file, options) { +async function insertFile(database, name, file, options, untyped = false) { const url = await file.url(); if (url.startsWith("blob:")) { const buffer = await file.arrayBuffer(); @@ -172,12 +174,17 @@ async function insertFile(database, name, file, options) { try { switch (file.mimeType) { case "text/csv": - case "text/tab-separated-values": - return await connection.insertCSVFromPath(file.name, { - name, - schema: "main", - ...options - }); + case "text/tab-separated-values": { + if (untyped) { + return await insertUntypedCSV(connection, file, name); + } else { + return await connection.insertCSVFromPath(file.name, { + name, + schema: "main", + ...options + }); + } + } case "application/json": return await connection.insertJSONFromPath(file.name, { name, @@ -205,6 +212,13 @@ async function insertFile(database, name, file, options) { } } +async function insertUntypedCSV(connection, file, name) { + const statement = await connection.prepare( + `CREATE TABLE '${name}' AS SELECT * FROM read_csv_auto(?, ALL_VARCHAR=TRUE)` + ); + return await statement.send(file.name); +} + async function insertArrowTable(database, name, table, options) { const connection = await database.connect(); try { From 5cf6f4f534302d9055a9cc16c12ac0e74aa560a2 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 11 Jan 2023 16:34:27 -0800 Subject: [PATCH 2/9] Try untyped insertion as a fallback when loading DuckDBClient --- src/table.js | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/table.js b/src/table.js index 05f30469..4d285e42 100644 --- a/src/table.js +++ b/src/table.js @@ -232,7 +232,14 @@ function loadDuckDBClient( ? getFileSourceName(source) : "__table" ) { - return DuckDBClient.of({[name]: source}); + return DuckDBClient.of({[name]: source}) + .catch(() => { + // If initial attempt to create a DuckDB client resulted in an error, try + // one more time, treating all columns as strings. + // Could check error for a substring like "Could not convert", if this + // seems too costly for a catch-all error path. + return DuckDBClient.of({[name]: source}, {untyped: true}); + }); } function getFileSourceName(file) { From 1650d9b30f2321689e439d851240c01b9c7e5b37 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Wed, 11 Jan 2023 16:55:51 -0800 Subject: [PATCH 3/9] simplify --- src/duckdb.js | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/duckdb.js b/src/duckdb.js index f172f716..3128d05b 100644 --- a/src/duckdb.js +++ b/src/duckdb.js @@ -130,9 +130,7 @@ export class DuckDBClient { await Promise.all( Object.entries(sources).map(async ([name, source]) => { if (source instanceof FileAttachment) { // bare file - config.untyped - ? await insertFile(db, name, source, {}, config.untyped) - : await insertFile(db, name, source); + await insertFile(db, name, source, {}, config.untyped); } else if (isArrowTable(source)) { // bare arrow table await insertArrowTable(db, name, source); } else if (Array.isArray(source)) { // bare array of objects From 501fbc853dd0ec0ddcc640c7df4cd46b7bed6d70 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 12 Jan 2023 08:46:25 -0800 Subject: [PATCH 4/9] Check error message --- src/table.js | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/table.js b/src/table.js index 4d285e42..075ac675 100644 --- a/src/table.js +++ b/src/table.js @@ -232,13 +232,16 @@ function loadDuckDBClient( ? getFileSourceName(source) : "__table" ) { - return DuckDBClient.of({[name]: source}) - .catch(() => { - // If initial attempt to create a DuckDB client resulted in an error, try - // one more time, treating all columns as strings. - // Could check error for a substring like "Could not convert", if this - // seems too costly for a catch-all error path. - return DuckDBClient.of({[name]: source}, {untyped: true}); + const client = DuckDBClient.of({[name]: source}); + return client + .catch((error) => { + // If initial attempt to create a DuckDB client resulted in a conversion + // error, try again, this time treating all columns as strings. + if (error.toString().includes("Could not convert")) { + return DuckDBClient.of({[name]: source}, {untyped: true}); + } + // If this is not a conversion error, return the original attempt. + return client; }); } From 846545470705d0a18b417e53f71ed00c2d8dde2a Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 12 Jan 2023 11:11:20 -0800 Subject: [PATCH 5/9] Use symbol and existing options --- src/duckdb.js | 7 ++++--- src/table.js | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/duckdb.js b/src/duckdb.js index 3128d05b..e6d55305 100644 --- a/src/duckdb.js +++ b/src/duckdb.js @@ -3,6 +3,7 @@ import {getArrowTableSchema, isArrowTable, loadArrow} from "./arrow.js"; import {duckdb} from "./dependencies.js"; import {FileAttachment} from "./fileAttachment.js"; import {cdn} from "./require.js"; +import {untyped} from "./table.js"; // Adapted from https://observablehq.com/@cmudig/duckdb-client // Copyright 2021 CMU Data Interaction Group @@ -130,7 +131,7 @@ export class DuckDBClient { await Promise.all( Object.entries(sources).map(async ([name, source]) => { if (source instanceof FileAttachment) { // bare file - await insertFile(db, name, source, {}, config.untyped); + await insertFile(db, name, source, {[untyped]: config[untyped]}); } else if (isArrowTable(source)) { // bare arrow table await insertArrowTable(db, name, source); } else if (Array.isArray(source)) { // bare array of objects @@ -160,7 +161,7 @@ Object.defineProperty(DuckDBClient.prototype, "dialect", { value: "duckdb" }); -async function insertFile(database, name, file, options, untyped = false) { +async function insertFile(database, name, file, options) { const url = await file.url(); if (url.startsWith("blob:")) { const buffer = await file.arrayBuffer(); @@ -173,7 +174,7 @@ async function insertFile(database, name, file, options, untyped = false) { switch (file.mimeType) { case "text/csv": case "text/tab-separated-values": { - if (untyped) { + if (options[untyped]) { return await insertUntypedCSV(connection, file, name); } else { return await connection.insertCSVFromPath(file.name, { diff --git a/src/table.js b/src/table.js index 075ac675..341032f5 100644 --- a/src/table.js +++ b/src/table.js @@ -5,6 +5,7 @@ import {isArrowTable, loadArrow} from "./arrow.js"; import {DuckDBClient} from "./duckdb.js"; const nChecks = 20; // number of values to check in each array +export const untyped = Symbol("untyped"); // We support two levels of DatabaseClient. The simplest DatabaseClient // implements only the client.sql tagged template literal. More advanced @@ -238,7 +239,7 @@ function loadDuckDBClient( // If initial attempt to create a DuckDB client resulted in a conversion // error, try again, this time treating all columns as strings. if (error.toString().includes("Could not convert")) { - return DuckDBClient.of({[name]: source}, {untyped: true}); + return DuckDBClient.of({[name]: source}, {[untyped]: true}); } // If this is not a conversion error, return the original attempt. return client; From b5a4e7fd712ce709d75c9c53a5d7dbad3113adfe Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 12 Jan 2023 14:49:48 -0800 Subject: [PATCH 6/9] Undo changes to table.js --- src/table.js | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/table.js b/src/table.js index 341032f5..05f30469 100644 --- a/src/table.js +++ b/src/table.js @@ -5,7 +5,6 @@ import {isArrowTable, loadArrow} from "./arrow.js"; import {DuckDBClient} from "./duckdb.js"; const nChecks = 20; // number of values to check in each array -export const untyped = Symbol("untyped"); // We support two levels of DatabaseClient. The simplest DatabaseClient // implements only the client.sql tagged template literal. More advanced @@ -233,17 +232,7 @@ function loadDuckDBClient( ? getFileSourceName(source) : "__table" ) { - const client = DuckDBClient.of({[name]: source}); - return client - .catch((error) => { - // If initial attempt to create a DuckDB client resulted in a conversion - // error, try again, this time treating all columns as strings. - if (error.toString().includes("Could not convert")) { - return DuckDBClient.of({[name]: source}, {[untyped]: true}); - } - // If this is not a conversion error, return the original attempt. - return client; - }); + return DuckDBClient.of({[name]: source}); } function getFileSourceName(file) { From c27516a8e23e85f915716c7a3d1bbb3555001812 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 12 Jan 2023 14:50:15 -0800 Subject: [PATCH 7/9] Catch earlier in process, no need for config --- src/duckdb.js | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/duckdb.js b/src/duckdb.js index e6d55305..83168260 100644 --- a/src/duckdb.js +++ b/src/duckdb.js @@ -3,7 +3,6 @@ import {getArrowTableSchema, isArrowTable, loadArrow} from "./arrow.js"; import {duckdb} from "./dependencies.js"; import {FileAttachment} from "./fileAttachment.js"; import {cdn} from "./require.js"; -import {untyped} from "./table.js"; // Adapted from https://observablehq.com/@cmudig/duckdb-client // Copyright 2021 CMU Data Interaction Group @@ -131,7 +130,7 @@ export class DuckDBClient { await Promise.all( Object.entries(sources).map(async ([name, source]) => { if (source instanceof FileAttachment) { // bare file - await insertFile(db, name, source, {[untyped]: config[untyped]}); + await insertFile(db, name, source); } else if (isArrowTable(source)) { // bare arrow table await insertArrowTable(db, name, source); } else if (Array.isArray(source)) { // bare array of objects @@ -174,15 +173,18 @@ async function insertFile(database, name, file, options) { switch (file.mimeType) { case "text/csv": case "text/tab-separated-values": { - if (options[untyped]) { - return await insertUntypedCSV(connection, file, name); - } else { - return await connection.insertCSVFromPath(file.name, { - name, - schema: "main", - ...options - }); - } + const client = await connection.insertCSVFromPath(file.name, { + name, + schema: "main", + ...options + }).catch(async (error) => { + // If initial attempt to create a DuckDB client resulted in a conversion + // error, try again, this time treating all columns as strings. + if (error.toString().includes("Could not convert")) { + return await insertUntypedCSV(connection, file, name); + } + }); + return client; } case "application/json": return await connection.insertJSONFromPath(file.name, { From 1a60e214c75f3964541aa6630af9a98727ab5bc6 Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 12 Jan 2023 14:51:45 -0800 Subject: [PATCH 8/9] Remove unnecessary variable --- src/duckdb.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/duckdb.js b/src/duckdb.js index 83168260..c2ca0043 100644 --- a/src/duckdb.js +++ b/src/duckdb.js @@ -173,7 +173,7 @@ async function insertFile(database, name, file, options) { switch (file.mimeType) { case "text/csv": case "text/tab-separated-values": { - const client = await connection.insertCSVFromPath(file.name, { + return await connection.insertCSVFromPath(file.name, { name, schema: "main", ...options @@ -184,7 +184,6 @@ async function insertFile(database, name, file, options) { return await insertUntypedCSV(connection, file, name); } }); - return client; } case "application/json": return await connection.insertJSONFromPath(file.name, { From 175f5da79329f07dea7141a0cbac6868128d71fb Mon Sep 17 00:00:00 2001 From: Libbey White Date: Thu, 12 Jan 2023 15:02:16 -0800 Subject: [PATCH 9/9] Update comment --- src/duckdb.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/duckdb.js b/src/duckdb.js index c2ca0043..cebf7fea 100644 --- a/src/duckdb.js +++ b/src/duckdb.js @@ -178,7 +178,7 @@ async function insertFile(database, name, file, options) { schema: "main", ...options }).catch(async (error) => { - // If initial attempt to create a DuckDB client resulted in a conversion + // If initial attempt to insert CSV resulted in a conversion // error, try again, this time treating all columns as strings. if (error.toString().includes("Could not convert")) { return await insertUntypedCSV(connection, file, name);