diff --git a/bin/resolve-dependencies b/bin/resolve-dependencies index 49e03595..016ad676 100755 --- a/bin/resolve-dependencies +++ b/bin/resolve-dependencies @@ -64,7 +64,11 @@ const mains = ["unpkg", "jsdelivr", "browser", "main"]; } { const package = await resolve("apache-arrow@4"); - console.log(`export const arrow = dependency("${package.name}", "${package.version}", "${package.export}");`); + console.log(`export const arrow4 = dependency("${package.name}", "${package.version}", "${package.export}");`); + } + { + const package = await resolve("apache-arrow@9"); + console.log(`export const arrow9 = dependency("${package.name}", "${package.version}", "+esm");`); } { const package = await resolve("arquero"); @@ -86,6 +90,10 @@ const mains = ["unpkg", "jsdelivr", "browser", "main"]; const package = await resolve("leaflet"); console.log(`export const leaflet = dependency("${package.name}", "${package.version}", "${package.export.replace(/-src\.js$/, ".js")}");`); } + { + const package = await resolve("@duckdb/duckdb-wasm"); + console.log(`export const duckdb = dependency("${package.name}", "${package.version}", "+esm");`); + } })(); async function resolve(specifier) { diff --git a/rollup.config.js b/rollup.config.js index 86ca56e9..7a02422d 100644 --- a/rollup.config.js +++ b/rollup.config.js @@ -15,6 +15,7 @@ export default [ reserved: [ "FileAttachment", "RequireError", + "DuckDBClient", "SQLiteDatabaseClient", "Workbook", "ZipArchive", diff --git a/src/arrow.js b/src/arrow.js new file mode 100644 index 00000000..7ed60993 --- /dev/null +++ b/src/arrow.js @@ -0,0 +1,58 @@ +// Returns true if the vaue is an Apache Arrow table. This uses a “duck” test +// (instead of strict instanceof) because we want it to work with a range of +// Apache Arrow versions at least 7.0.0 or above. +// https://arrow.apache.org/docs/7.0/js/classes/Arrow_dom.Table.html +export function isArrowTable(value) { + return ( + value && + typeof value.getChild === "function" && + typeof value.toArray === "function" && + value.schema && + Array.isArray(value.schema.fields) + ); +} + +export function getArrowTableSchema(table) { + return table.schema.fields.map(getArrowFieldSchema); +} + +function getArrowFieldSchema(field) { + return { + name: field.name, + type: getArrowType(field.type), + nullable: field.nullable, + databaseType: String(field.type) + }; +} + +// https://github.com/apache/arrow/blob/89f9a0948961f6e94f1ef5e4f310b707d22a3c11/js/src/enum.ts#L140-L141 +function getArrowType(type) { + switch (type.typeId) { + case 2: // Int + return "integer"; + case 3: // Float + case 7: // Decimal + return "number"; + case 4: // Binary + case 15: // FixedSizeBinary + return "buffer"; + case 5: // Utf8 + return "string"; + case 6: // Bool + return "boolean"; + case 8: // Date + case 9: // Time + case 10: // Timestamp + return "date"; + case 12: // List + case 16: // FixedSizeList + return "array"; + case 13: // Struct + case 14: // Union + return "object"; + case 11: // Interval + case 17: // Map + default: + return "other"; + } +} diff --git a/src/dependencies.js b/src/dependencies.js index 7f51fd71..b5e95acc 100644 --- a/src/dependencies.js +++ b/src/dependencies.js @@ -13,9 +13,11 @@ export const sql = dependency("sql.js", "1.7.0", "dist/sql-wasm.js"); export const vega = dependency("vega", "5.22.1", "build/vega.min.js"); export const vegalite = dependency("vega-lite", "5.5.0", "build/vega-lite.min.js"); export const vegaliteApi = dependency("vega-lite-api", "5.0.0", "build/vega-lite-api.min.js"); -export const arrow = dependency("apache-arrow", "4.0.1", "Arrow.es2015.min.js"); +export const arrow4 = dependency("apache-arrow", "4.0.1", "Arrow.es2015.min.js"); +export const arrow9 = dependency("apache-arrow", "9.0.0", "+esm"); export const arquero = dependency("arquero", "4.8.8", "dist/arquero.min.js"); export const topojson = dependency("topojson-client", "3.1.0", "dist/topojson-client.min.js"); export const exceljs = dependency("exceljs", "4.3.0", "dist/exceljs.min.js"); export const mermaid = dependency("mermaid", "9.1.6", "dist/mermaid.min.js"); export const leaflet = dependency("leaflet", "1.8.0", "dist/leaflet.js"); +export const duckdb = dependency("@duckdb/duckdb-wasm", "1.17.0", "+esm"); diff --git a/src/duckdb.js b/src/duckdb.js new file mode 100644 index 00000000..abc6ad05 --- /dev/null +++ b/src/duckdb.js @@ -0,0 +1,272 @@ +import {getArrowTableSchema, isArrowTable} from "./arrow.js"; +import {arrow9 as arrow, duckdb} from "./dependencies.js"; +import {FileAttachment} from "./fileAttachment.js"; +import {cdn} from "./require.js"; + +// Adapted from https://observablehq.com/@cmudig/duckdb-client +// Copyright 2021 CMU Data Interaction Group +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +export class DuckDBClient { + constructor(db) { + Object.defineProperties(this, { + _db: {value: db} + }); + } + + async queryStream(query, params) { + const connection = await this._db.connect(); + let reader, batch; + try { + reader = await connection.send(query, params); + batch = await reader.next(); + if (batch.done) throw new Error("missing first batch"); + } catch (error) { + await connection.close(); + throw error; + } + return { + schema: getArrowTableSchema(batch.value), + async *readRows() { + try { + while (!batch.done) { + yield batch.value.toArray(); + batch = await reader.next(); + } + } finally { + await connection.close(); + } + } + }; + } + + async query(query, params) { + const result = await this.queryStream(query, params); + const results = []; + for await (const rows of result.readRows()) { + for (const row of rows) { + results.push(row); + } + } + results.schema = result.schema; + return results; + } + + async queryRow(query, params) { + const result = await this.queryStream(query, params); + const reader = result.readRows(); + try { + const {done, value} = await reader.next(); + return done || !value.length ? null : value[0]; + } finally { + await reader.return(); + } + } + + async sql(strings, ...args) { + return await this.query(strings.join("?"), args); + } + + queryTag(strings, ...params) { + return [strings.join("?"), params]; + } + + escape(name) { + return `"${name}"`; + } + + async describeTables() { + const tables = await this.query(`SHOW TABLES`); + return tables.map(({name}) => ({name})); + } + + async describeColumns({table} = {}) { + const columns = await this.query(`DESCRIBE ${table}`); + return columns.map(({column_name, column_type, null: nullable}) => ({ + name: column_name, + type: getDuckDBType(column_type), + nullable: nullable !== "NO", + databaseType: column_type + })); + } + + static async of(sources = {}, config = {}) { + const db = await createDuckDB(); + if (config.query?.castTimestampToDate === undefined) { + config = {...config, query: {...config.query, castTimestampToDate: true}}; + } + await db.open(config); + await Promise.all( + Object.entries(sources).map(async ([name, source]) => { + if (source instanceof FileAttachment) { // bare file + await insertFile(db, name, source); + } else if (isArrowTable(source)) { // bare arrow table + await insertArrowTable(db, name, source); + } else if (Array.isArray(source)) { // bare array of objects + await insertArray(db, name, source); + } else if ("data" in source) { // data + options + const {data, ...options} = source; + if (isArrowTable(data)) { + await insertArrowTable(db, name, data, options); + } else { + await insertArray(db, name, data, options); + } + } else if ("file" in source) { // file + options + const {file, ...options} = source; + await insertFile(db, name, file, options); + } else { + throw new Error(`invalid source: ${source}`); + } + }) + ); + return new DuckDBClient(db); + } +} + +async function insertFile(database, name, file, options) { + const url = await file.url(); + if (url.startsWith("blob:")) { + const buffer = await file.arrayBuffer(); + await database.registerFileBuffer(file.name, new Uint8Array(buffer)); + } else { + await database.registerFileURL(file.name, url); + } + const connection = await database.connect(); + try { + switch (file.mimeType) { + case "text/csv": + return await connection.insertCSVFromPath(file.name, { + name, + schema: "main", + ...options + }); + case "application/json": + return await connection.insertJSONFromPath(file.name, { + name, + schema: "main", + ...options + }); + default: + if (/\.arrow$/i.test(file.name)) { + const buffer = new Uint8Array(await file.arrayBuffer()); + return await connection.insertArrowFromIPCStream(buffer, { + name, + schema: "main", + ...options + }); + } + if (/\.parquet$/i.test(file.name)) { + return await connection.query( + `CREATE VIEW '${name}' AS SELECT * FROM parquet_scan('${file.name}')` + ); + } + throw new Error(`unknown file type: ${file.mimeType}`); + } + } finally { + await connection.close(); + } +} + +async function insertArrowTable(database, name, table, options) { + const arrow = await loadArrow(); + const buffer = arrow.tableToIPC(table); + const connection = await database.connect(); + try { + await connection.insertArrowFromIPCStream(buffer, { + name, + schema: "main", + ...options + }); + } finally { + await connection.close(); + } +} + +async function insertArray(database, name, array, options) { + const arrow = await loadArrow(); + const table = arrow.tableFromJSON(array); + return await insertArrowTable(database, name, table, options); +} + +async function createDuckDB() { + const duck = await import(`${cdn}${duckdb.resolve()}`); + const bundle = await duck.selectBundle({ + mvp: { + mainModule: `${cdn}${duckdb.resolve("dist/duckdb-mvp.wasm")}`, + mainWorker: `${cdn}${duckdb.resolve("dist/duckdb-browser-mvp.worker.js")}` + }, + eh: { + mainModule: `${cdn}${duckdb.resolve("dist/duckdb-eh.wasm")}`, + mainWorker: `${cdn}${duckdb.resolve("dist/duckdb-browser-eh.worker.js")}` + } + }); + const logger = new duck.ConsoleLogger(); + const worker = await duck.createWorker(bundle.mainWorker); + const db = new duck.AsyncDuckDB(logger, worker); + await db.instantiate(bundle.mainModule); + return db; +} + +async function loadArrow() { + return await import(`${cdn}${arrow.resolve()}`); +} + +// https://duckdb.org/docs/sql/data_types/overview +function getDuckDBType(type) { + switch (type) { + case "BIGINT": + case "HUGEINT": + case "UBIGINT": + return "bigint"; + case "DOUBLE": + case "REAL": + return "number"; + case "INTEGER": + case "SMALLINT": + case "TINYINT": + case "USMALLINT": + case "UINTEGER": + case "UTINYINT": + return "integer"; + case "BOOLEAN": + return "boolean"; + case "DATE": + case "TIMESTAMP": + case "TIMESTAMP WITH TIME ZONE": + return "date"; + case "VARCHAR": + case "UUID": + return "string"; + // case "BLOB": + // case "INTERVAL": + // case "TIME": + default: + if (/^DECIMAL\(/.test(type)) return "integer"; + return "other"; + } +} diff --git a/src/fileAttachment.js b/src/fileAttachment.js index 5e83ad56..07c8aeb9 100644 --- a/src/fileAttachment.js +++ b/src/fileAttachment.js @@ -1,6 +1,6 @@ import {autoType, csvParse, csvParseRows, tsvParse, tsvParseRows} from "d3-dsv"; -import {arrow, jszip, exceljs} from "./dependencies.js"; -import {requireDefault} from "./require.js"; +import {arrow4, arrow9, jszip, exceljs} from "./dependencies.js"; +import {cdn, requireDefault} from "./require.js"; import {SQLiteDatabaseClient} from "./sqlite.js"; import {Workbook} from "./xlsx.js"; @@ -56,9 +56,18 @@ export class AbstractFile { i.src = url; }); } - async arrow() { - const [Arrow, response] = await Promise.all([requireDefault(arrow.resolve()), remote_fetch(this)]); - return Arrow.Table.from(response); + async arrow({version = 4} = {}) { + switch (version) { + case 4: { + const [Arrow, response] = await Promise.all([requireDefault(arrow4.resolve()), remote_fetch(this)]); + return Arrow.Table.from(response); + } + case 9: { + const [Arrow, response] = await Promise.all([import(`${cdn}${arrow9.resolve()}`), remote_fetch(this)]); + return Arrow.tableFromIPC(response); + } + default: throw new Error(`unsupported arrow version: ${version}`); + } } async sqlite() { return SQLiteDatabaseClient.open(remote_fetch(this)); diff --git a/src/index.js b/src/index.js index 1b380c7f..50f8f809 100644 --- a/src/index.js +++ b/src/index.js @@ -1,3 +1,4 @@ export {FileAttachments, AbstractFile} from "./fileAttachment.js"; export {Library} from "./library.js"; +export {getArrowTableSchema, isArrowTable} from "./arrow.js"; export {makeQueryTemplate, loadDataSource, arrayIsPrimitive, isDataArray, isDatabaseClient} from "./table.js"; diff --git a/src/library.js b/src/library.js index 7cb1f9b1..c4d4e7c6 100644 --- a/src/library.js +++ b/src/library.js @@ -3,6 +3,7 @@ import * as DOM from "./dom/index.js"; import * as Files from "./files/index.js"; import {AbstractFile, FileAttachment, NoFileAttachments} from "./fileAttachment.js"; import * as Generators from "./generators/index.js"; +import {DuckDBClient} from "./duckdb.js"; import {html} from "./html.js"; import {leaflet} from "./leaflet.js"; import {md} from "./md.js"; @@ -17,7 +18,7 @@ import {svg} from "./svg.js"; import {tex} from "./tex.js"; import {vl} from "./vegalite.js"; import {width} from "./width.js"; -import {arquero, arrow, d3, graphviz, htl, inputs, lodash, plot, topojson} from "./dependencies.js"; +import {arquero, arrow4, d3, graphviz, htl, inputs, lodash, plot, topojson} from "./dependencies.js"; import {__query} from "./table.js"; export const Library = Object.assign(Object.defineProperties(function Library(resolver) { @@ -39,9 +40,10 @@ export const Library = Object.assign(Object.defineProperties(function Library(re // Recommended libraries // https://observablehq.com/@observablehq/recommended-libraries _: () => require(lodash.resolve()), - aq: () => require.alias({"apache-arrow": arrow.resolve()})(arquero.resolve()), - Arrow: () => require(arrow.resolve()), + aq: () => require.alias({"apache-arrow": arrow4.resolve()})(arquero.resolve()), // TODO upgrade to apache-arrow@9 + Arrow: () => require(arrow4.resolve()), // TODO upgrade to apache-arrow@9 d3: () => require(d3.resolve()), + DuckDBClient: () => DuckDBClient, Inputs: () => require(inputs.resolve()).then(Inputs => ({...Inputs, file: Inputs.fileOf(AbstractFile)})), L: () => leaflet(require), mermaid: () => mermaid(require), diff --git a/src/require.js b/src/require.js index 8f328839..0fe72370 100644 --- a/src/require.js +++ b/src/require.js @@ -1,5 +1,8 @@ import {require as initialRequire, requireFrom} from "d3-require"; +// TODO Allow this to be overridden using the Library’s resolver. +export const cdn = "https://cdn.observableusercontent.com/npm/"; + export let requireDefault = initialRequire; export function setDefaultRequire(require) { diff --git a/test/index-test.js b/test/index-test.js index 84353b77..305f8153 100644 --- a/test/index-test.js +++ b/test/index-test.js @@ -5,6 +5,7 @@ it("new Library returns a library with the expected keys", () => { assert.deepStrictEqual(Object.keys(new Library()).sort(), [ "Arrow", "DOM", + "DuckDBClient", "FileAttachment", "Files", "Generators",