Skip to content

feat(query): support imports and packages in python udf scripts #18187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,7 @@ overflow-checks = true
rpath = false

[patch.crates-io]
arrow-udf-runtime = { git = "https://github.com/datafuse-extras/arrow-udf.git", rev = "92eeb3b" }
arrow-udf-runtime = { git = "https://github.com/datafuse-extras/arrow-udf.git", rev = "a442343" }
async-backtrace = { git = "https://github.com/datafuse-extras/async-backtrace.git", rev = "dea4553" }
async-recursion = { git = "https://github.com/datafuse-extras/async-recursion.git", rev = "a353334" }
backtrace = { git = "https://github.com/rust-lang/backtrace-rs.git", rev = "72265be" }
Expand Down
14 changes: 12 additions & 2 deletions src/meta/app/src/principal/user_defined_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pub struct UDFServer {
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UDFScript {
pub code: String,
pub imports: Vec<String>,
pub packages: Vec<String>,
pub handler: String,
pub language: String,
pub arg_types: Vec<DataType>,
Expand All @@ -50,6 +52,8 @@ pub struct UDFScript {
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UDAFScript {
pub code: String,
pub imports: Vec<String>,
pub packages: Vec<String>,
pub language: String,
// aggregate function input types
pub arg_types: Vec<DataType>,
Expand Down Expand Up @@ -167,6 +171,8 @@ impl UserDefinedFunction {
arg_types,
return_type,
runtime_version: runtime_version.to_string(),
imports: vec![],
packages: vec![],
}),
created_on: Utc::now(),
}
Expand Down Expand Up @@ -226,6 +232,8 @@ impl Display for UDFDefinition {
handler,
language,
runtime_version,
imports,
packages,
}) => {
for (i, item) in arg_types.iter().enumerate() {
if i > 0 {
Expand All @@ -235,7 +243,7 @@ impl Display for UDFDefinition {
}
write!(
f,
") RETURNS {return_type} LANGUAGE {language} RUNTIME_VERSION = {runtime_version} HANDLER = {handler} AS $${code}$$"
") RETURNS {return_type} LANGUAGE {language} IMPORTS = {imports:?} PACKAGES = {packages:?} RUNTIME_VERSION = {runtime_version} HANDLER = {handler} AS $${code}$$"
)?;
}
UDFDefinition::UDAFScript(UDAFScript {
Expand All @@ -245,6 +253,8 @@ impl Display for UDFDefinition {
return_type,
language,
runtime_version,
imports,
packages,
}) => {
for (i, item) in arg_types.iter().enumerate() {
if i > 0 {
Expand All @@ -259,7 +269,7 @@ impl Display for UDFDefinition {
}
write!(f, "{} {}", item.name(), item.data_type())?;
}
write!(f, " }} RETURNS {return_type} LANGUAGE {language} RUNTIME_VERSION = {runtime_version} AS $${code}$$")?;
write!(f, " }} RETURNS {return_type} LANGUAGE {language} IMPORTS = {imports:?} PACKAGES = {packages:?} RUNTIME_VERSION = {runtime_version} AS $${code}$$")?;
}
}
Ok(())
Expand Down
8 changes: 8 additions & 0 deletions src/meta/proto-conv/src/udf_from_to_protobuf_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ impl FromToProto for mt::UDFScript {
handler: p.handler,
language: p.language,
runtime_version: p.runtime_version,
imports: p.imports,
packages: p.packages,
})
}

Expand Down Expand Up @@ -171,6 +173,8 @@ impl FromToProto for mt::UDFScript {
arg_types,
return_type: Some(return_type),
runtime_version: self.runtime_version.clone(),
imports: self.imports.clone(),
packages: self.packages.clone(),
})
}
}
Expand Down Expand Up @@ -206,6 +210,8 @@ impl FromToProto for mt::UDAFScript {
return_type,
language: p.language,
runtime_version: p.runtime_version,
imports: p.imports,
packages: p.packages,
state_fields,
})
}
Expand Down Expand Up @@ -259,6 +265,8 @@ impl FromToProto for mt::UDAFScript {
arg_types,
state_fields,
return_type: Some(return_type),
imports: self.imports.clone(),
packages: self.packages.clone(),
})
}
}
Expand Down
1 change: 1 addition & 0 deletions src/meta/proto-conv/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ const META_CHANGE_LOG: &[(u64, &str)] = &[
(127, "2025-05-18: Add: UserOption::workload_group"),
(128, "2025-05-22: Add: Storage Network config"),
(129, "2025-05-30: Add: New DataType Vector"),
(130, "2025-06-19: Add: New UDF imports and packages in udf definition"),
// Dear developer:
// If you're gonna add a new metadata version, you'll have to add a test for it.
// You could just copy an existing test file(e.g., `../tests/it/v024_table_meta.rs`)
Expand Down
2 changes: 1 addition & 1 deletion src/meta/proto-conv/tests/it/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,4 @@ mod v125_table_index;
mod v126_iceberg_storage_catalog_option;
mod v127_user_option_workload_group;
mod v128_storage_network_config;
mod v129_vector_datatype;
mod v130_udf_imports_packages;
2 changes: 2 additions & 0 deletions src/meta/proto-conv/tests/it/v081_udf_script.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ fn test_decode_udf_script() -> anyhow::Result<()> {
language: "python".to_string(),
arg_types: vec![DataType::Number(NumberDataType::Int32)],
return_type: DataType::Number(NumberDataType::Float32),
imports: vec![],
packages: vec![],
runtime_version: "3.12.2".to_string(),
}),
created_on: DateTime::<Utc>::default(),
Expand Down
2 changes: 2 additions & 0 deletions src/meta/proto-conv/tests/it/v115_add_udaf_script.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ fn test_decode_v115_add_udaf_script() -> anyhow::Result<()> {
)],
return_type: DataType::Number(NumberDataType::Float32),
runtime_version: "".to_string(),
imports: vec![],
packages: vec![],
}),
created_on: DateTime::<Utc>::default(),
};
Expand Down
68 changes: 68 additions & 0 deletions src/meta/proto-conv/tests/it/v130_udf_imports_packages.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use chrono::DateTime;
use chrono::Utc;
use databend_common_expression::types::DataType;
use databend_common_expression::types::NumberDataType;
use databend_common_meta_app::principal::UDFDefinition;
use databend_common_meta_app::principal::UDFScript;
use databend_common_meta_app::principal::UserDefinedFunction;
use fastrace::func_name;

use crate::common;

// These bytes are built when a new version is introduced,
// and are kept for backward compatibility tests.
//
// *************************************************************
// * These messages should never be updated, *
// * only be added when a new version is added, *
// * or be removed when an old version is no longer supported. *
// *************************************************************
//
// The message bytes are built from the output of `test_pb_from_to()`
/// Compatibility test for a `UserDefinedFunction` whose `UDFScript` definition
/// carries non-empty `imports` and `packages` lists.
///
/// The expected value is passed to `common::test_pb_from_to` and, together
/// with the frozen `bytes` fixture and version `130`, to
/// `common::test_load_old`; any error from either helper fails the test.
#[test]
fn test_decode_v130_udf_script() -> anyhow::Result<()> {
// Frozen serialized message for version 130. Do not edit: the file header
// states these bytes must never be updated once recorded.
// The `imports` and `packages` entries are visible in the fixture as
// length-prefixed ASCII strings: tag byte 58 precedes "@s1/a.zip" and
// "@s2/b.py", tag byte 66 precedes "numpy" and "pandas".
let bytes = vec![
10, 5, 109, 121, 95, 102, 110, 18, 21, 84, 104, 105, 115, 32, 105, 115, 32, 97, 32, 100,
101, 115, 99, 114, 105, 112, 116, 105, 111, 110, 50, 119, 10, 9, 115, 111, 109, 101, 32,
99, 111, 100, 101, 18, 5, 109, 121, 95, 102, 110, 26, 6, 112, 121, 116, 104, 111, 110, 34,
19, 154, 2, 9, 58, 0, 160, 6, 130, 1, 168, 6, 24, 160, 6, 130, 1, 168, 6, 24, 42, 19, 154,
2, 9, 74, 0, 160, 6, 130, 1, 168, 6, 24, 160, 6, 130, 1, 168, 6, 24, 50, 6, 51, 46, 49, 50,
46, 50, 58, 9, 64, 115, 49, 47, 97, 46, 122, 105, 112, 58, 8, 64, 115, 50, 47, 98, 46, 112,
121, 66, 5, 110, 117, 109, 112, 121, 66, 6, 112, 97, 110, 100, 97, 115, 160, 6, 130, 1,
168, 6, 24, 42, 23, 49, 57, 55, 48, 45, 48, 49, 45, 48, 49, 32, 48, 48, 58, 48, 48, 58, 48,
48, 32, 85, 84, 67, 160, 6, 130, 1, 168, 6, 24,
];

// Built through a closure so each helper call below receives its own
// freshly constructed expected value.
let want = || UserDefinedFunction {
name: "my_fn".to_string(),
description: "This is a description".to_string(),
definition: UDFDefinition::UDFScript(UDFScript {
code: "some code".to_string(),
handler: "my_fn".to_string(),
language: "python".to_string(),
arg_types: vec![DataType::Number(NumberDataType::Int32)],
return_type: DataType::Number(NumberDataType::Float32),
imports: vec!["@s1/a.zip".to_string(), "@s2/b.py".to_string()],
packages: vec!["numpy".to_string(), "pandas".to_string()],
runtime_version: "3.12.2".to_string(),
}),
// `DateTime::<Utc>::default()` matches the "1970-01-01 00:00:00 UTC"
// timestamp string embedded near the end of `bytes`.
created_on: DateTime::<Utc>::default(),
};

// NOTE(review): presumably round-trips `want()` through protobuf (the file
// header says the fixture bytes come from this helper's output) — confirm
// against `common::test_pb_from_to`.
common::test_pb_from_to(func_name!(), want())?;
// NOTE(review): presumably decodes `bytes` as a version-130 message and
// compares the result with `want()` — confirm against `common::test_load_old`.
common::test_load_old(func_name!(), bytes.as_slice(), 130, want())
}
6 changes: 5 additions & 1 deletion src/meta/protos/proto/udf.proto
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ message LambdaUDF {
uint64 min_reader_ver = 101;

repeated string parameters = 1;
string definition = 2;
string definition = 2;
}

message UDFServer {
Expand All @@ -49,6 +49,8 @@ message UDFScript {
repeated DataType arg_types = 4;
DataType return_type = 5;
string runtime_version = 6;
repeated string imports = 7;
repeated string packages = 8;
}

message UDAFScript {
Expand All @@ -61,6 +63,8 @@ message UDAFScript {
DataType return_type = 4;
repeated DataType arg_types = 5;
repeated DataField state_fields = 6;
repeated string imports = 7;
repeated string packages = 8;
}

message UserDefinedFunction {
Expand Down
33 changes: 31 additions & 2 deletions src/query/ast/src/ast/statements/udf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ use std::fmt::Formatter;

use derive_visitor::Drive;
use derive_visitor::DriveMut;
use itertools::Itertools;

use crate::ast::quote::QuotedString;
use crate::ast::write_comma_separated_list;
use crate::ast::CreateOption;
use crate::ast::Expr;
Expand All @@ -43,6 +45,8 @@ pub enum UDFDefinition {
arg_types: Vec<TypeName>,
return_type: TypeName,
code: String,
imports: Vec<String>,
packages: Vec<String>,
handler: String,
language: String,
runtime_version: String,
Expand All @@ -59,6 +63,8 @@ pub enum UDFDefinition {
arg_types: Vec<TypeName>,
state_fields: Vec<UDAFStateField>,
return_type: TypeName,
imports: Vec<String>,
packages: Vec<String>,
code: String,
language: String,
runtime_version: String,
Expand Down Expand Up @@ -109,12 +115,23 @@ impl Display for UDFDefinition {
handler,
language,
runtime_version: _,
imports,
packages,
} => {
write!(f, "( ")?;
write_comma_separated_list(f, arg_types)?;
let imports = imports
.iter()
.map(|s| QuotedString(s, '\'').to_string())
.join(",");
let packages = packages
.iter()
.map(|s| QuotedString(s, '\'').to_string())
.join(",");
write!(
f,
" ) RETURNS {return_type} LANGUAGE {language} HANDLER = '{handler}' AS $$\n{code}\n$$"
" ) RETURNS {return_type} LANGUAGE {language} IMPORTS = ({}) PACKAGES = ({}) HANDLER = '{handler}' AS $$\n{code}\n$$",
imports, packages
)?;
}
UDFDefinition::UDAFServer {
Expand Down Expand Up @@ -149,14 +166,26 @@ impl Display for UDFDefinition {
code,
language,
runtime_version: _,
imports,
packages,
} => {
let imports = imports
.iter()
.map(|s| QuotedString(s, '\'').to_string())
.join(",");
let packages = packages
.iter()
.map(|s| QuotedString(s, '\'').to_string())
.join(",");

write!(f, "( ")?;
write_comma_separated_list(f, arg_types)?;
write!(f, " ) STATE {{ ")?;
write_comma_separated_list(f, state_types)?;
write!(
f,
" }} RETURNS {return_type} LANGUAGE {language} AS $$\n{code}\n$$"
" }} RETURNS {return_type} LANGUAGE {language} IMPORTS = ({}) PACKAGES = ({}) AS $$\n{code}\n$$",
imports, packages
)?;
}
}
Expand Down
20 changes: 20 additions & 0 deletions src/query/ast/src/parser/statement.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4777,6 +4777,8 @@ pub fn udf_definition(i: Input) -> IResult<UDFDefinition> {
"(" ~ #comma_separated_list0(type_name) ~ ")"
~ RETURNS ~ #type_name
~ LANGUAGE ~ #ident
~ ( IMPORTS ~ ^"=" ~ "(" ~ #comma_separated_list0(literal_string) ~ ")" )?
~ ( PACKAGES ~ ^"=" ~ "(" ~ #comma_separated_list0(literal_string) ~ ")" )?
~ HANDLER ~ ^"=" ~ ^#literal_string
~ ( HEADERS ~ ^"=" ~ "(" ~ #comma_separated_list0(udf_header) ~ ")" )?
~ #udf_script_or_address
Expand All @@ -4789,6 +4791,8 @@ pub fn udf_definition(i: Input) -> IResult<UDFDefinition> {
return_type,
_,
language,
imports,
packages,
_,
_,
handler,
Expand All @@ -4800,6 +4804,12 @@ pub fn udf_definition(i: Input) -> IResult<UDFDefinition> {
arg_types,
return_type,
code: address_or_code.0,
imports: imports
.map(|(_, _, _, imports, _)| imports)
.unwrap_or_default(),
packages: packages
.map(|(_, _, _, packages, _)| packages)
.unwrap_or_default(),
handler,
language: language.to_string(),
// TODO inject runtime_version by user
Expand Down Expand Up @@ -4827,6 +4837,8 @@ pub fn udf_definition(i: Input) -> IResult<UDFDefinition> {
~ STATE ~ "{" ~ #comma_separated_list0(udaf_state_field) ~ "}"
~ RETURNS ~ #type_name
~ LANGUAGE ~ #ident
~ ( IMPORTS ~ ^"=" ~ "(" ~ #comma_separated_list0(literal_string) ~ ")" )?
~ ( PACKAGES ~ ^"=" ~ "(" ~ #comma_separated_list0(literal_string) ~ ")" )?
~ ( HEADERS ~ ^"=" ~ "(" ~ #comma_separated_list0(udf_header) ~ ")" )?
~ #udf_script_or_address
},
Expand All @@ -4842,6 +4854,8 @@ pub fn udf_definition(i: Input) -> IResult<UDFDefinition> {
return_type,
_,
language,
imports,
packages,
headers,
address_or_code,
)| {
Expand All @@ -4852,6 +4866,12 @@ pub fn udf_definition(i: Input) -> IResult<UDFDefinition> {
return_type,
code: address_or_code.0,
language: language.to_string(),
imports: imports
.map(|(_, _, _, imports, _)| imports)
.unwrap_or_default(),
packages: packages
.map(|(_, _, _, packages, _)| packages)
.unwrap_or_default(),
// TODO inject runtime_version by user
// Now we use fixed runtime version
runtime_version: "".to_string(),
Expand Down
4 changes: 4 additions & 0 deletions src/query/ast/src/parser/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1342,6 +1342,10 @@ pub enum TokenKind {
HEADERS,
#[token("LANGUAGE", ignore(ascii_case))]
LANGUAGE,
#[token("IMPORTS", ignore(ascii_case))]
IMPORTS,
#[token("PACKAGES", ignore(ascii_case))]
PACKAGES,
#[token("STATE", ignore(ascii_case))]
STATE,
#[token("TASK", ignore(ascii_case))]
Expand Down
Loading
Loading