Skip to content

Add JSON single value encoding #1805

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
219 changes: 217 additions & 2 deletions pyiceberg/conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
- Converting partition strings to built-in python objects.
- Converting a value to a byte buffer.
- Converting a byte buffer to a value.
- Converting a value to and from its JSON single-value serialized form.

Note:
Conversion logic varies based on the PrimitiveType implementation. Therefore conversion functions
Expand All @@ -28,6 +29,7 @@
implementations that share the same conversion logic, registrations can be stacked.
"""

import codecs
import uuid
from datetime import date, datetime, time
from decimal import Decimal
Expand Down Expand Up @@ -60,7 +62,23 @@
UUIDType,
strtobool,
)
from pyiceberg.utils.datetime import date_to_days, datetime_to_micros, time_to_micros
from pyiceberg.utils.datetime import (
date_str_to_days,
date_to_days,
datetime_to_micros,
days_to_date,
micros_to_time,
micros_to_timestamp,
micros_to_timestamptz,
time_str_to_micros,
time_to_micros,
timestamp_to_micros,
timestamptz_to_micros,
to_human_day,
to_human_time,
to_human_timestamp,
to_human_timestamptz,
)
from pyiceberg.utils.decimal import decimal_to_bytes, unscaled_to_decimal

_BOOL_STRUCT = Struct("<?")
Expand Down Expand Up @@ -283,7 +301,7 @@ def from_bytes(primitive_type: PrimitiveType, b: bytes) -> L: # type: ignore
primitive_type (PrimitiveType): An implementation of the PrimitiveType base class.
b (bytes): The bytes to convert.
"""
raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not supported: {str(b)}")
raise TypeError(f"Cannot deserialize bytes, type {primitive_type} not supported: {b!r}")


@from_bytes.register(BooleanType)
Expand Down Expand Up @@ -336,3 +354,200 @@ def _(primitive_type: DecimalType, buf: bytes) -> Decimal:
@from_bytes.register(UnknownType)
def _(type_: UnknownType, buf: bytes) -> None:
    """Deserialize a buffer for UnknownType; the buffer is ignored and None is returned."""
    return None


@singledispatch  # type: ignore
def to_json(primitive_type: PrimitiveType, val: Any) -> L:  # type: ignore
    """Convert built-in python values into JSON value types.

    This is the fallback of the single-dispatch function: it is only reached
    when no implementation has been registered for the given type, and it
    always raises.

    https://iceberg.apache.org/spec/#json-single-value-serialization

    Args:
        primitive_type (PrimitiveType): An implementation of the PrimitiveType base class.
        val (Any): The arbitrary built-in value to convert into the right form.

    Raises:
        TypeError: Always, because the type has no registered JSON serializer.
    """
    # The message describes serialization to JSON, which is what this function does.
    raise TypeError(f"Cannot serialize value to JSON, type {primitive_type} not supported: {val}")


@to_json.register(BooleanType)
def _(_: BooleanType, val: bool) -> bool:
    """Pass the Python bool through unchanged; it is already a valid JSON boolean."""
    return val


@to_json.register(IntegerType)
@to_json.register(LongType)
def _(_: Union[IntegerType, LongType], val: int) -> int:
    """Pass the Python int through unchanged; it is already a valid JSON number."""
    return val


@to_json.register(DateType)
def _(_: DateType, val: Union[date, int]) -> str:
    """Serialize a date, or an integer day count, into its string form.

    A ``date`` instance is first converted with ``date_to_days``; an int is
    handed to ``to_human_day`` as-is.
    """
    days = date_to_days(val) if isinstance(val, date) else val
    return to_human_day(days)


@to_json.register(TimeType)
def _(_: TimeType, val: Union[int, time]) -> str:
    """Serialize a time, or an integer microsecond value, into its string form.

    A ``time`` instance is first converted with ``time_to_micros``; an int is
    handed to ``to_human_time`` as-is.
    """
    micros = time_to_micros(val) if isinstance(val, time) else val
    return to_human_time(micros)


@to_json.register(TimestampType)
def _(_: TimestampType, val: Union[int, datetime]) -> str:
    """Serialize a datetime, or an integer microsecond value, into a timestamp string.

    Args:
        _ (TimestampType): The type this implementation is registered for (unused).
        val (Union[int, datetime]): A datetime, converted with ``datetime_to_micros``,
            or an int that is passed to ``to_human_timestamp`` unchanged.

    Returns:
        str: The string produced by ``to_human_timestamp``.
    """
    if isinstance(val, datetime):
        val = datetime_to_micros(val)

    return to_human_timestamp(val)


@to_json.register(TimestamptzType)
def _(_: TimestamptzType, val: Union[int, datetime]) -> str:
    """Serialize a datetime, or an integer microsecond value, into a timestamptz string.

    A ``datetime`` instance is first converted with ``datetime_to_micros``; an
    int is handed to ``to_human_timestamptz`` as-is.
    """
    micros = datetime_to_micros(val) if isinstance(val, datetime) else val
    return to_human_timestamptz(micros)


@to_json.register(FloatType)
@to_json.register(DoubleType)
def _(_: Union[FloatType, DoubleType], val: float) -> float:
    """Pass the Python float through unchanged as the JSON number."""
    return val


@to_json.register(StringType)
def _(_: StringType, val: str) -> str:
    """Pass the Python string through unchanged as the JSON string."""
    return val


@to_json.register(FixedType)
def _(t: FixedType, b: bytes) -> str:
    """Hex-encode fixed-length bytes after checking them against the type's length.

    Raises:
        ValueError: If ``len(b)`` differs from ``len(t)``.
    """
    expected, actual = len(t), len(b)
    if expected != actual:
        raise ValueError(f"FixedType has length {expected}, which is different from the value: {actual}")

    hex_bytes = codecs.encode(b, "hex")
    return hex_bytes.decode(UTF8)


@to_json.register(BinaryType)
def _(_: BinaryType, b: bytes) -> str:
    """Hex-encode the bytes and return the result as a string."""
    hex_bytes = codecs.encode(b, "hex")
    return hex_bytes.decode(UTF8)


@to_json.register(DecimalType)
def _(_: DecimalType, val: Decimal) -> str:
    """Serialize a Python Decimal as its string representation.

    The result is simply ``str(val)``. For values with a positive scale the
    digits right of the decimal point indicate the scale; for values with a
    negative scale scientific notation is used and the exponent must equal
    the negated scale.
    """
    text = str(val)
    return text


@to_json.register(UUIDType)
def _(_: UUIDType, val: uuid.UUID) -> str:
    """Serialize a UUID as its string representation (``str(val)``)."""
    text = str(val)
    return text


@singledispatch  # type: ignore
def from_json(primitive_type: PrimitiveType, val: Any) -> L:  # type: ignore
    """Convert JSON value types into built-in python values.

    This is the fallback of the single-dispatch function: it is only reached
    when no implementation has been registered for the given type, and it
    always raises.

    https://iceberg.apache.org/spec/#json-single-value-serialization

    Args:
        primitive_type (PrimitiveType): An implementation of the PrimitiveType base class.
        val (Any): The arbitrary JSON value to convert into the right form.

    Raises:
        TypeError: Always, because the type has no registered JSON deserializer.
    """
    # The input here is a JSON value, not a byte buffer, and the message says so.
    raise TypeError(f"Cannot deserialize JSON value, type {primitive_type} not supported: {val}")


@from_json.register(BooleanType)
def _(_: BooleanType, val: bool) -> bool:
    """Pass the JSON boolean through unchanged as the Python bool."""
    return val


@from_json.register(IntegerType)
@from_json.register(LongType)
def _(_: Union[IntegerType, LongType], val: int) -> int:
    """Pass the JSON integer through unchanged as the Python int."""
    return val


@from_json.register(DateType)
def _(_: DateType, val: str) -> date:
    """Parse a JSON date string into a Python date.

    The string is converted with ``date_str_to_days`` and the result is then
    passed to ``days_to_date``.
    """
    days = date_str_to_days(val)
    return days_to_date(days)


@from_json.register(TimeType)
def _(_: TimeType, val: str) -> time:
    """Parse a JSON time string into a Python time.

    The string is converted with ``time_str_to_micros`` and the result is then
    passed to ``micros_to_time``.
    """
    micros = time_str_to_micros(val)
    return micros_to_time(micros)


@from_json.register(TimestampType)
def _(_: TimestampType, val: str) -> datetime:
    """Parse a JSON timestamp string into a Python datetime.

    Args:
        _ (TimestampType): The type this implementation is registered for (unused).
        val (str): The timestamp string, converted with ``timestamp_to_micros``.

    Returns:
        datetime: The value produced by ``micros_to_timestamp``.
    """
    return micros_to_timestamp(timestamp_to_micros(val))


@from_json.register(TimestamptzType)
def _(_: TimestamptzType, val: str) -> datetime:
    """Parse a JSON timestamptz string into a Python datetime.

    The string is converted with ``timestamptz_to_micros`` and the result is
    then passed to ``micros_to_timestamptz``.
    """
    micros = timestamptz_to_micros(val)
    return micros_to_timestamptz(micros)


@from_json.register(FloatType)
@from_json.register(DoubleType)
def _(_: Union[FloatType, DoubleType], val: float) -> float:
    """Pass the JSON number through unchanged as the Python float."""
    return val


@from_json.register(StringType)
def _(_: StringType, val: str) -> str:
    """Pass the JSON string through unchanged as the Python string."""
    return val


@from_json.register(FixedType)
def _(t: FixedType, val: str) -> bytes:
    """Decode a hexadecimal JSON string into bytes and check the length against the type.

    Raises:
        ValueError: If the decoded length differs from ``len(t)``.
    """
    # Decode first: the length check applies to the decoded bytes, not the hex text.
    decoded = codecs.decode(val.encode(UTF8), "hex")

    expected, actual = len(t), len(decoded)
    if expected != actual:
        raise ValueError(f"FixedType has length {expected}, which is different from the value: {actual}")

    return decoded


@from_json.register(BinaryType)
def _(_: BinaryType, val: str) -> bytes:
    """Decode a hexadecimal JSON string into bytes."""
    hex_bytes = val.encode(UTF8)
    return codecs.decode(hex_bytes, "hex")


@from_json.register(DecimalType)
def _(_: DecimalType, val: str) -> Decimal:
    """Build a Python Decimal from its JSON string representation."""
    parsed = Decimal(val)
    return parsed


@from_json.register(UUIDType)
def _(_: UUIDType, val: str) -> uuid.UUID:
    """Build a Python UUID from its JSON string representation."""
    parsed = uuid.UUID(val)
    return parsed
49 changes: 49 additions & 0 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,3 +545,52 @@ def test_datetime_obj_to_bytes(primitive_type: PrimitiveType, value: Union[datet
bytes_from_value = conversions.to_bytes(primitive_type, value)

assert bytes_from_value == expected_bytes


@pytest.mark.parametrize(
    "primitive_type, value, expected",
    [
        (BooleanType(), True, True),
        (IntegerType(), 34, 34),
        (LongType(), 34, 34),
        (FloatType(), 1.0, 1.0),
        (DoubleType(), 1.0, 1.0),
        (DecimalType(9, 4), Decimal("123.4500"), "123.4500"),
        (DecimalType(9, 0), Decimal("2"), "2"),
        (DecimalType(9, -20), Decimal("2E+20"), "2E+20"),
        (DateType(), date(2017, 11, 16), "2017-11-16"),
        (TimeType(), time(22, 31, 8, 123456), "22:31:08.123456"),
        (TimestampType(), datetime(2017, 11, 16, 22, 31, 8, 123456), "2017-11-16T22:31:08.123456"),
        (TimestamptzType(), datetime(2017, 11, 16, 22, 31, 8, 123456, tzinfo=timezone.utc), "2017-11-16T22:31:08.123456+00:00"),
        (StringType(), "iceberg", "iceberg"),
        (BinaryType(), b"\x01\x02\x03\xff", "010203ff"),
        (FixedType(4), b"\x01\x02\x03\xff", "010203ff"),
    ],
)
def test_json_single_serialization(primitive_type: PrimitiveType, value: Any, expected: Any) -> None:
    """Check that to_json produces the expected JSON value for each primitive type."""
    encoded = conversions.to_json(primitive_type, value)
    assert encoded == expected


@pytest.mark.parametrize(
    "primitive_type, value",
    [
        (BooleanType(), True),
        (IntegerType(), 34),
        (LongType(), 34),
        (FloatType(), 1.0),
        (DoubleType(), 1.0),
        (DecimalType(9, 4), Decimal("123.4500")),
        (DecimalType(9, 0), Decimal("2")),
        (DecimalType(9, -20), Decimal("2E+20")),
        (DateType(), date(2017, 11, 16)),
        (TimeType(), time(22, 31, 8, 123456)),
        (TimestampType(), datetime(2017, 11, 16, 22, 31, 8, 123456)),
        (TimestamptzType(), datetime(2017, 11, 16, 22, 31, 8, 123456, tzinfo=timezone.utc)),
        (StringType(), "iceberg"),
        (BinaryType(), b"\x01\x02\x03\xff"),
        (FixedType(4), b"\x01\x02\x03\xff"),
    ],
)
def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any) -> None:
    """Check that from_json(to_json(value)) gives back the original value."""
    encoded = conversions.to_json(primitive_type, value)
    decoded = conversions.from_json(primitive_type, encoded)
    assert value == decoded