Skip to content

Commit 5af05bd

Browse files
authored
Allow filtering on newly added columns (#246)
* Allow filtering on newly added columns Resolves #217 * Thanks Amogh!
1 parent 46b36b3 commit 5af05bd

File tree

3 files changed

+45
-4
lines changed

3 files changed

+45
-4
lines changed

dev/provision.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,6 @@
152152
"""
153153
)
154154

155-
156155
spark.sql(
157156
"""
158157
DELETE FROM default.test_positional_mor_deletes WHERE number = 9
@@ -315,9 +314,28 @@
315314
)
316315

317316
spark.sql(
318-
f"""
317+
"""
319318
INSERT INTO default.test_table_sanitized_character
320319
VALUES
321320
('123')
322321
"""
323322
)
323+
324+
spark.sql(
325+
"""
326+
CREATE TABLE default.test_table_add_column (
327+
a string
328+
)
329+
USING iceberg
330+
"""
331+
)
332+
333+
spark.sql("INSERT INTO default.test_table_add_column VALUES ('1')")
334+
335+
spark.sql(
336+
"""
337+
ALTER TABLE default.test_table_add_column ADD COLUMN b string
338+
"""
339+
)
340+
341+
spark.sql("INSERT INTO default.test_table_add_column VALUES ('2', '2')")

pyiceberg/expressions/visitors.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -892,8 +892,13 @@ def visit_unbound_predicate(self, predicate: UnboundPredicate[L]) -> BooleanExpr
892892
def visit_bound_predicate(self, predicate: BoundPredicate[L]) -> BooleanExpression:
893893
file_column_name = self.file_schema.find_column_name(predicate.term.ref().field.field_id)
894894

895-
if not file_column_name:
896-
raise ValueError(f"Not found in file schema: {file_column_name}")
895+
if file_column_name is None:
896+
# In the case of schema evolution, the column might not be present
897+
# in the file schema when reading older data
898+
if isinstance(predicate, BoundIsNull):
899+
return AlwaysTrue()
900+
else:
901+
return AlwaysFalse()
897902

898903
if isinstance(predicate, BoundUnaryPredicate):
899904
return predicate.as_unbound(file_column_name)

tests/test_integration.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,12 @@ def test_positional_mor_deletes(catalog: Catalog) -> Table:
143143
return catalog.load_table("default.test_positional_mor_deletes")
144144

145145

146+
@pytest.fixture()
147+
def test_table_add_column(catalog: Catalog) -> Table:
148+
"""Table that has a new column"""
149+
return catalog.load_table("default.test_table_add_column")
150+
151+
146152
@pytest.fixture()
147153
def test_positional_mor_double_deletes(catalog: Catalog) -> Table:
148154
"""Table that has multiple positional deletes"""
@@ -373,6 +379,18 @@ def test_scan_branch(test_positional_mor_deletes: Table) -> None:
373379
assert arrow_table["number"].to_pylist() == [1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12]
374380

375381

382+
@pytest.mark.integration
383+
def test_filter_on_new_column(test_table_add_column: Table) -> None:
384+
arrow_table = test_table_add_column.scan(row_filter="b == '2'").to_arrow()
385+
assert arrow_table["b"].to_pylist() == ['2']
386+
387+
arrow_table = test_table_add_column.scan(row_filter="b is not null").to_arrow()
388+
assert arrow_table["b"].to_pylist() == ['2']
389+
390+
arrow_table = test_table_add_column.scan(row_filter="b is null").to_arrow()
391+
assert arrow_table["b"].to_pylist() == [None]
392+
393+
376394
@pytest.mark.integration
377395
def test_upgrade_table_version(table_test_table_version: Table) -> None:
378396
assert table_test_table_version.format_version == 1

0 commit comments

Comments
 (0)