From 820f5ae0f78f8c5d0a2fc5d6385cbceb91f511a9 Mon Sep 17 00:00:00 2001 From: kosiew Date: Fri, 20 Jun 2025 18:35:27 +0800 Subject: [PATCH 1/6] docs: unify dataframe documentation (#2) --- docs/source/api/dataframe.rst | 235 ------------------ docs/source/api/index.rst | 2 - docs/source/index.rst | 9 +- .../{dataframe.rst => dataframe/index.rst} | 7 +- .../source/user-guide/dataframe/rendering.rst | 235 ++++++++++++++++++ 5 files changed, 242 insertions(+), 246 deletions(-) rename docs/source/user-guide/{dataframe.rst => dataframe/index.rst} (99%) create mode 100644 docs/source/user-guide/dataframe/rendering.rst diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst index a9e9e47c..6fecbce8 100644 --- a/docs/source/api/dataframe.rst +++ b/docs/source/api/dataframe.rst @@ -150,238 +150,3 @@ To materialize the results of your DataFrame operations: # Count rows count = df.count() -HTML Rendering in Jupyter -------------------------- - -When working in Jupyter notebooks or other environments that support rich HTML display, -DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality -is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. - -Basic HTML Rendering -~~~~~~~~~~~~~~~~~~~~ - -In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: - -.. code-block:: python - - # Will display as HTML table in Jupyter - df - - # Explicit display also uses HTML rendering - display(df) - -HTML Rendering Customization ----------------------------- - -DataFusion provides extensive customization options for HTML table rendering through the -``datafusion.html_formatter`` module. - -Configuring the HTML Formatter -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can customize how DataFrames are rendered by configuring the formatter: - -.. 
code-block:: python - - from datafusion.html_formatter import configure_formatter - - configure_formatter( - max_cell_length=30, # Maximum length of cell content before truncation - max_width=800, # Maximum width of table in pixels - max_height=400, # Maximum height of table in pixels - max_memory_bytes=2 * 1024 * 1024,# Maximum memory used for rendering (2MB) - min_rows_display=10, # Minimum rows to display - repr_rows=20, # Number of rows to display in representation - enable_cell_expansion=True, # Allow cells to be expandable on click - custom_css=None, # Custom CSS to apply - show_truncation_message=True, # Show message when data is truncated - style_provider=None, # Custom style provider class - use_shared_styles=True # Share styles across tables to reduce duplication - ) - -Custom Style Providers -~~~~~~~~~~~~~~~~~~~~~~ - -For advanced styling needs, you can create a custom style provider class: - -.. code-block:: python - - from datafusion.html_formatter import configure_formatter - - class CustomStyleProvider: - def get_cell_style(self) -> str: - return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;" - - def get_header_style(self) -> str: - return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;" - - # Apply custom styling - configure_formatter(style_provider=CustomStyleProvider()) - -Custom Type Formatters -~~~~~~~~~~~~~~~~~~~~~~ - -You can register custom formatters for specific data types: - -.. code-block:: python - - from datafusion.html_formatter import get_formatter - - formatter = get_formatter() - - # Format integers with color based on value - def format_int(value): - return f' 100 else "blue"}">{value}' - - formatter.register_formatter(int, format_int) - - # Format date values - def format_date(value): - return f'{value.isoformat()}' - - formatter.register_formatter(datetime.date, format_date) - -Custom Cell Builders -~~~~~~~~~~~~~~~~~~~~ - -For complete control over cell rendering: - -.. 
code-block:: python - - formatter = get_formatter() - - def custom_cell_builder(value, row, col, table_id): - try: - num_value = float(value) - if num_value > 0: # Positive values get green - return f'{value}' - if num_value < 0: # Negative values get red - return f'{value}' - except (ValueError, TypeError): - pass - - # Default styling for non-numeric or zero values - return f'{value}' - - formatter.set_custom_cell_builder(custom_cell_builder) - -Custom Header Builders -~~~~~~~~~~~~~~~~~~~~~~ - -Similarly, you can customize the rendering of table headers: - -.. code-block:: python - - def custom_header_builder(field): - tooltip = f"Type: {field.type}" - return f'{field.name}' - - formatter.set_custom_header_builder(custom_header_builder) - -Managing Formatter State ------------------------~ - -The HTML formatter maintains global state that can be managed: - -.. code-block:: python - - from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter - - # Reset the formatter to default settings - reset_formatter() - - # Reset only the styles loaded state (useful when styles were loaded but need reloading) - reset_styles_loaded_state() - - # Get the current formatter instance to make changes - formatter = get_formatter() - -Advanced Example: Dashboard-Style Formatting -------------------------------------------~~ - -This example shows how to create a dashboard-like styling for your DataFrames: - -.. 
code-block:: python - - from datafusion.html_formatter import configure_formatter, get_formatter - - # Define custom CSS - custom_css = """ - .datafusion-table { - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; - border-collapse: collapse; - width: 100%; - box-shadow: 0 2px 3px rgba(0,0,0,0.1); - } - .datafusion-table th { - position: sticky; - top: 0; - z-index: 10; - } - .datafusion-table tr:hover td { - background-color: #f1f7fa !important; - } - .datafusion-table .numeric-positive { - color: #0a7c00; - } - .datafusion-table .numeric-negative { - color: #d13438; - } - """ - - class DashboardStyleProvider: - def get_cell_style(self) -> str: - return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;" - - def get_header_style(self) -> str: - return ("background-color: #0078d4; color: white; font-weight: 600; " - "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;") - - # Apply configuration - configure_formatter( - max_height=500, - enable_cell_expansion=True, - custom_css=custom_css, - style_provider=DashboardStyleProvider(), - max_cell_length=50 - ) - - # Add custom formatters for numbers - formatter = get_formatter() - - def format_number(value): - try: - num = float(value) - cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else "" - return f'{value:,}' if cls else f'{value:,}' - except (ValueError, TypeError): - return str(value) - - formatter.register_formatter(int, format_number) - formatter.register_formatter(float, format_number) - -Best Practices --------------- - -1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage. - -2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens. - -3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables. - -4. 
**Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. - -5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. - -Additional Resources --------------------- - -* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames -* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources -* :doc:`../user-guide/data-sources` - Comprehensive data sources guide -* :ref:`io_csv` - CSV file reading -* :ref:`io_parquet` - Parquet file reading -* :ref:`io_json` - JSON file reading -* :ref:`io_avro` - Avro file reading -* :ref:`io_custom_table_provider` - Custom table providers -* `API Reference `_ - Full API reference diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 7f58227c..131b97e0 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -23,5 +23,3 @@ This section provides detailed API documentation for the DataFusion Python libra .. toctree:: :maxdepth: 2 - - dataframe diff --git a/docs/source/index.rst b/docs/source/index.rst index ff1e4728..9b3a2f7f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -72,7 +72,7 @@ Example user-guide/introduction user-guide/basics user-guide/data-sources - user-guide/dataframe + user-guide/dataframe/index user-guide/common-operations/index user-guide/io/index user-guide/configuration @@ -88,10 +88,3 @@ Example contributor-guide/introduction contributor-guide/ffi -.. _toc.api: -.. 
toctree:: - :hidden: - :maxdepth: 1 - :caption: API - - api/index diff --git a/docs/source/user-guide/dataframe.rst b/docs/source/user-guide/dataframe/index.rst similarity index 99% rename from docs/source/user-guide/dataframe.rst rename to docs/source/user-guide/dataframe/index.rst index 23c65b5f..78569f9f 100644 --- a/docs/source/user-guide/dataframe.rst +++ b/docs/source/user-guide/dataframe/index.rst @@ -215,4 +215,9 @@ You can control how much data is displayed and how much memory is used for rende repr_rows=20 # Show 20 rows in __repr__ output ) -These parameters help balance comprehensive data display against performance considerations. \ No newline at end of file +These parameters help balance comprehensive data display against performance considerations. + +.. toctree:: + :maxdepth: 1 + + rendering diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst new file mode 100644 index 00000000..8a97d6f1 --- /dev/null +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -0,0 +1,235 @@ +HTML Rendering in Jupyter +------------------------- + +When working in Jupyter notebooks or other environments that support rich HTML display, +DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality +is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. + +Basic HTML Rendering +~~~~~~~~~~~~~~~~~~~~ + +In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: + +.. code-block:: python + + # Will display as HTML table in Jupyter + df + + # Explicit display also uses HTML rendering + display(df) + +HTML Rendering Customization +---------------------------- + +DataFusion provides extensive customization options for HTML table rendering through the +``datafusion.html_formatter`` module. 
+ +Configuring the HTML Formatter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can customize how DataFrames are rendered by configuring the formatter: + +.. code-block:: python + + from datafusion.html_formatter import configure_formatter + + configure_formatter( + max_cell_length=30, # Maximum length of cell content before truncation + max_width=800, # Maximum width of table in pixels + max_height=400, # Maximum height of table in pixels + max_memory_bytes=2 * 1024 * 1024,# Maximum memory used for rendering (2MB) + min_rows_display=10, # Minimum rows to display + repr_rows=20, # Number of rows to display in representation + enable_cell_expansion=True, # Allow cells to be expandable on click + custom_css=None, # Custom CSS to apply + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom style provider class + use_shared_styles=True # Share styles across tables to reduce duplication + ) + +Custom Style Providers +~~~~~~~~~~~~~~~~~~~~~~ + +For advanced styling needs, you can create a custom style provider class: + +.. code-block:: python + + from datafusion.html_formatter import configure_formatter + + class CustomStyleProvider: + def get_cell_style(self) -> str: + return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;" + + def get_header_style(self) -> str: + return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;" + + # Apply custom styling + configure_formatter(style_provider=CustomStyleProvider()) + +Custom Type Formatters +~~~~~~~~~~~~~~~~~~~~~~ + +You can register custom formatters for specific data types: + +.. 
code-block:: python + + from datafusion.html_formatter import get_formatter + + formatter = get_formatter() + + # Format integers with color based on value + def format_int(value): + return f' 100 else "blue"}">{value}' + + formatter.register_formatter(int, format_int) + + # Format date values + def format_date(value): + return f'{value.isoformat()}' + + formatter.register_formatter(datetime.date, format_date) + +Custom Cell Builders +~~~~~~~~~~~~~~~~~~~~ + +For complete control over cell rendering: + +.. code-block:: python + + formatter = get_formatter() + + def custom_cell_builder(value, row, col, table_id): + try: + num_value = float(value) + if num_value > 0: # Positive values get green + return f'{value}' + if num_value < 0: # Negative values get red + return f'{value}' + except (ValueError, TypeError): + pass + + # Default styling for non-numeric or zero values + return f'{value}' + + formatter.set_custom_cell_builder(custom_cell_builder) + +Custom Header Builders +~~~~~~~~~~~~~~~~~~~~~~ + +Similarly, you can customize the rendering of table headers: + +.. code-block:: python + + def custom_header_builder(field): + tooltip = f"Type: {field.type}" + return f'{field.name}' + + formatter.set_custom_header_builder(custom_header_builder) + +Managing Formatter State +-----------------------~ + +The HTML formatter maintains global state that can be managed: + +.. code-block:: python + + from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter + + # Reset the formatter to default settings + reset_formatter() + + # Reset only the styles loaded state (useful when styles were loaded but need reloading) + reset_styles_loaded_state() + + # Get the current formatter instance to make changes + formatter = get_formatter() + +Advanced Example: Dashboard-Style Formatting +------------------------------------------~~ + +This example shows how to create a dashboard-like styling for your DataFrames: + +.. 
code-block:: python + + from datafusion.html_formatter import configure_formatter, get_formatter + + # Define custom CSS + custom_css = """ + .datafusion-table { + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; + border-collapse: collapse; + width: 100%; + box-shadow: 0 2px 3px rgba(0,0,0,0.1); + } + .datafusion-table th { + position: sticky; + top: 0; + z-index: 10; + } + .datafusion-table tr:hover td { + background-color: #f1f7fa !important; + } + .datafusion-table .numeric-positive { + color: #0a7c00; + } + .datafusion-table .numeric-negative { + color: #d13438; + } + """ + + class DashboardStyleProvider: + def get_cell_style(self) -> str: + return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;" + + def get_header_style(self) -> str: + return ("background-color: #0078d4; color: white; font-weight: 600; " + "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;") + + # Apply configuration + configure_formatter( + max_height=500, + enable_cell_expansion=True, + custom_css=custom_css, + style_provider=DashboardStyleProvider(), + max_cell_length=50 + ) + + # Add custom formatters for numbers + formatter = get_formatter() + + def format_number(value): + try: + num = float(value) + cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else "" + return f'{value:,}' if cls else f'{value:,}' + except (ValueError, TypeError): + return str(value) + + formatter.register_formatter(int, format_number) + formatter.register_formatter(float, format_number) + +Best Practices +-------------- + +1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage. + +2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens. + +3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables. + +4. 
**Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. + +5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full. + +Additional Resources +-------------------- + +* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames +* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources +* :doc:`../user-guide/data-sources` - Comprehensive data sources guide +* :ref:`io_csv` - CSV file reading +* :ref:`io_parquet` - Parquet file reading +* :ref:`io_json` - JSON file reading +* :ref:`io_avro` - Avro file reading +* :ref:`io_custom_table_provider` - Custom table providers +* `API Reference `_ - Full API reference From de7cb89a3694f182f212b00472f53fa8ca553448 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 20 Jun 2025 18:42:21 +0800 Subject: [PATCH 2/6] docs: update links in basics and rendering documentation --- docs/source/user-guide/basics.rst | 2 +- docs/source/user-guide/dataframe/rendering.rst | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst index 2975d9a6..7c682046 100644 --- a/docs/source/user-guide/basics.rst +++ b/docs/source/user-guide/basics.rst @@ -73,7 +73,7 @@ DataFrames are typically created by calling a method on :py:class:`~datafusion.c calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`, and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition. -For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe`. +For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe/index`. 
Expressions ----------- diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 8a97d6f1..9554b54d 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -224,9 +224,9 @@ Best Practices Additional Resources -------------------- -* :doc:`../user-guide/dataframe` - Complete guide to using DataFrames -* :doc:`../user-guide/io/index` - I/O Guide for reading data from various sources -* :doc:`../user-guide/data-sources` - Comprehensive data sources guide +* :doc:`../dataframe/index` - Complete guide to using DataFrames +* :doc:`../io/index` - I/O Guide for reading data from various sources +* :doc:`../data-sources` - Comprehensive data sources guide * :ref:`io_csv` - CSV file reading * :ref:`io_parquet` - Parquet file reading * :ref:`io_json` - JSON file reading From c6664970a204f06fede9f41e19e2ff12d156e94b Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 20 Jun 2025 18:44:25 +0800 Subject: [PATCH 3/6] docs: add API reference section to main index --- docs/source/api/index.rst | 2 ++ docs/source/index.rst | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst index 131b97e0..c6b17f64 100644 --- a/docs/source/api/index.rst +++ b/docs/source/api/index.rst @@ -23,3 +23,5 @@ This section provides detailed API documentation for the DataFusion Python libra .. toctree:: :maxdepth: 2 + + dataframe \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b3a2f7f..64de340a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -79,6 +79,15 @@ Example user-guide/sql +.. _toc.api: +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: API REFERENCE + + api/index + + .. _toc.contributor_guide: .. 
toctree:: :hidden: From 5ab5b988d0b4886610e29cce1230e664637ef7f1 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 20 Jun 2025 20:55:44 +0800 Subject: [PATCH 4/6] docs: add license information to rendering documentation --- docs/source/user-guide/dataframe/rendering.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 9554b54d..d6b7c34f 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -1,3 +1,20 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. 
+ HTML Rendering in Jupyter ------------------------- From ff5541ed67f396735262f45c35a563ddb8b86106 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Sat, 21 Jun 2025 11:04:54 +0800 Subject: [PATCH 5/6] Move API Reference under User Guide > Dataframe --- docs/source/api/dataframe.rst | 152 ---------- docs/source/api/index.rst | 27 -- docs/source/index.rst | 9 - .../user-guide/dataframe/api-reference.rst | 69 +++++ docs/source/user-guide/dataframe/index.rst | 260 +++++++----------- .../source/user-guide/dataframe/rendering.rst | 249 ++++++++--------- 6 files changed, 279 insertions(+), 487 deletions(-) delete mode 100644 docs/source/api/dataframe.rst delete mode 100644 docs/source/api/index.rst create mode 100644 docs/source/user-guide/dataframe/api-reference.rst diff --git a/docs/source/api/dataframe.rst b/docs/source/api/dataframe.rst deleted file mode 100644 index 6fecbce8..00000000 --- a/docs/source/api/dataframe.rst +++ /dev/null @@ -1,152 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -================= -DataFrame API -================= - -Overview --------- - -The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations -on that data. 
DataFrames provide a flexible API for transforming data through various operations such as -filtering, projection, aggregation, joining, and more. - -A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when -terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. - -Creating DataFrames -------------------- - -DataFrames can be created in several ways: - -* From SQL queries via a ``SessionContext``: - - .. code-block:: python - - from datafusion import SessionContext - - ctx = SessionContext() - df = ctx.sql("SELECT * FROM your_table") - -* From registered tables: - - .. code-block:: python - - df = ctx.table("your_table") - -* From various data sources: - - .. code-block:: python - - # From CSV files (see :ref:`io_csv` for detailed options) - df = ctx.read_csv("path/to/data.csv") - - # From Parquet files (see :ref:`io_parquet` for detailed options) - df = ctx.read_parquet("path/to/data.parquet") - - # From JSON files (see :ref:`io_json` for detailed options) - df = ctx.read_json("path/to/data.json") - - # From Avro files (see :ref:`io_avro` for detailed options) - df = ctx.read_avro("path/to/data.avro") - - # From Pandas DataFrame - import pandas as pd - pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - df = ctx.from_pandas(pandas_df) - - # From Arrow data - import pyarrow as pa - batch = pa.RecordBatch.from_arrays( - [pa.array([1, 2, 3]), pa.array([4, 5, 6])], - names=["a", "b"] - ) - df = ctx.from_arrow(batch) - - For detailed information about reading from different data sources, see the :doc:`I/O Guide <../user-guide/io/index>`. - For custom data sources, see :ref:`io_custom_table_provider`. - -Common DataFrame Operations ---------------------------- - -DataFusion's DataFrame API offers a wide range of operations: - -.. 
code-block:: python - - from datafusion import column, literal - - # Select specific columns - df = df.select("col1", "col2") - - # Select with expressions - df = df.select(column("a") + column("b"), column("a") - column("b")) - - # Filter rows - df = df.filter(column("age") > literal(25)) - - # Add computed columns - df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) - - # Multiple column additions - df = df.with_columns( - (column("a") + column("b")).alias("sum"), - (column("a") * column("b")).alias("product") - ) - - # Sort data - df = df.sort(column("age").sort(ascending=False)) - - # Join DataFrames - df = df1.join(df2, on="user_id", how="inner") - - # Aggregate data - from datafusion import functions as f - df = df.aggregate( - [], # Group by columns (empty for global aggregation) - [f.sum(column("amount")).alias("total_amount")] - ) - - # Limit rows - df = df.limit(100) - - # Drop columns - df = df.drop("temporary_column") - -Terminal Operations -------------------- - -To materialize the results of your DataFrame operations: - -.. code-block:: python - - # Collect all data as PyArrow RecordBatches - result_batches = df.collect() - - # Convert to various formats - pandas_df = df.to_pandas() # Pandas DataFrame - polars_df = df.to_polars() # Polars DataFrame - arrow_table = df.to_arrow_table() # PyArrow Table - py_dict = df.to_pydict() # Python dictionary - py_list = df.to_pylist() # Python list of dictionaries - - # Display results - df.show() # Print tabular format to console - - # Count rows - count = df.count() - diff --git a/docs/source/api/index.rst b/docs/source/api/index.rst deleted file mode 100644 index c6b17f64..00000000 --- a/docs/source/api/index.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. 
The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -============= -API Reference -============= - -This section provides detailed API documentation for the DataFusion Python library. - -.. toctree:: - :maxdepth: 2 - - dataframe \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 64de340a..9b3a2f7f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -79,15 +79,6 @@ Example user-guide/sql -.. _toc.api: -.. toctree:: - :hidden: - :maxdepth: 1 - :caption: API REFERENCE - - api/index - - .. _toc.contributor_guide: .. toctree:: :hidden: diff --git a/docs/source/user-guide/dataframe/api-reference.rst b/docs/source/user-guide/dataframe/api-reference.rst new file mode 100644 index 00000000..189456ea --- /dev/null +++ b/docs/source/user-guide/dataframe/api-reference.rst @@ -0,0 +1,69 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. 
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +DataFrame API Reference +======================= + +This page provides quick access to DataFusion's DataFrame API documentation. + +For comprehensive usage patterns and examples, see the main :doc:`DataFrame Guide <index>`. + +Core Classes +------------ + +**DataFrame** + The main DataFrame class for building and executing queries. + + See: :py:class:`datafusion.DataFrame` + +**SessionContext** + The primary entry point for creating DataFrames from various data sources. + + Key methods for DataFrame creation: + + * :py:meth:`~datafusion.SessionContext.read_csv` - Read CSV files + * :py:meth:`~datafusion.SessionContext.read_parquet` - Read Parquet files + * :py:meth:`~datafusion.SessionContext.read_json` - Read JSON files + * :py:meth:`~datafusion.SessionContext.read_avro` - Read Avro files + * :py:meth:`~datafusion.SessionContext.table` - Access registered tables + * :py:meth:`~datafusion.SessionContext.sql` - Execute SQL queries + * :py:meth:`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame + * :py:meth:`~datafusion.SessionContext.from_arrow` - Create from Arrow data + + See: :py:class:`datafusion.SessionContext` + +Expression Classes +------------------ + +**Expr** + Represents expressions that can be used in DataFrame operations. + + See: :py:class:`datafusion.Expr` + +**Functions for creating expressions:** + +* :py:func:`datafusion.column` - Reference a column by name +* :py:func:`datafusion.literal` - Create a literal value expression + +Built-in Functions +------------------ + +DataFusion provides many built-in functions for data manipulation: + +* :py:mod:`datafusion.functions` - Mathematical, string, date/time, and aggregation functions + +For a complete list of available functions, see the :py:mod:`datafusion.functions` module documentation. 
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst index 78569f9f..761bdbdd 100644 --- a/docs/source/user-guide/dataframe/index.rst +++ b/docs/source/user-guide/dataframe/index.rst @@ -21,203 +21,143 @@ DataFrames Overview -------- -DataFusion's DataFrame API provides a powerful interface for building and executing queries against data sources. -It offers a familiar API similar to pandas and other DataFrame libraries, but with the performance benefits of Rust -and Arrow. +The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations +on that data. DataFrames provide a flexible API for transforming data through various operations such as +filtering, projection, aggregation, joining, and more. -A DataFrame represents a logical plan that can be composed through operations like filtering, projection, and aggregation. -The actual execution happens when terminal operations like ``collect()`` or ``show()`` are called. +A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when +terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called. -Basic Usage ------------ +Creating DataFrames +------------------- -.. code-block:: python +DataFrames can be created in several ways: - import datafusion - from datafusion import col, lit +* From SQL queries via a ``SessionContext``: - # Create a context and register a data source - ctx = datafusion.SessionContext() - ctx.register_csv("my_table", "path/to/data.csv") - - # Create and manipulate a DataFrame - df = ctx.sql("SELECT * FROM my_table") - - # Or use the DataFrame API directly - df = (ctx.table("my_table") - .filter(col("age") > lit(25)) - .select([col("name"), col("age")])) - - # Execute and collect results - result = df.collect() - - # Display the first few rows - df.show() + .. 
code-block:: python -HTML Rendering --------------- + from datafusion import SessionContext + + ctx = SessionContext() + df = ctx.sql("SELECT * FROM your_table") -When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will -automatically display as formatted HTML tables, making it easier to visualize your data. +* From registered tables: -The ``_repr_html_`` method is called automatically by Jupyter to render a DataFrame. This method -controls how DataFrames appear in notebook environments, providing a richer visualization than -plain text output. + .. code-block:: python -Customizing HTML Rendering --------------------------- + df = ctx.table("your_table") -You can customize how DataFrames are rendered in HTML by configuring the formatter: +* From various data sources: -.. code-block:: python + .. code-block:: python - from datafusion.html_formatter import configure_formatter - - # Change the default styling - configure_formatter( - max_cell_length=25, # Maximum characters in a cell before truncation - max_width=1000, # Maximum width in pixels - max_height=300, # Maximum height in pixels - max_memory_bytes=2097152, # Maximum memory for rendering (2MB) - min_rows_display=20, # Minimum number of rows to display - repr_rows=10, # Number of rows to display in __repr__ - enable_cell_expansion=True,# Allow expanding truncated cells - custom_css=None, # Additional custom CSS - show_truncation_message=True, # Show message when data is truncated - style_provider=None, # Custom styling provider - use_shared_styles=True # Share styles across tables - ) + # From CSV files (see :ref:`io_csv` for detailed options) + df = ctx.read_csv("path/to/data.csv") + + # From Parquet files (see :ref:`io_parquet` for detailed options) + df = ctx.read_parquet("path/to/data.parquet") + + # From JSON files (see :ref:`io_json` for detailed options) + df = ctx.read_json("path/to/data.json") + + # From Avro files (see :ref:`io_avro` for detailed options) + 
df = ctx.read_avro("path/to/data.avro") + + # From Pandas DataFrame + import pandas as pd + pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = ctx.from_pandas(pandas_df) + + # From Arrow data + import pyarrow as pa + batch = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3]), pa.array([4, 5, 6])], + names=["a", "b"] + ) + df = ctx.from_arrow(batch) -The formatter settings affect all DataFrames displayed after configuration. +For detailed information about reading from different data sources, see the :doc:`I/O Guide <../io/index>`. +For custom data sources, see :ref:`io_custom_table_provider`. -Custom Style Providers ----------------------- +Common DataFrame Operations +--------------------------- -For advanced styling needs, you can create a custom style provider: +DataFusion's DataFrame API offers a wide range of operations: .. code-block:: python - from datafusion.html_formatter import StyleProvider, configure_formatter + from datafusion import column, literal - class MyStyleProvider(StyleProvider): - def get_table_styles(self): - return { - "table": "border-collapse: collapse; width: 100%;", - "th": "background-color: #007bff; color: white; padding: 8px; text-align: left;", - "td": "border: 1px solid #ddd; padding: 8px;", - "tr:nth-child(even)": "background-color: #f2f2f2;", - } - - def get_value_styles(self, dtype, value): - """Return custom styles for specific values""" - if dtype == "float" and value < 0: - return "color: red;" - return None + # Select specific columns + df = df.select("col1", "col2") - # Apply the custom style provider - configure_formatter(style_provider=MyStyleProvider()) - -Performance Optimization with Shared Styles -------------------------------------------- -The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying -multiple DataFrames in notebook environments: - -.. 
code-block:: python - - from datafusion.html_formatter import StyleProvider, configure_formatter - # Default: Use shared styles (recommended for notebooks) - configure_formatter(use_shared_styles=True) - - # Disable shared styles (each DataFrame includes its own styles) - configure_formatter(use_shared_styles=False) - -When ``use_shared_styles=True``: -- CSS styles and JavaScript are included only once per notebook session -- This reduces HTML output size and prevents style duplication -- Improves rendering performance with many DataFrames -- Applies consistent styling across all DataFrames - -Creating a Custom Formatter ---------------------------- - -For complete control over rendering, you can implement a custom formatter: - -.. code-block:: python - - from datafusion.html_formatter import Formatter, get_formatter + # Select with expressions + df = df.select(column("a") + column("b"), column("a") - column("b")) - class MyFormatter(Formatter): - def format_html(self, batches, schema, has_more=False, table_uuid=None): - # Create your custom HTML here - html = "
" - # ... formatting logic ... - html += "
" - return html + # Filter rows + df = df.filter(column("age") > literal(25)) - # Set as the global formatter - configure_formatter(formatter_class=MyFormatter) + # Add computed columns + df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name")) - # Or use the formatter just for specific operations - formatter = get_formatter() - custom_html = formatter.format_html(batches, schema) - -Managing Formatters -------------------- - -Reset to default formatting: - -.. code-block:: python - - from datafusion.html_formatter import reset_formatter + # Multiple column additions + df = df.with_columns( + (column("a") + column("b")).alias("sum"), + (column("a") * column("b")).alias("product") + ) - # Reset to default settings - reset_formatter() - -Get the current formatter settings: - -.. code-block:: python - - from datafusion.html_formatter import get_formatter + # Sort data + df = df.sort(column("age").sort(ascending=False)) - formatter = get_formatter() - print(formatter.max_rows) - print(formatter.theme) + # Join DataFrames + df = df1.join(df2, on="user_id", how="inner") + + # Aggregate data + from datafusion import functions as f + df = df.aggregate( + [], # Group by columns (empty for global aggregation) + [f.sum(column("amount")).alias("total_amount")] + ) + + # Limit rows + df = df.limit(100) + + # Drop columns + df = df.drop("temporary_column") -Contextual Formatting ---------------------- +Terminal Operations +------------------- -You can also use a context manager to temporarily change formatting settings: +To materialize the results of your DataFrame operations: .. 
code-block:: python - from datafusion.html_formatter import formatting_context + # Collect all data as PyArrow RecordBatches + result_batches = df.collect() - # Default formatting - df.show() + # Convert to various formats + pandas_df = df.to_pandas() # Pandas DataFrame + polars_df = df.to_polars() # Polars DataFrame + arrow_table = df.to_arrow_table() # PyArrow Table + py_dict = df.to_pydict() # Python dictionary + py_list = df.to_pylist() # Python list of dictionaries - # Temporarily use different formatting - with formatting_context(max_rows=100, theme="dark"): - df.show() # Will use the temporary settings + # Display results + df.show() # Print tabular format to console - # Back to default formatting - df.show() - -Memory and Display Controls ---------------------------- - -You can control how much data is displayed and how much memory is used for rendering: + # Count rows + count = df.count() - .. code-block:: python - - configure_formatter( - max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display - min_rows_display=50, # Always show at least 50 rows - repr_rows=20 # Show 20 rows in __repr__ output - ) +HTML Rendering +-------------- -These parameters help balance comprehensive data display against performance considerations. +When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will +automatically display as formatted HTML tables. For detailed information about customizing HTML +rendering, formatting options, and advanced styling, see :doc:`rendering`. .. toctree:: :maxdepth: 1 rendering + api-reference diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index d6b7c34f..4c37c747 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -16,14 +16,15 @@ .. under the License. 
HTML Rendering in Jupyter -------------------------- +========================= When working in Jupyter notebooks or other environments that support rich HTML display, DataFusion DataFrames automatically render as nicely formatted HTML tables. This functionality -is provided by the ``_repr_html_`` method, which is automatically called by Jupyter. +is provided by the ``_repr_html_`` method, which is automatically called by Jupyter to provide +a richer visualization than plain text output. Basic HTML Rendering -~~~~~~~~~~~~~~~~~~~~ +-------------------- In a Jupyter environment, simply displaying a DataFrame object will trigger HTML rendering: @@ -35,14 +36,14 @@ In a Jupyter environment, simply displaying a DataFrame object will trigger HTML # Explicit display also uses HTML rendering display(df) -HTML Rendering Customization ----------------------------- +Customizing HTML Rendering +--------------------------- DataFusion provides extensive customization options for HTML table rendering through the ``datafusion.html_formatter`` module. 
Configuring the HTML Formatter -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ You can customize how DataFrames are rendered by configuring the formatter: @@ -50,189 +51,159 @@ You can customize how DataFrames are rendered by configuring the formatter: from datafusion.html_formatter import configure_formatter + # Change the default styling configure_formatter( - max_cell_length=30, # Maximum length of cell content before truncation - max_width=800, # Maximum width of table in pixels - max_height=400, # Maximum height of table in pixels - max_memory_bytes=2 * 1024 * 1024,# Maximum memory used for rendering (2MB) - min_rows_display=10, # Minimum rows to display - repr_rows=20, # Number of rows to display in representation - enable_cell_expansion=True, # Allow cells to be expandable on click - custom_css=None, # Custom CSS to apply - show_truncation_message=True, # Show message when data is truncated - style_provider=None, # Custom style provider class - use_shared_styles=True # Share styles across tables to reduce duplication + max_cell_length=25, # Maximum characters in a cell before truncation + max_width=1000, # Maximum width in pixels + max_height=300, # Maximum height in pixels + max_memory_bytes=2097152, # Maximum memory for rendering (2MB) + min_rows_display=20, # Minimum number of rows to display + repr_rows=10, # Number of rows to display in __repr__ + enable_cell_expansion=True,# Allow expanding truncated cells + custom_css=None, # Additional custom CSS + show_truncation_message=True, # Show message when data is truncated + style_provider=None, # Custom styling provider + use_shared_styles=True # Share styles across tables ) +The formatter settings affect all DataFrames displayed after configuration. + Custom Style Providers -~~~~~~~~~~~~~~~~~~~~~~ +----------------------- -For advanced styling needs, you can create a custom style provider class: +For advanced styling needs, you can create a custom style provider: .. 
code-block:: python - from datafusion.html_formatter import configure_formatter - - class CustomStyleProvider: - def get_cell_style(self) -> str: - return "background-color: #f5f5f5; color: #333; padding: 8px; border: 1px solid #ddd;" + from datafusion.html_formatter import StyleProvider, configure_formatter - def get_header_style(self) -> str: - return "background-color: #4285f4; color: white; font-weight: bold; padding: 10px;" + class MyStyleProvider(StyleProvider): + def get_table_styles(self): + return { + "table": "border-collapse: collapse; width: 100%;", + "th": "background-color: #007bff; color: white; padding: 8px; text-align: left;", + "td": "border: 1px solid #ddd; padding: 8px;", + "tr:nth-child(even)": "background-color: #f2f2f2;", + } + + def get_value_styles(self, dtype, value): + """Return custom styles for specific values""" + if dtype == "float" and value < 0: + return "color: red;" + return None - # Apply custom styling - configure_formatter(style_provider=CustomStyleProvider()) + # Apply the custom style provider + configure_formatter(style_provider=MyStyleProvider()) -Custom Type Formatters -~~~~~~~~~~~~~~~~~~~~~~ +Performance Optimization with Shared Styles +-------------------------------------------- -You can register custom formatters for specific data types: +The ``use_shared_styles`` parameter (enabled by default) optimizes performance when displaying +multiple DataFrames in notebook environments: .. 
code-block:: python - from datafusion.html_formatter import get_formatter - - formatter = get_formatter() - - # Format integers with color based on value - def format_int(value): - return f'<span style="color: {"red" if value > 100 else "blue"}">{value}</span>' - - formatter.register_formatter(int, format_int) - - # Format date values - def format_date(value): - return f'{value.isoformat()}' - - formatter.register_formatter(datetime.date, format_date) + from datafusion.html_formatter import StyleProvider, configure_formatter + # Default: Use shared styles (recommended for notebooks) + configure_formatter(use_shared_styles=True) + + # Disable shared styles (each DataFrame includes its own styles) + configure_formatter(use_shared_styles=False) -Custom Cell Builders -~~~~~~~~~~~~~~~~~~~~ +When ``use_shared_styles=True``: +- CSS styles and JavaScript are included only once per notebook session +- This reduces HTML output size and prevents style duplication +- Improves rendering performance with many DataFrames +- Applies consistent styling across all DataFrames -For complete control over cell rendering: +Creating a Custom Formatter +---------------------------- + +For complete control over rendering, you can implement a custom formatter: .. code-block:: python - formatter = get_formatter() + from datafusion.html_formatter import Formatter, get_formatter + + class MyFormatter(Formatter): + def format_html(self, batches, schema, has_more=False, table_uuid=None): + # Create your custom HTML here + html = "
" + # ... formatting logic ... + html += "
" + return html - def custom_cell_builder(value, row, col, table_id): - try: - num_value = float(value) - if num_value > 0: # Positive values get green - return f'{value}' - if num_value < 0: # Negative values get red - return f'{value}' - except (ValueError, TypeError): - pass - - # Default styling for non-numeric or zero values - return f'{value}' + # Set as the global formatter + configure_formatter(formatter_class=MyFormatter) - formatter.set_custom_cell_builder(custom_cell_builder) + # Or use the formatter just for specific operations + formatter = get_formatter() + custom_html = formatter.format_html(batches, schema) -Custom Header Builders -~~~~~~~~~~~~~~~~~~~~~~ +Managing Formatters +------------------- -Similarly, you can customize the rendering of table headers: +Reset to default formatting: .. code-block:: python - def custom_header_builder(field): - tooltip = f"Type: {field.type}" - return f'{field.name}' + from datafusion.html_formatter import reset_formatter - formatter.set_custom_header_builder(custom_header_builder) - -Managing Formatter State ------------------------~ + # Reset to default settings + reset_formatter() -The HTML formatter maintains global state that can be managed: +Get the current formatter settings: .. 
code-block:: python - from datafusion.html_formatter import reset_formatter, reset_styles_loaded_state, get_formatter - - # Reset the formatter to default settings - reset_formatter() - - # Reset only the styles loaded state (useful when styles were loaded but need reloading) - reset_styles_loaded_state() + from datafusion.html_formatter import get_formatter - # Get the current formatter instance to make changes formatter = get_formatter() + print(formatter.max_rows) + print(formatter.theme) -Advanced Example: Dashboard-Style Formatting -------------------------------------------~~ +Contextual Formatting +---------------------- -This example shows how to create a dashboard-like styling for your DataFrames: +You can also use a context manager to temporarily change formatting settings: .. code-block:: python - from datafusion.html_formatter import configure_formatter, get_formatter + from datafusion.html_formatter import formatting_context - # Define custom CSS - custom_css = """ - .datafusion-table { - font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; - border-collapse: collapse; - width: 100%; - box-shadow: 0 2px 3px rgba(0,0,0,0.1); - } - .datafusion-table th { - position: sticky; - top: 0; - z-index: 10; - } - .datafusion-table tr:hover td { - background-color: #f1f7fa !important; - } - .datafusion-table .numeric-positive { - color: #0a7c00; - } - .datafusion-table .numeric-negative { - color: #d13438; - } - """ + # Default formatting + df.show() - class DashboardStyleProvider: - def get_cell_style(self) -> str: - return "padding: 8px 12px; border-bottom: 1px solid #e0e0e0;" - - def get_header_style(self) -> str: - return ("background-color: #0078d4; color: white; font-weight: 600; " - "padding: 12px; text-align: left; border-bottom: 2px solid #005a9e;") + # Temporarily use different formatting + with formatting_context(max_rows=100, theme="dark"): + df.show() # Will use the temporary settings - # Apply configuration + # Back to default formatting + 
df.show() + +Memory and Display Controls +--------------------------- + +You can control how much data is displayed and how much memory is used for rendering: + +.. code-block:: python + configure_formatter( - max_height=500, - enable_cell_expansion=True, - custom_css=custom_css, - style_provider=DashboardStyleProvider(), - max_cell_length=50 + max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display + min_rows_display=50, # Always show at least 50 rows + repr_rows=20 # Show 20 rows in __repr__ output ) - - # Add custom formatters for numbers - formatter = get_formatter() - - def format_number(value): - try: - num = float(value) - cls = "numeric-positive" if num > 0 else "numeric-negative" if num < 0 else "" - return f'{value:,}' if cls else f'{value:,}' - except (ValueError, TypeError): - return str(value) - - formatter.register_formatter(int, format_number) - formatter.register_formatter(float, format_number) + +These parameters help balance comprehensive data display against performance considerations. Best Practices -------------- -1. **Memory Management**: For large datasets, use ``max_memory_bytes`` to limit memory usage. +1. **Global Configuration**: Use ``configure_formatter()`` at the beginning of your notebook to set up consistent formatting for all DataFrames. -2. **Responsive Design**: Set reasonable ``max_width`` and ``max_height`` values to ensure tables display well on different screens. +2. **Memory Management**: Set appropriate ``max_memory_bytes`` limits to prevent performance issues with large datasets. -3. **Style Optimization**: Use ``use_shared_styles=True`` to avoid duplicate style definitions when displaying multiple tables. +3. **Shared Styles**: Keep ``use_shared_styles=True`` (default) for better performance in notebooks with multiple DataFrames. 4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings. 
From 2d5db1fbc122f0574e1dec4e52ba0d733cc86bdf Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 24 Jun 2025 18:48:59 -0400 Subject: [PATCH 6/6] Merge data from dataframe api reference page into main dataframe page --- docs/source/index.rst | 5 ++ .../user-guide/dataframe/api-reference.rst | 69 ------------------- docs/source/user-guide/dataframe/index.rst | 48 ++++++++++++- 3 files changed, 52 insertions(+), 70 deletions(-) delete mode 100644 docs/source/user-guide/dataframe/api-reference.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 9b3a2f7f..adec60f4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -88,3 +88,8 @@ Example contributor-guide/introduction contributor-guide/ffi +.. _toc.api: +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: API diff --git a/docs/source/user-guide/dataframe/api-reference.rst b/docs/source/user-guide/dataframe/api-reference.rst deleted file mode 100644 index 189456ea..00000000 --- a/docs/source/user-guide/dataframe/api-reference.rst +++ /dev/null @@ -1,69 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -.. or more contributor license agreements. See the NOTICE file -.. distributed with this work for additional information -.. regarding copyright ownership. The ASF licenses this file -.. to you under the Apache License, Version 2.0 (the -.. "License"); you may not use this file except in compliance -.. with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -.. software distributed under the License is distributed on an -.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -.. KIND, either express or implied. See the License for the -.. specific language governing permissions and limitations -.. under the License. - -DataFrame API Reference -======================= - -This page provides quick access to DataFusion's DataFrame API documentation. 
- -For comprehensive usage patterns and examples, see the main :doc:`DataFrame Guide <index>`. - -Core Classes ------------- - -**DataFrame** - The main DataFrame class for building and executing queries. - - See: :py:class:`datafusion.DataFrame` - -**SessionContext** - The primary entry point for creating DataFrames from various data sources. - - Key methods for DataFrame creation: - - * :py:meth:`~datafusion.SessionContext.read_csv` - Read CSV files - * :py:meth:`~datafusion.SessionContext.read_parquet` - Read Parquet files - * :py:meth:`~datafusion.SessionContext.read_json` - Read JSON files - * :py:meth:`~datafusion.SessionContext.read_avro` - Read Avro files - * :py:meth:`~datafusion.SessionContext.table` - Access registered tables - * :py:meth:`~datafusion.SessionContext.sql` - Execute SQL queries - * :py:meth:`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame - * :py:meth:`~datafusion.SessionContext.from_arrow` - Create from Arrow data - - See: :py:class:`datafusion.SessionContext` - -Expression Classes ------------------- - -**Expr** - Represents expressions that can be used in DataFrame operations. - - See: :py:class:`datafusion.Expr` - -**Functions for creating expressions:** - -* :py:func:`datafusion.column` - Reference a column by name -* :py:func:`datafusion.literal` - Create a literal value expression - -Built-in Functions ------------------- - -DataFusion provides many built-in functions for data manipulation: - -* :py:mod:`datafusion.functions` - Mathematical, string, date/time, and aggregation functions - -For a complete list of available functions, see the :py:mod:`datafusion.functions` module documentation. 
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst index 761bdbdd..f69485af 100644 --- a/docs/source/user-guide/dataframe/index.rst +++ b/docs/source/user-guide/dataframe/index.rst @@ -156,8 +156,54 @@ When working in Jupyter notebooks or other environments that support HTML render automatically display as formatted HTML tables. For detailed information about customizing HTML rendering, formatting options, and advanced styling, see :doc:`rendering`. +Core Classes +------------ + +**DataFrame** + The main DataFrame class for building and executing queries. + + See: :py:class:`datafusion.DataFrame` + +**SessionContext** + The primary entry point for creating DataFrames from various data sources. + + Key methods for DataFrame creation: + + * :py:meth:`~datafusion.SessionContext.read_csv` - Read CSV files + * :py:meth:`~datafusion.SessionContext.read_parquet` - Read Parquet files + * :py:meth:`~datafusion.SessionContext.read_json` - Read JSON files + * :py:meth:`~datafusion.SessionContext.read_avro` - Read Avro files + * :py:meth:`~datafusion.SessionContext.table` - Access registered tables + * :py:meth:`~datafusion.SessionContext.sql` - Execute SQL queries + * :py:meth:`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame + * :py:meth:`~datafusion.SessionContext.from_arrow` - Create from Arrow data + + See: :py:class:`datafusion.SessionContext` + +Expression Classes +------------------ + +**Expr** + Represents expressions that can be used in DataFrame operations. 
+ + See: :py:class:`datafusion.Expr` + +**Functions for creating expressions:** + +* :py:func:`datafusion.column` - Reference a column by name +* :py:func:`datafusion.literal` - Create a literal value expression + +Built-in Functions +------------------ + +DataFusion provides many built-in functions for data manipulation: + +* :py:mod:`datafusion.functions` - Mathematical, string, date/time, and aggregation functions + +For a complete list of available functions, see the :py:mod:`datafusion.functions` module documentation. + + .. toctree:: :maxdepth: 1 rendering - api-reference