From fa9f25730f28bc0aa9adff4a47402e55695f802e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 14:18:53 +0800 Subject: [PATCH 01/15] Update DataFrameHtmlFormatter to enforce min_rows_display constraint and adjust default values --- python/datafusion/dataframe_formatter.py | 13 +++++++++---- python/tests/test_dataframe.py | 5 +++++ src/dataframe.rs | 8 ++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index bb53d323e..e7c2f0d81 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -126,7 +126,7 @@ class DataFrameHtmlFormatter: max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display + min_rows_display: Minimum number of rows to display (must be <= repr_rows) repr_rows: Default number of rows to display in repr output enable_cell_expansion: Whether to add expand/collapse buttons for long cell values @@ -143,7 +143,7 @@ def __init__( max_width: int = 1000, max_height: int = 300, max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 20, + min_rows_display: int = 10, repr_rows: int = 10, enable_cell_expansion: bool = True, custom_css: str | None = None, @@ -163,8 +163,9 @@ def __init__( Maximum height of the displayed table in pixels. max_memory_bytes : int, default 2097152 (2MB) Maximum memory in bytes for rendered data. - min_rows_display : int, default 20 - Minimum number of rows to display. + min_rows_display : int, default 10 + Minimum number of rows to display. Must be less than or equal to + ``repr_rows``. repr_rows : int, default 10 Default number of rows to display in repr output. enable_cell_expansion : bool, default True @@ -199,6 +200,10 @@ def __init__( _validate_positive_int(min_rows_display, "min_rows_display") _validate_positive_int(repr_rows, "repr_rows") + if min_rows_display > repr_rows: + msg = "min_rows_display must be less than or equal to repr_rows" + raise ValueError(msg) + # Validate boolean parameters _validate_bool(enable_cell_expansion, "enable_cell_expansion") _validate_bool(show_truncation_message, "show_truncation_message") diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 53a661969..6bd8c31b2 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1504,6 +1504,11 @@ def test_html_formatter_validation(): with pytest.raises(ValueError, match="repr_rows must be a positive integer"): DataFrameHtmlFormatter(repr_rows=-10) + with pytest.raises( + ValueError, match="min_rows_display must be less than or equal to repr_rows" + ): + DataFrameHtmlFormatter(min_rows_display=5, repr_rows=4) + def test_configure_formatter(df, clean_formatter_state): """Test using custom style providers with the HTML formatter and configured diff --git a/src/dataframe.rs b/src/dataframe.rs index 79b76779b..24111152e 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -72,7 +72,7 @@ type SharedCachedBatches = Arc>; pub struct FormatterConfig { /// Maximum memory in bytes to use for display (default: 2MB) pub max_bytes: usize, - /// Minimum number of rows to display (default: 20) + /// Minimum number of rows to display (default: 10) pub min_rows: usize, /// Number of rows to include in __repr__ output (default: 10) pub repr_rows: usize, @@ -82,7 +82,7 @@ impl Default for FormatterConfig { fn default() -> Self { Self { max_bytes: 2 * 1024 * 1024, // 2MB - min_rows: 20, + min_rows: 10, repr_rows: 10, } } @@ -107,6 +107,10 @@ impl FormatterConfig { return Err("repr_rows must be a positive integer".to_string()); } + if self.min_rows > self.repr_rows { + return Err("min_rows must be less than or equal to repr_rows".to_string()); + } + Ok(()) } } From 0563f6ca800ede74712cfc17d7aa8621c49ad90d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 14:32:57 +0800 Subject: [PATCH 02/15] Refactor DataFrame formatter to replace repr_rows with max_rows and update related validations --- .../source/user-guide/dataframe/rendering.rst | 4 +- python/datafusion/dataframe_formatter.py | 38 +++++++++++++------ python/tests/test_dataframe.py | 26 ++++++------- src/dataframe.rs | 32 +++++++++------- 4 files changed, 61 insertions(+), 39 deletions(-) diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 4c37c7471..07ab5e50a 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -58,7 +58,7 @@ You can customize how DataFrames are rendered by configuring the formatter: max_height=300, # Maximum height in pixels max_memory_bytes=2097152, # Maximum memory for rendering (2MB) min_rows_display=20, # Minimum number of rows to display - repr_rows=10, # Number of rows to display in __repr__ + max_rows=10, # Maximum rows to display in __repr__ enable_cell_expansion=True,# Allow expanding truncated cells custom_css=None, # Additional custom CSS show_truncation_message=True, # Show message when data is truncated @@ -191,7 +191,7 @@ You can control how much data is displayed and how much memory is used for rende configure_formatter( max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display min_rows_display=50, # Always show at least 50 rows - repr_rows=20 # Show 20 rows in __repr__ output + max_rows=20 # Show 20 rows in __repr__ output ) These parameters help balance comprehensive data display against performance considerations. diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index e7c2f0d81..843584fa3 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -126,8 +126,9 @@ class DataFrameHtmlFormatter: max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display (must be <= repr_rows) - repr_rows: Default number of rows to display in repr output + min_rows_display: Minimum number of rows to display (must be <= max_rows) + max_rows: Maximum number of rows to display in repr output + repr_rows: Deprecated alias for max_rows enable_cell_expansion: Whether to add expand/collapse buttons for long cell values custom_css: Additional CSS to include in the HTML output @@ -144,7 +145,8 @@ def __init__( max_height: int = 300, max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB min_rows_display: int = 10, - repr_rows: int = 10, + max_rows: int = 10, + repr_rows: int | None = None, enable_cell_expansion: bool = True, custom_css: str | None = None, show_truncation_message: bool = True, @@ -165,9 +167,11 @@ def __init__( Maximum memory in bytes for rendered data. min_rows_display : int, default 10 Minimum number of rows to display. Must be less than or equal to - ``repr_rows``. - repr_rows : int, default 10 - Default number of rows to display in repr output. + ``max_rows``. + max_rows : int, default 10 + Maximum number of rows to display in repr output. + repr_rows : int, optional + Deprecated alias for ``max_rows``. Use ``max_rows`` instead. enable_cell_expansion : bool, default True Whether to allow cells to expand when clicked. custom_css : str, optional @@ -184,7 +188,7 @@ def __init__( ------ ValueError If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display, or repr_rows is not a positive integer. + min_rows_display or max_rows is not a positive integer. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -198,10 +202,19 @@ def __init__( _validate_positive_int(max_height, "max_height") _validate_positive_int(max_memory_bytes, "max_memory_bytes") _validate_positive_int(min_rows_display, "min_rows_display") - _validate_positive_int(repr_rows, "repr_rows") - if min_rows_display > repr_rows: - msg = "min_rows_display must be less than or equal to repr_rows" + if repr_rows is not None and repr_rows != max_rows: + msg = "Specify only max_rows (repr_rows is deprecated)" + raise ValueError(msg) + + if repr_rows is not None: + _validate_positive_int(repr_rows, "repr_rows") + max_rows = repr_rows + + _validate_positive_int(max_rows, "max_rows") + + if min_rows_display > max_rows: + msg = "min_rows_display must be less than or equal to max_rows" raise ValueError(msg) # Validate boolean parameters @@ -224,7 +237,9 @@ def __init__( self.max_height = max_height self.max_memory_bytes = max_memory_bytes self.min_rows_display = min_rows_display - self.repr_rows = repr_rows + self.max_rows = max_rows + # Backwards-compatible alias + self.repr_rows = max_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css self.show_truncation_message = show_truncation_message @@ -665,6 +680,7 @@ def configure_formatter(**kwargs: Any) -> None: "max_height", "max_memory_bytes", "min_rows_display", + "max_rows", "repr_rows", "enable_cell_expansion", "custom_css", diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 6bd8c31b2..fcb1e17e0 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1458,15 +1458,15 @@ def test_html_formatter_memory(df, clean_formatter_state): assert "data truncated" not in html_output.lower() -def test_html_formatter_repr_rows(df, clean_formatter_state): - configure_formatter(min_rows_display=2, repr_rows=2) +def test_html_formatter_max_rows(df, clean_formatter_state): + configure_formatter(min_rows_display=2, max_rows=2) html_output = df._repr_html_() tr_count = count_table_rows(html_output) # Table should have header row (1) + 2 data rows = 3 rows assert tr_count == 3 - configure_formatter(min_rows_display=2, repr_rows=3) + configure_formatter(min_rows_display=2, max_rows=3) html_output = df._repr_html_() tr_count = count_table_rows(html_output) @@ -1498,16 +1498,16 @@ def test_html_formatter_validation(): with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): DataFrameHtmlFormatter(min_rows_display=-5) - with pytest.raises(ValueError, match="repr_rows must be a positive integer"): - DataFrameHtmlFormatter(repr_rows=0) + with pytest.raises(ValueError, match="max_rows must be a positive integer"): + DataFrameHtmlFormatter(max_rows=0) - with pytest.raises(ValueError, match="repr_rows must be a positive integer"): - DataFrameHtmlFormatter(repr_rows=-10) + with pytest.raises(ValueError, match="max_rows must be a positive integer"): + DataFrameHtmlFormatter(max_rows=-10) with pytest.raises( - ValueError, match="min_rows_display must be less than or equal to repr_rows" + ValueError, match="min_rows_display must be less than or equal to max_rows" ): - DataFrameHtmlFormatter(min_rows_display=5, repr_rows=4) + DataFrameHtmlFormatter(min_rows_display=5, max_rows=4) def test_configure_formatter(df, clean_formatter_state): @@ -1520,7 +1520,7 @@ def test_configure_formatter(df, clean_formatter_state): max_height = 30 max_memory_bytes = 3 * MB min_rows_display = 2 - repr_rows = 2 + max_rows = 2 enable_cell_expansion = False show_truncation_message = False use_shared_styles = False @@ -1533,7 +1533,7 @@ def test_configure_formatter(df, clean_formatter_state): assert formatter_default.max_height != max_height assert formatter_default.max_memory_bytes != max_memory_bytes assert formatter_default.min_rows_display != min_rows_display - assert formatter_default.repr_rows != repr_rows + assert formatter_default.max_rows != max_rows assert formatter_default.enable_cell_expansion != enable_cell_expansion assert formatter_default.show_truncation_message != show_truncation_message assert formatter_default.use_shared_styles != use_shared_styles @@ -1545,7 +1545,7 @@ def test_configure_formatter(df, clean_formatter_state): max_height=max_height, max_memory_bytes=max_memory_bytes, min_rows_display=min_rows_display, - repr_rows=repr_rows, + max_rows=max_rows, enable_cell_expansion=enable_cell_expansion, show_truncation_message=show_truncation_message, use_shared_styles=use_shared_styles, @@ -1556,7 +1556,7 @@ def test_configure_formatter(df, clean_formatter_state): assert formatter_custom.max_height == max_height assert formatter_custom.max_memory_bytes == max_memory_bytes assert formatter_custom.min_rows_display == min_rows_display - assert formatter_custom.repr_rows == repr_rows + assert formatter_custom.max_rows == max_rows assert formatter_custom.enable_cell_expansion == enable_cell_expansion assert formatter_custom.show_truncation_message == show_truncation_message assert formatter_custom.use_shared_styles == use_shared_styles diff --git a/src/dataframe.rs b/src/dataframe.rs index 24111152e..977b59261 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -74,8 +74,8 @@ pub struct FormatterConfig { pub max_bytes: usize, /// Minimum number of rows to display (default: 10) pub min_rows: usize, - /// Number of rows to include in __repr__ output (default: 10) - pub repr_rows: usize, + /// Maximum number of rows to include in __repr__ output (default: 10) + pub max_rows: usize, } impl Default for FormatterConfig { @@ -83,7 +83,7 @@ impl Default for FormatterConfig { Self { max_bytes: 2 * 1024 * 1024, // 2MB min_rows: 10, - repr_rows: 10, + max_rows: 10, } } } @@ -103,12 +103,12 @@ impl FormatterConfig { return Err("min_rows must be a positive integer".to_string()); } - if self.repr_rows == 0 { - return Err("repr_rows must be a positive integer".to_string()); + if self.max_rows == 0 { + return Err("max_rows must be a positive integer".to_string()); } - if self.min_rows > self.repr_rows { - return Err("min_rows must be less than or equal to repr_rows".to_string()); + if self.min_rows > self.max_rows { + return Err("min_rows must be less than or equal to max_rows".to_string()); } Ok(()) @@ -153,12 +153,18 @@ fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult< let default_config = FormatterConfig::default(); let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); - let repr_rows = get_attr(formatter, "repr_rows", default_config.repr_rows); + let max_rows = get_attr(formatter, "max_rows", default_config.max_rows); + let repr_rows = get_attr(formatter, "repr_rows", max_rows); + let max_rows = if repr_rows != max_rows { + repr_rows + } else { + max_rows + }; let config = FormatterConfig { max_bytes, min_rows, - repr_rows, + max_rows, }; // Return the validated config, converting String error to PyErr @@ -1344,7 +1350,7 @@ async fn collect_record_batches_to_display( let FormatterConfig { max_bytes, min_rows, - repr_rows, + max_rows, } = config; let partitioned_stream = df.execute_stream_partitioned().await?; @@ -1355,7 +1361,7 @@ async fn collect_record_batches_to_display( let mut has_more = false; // ensure minimum rows even if memory/row limits are hit - while (size_estimate_so_far < max_bytes && rows_so_far < repr_rows) || rows_so_far < min_rows { + while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { break; @@ -1385,8 +1391,8 @@ async fn collect_record_batches_to_display( } } - if rows_in_rb + rows_so_far > repr_rows { - rb = rb.slice(0, repr_rows - rows_so_far); + if rows_in_rb + rows_so_far > max_rows { + rb = rb.slice(0, max_rows - rows_so_far); has_more = true; } From 168eda8efc65799815d01bcc2b99d0dfc3b2fb6c Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 15:05:35 +0800 Subject: [PATCH 03/15] Add validation for formatter parameters and deprecate repr_rows alias --- python/datafusion/dataframe_formatter.py | 193 ++++++++++++++++++----- python/tests/test_dataframe.py | 22 +++ 2 files changed, 176 insertions(+), 39 deletions(-) diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index 843584fa3..c9cf4670c 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -18,6 +18,7 @@ from __future__ import annotations +import warnings from typing import ( TYPE_CHECKING, Any, @@ -61,6 +62,93 @@ def _validate_bool(value: Any, param_name: str) -> None: raise TypeError(msg) +def _validate_formatter_parameters( + max_cell_length: int, + max_width: int, + max_height: int, + max_memory_bytes: int, + min_rows_display: int, + max_rows: int | None, + repr_rows: int | None, + enable_cell_expansion: bool, + show_truncation_message: bool, + use_shared_styles: bool, + custom_css: str | None, + style_provider: Any, +) -> int: + """Validate all formatter parameters and return resolved max_rows value. + + Args: + max_cell_length: Maximum cell length value to validate + max_width: Maximum width value to validate + max_height: Maximum height value to validate + max_memory_bytes: Maximum memory bytes value to validate + min_rows_display: Minimum rows to display value to validate + max_rows: Maximum rows value to validate (None means use default) + repr_rows: Deprecated repr_rows value to validate + enable_cell_expansion: Boolean expansion flag to validate + show_truncation_message: Boolean message flag to validate + use_shared_styles: Boolean styles flag to validate + custom_css: Custom CSS string to validate + style_provider: Style provider object to validate + + Returns: + The resolved max_rows value after handling repr_rows deprecation + + Raises: + ValueError: If any numeric parameter is invalid or constraints are violated + TypeError: If any parameter has invalid type + DeprecationWarning: If repr_rows parameter is used + """ + # Validate numeric parameters + _validate_positive_int(max_cell_length, "max_cell_length") + _validate_positive_int(max_width, "max_width") + _validate_positive_int(max_height, "max_height") + _validate_positive_int(max_memory_bytes, "max_memory_bytes") + _validate_positive_int(min_rows_display, "min_rows_display") + + # Handle deprecated repr_rows parameter + if repr_rows is not None: + warnings.warn( + "repr_rows parameter is deprecated, use max_rows instead", + DeprecationWarning, + stacklevel=4, + ) + _validate_positive_int(repr_rows, "repr_rows") + if max_rows is not None and repr_rows != max_rows: + msg = "Cannot specify both repr_rows and max_rows; use max_rows only" + raise ValueError(msg) + max_rows = repr_rows + + # Use default if max_rows was not provided + if max_rows is None: + max_rows = 10 + + _validate_positive_int(max_rows, "max_rows") + + # Validate constraint: min_rows_display <= max_rows + if min_rows_display > max_rows: + msg = "min_rows_display must be less than or equal to max_rows" + raise ValueError(msg) + + # Validate boolean parameters + _validate_bool(enable_cell_expansion, "enable_cell_expansion") + _validate_bool(show_truncation_message, "show_truncation_message") + _validate_bool(use_shared_styles, "use_shared_styles") + + # Validate custom_css + if custom_css is not None and not isinstance(custom_css, str): + msg = "custom_css must be None or a string" + raise TypeError(msg) + + # Validate style_provider + if style_provider is not None and not isinstance(style_provider, StyleProvider): + msg = "style_provider must implement the StyleProvider protocol" + raise TypeError(msg) + + return max_rows + + @runtime_checkable class CellFormatter(Protocol): """Protocol for cell value formatters.""" @@ -145,7 +233,7 @@ def __init__( max_height: int = 300, max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB min_rows_display: int = 10, - max_rows: int = 10, + max_rows: int | None = None, repr_rows: int | None = None, enable_cell_expansion: bool = True, custom_css: str | None = None, @@ -196,50 +284,28 @@ def __init__( or if style_provider is provided but does not implement the StyleProvider protocol. """ - # Validate numeric parameters - _validate_positive_int(max_cell_length, "max_cell_length") - _validate_positive_int(max_width, "max_width") - _validate_positive_int(max_height, "max_height") - _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") - - if repr_rows is not None and repr_rows != max_rows: - msg = "Specify only max_rows (repr_rows is deprecated)" - raise ValueError(msg) - - if repr_rows is not None: - _validate_positive_int(repr_rows, "repr_rows") - max_rows = repr_rows - - _validate_positive_int(max_rows, "max_rows") - - if min_rows_display > max_rows: - msg = "min_rows_display must be less than or equal to max_rows" - raise ValueError(msg) - - # Validate boolean parameters - _validate_bool(enable_cell_expansion, "enable_cell_expansion") - _validate_bool(show_truncation_message, "show_truncation_message") - _validate_bool(use_shared_styles, "use_shared_styles") - - # Validate custom_css - if custom_css is not None and not isinstance(custom_css, str): - msg = "custom_css must be None or a string" - raise TypeError(msg) - - # Validate style_provider - if style_provider is not None and not isinstance(style_provider, StyleProvider): - msg = "style_provider must implement the StyleProvider protocol" - raise TypeError(msg) + # Validate all parameters and get resolved max_rows + resolved_max_rows = _validate_formatter_parameters( + max_cell_length, + max_width, + max_height, + max_memory_bytes, + min_rows_display, + max_rows, + repr_rows, + enable_cell_expansion, + show_truncation_message, + use_shared_styles, + custom_css, + style_provider, + ) self.max_cell_length = max_cell_length self.max_width = max_width self.max_height = max_height self.max_memory_bytes = max_memory_bytes self.min_rows_display = min_rows_display - self.max_rows = max_rows - # Backwards-compatible alias - self.repr_rows = max_rows + self._max_rows = resolved_max_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css self.show_truncation_message = show_truncation_message @@ -251,6 +317,55 @@ def __init__( self._custom_cell_builder: Callable[[Any, int, int, str], str] | None = None self._custom_header_builder: Callable[[Any], str] | None = None + @property + def max_rows(self) -> int: + """Get the maximum number of rows to display. + + Returns: + The maximum number of rows to display in repr output + """ + return self._max_rows + + @max_rows.setter + def max_rows(self, value: int) -> None: + """Set the maximum number of rows to display. + + Args: + value: The maximum number of rows + """ + self._max_rows = value + + @property + def repr_rows(self) -> int: + """Get the maximum number of rows (deprecated name). + + .. deprecated:: + Use :attr:`max_rows` instead. This property is provided for + backward compatibility. + + Returns: + The maximum number of rows to display + """ + return self._max_rows + + @repr_rows.setter + def repr_rows(self, value: int) -> None: + """Set the maximum number of rows using deprecated name. + + .. deprecated:: + Use :attr:`max_rows` setter instead. This property is provided for + backward compatibility. + + Args: + value: The maximum number of rows + """ + warnings.warn( + "repr_rows is deprecated, use max_rows instead", + DeprecationWarning, + stacklevel=2, + ) + self._max_rows = value + def register_formatter(self, type_class: type, formatter: CellFormatter) -> None: """Register a custom formatter for a specific data type. diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index fcb1e17e0..3a839b100 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1510,6 +1510,28 @@ def test_html_formatter_validation(): DataFrameHtmlFormatter(min_rows_display=5, max_rows=4) +def test_repr_rows_backward_compatibility(clean_formatter_state): + """Test that repr_rows parameter still works as deprecated alias.""" + # Should work when not conflicting with max_rows + with pytest.warns(DeprecationWarning, match="repr_rows parameter is deprecated"): + formatter = DataFrameHtmlFormatter(repr_rows=15, min_rows_display=10) + assert formatter.max_rows == 15 + assert formatter.repr_rows == 15 + + # Should fail when conflicting with max_rows + with pytest.raises( + ValueError, match="Cannot specify both repr_rows and max_rows" + ): + DataFrameHtmlFormatter(repr_rows=5, max_rows=10) + + # Setting repr_rows via property should warn + formatter2 = DataFrameHtmlFormatter() + with pytest.warns(DeprecationWarning, match="repr_rows is deprecated"): + formatter2.repr_rows = 7 + assert formatter2.max_rows == 7 + assert formatter2.repr_rows == 7 + + def test_configure_formatter(df, clean_formatter_state): """Test using custom style providers with the HTML formatter and configured parameters.""" From 0ad2621514d75dee363d2a03a439380d78b9476d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 15:14:10 +0800 Subject: [PATCH 04/15] Add boundary condition tests for HTML formatter memory limits and resolve max_rows logic --- python/tests/test_dataframe.py | 49 ++++++++++++++++++++++++++++++++++ src/dataframe.rs | 35 ++++++++++++++++++------ 2 files changed, 76 insertions(+), 8 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 3a839b100..b1fc9ef84 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1458,6 +1458,55 @@ def test_html_formatter_memory(df, clean_formatter_state): assert "data truncated" not in html_output.lower() +def test_html_formatter_memory_boundary_conditions(df, clean_formatter_state): + """Test memory limit behavior at boundary conditions. + + This test validates that the formatter correctly handles edge cases when + the memory limit is very close to actual data size, ensuring that min_rows + constraint is properly respected while respecting memory limits. + """ + # Get the raw size of the data to test boundary conditions + # First, capture output with no limits + configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1, max_rows=100) + unrestricted_output = df._repr_html_() + unrestricted_rows = count_table_rows(unrestricted_output) + + # Test 1: Very small memory limit should still respect min_rows + configure_formatter(max_memory_bytes=10, min_rows_display=1) + html_output = df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 2 # At least header + 1 data row (minimum) + # Should show truncation since we limited memory so aggressively + assert "data truncated" in html_output.lower() + + # Test 2: Memory limit at default size should work well + configure_formatter(max_memory_bytes=2 * MB, min_rows_display=1) + html_output = df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 2 # At least header + min_rows + + # Test 3: Very large memory limit should show all data + configure_formatter(max_memory_bytes=100 * MB, min_rows_display=1) + html_output = df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count == unrestricted_rows # Should show all rows + + # Test 4: Min rows should override memory limit + # With tiny memory and larger min_rows, min_rows should win + configure_formatter(max_memory_bytes=10, min_rows_display=2) + html_output = df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count >= 3 # At least header + 2 data rows (min_rows) + # Should show truncation message despite min_rows being satisfied + assert "data truncated" in html_output.lower() + + # Test 5: Default memory limit with different min_rows + configure_formatter(max_memory_bytes=2 * MB, min_rows_display=2, max_rows=2) + html_output = df._repr_html_() + tr_count = count_table_rows(html_output) + assert tr_count == 3 # header + 2 data rows + + def test_html_formatter_max_rows(df, clean_formatter_state): configure_formatter(min_rows_display=2, max_rows=2) html_output = df._repr_html_() diff --git a/src/dataframe.rs b/src/dataframe.rs index 977b59261..dbc8c6d30 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -148,18 +148,31 @@ where .unwrap_or_else(|_| default_value.clone()) } +/// Resolve the max_rows value, preferring repr_rows if it differs from the default. +/// +/// This function handles the transition from the deprecated `repr_rows` parameter +/// to the new `max_rows` parameter. It checks both attributes and uses `repr_rows` +/// if it has been explicitly set to a different value than `max_rows`. +fn resolve_max_rows(formatter: &Bound<'_, PyAny>, default: usize) -> usize { + let max_rows = get_attr(formatter, "max_rows", default); + let repr_rows = get_attr(formatter, "repr_rows", default); + + // If repr_rows differs from the default, it was explicitly set by the user + // (Python-side validation ensures only one is used, but we prefer repr_rows + // for backward compatibility in case it was set) + if repr_rows != default && repr_rows != max_rows { + repr_rows + } else { + max_rows + } +} + /// Helper function to create a FormatterConfig from a Python formatter object fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult { let default_config = FormatterConfig::default(); let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); - let max_rows = get_attr(formatter, "max_rows", default_config.max_rows); - let repr_rows = get_attr(formatter, "repr_rows", max_rows); - let max_rows = if repr_rows != max_rows { - repr_rows - } else { - max_rows - }; + let max_rows = resolve_max_rows(formatter, default_config.max_rows); let config = FormatterConfig { max_bytes, @@ -1360,7 +1373,10 @@ async fn collect_record_batches_to_display( let mut record_batches = Vec::default(); let mut has_more = false; - // ensure minimum rows even if memory/row limits are hit + // Collect rows until we hit a limit (memory or max_rows) OR reach the guaranteed minimum. + // The minimum rows constraint overrides both memory and row limits to ensure a baseline + // of data is always displayed, even if it temporarily exceeds those limits. + // This provides better UX by guaranteeing users see at least min_rows rows. while (size_estimate_so_far < max_bytes && rows_so_far < max_rows) || rows_so_far < min_rows { let mut rb = match stream.next().await { None => { @@ -1374,11 +1390,14 @@ async fn collect_record_batches_to_display( if rows_in_rb > 0 { size_estimate_so_far += rb.get_array_memory_size(); + // When memory limit is exceeded, scale back row count proportionally to stay within budget if size_estimate_so_far > max_bytes { let ratio = max_bytes as f32 / size_estimate_so_far as f32; let total_rows = rows_in_rb + rows_so_far; + // Calculate reduced rows maintaining the memory/data proportion let mut reduced_row_num = (total_rows as f32 * ratio).round() as usize; + // Ensure we always respect the minimum rows guarantee if reduced_row_num < min_rows { reduced_row_num = min_rows.min(total_rows); } From a7dfd3f7bacb3e52f349e359747ede798e2ff7c4 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 15:24:50 +0800 Subject: [PATCH 05/15] Remove repr_rows handling in max_rows resolution in Rust --- src/dataframe.rs | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/src/dataframe.rs b/src/dataframe.rs index dbc8c6d30..f97c93926 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -148,31 +148,12 @@ where .unwrap_or_else(|_| default_value.clone()) } -/// Resolve the max_rows value, preferring repr_rows if it differs from the default. -/// -/// This function handles the transition from the deprecated `repr_rows` parameter -/// to the new `max_rows` parameter. It checks both attributes and uses `repr_rows` -/// if it has been explicitly set to a different value than `max_rows`. -fn resolve_max_rows(formatter: &Bound<'_, PyAny>, default: usize) -> usize { - let max_rows = get_attr(formatter, "max_rows", default); - let repr_rows = get_attr(formatter, "repr_rows", default); - - // If repr_rows differs from the default, it was explicitly set by the user - // (Python-side validation ensures only one is used, but we prefer repr_rows - // for backward compatibility in case it was set) - if repr_rows != default && repr_rows != max_rows { - repr_rows - } else { - max_rows - } -} - /// Helper function to create a FormatterConfig from a Python formatter object fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult { let default_config = FormatterConfig::default(); let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); - let max_rows = resolve_max_rows(formatter, default_config.max_rows); + let max_rows = get_attr(formatter, "max_rows", default_config.max_rows); let config = FormatterConfig { max_bytes, From 61db037aa3c7872d4f718844fc1f7c28487b8b0f Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 16:13:33 +0800 Subject: [PATCH 06/15] Refactor whitespace in parameter validation and update test for HTML formatter memory limits --- python/datafusion/dataframe_formatter.py | 4 ++-- python/tests/test_dataframe.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index c9cf4670c..401102243 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -119,11 +119,11 @@ def _validate_formatter_parameters( msg = "Cannot specify both repr_rows and max_rows; use max_rows only" raise ValueError(msg) max_rows = repr_rows - + # Use default if max_rows was not provided if max_rows is None: max_rows = 10 - + _validate_positive_int(max_rows, "max_rows") # Validate constraint: min_rows_display <= max_rows diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index b1fc9ef84..f4ca9a44d 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1460,11 +1460,12 @@ def test_html_formatter_memory(df, clean_formatter_state): def test_html_formatter_memory_boundary_conditions(df, clean_formatter_state): """Test memory limit behavior at boundary conditions. - + This test validates that the formatter correctly handles edge cases when the memory limit is very close to actual data size, ensuring that min_rows constraint is properly respected while respecting memory limits. """ + # Get the raw size of the data to test boundary conditions # First, capture output with no limits configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1, max_rows=100) From 69bcf6f29ce312af882afdc76dfedf139bd81a9e Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Wed, 4 Feb 2026 16:19:49 +0800 Subject: [PATCH 07/15] ruff fix --- python/tests/test_dataframe.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index f4ca9a44d..32e174802 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1569,9 +1569,7 @@ def test_repr_rows_backward_compatibility(clean_formatter_state): assert formatter.repr_rows == 15 # Should fail when conflicting with max_rows - with pytest.raises( - ValueError, match="Cannot specify both repr_rows and max_rows" - ): + with pytest.raises(ValueError, match="Cannot specify both repr_rows and max_rows"): DataFrameHtmlFormatter(repr_rows=5, max_rows=10) # Setting repr_rows via property should warn From 399e3e22ba7472959e690eee4c57931234d83989 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 5 Feb 2026 14:27:18 +0800 Subject: [PATCH 08/15] Rename min_rows_display to min_rows in formatter configuration and update related tests --- .../source/user-guide/dataframe/rendering.rst | 6 +-- python/datafusion/dataframe_formatter.py | 26 ++++++------ python/tests/test_dataframe.py | 42 +++++++++---------- src/dataframe.rs | 2 +- 4 files changed, 38 insertions(+), 38 deletions(-) diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 07ab5e50a..9dea948bb 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -57,7 +57,7 @@ You can customize how DataFrames are rendered by configuring the formatter: max_width=1000, # Maximum width in pixels max_height=300, # Maximum height in pixels max_memory_bytes=2097152, # Maximum memory for rendering (2MB) - min_rows_display=20, # Minimum number of rows to display + min_rows=10, # Minimum number of rows to display max_rows=10, # Maximum rows to display in __repr__ enable_cell_expansion=True,# Allow expanding truncated cells custom_css=None, # Additional custom CSS @@ -190,8 +190,8 @@ You can control how much data is displayed and how much memory is used for rende configure_formatter( max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display - min_rows_display=50, # Always show at least 50 rows - max_rows=20 # Show 20 rows in __repr__ output + min_rows=20, # Always show at least 20 rows + max_rows=50 # Show up to 50 rows in output ) These parameters help balance comprehensive data display against performance considerations. diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index 401102243..f1a6907ba 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -67,7 +67,7 @@ def _validate_formatter_parameters( max_width: int, max_height: int, max_memory_bytes: int, - min_rows_display: int, + min_rows: int, max_rows: int | None, repr_rows: int | None, enable_cell_expansion: bool, @@ -83,7 +83,7 @@ def _validate_formatter_parameters( max_width: Maximum width value to validate max_height: Maximum height value to validate max_memory_bytes: Maximum memory bytes value to validate - min_rows_display: Minimum rows to display value to validate + min_rows: Minimum rows to display value to validate max_rows: Maximum rows value to validate (None means use default) repr_rows: Deprecated repr_rows value to validate enable_cell_expansion: Boolean expansion flag to validate @@ -105,7 +105,7 @@ def _validate_formatter_parameters( _validate_positive_int(max_width, "max_width") _validate_positive_int(max_height, "max_height") _validate_positive_int(max_memory_bytes, "max_memory_bytes") - _validate_positive_int(min_rows_display, "min_rows_display") + _validate_positive_int(min_rows, "min_rows") # Handle deprecated repr_rows parameter if repr_rows is not None: @@ -126,9 +126,9 @@ def _validate_formatter_parameters( _validate_positive_int(max_rows, "max_rows") - # Validate constraint: min_rows_display <= max_rows - if min_rows_display > max_rows: - msg = "min_rows_display must be less than or equal to max_rows" + # Validate constraint: min_rows <= max_rows + if min_rows > max_rows: + msg = "min_rows must be less than or equal to max_rows" raise ValueError(msg) # Validate boolean parameters @@ -214,7 +214,7 @@ class DataFrameHtmlFormatter: max_width: Maximum width of the HTML table in pixels max_height: Maximum height of the HTML table in pixels max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB) - min_rows_display: Minimum number of rows to display (must be <= max_rows) + min_rows: Minimum number of rows to display (must be <= max_rows) max_rows: Maximum number of rows to display in repr output repr_rows: Deprecated alias for max_rows enable_cell_expansion: Whether to add expand/collapse buttons for long cell @@ -232,7 +232,7 @@ def __init__( max_width: int = 1000, max_height: int = 300, max_memory_bytes: int = 2 * 1024 * 1024, # 2 MB - min_rows_display: int = 10, + min_rows: int = 10, max_rows: int | None = None, repr_rows: int | None = None, enable_cell_expansion: bool = True, @@ -253,7 +253,7 @@ def __init__( Maximum height of the displayed table in pixels. max_memory_bytes : int, default 2097152 (2MB) Maximum memory in bytes for rendered data. - min_rows_display : int, default 10 + min_rows : int, default 10 Minimum number of rows to display. Must be less than or equal to ``max_rows``. max_rows : int, default 10 @@ -276,7 +276,7 @@ def __init__( ------ ValueError If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows_display or max_rows is not a positive integer. + min_rows or max_rows is not a positive integer. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is not a boolean, @@ -290,7 +290,7 @@ def __init__( max_width, max_height, max_memory_bytes, - min_rows_display, + min_rows, max_rows, repr_rows, enable_cell_expansion, @@ -304,7 +304,7 @@ def __init__( self.max_width = max_width self.max_height = max_height self.max_memory_bytes = max_memory_bytes - self.min_rows_display = min_rows_display + self.min_rows = min_rows self._max_rows = resolved_max_rows self.enable_cell_expansion = enable_cell_expansion self.custom_css = custom_css @@ -794,7 +794,7 @@ def configure_formatter(**kwargs: Any) -> None: "max_width", "max_height", "max_memory_bytes", - "min_rows_display", + "min_rows", "max_rows", "repr_rows", "enable_cell_expansion", diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 32e174802..eba474940 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1438,7 +1438,7 @@ def get_header_style(self) -> str: def test_html_formatter_memory(df, clean_formatter_state): """Test the memory and row control parameters in DataFrameHtmlFormatter.""" - configure_formatter(max_memory_bytes=10, min_rows_display=1) + configure_formatter(max_memory_bytes=10, min_rows=1) html_output = df._repr_html_() # Count the number of table rows in the output @@ -1448,7 +1448,7 @@ def test_html_formatter_memory(df, clean_formatter_state): assert tr_count == 2 # 1 for header row, 1 for data row assert "data truncated" in html_output.lower() - configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1) + configure_formatter(max_memory_bytes=10 * MB, min_rows=1) html_output = df._repr_html_() # With larger memory limit and min_rows=2, should display all rows tr_count = count_table_rows(html_output) @@ -1468,12 +1468,12 @@ def test_html_formatter_memory_boundary_conditions(df, clean_formatter_state): # Get the raw size of the data to test boundary conditions # First, capture output with no limits - configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1, max_rows=100) + configure_formatter(max_memory_bytes=10 * MB, min_rows=1, max_rows=100) unrestricted_output = df._repr_html_() unrestricted_rows = count_table_rows(unrestricted_output) # Test 1: Very small memory limit should still respect min_rows - configure_formatter(max_memory_bytes=10, min_rows_display=1) + configure_formatter(max_memory_bytes=10, min_rows=1) html_output = df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count >= 2 # At least header + 1 data row (minimum) @@ -1481,20 +1481,20 @@ def test_html_formatter_memory_boundary_conditions(df, clean_formatter_state): assert "data truncated" in html_output.lower() # Test 2: Memory limit at default size should work well - configure_formatter(max_memory_bytes=2 * MB, min_rows_display=1) + configure_formatter(max_memory_bytes=2 * MB, min_rows=1) html_output = df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count >= 2 # At least header + min_rows # Test 3: Very large memory limit should show all data - configure_formatter(max_memory_bytes=100 * MB, min_rows_display=1) + configure_formatter(max_memory_bytes=100 * MB, min_rows=1) html_output = df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count == unrestricted_rows # Should show all rows # Test 4: Min rows should override memory limit # With tiny memory and larger min_rows, min_rows should win - configure_formatter(max_memory_bytes=10, min_rows_display=2) + configure_formatter(max_memory_bytes=10, min_rows=2) html_output = df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count >= 3 # At least header + 2 data rows (min_rows) @@ -1502,21 +1502,21 @@ def test_html_formatter_memory_boundary_conditions(df, clean_formatter_state): assert "data truncated" in html_output.lower() # Test 5: Default memory limit with different min_rows - configure_formatter(max_memory_bytes=2 * MB, min_rows_display=2, max_rows=2) + configure_formatter(max_memory_bytes=2 * MB, min_rows=2, max_rows=2) html_output = df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count == 3 # header + 2 data rows def test_html_formatter_max_rows(df, clean_formatter_state): - configure_formatter(min_rows_display=2, max_rows=2) + configure_formatter(min_rows=2, max_rows=2) html_output = df._repr_html_() tr_count = count_table_rows(html_output) # Table should have header row (1) + 2 data rows = 3 rows assert tr_count == 3 - configure_formatter(min_rows_display=2, max_rows=3) + configure_formatter(min_rows=2, max_rows=3) html_output = df._repr_html_() tr_count = count_table_rows(html_output) @@ -1542,11 +1542,11 @@ def test_html_formatter_validation(): with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"): DataFrameHtmlFormatter(max_memory_bytes=-100) - with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): - DataFrameHtmlFormatter(min_rows_display=0) + with pytest.raises(ValueError, match="min_rows must be a positive integer"): + DataFrameHtmlFormatter(min_rows=0) - with pytest.raises(ValueError, match="min_rows_display must be a positive integer"): - DataFrameHtmlFormatter(min_rows_display=-5) + with pytest.raises(ValueError, match="min_rows must be a positive integer"): + DataFrameHtmlFormatter(min_rows=-5) with pytest.raises(ValueError, match="max_rows must be a positive integer"): DataFrameHtmlFormatter(max_rows=0) @@ -1555,16 +1555,16 @@ def test_html_formatter_validation(): DataFrameHtmlFormatter(max_rows=-10) with pytest.raises( - ValueError, match="min_rows_display must be less than or equal to max_rows" + ValueError, match="min_rows must be less than or equal to max_rows" ): - DataFrameHtmlFormatter(min_rows_display=5, max_rows=4) + DataFrameHtmlFormatter(min_rows=5, max_rows=4) def test_repr_rows_backward_compatibility(clean_formatter_state): """Test that repr_rows parameter still works as deprecated alias.""" # Should work when not conflicting with max_rows with pytest.warns(DeprecationWarning, match="repr_rows parameter is deprecated"): - formatter = DataFrameHtmlFormatter(repr_rows=15, min_rows_display=10) + formatter = DataFrameHtmlFormatter(repr_rows=15, min_rows=10) assert formatter.max_rows == 15 assert formatter.repr_rows == 15 @@ -1589,7 +1589,7 @@ def test_configure_formatter(df, clean_formatter_state): max_width = 500 max_height = 30 max_memory_bytes = 3 * MB - min_rows_display = 2 + min_rows = 2 max_rows = 2 enable_cell_expansion = False show_truncation_message = False @@ -1602,7 +1602,7 @@ def test_configure_formatter(df, clean_formatter_state): assert formatter_default.max_width != max_width assert formatter_default.max_height != max_height assert formatter_default.max_memory_bytes != max_memory_bytes - assert formatter_default.min_rows_display != min_rows_display + assert formatter_default.min_rows != min_rows assert formatter_default.max_rows != max_rows assert formatter_default.enable_cell_expansion != enable_cell_expansion assert formatter_default.show_truncation_message != show_truncation_message @@ -1614,7 +1614,7 @@ def test_configure_formatter(df, clean_formatter_state): max_width=max_width, max_height=max_height, max_memory_bytes=max_memory_bytes, - min_rows_display=min_rows_display, + min_rows=min_rows, max_rows=max_rows, enable_cell_expansion=enable_cell_expansion, show_truncation_message=show_truncation_message, @@ -1625,7 +1625,7 @@ def test_configure_formatter(df, clean_formatter_state): assert formatter_custom.max_width == max_width assert formatter_custom.max_height == max_height assert formatter_custom.max_memory_bytes == max_memory_bytes - assert formatter_custom.min_rows_display == min_rows_display + assert formatter_custom.min_rows == min_rows assert formatter_custom.max_rows == max_rows assert formatter_custom.enable_cell_expansion == enable_cell_expansion assert formatter_custom.show_truncation_message == show_truncation_message diff --git a/src/dataframe.rs b/src/dataframe.rs index f97c93926..a3625b062 100644 --- a/src/dataframe.rs +++ b/src/dataframe.rs @@ -152,7 +152,7 @@ where fn build_formatter_config_from_python(formatter: &Bound<'_, PyAny>) -> PyResult { let default_config = FormatterConfig::default(); let max_bytes = get_attr(formatter, "max_memory_bytes", default_config.max_bytes); - let min_rows = get_attr(formatter, "min_rows_display", default_config.min_rows); + let min_rows = get_attr(formatter, "min_rows", default_config.min_rows); let max_rows = get_attr(formatter, "max_rows", default_config.max_rows); let config = FormatterConfig { From af3ef4bd0011432dee9c1ce8c46e4f48653e91db Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 5 Feb 2026 14:36:55 +0800 Subject: [PATCH 09/15] Refactor function parameter handling and documentation Removed type annotations and redundant default values from parameter names. Enhanced descriptions for clarity and added context for usage. Fixed formatting for the documentation sections to improve readability. --- python/datafusion/dataframe_formatter.py | 45 +++++++++++++----------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index f1a6907ba..b8af45a1b 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -245,43 +245,46 @@ def __init__( Parameters ---------- - max_cell_length : int, default 25 + max_cell_length Maximum length of cell content before truncation. - max_width : int, default 1000 + max_width Maximum width of the displayed table in pixels. - max_height : int, default 300 + max_height Maximum height of the displayed table in pixels. - max_memory_bytes : int, default 2097152 (2MB) - Maximum memory in bytes for rendered data. - min_rows : int, default 10 - Minimum number of rows to display. Must be less than or equal to - ``max_rows``. - max_rows : int, default 10 - Maximum number of rows to display in repr output. - repr_rows : int, optional + max_memory_bytes + Maximum memory in bytes for rendered data. Helps prevent performance + issues with large datasets. + min_rows + Minimum number of rows to display even if memory limit is reached. + Must not exceed ``max_rows``. + max_rows + Maximum number of rows to display. Takes precedence over memory limits + when fewer rows are requested. + repr_rows Deprecated alias for ``max_rows``. Use ``max_rows`` instead. - enable_cell_expansion : bool, default True + enable_cell_expansion Whether to allow cells to expand when clicked. - custom_css : str, optional + custom_css Custom CSS to apply to the HTML table. - show_truncation_message : bool, default True + show_truncation_message Whether to show a message indicating that content has been truncated. - style_provider : StyleProvider, optional + style_provider Provider of CSS styles for the HTML table. If None, DefaultStyleProvider is used. - use_shared_styles : bool, default True - Whether to use shared styles across multiple tables. + use_shared_styles + Whether to use shared styles across multiple tables. This improves + performance when displaying many DataFrames in a single notebook. Raises: ------ ValueError If max_cell_length, max_width, max_height, max_memory_bytes, - min_rows or max_rows is not a positive integer. + min_rows or max_rows is not a positive integer, or if min_rows + exceeds max_rows. TypeError If enable_cell_expansion, show_truncation_message, or use_shared_styles is - not a boolean, - or if custom_css is provided but is not a string, - or if style_provider is provided but does not implement the StyleProvider + not a boolean, or if custom_css is provided but is not a string, or if + style_provider is provided but does not implement the StyleProvider protocol. """ # Validate all parameters and get resolved max_rows From af11824072533d57810083ec50a1603fa034a800 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 5 Feb 2026 14:45:02 +0800 Subject: [PATCH 10/15] Update HTML formatter memory boundary tests for large datasets --- python/tests/test_dataframe.py | 45 ++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index eba474940..62a7da064 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1458,54 +1458,61 @@ def test_html_formatter_memory(df, clean_formatter_state): assert "data truncated" not in html_output.lower() -def test_html_formatter_memory_boundary_conditions(df, clean_formatter_state): - """Test memory limit behavior at boundary conditions. +def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_state): + """Test memory limit behavior at boundary conditions with large dataset. This test validates that the formatter correctly handles edge cases when - the memory limit is very close to actual data size, ensuring that min_rows - constraint is properly respected while respecting memory limits. + the memory limit is reached with a large dataset (100,000 rows), ensuring + that min_rows constraint is properly respected while respecting memory limits. + Uses large_df to actually test memory limit behavior with realistic data sizes. """ # Get the raw size of the data to test boundary conditions - # First, capture output with no limits - configure_formatter(max_memory_bytes=10 * MB, min_rows=1, max_rows=100) - unrestricted_output = df._repr_html_() + # First, capture output with no limits - use very high max_rows to avoid row limit + configure_formatter(max_memory_bytes=10 * MB, min_rows=1, max_rows=200000) + unrestricted_output = large_df._repr_html_() unrestricted_rows = count_table_rows(unrestricted_output) # Test 1: Very small memory limit should still respect min_rows + # With large dataset, this should definitely hit memory limit configure_formatter(max_memory_bytes=10, min_rows=1) - html_output = df._repr_html_() + html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count >= 2 # At least header + 1 data row (minimum) # Should show truncation since we limited memory so aggressively assert "data truncated" in html_output.lower() - # Test 2: Memory limit at default size should work well + # Test 2: Memory limit at default size (2MB) should truncate the large dataset configure_formatter(max_memory_bytes=2 * MB, min_rows=1) - html_output = df._repr_html_() + html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count >= 2 # At least header + min_rows + # Should be truncated since full dataset is much larger than 2MB + assert tr_count < unrestricted_rows - # Test 3: Very large memory limit should show all data - configure_formatter(max_memory_bytes=100 * MB, min_rows=1) - html_output = df._repr_html_() + # Test 3: Very large memory limit should show much more data + configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000) + html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) - assert tr_count == unrestricted_rows # Should show all rows + # Should show significantly more rows, possibly all + assert tr_count > 100 # Should show substantially more rows # Test 4: Min rows should override memory limit # With tiny memory and larger min_rows, min_rows should win configure_formatter(max_memory_bytes=10, min_rows=2) - html_output = df._repr_html_() + html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) assert tr_count >= 3 # At least header + 2 data rows (min_rows) # Should show truncation message despite min_rows being satisfied assert "data truncated" in html_output.lower() - # Test 5: Default memory limit with different min_rows - configure_formatter(max_memory_bytes=2 * MB, min_rows=2, max_rows=2) - html_output = df._repr_html_() + # Test 5: With reasonable memory and min_rows settings + configure_formatter(max_memory_bytes=2 * MB, min_rows=10, max_rows=200000) + html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) - assert tr_count == 3 # header + 2 data rows + assert tr_count >= 11 # header + at least 10 data rows (min_rows) + # Should be truncated due to memory limit + assert tr_count < unrestricted_rows def test_html_formatter_max_rows(df, clean_formatter_state): From 108599236b150df4b63e69973d49eb40e4ef6439 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 5 Feb 2026 14:47:03 +0800 Subject: [PATCH 11/15] Enhance memory boundary tests in HTML formatter for large datasets --- python/tests/test_dataframe.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index 62a7da064..e92e5ee89 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -1468,13 +1468,16 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta """ # Get the raw size of the data to test boundary conditions - # First, capture output with no limits - use very high max_rows to avoid row limit + # First, capture output with no limits + # NOTE: max_rows=200000 is set well above the dataset size (100k rows) to ensure + # we're testing memory limits, not row limits. Default max_rows=10 would + # truncate before memory limit is reached. configure_formatter(max_memory_bytes=10 * MB, min_rows=1, max_rows=200000) unrestricted_output = large_df._repr_html_() unrestricted_rows = count_table_rows(unrestricted_output) # Test 1: Very small memory limit should still respect min_rows - # With large dataset, this should definitely hit memory limit + # With large dataset, this should definitely hit memory limit before min_rows configure_formatter(max_memory_bytes=10, min_rows=1) html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) @@ -1483,6 +1486,8 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta assert "data truncated" in html_output.lower() # Test 2: Memory limit at default size (2MB) should truncate the large dataset + # Default max_rows would truncate at 10 rows, so we don't set it here to test + # that memory limit is respected even with default row limit configure_formatter(max_memory_bytes=2 * MB, min_rows=1) html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) @@ -1491,6 +1496,8 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta assert tr_count < unrestricted_rows # Test 3: Very large memory limit should show much more data + # NOTE: max_rows=200000 is critical here - without it, default max_rows=10 + # would limit output to 10 rows even though we have 100MB of memory available configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000) html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) @@ -1507,6 +1514,7 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta assert "data truncated" in html_output.lower() # Test 5: With reasonable memory and min_rows settings + # NOTE: max_rows=200000 ensures we test memory limit behavior, not row limit configure_formatter(max_memory_bytes=2 * MB, min_rows=10, max_rows=200000) html_output = large_df._repr_html_() tr_count = count_table_rows(html_output) From 6f92b3ddd1199a33ce0860a0e9cbc69cf4112226 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 5 Feb 2026 14:54:33 +0800 Subject: [PATCH 12/15] Add fixture for multi-batch DataFrame and test early stream termination with memory limits --- python/tests/test_dataframe.py | 89 ++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index e92e5ee89..c9acaf228 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -91,6 +91,39 @@ def large_df(): return ctx.from_arrow(batch) +@pytest.fixture +def large_multi_batch_df(): + """Create a DataFrame with multiple record batches for testing stream behavior. + + This fixture creates 10 batches of 10,000 rows each (100,000 rows total), + ensuring the DataFrame spans multiple batches. This is essential for testing + that memory limits actually cause early stream termination rather than + truncating all collected data. + """ + ctx = SessionContext() + + # Create multiple batches, each with 10,000 rows + batches = [] + rows_per_batch = 10000 + num_batches = 10 + + for batch_idx in range(num_batches): + start_row = batch_idx * rows_per_batch + end_row = start_row + rows_per_batch + data = { + "a": list(range(start_row, end_row)), + "b": [f"s-{i}" for i in range(start_row, end_row)], + "c": [float(i + 0.1) for i in range(start_row, end_row)], + } + batch = pa.record_batch(data) + batches.append(batch) + + # Register as record batches to maintain multi-batch structure + # Using [batches] wraps list in another list as required by register_record_batches + ctx.register_record_batches("large_multi_batch_data", [batches]) + return ctx.table("large_multi_batch_data") + + @pytest.fixture def struct_df(): ctx = SessionContext() @@ -1523,6 +1556,62 @@ def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_sta assert tr_count < unrestricted_rows +def test_html_formatter_stream_early_termination( + large_multi_batch_df, clean_formatter_state +): + """Test that memory limits cause early stream termination with multi-batch data. + + This test specifically validates that the formatter stops collecting data when + the memory limit is reached, rather than collecting all data and then truncating. + The large_multi_batch_df fixture creates 10 record batches, allowing us to verify + that not all batches are consumed when memory limit is hit. + + Key difference from test_html_formatter_memory_boundary_conditions: + - Uses multi-batch DataFrame to verify stream termination behavior + - Tests with memory limit exceeded by 2-3 batches but not 1 batch + - Verifies partial data + truncation message + respects min_rows + """ + + # Get baseline: how much data fits without memory limit + configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000) + unrestricted_output = large_multi_batch_df._repr_html_() + unrestricted_rows = count_table_rows(unrestricted_output) + + # Test 1: Memory limit exceeded by ~2 batches (each batch ~10k rows) + # With 1 batch (~1-2MB), we should have space. With 2-3 batches, we exceed limit. + # Set limit to ~3MB to ensure we collect ~1 batch before hitting limit + configure_formatter(max_memory_bytes=3 * MB, min_rows=1, max_rows=200000) + html_output = large_multi_batch_df._repr_html_() + tr_count = count_table_rows(html_output) + + # Should show significant truncation (not all 100k rows) + assert tr_count < unrestricted_rows, "Should be truncated by memory limit" + assert tr_count >= 2, "Should respect min_rows" + assert "data truncated" in html_output.lower(), "Should indicate truncation" + + # Test 2: Very tight memory limit should still respect min_rows + # Even with tiny memory (10 bytes), should show at least min_rows + configure_formatter(max_memory_bytes=10, min_rows=5, max_rows=200000) + html_output = large_multi_batch_df._repr_html_() + tr_count = count_table_rows(html_output) + + assert tr_count >= 6, "Should show header + at least min_rows (5)" + assert "data truncated" in html_output.lower(), "Should indicate truncation" + + # Test 3: Memory limit should take precedence over max_rows in early termination + # With max_rows=100 but small memory limit, should terminate early due to memory + configure_formatter(max_memory_bytes=2 * MB, min_rows=1, max_rows=100) + html_output = large_multi_batch_df._repr_html_() + tr_count = count_table_rows(html_output) + + # Should be truncated by memory limit (showing more than max_rows would suggest + # but less than unrestricted) + assert tr_count >= 2, "Should respect min_rows" + assert tr_count < unrestricted_rows, "Should be truncated" + # Output should indicate why truncation occurred + assert "data truncated" in html_output.lower() + + def test_html_formatter_max_rows(df, clean_formatter_state): configure_formatter(min_rows=2, max_rows=2) html_output = df._repr_html_() From 894bac02d00ab61e6d450e3a89754fdb3a580971 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Thu, 5 Feb 2026 15:26:03 +0800 Subject: [PATCH 13/15] Add backward compatibility tests for deprecated formatter attributes --- python/tests/test_dataframe.py | 47 ++++++++++++++++++++++++++++++++++ src/dataframe.rs | 19 +++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py index c9acaf228..f9645e8ff 100644 --- a/python/tests/test_dataframe.py +++ b/python/tests/test_dataframe.py @@ -3134,6 +3134,53 @@ def test_html_formatter_manual_format_html(clean_formatter_state): assert "