diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst index 4c37c7471..9dea948bb 100644 --- a/docs/source/user-guide/dataframe/rendering.rst +++ b/docs/source/user-guide/dataframe/rendering.rst @@ -57,8 +57,8 @@ You can customize how DataFrames are rendered by configuring the formatter: max_width=1000, # Maximum width in pixels max_height=300, # Maximum height in pixels max_memory_bytes=2097152, # Maximum memory for rendering (2MB) - min_rows_display=20, # Minimum number of rows to display - repr_rows=10, # Number of rows to display in __repr__ + min_rows=10, # Minimum number of rows to display + max_rows=10, # Maximum rows to display in __repr__ enable_cell_expansion=True,# Allow expanding truncated cells custom_css=None, # Additional custom CSS show_truncation_message=True, # Show message when data is truncated @@ -190,8 +190,8 @@ You can control how much data is displayed and how much memory is used for rende configure_formatter( max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display - min_rows_display=50, # Always show at least 50 rows - repr_rows=20 # Show 20 rows in __repr__ output + min_rows=20, # Always show at least 20 rows + max_rows=50 # Show up to 50 rows in output ) These parameters help balance comprehensive data display against performance considerations. 
diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py
index bb53d323e..b8af45a1b 100644
--- a/python/datafusion/dataframe_formatter.py
+++ b/python/datafusion/dataframe_formatter.py
@@ -18,6 +18,7 @@
 
 from __future__ import annotations
 
+import warnings
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -61,6 +62,95 @@ def _validate_bool(value: Any, param_name: str) -> None:
         raise TypeError(msg)
 
 
+def _validate_formatter_parameters(
+    max_cell_length: int,
+    max_width: int,
+    max_height: int,
+    max_memory_bytes: int,
+    min_rows: int,
+    max_rows: int | None,
+    repr_rows: int | None,
+    enable_cell_expansion: bool,
+    show_truncation_message: bool,
+    use_shared_styles: bool,
+    custom_css: str | None,
+    style_provider: Any,
+) -> int:
+    """Validate all formatter parameters and return resolved max_rows value.
+
+    Args:
+        max_cell_length: Maximum cell length value to validate
+        max_width: Maximum width value to validate
+        max_height: Maximum height value to validate
+        max_memory_bytes: Maximum memory bytes value to validate
+        min_rows: Minimum rows to display value to validate
+        max_rows: Maximum rows value to validate (None means use default)
+        repr_rows: Deprecated repr_rows value to validate
+        enable_cell_expansion: Boolean expansion flag to validate
+        show_truncation_message: Boolean message flag to validate
+        use_shared_styles: Boolean styles flag to validate
+        custom_css: Custom CSS string to validate
+        style_provider: Style provider object to validate
+
+    Returns:
+        The resolved max_rows value after handling repr_rows deprecation
+
+    Raises:
+        ValueError: If any numeric parameter is invalid or constraints are violated
+        TypeError: If any parameter has invalid type
+
+    Warns:
+        DeprecationWarning: If repr_rows parameter is used
+    """
+    # Validate numeric parameters
+    _validate_positive_int(max_cell_length, "max_cell_length")
+    _validate_positive_int(max_width, "max_width")
+    _validate_positive_int(max_height, "max_height")
+    _validate_positive_int(max_memory_bytes, "max_memory_bytes")
+    _validate_positive_int(min_rows, "min_rows")
+
+    # Handle deprecated repr_rows parameter
+    if repr_rows is not None:
+        warnings.warn(
+            "repr_rows parameter is deprecated, use max_rows instead",
+            DeprecationWarning,
+            stacklevel=4,
+        )
+        _validate_positive_int(repr_rows, "repr_rows")
+        if max_rows is not None and repr_rows != max_rows:
+            msg = "Cannot specify both repr_rows and max_rows; use max_rows only"
+            raise ValueError(msg)
+        max_rows = repr_rows
+
+    # Use default if max_rows was not provided
+    if max_rows is None:
+        max_rows = 10
+
+    _validate_positive_int(max_rows, "max_rows")
+
+    # Validate constraint: min_rows <= max_rows
+    if min_rows > max_rows:
+        msg = "min_rows must be less than or equal to max_rows"
+        raise ValueError(msg)
+
+    # Validate boolean parameters
+    _validate_bool(enable_cell_expansion, "enable_cell_expansion")
+    _validate_bool(show_truncation_message, "show_truncation_message")
+    _validate_bool(use_shared_styles, "use_shared_styles")
+
+    # Validate custom_css
+    if custom_css is not None and not isinstance(custom_css, str):
+        msg = "custom_css must be None or a string"
+        raise TypeError(msg)
+
+    # Validate style_provider
+    if style_provider is not None and not isinstance(style_provider, StyleProvider):
+        msg = "style_provider must implement the StyleProvider protocol"
+        raise TypeError(msg)
+
+    return max_rows
+
+
 @runtime_checkable
 class CellFormatter(Protocol):
     """Protocol for cell value formatters."""
@@ -126,8 +216,9 @@ class DataFrameHtmlFormatter:
         max_width: Maximum width of the HTML table in pixels
         max_height: Maximum height of the HTML table in pixels
         max_memory_bytes: Maximum memory in bytes for rendered data (default: 2MB)
-        min_rows_display: Minimum number of rows to display
-        repr_rows: Default number of rows to display in repr output
+        min_rows: Minimum number of rows to display (must be <= max_rows)
+        max_rows: Maximum number of rows to display in repr output
+        repr_rows: Deprecated alias for max_rows
         enable_cell_expansion: Whether to add expand/collapse buttons for long cell
             values
         custom_css: Additional CSS to include in the HTML output
@@ -143,8 +234,9 @@ def __init__(
         max_width: int = 1000,
         max_height: int = 300,
         max_memory_bytes: int = 2 * 1024 * 1024,  # 2 MB
-        min_rows_display: int = 20,
-        repr_rows: int = 10,
+        min_rows: int = 10,
+        max_rows: int | None = None,
+        repr_rows: int | None = None,
         enable_cell_expansion: bool = True,
         custom_css: str | None = None,
         show_truncation_message: bool = True,
@@ -155,71 +247,70 @@ def __init__(
 
         Parameters
         ----------
-        max_cell_length : int, default 25
+        max_cell_length
             Maximum length of cell content before truncation.
-        max_width : int, default 1000
+        max_width
             Maximum width of the displayed table in pixels.
-        max_height : int, default 300
+        max_height
             Maximum height of the displayed table in pixels.
-        max_memory_bytes : int, default 2097152 (2MB)
-            Maximum memory in bytes for rendered data.
-        min_rows_display : int, default 20
-            Minimum number of rows to display.
-        repr_rows : int, default 10
-            Default number of rows to display in repr output.
-        enable_cell_expansion : bool, default True
+        max_memory_bytes
+            Maximum memory in bytes for rendered data. Helps prevent performance
+            issues with large datasets.
+        min_rows
+            Minimum number of rows to display even if memory limit is reached.
+            Must not exceed ``max_rows``.
+        max_rows
+            Maximum number of rows to display. Takes precedence over memory limits
+            when fewer rows are requested.
+        repr_rows
+            Deprecated alias for ``max_rows``. Use ``max_rows`` instead.
+        enable_cell_expansion
             Whether to allow cells to expand when clicked.
-        custom_css : str, optional
+        custom_css
             Custom CSS to apply to the HTML table.
-        show_truncation_message : bool, default True
+        show_truncation_message
             Whether to show a message indicating that content has been truncated.
-        style_provider : StyleProvider, optional
+        style_provider
             Provider of CSS styles for the HTML table. If None,
             DefaultStyleProvider is used.
-        use_shared_styles : bool, default True
-            Whether to use shared styles across multiple tables.
+        use_shared_styles
+            Whether to use shared styles across multiple tables. This improves
+            performance when displaying many DataFrames in a single notebook.
 
         Raises:
         ------
         ValueError
             If max_cell_length, max_width, max_height, max_memory_bytes,
-            min_rows_display, or repr_rows is not a positive integer.
+            min_rows or max_rows is not a positive integer, or if min_rows
+            exceeds max_rows.
         TypeError
             If enable_cell_expansion, show_truncation_message, or use_shared_styles is
-            not a boolean,
-            or if custom_css is provided but is not a string,
-            or if style_provider is provided but does not implement the StyleProvider
+            not a boolean, or if custom_css is provided but is not a string, or if
+            style_provider is provided but does not implement the StyleProvider
             protocol.
         """
-        # Validate numeric parameters
-        _validate_positive_int(max_cell_length, "max_cell_length")
-        _validate_positive_int(max_width, "max_width")
-        _validate_positive_int(max_height, "max_height")
-        _validate_positive_int(max_memory_bytes, "max_memory_bytes")
-        _validate_positive_int(min_rows_display, "min_rows_display")
-        _validate_positive_int(repr_rows, "repr_rows")
-
-        # Validate boolean parameters
-        _validate_bool(enable_cell_expansion, "enable_cell_expansion")
-        _validate_bool(show_truncation_message, "show_truncation_message")
-        _validate_bool(use_shared_styles, "use_shared_styles")
-
-        # Validate custom_css
-        if custom_css is not None and not isinstance(custom_css, str):
-            msg = "custom_css must be None or a string"
-            raise TypeError(msg)
-
-        # Validate style_provider
-        if style_provider is not None and not isinstance(style_provider, StyleProvider):
-            msg = "style_provider must implement the StyleProvider protocol"
-            raise TypeError(msg)
+        # Validate all parameters and get resolved max_rows
+        resolved_max_rows = _validate_formatter_parameters(
+            max_cell_length,
+            max_width,
+            max_height,
+            max_memory_bytes,
+            min_rows,
+            max_rows,
+            repr_rows,
+            enable_cell_expansion,
+            show_truncation_message,
+            use_shared_styles,
+            custom_css,
+            style_provider,
+        )
 
         self.max_cell_length = max_cell_length
         self.max_width = max_width
         self.max_height = max_height
         self.max_memory_bytes = max_memory_bytes
-        self.min_rows_display = min_rows_display
-        self.repr_rows = repr_rows
+        self.min_rows = min_rows
+        self._max_rows = resolved_max_rows
         self.enable_cell_expansion = enable_cell_expansion
         self.custom_css = custom_css
         self.show_truncation_message = show_truncation_message
@@ -231,6 +322,63 @@ def __init__(
         self._custom_cell_builder: Callable[[Any, int, int, str], str] | None = None
         self._custom_header_builder: Callable[[Any], str] | None = None
 
+    @property
+    def max_rows(self) -> int:
+        """Get the maximum number of rows to display.
+
+        Returns:
+            The maximum number of rows to display in repr output
+        """
+        return self._max_rows
+
+    @max_rows.setter
+    def max_rows(self, value: int) -> None:
+        """Set the maximum number of rows to display.
+
+        Args:
+            value: The maximum number of rows
+
+        Raises:
+            ValueError: If value is not a positive integer
+        """
+        _validate_positive_int(value, "max_rows")
+        self._max_rows = value
+
+    @property
+    def repr_rows(self) -> int:
+        """Get the maximum number of rows (deprecated name).
+
+        .. deprecated::
+            Use :attr:`max_rows` instead. This property is provided for
+            backward compatibility.
+
+        Returns:
+            The maximum number of rows to display
+        """
+        return self._max_rows
+
+    @repr_rows.setter
+    def repr_rows(self, value: int) -> None:
+        """Set the maximum number of rows using deprecated name.
+
+        .. deprecated::
+            Use :attr:`max_rows` setter instead. This property is provided for
+            backward compatibility.
+
+        Args:
+            value: The maximum number of rows
+
+        Raises:
+            ValueError: If value is not a positive integer
+        """
+        warnings.warn(
+            "repr_rows is deprecated, use max_rows instead",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        _validate_positive_int(value, "repr_rows")
+        self._max_rows = value
+
     def register_formatter(self, type_class: type, formatter: CellFormatter) -> None:
         """Register a custom formatter for a specific data type.
 
@@ -659,7 +807,8 @@ def configure_formatter(**kwargs: Any) -> None:
         "max_width",
         "max_height",
         "max_memory_bytes",
-        "min_rows_display",
+        "min_rows",
+        "max_rows",
         "repr_rows",
         "enable_cell_expansion",
         "custom_css",
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index 53a661969..71abe2925 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -91,6 +91,39 @@ def large_df():
     return ctx.from_arrow(batch)
 
 
+@pytest.fixture
+def large_multi_batch_df():
+    """Create a DataFrame with multiple record batches for testing stream behavior.
+
+    This fixture creates 10 batches of 10,000 rows each (100,000 rows total),
+    ensuring the DataFrame spans multiple batches. This is essential for testing
+    that memory limits actually cause early stream termination rather than
+    truncating all collected data.
+    """
+    ctx = SessionContext()
+
+    # Create multiple batches, each with 10,000 rows
+    batches = []
+    rows_per_batch = 10000
+    num_batches = 10
+
+    for batch_idx in range(num_batches):
+        start_row = batch_idx * rows_per_batch
+        end_row = start_row + rows_per_batch
+        data = {
+            "a": list(range(start_row, end_row)),
+            "b": [f"s-{i}" for i in range(start_row, end_row)],
+            "c": [float(i + 0.1) for i in range(start_row, end_row)],
+        }
+        batch = pa.record_batch(data)
+        batches.append(batch)
+
+    # Register as record batches to maintain multi-batch structure
+    # Using [batches] wraps list in another list as required by register_record_batches
+    ctx.register_record_batches("large_multi_batch_data", [batches])
+    return ctx.table("large_multi_batch_data")
+
+
 @pytest.fixture
 def struct_df():
     ctx = SessionContext()
@@ -1438,7 +1471,7 @@ def get_header_style(self) -> str:
 
 def test_html_formatter_memory(df, clean_formatter_state):
     """Test the memory and row control parameters in DataFrameHtmlFormatter."""
-    configure_formatter(max_memory_bytes=10, min_rows_display=1)
+    configure_formatter(max_memory_bytes=10, min_rows=1)
     html_output = df._repr_html_()
 
     # Count the number of table rows in the output
@@ -1448,7 +1481,7 @@ def test_html_formatter_memory(df, clean_formatter_state):
     assert tr_count == 2  # 1 for header row, 1 for data row
     assert "data truncated" in html_output.lower()
 
-    configure_formatter(max_memory_bytes=10 * MB, min_rows_display=1)
+    configure_formatter(max_memory_bytes=10 * MB, min_rows=1)
     html_output = df._repr_html_()
     # With larger memory limit and min_rows=2, should display all rows
     tr_count = count_table_rows(html_output)
@@ -1458,15 +1491,136 @@ def test_html_formatter_memory(df, clean_formatter_state):
     assert "data truncated" not in html_output.lower()
 
 
-def test_html_formatter_repr_rows(df, clean_formatter_state):
-    configure_formatter(min_rows_display=2, repr_rows=2)
+def test_html_formatter_memory_boundary_conditions(large_df, clean_formatter_state):
+    """Test memory limit behavior at boundary conditions with large dataset.
+
+    This test validates that the formatter correctly handles edge cases when
+    the memory limit is reached with a large dataset (100,000 rows), ensuring
+    that min_rows constraint is properly respected while respecting memory limits.
+    Uses large_df to actually test memory limit behavior with realistic data sizes.
+    """
+
+    # Get the raw size of the data to test boundary conditions
+    # First, capture output with no limits
+    # NOTE: max_rows=200000 is set well above the dataset size (100k rows) to ensure
+    # we're testing memory limits, not row limits. Default max_rows=10 would
+    # truncate before memory limit is reached.
+    configure_formatter(max_memory_bytes=10 * MB, min_rows=1, max_rows=200000)
+    unrestricted_output = large_df._repr_html_()
+    unrestricted_rows = count_table_rows(unrestricted_output)
+
+    # Test 1: Very small memory limit should still respect min_rows
+    # With large dataset, this should definitely hit memory limit before min_rows
+    configure_formatter(max_memory_bytes=10, min_rows=1)
+    html_output = large_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+    assert tr_count >= 2  # At least header + 1 data row (minimum)
+    # Should show truncation since we limited memory so aggressively
+    assert "data truncated" in html_output.lower()
+
+    # Test 2: Memory limit at default size (2MB) should truncate the large dataset
+    # Default max_rows would truncate at 10 rows, so we don't set it here to test
+    # that memory limit is respected even with default row limit
+    configure_formatter(max_memory_bytes=2 * MB, min_rows=1)
+    html_output = large_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+    assert tr_count >= 2  # At least header + min_rows
+    # Should be truncated since full dataset is much larger than 2MB
+    assert tr_count < unrestricted_rows
+
+    # Test 3: Very large memory limit should show much more data
+    # NOTE: max_rows=200000 is critical here - without it, default max_rows=10
+    # would limit output to 10 rows even though we have 100MB of memory available
+    configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000)
+    html_output = large_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+    # Should show significantly more rows, possibly all
+    assert tr_count > 100  # Should show substantially more rows
+
+    # Test 4: Min rows should override memory limit
+    # With tiny memory and larger min_rows, min_rows should win
+    configure_formatter(max_memory_bytes=10, min_rows=2)
+    html_output = large_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+    assert tr_count >= 3  # At least header + 2 data rows (min_rows)
+    # Should show truncation message despite min_rows being satisfied
+    assert "data truncated" in html_output.lower()
+
+    # Test 5: With reasonable memory and min_rows settings
+    # NOTE: max_rows=200000 ensures we test memory limit behavior, not row limit
+    configure_formatter(max_memory_bytes=2 * MB, min_rows=10, max_rows=200000)
+    html_output = large_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+    assert tr_count >= 11  # header + at least 10 data rows (min_rows)
+    # Should be truncated due to memory limit
+    assert tr_count < unrestricted_rows
+
+
+def test_html_formatter_stream_early_termination(
+    large_multi_batch_df, clean_formatter_state
+):
+    """Test that memory limits cause early stream termination with multi-batch data.
+
+    This test specifically validates that the formatter stops collecting data when
+    the memory limit is reached, rather than collecting all data and then truncating.
+    The large_multi_batch_df fixture creates 10 record batches, allowing us to verify
+    that not all batches are consumed when memory limit is hit.
+
+    Key difference from test_html_formatter_memory_boundary_conditions:
+    - Uses multi-batch DataFrame to verify stream termination behavior
+    - Tests with memory limit exceeded by 2-3 batches but not 1 batch
+    - Verifies partial data + truncation message + respects min_rows
+    """
+
+    # Get baseline: how much data fits without memory limit
+    configure_formatter(max_memory_bytes=100 * MB, min_rows=1, max_rows=200000)
+    unrestricted_output = large_multi_batch_df._repr_html_()
+    unrestricted_rows = count_table_rows(unrestricted_output)
+
+    # Test 1: Memory limit exceeded by ~2 batches (each batch ~10k rows)
+    # With 1 batch (~1-2MB), we should have space. With 2-3 batches, we exceed limit.
+    # Set limit to ~3MB to ensure we collect ~1 batch before hitting limit
+    configure_formatter(max_memory_bytes=3 * MB, min_rows=1, max_rows=200000)
+    html_output = large_multi_batch_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+
+    # Should show significant truncation (not all 100k rows)
+    assert tr_count < unrestricted_rows, "Should be truncated by memory limit"
+    assert tr_count >= 2, "Should respect min_rows"
+    assert "data truncated" in html_output.lower(), "Should indicate truncation"
+
+    # Test 2: Very tight memory limit should still respect min_rows
+    # Even with tiny memory (10 bytes), should show at least min_rows
+    configure_formatter(max_memory_bytes=10, min_rows=5, max_rows=200000)
+    html_output = large_multi_batch_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+
+    assert tr_count >= 6, "Should show header + at least min_rows (5)"
+    assert "data truncated" in html_output.lower(), "Should indicate truncation"
+
+    # Test 3: Memory limit should take precedence over max_rows in early termination
+    # With max_rows=100 but small memory limit, should terminate early due to memory
+    configure_formatter(max_memory_bytes=2 * MB, min_rows=1, max_rows=100)
+    html_output = large_multi_batch_df._repr_html_()
+    tr_count = count_table_rows(html_output)
+
+    # Should be truncated by memory limit (showing more than max_rows would suggest
+    # but less than unrestricted)
+    assert tr_count >= 2, "Should respect min_rows"
+    assert tr_count < unrestricted_rows, "Should be truncated"
+    # Output should indicate why truncation occurred
+    assert "data truncated" in html_output.lower()
+
+
+def test_html_formatter_max_rows(df, clean_formatter_state):
+    configure_formatter(min_rows=2, max_rows=2)
     html_output = df._repr_html_()
 
     tr_count = count_table_rows(html_output)
     # Table should have header row (1) + 2 data rows = 3 rows
     assert tr_count == 3
 
-    configure_formatter(min_rows_display=2, repr_rows=3)
+    configure_formatter(min_rows=2, max_rows=3)
     html_output = df._repr_html_()
 
     tr_count = count_table_rows(html_output)
@@ -1492,17 +1646,57 @@ def test_html_formatter_validation():
     with pytest.raises(ValueError, match="max_memory_bytes must be a positive integer"):
         DataFrameHtmlFormatter(max_memory_bytes=-100)
 
-    with pytest.raises(ValueError, match="min_rows_display must be a positive integer"):
-        DataFrameHtmlFormatter(min_rows_display=0)
+    with pytest.raises(ValueError, match="min_rows must be a positive integer"):
+        DataFrameHtmlFormatter(min_rows=0)
+
+    with pytest.raises(ValueError, match="min_rows must be a positive integer"):
+        DataFrameHtmlFormatter(min_rows=-5)
 
-    with pytest.raises(ValueError, match="min_rows_display must be a positive integer"):
-        DataFrameHtmlFormatter(min_rows_display=-5)
+    with pytest.raises(ValueError, match="max_rows must be a positive integer"):
+        DataFrameHtmlFormatter(max_rows=0)
+
+    with pytest.raises(ValueError, match="max_rows must be a positive integer"):
+        DataFrameHtmlFormatter(max_rows=-10)
+
+    with pytest.raises(
+        ValueError, match="min_rows must be less than or equal to max_rows"
+    ):
+        DataFrameHtmlFormatter(min_rows=5, max_rows=4)
 
-    with pytest.raises(ValueError, match="repr_rows must be a positive integer"):
-        DataFrameHtmlFormatter(repr_rows=0)
 
-    with pytest.raises(ValueError, match="repr_rows must be a positive integer"):
-        DataFrameHtmlFormatter(repr_rows=-10)
+def test_repr_rows_backward_compatibility(clean_formatter_state):
+    """Test that repr_rows parameter still works as deprecated alias."""
+    # Should work when not conflicting with max_rows
+    with pytest.warns(DeprecationWarning, match="repr_rows parameter is deprecated"):
+        formatter = DataFrameHtmlFormatter(repr_rows=15, min_rows=10)
+    assert formatter.max_rows == 15
+    assert formatter.repr_rows == 15
+
+    # Should fail when conflicting with max_rows
+    with pytest.raises(ValueError, match="Cannot specify both repr_rows and max_rows"):
+        DataFrameHtmlFormatter(repr_rows=5, max_rows=10)
+
+    # Setting repr_rows via property should warn
+    formatter2 = DataFrameHtmlFormatter()
+    with pytest.warns(DeprecationWarning, match="repr_rows is deprecated"):
+        formatter2.repr_rows = 7
+    assert formatter2.max_rows == 7
+    assert formatter2.repr_rows == 7
+
+
+def test_max_rows_setter_validation():
+    """Test that max_rows and repr_rows setters reject non-positive values."""
+    formatter = DataFrameHtmlFormatter()
+
+    with pytest.raises(ValueError, match="max_rows must be a positive integer"):
+        formatter.max_rows = 0
+
+    with pytest.warns(DeprecationWarning, match="repr_rows is deprecated"):
+        with pytest.raises(ValueError, match="repr_rows must be a positive integer"):
+            formatter.repr_rows = -1
+
+    # Rejected assignments must leave the previous value untouched
+    assert formatter.max_rows == 10
 
 
 def test_configure_formatter(df, clean_formatter_state):
@@ -1514,8 +1708,8 @@ def test_configure_formatter(df, clean_formatter_state):
     max_width = 500
     max_height = 30
     max_memory_bytes = 3 * MB
-    min_rows_display = 2
-    repr_rows = 2
+    min_rows = 2
+    max_rows = 2
     enable_cell_expansion = False
     show_truncation_message = False
     use_shared_styles = False
@@ -1527,8 +1721,8 @@ def test_configure_formatter(df, clean_formatter_state):
     assert formatter_default.max_width != max_width
     assert formatter_default.max_height != max_height
     assert formatter_default.max_memory_bytes != max_memory_bytes
-    assert formatter_default.min_rows_display != min_rows_display
-    assert formatter_default.repr_rows != repr_rows
+    assert formatter_default.min_rows != min_rows
+    assert formatter_default.max_rows != max_rows
     assert formatter_default.enable_cell_expansion != enable_cell_expansion
     assert formatter_default.show_truncation_message != show_truncation_message
     assert formatter_default.use_shared_styles != use_shared_styles
@@ -1539,8 +1733,8 @@ def test_configure_formatter(df, clean_formatter_state):
         max_width=max_width,
         max_height=max_height,
         max_memory_bytes=max_memory_bytes,
-        min_rows_display=min_rows_display,
-        repr_rows=repr_rows,
+        min_rows=min_rows,
+        max_rows=max_rows,
         enable_cell_expansion=enable_cell_expansion,
         show_truncation_message=show_truncation_message,
         use_shared_styles=use_shared_styles,
@@ -1550,8 +1744,8 @@ def test_configure_formatter(df, clean_formatter_state):
     assert formatter_custom.max_width == max_width
     assert formatter_custom.max_height == max_height
     assert formatter_custom.max_memory_bytes == max_memory_bytes
-    assert formatter_custom.min_rows_display == min_rows_display
-    assert formatter_custom.repr_rows == repr_rows
+    assert formatter_custom.min_rows == min_rows
+    assert formatter_custom.max_rows == max_rows
     assert formatter_custom.enable_cell_expansion == enable_cell_expansion
     assert formatter_custom.show_truncation_message == show_truncation_message
     assert formatter_custom.use_shared_styles == use_shared_styles
@@ -2955,6 +3149,47 @@ def test_html_formatter_manual_format_html(clean_formatter_state):
     assert "