diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a49564649cf..051a783c742 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,13 @@ New Features By `Ian Hunt-Isaak `_. +- Added a ``create_index`` parameter to :py:meth:`Dataset.to_dataframe`, :py:meth:`DataArray.to_dataframe`, + :py:meth:`Dataset.to_dask_dataframe`, and :py:meth:`DataArray.to_dask_dataframe`. + With ``create_index=False``, ``to_dataframe`` returns a DataFrame with a plain :py:class:`pandas.RangeIndex` + instead of an index built from the dimension coordinates, and ``to_dask_dataframe`` omits the dimension + coordinate columns; both skip work that is wasted when the dimension labels are not needed. + By `Sanjay Kumar `_. + Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index fcfa0317131..0f4a31621e3 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame: return pandas_object def to_dataframe( - self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None + self, + name: Hashable | None = None, + dim_order: Sequence[Hashable] | None = None, + create_index: bool = True, ) -> pd.DataFrame: """Convert this array and its coordinates into a tidy pandas.DataFrame. @@ -3979,6 +3982,11 @@ def to_dataframe( If provided, must include all dimensions of this DataArray. By default, dimensions are sorted according to the DataArray dimensions order. + create_index : bool, default: True + If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product + of this DataArray's indices. If False, use a :py:class:`pandas.RangeIndex` instead. + This can be useful to avoid the potentially expensive MultiIndex + creation. 
Returns ------- @@ -4013,7 +4021,7 @@ def to_dataframe( else: ordered_dims = ds._normalize_dim_order(dim_order=dim_order) - df = ds._to_dataframe(ordered_dims) + df = ds._to_dataframe(ordered_dims, create_index=create_index) df.columns = [name if c == unique_name else c for c in df.columns] return df @@ -7599,6 +7607,7 @@ def to_dask_dataframe( self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False, + create_index: bool = True, ) -> DaskDataFrame: """Convert this array into a dask.dataframe.DataFrame. @@ -7614,6 +7623,14 @@ def to_dask_dataframe( If set_index=True, the dask DataFrame is indexed by this dataset's coordinate. Since dask DataFrames do not support multi-indexes, set_index only works if the dataset only contains one dimension. + create_index : bool, default: True + If ``create_index=True`` (default), dimension coordinates will be included + as columns in the resulting DataFrame. If ``create_index=False``, dimension + coordinates will be excluded, leaving only data variables and non-dimension + coordinates. This can improve performance and reduce memory usage when dimension + information is not needed. ``create_index=False`` is incompatible with ``set_index=True``. + + .. versionadded:: 2025.01.1 Returns ------- @@ -7658,7 +7675,7 @@ def to_dask_dataframe( ) name = self.name ds = self._to_dataset_whole(name, shallow_copy=False) - return ds.to_dask_dataframe(dim_order, set_index) + return ds.to_dask_dataframe(dim_order, set_index, create_index) # this needs to be at the end, or mypy will confuse with `str` # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 01baa9aed3d..3e34e103615 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -7198,7 +7198,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame: "Please use Dataset.to_dataframe() instead." 
) - def _to_dataframe(self, ordered_dims: Mapping[Any, int]): + def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True): from xarray.core.extension_array import PandasExtensionArray # All and only non-index arrays (whether data or coordinates) should @@ -7229,7 +7229,15 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]): self._variables[k].set_dims(ordered_dims).values.reshape(-1) for k in non_extension_array_columns ] - index = self.coords.to_index([*ordered_dims]) + if create_index: + index = self.coords.to_index([*ordered_dims]) + else: + # Use a simple RangeIndex when create_index=False + # Row count is the product of the dim sizes (1 when there are no dims) + total_size = ( + int(np.prod(list(ordered_dims.values()))) if ordered_dims else 1 + ) + index = pd.RangeIndex(total_size) broadcasted_df = pd.DataFrame( { **dict(zip(non_extension_array_columns, data, strict=True)), @@ -7257,7 +7265,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]): broadcasted_df = broadcasted_df.join(extension_array_df) return broadcasted_df[columns_in_order] - def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame: + def to_dataframe( + self, + dim_order: Sequence[Hashable] | None = None, + create_index: bool = True, + ) -> pd.DataFrame: """Convert this dataset into a pandas.DataFrame. Non-index variables in this dataset form the columns of the @@ -7276,6 +7288,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr If provided, must include all dimensions of this dataset. By default, dimensions are in the same order as in `Dataset.sizes`. + create_index : bool, default: True + If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product + of this dataset's indices. If False, use a :py:class:`pandas.RangeIndex` instead. + This can be useful to avoid the potentially expensive MultiIndex + creation. 
Returns ------- @@ -7286,7 +7303,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr ordered_dims = self._normalize_dim_order(dim_order=dim_order) - return self._to_dataframe(ordered_dims=ordered_dims) + return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index) def _set_sparse_data_from_dataframe( self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple @@ -7444,7 +7461,10 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self: return obj[dataframe.columns] if len(dataframe.columns) else obj def to_dask_dataframe( - self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False + self, + dim_order: Sequence[Hashable] | None = None, + set_index: bool = False, + create_index: bool = True, ) -> DaskDataFrame: """ Convert this dataset into a dask.dataframe.DataFrame. @@ -7468,6 +7488,14 @@ def to_dask_dataframe( If set_index=True, the dask DataFrame is indexed by this dataset's coordinate. Since dask DataFrames do not support multi-indexes, set_index only works if the dataset only contains one dimension. + create_index : bool, default: True + If ``create_index=True`` (default), dimension coordinates will be included + as columns in the resulting DataFrame. If ``create_index=False``, dimension + coordinates will be excluded, leaving only data variables and non-dimension + coordinates. This can improve performance and reduce memory usage when dimension + information is not needed. ``create_index=False`` is incompatible with ``set_index=True``. + + .. 
versionadded:: 2025.01.1 Returns ------- @@ -7477,9 +7505,20 @@ def to_dask_dataframe( import dask.array as da import dask.dataframe as dd + if not create_index and set_index: + raise ValueError("create_index=False is incompatible with set_index=True") + ordered_dims = self._normalize_dim_order(dim_order=dim_order) - columns = list(ordered_dims) + # Build column list based on create_index + if create_index: + # Include dimension coordinates as columns + columns = list(ordered_dims) + else: + # Exclude dimension coordinates + columns = [] + + # Always include non-dimension coordinates and data variables columns.extend(k for k in self.coords if k not in self.dims) columns.extend(self.data_vars) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 2d103994410..21d6f8249aa 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -941,6 +941,96 @@ def test_to_dask_dataframe_dim_order(self): with pytest.raises(ValueError, match=r"does not match the set of dimensions"): ds.to_dask_dataframe(dim_order=["x"]) + def test_to_dask_dataframe_create_index_false(self): + # Test that create_index=False excludes dimension columns + x = np.random.randn(10) + y = np.arange(10, dtype="uint8") + t = list("abcdefghij") + + ds = Dataset( + {"a": ("t", da.from_array(x, chunks=4)), "b": ("t", y), "t": ("t", t)} + ) + + # With create_index=False, dimension columns should be excluded + actual = ds.to_dask_dataframe(create_index=False) + assert isinstance(actual, dd.DataFrame) + actual_computed = actual.compute() + + # Check that index is RangeIndex + assert isinstance(actual_computed.index, pd.RangeIndex) + + # Check that dimension columns are NOT present + assert "t" not in actual_computed.columns + + # Check that data columns are present + assert "a" in actual_computed.columns + assert "b" in actual_computed.columns + + # Verify values are correct + assert_array_equal(actual_computed["a"].values, x) + assert_array_equal(actual_computed["b"].values, y) + 
+ def test_to_dask_dataframe_create_index_incompatible_with_set_index(self): + # Test that create_index=False and set_index=True raises an error + ds = Dataset({"a": ("t", da.from_array([1, 2, 3], chunks=2))}) + + with pytest.raises( + ValueError, + match="create_index=False is incompatible with set_index=True", + ): + ds.to_dask_dataframe(create_index=False, set_index=True) + + def test_to_dask_dataframe_create_index_2D(self): + # Test create_index=False with 2D dataset + w = np.random.randn(2, 3) + ds = Dataset({"w": (("x", "y"), da.from_array(w, chunks=(1, 2)))}) + ds["x"] = ("x", np.array([0, 1], np.int64)) + ds["y"] = ("y", list("abc")) + + actual = ds.to_dask_dataframe(create_index=False) + assert isinstance(actual, dd.DataFrame) + actual_computed = actual.compute() + + # Check that index is RangeIndex + assert isinstance(actual_computed.index, pd.RangeIndex) + + # Check that dimension columns are not present + assert "x" not in actual_computed.columns + assert "y" not in actual_computed.columns + + # Check that data column is present + assert "w" in actual_computed.columns + + # Verify values are correct (flattened) + assert_array_equal(actual_computed["w"].values, w.reshape(-1)) + + def test_to_dask_dataframe_create_index_dataarray(self): + # Test create_index parameter for DataArray.to_dask_dataframe + arr_np = np.arange(3 * 4).reshape(3, 4) + arr = DataArray( + da.from_array(arr_np, chunks=(2, 2)), + [("B", [1, 2, 3]), ("A", list("cdef"))], + name="foo", + ) + + # With create_index=False, should use RangeIndex + actual = arr.to_dask_dataframe(create_index=False) + assert isinstance(actual, dd.DataFrame) + actual_computed = actual.compute() + + assert isinstance(actual_computed.index, pd.RangeIndex) + assert "B" not in actual_computed.columns + assert "A" not in actual_computed.columns + assert "foo" in actual_computed.columns + assert_array_equal(actual_computed["foo"].values, arr_np.reshape(-1)) + + # Test incompatibility with set_index=True + with 
pytest.raises( + ValueError, + match="create_index=False is incompatible with set_index=True", + ): + arr.to_dask_dataframe(create_index=False, set_index=True) + @pytest.mark.parametrize("method", ["load", "compute"]) def test_dask_kwargs_variable(method): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index e61ea9e7fe8..636acf5cdd7 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3663,6 +3663,32 @@ def test_to_dataframe_0length(self) -> None: assert len(actual) == 0 assert_array_equal(actual.index.names, list("ABC")) + def test_to_dataframe_create_index(self) -> None: + # Test create_index parameter + arr_np = np.arange(12).reshape(3, 4) + arr = DataArray(arr_np, [("x", [1, 2, 3]), ("y", list("abcd"))], name="foo") + + # Default behavior: create MultiIndex + df_with_index = arr.to_dataframe() + assert isinstance(df_with_index.index, pd.MultiIndex) + assert df_with_index.index.names == ["x", "y"] + assert len(df_with_index) == 12 + + # With create_index=False: use RangeIndex + df_without_index = arr.to_dataframe(create_index=False) + assert isinstance(df_without_index.index, pd.RangeIndex) + assert len(df_without_index) == 12 + + # Data should be the same regardless + assert_array_equal(df_with_index["foo"].values, df_without_index["foo"].values) + + # Test with coords that have different dimensions + arr.coords["z"] = ("x", [-1, -2, -3]) + df_with_coords = arr.to_dataframe(create_index=False) + assert isinstance(df_with_coords.index, pd.RangeIndex) + assert "z" in df_with_coords.columns + assert len(df_with_coords) == 12 + @pytest.mark.parametrize( "x_dtype,y_dtype,v_dtype", [ diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d25ef5a2771..45886614044 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2282,6 +2282,38 @@ def test_to_pandas(self) -> None: with pytest.raises(ValueError, match=r"cannot convert Datasets"): Dataset({"a": (["t", 
"r"], x2d), "b": (["t", "r"], y2d)}).to_pandas() + def test_to_dataframe_create_index(self) -> None: + # Test create_index parameter for Dataset + x = np.random.randn(3, 4) + y = np.random.randn(3, 4) + ds = Dataset( + {"a": (("x", "y"), x), "b": (("x", "y"), y)}, + coords={"x": [1, 2, 3], "y": list("abcd")}, + ) + + # Default behavior: create MultiIndex + df_with_index = ds.to_dataframe() + assert isinstance(df_with_index.index, pd.MultiIndex) + assert df_with_index.index.names == ["x", "y"] + assert len(df_with_index) == 12 + + # With create_index=False: use RangeIndex + df_without_index = ds.to_dataframe(create_index=False) + assert isinstance(df_without_index.index, pd.RangeIndex) + assert len(df_without_index) == 12 + + # Data should be the same regardless + assert_array_equal(df_with_index["a"].values, df_without_index["a"].values) + assert_array_equal(df_with_index["b"].values, df_without_index["b"].values) + + # Test with dim_order and create_index=False + df_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False) + assert isinstance(df_reordered.index, pd.RangeIndex) + assert len(df_reordered) == 12 + # Check that dim_order affects the data ordering + df_reordered_with_idx = ds.to_dataframe(dim_order=["y", "x"]) + assert_array_equal(df_reordered["a"].values, df_reordered_with_idx["a"].values) + def test_reindex_like(self) -> None: data = create_test_data() data["letters"] = ("dim3", 10 * ["a"])