Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ New Features
(:pull:`10849`).
By `Stephan Hoyer <https://github.com/shoyer>`_.

- Added a ``create_index`` parameter to :py:meth:`Dataset.to_dataframe`, :py:meth:`DataArray.to_dataframe`,
:py:meth:`Dataset.to_dask_dataframe`, and :py:meth:`DataArray.to_dask_dataframe`.
When ``create_index=False``, the resulting DataFrame uses a :py:class:`pandas.RangeIndex`
instead of an index built from the dimension coordinates (for the dask variants, the dimension
coordinates are also left out of the columns), which can significantly improve performance
when the default multi-index is not needed.
By `Sanjay Kumar <https://github.com/sanjay>`_.

Breaking Changes
~~~~~~~~~~~~~~~~

Expand Down
22 changes: 19 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3954,7 +3954,10 @@ def to_pandas(self) -> Self | pd.Series | pd.DataFrame:
return pandas_object

def to_dataframe(
self, name: Hashable | None = None, dim_order: Sequence[Hashable] | None = None
self,
name: Hashable | None = None,
dim_order: Sequence[Hashable] | None = None,
create_index: bool = True,
) -> pd.DataFrame:
"""Convert this array and its coordinates into a tidy pandas.DataFrame.

Expand All @@ -3979,6 +3982,11 @@ def to_dataframe(

If provided, must include all dimensions of this DataArray. By default,
dimensions are sorted according to the DataArray dimensions order.
create_index : bool, default: True
If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product
of this DataArray's indices. If False, use a :py:class:`pandas.RangeIndex` instead.
This can be useful to avoid the potentially expensive MultiIndex
creation.

Returns
-------
Expand Down Expand Up @@ -4013,7 +4021,7 @@ def to_dataframe(
else:
ordered_dims = ds._normalize_dim_order(dim_order=dim_order)

df = ds._to_dataframe(ordered_dims)
df = ds._to_dataframe(ordered_dims, create_index=create_index)
df.columns = [name if c == unique_name else c for c in df.columns]
return df

Expand Down Expand Up @@ -7579,6 +7587,7 @@ def to_dask_dataframe(
self,
dim_order: Sequence[Hashable] | None = None,
set_index: bool = False,
create_index: bool = True,
) -> DaskDataFrame:
"""Convert this array into a dask.dataframe.DataFrame.

Expand All @@ -7594,6 +7603,13 @@ def to_dask_dataframe(
If set_index=True, the dask DataFrame is indexed by this dataset's
coordinate. Since dask DataFrames do not support multi-indexes,
set_index only works if the dataset only contains one dimension.
create_index : bool, default: True
If ``create_index=False``, the dimension coordinates are not included as
columns of the resulting DataFrame, which is left with a plain
:py:class:`pandas.RangeIndex` once computed.
This can significantly improve performance when the dimension coordinates are not needed.
``create_index=False`` is incompatible with ``set_index=True``.

.. versionadded:: 2025.01.1

Returns
-------
Expand Down Expand Up @@ -7638,7 +7654,7 @@ def to_dask_dataframe(
)
name = self.name
ds = self._to_dataset_whole(name, shallow_copy=False)
return ds.to_dask_dataframe(dim_order, set_index)
return ds.to_dask_dataframe(dim_order, set_index, create_index)

# this needs to be at the end, or mypy will confuse with `str`
# https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
Expand Down
52 changes: 44 additions & 8 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7200,7 +7200,7 @@ def to_pandas(self) -> pd.Series | pd.DataFrame:
"Please use Dataset.to_dataframe() instead."
)

def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
def _to_dataframe(self, ordered_dims: Mapping[Any, int], create_index: bool = True):
from xarray.core.extension_array import PandasExtensionArray

# All and only non-index arrays (whether data or coordinates) should
Expand Down Expand Up @@ -7231,7 +7231,15 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
self._variables[k].set_dims(ordered_dims).values.reshape(-1)
for k in non_extension_array_columns
]
index = self.coords.to_index([*ordered_dims])
if create_index:
index = self.coords.to_index([*ordered_dims])
else:
# Use a simple RangeIndex when create_index=False
# Calculate the total size from ordered_dims
total_size = (
int(np.prod(list(ordered_dims.values()))) if ordered_dims else 0
)
index = pd.RangeIndex(total_size)
broadcasted_df = pd.DataFrame(
{
**dict(zip(non_extension_array_columns, data, strict=True)),
Expand Down Expand Up @@ -7259,7 +7267,11 @@ def _to_dataframe(self, ordered_dims: Mapping[Any, int]):
broadcasted_df = broadcasted_df.join(extension_array_df)
return broadcasted_df[columns_in_order]

def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFrame:
def to_dataframe(
self,
dim_order: Sequence[Hashable] | None = None,
create_index: bool = True,
) -> pd.DataFrame:
"""Convert this dataset into a pandas.DataFrame.

Non-index variables in this dataset form the columns of the
Expand All @@ -7278,6 +7290,11 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr

If provided, must include all dimensions of this dataset. By
default, dimensions are in the same order as in `Dataset.sizes`.
create_index : bool, default: True
If True (default), create a :py:class:`pandas.MultiIndex` from the Cartesian product
of this dataset's indices. If False, use a :py:class:`pandas.RangeIndex` instead.
This can be useful to avoid the potentially expensive MultiIndex
creation.

Returns
-------
Expand All @@ -7288,7 +7305,7 @@ def to_dataframe(self, dim_order: Sequence[Hashable] | None = None) -> pd.DataFr

ordered_dims = self._normalize_dim_order(dim_order=dim_order)

return self._to_dataframe(ordered_dims=ordered_dims)
return self._to_dataframe(ordered_dims=ordered_dims, create_index=create_index)

def _set_sparse_data_from_dataframe(
self, idx: pd.Index, arrays: list[tuple[Hashable, np.ndarray]], dims: tuple
Expand Down Expand Up @@ -7446,7 +7463,10 @@ def from_dataframe(cls, dataframe: pd.DataFrame, sparse: bool = False) -> Self:
return obj[dataframe.columns] if len(dataframe.columns) else obj

def to_dask_dataframe(
self, dim_order: Sequence[Hashable] | None = None, set_index: bool = False
self,
dim_order: Sequence[Hashable] | None = None,
set_index: bool = False,
create_index: bool = True,
) -> DaskDataFrame:
"""
Convert this dataset into a dask.dataframe.DataFrame.
Expand All @@ -7470,6 +7490,13 @@ def to_dask_dataframe(
If set_index=True, the dask DataFrame is indexed by this dataset's
coordinate. Since dask DataFrames do not support multi-indexes,
set_index only works if the dataset only contains one dimension.
create_index : bool, default: True
If ``create_index=False``, the dimension coordinates are not included as
columns of the resulting DataFrame, which is left with a plain
:py:class:`pandas.RangeIndex` once computed.
This can significantly improve performance when the dimension coordinates are not needed.
``create_index=False`` is incompatible with ``set_index=True``.

.. versionadded:: 2025.01.1

Returns
-------
Expand All @@ -7479,11 +7506,20 @@ def to_dask_dataframe(
import dask.array as da
import dask.dataframe as dd

if not create_index and set_index:
raise ValueError("create_index=False is incompatible with set_index=True")

ordered_dims = self._normalize_dim_order(dim_order=dim_order)

columns = list(ordered_dims)
columns.extend(k for k in self.coords if k not in self.dims)
columns.extend(self.data_vars)
if create_index:
columns = list(ordered_dims)
columns.extend(k for k in self.coords if k not in self.dims)
columns.extend(self.data_vars)
else:
# When create_index=False, exclude dimensions from columns
columns = []
columns.extend(k for k in self.coords if k not in self.dims)
columns.extend(self.data_vars)

ds_chunks = self.chunks

Expand Down
90 changes: 90 additions & 0 deletions xarray/tests/test_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,96 @@ def test_to_dask_dataframe_dim_order(self):
with pytest.raises(ValueError, match=r"does not match the set of dimensions"):
ds.to_dask_dataframe(dim_order=["x"])

def test_to_dask_dataframe_create_index_false(self):
    """``create_index=False`` yields a RangeIndex and no dimension column."""
    values_a = np.random.randn(10)
    values_b = np.arange(10, dtype="uint8")
    labels = list("abcdefghij")

    ds = Dataset(
        {
            "a": ("t", da.from_array(values_a, chunks=4)),
            "b": ("t", values_b),
            "t": ("t", labels),
        }
    )

    lazy_frame = ds.to_dask_dataframe(create_index=False)
    assert isinstance(lazy_frame, dd.DataFrame)
    frame = lazy_frame.compute()

    # The computed frame is positionally indexed.
    assert isinstance(frame.index, pd.RangeIndex)

    # The dimension coordinate must not show up as a column.
    assert "t" not in frame.columns

    # Both data variables are exported as columns.
    for column in ("a", "b"):
        assert column in frame.columns

    # The exported values match the inputs element for element.
    for column, expected in (("a", values_a), ("b", values_b)):
        assert_array_equal(frame[column].values, expected)

def test_to_dask_dataframe_create_index_incompatible_with_set_index(self):
    """Requesting ``create_index=False`` together with ``set_index=True`` raises."""
    chunked = da.from_array([1, 2, 3], chunks=2)
    ds = Dataset({"a": ("t", chunked)})
    expected_message = "create_index=False is incompatible with set_index=True"

    with pytest.raises(ValueError, match=expected_message):
        ds.to_dask_dataframe(create_index=False, set_index=True)

def test_to_dask_dataframe_create_index_2D(self):
    """``create_index=False`` on a two-dimensional dataset."""
    grid = np.random.randn(2, 3)
    chunked_grid = da.from_array(grid, chunks=(1, 2))
    ds = Dataset({"w": (("x", "y"), chunked_grid)})
    ds["x"] = ("x", np.array([0, 1], np.int64))
    ds["y"] = ("y", list("abc"))

    lazy_frame = ds.to_dask_dataframe(create_index=False)
    assert isinstance(lazy_frame, dd.DataFrame)
    frame = lazy_frame.compute()

    # Positional index only.
    assert isinstance(frame.index, pd.RangeIndex)

    # Neither dimension coordinate is emitted as a column.
    for dim in ("x", "y"):
        assert dim not in frame.columns

    # The data variable is emitted.
    assert "w" in frame.columns

    # Column values equal the flattened input array.
    assert_array_equal(frame["w"].values, grid.reshape(-1))

def test_to_dask_dataframe_create_index_dataarray(self):
    """Exercise ``create_index`` through ``DataArray.to_dask_dataframe``."""
    raw = np.arange(3 * 4).reshape(3, 4)
    coords = [("B", [1, 2, 3]), ("A", list("cdef"))]
    arr = DataArray(da.from_array(raw, chunks=(2, 2)), coords, name="foo")

    # create_index=False: positional index, no dimension columns.
    lazy_frame = arr.to_dask_dataframe(create_index=False)
    assert isinstance(lazy_frame, dd.DataFrame)
    frame = lazy_frame.compute()

    assert isinstance(frame.index, pd.RangeIndex)
    for dim in ("B", "A"):
        assert dim not in frame.columns
    assert "foo" in frame.columns
    assert_array_equal(frame["foo"].values, raw.reshape(-1))

    # The combination with set_index=True is rejected.
    expected_message = "create_index=False is incompatible with set_index=True"
    with pytest.raises(ValueError, match=expected_message):
        arr.to_dask_dataframe(create_index=False, set_index=True)


@pytest.mark.parametrize("method", ["load", "compute"])
def test_dask_kwargs_variable(method):
Expand Down
26 changes: 26 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3655,6 +3655,32 @@ def test_to_dataframe_0length(self) -> None:
assert len(actual) == 0
assert_array_equal(actual.index.names, list("ABC"))

def test_to_dataframe_create_index(self) -> None:
    """Compare ``DataArray.to_dataframe`` with and without ``create_index``."""
    raw = np.arange(12).reshape(3, 4)
    coords = [("x", [1, 2, 3]), ("y", list("abcd"))]
    arr = DataArray(raw, coords, name="foo")

    # Default: a MultiIndex named after the dimensions.
    indexed = arr.to_dataframe()
    assert isinstance(indexed.index, pd.MultiIndex)
    assert indexed.index.names == ["x", "y"]
    assert len(indexed) == 12

    # create_index=False: a RangeIndex of the same length.
    flat = arr.to_dataframe(create_index=False)
    assert isinstance(flat.index, pd.RangeIndex)
    assert len(flat) == 12

    # The data column is identical in both frames.
    assert_array_equal(indexed["foo"].values, flat["foo"].values)

    # A non-index coordinate defined along only one dimension is still a column.
    arr.coords["z"] = ("x", [-1, -2, -3])
    flat_with_coord = arr.to_dataframe(create_index=False)
    assert isinstance(flat_with_coord.index, pd.RangeIndex)
    assert "z" in flat_with_coord.columns
    assert len(flat_with_coord) == 12

@pytest.mark.parametrize(
"x_dtype,y_dtype,v_dtype",
[
Expand Down
32 changes: 32 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2282,6 +2282,38 @@ def test_to_pandas(self) -> None:
with pytest.raises(ValueError, match=r"cannot convert Datasets"):
Dataset({"a": (["t", "r"], x2d), "b": (["t", "r"], y2d)}).to_pandas()

def test_to_dataframe_create_index(self) -> None:
    """Compare ``Dataset.to_dataframe`` with and without ``create_index``."""
    values_a = np.random.randn(3, 4)
    values_b = np.random.randn(3, 4)
    dims = ("x", "y")
    ds = Dataset(
        {"a": (dims, values_a), "b": (dims, values_b)},
        coords={"x": [1, 2, 3], "y": list("abcd")},
    )

    # Default: a MultiIndex named after the dimensions.
    indexed = ds.to_dataframe()
    assert isinstance(indexed.index, pd.MultiIndex)
    assert indexed.index.names == ["x", "y"]
    assert len(indexed) == 12

    # create_index=False: a RangeIndex of the same length.
    flat = ds.to_dataframe(create_index=False)
    assert isinstance(flat.index, pd.RangeIndex)
    assert len(flat) == 12

    # Column contents do not depend on the index choice.
    for column in ("a", "b"):
        assert_array_equal(indexed[column].values, flat[column].values)

    # dim_order combined with create_index=False.
    flat_reordered = ds.to_dataframe(dim_order=["y", "x"], create_index=False)
    assert isinstance(flat_reordered.index, pd.RangeIndex)
    assert len(flat_reordered) == 12
    # Row order must follow dim_order exactly as in the indexed variant.
    indexed_reordered = ds.to_dataframe(dim_order=["y", "x"])
    assert_array_equal(flat_reordered["a"].values, indexed_reordered["a"].values)

def test_reindex_like(self) -> None:
data = create_test_data()
data["letters"] = ("dim3", 10 * ["a"])
Expand Down
Loading