From 5dceb642fcfdc2a943ecdc3a0c60bdc1ca998350 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Tue, 16 Dec 2025 04:22:21 +0000 Subject: [PATCH 01/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 6e4e3391..5b03faf8 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 6e4e3391db59f79bff2138846d3eab391ea32c20 +Subproject commit 5b03faf8a62ecb64a777753df7e83d5d75c3c9b2 From 7aa8caece75a6e6fa51ade765f9e375cc33a8e8a Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Wed, 17 Dec 2025 07:19:57 +0000 Subject: [PATCH 02/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 5b03faf8..91d691a9 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 5b03faf8a62ecb64a777753df7e83d5d75c3c9b2 +Subproject commit 91d691a90831e79381c2b5c457cc6dd70a0d5ea2 From 89b12d40467b5962324a522324b95fc49cc8f9ea Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Thu, 18 Dec 2025 06:11:31 +0000 Subject: [PATCH 03/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 91d691a9..7f217607 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 91d691a90831e79381c2b5c457cc6dd70a0d5ea2 +Subproject commit 7f217607e498301fd6aac0a623e893419fb14ae5 From d1743d82a69824f02bfc39e3ec96f75cc74af34a Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Fri, 19 Dec 2025 05:49:40 +0000 Subject: [PATCH 04/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 7f217607..1c87e80f 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 7f217607e498301fd6aac0a623e893419fb14ae5 +Subproject commit 1c87e80fdc53ba213ab19d7e21bfff7087d8306c From 8aa268a55a21b03ceafe0b41256bfd943cde2474 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Sun, 21 Dec 2025 05:51:05 +0000 Subject: [PATCH 05/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 1c87e80f..5a334c23 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 1c87e80fdc53ba213ab19d7e21bfff7087d8306c +Subproject commit 5a334c23dadba0f24322ce609dc0f15bc52c41a8 From 1567cc34afa172e229968bf5264d23fcf41d5257 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Tue, 23 Dec 2025 06:24:59 +0000 Subject: [PATCH 06/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 5a334c23..4713d054 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 5a334c23dadba0f24322ce609dc0f15bc52c41a8 +Subproject commit 4713d0549e055683305a3083791b3b53061c0b01 From 6866a36def9c204302d60d1d09e921c70634ed01 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Thu, 25 Dec 2025 05:47:56 +0000 Subject: [PATCH 07/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 4713d054..0761d5cc 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 4713d0549e055683305a3083791b3b53061c0b01 +Subproject commit 0761d5cc7a033d041ad5514a18642a60cb0cb392 From 
295107328c8e5858dede1a4c665c58ed28407b95 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Thu, 1 Jan 2026 06:26:21 +0000 Subject: [PATCH 08/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 0761d5cc..de5283b5 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 0761d5cc7a033d041ad5514a18642a60cb0cb392 +Subproject commit de5283b5b985cc239904f5404925521bf09aba1d From 20d43287453f0229c63b6c6a720c2958acb8bf3f Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Mon, 5 Jan 2026 06:03:39 +0000 Subject: [PATCH 09/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index de5283b5..a5c128a8 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit de5283b5b985cc239904f5404925521bf09aba1d +Subproject commit a5c128a833f0842f276751130c5743be43f3a345 From 29e1a0690e62963aea71d0f138fd9af2dd357e62 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Tue, 6 Jan 2026 08:01:38 +0000 Subject: [PATCH 10/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index a5c128a8..686f25a3 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit a5c128a833f0842f276751130c5743be43f3a345 +Subproject commit 686f25a362e562bb6acc51c59f91a951cdd2668b From dc1aa44eed12059d50b42f8c6e33a062b9b49b53 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Wed, 7 Jan 2026 06:45:55 +0000 Subject: [PATCH 11/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 686f25a3..acc36fbe 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 686f25a362e562bb6acc51c59f91a951cdd2668b +Subproject commit acc36fbe6e1417df3968c00ec35aaf01821467e2 From cad0cd26a0811d3583ad3dbeba7c5b64de5eb861 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Thu, 8 Jan 2026 07:16:56 +0000 Subject: [PATCH 12/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index acc36fbe..908d3eb2 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit acc36fbe6e1417df3968c00ec35aaf01821467e2 +Subproject commit 908d3eb2815c8d96a5a6e7d8f8b7aafcb52a76ad From 8540a8967e2ddf80f37594f8f1b3a010a60b0fc5 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Sun, 11 Jan 2026 08:31:44 +0000 Subject: [PATCH 13/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 908d3eb2..a56ccd80 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 908d3eb2815c8d96a5a6e7d8f8b7aafcb52a76ad +Subproject commit a56ccd8040339c1fcb688122dbe494a482354a01 From c2e7da20c05f4445b7b65ce9829927751706bd9b Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Tue, 13 Jan 2026 08:08:27 +0000 Subject: [PATCH 14/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index a56ccd80..2e7b9f0e 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit a56ccd8040339c1fcb688122dbe494a482354a01 +Subproject commit 2e7b9f0eb8748f257a125f032da1417ab81425d1 From 
75fdbd3077c3a461fbc4fbd0ca11fe6118ca69c0 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Wed, 14 Jan 2026 13:02:22 +0100 Subject: [PATCH 15/37] escape identifiers in relation aggregations --- src/duckdb_py/pyrelation.cpp | 32 +++- .../relational_api/test_rapi_aggregations.py | 143 ++++++++++++++++++ 2 files changed, 172 insertions(+), 3 deletions(-) diff --git a/src/duckdb_py/pyrelation.cpp b/src/duckdb_py/pyrelation.cpp index 58cfcc29..2e748f8f 100644 --- a/src/duckdb_py/pyrelation.cpp +++ b/src/duckdb_py/pyrelation.cpp @@ -395,10 +395,36 @@ string DuckDBPyRelation::GenerateExpressionList(const string &function_name, vec function_name + "(" + function_parameter + ((ignore_nulls) ? " ignore nulls) " : ") ") + window_spec; } for (idx_t i = 0; i < input.size(); i++) { + // We parse the input as an expression to validate it. + auto trimmed_input = input[i]; + StringUtil::Trim(trimmed_input); + + unique_ptr expression; + try { + auto expressions = Parser::ParseExpressionList(trimmed_input); + if (expressions.size() == 1) { + expression = std::move(expressions[0]); + } + } catch (const ParserException &) { + // First attempt at parsing failed, the input might be a column name that needs quoting. + auto quoted_input = KeywordHelper::WriteQuoted(trimmed_input, '"'); + auto expressions = Parser::ParseExpressionList(quoted_input); + if (expressions.size() == 1 && expressions[0]->GetExpressionClass() == ExpressionClass::COLUMN_REF) { + expression = std::move(expressions[0]); + } + } + + if (!expression) { + throw ParserException("Invalid column expression: %s", trimmed_input); + } + + // ToString() handles escaping for all expression types + auto escaped_input = expression->ToString(); + if (function_parameter.empty()) { - expr += function_name + "(" + input[i] + ((ignore_nulls) ? " ignore nulls) " : ") ") + window_spec; + expr += function_name + "(" + escaped_input + ((ignore_nulls) ? " ignore nulls) " : ") ") + window_spec; } else { - expr += function_name + "(" + input[i] + "," + function_parameter + + expr += function_name + "(" + escaped_input + "," + function_parameter + ((ignore_nulls) ?
" ignore nulls) " : ") ") + window_spec; } @@ -587,7 +613,7 @@ unique_ptr DuckDBPyRelation::Product(const std::string &column unique_ptr DuckDBPyRelation::StringAgg(const std::string &column, const std::string &sep, const std::string &groups, const std::string &window_spec, const std::string &projected_columns) { - auto string_agg_params = "\'" + sep + "\'"; + auto string_agg_params = KeywordHelper::WriteOptionallyQuoted(sep, '\''); return ApplyAggOrWin("string_agg", column, string_agg_params, groups, window_spec, projected_columns); } diff --git a/tests/fast/relational_api/test_rapi_aggregations.py b/tests/fast/relational_api/test_rapi_aggregations.py index ffb7e303..409972fc 100644 --- a/tests/fast/relational_api/test_rapi_aggregations.py +++ b/tests/fast/relational_api/test_rapi_aggregations.py @@ -416,3 +416,146 @@ def test_var_samp(self, table, f): def test_describe(self, table): assert table.describe().fetchall() is not None + + +class TestRAPIAggregationsColumnEscaping: + """Test that aggregate functions properly escape column names that need quoting.""" + + def test_reserved_keyword_column_name(self, duckdb_cursor): + # Column name "select" is a reserved SQL keyword + rel = duckdb_cursor.sql('select 1 as "select", 2 as "order"') + result = rel.sum("select").fetchall() + assert result == [(1,)] + + result = rel.avg("order").fetchall() + assert result == [(2.0,)] + + def test_column_name_with_space(self, duckdb_cursor): + rel = duckdb_cursor.sql('select 10 as "my column"') + result = rel.sum("my column").fetchall() + assert result == [(10,)] + + def test_column_name_with_quotes(self, duckdb_cursor): + # Column name containing a double quote + rel = duckdb_cursor.sql('select 5 as "col""name"') + result = rel.sum('col"name').fetchall() + assert result == [(5,)] + + def test_qualified_column_name(self, duckdb_cursor): + # Qualified column name like table.column + rel = duckdb_cursor.sql("select 42 as value") + # When using qualified names, they should be properly escaped + result = rel.sum("value").fetchall() + assert result == [(42,)] + + +class TestRAPIAggregationsExpressionPassthrough: + """Test that aggregate functions correctly pass through SQL expressions without escaping.""" + + def test_cast_expression(self, duckdb_cursor): + # Cast expressions should pass through without being quoted + rel = duckdb_cursor.sql("select 1 as v, 0 as f") + result = rel.bool_and("v::BOOL").fetchall() + assert result == [(True,)] + + result = rel.bool_or("f::BOOL").fetchall() + assert result == [(False,)] + + def test_star_expression(self, duckdb_cursor): + # Star (*) should pass through for count + rel = duckdb_cursor.sql("select 1 as a union all select 2") + result = rel.count("*").fetchall() + assert result == [(2,)] + + def test_arithmetic_expression(self, duckdb_cursor): + # Arithmetic expressions should pass through + rel = duckdb_cursor.sql("select 10 as a, 5 as b") + result = rel.sum("a + b").fetchall() + assert result == [(15,)] + + def test_function_expression(self, duckdb_cursor): + # Function calls should pass through + rel = duckdb_cursor.sql("select -5 as v") + result = rel.sum("abs(v)").fetchall() + assert result == [(5,)] + + def test_case_expression(self, duckdb_cursor): + # CASE expressions should pass through + rel = duckdb_cursor.sql("select 1 as v union all select 2 union all select 3") + result = rel.sum("case when v > 1 then v else 0 end").fetchall() + assert result == [(5,)] + + +class TestRAPIAggregationsWithInvalidInput: + """Test that only expression can be used.""" + + def 
test_injection_with_semicolon_is_neutralized(self, duckdb_cursor): + # Semicolon injection fails to parse as expression, gets quoted as identifier + rel = duckdb_cursor.sql("select 1 as v") + with pytest.raises(duckdb.BinderException, match="not found in FROM clause"): + rel.sum("v; drop table agg; --").fetchall() + + def test_injection_with_union_is_neutralized(self, duckdb_cursor): + # UNION fails to parse as single expression, gets quoted + rel = duckdb_cursor.sql("select 1 as v") + with pytest.raises(duckdb.BinderException, match="not found in FROM clause"): + rel.sum("v union select * from agg").fetchall() + + def test_subquery_is_contained(self, duckdb_cursor): + # Subqueries are valid expressions - they're contained within the aggregate + # and cannot break out of the expression context + rel = duckdb_cursor.sql("select 1 as v") + # This executes sum((select 1)) = sum(1) = 1 - contained, not an injection + result = rel.sum("(select 1)").fetchall() + assert result == [(1,)] + + def test_injection_closing_paren_is_neutralized(self, duckdb_cursor): + # Adding a closing paren fails to parse, gets quoted + rel = duckdb_cursor.sql("select 1 as v") + with pytest.raises(duckdb.BinderException, match="not found in FROM clause"): + rel.sum("v) from agg; drop table agg; --").fetchall() + + def test_comment_is_harmless(self, duckdb_cursor): + # SQL comments are stripped during parsing, so "v -- comment" parses as just "v" + rel = duckdb_cursor.sql("select 1 as v") + result = rel.sum("v -- this is ignored").fetchall() + assert result == [(1,)] + + def test_empty_expression_rejected(self, duckdb_cursor): + # Empty or whitespace-only expressions should be rejected + rel = duckdb_cursor.sql("select 1 as v") + with pytest.raises(duckdb.ParserException): + rel.sum("").fetchall() + + def test_whitespace_only_expression_rejected(self, duckdb_cursor): + # Whitespace-only expressions should be rejected + rel = duckdb_cursor.sql("select 1 as v") + with pytest.raises(duckdb.ParserException): + rel.sum(" ").fetchall() + + +class TestRAPIStringAggSeparatorEscaping: + """Test that string_agg separator is properly escaped as a string literal.""" + + def test_simple_separator(self, duckdb_cursor): + rel = duckdb_cursor.sql("select 'a' as s union all select 'b' union all select 'c'") + result = rel.string_agg("s", ",").fetchall() + assert result == [("a,b,c",)] + + def test_separator_with_single_quote(self, duckdb_cursor): + # Separator containing a single quote should be properly escaped + rel = duckdb_cursor.sql("select 'a' as s union all select 'b'") + result = rel.string_agg("s", "','").fetchall() + assert result == [("a','b",)] + + def test_separator_with_special_chars(self, duckdb_cursor): + rel = duckdb_cursor.sql("select 'x' as s union all select 'y'") + result = rel.string_agg("s", " | ").fetchall() + assert result == [("x | y",)] + + def test_separator_injection_attempt(self, duckdb_cursor): + # Attempt to inject via separator - should be safely quoted as string literal + rel = duckdb_cursor.sql("select 'a' as s union all select 'b'") + # This should NOT execute the injection - separator becomes a literal string + result = rel.string_agg("s", "'); drop table agg; --").fetchall() + assert result == [("a'); drop table agg; --b",)] From eeac44ac6dde2f6ec205545c6a71cb376e41c4f4 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Fri, 16 Jan 2026 05:58:50 +0000 Subject: [PATCH 16/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb 
b/external/duckdb index 2e7b9f0e..431ad092 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 2e7b9f0eb8748f257a125f032da1417ab81425d1 +Subproject commit 431ad092c9d666c81b3739438ab19d72fc622362 From 39c384bfc0adf6d44d99634e4e09a8b7af2e761c Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Wed, 21 Jan 2026 15:19:52 +0100 Subject: [PATCH 17/37] Fix DECREF bug during interpreter shutdown --- src/duckdb_py/common/exceptions.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/duckdb_py/common/exceptions.cpp b/src/duckdb_py/common/exceptions.cpp index 51de2bdf..5bf744f1 100644 --- a/src/duckdb_py/common/exceptions.cpp +++ b/src/duckdb_py/common/exceptions.cpp @@ -350,13 +350,19 @@ void RegisterExceptions(const py::module &m) { auto io_exception = py::register_exception(m, "IOException", operational_error).ptr(); py::register_exception(m, "SerializationException", operational_error); - static py::exception HTTP_EXCEPTION(m, "HTTPException", io_exception); - const auto string_type = py::type::of(py::str()); - const auto Dict = py::module_::import("typing").attr("Dict"); - HTTP_EXCEPTION.attr("__annotations__") = - py::dict(py::arg("status_code") = py::type::of(py::int_()), py::arg("body") = string_type, - py::arg("reason") = string_type, py::arg("headers") = Dict[py::make_tuple(string_type, string_type)]); - HTTP_EXCEPTION.doc() = "Thrown when an error occurs in the httpfs extension, or whilst downloading an extension."; + // Use a raw pointer to avoid destructor running after Python finalization. + // The module holds a reference to the exception type, keeping it alive. + static PyObject *HTTP_EXCEPTION = nullptr; + { + auto http_exc = py::register_exception(m, "HTTPException", io_exception); + HTTP_EXCEPTION = http_exc.ptr(); + const auto string_type = py::type::of(py::str()); + const auto Dict = py::module_::import("typing").attr("Dict"); + http_exc.attr("__annotations__") = py::dict( + py::arg("status_code") = py::type::of(py::int_()), py::arg("body") = string_type, + py::arg("reason") = string_type, py::arg("headers") = Dict[py::make_tuple(string_type, string_type)]); + http_exc.doc() = "Thrown when an error occurs in the httpfs extension, or whilst downloading an extension."; + } // IntegrityError auto integrity_error = py::register_exception(m, "IntegrityError", db_error).ptr(); @@ -388,7 +394,7 @@ void RegisterExceptions(const py::module &m) { } catch (const duckdb::Exception &ex) { duckdb::ErrorData error(ex); UnsetPythonException(); - PyThrowException(error, HTTP_EXCEPTION.ptr()); + PyThrowException(error, HTTP_EXCEPTION); } catch (const py::builtin_exception &ex) { // These represent Python exceptions, we don't want to catch these throw; @@ -399,7 +405,7 @@ void RegisterExceptions(const py::module &m) { throw; } UnsetPythonException(); - PyThrowException(error, HTTP_EXCEPTION.ptr()); + PyThrowException(error, HTTP_EXCEPTION); } }); } From c2e65055824d2369e54412638a1478444ca7b6b5 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Thu, 22 Jan 2026 09:39:44 +0100 Subject: [PATCH 18/37] Support Pandas' new str type --- src/duckdb_py/numpy/type.cpp | 3 +++ tests/fast/pandas/test_new_string_type.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 tests/fast/pandas/test_new_string_type.py diff --git a/src/duckdb_py/numpy/type.cpp b/src/duckdb_py/numpy/type.cpp index 92ac4785..3642cbd4 100644 --- a/src/duckdb_py/numpy/type.cpp +++ b/src/duckdb_py/numpy/type.cpp @@ -58,6 +58,9 @@ 
static NumpyNullableType ConvertNumpyTypeInternal(const string &col_type_str) { if (col_type_str == "string") { return NumpyNullableType::STRING; } + if (col_type_str == "str") { + return NumpyNullableType::STRING; + } if (col_type_str == "object") { return NumpyNullableType::OBJECT; } diff --git a/tests/fast/pandas/test_new_string_type.py b/tests/fast/pandas/test_new_string_type.py new file mode 100644 index 00000000..bd13d53a --- /dev/null +++ b/tests/fast/pandas/test_new_string_type.py @@ -0,0 +1,20 @@ +import pandas as pd +import pytest +from packaging.version import Version + +import duckdb + + +@pytest.mark.skipif( + Version(pd.__version__) < Version("3.0"), reason="Pandas < 3.0 doesn't have the new string type yet" +) +def test_new_str_type_pandas_3_0(): + df = pd.DataFrame({"s": ["DuckDB"]}) # noqa: F841 + duckdb.sql("select * from df") + + +@pytest.mark.skipif(Version(pd.__version__) >= Version("3.0"), reason="Pandas >= 3.0 has the new string type") +def test_new_str_type_pandas_lt_3_0(): + pd.options.future.infer_string = True + df = pd.DataFrame({"s": ["DuckDB"]}) # noqa: F841 + duckdb.sql("select * from df") From 9d6edb6ead36cae62fa65e223b374b5edfaa7731 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Fri, 23 Jan 2026 11:31:26 +0100 Subject: [PATCH 19/37] dependency updates and fix for lazy attribute accessors --- pyproject.toml | 9 ++++++--- src/duckdb_py/pyresult.cpp | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3bd54543..5898e177 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -234,7 +234,8 @@ stubdeps = [ # dependencies used for typehints in the stubs "typing-extensions", ] test = [ # dependencies used for running tests - "adbc-driver-manager; sys_platform != 'win32' or platform_machine != 'ARM64'", + "adbc-driver-manager>=1.10.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", + "adbc-driver-manager>=1.7.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", "pytest", "pytest-reraise", "pytest-timeout", @@ -252,8 +253,10 @@ test = [ # dependencies used for running tests "requests", "urllib3", "fsspec>=2022.11.0; sys_platform != 'win32' or platform_machine != 'ARM64'", - "pandas>=2.0.0", - "pyarrow>=18.0.0; sys_platform != 'win32' or platform_machine != 'ARM64'", + "pandas>=3.0.0; python_version > '3.10'", + "pandas<3.0.0; python_version < '3.11'", + "pyarrow>=23.0.0; python_version >= '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", + "pyarrow>=18.0.0; python_version < '3.10' and (sys_platform != 'win32' or platform_machine != 'ARM64')", "torch>=2.2.2; python_version < '3.14' and ( sys_platform != 'darwin' or platform_machine != 'x86_64' or python_version < '3.13' ) and ( sys_platform != 'win32' or platform_machine != 'ARM64' or python_version > '3.11' )", "tensorflow==2.14.0; sys_platform == 'darwin' and python_version < '3.12'", "tensorflow-cpu>=2.14.0; sys_platform == 'linux' and platform_machine != 'aarch64' and python_version < '3.12'", diff --git a/src/duckdb_py/pyresult.cpp b/src/duckdb_py/pyresult.cpp index e92f6abe..cc6224c2 100644 --- a/src/duckdb_py/pyresult.cpp +++ b/src/duckdb_py/pyresult.cpp @@ -304,7 +304,7 @@ void DuckDBPyResult::ConvertDateTimeTypes(PandasDataFrame &df, bool date_as_obje // We need to create the column anew because the exact dt changed to a new timezone ReplaceDFColumn(df, names[i].c_str(), i, new_value); } else if (date_as_object && result->types[i] == LogicalType::DATE) { - 
auto new_value = df[names[i].c_str()].attr("dt").attr("date"); + py::object new_value = df[names[i].c_str()].attr("dt").attr("date"); ReplaceDFColumn(df, names[i].c_str(), i, new_value); } } From 5a654d338003a83ae03d8c954fb40ace7cbc6d4e Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Fri, 23 Jan 2026 14:04:02 +0100 Subject: [PATCH 20/37] pandas default backend fixes --- tests/conftest.py | 100 +---- .../test_pandas_categorical_coverage.py | 26 +- tests/extensions/test_httpfs.py | 11 +- tests/fast/api/test_3654.py | 13 +- tests/fast/api/test_config.py | 23 +- tests/fast/api/test_dbapi00.py | 21 +- tests/fast/api/test_dbapi08.py | 12 +- tests/fast/api/test_duckdb_connection.py | 17 +- tests/fast/api/test_duckdb_query.py | 7 +- tests/fast/api/test_native_tz.py | 20 +- tests/fast/api/test_to_csv.py | 110 +++--- tests/fast/api/test_to_parquet.py | 38 +- tests/fast/arrow/test_6796.py | 7 +- tests/fast/pandas/test_2304.py | 32 +- tests/fast/pandas/test_append_df.py | 26 +- tests/fast/pandas/test_bug5922.py | 8 +- tests/fast/pandas/test_copy_on_write.py | 24 +- .../pandas/test_create_table_from_pandas.py | 18 +- tests/fast/pandas/test_datetime_time.py | 23 +- tests/fast/pandas/test_datetime_timestamp.py | 52 ++- tests/fast/pandas/test_df_analyze.py | 46 +-- .../fast/pandas/test_df_object_resolution.py | 360 ++++++++---------- tests/fast/pandas/test_df_recursive_nested.py | 33 +- .../fast/pandas/test_implicit_pandas_scan.py | 32 +- tests/fast/pandas/test_import_cache.py | 23 +- tests/fast/pandas/test_issue_1767.py | 14 +- tests/fast/pandas/test_limit.py | 13 +- tests/fast/pandas/test_pandas_na.py | 21 +- tests/fast/pandas/test_pandas_unregister.py | 12 +- .../fast/pandas/test_parallel_pandas_scan.py | 85 ++--- tests/fast/spark/test_spark_to_csv.py | 49 +-- tests/fast/test_case_alias.py | 8 +- tests/fast/test_insert.py | 10 +- tests/fast/test_map.py | 60 ++- tests/fast/test_multithread.py | 196 ++++------ tests/fast/test_parameter_list.py | 7 +- tests/fast/test_relation.py | 6 +- tests/fast/test_relation_dependency_leak.py | 43 +-- tests/fast/test_runtime_error.py | 27 +- 39 files changed, 671 insertions(+), 962 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bfb458a5..ed7c359a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,7 @@ import warnings from importlib import import_module from pathlib import Path -from typing import Any, Union +from typing import Union import pytest @@ -19,13 +19,27 @@ pandas = None pyarrow_dtype = None -# Check if pandas has arrow dtypes enabled -try: - from pandas.compat import pa_version_under7p0 - pyarrow_dtypes_enabled = not pa_version_under7p0 -except ImportError: - pyarrow_dtypes_enabled = False +# Version-aware helpers for Pandas 2.x vs 3.0 compatibility +def _get_pandas_ge_3(): + if pandas is None: + return False + from packaging.version import Version + + return Version(pandas.__version__) >= Version("3.0.0") + + +PANDAS_GE_3 = _get_pandas_ge_3() + + +def is_string_dtype(dtype): + """Check if a dtype is a string dtype (works across Pandas 2.x and 3.0). 
+ + Uses pd.api.types.is_string_dtype() which handles: + - Pandas 2.x: object dtype for strings + - Pandas 3.0+: str (StringDtype) for strings + """ + return pandas.api.types.is_string_dtype(dtype) def import_pandas(): @@ -113,78 +127,6 @@ def pandas_supports_arrow_backend(): return pandas_2_or_higher() -def numpy_pandas_df(*args, **kwargs): - return import_pandas().DataFrame(*args, **kwargs) - - -def arrow_pandas_df(*args, **kwargs): - df = numpy_pandas_df(*args, **kwargs) - return df.convert_dtypes(dtype_backend="pyarrow") - - -class NumpyPandas: - def __init__(self) -> None: - self.backend = "numpy_nullable" - self.DataFrame = numpy_pandas_df - self.pandas = import_pandas() - - def __getattr__(self, name: str) -> Any: # noqa: ANN401 - return getattr(self.pandas, name) - - -def convert_arrow_to_numpy_backend(df): - names = df.columns - df_content = {} - for name in names: - df_content[name] = df[name].array.__arrow_array__() - # This should convert the pyarrow chunked arrays into numpy arrays - return import_pandas().DataFrame(df_content) - - -def convert_to_numpy(df): - if ( - pyarrow_dtypes_enabled - and pyarrow_dtype is not None - and any(True for x in df.dtypes if isinstance(x, pyarrow_dtype)) - ): - return convert_arrow_to_numpy_backend(df) - return df - - -def convert_and_equal(df1, df2, **kwargs): - df1 = convert_to_numpy(df1) - df2 = convert_to_numpy(df2) - import_pandas().testing.assert_frame_equal(df1, df2, **kwargs) - - -class ArrowMockTesting: - def __init__(self) -> None: - self.testing = import_pandas().testing - self.assert_frame_equal = convert_and_equal - - def __getattr__(self, name: str) -> Any: # noqa: ANN401 - return getattr(self.testing, name) - - -# This converts dataframes constructed with 'DataFrame(...)' to pyarrow backed dataframes -# Assert equal does the opposite, turning all pyarrow backed dataframes into numpy backed ones -# this is done because we don't produce pyarrow backed dataframes yet -class ArrowPandas: - def __init__(self) -> None: - self.pandas = import_pandas() - if pandas_2_or_higher() and pyarrow_dtypes_enabled: - self.backend = "pyarrow" - self.DataFrame = arrow_pandas_df - else: - # For backwards compatible reasons, just mock regular pandas - self.backend = "numpy_nullable" - self.DataFrame = self.pandas.DataFrame - self.testing = ArrowMockTesting() - - def __getattr__(self, name: str) -> Any: # noqa: ANN401 - return getattr(self.pandas, name) - - @pytest.fixture def require(): def _require(extension_name, db_name="") -> Union[duckdb.DuckDBPyConnection, None]: diff --git a/tests/coverage/test_pandas_categorical_coverage.py b/tests/coverage/test_pandas_categorical_coverage.py index 7b0645e0..6155138a 100644 --- a/tests/coverage/test_pandas_categorical_coverage.py +++ b/tests/coverage/test_pandas_categorical_coverage.py @@ -1,5 +1,4 @@ -import pytest -from conftest import NumpyPandas +import pandas as pd import duckdb @@ -9,23 +8,23 @@ def check_result_list(res): assert res_item[0] == res_item[1] -def check_create_table(category, pandas): +def check_create_table(category): conn = duckdb.connect() conn.execute("PRAGMA enable_verification") - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { - "x": pandas.Categorical(category, ordered=True), - "y": pandas.Categorical(category, ordered=True), + "x": pd.Categorical(category, ordered=True), + "y": pd.Categorical(category, ordered=True), "z": category, } ) category.append("bla") - df_in_diff = pandas.DataFrame( # noqa: F841 + df_in_diff = pd.DataFrame( # noqa: F841 { - "k": 
pandas.Categorical(category, ordered=True), + "k": pd.Categorical(category, ordered=True), } ) @@ -68,14 +67,11 @@ def check_create_table(category, pandas): conn.execute("DROP TABLE t1") -# TODO: extend tests with ArrowPandas # noqa: TD002, TD003 class TestCategory: - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_category_string_uint16(self, duckdb_cursor, pandas): + def test_category_string_uint16(self, duckdb_cursor): category = [str(i) for i in range(300)] - check_create_table(category, pandas) + check_create_table(category) - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_category_string_uint32(self, duckdb_cursor, pandas): + def test_category_string_uint32(self, duckdb_cursor): category = [str(i) for i in range(70000)] - check_create_table(category, pandas) + check_create_table(category) diff --git a/tests/extensions/test_httpfs.py b/tests/extensions/test_httpfs.py index 26ce917c..b8335814 100644 --- a/tests/extensions/test_httpfs.py +++ b/tests/extensions/test_httpfs.py @@ -1,8 +1,8 @@ import datetime import os +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -34,8 +34,7 @@ def test_s3fs(self, require): res = rel.fetchone() assert res == (1, 0, datetime.date(1965, 2, 28), 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 6, 0, 0, 0, 0) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_httpfs(self, require, pandas): + def test_httpfs(self, require): connection = require("httpfs") try: connection.execute(""" @@ -51,14 +50,14 @@ def test_httpfs(self, require, pandas): raise result_df = connection.fetchdf() - exp_result = pandas.DataFrame( + exp_result = pd.DataFrame( { - "id": pandas.Series([1, 2, 3], dtype="int32"), + "id": pd.Series([1, 2, 3], dtype="int32"), "first_name": ["Amanda", "Albert", "Evelyn"], "last_name": ["Jordan", "Freeman", "Morgan"], } ) - pandas.testing.assert_frame_equal(result_df, exp_result) + pd.testing.assert_frame_equal(result_df, exp_result, check_dtype=False) def test_http_exception(self, require): connection = require("httpfs") diff --git a/tests/fast/api/test_3654.py b/tests/fast/api/test_3654.py index a6b01dd5..11f37946 100644 --- a/tests/fast/api/test_3654.py +++ b/tests/fast/api/test_3654.py @@ -1,4 +1,4 @@ -import pytest +import pandas as pd import duckdb @@ -8,13 +8,11 @@ can_run = True except Exception: can_run = False -from conftest import ArrowPandas, NumpyPandas class Test3654: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_3654_pandas(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_3654_pandas(self, duckdb_cursor): + df1 = pd.DataFrame( { "id": [1, 1, 2], } @@ -25,12 +23,11 @@ def test_3654_pandas(self, duckdb_cursor, pandas): print(rel.execute().fetchall()) assert rel.execute().fetchall() == [(1,), (1,), (2,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_3654_arrow(self, duckdb_cursor, pandas): + def test_3654_arrow(self, duckdb_cursor): if not can_run: return - df1 = pandas.DataFrame( + df1 = pd.DataFrame( { "id": [1, 1, 2], } diff --git a/tests/fast/api/test_config.py b/tests/fast/api/test_config.py index aaec24c4..7d1370eb 100644 --- a/tests/fast/api/test_config.py +++ b/tests/fast/api/test_config.py @@ -2,37 +2,32 @@ import os import re -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestDBConfig: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_default_order(self, duckdb_cursor, 
pandas): - df = pandas.DataFrame({"a": [1, 2, 3]}) + def test_default_order(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3]}) con = duckdb.connect(":memory:", config={"default_order": "desc"}) result = con.execute("select * from df order by a").fetchall() assert result == [(3,), (2,), (1,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_null_order(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3, None]}) + def test_null_order(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3, None]}) con = duckdb.connect(":memory:", config={"default_null_order": "nulls_last"}) result = con.execute("select * from df order by a").fetchall() assert result == [(1,), (2,), (3,), (None,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_multiple_options(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3, None]}) + def test_multiple_options(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3, None]}) con = duckdb.connect(":memory:", config={"default_null_order": "nulls_last", "default_order": "desc"}) result = con.execute("select * from df order by a").fetchall() assert result == [(3,), (2,), (1,), (None,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_external_access(self, duckdb_cursor, pandas): - df = pandas.DataFrame({"a": [1, 2, 3]}) + def test_external_access(self, duckdb_cursor): + df = pd.DataFrame({"a": [1, 2, 3]}) # this works (replacement scan) con_regular = duckdb.connect(":memory:", config={}) con_regular.execute("select * from df") diff --git a/tests/fast/api/test_dbapi00.py b/tests/fast/api/test_dbapi00.py index 425cb7e1..4a942128 100644 --- a/tests/fast/api/test_dbapi00.py +++ b/tests/fast/api/test_dbapi00.py @@ -1,8 +1,8 @@ # simple DB API testcase import numpy +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas def assert_result_equal(result): @@ -83,30 +83,29 @@ def test_numpy_selection(self, duckdb_cursor, integers, timestamps): arr.mask = [False, False, True] numpy.testing.assert_array_equal(result["t"], arr, "Incorrect result returned") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_selection(self, duckdb_cursor, pandas, integers, timestamps): + def test_pandas_selection(self, duckdb_cursor, integers, timestamps): import datetime from packaging.version import Version # I don't know when this exactly changed, but 2.0.3 does not support this, recent versions do - if Version(pandas.__version__) <= Version("2.0.3"): + if Version(pd.__version__) <= Version("2.0.3"): pytest.skip("The resulting dtype is 'object' when given a Series with dtype Int32DType") duckdb_cursor.execute("SELECT * FROM integers") result = duckdb_cursor.fetchdf() array = numpy.ma.masked_array(numpy.arange(11)) array.mask = [False] * 10 + [True] - arr = {"i": pandas.Series(array.data, dtype=pandas.Int32Dtype)} - arr["i"][array.mask] = pandas.NA - arr = pandas.DataFrame(arr) - pandas.testing.assert_frame_equal(result, arr) + arr = {"i": pd.Series(array.data, dtype=pd.Int32Dtype)} + arr["i"][array.mask] = pd.NA + arr = pd.DataFrame(arr) + pd.testing.assert_frame_equal(result, arr) duckdb_cursor.execute("SELECT * FROM timestamps") result = duckdb_cursor.fetchdf() - df = pandas.DataFrame( + df = pd.DataFrame( { - "t": pandas.Series( + "t": pd.Series( data=[ datetime.datetime(year=1992, month=10, day=3, hour=18, minute=34, second=45), datetime.datetime(year=2010, month=1, day=1, hour=0, minute=0, second=1), 
@@ -116,7 +115,7 @@ def test_pandas_selection(self, duckdb_cursor, pandas, integers, timestamps): ) } ) - pandas.testing.assert_frame_equal(result, df) + pd.testing.assert_frame_equal(result, df) # def test_numpy_creation(self, duckdb_cursor): # # numpyarray = {'i': numpy.arange(10), 'v': numpy.random.randint(100, size=(1, 10))} # segfaults diff --git a/tests/fast/api/test_dbapi08.py b/tests/fast/api/test_dbapi08.py index def4e925..79b2ce0b 100644 --- a/tests/fast/api/test_dbapi08.py +++ b/tests/fast/api/test_dbapi08.py @@ -1,21 +1,19 @@ # test fetchdf with various types -import pytest -from conftest import NumpyPandas +import pandas as pd import duckdb class TestType: - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_fetchdf(self, pandas): + def test_fetchdf(self): con = duckdb.connect() con.execute("CREATE TABLE items(item VARCHAR)") con.execute("INSERT INTO items VALUES ('jeans'), (''), (NULL)") res = con.execute("SELECT item FROM items").fetchdf() - assert isinstance(res, pandas.core.frame.DataFrame) + assert isinstance(res, pd.core.frame.DataFrame) - df = pandas.DataFrame({"item": ["jeans", "", None]}) + df = pd.DataFrame({"item": ["jeans", "", None]}) print(res) print(df) - pandas.testing.assert_frame_equal(res, df) + pd.testing.assert_frame_equal(res, df, check_dtype=False) diff --git a/tests/fast/api/test_duckdb_connection.py b/tests/fast/api/test_duckdb_connection.py index 246b9d92..efcc2203 100644 --- a/tests/fast/api/test_duckdb_connection.py +++ b/tests/fast/api/test_duckdb_connection.py @@ -1,7 +1,7 @@ import re +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -25,10 +25,9 @@ def tmp_database(tmp_path_factory): # This file contains tests for DuckDBPyConnection methods, # wrapped by the 'duckdb' module, to execute with the 'default_connection' class TestDuckDBConnection: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append(self, pandas): + def test_append(self): duckdb.execute("Create table integers (i integer)") - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -345,13 +344,12 @@ def test_unregister_with_scary_name(self, duckdb_cursor): with pytest.raises(duckdb.CatalogException): duckdb_cursor.sql(f'select * from "{escaped_scary_name}"') - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_out_of_scope(self, pandas): + def test_relation_out_of_scope(self): def temporary_scope(): # Create a connection, we will return this con = duckdb.connect() # Create a dataframe - df = pandas.DataFrame({"a": [1, 2, 3]}) + df = pd.DataFrame({"a": [1, 2, 3]}) # The dataframe has to be registered as well # making sure it does not go out of scope con.register("df", df) @@ -389,10 +387,11 @@ def test_interrupt(self): assert duckdb.interrupt is not None def test_wrap_shadowing(self): - pd = NumpyPandas() + import pandas as pd_local + import duckdb - df = pd.DataFrame({"a": [1, 2, 3]}) # noqa: F841 + df = pd_local.DataFrame({"a": [1, 2, 3]}) # noqa: F841 res = duckdb.sql("from df").fetchall() assert res == [(1,), (2,), (3,)] diff --git a/tests/fast/api/test_duckdb_query.py b/tests/fast/api/test_duckdb_query.py index 04531e49..8be3287c 100644 --- a/tests/fast/api/test_duckdb_query.py +++ b/tests/fast/api/test_duckdb_query.py @@ -1,5 +1,5 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb from duckdb import Value @@ -21,9 +21,8 @@ def test_duckdb_query(self, duckdb_cursor): res = 
duckdb_cursor.sql("select 42; select 84;").fetchall() assert res == [(84,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_duckdb_from_query_multiple_statements(self, pandas): - tst_df = pandas.DataFrame({"a": [1, 23, 3, 5]}) # noqa: F841 + def test_duckdb_from_query_multiple_statements(self): + tst_df = pd.DataFrame({"a": [1, 23, 3, 5]}) # noqa: F841 res = duckdb.sql( """ diff --git a/tests/fast/api/test_native_tz.py b/tests/fast/api/test_native_tz.py index 66b06565..61b9ba24 100644 --- a/tests/fast/api/test_native_tz.py +++ b/tests/fast/api/test_native_tz.py @@ -1,4 +1,5 @@ import datetime +import zoneinfo from pathlib import Path import pytest @@ -12,6 +13,17 @@ filename = str(Path(__file__).parent / ".." / "data" / "tz.parquet") +def get_tz_string(obj): + if isinstance(obj, zoneinfo.ZoneInfo): + # Pandas 3.0.0 creates ZoneInfo objects + return obj.key + if hasattr(obj, "zone"): + # Before 3.0.0 Pandas created tzdata objects + return obj.zone + msg = f"Can't get tz string from {obj}" + raise ValueError(msg) + + class TestNativeTimeZone: def test_native_python_timestamp_timezone(self, duckdb_cursor): duckdb_cursor.execute("SET timezone='America/Los_Angeles';") @@ -46,7 +58,7 @@ def test_native_python_time_timezone(self, duckdb_cursor): def test_pandas_timestamp_timezone(self, duckdb_cursor): res = duckdb_cursor.execute("SET timezone='America/Los_Angeles';") res = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").df() - assert res.dtypes["tz"].tz.zone == "America/Los_Angeles" + assert get_tz_string(res.dtypes["tz"].tz) == "America/Los_Angeles" assert res["tz"][0].hour == 14 assert res["tz"][0].minute == 52 @@ -65,16 +77,16 @@ def test_pandas_timestamp_time(self, duckdb_cursor): Version(pa.__version__) < Version("15.0.0"), reason="pyarrow 14.0.2 'to_pandas' causes a DeprecationWarning" ) def test_arrow_timestamp_timezone(self, duckdb_cursor): - res = duckdb_cursor.execute("SET timezone='America/Los_Angeles';") + duckdb_cursor.execute("SET timezone='America/Los_Angeles';") table = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").fetch_arrow_table() res = table.to_pandas() - assert res.dtypes["tz"].tz.zone == "America/Los_Angeles" + assert get_tz_string(res.dtypes["tz"].tz) == "America/Los_Angeles" assert res["tz"][0].hour == 14 assert res["tz"][0].minute == 52 duckdb_cursor.execute("SET timezone='UTC';") res = duckdb_cursor.execute(f"select TimeRecStart as tz from '{filename}'").fetch_arrow_table().to_pandas() - assert res.dtypes["tz"].tz.zone == "UTC" + assert get_tz_string(res.dtypes["tz"].tz) == "UTC" assert res["tz"][0].hour == 21 assert res["tz"][0].minute == 52 diff --git a/tests/fast/api/test_to_csv.py b/tests/fast/api/test_to_csv.py index 97f13d8b..1354888a 100644 --- a/tests/fast/api/test_to_csv.py +++ b/tests/fast/api/test_to_csv.py @@ -3,17 +3,17 @@ import os import tempfile +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas, getTimeSeriesData +from conftest import getTimeSeriesData import duckdb class TestToCSV: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_basic_to_csv(self, pandas): + def test_basic_to_csv(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name) @@ -21,10 +21,9 @@ def 
test_basic_to_csv(self, pandas): csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_sep(self, pandas): + def test_to_csv_sep(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, sep=",") @@ -32,10 +31,9 @@ def test_to_csv_sep(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, sep=",") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_na_rep(self, pandas): + def test_to_csv_na_rep(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, na_rep="test") @@ -43,10 +41,9 @@ def test_to_csv_na_rep(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, na_values="test") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_header(self, pandas): + def test_to_csv_header(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name) @@ -54,10 +51,9 @@ def test_to_csv_header(self, pandas): csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quotechar(self, pandas): + def test_to_csv_quotechar(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) + df = pd.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quotechar="'", sep=",") @@ -65,10 +61,9 @@ def test_to_csv_quotechar(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, sep=",", quotechar="'") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_escapechar(self, pandas): + def test_to_csv_escapechar(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_bool": [True, False], "c_float": [1.0, 3.2], @@ -81,12 +76,11 @@ def test_to_csv_escapechar(self, pandas): csv_rel = duckdb.read_csv(temp_file_name, quotechar='"', escapechar="!") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_date_format(self, pandas): + def test_to_csv_date_format(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame(getTimeSeriesData()) + df = 
pd.DataFrame(getTimeSeriesData()) dt_index = df.index - df = pandas.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) + df = pd.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, date_format="%Y%m%d") @@ -94,11 +88,10 @@ def test_to_csv_date_format(self, pandas): assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_timestamp_format(self, pandas): + def test_to_csv_timestamp_format(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 data = [datetime.time(hour=23, minute=1, second=34, microsecond=234345)] - df = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + df = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, timestamp_format="%m/%d/%Y") @@ -106,68 +99,61 @@ def test_to_csv_timestamp_format(self, pandas): assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_off(self, pandas): + def test_to_csv_quoting_off(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quoting=None) csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_on(self, pandas): + def test_to_csv_quoting_on(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quoting="force") csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quoting_quote_all(self, pandas): + def test_to_csv_quoting_quote_all(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, quoting=csv.QUOTE_ALL) csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_encoding_incorrect(self, pandas): + def test_to_csv_encoding_incorrect(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) with pytest.raises( duckdb.InvalidInputException, match="Invalid Input Error: The only supported encoding option is 'UTF8" ): rel.to_csv(temp_file_name, encoding="nope") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_encoding_correct(self, pandas): + def test_to_csv_encoding_correct(self): 
temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, encoding="UTF-8") csv_rel = duckdb.read_csv(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_compression_gzip(self, pandas): + def test_compression_gzip(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) + df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) rel.to_csv(temp_file_name, compression="gzip") csv_rel = duckdb.read_csv(temp_file_name, compression="gzip") assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_partition(self, pandas): + def test_to_csv_partition(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category": ["a", "a", "b", "b"], "c_bool": [True, False, True, True], @@ -190,10 +176,9 @@ def test_to_csv_partition(self, pandas): assert csv_rel.execute().fetchall() == expected - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_partition_with_columns_written(self, pandas): + def test_to_csv_partition_with_columns_written(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category": ["a", "a", "b", "b"], "c_bool": [True, False, True, True], @@ -210,10 +195,9 @@ def test_to_csv_partition_with_columns_written(self, pandas): ) assert res.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite(self, pandas): + def test_to_csv_overwrite(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], @@ -238,10 +222,9 @@ def test_to_csv_overwrite(self, pandas): ] assert csv_rel.execute().fetchall() == expected - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite_with_columns_written(self, pandas): + def test_to_csv_overwrite_with_columns_written(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], @@ -264,10 +247,9 @@ def test_to_csv_overwrite_with_columns_written(self, pandas): res = duckdb.sql("FROM rel order by all") assert res.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_overwrite_not_enabled(self, pandas): + def test_to_csv_overwrite_not_enabled(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], @@ -282,12 +264,11 @@ def 
test_to_csv_overwrite_not_enabled(self, pandas): with pytest.raises(duckdb.IOException, match="OVERWRITE"): rel.to_csv(temp_file_name, header=True, partition_by=["c_category_1"]) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_per_thread_output(self, pandas): + def test_to_csv_per_thread_output(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 num_threads = duckdb.sql("select current_setting('threads')").fetchone()[0] print("num_threads:", num_threads) - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category": ["a", "a", "b", "b"], "c_bool": [True, False, True, True], @@ -301,10 +282,9 @@ def test_to_csv_per_thread_output(self, pandas): csv_rel = duckdb.read_csv(f"{temp_file_name}/*.csv", header=True) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_use_tmp_file(self, pandas): + def test_to_csv_use_tmp_file(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 - df = pandas.DataFrame( + df = pd.DataFrame( { "c_category_1": ["a", "a", "b", "b"], "c_category_2": ["c", "c", "d", "d"], diff --git a/tests/fast/api/test_to_parquet.py b/tests/fast/api/test_to_parquet.py index 370ab8e4..5c70bf3f 100644 --- a/tests/fast/api/test_to_parquet.py +++ b/tests/fast/api/test_to_parquet.py @@ -3,15 +3,14 @@ import re import tempfile +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb class TestToParquet: - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_basic_to_parquet(self, pd): + def test_basic_to_parquet(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) rel = duckdb.from_df(df) @@ -21,8 +20,7 @@ def test_basic_to_parquet(self, pd): csv_rel = duckdb.read_parquet(temp_file_name) assert rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_compression_gzip(self, pd): + def test_compression_gzip(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) @@ -50,9 +48,8 @@ def test_field_ids(self): """ ).execute().fetchall() == [("duckdb_schema", None), ("i", 42), ("my_struct", 43), ("j", 44)] - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("row_group_size_bytes", [122880 * 1024, "2MB"]) - def test_row_group_size_bytes(self, pd, row_group_size_bytes): + def test_row_group_size_bytes(self, row_group_size_bytes): con = duckdb.connect() con.execute("SET preserve_insertion_order=false;") @@ -63,8 +60,7 @@ def test_row_group_size_bytes(self, pd, row_group_size_bytes): parquet_rel = con.read_parquet(temp_file_name) assert rel.execute().fetchall() == parquet_rel.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_row_group_size(self, pd): + def test_row_group_size(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame({"a": ["string1", "string2", "string3"]}) rel = duckdb.from_df(df) @@ -72,9 +68,8 @@ def test_row_group_size(self, pd): parquet_rel = duckdb.read_parquet(temp_file_name) assert 
rel.execute().fetchall() == parquet_rel.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("write_columns", [None, True, False]) - def test_partition(self, pd, write_columns): + def test_partition(self, write_columns): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -89,9 +84,8 @@ def test_partition(self, pd, write_columns): expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")] assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("write_columns", [None, True, False]) - def test_overwrite(self, pd, write_columns): + def test_overwrite(self, write_columns): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -108,8 +102,7 @@ def test_overwrite(self, pd, write_columns): assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_use_tmp_file(self, pd): + def test_use_tmp_file(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -124,8 +117,7 @@ def test_use_tmp_file(self, pd): result = duckdb.read_parquet(temp_file_name) assert rel.execute().fetchall() == result.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_per_thread_output(self, pd): + def test_per_thread_output(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 num_threads = duckdb.sql("select current_setting('threads')").fetchone()[0] print("threads:", num_threads) @@ -141,8 +133,7 @@ def test_per_thread_output(self, pd): result = duckdb.read_parquet(f"{temp_file_name}/*.parquet") assert rel.execute().fetchall() == result.execute().fetchall() - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_append(self, pd): + def test_append(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -173,8 +164,7 @@ def test_append(self, pd): ] assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_filename_pattern_with_index(self, pd): + def test_filename_pattern_with_index(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -199,8 +189,7 @@ def test_filename_pattern_with_index(self, pd): expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")] assert result.execute().fetchall() == expected - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_filename_pattern_with_uuid(self, pd): + def test_filename_pattern_with_uuid(self): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { @@ -242,9 +231,8 @@ def test_file_size_bytes_basic(self, file_size_bytes): result = duckdb.read_parquet(f"{temp_file_name}/*.parquet") assert len(result.execute().fetchall()) == 10000 - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"]) - def test_file_size_bytes_human_readable(self, pd, file_size_bytes): + def 
test_file_size_bytes_human_readable(self, file_size_bytes): temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names())) # noqa: PTH118 df = pd.DataFrame( { diff --git a/tests/fast/arrow/test_6796.py b/tests/fast/arrow/test_6796.py index bf557038..a9e877d5 100644 --- a/tests/fast/arrow/test_6796.py +++ b/tests/fast/arrow/test_6796.py @@ -1,15 +1,14 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb pyarrow = pytest.importorskip("pyarrow") -@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) -def test_6796(pandas): +def test_6796(): conn = duckdb.connect() - input_df = pandas.DataFrame({"foo": ["bar"]}) + input_df = pd.DataFrame({"foo": ["bar"]}) conn.register("input_df", input_df) query = """ diff --git a/tests/fast/pandas/test_2304.py b/tests/fast/pandas/test_2304.py index c60b1b4a..e40c2dd1 100644 --- a/tests/fast/pandas/test_2304.py +++ b/tests/fast/pandas/test_2304.py @@ -1,14 +1,12 @@ import numpy as np -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestPandasMergeSameName: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_2304(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_2304(self, duckdb_cursor): + df1 = pd.DataFrame( { "id_1": [1, 1, 1, 2, 2], "agedate": np.array(["2010-01-01", "2010-02-01", "2010-03-01", "2020-02-01", "2020-03-01"]).astype( @@ -19,7 +17,7 @@ def test_2304(self, duckdb_cursor, pandas): } ) - df2 = pandas.DataFrame( + df2 = pd.DataFrame( { "id_1": [1, 1, 2], "agedate": np.array(["2010-01-01", "2010-02-01", "2020-03-01"]).astype("datetime64[D]"), @@ -54,9 +52,8 @@ def test_2304(self, duckdb_cursor, pandas): assert result == expected_result - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pd_names(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_pd_names(self, duckdb_cursor): + df1 = pd.DataFrame( { "id": [1, 1, 2], "id_1": [1, 1, 2], @@ -64,9 +61,9 @@ def test_pd_names(self, duckdb_cursor, pandas): } ) - df2 = pandas.DataFrame({"id": [1, 1, 2], "id_1": [1, 1, 2], "id_2": [1, 1, 1]}) + df2 = pd.DataFrame({"id": [1, 1, 2], "id_1": [1, 1, 2], "id_2": [1, 1, 1]}) - exp_result = pandas.DataFrame( + exp_result = pd.DataFrame( { "id": [1, 1, 2, 1, 1], "id_1": [1, 1, 2, 1, 1], @@ -85,11 +82,10 @@ def test_pd_names(self, duckdb_cursor, pandas): ON (df1.id_1=df2.id_1)""" result_df = con.execute(query).fetchdf() - pandas.testing.assert_frame_equal(exp_result, result_df) + pd.testing.assert_frame_equal(exp_result, result_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_repeat_name(self, duckdb_cursor, pandas): - df1 = pandas.DataFrame( + def test_repeat_name(self, duckdb_cursor): + df1 = pd.DataFrame( { "id": [1], "id_1": [1], @@ -97,9 +93,9 @@ def test_repeat_name(self, duckdb_cursor, pandas): } ) - df2 = pandas.DataFrame({"id": [1]}) + df2 = pd.DataFrame({"id": [1]}) - exp_result = pandas.DataFrame( + exp_result = pd.DataFrame( { "id": [1], "id_1": [1], @@ -119,4 +115,4 @@ def test_repeat_name(self, duckdb_cursor, pandas): ON (df1.id=df2.id) """ ).fetchdf() - pandas.testing.assert_frame_equal(exp_result, result_df) + pd.testing.assert_frame_equal(exp_result, result_df) diff --git a/tests/fast/pandas/test_append_df.py b/tests/fast/pandas/test_append_df.py index d93cfa2d..be287a8f 100644 --- a/tests/fast/pandas/test_append_df.py +++ b/tests/fast/pandas/test_append_df.py @@ -1,15 +1,14 @@ +import 
pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb class TestAppendDF: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_to_table_append(self, duckdb_cursor, pandas): + def test_df_to_table_append(self, duckdb_cursor): conn = duckdb.connect() conn.execute("Create table integers (i integer)") - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -17,11 +16,10 @@ def test_df_to_table_append(self, duckdb_cursor, pandas): conn.append("integers", df_in) assert conn.execute("select count(*) from integers").fetchone()[0] == 5 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append_by_name(self, pandas): + def test_append_by_name(self): con = duckdb.connect() con.execute("create table tbl (a integer, b bool, c varchar)") - df_in = pandas.DataFrame({"c": ["duck", "db"], "b": [False, True], "a": [4, 2]}) + df_in = pd.DataFrame({"c": ["duck", "db"], "b": [False, True], "a": [4, 2]}) # By default we append by position, causing the following exception: with pytest.raises( duckdb.ConversionException, match="Conversion Error: Could not convert string 'duck' to INT32" @@ -33,29 +31,27 @@ def test_append_by_name(self, pandas): res = con.table("tbl").fetchall() assert res == [(4, False, "duck"), (2, True, "db")] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append_by_name_quoted(self, pandas): + def test_append_by_name_quoted(self): con = duckdb.connect() con.execute( """ create table tbl ("needs to be quoted" integer, other varchar) """ ) - df_in = pandas.DataFrame({"needs to be quoted": [1, 2, 3]}) + df_in = pd.DataFrame({"needs to be quoted": [1, 2, 3]}) con.append("tbl", df_in, by_name=True) res = con.table("tbl").fetchall() assert res == [(1, None), (2, None), (3, None)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_append_by_name_no_exact_match(self, pandas): + def test_append_by_name_no_exact_match(self): con = duckdb.connect() con.execute("create table tbl (a integer, b bool)") - df_in = pandas.DataFrame({"c": ["a", "b"], "b": [True, False], "a": [42, 1337]}) + df_in = pd.DataFrame({"c": ["a", "b"], "b": [True, False], "a": [42, 1337]}) # Too many columns raises an error, because the columns cant be found in the targeted table with pytest.raises(duckdb.BinderException, match='Table "tbl" does not have a column with name "c"'): con.append("tbl", df_in, by_name=True) - df_in = pandas.DataFrame({"b": [False, False, False]}) + df_in = pd.DataFrame({"b": [False, False, False]}) # Not matching all columns is not a problem, as they will be filled with NULL instead con.append("tbl", df_in, by_name=True) @@ -66,7 +62,7 @@ def test_append_by_name_no_exact_match(self, pandas): # Empty the table con.execute("create or replace table tbl (a integer, b bool)") - df_in = pandas.DataFrame({"a": [1, 2, 3]}) + df_in = pd.DataFrame({"a": [1, 2, 3]}) con.append("tbl", df_in, by_name=True) res = con.table("tbl").fetchall() # Also works for missing columns *after* the supplied ones diff --git a/tests/fast/pandas/test_bug5922.py b/tests/fast/pandas/test_bug5922.py index b75ddf1b..196764e3 100644 --- a/tests/fast/pandas/test_bug5922.py +++ b/tests/fast/pandas/test_bug5922.py @@ -1,13 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestPandasAcceptFloat16: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_accept_float16(self, 
duckdb_cursor, pandas): - df = pandas.DataFrame({"col": [1, 2, 3]}) + def test_pandas_accept_float16(self, duckdb_cursor): + df = pd.DataFrame({"col": [1, 2, 3]}) df16 = df.astype({"col": "float16"}) # noqa: F841 con = duckdb.connect() con.execute("CREATE TABLE tbl AS SELECT * FROM df16") diff --git a/tests/fast/pandas/test_copy_on_write.py b/tests/fast/pandas/test_copy_on_write.py index 176c2133..417fae0d 100644 --- a/tests/fast/pandas/test_copy_on_write.py +++ b/tests/fast/pandas/test_copy_on_write.py @@ -1,26 +1,27 @@ import datetime import pytest +from packaging.version import Version import duckdb # https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html pandas = pytest.importorskip("pandas", "1.5", reason="copy_on_write does not exist in earlier versions") +# Starting from Pandas 3.0.0 copy-on-write can no longer be disabled and this setting is deprecated +pre_3_0 = Version(pandas.__version__) < Version("3.0.0") # Make sure the variable get's properly reset even in case of error @pytest.fixture(autouse=True) def scoped_copy_on_write_setting(): - old_value = pandas.options.mode.copy_on_write - pandas.options.mode.copy_on_write = True - yield - # Reset it at the end of the function - pandas.options.mode.copy_on_write = old_value - return - - -def convert_to_result(col): - return [(x,) for x in col] + if pre_3_0: + old_value = pandas.options.mode.copy_on_write + pandas.options.mode.copy_on_write = True + yield + # Reset it at the end of the function + pandas.options.mode.copy_on_write = old_value + else: + yield class TestCopyOnWrite: @@ -35,7 +36,6 @@ class TestCopyOnWrite: ], ) def test_copy_on_write(self, col): - assert pandas.options.mode.copy_on_write con = duckdb.connect() df_in = pandas.DataFrame( # noqa: F841 { @@ -45,5 +45,5 @@ def test_copy_on_write(self, col): rel = con.sql("select * from df_in") res = rel.fetchall() print(res) - expected = convert_to_result(col) + expected = [(x,) for x in col] assert res == expected diff --git a/tests/fast/pandas/test_create_table_from_pandas.py b/tests/fast/pandas/test_create_table_from_pandas.py index 436fd0c8..b9937de2 100644 --- a/tests/fast/pandas/test_create_table_from_pandas.py +++ b/tests/fast/pandas/test_create_table_from_pandas.py @@ -1,12 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb -def assert_create(internal_data, expected_result, data_type, pandas): +def assert_create(internal_data, expected_result, data_type): conn = duckdb.connect() - df_in = pandas.DataFrame(data=internal_data, dtype=data_type) # noqa: F841 + df_in = pd.DataFrame(data=internal_data, dtype=data_type) # noqa: F841 conn.execute("CREATE TABLE t AS SELECT * FROM df_in") @@ -14,9 +13,9 @@ def assert_create(internal_data, expected_result, data_type, pandas): assert result == expected_result -def assert_create_register(internal_data, expected_result, data_type, pandas): +def assert_create_register(internal_data, expected_result, data_type): conn = duckdb.connect() - df_in = pandas.DataFrame(data=internal_data, dtype=data_type) + df_in = pd.DataFrame(data=internal_data, dtype=data_type) conn.register("dataframe", df_in) conn.execute("CREATE TABLE t AS SELECT * FROM dataframe") @@ -25,15 +24,14 @@ def assert_create_register(internal_data, expected_result, data_type, pandas): class TestCreateTableFromPandas: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_integer_create_table(self, duckdb_cursor, pandas): + def test_integer_create_table(self, duckdb_cursor): # TODO: 
This should work with other data types e.g., int8... # noqa: TD002, TD003 data_types = ["Int8", "Int16", "Int32", "Int64"] internal_data = [1, 2, 3, 4] expected_result = [(1,), (2,), (3,), (4,)] for data_type in data_types: print(data_type) - assert_create_register(internal_data, expected_result, data_type, pandas) - assert_create(internal_data, expected_result, data_type, pandas) + assert_create_register(internal_data, expected_result, data_type) + assert_create(internal_data, expected_result, data_type) # TODO: Also test other data types # noqa: TD002, TD003 diff --git a/tests/fast/pandas/test_datetime_time.py b/tests/fast/pandas/test_datetime_time.py index 0b2642b0..a2fda09a 100644 --- a/tests/fast/pandas/test_datetime_time.py +++ b/tests/fast/pandas/test_datetime_time.py @@ -1,8 +1,8 @@ from datetime import datetime, time, timezone import numpy as np +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -10,25 +10,22 @@ class TestDateTimeTime: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_time_high(self, duckdb_cursor, pandas): + def test_time_high(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql("SELECT make_time(23, 1, 34.234345) AS '0'").df() data = [time(hour=23, minute=1, second=34, microsecond=234345)] - df_in = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + df_in = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) df_out = duckdb.query_df(df_in, "df", "select * from df").df() - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_time_low(self, duckdb_cursor, pandas): + def test_time_low(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql("SELECT make_time(00, 01, 1.000) AS '0'").df() data = [time(hour=0, minute=1, second=1)] - df_in = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + df_in = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) df_out = duckdb.query_df(df_in, "df", "select * from df").df() - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("input", ["2263-02-28", "9999-01-01"]) - def test_pandas_datetime_big(self, pandas, input): + def test_pandas_datetime_big(self, input): duckdb_con = duckdb.connect() duckdb_con.execute("create table test (date DATE)") @@ -36,8 +33,8 @@ def test_pandas_datetime_big(self, pandas, input): res = duckdb_con.execute("select * from test").df() date_value = np.array([f"{input}"], dtype="datetime64[us]") - df = pandas.DataFrame({"date": date_value}) - pandas.testing.assert_frame_equal(res, df) + df = pd.DataFrame({"date": date_value}) + pd.testing.assert_frame_equal(res, df) def test_timezone_datetime(self): con = duckdb.connect() diff --git a/tests/fast/pandas/test_datetime_timestamp.py b/tests/fast/pandas/test_datetime_timestamp.py index c6d4e3a9..063be160 100644 --- a/tests/fast/pandas/test_datetime_timestamp.py +++ b/tests/fast/pandas/test_datetime_timestamp.py @@ -1,39 +1,35 @@ import datetime +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas from packaging.version import Version -pd = pytest.importorskip("pandas") - class TestDateTimeTimeStamp: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_timestamp_high(self, pandas, duckdb_cursor): + def 
test_timestamp_high(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql("SELECT '2260-01-01 23:59:00'::TIMESTAMP AS '0'").df() - df_in = pandas.DataFrame( # noqa: F841 + df_in = pd.DataFrame( # noqa: F841 { - 0: pandas.Series( + 0: pd.Series( data=[datetime.datetime(year=2260, month=1, day=1, hour=23, minute=59)], dtype="datetime64[us]", ) } ) df_out = duckdb_cursor.sql("select * from df_in").df() - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_timestamp_low(self, pandas, duckdb_cursor): + def test_timestamp_low(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql( """ SELECT '1680-01-01 23:59:00.234243'::TIMESTAMP AS '0' """ ).df() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { - "0": pandas.Series( + "0": pd.Series( data=[ - pandas.Timestamp( + pd.Timestamp( datetime.datetime(year=1680, month=1, day=1, hour=23, minute=59, microsecond=234243), unit="us", ) @@ -46,13 +42,12 @@ def test_timestamp_low(self, pandas, duckdb_cursor): print("df_in:", df_in["0"].dtype) df_out = duckdb_cursor.sql("select * from df_in").df() print("df_out:", df_out["0"].dtype) - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) @pytest.mark.skipif( Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones" ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_timestamp_timezone_regular(self, pandas, duckdb_cursor): + def test_timestamp_timezone_regular(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql( """ SELECT timestamp '2022-01-01 12:00:00' AT TIME ZONE 'Pacific/Easter' as "0" @@ -61,9 +56,9 @@ def test_timestamp_timezone_regular(self, pandas, duckdb_cursor): offset = datetime.timedelta(hours=-2) timezone = datetime.timezone(offset) - df_in = pandas.DataFrame( # noqa: F841 + df_in = pd.DataFrame( # noqa: F841 { - 0: pandas.Series( + 0: pd.Series( data=[datetime.datetime(year=2022, month=1, day=1, hour=15, tzinfo=timezone)], dtype="object" ) } @@ -71,13 +66,12 @@ def test_timestamp_timezone_regular(self, pandas, duckdb_cursor): df_out = duckdb_cursor.sql("select * from df_in").df() print(df_out) print(duckdb_time) - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) @pytest.mark.skipif( Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones" ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_timestamp_timezone_negative_extreme(self, pandas, duckdb_cursor): + def test_timestamp_timezone_negative_extreme(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql( """ SELECT timestamp '2022-01-01 12:00:00' AT TIME ZONE 'Chile/EasterIsland' as "0" @@ -87,21 +81,20 @@ def test_timestamp_timezone_negative_extreme(self, pandas, duckdb_cursor): offset = datetime.timedelta(hours=-19) timezone = datetime.timezone(offset) - df_in = pandas.DataFrame( # noqa: F841 + df_in = pd.DataFrame( # noqa: F841 { - 0: pandas.Series( + 0: pd.Series( data=[datetime.datetime(year=2021, month=12, day=31, hour=22, tzinfo=timezone)], dtype="object" ) } ) df_out = duckdb_cursor.sql("select * from df_in").df() - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) @pytest.mark.skipif( Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert 
timezones" ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_timestamp_timezone_positive_extreme(self, pandas, duckdb_cursor): + def test_timestamp_timezone_positive_extreme(self, duckdb_cursor): duckdb_time = duckdb_cursor.sql( """ SELECT timestamp '2021-12-31 23:00:00' AT TIME ZONE 'Etc/GMT-14' as "0" @@ -111,22 +104,21 @@ def test_timestamp_timezone_positive_extreme(self, pandas, duckdb_cursor): offset = datetime.timedelta(hours=14) timezone = datetime.timezone(offset) - df_in = pandas.DataFrame( # noqa: F841 + df_in = pd.DataFrame( # noqa: F841 { - 0: pandas.Series( + 0: pd.Series( data=[datetime.datetime(year=2021, month=12, day=31, hour=23, tzinfo=timezone)], dtype="object" ) } ) df_out = duckdb_cursor.sql("""select * from df_in""").df() - pandas.testing.assert_frame_equal(df_out, duckdb_time) + pd.testing.assert_frame_equal(df_out, duckdb_time) @pytest.mark.skipif( Version(pd.__version__) < Version("2.0.2"), reason="pandas < 2.0.2 does not properly convert timezones" ) @pytest.mark.parametrize("unit", ["ms", "ns", "s"]) def test_timestamp_timezone_coverage(self, unit, duckdb_cursor): - pd = pytest.importorskip("pandas") ts_df = pd.DataFrame( # noqa: F841 {"ts": pd.Series(data=[pd.Timestamp(datetime.datetime(1990, 12, 21))], dtype=f"datetime64[{unit}]")} ) diff --git a/tests/fast/pandas/test_df_analyze.py b/tests/fast/pandas/test_df_analyze.py index 96cd426d..d9881ffa 100644 --- a/tests/fast/pandas/test_df_analyze.py +++ b/tests/fast/pandas/test_df_analyze.py @@ -1,58 +1,51 @@ -import numpy as np +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas +from conftest import is_string_dtype import duckdb -def create_generic_dataframe(data, pandas): - return pandas.DataFrame({"col0": pandas.Series(data=data, dtype="object")}) +def create_generic_dataframe(data): + return pd.DataFrame({"col0": pd.Series(data=data, dtype="object")}) class TestResolveObjectColumns: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_sample_low_correct(self, duckdb_cursor, pandas): - print(pandas.backend) + def test_sample_low_correct(self, duckdb_cursor): duckdb_conn = duckdb.connect() duckdb_conn.execute("SET pandas_analyze_sample=3") data = [1000008, 6, 9, 4, 1, 6] - df = create_generic_dataframe(data, pandas) + df = create_generic_dataframe(data) roundtripped_df = duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df() duckdb_df = duckdb_conn.query("select * FROM (VALUES (1000008), (6), (9), (4), (1), (6)) as '0'").df() - pandas.testing.assert_frame_equal(duckdb_df, roundtripped_df, check_dtype=False) + pd.testing.assert_frame_equal(duckdb_df, roundtripped_df, check_dtype=False) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_sample_low_incorrect_detected(self, duckdb_cursor, pandas): + def test_sample_low_incorrect_detected(self, duckdb_cursor): duckdb_conn = duckdb.connect() duckdb_conn.execute("SET pandas_analyze_sample=2") # size of list (6) divided by 'pandas_analyze_sample' (2) is the increment used # in this case index 0 (1000008) and index 3 ([4]) are checked, which dont match data = [1000008, 6, 9, [4], 1, 6] - df = create_generic_dataframe(data, pandas) + df = create_generic_dataframe(data) roundtripped_df = duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df() # Sample high enough to detect mismatch in types, fallback to VARCHAR - assert roundtripped_df["col0"].dtype == np.dtype("object") + assert 
is_string_dtype(roundtripped_df["col0"].dtype) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_sample_zero(self, duckdb_cursor, pandas): + def test_sample_zero_infers_varchar(self, duckdb_cursor): + """Test that with analyze disabled, object columns are treated as VARCHAR.""" duckdb_conn = duckdb.connect() # Disable dataframe analyze duckdb_conn.execute("SET pandas_analyze_sample=0") data = [1000008, 6, 9, 3, 1, 6] - df = create_generic_dataframe(data, pandas) + df = create_generic_dataframe(data) roundtripped_df = duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df() - # Always converts to VARCHAR - if pandas.backend == "pyarrow": - assert roundtripped_df["col0"].dtype == np.dtype("int64") - else: - assert roundtripped_df["col0"].dtype == np.dtype("object") + # Always converts to VARCHAR when analyze is disabled + assert is_string_dtype(roundtripped_df["col0"].dtype) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_sample_low_incorrect_undetected(self, duckdb_cursor, pandas): + def test_sample_low_incorrect_undetected(self, duckdb_cursor): duckdb_conn = duckdb.connect() duckdb_conn.execute("SET pandas_analyze_sample=1") data = [1000008, 6, 9, [4], [1], 6] - df = create_generic_dataframe(data, pandas) + df = create_generic_dataframe(data) # Sample size is too low to detect the mismatch, exception is raised when trying to convert with pytest.raises(duckdb.InvalidInputException, match="Failed to cast value: Unimplemented type for cast"): duckdb.query_df(df, "x", "select * from x", connection=duckdb_conn).df() @@ -65,12 +58,11 @@ def test_reset_analyze_sample_setting(self, duckdb_cursor): res = duckdb_cursor.execute("select current_setting('pandas_analyze_sample')").fetchall() assert res == [(1000,)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_10750(self, duckdb_cursor, pandas): + def test_10750(self, duckdb_cursor): max_row_number = 2000 data = {"id": list(range(max_row_number + 1)), "content": [None for _ in range(max_row_number + 1)]} - pdf = pandas.DataFrame(data=data) + pdf = pd.DataFrame(data=data) duckdb_cursor.register("content", pdf) res = duckdb_cursor.query("select id from content").fetchall() expected = [(i,) for i in range(2001)] diff --git a/tests/fast/pandas/test_df_object_resolution.py b/tests/fast/pandas/test_df_object_resolution.py index 58ae0c94..2f78e27d 100644 --- a/tests/fast/pandas/test_df_object_resolution.py +++ b/tests/fast/pandas/test_df_object_resolution.py @@ -7,16 +7,17 @@ from decimal import Decimal import numpy as np +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb +from tests.conftest import is_string_dtype standard_vector_size = duckdb.__standard_vector_size__ -def create_generic_dataframe(data, pandas): - return pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) +def create_generic_dataframe(data): + return pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) def create_repeated_nulls(size): @@ -42,11 +43,11 @@ def __str__(self) -> str: # To avoid DECIMAL being upgraded to DOUBLE (because DOUBLE outranks DECIMAL as a LogicalType) # These floats had their precision preserved as string and are now cast to decimal.Decimal -def ConvertStringToDecimal(data: list, pandas): +def ConvertStringToDecimal(data: list): for i in range(len(data)): if isinstance(data[i], str): data[i] = decimal.Decimal(data[i]) - data = pandas.Series(data=data, dtype="object") + data = pd.Series(data=data, 
dtype="object") return data @@ -74,9 +75,9 @@ def construct_map(pair): ] -def check_struct_upgrade(expected_type: str, creation_method, pair: ObjectPair, pandas, cursor): +def check_struct_upgrade(expected_type: str, creation_method, pair: ObjectPair, cursor): column_data = creation_method(pair) - df = pandas.DataFrame(data={"col": column_data}) + df = pd.DataFrame(data={"col": column_data}) rel = cursor.query("select col from df") res = rel.fetchall() print("COLUMN_DATA", column_data) @@ -85,29 +86,25 @@ def check_struct_upgrade(expected_type: str, creation_method, pair: ObjectPair, class TestResolveObjectColumns: - # TODO: add support for ArrowPandas # noqa: TD002, TD003 - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_integers(self, pandas, duckdb_cursor): + def test_integers(self, duckdb_cursor): data = [5, 0, 3] - df_in = create_generic_dataframe(data, pandas) + df_in = create_generic_dataframe(data) # These are float64 because pandas would force these to be float64 even if we set them to int8, int16, # int32, int64 respectively - df_expected_res = pandas.DataFrame({"0": pandas.Series(data=data, dtype="int32")}) + df_expected_res = pd.DataFrame({"0": pd.Series(data=data, dtype="int32")}) df_out = duckdb_cursor.sql("SELECT * FROM df_in").df() print(df_out) - pandas.testing.assert_frame_equal(df_expected_res, df_out) + pd.testing.assert_frame_equal(df_expected_res, df_out) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_struct_correct(self, pandas, duckdb_cursor): + def test_struct_correct(self, duckdb_cursor): data = [{"a": 1, "b": 3, "c": 3, "d": 7}] - df = pandas.DataFrame({"0": pandas.Series(data=data)}) + df = pd.DataFrame({"0": pd.Series(data=data)}) duckdb_col = duckdb_cursor.sql("SELECT {a: 1, b: 3, c: 3, d: 7} as '0'").df() converted_col = duckdb_cursor.sql("SELECT * FROM df").df() - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_fallback_different_keys(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_map_fallback_different_keys(self, duckdb_cursor): + x = pd.DataFrame( [ [{"a": 1, "b": 3, "c": 3, "d": 7}], [{"a": 1, "b": 3, "c": 3, "d": 7}], @@ -118,7 +115,7 @@ def test_map_fallback_different_keys(self, pandas, duckdb_cursor): ) converted_df = duckdb_cursor.sql("SELECT * FROM x").df() - y = pandas.DataFrame( + y = pd.DataFrame( [ [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], @@ -128,11 +125,10 @@ def test_map_fallback_different_keys(self, pandas, duckdb_cursor): ] ) equal_df = duckdb_cursor.sql("SELECT * FROM y").df() - pandas.testing.assert_frame_equal(converted_df, equal_df) + pd.testing.assert_frame_equal(converted_df, equal_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_fallback_incorrect_amount_of_keys(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_map_fallback_incorrect_amount_of_keys(self, duckdb_cursor): + x = pd.DataFrame( [ [{"a": 1, "b": 3, "c": 3, "d": 7}], [{"a": 1, "b": 3, "c": 3, "d": 7}], @@ -142,7 +138,7 @@ def test_map_fallback_incorrect_amount_of_keys(self, pandas, duckdb_cursor): ] ) converted_df = duckdb_cursor.sql("SELECT * FROM x").df() - y = pandas.DataFrame( + y = pd.DataFrame( [ [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], @@ -152,11 +148,10 @@ 
def test_map_fallback_incorrect_amount_of_keys(self, pandas, duckdb_cursor): ] ) equal_df = duckdb_cursor.sql("SELECT * FROM y").df() - pandas.testing.assert_frame_equal(converted_df, equal_df) + pd.testing.assert_frame_equal(converted_df, equal_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_struct_value_upgrade(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_struct_value_upgrade(self, duckdb_cursor): + x = pd.DataFrame( [ [{"a": 1, "b": 3, "c": 3, "d": "string"}], [{"a": 1, "b": 3, "c": 3, "d": 7}], @@ -165,7 +160,7 @@ def test_struct_value_upgrade(self, pandas, duckdb_cursor): [{"a": 1, "b": 3, "c": 3, "d": 7}], ] ) - y = pandas.DataFrame( + y = pd.DataFrame( [ [{"a": 1, "b": 3, "c": 3, "d": "string"}], [{"a": 1, "b": 3, "c": 3, "d": "7"}], @@ -176,11 +171,10 @@ def test_struct_value_upgrade(self, pandas, duckdb_cursor): ) converted_df = duckdb_cursor.sql("SELECT * FROM x").df() equal_df = duckdb_cursor.sql("SELECT * FROM y").df() - pandas.testing.assert_frame_equal(converted_df, equal_df) + pd.testing.assert_frame_equal(converted_df, equal_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_struct_null(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_struct_null(self, duckdb_cursor): + x = pd.DataFrame( [ [None], [{"a": 1, "b": 3, "c": 3, "d": 7}], @@ -189,7 +183,7 @@ def test_struct_null(self, pandas, duckdb_cursor): [{"a": 1, "b": 3, "c": 3, "d": 7}], ] ) - y = pandas.DataFrame( + y = pd.DataFrame( [ [None], [{"a": 1, "b": 3, "c": 3, "d": 7}], @@ -200,11 +194,10 @@ def test_struct_null(self, pandas, duckdb_cursor): ) converted_df = duckdb_cursor.sql("SELECT * FROM x").df() equal_df = duckdb_cursor.sql("SELECT * FROM y").df() - pandas.testing.assert_frame_equal(converted_df, equal_df) + pd.testing.assert_frame_equal(converted_df, equal_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_fallback_value_upgrade(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_map_fallback_value_upgrade(self, duckdb_cursor): + x = pd.DataFrame( [ [{"a": 1, "b": 3, "c": 3, "d": "test"}], [{"a": 1, "b": 3, "c": 3, "d": 7}], @@ -213,7 +206,7 @@ def test_map_fallback_value_upgrade(self, pandas, duckdb_cursor): [{"a": 1, "b": 3, "c": 3, "d": 7}], ] ) - y = pandas.DataFrame( + y = pd.DataFrame( [ [{"a": "1", "b": "3", "c": "3", "d": "test"}], [{"a": "1", "b": "3", "c": "3", "d": "7"}], @@ -224,11 +217,10 @@ def test_map_fallback_value_upgrade(self, pandas, duckdb_cursor): ) converted_df = duckdb_cursor.sql("SELECT * FROM x").df() equal_df = duckdb_cursor.sql("SELECT * FROM y").df() - pandas.testing.assert_frame_equal(converted_df, equal_df) + pd.testing.assert_frame_equal(converted_df, equal_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_correct(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_map_correct(self, duckdb_cursor): + x = pd.DataFrame( [ [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], @@ -255,23 +247,21 @@ def test_map_correct(self, pandas, duckdb_cursor): duckdb_col = duckdb_cursor.sql("select a from tmp AS '0'").df() print(duckdb_col.columns) print(converted_col.columns) - pandas.testing.assert_frame_equal(converted_col, duckdb_col) + pd.testing.assert_frame_equal(converted_col, duckdb_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) @pytest.mark.parametrize("sample_size", [1, 10]) 
@pytest.mark.parametrize("fill", [1000, 10000]) @pytest.mark.parametrize("get_data", [create_repeated_nulls, create_trailing_non_null]) - def test_analyzing_nulls(self, pandas, duckdb_cursor, fill, sample_size, get_data): + def test_analyzing_nulls(self, duckdb_cursor, fill, sample_size, get_data): data = get_data(fill) - df1 = pandas.DataFrame(data={"col1": data}) + df1 = pd.DataFrame(data={"col1": data}) duckdb_cursor.execute(f"SET GLOBAL pandas_analyze_sample={sample_size}") df = duckdb_cursor.execute("select * from df1").df() - pandas.testing.assert_frame_equal(df1, df) + pd.testing.assert_frame_equal(df1, df, check_dtype=False) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_nested_map(self, pandas, duckdb_cursor): - df = pandas.DataFrame(data={"col1": [{"a": {"b": {"x": "A", "y": "B"}}}, {"c": {"b": {"x": "A"}}}]}) + def test_nested_map(self, duckdb_cursor): + df = pd.DataFrame(data={"col1": [{"a": {"b": {"x": "A", "y": "B"}}}, {"c": {"b": {"x": "A"}}}]}) rel = duckdb_cursor.sql("select * from df") expected_rel = duckdb_cursor.sql( @@ -287,9 +277,8 @@ def test_nested_map(self, pandas, duckdb_cursor): expected_res = str(expected_rel) assert res == expected_res - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_value_upgrade(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_map_value_upgrade(self, duckdb_cursor): + x = pd.DataFrame( [ [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, "test"]}], [{"key": ["a", "b", "c", "d"], "value": [1, 3, 3, 7]}], @@ -321,36 +310,31 @@ def test_map_value_upgrade(self, pandas, duckdb_cursor): duckdb_col = duckdb_cursor.sql("select a from tmp2 AS '0'").df() print(duckdb_col.columns) print(converted_col.columns) - pandas.testing.assert_frame_equal(converted_col, duckdb_col) + pd.testing.assert_frame_equal(converted_col, duckdb_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_duplicate(self, pandas, duckdb_cursor): - x = pandas.DataFrame([[{"key": ["a", "a", "b"], "value": [4, 0, 4]}]]) + def test_map_duplicate(self, duckdb_cursor): + x = pd.DataFrame([[{"key": ["a", "a", "b"], "value": [4, 0, 4]}]]) with pytest.raises(duckdb.InvalidInputException, match="Map keys must be unique"): duckdb_cursor.sql("select * from x").show() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_nullkey(self, pandas, duckdb_cursor): - x = pandas.DataFrame([[{"key": [None, "a", "b"], "value": [4, 0, 4]}]]) + def test_map_nullkey(self, duckdb_cursor): + x = pd.DataFrame([[{"key": [None, "a", "b"], "value": [4, 0, 4]}]]) with pytest.raises(duckdb.InvalidInputException, match="Map keys can not be NULL"): converted_col = duckdb_cursor.sql("select * from x").df() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_nullkeylist(self, pandas, duckdb_cursor): - x = pandas.DataFrame([[{"key": None, "value": None}]]) + def test_map_nullkeylist(self, duckdb_cursor): + x = pd.DataFrame([[{"key": None, "value": None}]]) converted_col = duckdb_cursor.sql("select * from x").df() duckdb_col = duckdb_cursor.sql("SELECT MAP(NULL, NULL) as '0'").df() - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_fallback_nullkey(self, pandas, duckdb_cursor): - x = pandas.DataFrame([[{"a": 4, None: 0, "c": 4}], [{"a": 4, None: 0, "d": 4}]]) + def 
test_map_fallback_nullkey(self, duckdb_cursor): + x = pd.DataFrame([[{"a": 4, None: 0, "c": 4}], [{"a": 4, None: 0, "d": 4}]]) with pytest.raises(duckdb.InvalidInputException, match="Map keys can not be NULL"): converted_col = duckdb_cursor.sql("select * from x").df() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_map_fallback_nullkey_coverage(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_map_fallback_nullkey_coverage(self, duckdb_cursor): + x = pd.DataFrame( [ [{"key": None, "value": None}], [{"key": None, None: 5}], @@ -359,8 +343,7 @@ def test_map_fallback_nullkey_coverage(self, pandas, duckdb_cursor): with pytest.raises(duckdb.InvalidInputException, match="Map keys can not be NULL"): converted_col = duckdb_cursor.sql("select * from x").df() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_structs_in_nested_types(self, pandas, duckdb_cursor): + def test_structs_in_nested_types(self, duckdb_cursor): # This test is testing a bug that occurred when type upgrades occurred inside nested types # STRUCT(key1 varchar) + STRUCT(key1 varchar, key2 varchar) turns into MAP # But when inside a nested structure, this upgrade did not happen properly @@ -373,20 +356,19 @@ def test_structs_in_nested_types(self, pandas, duckdb_cursor): } for pair in pairs.values(): - check_struct_upgrade("MAP(VARCHAR, INTEGER)[]", construct_list, pair, pandas, duckdb_cursor) + check_struct_upgrade("MAP(VARCHAR, INTEGER)[]", construct_list, pair, duckdb_cursor) for key, pair in pairs.items(): expected_type = "MAP(VARCHAR, MAP(VARCHAR, INTEGER))" if key == "v4" else "STRUCT(v1 MAP(VARCHAR, INTEGER))" - check_struct_upgrade(expected_type, construct_struct, pair, pandas, duckdb_cursor) + check_struct_upgrade(expected_type, construct_struct, pair, duckdb_cursor) for pair in pairs.values(): - check_struct_upgrade("MAP(VARCHAR, MAP(VARCHAR, INTEGER))", construct_map, pair, pandas, duckdb_cursor) + check_struct_upgrade("MAP(VARCHAR, MAP(VARCHAR, INTEGER))", construct_map, pair, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_structs_of_different_sizes(self, pandas, duckdb_cursor): + def test_structs_of_different_sizes(self, duckdb_cursor): # This list has both a STRUCT(v1) and a STRUCT(v1, v2) member # Those can't be combined - df = pandas.DataFrame( + df = pd.DataFrame( data={ "col": [ [ @@ -416,9 +398,8 @@ def test_structs_of_different_sizes(self, pandas, duckdb_cursor): ): res = duckdb_cursor.execute("select $1", [malformed_struct]) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_struct_key_conversion(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_struct_key_conversion(self, duckdb_cursor): + x = pd.DataFrame( [ [{IntString(5): 1, IntString(-25): 3, IntString(32): 3, IntString(32456): 7}], ] @@ -426,43 +407,38 @@ def test_struct_key_conversion(self, pandas, duckdb_cursor): duckdb_col = duckdb_cursor.sql("select {'5':1, '-25':3, '32':3, '32456':7} as '0'").df() converted_col = duckdb_cursor.sql("select * from x").df() duckdb_cursor.sql("drop view if exists tbl") - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_correct(self, pandas, duckdb_cursor): - x = pandas.DataFrame([{"0": [[5], [34], [-245]]}]) + def test_list_correct(self, duckdb_cursor): + x = pd.DataFrame([{"0": [[5], [34], [-245]]}]) 
duckdb_col = duckdb_cursor.sql("select [[5], [34], [-245]] as '0'").df() converted_col = duckdb_cursor.sql("select * from x").df() duckdb_cursor.sql("drop view if exists tbl") - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_contains_null(self, pandas, duckdb_cursor): - x = pandas.DataFrame([{"0": [[5], None, [-245]]}]) + def test_list_contains_null(self, duckdb_cursor): + x = pd.DataFrame([{"0": [[5], None, [-245]]}]) duckdb_col = duckdb_cursor.sql("select [[5], NULL, [-245]] as '0'").df() converted_col = duckdb_cursor.sql("select * from x").df() duckdb_cursor.sql("drop view if exists tbl") - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_starts_with_null(self, pandas, duckdb_cursor): - x = pandas.DataFrame([{"0": [None, [5], [-245]]}]) + def test_list_starts_with_null(self, duckdb_cursor): + x = pd.DataFrame([{"0": [None, [5], [-245]]}]) duckdb_col = duckdb_cursor.sql("select [NULL, [5], [-245]] as '0'").df() converted_col = duckdb_cursor.sql("select * from x").df() duckdb_cursor.sql("drop view if exists tbl") - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_value_upgrade(self, pandas, duckdb_cursor): - x = pandas.DataFrame([{"0": [["5"], [34], [-245]]}]) + def test_list_value_upgrade(self, duckdb_cursor): + x = pd.DataFrame([{"0": [["5"], [34], [-245]]}]) duckdb_rel = duckdb_cursor.sql("select [['5'], ['34'], ['-245']] as '0'") duckdb_col = duckdb_rel.df() converted_col = duckdb_cursor.sql("select * from x").df() - pandas.testing.assert_frame_equal(duckdb_col, converted_col) + pd.testing.assert_frame_equal(duckdb_col, converted_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_column_value_upgrade(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_list_column_value_upgrade(self, duckdb_cursor): + x = pd.DataFrame( [ [[1, 25, 300]], [[500, 345, 30]], @@ -496,46 +472,35 @@ def test_list_column_value_upgrade(self, pandas, duckdb_cursor): duckdb_col = duckdb_cursor.sql("select a from tmp3 AS '0'").df() print(duckdb_col.columns) print(converted_col.columns) - pandas.testing.assert_frame_equal(converted_col, duckdb_col) + pd.testing.assert_frame_equal(converted_col, duckdb_col) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_ubigint_object_conversion(self, pandas, duckdb_cursor): + def test_ubigint_object_conversion(self, duckdb_cursor): # UBIGINT + TINYINT would result in HUGEINT, but conversion to HUGEINT is not supported yet from pandas->duckdb # So this instead becomes a DOUBLE data = [18446744073709551615, 0] - x = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + x = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) converted_col = duckdb_cursor.sql("select * from x").df() - if pandas.backend == "numpy_nullable": - float64 = np.dtype("float64") - assert isinstance(converted_col["0"].dtype, float64.__class__) - else: - uint64 = np.dtype("uint64") - assert isinstance(converted_col["0"].dtype, uint64.__class__) - - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_double_object_conversion(self, pandas, 
duckdb_cursor): + float64 = np.dtype("float64") + assert isinstance(converted_col["0"].dtype, float64.__class__) + + def test_double_object_conversion(self, duckdb_cursor): data = [18446744073709551616, 0] - x = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + x = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) converted_col = duckdb_cursor.sql("select * from x").df() double_dtype = np.dtype("float64") assert isinstance(converted_col["0"].dtype, double_dtype.__class__) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) @pytest.mark.xfail( condition=platform.system() == "Emscripten", reason="older numpy raises a warning when running with Pyodide", ) - def test_numpy_object_with_stride(self, pandas, duckdb_cursor): - df = pandas.DataFrame(columns=["idx", "evens", "zeros"]) - - df["idx"] = list(range(10)) - for col in df.columns[1:]: - df[col].values[:] = 0 + def test_numpy_object_with_stride(self, duckdb_cursor): + # Create 2D array in C-order (row-major) + data = np.zeros((10, 3), dtype=np.int64) + data[:, 0] = np.arange(10) + data[:, 1] = np.arange(0, 20, 2) - counter = 0 - for i in range(10): - df.loc[df["idx"] == i, "evens"] += counter - counter += 2 + df = pd.DataFrame(data, columns=["idx", "evens", "zeros"]) res = duckdb_cursor.sql("select * from df").fetchall() assert res == [ @@ -551,27 +516,24 @@ def test_numpy_object_with_stride(self, pandas, duckdb_cursor): (9, 18, 0), ] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numpy_stringliterals(self, pandas, duckdb_cursor): - df = pandas.DataFrame({"x": list(map(np.str_, range(3)))}) + def test_numpy_stringliterals(self, duckdb_cursor): + df = pd.DataFrame({"x": list(map(np.str_, range(3)))}) res = duckdb_cursor.execute("select * from df").fetchall() assert res == [("0",), ("1",), ("2",)] - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_integer_conversion_fail(self, pandas, duckdb_cursor): + def test_integer_conversion_fail(self, duckdb_cursor): data = [2**10000, 0] - x = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + x = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) converted_col = duckdb_cursor.sql("select * from x").df() print(converted_col["0"]) - double_dtype = np.dtype("object") - assert isinstance(converted_col["0"].dtype, double_dtype.__class__) + # default: VARCHAR + assert is_string_dtype(converted_col["0"].dtype) # Most of the time numpy.datetime64 is just a wrapper around a datetime.datetime object # But to support arbitrary precision, it can fall back to using an `int` internally - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) # Which we don't support yet - def test_numpy_datetime(self, pandas, duckdb_cursor): + def test_numpy_datetime(self, duckdb_cursor): numpy = pytest.importorskip("numpy") data = [] @@ -579,25 +541,23 @@ def test_numpy_datetime(self, pandas, duckdb_cursor): data += [numpy.datetime64("2022-02-21T06:59:23.324812")] * standard_vector_size data += [numpy.datetime64("1974-06-05T13:12:01.000000")] * standard_vector_size data += [numpy.datetime64("2049-01-13T00:24:31.999999")] * standard_vector_size - x = pandas.DataFrame({"dates": pandas.Series(data=data, dtype="object")}) + x = pd.DataFrame({"dates": pd.Series(data=data, dtype="object")}) res = duckdb_cursor.sql("select distinct * from x").df() assert len(res["dates"].__array__()) == 4 - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_numpy_datetime_int_internally(self, pandas, duckdb_cursor): + 
def test_numpy_datetime_int_internally(self, duckdb_cursor): numpy = pytest.importorskip("numpy") data = [numpy.datetime64("2022-12-10T21:38:24.0000000000001")] - x = pandas.DataFrame({"dates": pandas.Series(data=data, dtype="object")}) + x = pd.DataFrame({"dates": pd.Series(data=data, dtype="object")}) with pytest.raises( duckdb.ConversionException, match=re.escape("Conversion Error: Unimplemented type for cast (BIGINT -> TIMESTAMP)"), ): rel = duckdb.query_df(x, "x", "create table dates as select dates::TIMESTAMP WITHOUT TIME ZONE from x") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fallthrough_object_conversion(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_fallthrough_object_conversion(self, duckdb_cursor): + x = pd.DataFrame( [ [IntString(4)], [IntString(2)], @@ -605,11 +565,10 @@ def test_fallthrough_object_conversion(self, pandas, duckdb_cursor): ] ) duckdb_col = duckdb_cursor.sql("select * from x").df() - df_expected_res = pandas.DataFrame({"0": pandas.Series(["4", "2", "0"])}) - pandas.testing.assert_frame_equal(duckdb_col, df_expected_res) + df_expected_res = pd.DataFrame({"0": pd.Series(["4", "2", "0"])}) + pd.testing.assert_frame_equal(duckdb_col, df_expected_res, check_dtype=False) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal(self, pandas, duckdb_cursor): + def test_numeric_decimal(self, duckdb_cursor): # DuckDB uses DECIMAL where possible, so all the 'float' types here are actually DECIMAL reference_query = """ CREATE TABLE tbl AS SELECT * FROM ( @@ -625,14 +584,12 @@ def test_numeric_decimal(self, pandas, duckdb_cursor): duckdb_cursor.execute(reference_query) # Because of this we need to wrap these native floats as DECIMAL for this test, to avoid these decimals being # "upgraded" to DOUBLE - x = pandas.DataFrame( + x = pd.DataFrame( { - "0": ConvertStringToDecimal([5, "12.0", "-123.0", "-234234.0", None, "1.234"], pandas), - "1": ConvertStringToDecimal( - [5002340, 13, "-12.0000000005", "7453324234.0", None, "-324234234"], pandas - ), + "0": ConvertStringToDecimal([5, "12.0", "-123.0", "-234234.0", None, "1.234"]), + "1": ConvertStringToDecimal([5002340, 13, "-12.0000000005", "7453324234.0", None, "-324234234"]), "2": ConvertStringToDecimal( - ["-234234234234.0", "324234234.00000005", -128, 345345, "1E5", "1324234359"], pandas + ["-234234234234.0", "324234234.00000005", -128, 345345, "1E5", "1324234359"] ), } ) @@ -641,9 +598,8 @@ def test_numeric_decimal(self, pandas, duckdb_cursor): assert conversion == reference - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_coverage(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_numeric_decimal_coverage(self, duckdb_cursor): + x = pd.DataFrame( {"0": [Decimal("nan"), Decimal("+nan"), Decimal("-nan"), Decimal("inf"), Decimal("+inf"), Decimal("-inf")]} ) conversion = duckdb_cursor.sql("select * from x").fetchall() @@ -659,22 +615,18 @@ def test_numeric_decimal_coverage(self, pandas, duckdb_cursor): assert str(conversion) == "[(nan,), (nan,), (nan,), (inf,), (inf,), (inf,)]" # Test that the column 'offset' is actually used when converting, - - @pytest.mark.parametrize( - "pandas", [NumpyPandas(), ArrowPandas()] - ) # and that the same 2048 (STANDARD_VECTOR_SIZE) values are not being scanned over and over again - def test_multiple_chunks(self, pandas, duckdb_cursor): + # and that the same 2048 (STANDARD_VECTOR_SIZE) values are not being scanned over and over again + def 
test_multiple_chunks(self, duckdb_cursor): data = [] data += [datetime.date(2022, 9, 13) for x in range(standard_vector_size)] data += [datetime.date(2022, 9, 14) for x in range(standard_vector_size)] data += [datetime.date(2022, 9, 15) for x in range(standard_vector_size)] data += [datetime.date(2022, 9, 16) for x in range(standard_vector_size)] - x = pandas.DataFrame({"dates": pandas.Series(data=data, dtype="object")}) + x = pd.DataFrame({"dates": pd.Series(data=data, dtype="object")}) res = duckdb_cursor.sql("select distinct * from x").df() assert len(res["dates"].__array__()) == 4 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor): + def test_multiple_chunks_aggregate(self, duckdb_cursor): duckdb_cursor.execute("SET GLOBAL pandas_analyze_sample=4096") duckdb_cursor.execute( "create table dates as select '2022-09-14'::DATE + INTERVAL (i::INTEGER) DAY as i from range(4096) tbl(i);" @@ -684,7 +636,7 @@ def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor): date_df = res.copy() # Convert the dataframe to datetime - date_df["i"] = pandas.to_datetime(res["i"]).dt.date + date_df["i"] = pd.to_datetime(res["i"]).dt.date assert str(date_df["i"].dtype) == "object" expected_res = [ @@ -722,7 +674,7 @@ def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor): ] # Convert the dataframe to datetime date_df = res.copy() - date_df["i"] = pandas.to_datetime(res["i"]).dt.date + date_df["i"] = pd.to_datetime(res["i"]).dt.date assert str(date_df["i"].dtype) == "object" actual_res = duckdb_cursor.sql( @@ -737,21 +689,19 @@ def test_multiple_chunks_aggregate(self, pandas, duckdb_cursor): assert expected_res == actual_res - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_mixed_object_types(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_mixed_object_types(self, duckdb_cursor): + x = pd.DataFrame( { - "nested": pandas.Series( + "nested": pd.Series( data=[{"a": 1, "b": 2}, [5, 4, 3], {"key": [1, 2, 3], "value": ["a", "b", "c"]}], dtype="object" ), } ) res = duckdb_cursor.sql("select * from x").df() - assert res["nested"].dtype == np.dtype("object") + assert is_string_dtype(res["nested"].dtype) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_struct_deeply_nested_in_struct(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_struct_deeply_nested_in_struct(self, duckdb_cursor): + x = pd.DataFrame( [ { # STRUCT(b STRUCT(x VARCHAR, y VARCHAR)) @@ -768,9 +718,8 @@ def test_struct_deeply_nested_in_struct(self, pandas, duckdb_cursor): res = duckdb_cursor.sql("select * from x").fetchall() assert res == [({"b": {"x": "A", "y": "B"}},), ({"b": {"x": "A"}},)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_struct_deeply_nested_in_list(self, pandas, duckdb_cursor): - x = pandas.DataFrame( + def test_struct_deeply_nested_in_list(self, duckdb_cursor): + x = pd.DataFrame( { "a": [ [ @@ -787,16 +736,14 @@ def test_struct_deeply_nested_in_list(self, pandas, duckdb_cursor): res = duckdb_cursor.sql("select * from x").fetchall() assert res == [([{"x": "A", "y": "B"}, {"x": "A"}],)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_analyze_sample_too_small(self, pandas, duckdb_cursor): + def test_analyze_sample_too_small(self, duckdb_cursor): data = [1 for _ in range(9)] + [[1, 2, 3]] + [1 for _ in range(9991)] - x = pandas.DataFrame({"a": pandas.Series(data=data)}) + x = 
pd.DataFrame({"a": pd.Series(data=data)}) with pytest.raises(duckdb.InvalidInputException, match="Failed to cast value: Unimplemented type for cast"): res = duckdb_cursor.sql("select * from x").df() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_zero_fractional(self, pandas, duckdb_cursor): - decimals = pandas.DataFrame( + def test_numeric_decimal_zero_fractional(self, duckdb_cursor): + decimals = pd.DataFrame( data={ "0": [ Decimal("0.00"), @@ -827,8 +774,7 @@ def test_numeric_decimal_zero_fractional(self, pandas, duckdb_cursor): assert conversion == reference - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_incompatible(self, pandas, duckdb_cursor): + def test_numeric_decimal_incompatible(self, duckdb_cursor): reference_query = """ CREATE TABLE tbl AS SELECT * FROM ( VALUES @@ -841,13 +787,11 @@ def test_numeric_decimal_incompatible(self, pandas, duckdb_cursor): ) tbl(a, b, c); """ duckdb_cursor.execute(reference_query) - x = pandas.DataFrame( + x = pd.DataFrame( { - "0": ConvertStringToDecimal(["5", "12.0", "-123.0", "-234234.0", None, "1.234"], pandas), - "1": ConvertStringToDecimal([5002340, 13, "-12.0000000005", 7453324234, None, "-324234234"], pandas), - "2": ConvertStringToDecimal( - [-234234234234, "324234234.00000005", -128, 345345, 0, "1324234359"], pandas - ), + "0": ConvertStringToDecimal(["5", "12.0", "-123.0", "-234234.0", None, "1.234"]), + "1": ConvertStringToDecimal([5002340, 13, "-12.0000000005", 7453324234, None, "-324234234"]), + "2": ConvertStringToDecimal([-234234234234, "324234234.00000005", -128, 345345, 0, "1324234359"]), } ) reference = duckdb_cursor.sql("select * from tbl").fetchall() @@ -857,11 +801,9 @@ def test_numeric_decimal_incompatible(self, pandas, duckdb_cursor): print(reference) print(conversion) - @pytest.mark.parametrize( - "pandas", [NumpyPandas(), ArrowPandas()] - ) # result: [('1E-28',), ('10000000000000000000000000.0',)] - def test_numeric_decimal_combined(self, pandas, duckdb_cursor): - decimals = pandas.DataFrame( + # result: [('1E-28',), ('10000000000000000000000000.0',)] + def test_numeric_decimal_combined(self, duckdb_cursor): + decimals = pd.DataFrame( data={"0": [Decimal("0.0000000000000000000000000001"), Decimal("10000000000000000000000000.0")]} ) reference_query = """ @@ -879,9 +821,8 @@ def test_numeric_decimal_combined(self, pandas, duckdb_cursor): print(conversion) # result: [('1234.0',), ('123456789.0',), ('1234567890123456789.0',), ('0.1234567890123456789',)] - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_varying_sizes(self, pandas, duckdb_cursor): - decimals = pandas.DataFrame( + def test_numeric_decimal_varying_sizes(self, duckdb_cursor): + decimals = pd.DataFrame( data={ "0": [ Decimal("1234.0"), @@ -907,14 +848,13 @@ def test_numeric_decimal_varying_sizes(self, pandas, duckdb_cursor): print(reference) print(conversion) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_fallback_to_double(self, pandas, duckdb_cursor): + def test_numeric_decimal_fallback_to_double(self, duckdb_cursor): # The widths of these decimal values are bigger than the max supported width for DECIMAL data = [ Decimal("1.234567890123456789012345678901234567890123456789"), Decimal("123456789012345678901234567890123456789012345678.0"), ] - decimals = pandas.DataFrame(data={"0": data}) + decimals = pd.DataFrame(data={"0": data}) reference_query = """ CREATE TABLE tbl AS 
SELECT * FROM ( VALUES @@ -928,8 +868,7 @@ def test_numeric_decimal_fallback_to_double(self, pandas, duckdb_cursor): assert conversion == reference assert isinstance(conversion[0][0], float) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_double_mixed(self, pandas, duckdb_cursor): + def test_numeric_decimal_double_mixed(self, duckdb_cursor): data = [ Decimal("1.234"), Decimal("1.234567891234567890123456789012345678901234567890123456789"), @@ -940,7 +879,7 @@ def test_numeric_decimal_double_mixed(self, pandas, duckdb_cursor): Decimal("1232354.000000000000000000000000000035"), Decimal("123.5e300"), ] - decimals = pandas.DataFrame(data={"0": data}) + decimals = pd.DataFrame(data={"0": data}) reference_query = """ CREATE TABLE tbl AS SELECT * FROM ( VALUES @@ -960,10 +899,9 @@ def test_numeric_decimal_double_mixed(self, pandas, duckdb_cursor): assert conversion == reference assert isinstance(conversion[0][0], float) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_numeric_decimal_out_of_range(self, pandas, duckdb_cursor): + def test_numeric_decimal_out_of_range(self, duckdb_cursor): data = [Decimal("1.234567890123456789012345678901234567"), Decimal("123456789012345678901234567890123456.0")] - decimals = pandas.DataFrame(data={"0": data}) + decimals = pd.DataFrame(data={"0": data}) reference_query = """ CREATE TABLE tbl AS SELECT * FROM ( VALUES diff --git a/tests/fast/pandas/test_df_recursive_nested.py b/tests/fast/pandas/test_df_recursive_nested.py index 871132ae..c3971cf6 100644 --- a/tests/fast/pandas/test_df_recursive_nested.py +++ b/tests/fast/pandas/test_df_recursive_nested.py @@ -1,5 +1,4 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb from duckdb import Value @@ -21,39 +20,35 @@ def create_reference_query(): class TestDFRecursiveNested: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_of_structs(self, duckdb_cursor, pandas): + def test_list_of_structs(self, duckdb_cursor): data = [[{"a": 5}, NULL, {"a": NULL}], NULL, [{"a": 5}, NULL, {"a": NULL}]] reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal(duckdb_cursor, df, reference_query, Value(data, "STRUCT(a INTEGER)[]")) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_list_of_map(self, duckdb_cursor, pandas): + def test_list_of_map(self, duckdb_cursor): # LIST(MAP(VARCHAR, VARCHAR)) data = [[{5: NULL}, NULL, {}], NULL, [NULL, {3: NULL, 2: "a", 4: NULL}, {"a": 1, "b": 2, "c": 3}]] reference_query = create_reference_query() print(reference_query) - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal(duckdb_cursor, df, reference_query, Value(data, "MAP(VARCHAR, VARCHAR)[][]")) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_recursive_list(self, duckdb_cursor, pandas): + def test_recursive_list(self, duckdb_cursor): # LIST(LIST(LIST(LIST(INTEGER)))) data = [[[[3, NULL, 5], NULL], NULL, [[5, -20, NULL]]], NULL, [[[NULL]], [[]], NULL]] reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal(duckdb_cursor, df, reference_query, Value(data, "INTEGER[][][][]")) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_recursive_struct(self, duckdb_cursor, pandas): + def test_recursive_struct(self, duckdb_cursor): # 
STRUCT(STRUCT(STRUCT(LIST))) data = { "A": {"a": {"1": [1, 2, 3]}, "b": NULL, "c": {"1": NULL}}, "B": {"a": {"1": [1, NULL, 3]}, "b": NULL, "c": {"1": NULL}}, } reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal( duckdb_cursor, df, @@ -89,8 +84,7 @@ def test_recursive_struct(self, duckdb_cursor, pandas): ), ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_recursive_map(self, duckdb_cursor, pandas): + def test_recursive_map(self, duckdb_cursor): # MAP( # MAP( # INTEGER, @@ -106,13 +100,12 @@ def test_recursive_map(self, duckdb_cursor, pandas): "value": [1, 2], } reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) check_equal( duckdb_cursor, df, reference_query, Value(data, "MAP(MAP(INTEGER, MAP(INTEGER, VARCHAR)), INTEGER)") ) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_recursive_stresstest(self, duckdb_cursor, pandas): + def test_recursive_stresstest(self, duckdb_cursor): data = [ { "a": { @@ -134,7 +127,7 @@ def test_recursive_stresstest(self, duckdb_cursor, pandas): } ] reference_query = create_reference_query() - df = pandas.DataFrame([{"a": data}]) + df = pd.DataFrame([{"a": data}]) duckdb_type = """ STRUCT( a MAP( diff --git a/tests/fast/pandas/test_implicit_pandas_scan.py b/tests/fast/pandas/test_implicit_pandas_scan.py index 76f2c200..af3a8758 100644 --- a/tests/fast/pandas/test_implicit_pandas_scan.py +++ b/tests/fast/pandas/test_implicit_pandas_scan.py @@ -1,43 +1,27 @@ # simple DB API testcase import pandas as pd -import pytest -from conftest import ArrowPandas, NumpyPandas -from packaging.version import Version import duckdb -numpy_nullable_df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val4", "CoL2": 17}]) - -try: - from pandas.compat import pa_version_under7p0 - - pyarrow_dtypes_enabled = not pa_version_under7p0 -except Exception: - pyarrow_dtypes_enabled = False - -if Version(pd.__version__) >= Version("2.0.0") and pyarrow_dtypes_enabled: - pyarrow_df = numpy_nullable_df.convert_dtypes(dtype_backend="pyarrow") -else: - # dtype_backend is not supported in pandas < 2.0.0 - pyarrow_df = numpy_nullable_df - class TestImplicitPandasScan: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_local_pandas_scan(self, duckdb_cursor, pandas): + def test_local_pandas_scan(self, duckdb_cursor): con = duckdb.connect() - df = pandas.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) # noqa: F841 + df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) # noqa: F841 r1 = con.execute("select * from df").fetchdf() assert r1["COL1"][0] == "val1" assert r1["COL1"][1] == "val3" assert r1["CoL2"][0] == 1.05 assert r1["CoL2"][1] == 17 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_global_pandas_scan(self, duckdb_cursor, pandas): + def test_global_pandas_scan(self, duckdb_cursor): + """Test that DuckDB can scan a module-level DataFrame variable.""" con = duckdb.connect() - r1 = con.execute(f"select * from {pandas.backend}_df").fetchdf() + # Create a global-scope dataframe for this test + global test_global_df + test_global_df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val4", "CoL2": 17}]) + r1 = con.execute("select * from test_global_df").fetchdf() assert r1["COL1"][0] == "val1" assert r1["COL1"][1] == "val4" assert r1["CoL2"][0] == 1.05 diff --git 
a/tests/fast/pandas/test_import_cache.py b/tests/fast/pandas/test_import_cache.py index eb1c8fb8..f744c671 100644 --- a/tests/fast/pandas/test_import_cache.py +++ b/tests/fast/pandas/test_import_cache.py @@ -1,29 +1,28 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb -@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) -def test_import_cache_explicit_dtype(pandas): - df = pandas.DataFrame( # noqa: F841 +@pytest.mark.parametrize("string_dtype", ["python", "pyarrow"]) +def test_import_cache_explicit_dtype(string_dtype): + df = pd.DataFrame( # noqa: F841 { "id": [1, 2, 3], - "value": pandas.Series(["123.123", pandas.NaT, pandas.NA], dtype=pandas.StringDtype(storage="python")), + "value": pd.Series(["123.123", pd.NaT, pd.NA], dtype=pd.StringDtype(storage=string_dtype)), } ) con = duckdb.connect() result_df = con.query("select id, value from df").df() - assert result_df["value"][1] is None - assert result_df["value"][2] is None + assert pd.isna(result_df["value"][1]) + assert pd.isna(result_df["value"][2]) -@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) -def test_import_cache_implicit_dtype(pandas): - df = pandas.DataFrame({"id": [1, 2, 3], "value": pandas.Series(["123.123", pandas.NaT, pandas.NA])}) # noqa: F841 +def test_import_cache_implicit_dtype(): + df = pd.DataFrame({"id": [1, 2, 3], "value": pd.Series(["123.123", pd.NaT, pd.NA])}) # noqa: F841 con = duckdb.connect() result_df = con.query("select id, value from df").df() - assert result_df["value"][1] is None - assert result_df["value"][2] is None + assert pd.isna(result_df["value"][1]) + assert pd.isna(result_df["value"][2]) diff --git a/tests/fast/pandas/test_issue_1767.py b/tests/fast/pandas/test_issue_1767.py index 48d3e852..1677001e 100644 --- a/tests/fast/pandas/test_issue_1767.py +++ b/tests/fast/pandas/test_issue_1767.py @@ -1,22 +1,20 @@ #!/usr/bin/env python -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb # Join from pandas not matching identical strings #1767 class TestIssue1767: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_unicode_join_pandas(self, duckdb_cursor, pandas): - A = pandas.DataFrame({"key": ["a", "п"]}) - B = pandas.DataFrame({"key": ["a", "п"]}) + def test_unicode_join_pandas(self, duckdb_cursor): + A = pd.DataFrame({"key": ["a", "п"]}) + B = pd.DataFrame({"key": ["a", "п"]}) con = duckdb.connect(":memory:") arrow = con.register("A", A).register("B", B) q = arrow.query("""SELECT key FROM "A" FULL JOIN "B" USING ("key") ORDER BY key""") result = q.df() d = {"key": ["a", "п"]} - df = pandas.DataFrame(data=d) - pandas.testing.assert_frame_equal(result, df) + df = pd.DataFrame(data=d) + pd.testing.assert_frame_equal(result, df, check_dtype=False) diff --git a/tests/fast/pandas/test_limit.py b/tests/fast/pandas/test_limit.py index 51c4a382..2fb6c769 100644 --- a/tests/fast/pandas/test_limit.py +++ b/tests/fast/pandas/test_limit.py @@ -1,13 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestLimitPandas: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_limit_df(self, duckdb_cursor, pandas): - df_in = pandas.DataFrame( + def test_limit_df(self, duckdb_cursor): + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -15,9 +13,8 @@ def test_limit_df(self, duckdb_cursor, pandas): limit_df = duckdb.limit(df_in, 2) assert len(limit_df.execute().fetchall()) == 2 - 
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_aggregate_df(self, duckdb_cursor, pandas): - df_in = pandas.DataFrame( + def test_aggregate_df(self, duckdb_cursor): + df_in = pd.DataFrame( { "numbers": [1, 2, 2, 2], } diff --git a/tests/fast/pandas/test_pandas_na.py b/tests/fast/pandas/test_pandas_na.py index 6462c298..166fc21e 100644 --- a/tests/fast/pandas/test_pandas_na.py +++ b/tests/fast/pandas/test_pandas_na.py @@ -1,8 +1,9 @@ import platform import numpy as np +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas +from conftest import is_string_dtype import duckdb @@ -10,27 +11,25 @@ def assert_nullness(items, null_indices): for i in range(len(items)): if i in null_indices: - assert items[i] is None + assert pd.isna(items[i]) else: - assert items[i] is not None + assert not pd.isna(items[i]) @pytest.mark.skipif(platform.system() == "Emscripten", reason="Pandas interaction is broken in Pyodide 3.11") class TestPandasNA: @pytest.mark.parametrize("rows", [100, duckdb.__standard_vector_size__, 5000, 1000000]) - @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()]) - def test_pandas_string_null(self, duckdb_cursor, rows, pd): - df: pd.DataFrame = pd.DataFrame(index=np.arange(rows)) + def test_pandas_string_null(self, duckdb_cursor, rows): + df = pd.DataFrame(index=np.arange(rows)) df["string_column"] = pd.Series(dtype="string") e_df_rel = duckdb_cursor.from_df(df) assert e_df_rel.types == ["VARCHAR"] roundtrip = e_df_rel.df() - assert roundtrip["string_column"].dtype == "object" + assert is_string_dtype(roundtrip["string_column"].dtype) expected = pd.DataFrame({"string_column": [None for _ in range(rows)]}) - pd.testing.assert_frame_equal(expected, roundtrip) + pd.testing.assert_frame_equal(expected, roundtrip, check_dtype=False) def test_pandas_na(self, duckdb_cursor): - pd = pytest.importorskip("pandas", minversion="1.0.0", reason="Support for pandas.NA has not been added yet") # DataFrame containing a single pd.NA df = pd.DataFrame(pd.Series([pd.NA])) @@ -74,7 +73,9 @@ def test_pandas_na(self, duckdb_cursor): } ) assert str(nan_df["a"].dtype) == "float64" - assert str(na_df["a"].dtype) == "object" # pd.NA values turn the column into 'object' + # pd.NA values turn the column into 'object' in Pandas 2.x + # In Pandas 3.0+, it may be different but we just check it's not float64 + assert str(na_df["a"].dtype) != "float64" nan_result = duckdb_cursor.execute("select * from nan_df").df() na_result = duckdb_cursor.execute("select * from na_df").df() diff --git a/tests/fast/pandas/test_pandas_unregister.py b/tests/fast/pandas/test_pandas_unregister.py index ab83eb42..c89ae320 100644 --- a/tests/fast/pandas/test_pandas_unregister.py +++ b/tests/fast/pandas/test_pandas_unregister.py @@ -1,16 +1,15 @@ import gc import tempfile +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb class TestPandasUnregister: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_unregister1(self, duckdb_cursor, pandas): - df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]]) + def test_pandas_unregister1(self, duckdb_cursor): + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) connection = duckdb.connect(":memory:") connection.register("dataframe", df) @@ -22,13 +21,12 @@ def test_pandas_unregister1(self, duckdb_cursor, pandas): connection.execute("DROP VIEW dataframe;") connection.execute("DROP VIEW IF EXISTS dataframe;") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) 
- def test_pandas_unregister2(self, duckdb_cursor, pandas): + def test_pandas_unregister2(self, duckdb_cursor): with tempfile.NamedTemporaryFile() as tmp: db = tmp.name connection = duckdb.connect(db) - df = pandas.DataFrame([[1, 2, 3], [4, 5, 6]]) + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) connection.register("dataframe", df) connection.unregister("dataframe") # Attempting to unregister. diff --git a/tests/fast/pandas/test_parallel_pandas_scan.py b/tests/fast/pandas/test_parallel_pandas_scan.py index 9ac7b738..7e04a933 100644 --- a/tests/fast/pandas/test_parallel_pandas_scan.py +++ b/tests/fast/pandas/test_parallel_pandas_scan.py @@ -2,13 +2,12 @@ import datetime import numpy -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb -def run_parallel_queries(main_table, left_join_table, expected_df, pandas, iteration_count=5): +def run_parallel_queries(main_table, left_join_table, expected_df, iteration_count=5): for _i in range(iteration_count): output_df = None sql = """ @@ -28,7 +27,7 @@ def run_parallel_queries(main_table, left_join_table, expected_df, pandas, itera duckdb_conn.register("main_table", main_table) duckdb_conn.register("left_join_table", left_join_table) output_df = duckdb_conn.execute(sql).fetchdf() - pandas.testing.assert_frame_equal(expected_df, output_df) + pd.testing.assert_frame_equal(expected_df, output_df, check_dtype=False) print(output_df) except Exception as err: print(err) @@ -37,67 +36,59 @@ def run_parallel_queries(main_table, left_join_table, expected_df, pandas, itera class TestParallelPandasScan: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_numeric_scan(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame([{"join_column": 3}]) - left_join_table = pandas.DataFrame([{"join_column": 3, "other_column": 4}]) - run_parallel_queries(main_table, left_join_table, left_join_table, pandas) + def test_parallel_numeric_scan(self, duckdb_cursor): + main_table = pd.DataFrame([{"join_column": 3}]) + left_join_table = pd.DataFrame([{"join_column": 3, "other_column": 4}]) + run_parallel_queries(main_table, left_join_table, left_join_table) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_ascii_text(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame([{"join_column": "text"}]) - left_join_table = pandas.DataFrame([{"join_column": "text", "other_column": "more text"}]) - run_parallel_queries(main_table, left_join_table, left_join_table, pandas) + def test_parallel_ascii_text(self, duckdb_cursor): + main_table = pd.DataFrame([{"join_column": "text"}]) + left_join_table = pd.DataFrame([{"join_column": "text", "other_column": "more text"}]) + run_parallel_queries(main_table, left_join_table, left_join_table) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_unicode_text(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame([{"join_column": "mühleisen"}]) - left_join_table = pandas.DataFrame([{"join_column": "mühleisen", "other_column": "höhöhö"}]) - run_parallel_queries(main_table, left_join_table, left_join_table, pandas) + def test_parallel_unicode_text(self, duckdb_cursor): + main_table = pd.DataFrame([{"join_column": "mühleisen"}]) + left_join_table = pd.DataFrame([{"join_column": "mühleisen", "other_column": "höhöhö"}]) + run_parallel_queries(main_table, left_join_table, left_join_table) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def 
test_parallel_complex_unicode_text(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame([{"join_column": "鴨"}]) - left_join_table = pandas.DataFrame([{"join_column": "鴨", "other_column": "數據庫"}]) - run_parallel_queries(main_table, left_join_table, left_join_table, pandas) + def test_parallel_complex_unicode_text(self, duckdb_cursor): + main_table = pd.DataFrame([{"join_column": "鴨"}]) + left_join_table = pd.DataFrame([{"join_column": "鴨", "other_column": "數據庫"}]) + run_parallel_queries(main_table, left_join_table, left_join_table) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_emojis(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️"}]) - left_join_table = pandas.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️", "other_column": "🦆🍞🦆"}]) - run_parallel_queries(main_table, left_join_table, left_join_table, pandas) + def test_parallel_emojis(self, duckdb_cursor): + main_table = pd.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️"}]) + left_join_table = pd.DataFrame([{"join_column": "🤦🏼‍♂️ L🤦🏼‍♂️R 🤦🏼‍♂️", "other_column": "🦆🍞🦆"}]) + run_parallel_queries(main_table, left_join_table, left_join_table) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_numeric_object(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame({"join_column": pandas.Series([3], dtype="Int8")}) - left_join_table = pandas.DataFrame( - {"join_column": pandas.Series([3], dtype="Int8"), "other_column": pandas.Series([4], dtype="Int8")} + def test_parallel_numeric_object(self, duckdb_cursor): + main_table = pd.DataFrame({"join_column": pd.Series([3], dtype="Int8")}) + left_join_table = pd.DataFrame( + {"join_column": pd.Series([3], dtype="Int8"), "other_column": pd.Series([4], dtype="Int8")} ) - expected_df = pandas.DataFrame( + expected_df = pd.DataFrame( {"join_column": numpy.array([3], dtype=numpy.int8), "other_column": numpy.array([4], dtype=numpy.int8)} ) - run_parallel_queries(main_table, left_join_table, expected_df, pandas) + run_parallel_queries(main_table, left_join_table, expected_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_timestamp(self, duckdb_cursor, pandas): - main_table = pandas.DataFrame({"join_column": [pandas.Timestamp("20180310T11:17:54Z")]}) - left_join_table = pandas.DataFrame( + def test_parallel_timestamp(self, duckdb_cursor): + main_table = pd.DataFrame({"join_column": [pd.Timestamp("20180310T11:17:54Z")]}) + left_join_table = pd.DataFrame( { - "join_column": [pandas.Timestamp("20180310T11:17:54Z")], - "other_column": [pandas.Timestamp("20190310T11:17:54Z")], + "join_column": [pd.Timestamp("20180310T11:17:54Z")], + "other_column": [pd.Timestamp("20190310T11:17:54Z")], } ) - expected_df = pandas.DataFrame( + expected_df = pd.DataFrame( { "join_column": numpy.array([datetime.datetime(2018, 3, 10, 11, 17, 54)], dtype="datetime64[ns]"), "other_column": numpy.array([datetime.datetime(2019, 3, 10, 11, 17, 54)], dtype="datetime64[ns]"), } ) - run_parallel_queries(main_table, left_join_table, expected_df, pandas) + run_parallel_queries(main_table, left_join_table, expected_df) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_parallel_empty(self, duckdb_cursor, pandas): - df_empty = pandas.DataFrame({"A": []}) + def test_parallel_empty(self, duckdb_cursor): + df_empty = pd.DataFrame({"A": []}) duckdb_conn = duckdb.connect() duckdb_conn.execute("PRAGMA threads=4") 
duckdb_conn.execute("PRAGMA verify_parallelism") diff --git a/tests/fast/spark/test_spark_to_csv.py b/tests/fast/spark/test_spark_to_csv.py index 10e0028c..5003a20b 100644 --- a/tests/fast/spark/test_spark_to_csv.py +++ b/tests/fast/spark/test_spark_to_csv.py @@ -2,8 +2,9 @@ import datetime import os +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas, getTimeSeriesData +from conftest import getTimeSeriesData from spark_namespace import USE_ACTUAL_SPARK from duckdb import InvalidInputException, read_csv @@ -33,17 +34,15 @@ def df(spark): return dataframe -@pytest.fixture(params=[NumpyPandas(), ArrowPandas()]) -def pandas_df_ints(request, spark): - pandas = request.param - dataframe = pandas.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) +@pytest.fixture +def pandas_df_ints(spark): + dataframe = pd.DataFrame({"a": [5, 3, 23, 2], "b": [45, 234, 234, 2]}) return dataframe -@pytest.fixture(params=[NumpyPandas(), ArrowPandas()]) -def pandas_df_strings(request, spark): - pandas = request.param - dataframe = pandas.DataFrame({"a": ["string1", "string2", "string3"]}) +@pytest.fixture +def pandas_df_strings(spark): + dataframe = pd.DataFrame({"a": ["string1", "string2", "string3"]}) return dataframe @@ -69,10 +68,9 @@ def test_to_csv_sep(self, pandas_df_ints, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, sep=",") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_na_rep(self, pandas, spark, tmp_path): + def test_to_csv_na_rep(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + pandas_df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) df = spark.createDataFrame(pandas_df) @@ -81,10 +79,9 @@ def test_to_csv_na_rep(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, nullValue="test") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_header(self, pandas, spark, tmp_path): + def test_to_csv_header(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) + pandas_df = pd.DataFrame({"a": [5, None, 23, 2], "b": [45, 234, 234, 2]}) df = spark.createDataFrame(pandas_df) @@ -93,11 +90,10 @@ def test_to_csv_header(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name) assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_quotechar(self, pandas, spark, tmp_path): + def test_to_csv_quotechar(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) + pandas_df = pd.DataFrame({"a": ["'a,b,c'", None, "hello", "bye"], "b": [45, 234, 234, 2]}) df = spark.createDataFrame(pandas_df) @@ -106,10 +102,9 @@ def test_to_csv_quotechar(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, sep=",", quote="'") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_escapechar(self, pandas, spark, tmp_path): + def test_to_csv_escapechar(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - 
pandas_df = pandas.DataFrame( + pandas_df = pd.DataFrame( { "c_bool": [True, False], "c_float": [1.0, 3.2], @@ -124,12 +119,11 @@ def test_to_csv_escapechar(self, pandas, spark, tmp_path): csv_rel = spark.read.csv(temp_file_name, quote='"', escape="!") assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_date_format(self, pandas, spark, tmp_path): + def test_to_csv_date_format(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 - pandas_df = pandas.DataFrame(getTimeSeriesData()) + pandas_df = pd.DataFrame(getTimeSeriesData()) dt_index = pandas_df.index - pandas_df = pandas.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) + pandas_df = pd.DataFrame({"A": dt_index, "B": dt_index.shift(1)}, index=dt_index) df = spark.createDataFrame(pandas_df) @@ -139,11 +133,10 @@ def test_to_csv_date_format(self, pandas, spark, tmp_path): assert df.collect() == csv_rel.collect() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_to_csv_timestamp_format(self, pandas, spark, tmp_path): + def test_to_csv_timestamp_format(self, spark, tmp_path): temp_file_name = os.path.join(tmp_path, "temp_file.csv") # noqa: PTH118 data = [datetime.time(hour=23, minute=1, second=34, microsecond=234345)] - pandas_df = pandas.DataFrame({"0": pandas.Series(data=data, dtype="object")}) + pandas_df = pd.DataFrame({"0": pd.Series(data=data, dtype="object")}) df = spark.createDataFrame(pandas_df) diff --git a/tests/fast/test_case_alias.py b/tests/fast/test_case_alias.py index d1afb4d8..f99b994e 100644 --- a/tests/fast/test_case_alias.py +++ b/tests/fast/test_case_alias.py @@ -1,15 +1,13 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestCaseAlias: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_case_alias(self, duckdb_cursor, pandas): + def test_case_alias(self, duckdb_cursor): con = duckdb.connect(":memory:") - df = pandas.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) + df = pd.DataFrame([{"COL1": "val1", "CoL2": 1.05}, {"COL1": "val3", "CoL2": 17}]) r1 = con.from_df(df).query("df", "select * from df").df() assert r1["COL1"][0] == "val1" diff --git a/tests/fast/test_insert.py b/tests/fast/test_insert.py index c5de1589..6eeabd67 100644 --- a/tests/fast/test_insert.py +++ b/tests/fast/test_insert.py @@ -1,13 +1,11 @@ -import pytest -from conftest import ArrowPandas, NumpyPandas +import pandas as pd import duckdb class TestInsert: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_insert(self, pandas): - test_df = pandas.DataFrame({"i": [1, 2, 3], "j": ["one", "two", "three"]}) + def test_insert(self): + test_df = pd.DataFrame({"i": [1, 2, 3], "j": ["one", "two", "three"]}) # connect to an in-memory temporary database conn = duckdb.connect() # get a cursor @@ -18,7 +16,7 @@ def test_insert(self, pandas): rel.insert([2, "two"]) rel.insert([3, "three"]) rel_a3 = cursor.table("test").project("CAST(i as BIGINT)i, j").to_df() - pandas.testing.assert_frame_equal(rel_a3, test_df) + pd.testing.assert_frame_equal(rel_a3, test_df) def test_insert_with_schema(self, duckdb_cursor): duckdb_cursor.sql("create schema not_main") diff --git a/tests/fast/test_map.py b/tests/fast/test_map.py index 336b2775..622095c2 100644 --- a/tests/fast/test_map.py +++ b/tests/fast/test_map.py @@ -2,8 +2,8 @@ from datetime import date, timedelta from typing 
import NoReturn +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -17,15 +17,13 @@ def evil1(df): class TestMap: - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_evil_map(self, duckdb_cursor, pandas): + def test_evil_map(self, duckdb_cursor): testrel = duckdb.values([1, 2]) rel = testrel.map(evil1, schema={"i": str}) with pytest.raises(duckdb.InvalidInputException, match="Expected 1 columns from UDF, got 2"): rel.df() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_map(self, duckdb_cursor, pandas): + def test_map(self, duckdb_cursor): testrel = duckdb.values([1, 2]) conn = duckdb_cursor conn.execute("CREATE TABLE t (a integer)") @@ -57,16 +55,16 @@ def evil5(df) -> NoReturn: raise TypeError def return_dataframe(df): - return pandas.DataFrame({"A": [1]}) + return pd.DataFrame({"A": [1]}) def return_big_dataframe(df): - return pandas.DataFrame({"A": [1] * 5000}) + return pd.DataFrame({"A": [1] * 5000}) def return_none(df) -> None: return None def return_empty_df(df): - return pandas.DataFrame() + return pd.DataFrame() with pytest.raises(duckdb.InvalidInputException, match="Expected 1 columns from UDF, got 2"): print(testrel.map(evil1).df()) @@ -93,14 +91,14 @@ def return_empty_df(df): with pytest.raises(TypeError): print(testrel.map().df()) - testrel.map(return_dataframe).df().equals(pandas.DataFrame({"A": [1]})) + testrel.map(return_dataframe).df().equals(pd.DataFrame({"A": [1]})) with pytest.raises( duckdb.InvalidInputException, match="UDF returned more than 2048 rows, which is not allowed" ): testrel.map(return_big_dataframe).df() - empty_rel.map(return_dataframe).df().equals(pandas.DataFrame({"A": []})) + empty_rel.map(return_dataframe).df().equals(pd.DataFrame({"A": []})) with pytest.raises(duckdb.InvalidInputException, match="No return value from Python function"): testrel.map(return_none).df() @@ -118,18 +116,17 @@ def return_with_no_modification(df): # in this case we assume the returned type should be the same as the input type duckdb_cursor.values([b"1234"]).map(return_with_no_modification).fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_isse_3237(self, duckdb_cursor, pandas): + def test_isse_3237(self, duckdb_cursor): def process(rel): def mapper(x): dates = x["date"].to_numpy("datetime64[us]") days = x["days_to_add"].to_numpy("int") - x["result1"] = pandas.Series( - [pandas.to_datetime(y[0]).date() + timedelta(days=y[1].item()) for y in zip(dates, days)], + x["result1"] = pd.Series( + [pd.to_datetime(y[0]).date() + timedelta(days=y[1].item()) for y in zip(dates, days)], dtype="datetime64[us]", ) - x["result2"] = pandas.Series( - [pandas.to_datetime(y[0]).date() + timedelta(days=-y[1].item()) for y in zip(dates, days)], + x["result2"] = pd.Series( + [pd.to_datetime(y[0]).date() + timedelta(days=-y[1].item()) for y in zip(dates, days)], dtype="datetime64[us]", ) return x @@ -140,8 +137,8 @@ def mapper(x): rel = rel.project("*, IF(ABS(one) > ABS(two), one, two) as three") return rel - df = pandas.DataFrame( - {"date": pandas.Series([date(2000, 1, 1), date(2000, 1, 2)], dtype="datetime64[us]"), "days_to_add": [1, 2]} + df = pd.DataFrame( + {"date": pd.Series([date(2000, 1, 1), date(2000, 1, 2)], dtype="datetime64[us]"), "days_to_add": [1, 2]} ) rel = duckdb.from_df(df) rel = process(rel) @@ -172,10 +169,9 @@ def does_nothing(df): ): rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_explicit_schema_name_mismatch(self, pandas): + def 
test_explicit_schema_name_mismatch(self): def renames_column(df): - return pandas.DataFrame({"a": df["i"]}) + return pd.DataFrame({"a": df["i"]}) con = duckdb.connect() rel = con.sql("select i from range(10) tbl(i)") @@ -183,8 +179,7 @@ def renames_column(df): with pytest.raises(duckdb.InvalidInputException, match=re.escape("UDF column name mismatch")): rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_explicit_schema_error(self, pandas): + def test_explicit_schema_error(self): def no_op(df): return df @@ -196,8 +191,7 @@ def no_op(df): ): rel.map(no_op, schema=[int]) - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_returns_non_dataframe(self, pandas): + def test_returns_non_dataframe(self): def returns_series(df): return df.loc[:, "i"] @@ -205,17 +199,14 @@ def returns_series(df): rel = con.sql("select i, i as j from range(10) tbl(i)") with pytest.raises( duckdb.InvalidInputException, - match=re.escape( - "Expected the UDF to return an object of type 'pandas.DataFrame', found " - "'' instead" - ), + match=r"Expected the UDF to return an object of type 'pandas\.DataFrame', found " + r"'' instead", ): rel = rel.map(returns_series) - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_explicit_schema_columncount_mismatch(self, pandas): + def test_explicit_schema_columncount_mismatch(self): def returns_subset(df): - return pandas.DataFrame({"i": df.loc[:, "i"]}) + return pd.DataFrame({"i": df.loc[:, "i"]}) con = duckdb.connect() rel = con.sql("select i, i as j from range(10) tbl(i)") @@ -225,14 +216,13 @@ def returns_subset(df): ): rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas()]) - def test_pyarrow_df(self, pandas): + def test_pyarrow_df(self): # PyArrow backed dataframes only exist on pandas >= 2.0.0 pytest.importorskip("pandas", "2.0.0") def basic_function(df): # Create a pyarrow backed dataframe - df = pandas.DataFrame({"a": [5, 3, 2, 1, 2]}).convert_dtypes(dtype_backend="pyarrow") + df = pd.DataFrame({"a": [5, 3, 2, 1, 2]}).convert_dtypes(dtype_backend="pyarrow") return df con = duckdb.connect() diff --git a/tests/fast/test_multithread.py b/tests/fast/test_multithread.py index dfefb918..ccf809c5 100644 --- a/tests/fast/test_multithread.py +++ b/tests/fast/test_multithread.py @@ -4,8 +4,8 @@ from pathlib import Path import numpy as np +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -25,11 +25,10 @@ def everything_succeeded(results: list[bool]): class DuckDBThreaded: - def __init__(self, duckdb_insert_thread_count, thread_function, pandas) -> None: + def __init__(self, duckdb_insert_thread_count, thread_function) -> None: self.duckdb_insert_thread_count = duckdb_insert_thread_count self.threads = [] self.thread_function = thread_function - self.pandas = pandas def multithread_test(self, result_verification=everything_succeeded): duckdb_conn = duckdb.connect() @@ -38,9 +37,7 @@ def multithread_test(self, result_verification=everything_succeeded): # Create all threads for i in range(self.duckdb_insert_thread_count): self.threads.append( - threading.Thread( - target=self.thread_function, args=(duckdb_conn, queue, self.pandas), name="duckdb_thread_" + str(i) - ) + threading.Thread(target=self.thread_function, args=(duckdb_conn, queue), name="duckdb_thread_" + str(i)) ) # Record for every thread if they succeeded or not @@ -58,7 +55,7 @@ def multithread_test(self, result_verification=everything_succeeded): assert result_verification(thread_results) -def 
execute_query_same_connection(duckdb_conn, queue, pandas): +def execute_query_same_connection(duckdb_conn, queue): try: duckdb_conn.execute("select i from (values (42), (84), (NULL), (128)) tbl(i)") queue.put(False) @@ -66,7 +63,7 @@ def execute_query_same_connection(duckdb_conn, queue, pandas): queue.put(True) -def execute_query(duckdb_conn, queue, pandas): +def execute_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -76,7 +73,7 @@ def execute_query(duckdb_conn, queue, pandas): queue.put(False) -def insert_runtime_error(duckdb_conn, queue, pandas): +def insert_runtime_error(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -86,7 +83,7 @@ def insert_runtime_error(duckdb_conn, queue, pandas): queue.put(True) -def execute_many_query(duckdb_conn, queue, pandas): +def execute_many_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -119,7 +116,7 @@ def execute_many_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchone_query(duckdb_conn, queue, pandas): +def fetchone_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -129,7 +126,7 @@ def fetchone_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchall_query(duckdb_conn, queue, pandas): +def fetchall_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -139,7 +136,7 @@ def fetchall_query(duckdb_conn, queue, pandas): queue.put(False) -def conn_close(duckdb_conn, queue, pandas): +def conn_close(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -149,7 +146,7 @@ def conn_close(duckdb_conn, queue, pandas): queue.put(False) -def fetchnp_query(duckdb_conn, queue, pandas): +def fetchnp_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -159,7 +156,7 @@ def fetchnp_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchdf_query(duckdb_conn, queue, pandas): +def fetchdf_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -169,7 +166,7 @@ def fetchdf_query(duckdb_conn, queue, pandas): queue.put(False) -def fetchdf_chunk_query(duckdb_conn, queue, pandas): +def fetchdf_chunk_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -179,7 +176,7 @@ def fetchdf_chunk_query(duckdb_conn, queue, pandas): queue.put(False) -def fetch_arrow_query(duckdb_conn, queue, pandas): +def fetch_arrow_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -189,7 +186,7 @@ def fetch_arrow_query(duckdb_conn, queue, pandas): queue.put(False) -def fetch_record_batch_query(duckdb_conn, queue, pandas): +def fetch_record_batch_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -199,7 +196,7 @@ def fetch_record_batch_query(duckdb_conn, queue, pandas): queue.put(False) -def transaction_query(duckdb_conn, queue, pandas): +def transaction_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") @@ -214,11 +211,11 @@ def transaction_query(duckdb_conn, queue, pandas): queue.put(False) -def df_append(duckdb_conn, queue, pandas): +def df_append(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") - df = pandas.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) + df = pd.DataFrame(np.random.randint(0, 100, size=15), 
columns=["A"]) try: duckdb_conn.append("T", df) queue.put(True) @@ -226,10 +223,10 @@ def df_append(duckdb_conn, queue, pandas): queue.put(False) -def df_register(duckdb_conn, queue, pandas): +def df_register(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() - df = pandas.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) + df = pd.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) try: duckdb_conn.register("T", df) queue.put(True) @@ -237,10 +234,10 @@ def df_register(duckdb_conn, queue, pandas): queue.put(False) -def df_unregister(duckdb_conn, queue, pandas): +def df_unregister(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() - df = pandas.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) + df = pd.DataFrame(np.random.randint(0, 100, size=15), columns=["A"]) try: duckdb_conn.register("T", df) duckdb_conn.unregister("T") @@ -249,7 +246,7 @@ def df_unregister(duckdb_conn, queue, pandas): queue.put(False) -def arrow_register_unregister(duckdb_conn, queue, pandas): +def arrow_register_unregister(duckdb_conn, queue): # Get a new connection pa = pytest.importorskip("pyarrow") duckdb_conn = duckdb.connect() @@ -262,7 +259,7 @@ def arrow_register_unregister(duckdb_conn, queue, pandas): queue.put(False) -def table(duckdb_conn, queue, pandas): +def table(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") @@ -273,7 +270,7 @@ def table(duckdb_conn, queue, pandas): queue.put(False) -def view(duckdb_conn, queue, pandas): +def view(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE T ( i INTEGER)") @@ -285,7 +282,7 @@ def view(duckdb_conn, queue, pandas): queue.put(False) -def values(duckdb_conn, queue, pandas): +def values(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -295,7 +292,7 @@ def values(duckdb_conn, queue, pandas): queue.put(False) -def from_query(duckdb_conn, queue, pandas): +def from_query(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() try: @@ -305,10 +302,10 @@ def from_query(duckdb_conn, queue, pandas): queue.put(False) -def from_df(duckdb_conn, queue, pandas): +def from_df(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() - df = pandas.DataFrame(["bla", "blabla"] * 10, columns=["A"]) # noqa: F841 + df = pd.DataFrame(["bla", "blabla"] * 10, columns=["A"]) # noqa: F841 try: duckdb_conn.execute("select * from df").fetchall() queue.put(True) @@ -316,7 +313,7 @@ def from_df(duckdb_conn, queue, pandas): queue.put(False) -def from_arrow(duckdb_conn, queue, pandas): +def from_arrow(duckdb_conn, queue): # Get a new connection pa = pytest.importorskip("pyarrow") duckdb_conn = duckdb.connect() @@ -328,7 +325,7 @@ def from_arrow(duckdb_conn, queue, pandas): queue.put(False) -def from_csv_auto(duckdb_conn, queue, pandas): +def from_csv_auto(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() filename = str(Path(__file__).parent / "data" / "integers.csv") @@ -339,7 +336,7 @@ def from_csv_auto(duckdb_conn, queue, pandas): queue.put(False) -def from_parquet(duckdb_conn, queue, pandas): +def from_parquet(duckdb_conn, queue): # Get a new connection duckdb_conn = duckdb.connect() filename = str(Path(__file__).parent / "data" / "binary_string.parquet") @@ -350,7 +347,7 @@ def from_parquet(duckdb_conn, queue, pandas): queue.put(False) -def description(_, queue, __): +def description(_, 
queue): # Get a new connection duckdb_conn = duckdb.connect() duckdb_conn.execute("CREATE TABLE test (i bool, j TIME, k VARCHAR)") @@ -364,7 +361,7 @@ def description(_, queue, __): queue.put(False) -def cursor(duckdb_conn, queue, pandas): +def cursor(duckdb_conn, queue): # Get a new connection cx = duckdb_conn.cursor() try: @@ -375,136 +372,111 @@ def cursor(duckdb_conn, queue, pandas): class TestDuckMultithread: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_execute(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, execute_query, pandas) + def test_execute(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, execute_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_execute_many(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, execute_many_query, pandas) + def test_execute_many(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, execute_many_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchone(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchone_query, pandas) + def test_fetchone(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchone_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchall(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchall_query, pandas) + def test_fetchall(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchall_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_close(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, conn_close, pandas) + def test_close(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, conn_close) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchnp(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchnp_query, pandas) + def test_fetchnp(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchnp_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchdf(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchdf_query, pandas) + def test_fetchdf(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchdf_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetchdfchunk(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, fetchdf_chunk_query, pandas) + def test_fetchdfchunk(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, fetchdf_chunk_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetcharrow(self, duckdb_cursor, pandas): + def test_fetcharrow(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, fetch_arrow_query, pandas) + duck_threads = DuckDBThreaded(10, fetch_arrow_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_fetch_record_batch(self, duckdb_cursor, pandas): + def test_fetch_record_batch(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, fetch_record_batch_query, pandas) + duck_threads = DuckDBThreaded(10, 
fetch_record_batch_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_transaction(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, transaction_query, pandas) + def test_transaction(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, transaction_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_append(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, df_append, pandas) + def test_df_append(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, df_append) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_register(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, df_register, pandas) + def test_df_register(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, df_register) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_df_unregister(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, df_unregister, pandas) + def test_df_unregister(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, df_unregister) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_arrow_register_unregister(self, duckdb_cursor, pandas): + def test_arrow_register_unregister(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, arrow_register_unregister, pandas) + duck_threads = DuckDBThreaded(10, arrow_register_unregister) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_table(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, table, pandas) + def test_table(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, table) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_view(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, view, pandas) + def test_view(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, view) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_values(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, values, pandas) + def test_values(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, values) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_query(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_query, pandas) + def test_from_query(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_query) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_DF(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_df, pandas) + def test_from_DF(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_df) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_arrow(self, duckdb_cursor, pandas): + def test_from_arrow(self, duckdb_cursor): pytest.importorskip("pyarrow") - duck_threads = DuckDBThreaded(10, from_arrow, pandas) + duck_threads = DuckDBThreaded(10, from_arrow) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_csv_auto(self, 
duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_csv_auto, pandas) + def test_from_csv_auto(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_csv_auto) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_parquet(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, from_parquet, pandas) + def test_from_parquet(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, from_parquet) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_description(self, duckdb_cursor, pandas): - duck_threads = DuckDBThreaded(10, description, pandas) + def test_description(self, duckdb_cursor): + duck_threads = DuckDBThreaded(10, description) duck_threads.multithread_test() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_cursor(self, duckdb_cursor, pandas): + def test_cursor(self, duckdb_cursor): def only_some_succeed(results: list[bool]) -> bool: if not any(result for result in results): return False return not all(result for result in results) - duck_threads = DuckDBThreaded(10, cursor, pandas) + duck_threads = DuckDBThreaded(10, cursor) duck_threads.multithread_test(only_some_succeed) diff --git a/tests/fast/test_parameter_list.py b/tests/fast/test_parameter_list.py index 22413999..6d101bcb 100644 --- a/tests/fast/test_parameter_list.py +++ b/tests/fast/test_parameter_list.py @@ -1,5 +1,5 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -12,10 +12,9 @@ def test_bool(self, duckdb_cursor): res = conn.execute("select count(*) from bool_table where a =?", [True]) assert res.fetchone()[0] == 1 - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_exception(self, duckdb_cursor, pandas): + def test_exception(self, duckdb_cursor): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } diff --git a/tests/fast/test_relation.py b/tests/fast/test_relation.py index f386b091..a4949d64 100644 --- a/tests/fast/test_relation.py +++ b/tests/fast/test_relation.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb from duckdb import ColumnExpression @@ -39,10 +38,9 @@ def test_csv_auto(self): csv_rel = duckdb.from_csv_auto(temp_file_name) assert df_rel.execute().fetchall() == csv_rel.execute().fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_view(self, duckdb_cursor, pandas): + def test_relation_view(self, duckdb_cursor): def create_view(duckdb_cursor) -> None: - df_in = pandas.DataFrame({"numbers": [1, 2, 3, 4, 5]}) + df_in = pd.DataFrame({"numbers": [1, 2, 3, 4, 5]}) rel = duckdb_cursor.query("select * from df_in") rel.to_view("my_view") diff --git a/tests/fast/test_relation_dependency_leak.py b/tests/fast/test_relation_dependency_leak.py index 659e1c28..db83ff1c 100644 --- a/tests/fast/test_relation_dependency_leak.py +++ b/tests/fast/test_relation_dependency_leak.py @@ -1,6 +1,7 @@ import os import numpy as np +import pandas as pd import pytest try: @@ -9,67 +10,61 @@ can_run = True except ImportError: can_run = False -from conftest import ArrowPandas, NumpyPandas psutil = pytest.importorskip("psutil") -def check_memory(function_to_check, pandas, duckdb_cursor): +def check_memory(function_to_check, duckdb_cursor): process = psutil.Process(os.getpid()) mem_usage = 
process.memory_info().rss / (10**9) for __ in range(100): - function_to_check(pandas, duckdb_cursor) + function_to_check(duckdb_cursor) cur_mem_usage = process.memory_info().rss / (10**9) # This seems a good empirical value assert cur_mem_usage / 3 < mem_usage -def from_df(pandas, duckdb_cursor): - df = pandas.DataFrame({"x": np.random.rand(1_000_000)}) +def from_df(duckdb_cursor): + df = pd.DataFrame({"x": np.random.rand(1_000_000)}) return duckdb_cursor.from_df(df) -def from_arrow(pandas, duckdb_cursor): +def from_arrow(duckdb_cursor): data = pa.array(np.random.rand(1_000_000), type=pa.float32()) arrow_table = pa.Table.from_arrays([data], ["a"]) duckdb_cursor.from_arrow(arrow_table) -def arrow_replacement(pandas, duckdb_cursor): +def arrow_replacement(duckdb_cursor): data = pa.array(np.random.rand(1_000_000), type=pa.float32()) arrow_table = pa.Table.from_arrays([data], ["a"]) # noqa: F841 duckdb_cursor.query("select sum(a) from arrow_table").fetchall() -def pandas_replacement(pandas, duckdb_cursor): - df = pandas.DataFrame({"x": np.random.rand(1_000_000)}) # noqa: F841 +def pandas_replacement(duckdb_cursor): + df = pd.DataFrame({"x": np.random.rand(1_000_000)}) # noqa: F841 duckdb_cursor.query("select sum(x) from df").fetchall() class TestRelationDependencyMemoryLeak: - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_arrow_leak(self, pandas, duckdb_cursor): + def test_from_arrow_leak(self, duckdb_cursor): if not can_run: return - check_memory(from_arrow, pandas, duckdb_cursor) + check_memory(from_arrow, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_from_df_leak(self, pandas, duckdb_cursor): - check_memory(from_df, pandas, duckdb_cursor) + def test_from_df_leak(self, duckdb_cursor): + check_memory(from_df, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_arrow_replacement_scan_leak(self, pandas, duckdb_cursor): + def test_arrow_replacement_scan_leak(self, duckdb_cursor): if not can_run: return - check_memory(arrow_replacement, pandas, duckdb_cursor) + check_memory(arrow_replacement, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_pandas_replacement_scan_leak(self, pandas, duckdb_cursor): - check_memory(pandas_replacement, pandas, duckdb_cursor) + def test_pandas_replacement_scan_leak(self, duckdb_cursor): + check_memory(pandas_replacement, duckdb_cursor) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_view_leak(self, pandas, duckdb_cursor): - rel = from_df(pandas, duckdb_cursor) + def test_relation_view_leak(self, duckdb_cursor): + rel = from_df(duckdb_cursor) rel.create_view("bla") duckdb_cursor.unregister("bla") assert rel.query("bla", "select count(*) from bla").fetchone()[0] == 1_000_000 diff --git a/tests/fast/test_runtime_error.py b/tests/fast/test_runtime_error.py index 9f1975a0..44910a13 100644 --- a/tests/fast/test_runtime_error.py +++ b/tests/fast/test_runtime_error.py @@ -1,5 +1,5 @@ +import pandas as pd import pytest -from conftest import ArrowPandas, NumpyPandas import duckdb @@ -61,10 +61,9 @@ def test_arrow_record_batch_reader_error(self): with pytest.raises(duckdb.ProgrammingError, match="There is no query result"): res.fetch_arrow_reader(1) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_cache_fetchall(self, pandas): + def test_relation_cache_fetchall(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in 
= pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -78,10 +77,9 @@ def test_relation_cache_fetchall(self, pandas): # so the dependency of 'x' on 'df_in' is not registered in 'rel' rel.fetchall() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_cache_execute(self, pandas): + def test_relation_cache_execute(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -92,10 +90,9 @@ def test_relation_cache_execute(self, pandas): with pytest.raises(duckdb.ProgrammingError, match="Table with name df_in does not exist"): rel.execute() - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_relation_query_error(self, pandas): + def test_relation_query_error(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -106,10 +103,9 @@ def test_relation_query_error(self, pandas): with pytest.raises(duckdb.CatalogException, match="Table with name df_in does not exist"): rel.query("bla", "select * from bla") - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_conn_broken_statement_error(self, pandas): + def test_conn_broken_statement_error(self): conn = duckdb.connect() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } @@ -128,11 +124,10 @@ def test_conn_prepared_statement_error(self): ): conn.execute("select * from integers where a =? and b=?", [1]) - @pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()]) - def test_closed_conn_exceptions(self, pandas): + def test_closed_conn_exceptions(self): conn = duckdb.connect() conn.close() - df_in = pandas.DataFrame( + df_in = pd.DataFrame( { "numbers": [1, 2, 3, 4, 5], } From 0576a1c23ba1878f3d4868a8aa6bf36f57921d6d Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Fri, 23 Jan 2026 16:54:36 +0100 Subject: [PATCH 21/37] bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index bd560976..32940e60 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit bd56097640dd7509c2baafe2734aa3830b17097d +Subproject commit 32940e6025a1698440ce6524c3ed4730dc7bc517 From 5d9b64ee2b77de72b53d741a355c20dcd239129a Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Fri, 23 Jan 2026 16:41:02 +0100 Subject: [PATCH 22/37] Add timedelta[s|ms|us|ns] numpy types --- .../duckdb_python/numpy/numpy_type.hpp | 41 ++++++++++--------- src/duckdb_py/numpy/array_wrapper.cpp | 2 +- src/duckdb_py/numpy/numpy_scan.cpp | 25 ++++++++++- src/duckdb_py/numpy/raw_array_wrapper.cpp | 2 +- src/duckdb_py/numpy/type.cpp | 16 +++++++- .../fast/pandas/test_df_object_resolution.py | 2 +- tests/fast/pandas/test_stride.py | 4 +- tests/fast/pandas/test_timestamp.py | 4 +- 8 files changed, 68 insertions(+), 28 deletions(-) diff --git a/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp b/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp index 982f00ec..d58bc139 100644 --- a/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp +++ b/src/duckdb_py/include/duckdb_python/numpy/numpy_type.hpp @@ -18,25 +18,28 @@ namespace duckdb { // Pandas Specific Types (e.g., categorical, datetime_tz,...) enum class NumpyNullableType : uint8_t { //! NumPy dtypes - BOOL, //! bool_, bool8 - INT_8, //! byte, int8 - UINT_8, //! ubyte, uint8 - INT_16, //! int16, short - UINT_16, //! uint16, ushort - INT_32, //! int32, intc - UINT_32, //! 
uint32, uintc, - INT_64, //! int64, int0, int_, intp, matrix - UINT_64, //! uint64, uint, uint0, uintp - FLOAT_16, //! float16, half - FLOAT_32, //! float32, single - FLOAT_64, //! float64, float_, double - OBJECT, //! object - UNICODE, //! static int64_t ConvertValue(interval_t val, NumpyAppendData &append_data) { (void)append_data; - return Interval::GetNanoseconds(val); + return Interval::GetMicro(val); } template diff --git a/src/duckdb_py/numpy/numpy_scan.cpp b/src/duckdb_py/numpy/numpy_scan.cpp index 0117eaae..b1cd6e60 100644 --- a/src/duckdb_py/numpy/numpy_scan.cpp +++ b/src/duckdb_py/numpy/numpy_scan.cpp @@ -302,7 +302,10 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset, } break; } - case NumpyNullableType::TIMEDELTA: { + case NumpyNullableType::TIMEDELTA_NS: + case NumpyNullableType::TIMEDELTA_US: + case NumpyNullableType::TIMEDELTA_MS: + case NumpyNullableType::TIMEDELTA_S: { auto src_ptr = reinterpret_cast(array.data()); auto tgt_ptr = FlatVector::GetData(out); auto &mask = FlatVector::Validity(out); @@ -314,7 +317,25 @@ void NumpyScan::Scan(PandasColumnBindData &bind_data, idx_t count, idx_t offset, mask.SetInvalid(row); continue; } - int64_t micro = src_ptr[source_idx] / 1000; + + int64_t micro; + switch (bind_data.numpy_type.type) { + case NumpyNullableType::TIMEDELTA_NS: + micro = src_ptr[source_idx] / 1000; // ns -> us + break; + case NumpyNullableType::TIMEDELTA_US: + micro = src_ptr[source_idx]; // already us + break; + case NumpyNullableType::TIMEDELTA_MS: + micro = src_ptr[source_idx] * 1000; // ms -> us + break; + case NumpyNullableType::TIMEDELTA_S: + micro = src_ptr[source_idx] * 1000000; // s -> us + break; + default: + throw InternalException("Unexpected timedelta type"); + } + int64_t days = micro / Interval::MICROS_PER_DAY; micro = micro % Interval::MICROS_PER_DAY; int64_t months = days / Interval::DAYS_PER_MONTH; diff --git a/src/duckdb_py/numpy/raw_array_wrapper.cpp b/src/duckdb_py/numpy/raw_array_wrapper.cpp index 5d73685b..0b3f8d14 100644 --- a/src/duckdb_py/numpy/raw_array_wrapper.cpp +++ b/src/duckdb_py/numpy/raw_array_wrapper.cpp @@ -108,7 +108,7 @@ string RawArrayWrapper::DuckDBToNumpyDtype(const LogicalType &type) { case LogicalTypeId::DATE: return "datetime64[us]"; case LogicalTypeId::INTERVAL: - return "timedelta64[ns]"; + return "timedelta64[us]"; case LogicalTypeId::TIME: case LogicalTypeId::TIME_TZ: case LogicalTypeId::VARCHAR: diff --git a/src/duckdb_py/numpy/type.cpp b/src/duckdb_py/numpy/type.cpp index 3642cbd4..3d8d9096 100644 --- a/src/duckdb_py/numpy/type.cpp +++ b/src/duckdb_py/numpy/type.cpp @@ -65,7 +65,16 @@ static NumpyNullableType ConvertNumpyTypeInternal(const string &col_type_str) { return NumpyNullableType::OBJECT; } if (col_type_str == "timedelta64[ns]") { - return NumpyNullableType::TIMEDELTA; + return NumpyNullableType::TIMEDELTA_NS; + } + if (col_type_str == "timedelta64[us]") { + return NumpyNullableType::TIMEDELTA_US; + } + if (col_type_str == "timedelta64[ms]") { + return NumpyNullableType::TIMEDELTA_MS; + } + if (col_type_str == "timedelta64[s]") { + return NumpyNullableType::TIMEDELTA_S; } // We use 'StartsWith' because it might have ', tz' at the end, indicating timezone if (StringUtil::StartsWith(col_type_str, "datetime64[ns")) { @@ -143,7 +152,10 @@ LogicalType NumpyToLogicalType(const NumpyType &col_type) { return LogicalType::VARCHAR; case NumpyNullableType::OBJECT: return LogicalType::VARCHAR; - case NumpyNullableType::TIMEDELTA: + case NumpyNullableType::TIMEDELTA_NS: + case 
NumpyNullableType::TIMEDELTA_US: + case NumpyNullableType::TIMEDELTA_MS: + case NumpyNullableType::TIMEDELTA_S: return LogicalType::INTERVAL; case NumpyNullableType::DATETIME_MS: { if (col_type.has_timezone) { diff --git a/tests/fast/pandas/test_df_object_resolution.py b/tests/fast/pandas/test_df_object_resolution.py index 2f78e27d..0c5ab311 100644 --- a/tests/fast/pandas/test_df_object_resolution.py +++ b/tests/fast/pandas/test_df_object_resolution.py @@ -9,9 +9,9 @@ import numpy as np import pandas as pd import pytest +from conftest import is_string_dtype import duckdb -from tests.conftest import is_string_dtype standard_vector_size = duckdb.__standard_vector_size__ diff --git a/tests/fast/pandas/test_stride.py b/tests/fast/pandas/test_stride.py index cbe23cfd..65204ea8 100644 --- a/tests/fast/pandas/test_stride.py +++ b/tests/fast/pandas/test_stride.py @@ -57,7 +57,9 @@ def test_stride_timedelta(self, duckdb_cursor): ] } ) - pd.testing.assert_frame_equal(roundtrip, expected) + # DuckDB INTERVAL type stores in microseconds, so output is always timedelta64[us] + # Check values match without strict dtype comparison + pd.testing.assert_frame_equal(roundtrip, expected, check_dtype=False) def test_stride_fp64(self, duckdb_cursor): expected_df = pd.DataFrame(np.arange(20, dtype="float64").reshape(5, 4), columns=["a", "b", "c", "d"]) diff --git a/tests/fast/pandas/test_timestamp.py b/tests/fast/pandas/test_timestamp.py index 81651634..c6d080b8 100644 --- a/tests/fast/pandas/test_timestamp.py +++ b/tests/fast/pandas/test_timestamp.py @@ -65,7 +65,9 @@ def test_timestamp_timedelta(self): } ) df_from_duck = duckdb.from_df(df).df() - assert df_from_duck.equals(df) + # DuckDB INTERVAL type stores in microseconds, so output is always timedelta64[us] + # Check values match without strict dtype comparison + pd.testing.assert_frame_equal(df_from_duck, df, check_dtype=False) @pytest.mark.xfail( condition=platform.system() == "Emscripten" and os.environ.get("TZ") != "UTC", From 8361d73425cc7b6e53785ad2b76ed898342a768d Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Fri, 23 Jan 2026 18:02:20 +0100 Subject: [PATCH 23/37] Move slow tests to slow and fix test error when pyarrow is missing --- tests/fast/pandas/test_import_cache.py | 12 +++++- tests/fast/test_relation.py | 10 ----- tests/slow/test_materialized_relation.py | 52 ++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 11 deletions(-) create mode 100644 tests/slow/test_materialized_relation.py diff --git a/tests/fast/pandas/test_import_cache.py b/tests/fast/pandas/test_import_cache.py index f744c671..1b3a98ee 100644 --- a/tests/fast/pandas/test_import_cache.py +++ b/tests/fast/pandas/test_import_cache.py @@ -1,10 +1,20 @@ +import importlib.util + import pandas as pd import pytest import duckdb -@pytest.mark.parametrize("string_dtype", ["python", "pyarrow"]) +@pytest.mark.parametrize( + "string_dtype", + [ + "python", + pytest.param( + "pyarrow", marks=pytest.mark.skipif(not importlib.util.find_spec("pyarrow"), reason="pyarrow not installed") + ), + ], +) def test_import_cache_explicit_dtype(string_dtype): df = pd.DataFrame( # noqa: F841 { diff --git a/tests/fast/test_relation.py b/tests/fast/test_relation.py index a4949d64..220fb954 100644 --- a/tests/fast/test_relation.py +++ b/tests/fast/test_relation.py @@ -2,7 +2,6 @@ import datetime import gc import os -import platform import tempfile import numpy as np @@ -534,15 +533,6 @@ def test_relation_print(self): 1024, 2048, 5000, - 1000000, - pytest.param( - 10000000, - 
marks=pytest.mark.skipif( - condition=platform.system() == "Emscripten", - reason="Emscripten/Pyodide builds run out of memory at this scale, and error might not " - "thrown reliably", - ), - ), ], ) def test_materialized_relation(self, duckdb_cursor, num_rows): diff --git a/tests/slow/test_materialized_relation.py b/tests/slow/test_materialized_relation.py new file mode 100644 index 00000000..69008adc --- /dev/null +++ b/tests/slow/test_materialized_relation.py @@ -0,0 +1,52 @@ +import platform + +import pytest + + +class TestMaterializedRelationSlow: + @pytest.mark.parametrize( + "num_rows", + [ + 1000000, + pytest.param( + 10000000, + marks=pytest.mark.skipif( + condition=platform.system() == "Emscripten", + reason="Emscripten/Pyodide builds run out of memory at this scale, and error might not " + "thrown reliably", + ), + ), + ], + ) + def test_materialized_relation(self, duckdb_cursor, num_rows): + # Anything that is not a SELECT statement becomes a materialized relation, so we use `CALL` + query = f"call repeat_row(42, 'test', 'this is a long string', true, num_rows={num_rows})" + rel = duckdb_cursor.sql(query) + res = rel.fetchone() + assert res is not None + + res = rel.fetchmany(num_rows) + assert len(res) == num_rows - 1 + + res = rel.fetchmany(5) + assert len(res) == 0 + res = rel.fetchmany(5) + assert len(res) == 0 + res = rel.fetchone() + assert res is None + + rel.execute() + res = rel.fetchone() + assert res is not None + + res = rel.fetchall() + assert len(res) == num_rows - 1 + res = rel.fetchall() + assert len(res) == num_rows + + rel = duckdb_cursor.sql(query) + projection = rel.select("column0") + assert projection.fetchall() == [(42,) for _ in range(num_rows)] + + filtered = rel.filter("column1 != 'test'") + assert filtered.fetchall() == [] From 77a3f8d637982542f6e91569b9ecad3e96a02b32 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Fri, 23 Jan 2026 19:25:52 +0100 Subject: [PATCH 24/37] Set submodule to release hash --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index 431ad092..6ddac802 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 431ad092c9d666c81b3739438ab19d72fc622362 +Subproject commit 6ddac802ffa9bcfbcc3f5f0d71de5dff9b0bc250 From c5e764ae221a5cb8018ae4b91aed948de0899340 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 26 Jan 2026 10:56:15 +0100 Subject: [PATCH 25/37] Fix logical type creation --- src/duckdb_py/typing/pytype.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/duckdb_py/typing/pytype.cpp b/src/duckdb_py/typing/pytype.cpp index e7e31a18..285a4838 100644 --- a/src/duckdb_py/typing/pytype.cpp +++ b/src/duckdb_py/typing/pytype.cpp @@ -122,7 +122,13 @@ static LogicalType FromString(const string &type_str, shared_ptrcon.GetConnection(); - return TransformStringToLogicalType(type_str, *connection.context); + if (connection.HasActiveTransaction()) { + return TransformStringToLogicalType(type_str, *connection.context); + } + connection.BeginTransaction(); + auto type = TransformStringToLogicalType(type_str, *connection.context); + connection.Commit(); + return type; } static bool FromNumpyType(const py::object &type, LogicalType &result) { From ae1fc0b39aa9182f52136e434c2771ce252a44cd Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 26 Jan 2026 10:57:18 +0100 Subject: [PATCH 26/37] Bump submodule to pull in fixes --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/external/duckdb b/external/duckdb index 32940e60..cbb5d8b7 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 32940e6025a1698440ce6524c3ed4730dc7bc517 +Subproject commit cbb5d8b7388d034c83d7fd22467074c00b322254 From fcdc93b5315a0a7a52662eabe82895b156499a47 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 26 Jan 2026 11:17:49 +0100 Subject: [PATCH 27/37] Missing import --- src/duckdb_py/include/duckdb_python/expression/pyexpression.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/duckdb_py/include/duckdb_python/expression/pyexpression.hpp b/src/duckdb_py/include/duckdb_python/expression/pyexpression.hpp index 39dd252f..43c0c5c3 100644 --- a/src/duckdb_py/include/duckdb_python/expression/pyexpression.hpp +++ b/src/duckdb_py/include/duckdb_python/expression/pyexpression.hpp @@ -12,6 +12,7 @@ #include "duckdb.hpp" #include "duckdb/common/string.hpp" #include "duckdb/parser/parsed_expression.hpp" +#include "duckdb/parser/expression/case_expression.hpp" #include "duckdb/parser/expression/constant_expression.hpp" #include "duckdb/parser/expression/columnref_expression.hpp" #include "duckdb/parser/expression/function_expression.hpp" From ab2d2a3fed591656ab3a55eb71b5a9294805fa83 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 26 Jan 2026 17:14:09 +0100 Subject: [PATCH 28/37] check conversion errors in time --- src/duckdb_py/native/python_conversion.cpp | 4 +-- src/duckdb_py/python_udf.cpp | 39 ++++++++++++---------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/duckdb_py/native/python_conversion.cpp b/src/duckdb_py/native/python_conversion.cpp index caa409c5..4a01b566 100644 --- a/src/duckdb_py/native/python_conversion.cpp +++ b/src/duckdb_py/native/python_conversion.cpp @@ -953,18 +953,18 @@ void TransformPythonObjectInternal(py::handle ele, A &result, const B ¶m, bo default: break; } - if (overflow == 1) { + if (overflow == 1) { // value is > LLONG_MAX uint64_t unsigned_value = PyLong_AsUnsignedLongLong(ptr); if (!PyErr_Occurred()) { // value does not fit within an int64, but it fits within a uint64 OP::HandleUnsignedBigint(result, param, unsigned_value); break; } + PyErr_Clear(); if (conversion_target.id() == LogicalTypeId::UBIGINT) { throw InvalidInputException("Python Conversion Failure: Value out of range for type %s", conversion_target); } - PyErr_Clear(); } double number = PyLong_AsDouble(ele.ptr()); if (number == -1.0 && PyErr_Occurred()) { diff --git a/src/duckdb_py/python_udf.cpp b/src/duckdb_py/python_udf.cpp index fd6775e0..c67ae50e 100644 --- a/src/duckdb_py/python_udf.cpp +++ b/src/duckdb_py/python_udf.cpp @@ -307,26 +307,31 @@ static scalar_function_t CreateNativeFunction(PyObject *function, PythonExceptio for (idx_t row = 0; row < input.size(); row++) { - auto bundled_parameters = py::tuple((int)input.ColumnCount()); - bool contains_null = false; - for (idx_t i = 0; i < input.ColumnCount(); i++) { - // Fill the tuple with the arguments for this row - auto &column = input.data[i]; - auto value = column.GetValue(row); - if (value.IsNull() && default_null_handling) { - contains_null = true; - break; + py::object ret; + if (input.ColumnCount() > 0) { + auto bundled_parameters = py::tuple((int)input.ColumnCount()); + bool contains_null = false; + for (idx_t i = 0; i < input.ColumnCount(); i++) { + // Fill the tuple with the arguments for this row + auto &column = input.data[i]; + auto value = column.GetValue(row); + if (value.IsNull() && default_null_handling) { + contains_null = true; + break; + } + 
bundled_parameters[i] = PythonObject::FromValue(value, column.GetType(), client_properties); } - bundled_parameters[i] = PythonObject::FromValue(value, column.GetType(), client_properties); - } - if (contains_null) { - // Immediately insert None, no need to call the function - FlatVector::SetNull(result, row, true); - continue; + if (contains_null) { + // Immediately insert None, no need to call the function + FlatVector::SetNull(result, row, true); + continue; + } + // Call the function + ret = py::reinterpret_steal(PyObject_CallObject(function, bundled_parameters.ptr())); + } else { + ret = py::reinterpret_steal(PyObject_CallObject(function, nullptr)); } - // Call the function - auto ret = py::reinterpret_steal(PyObject_CallObject(function, bundled_parameters.ptr())); if (ret == nullptr && PyErr_Occurred()) { if (exception_handling == PythonExceptionHandling::FORWARD_ERROR) { auto exception = py::error_already_set(); From 7e5a18c50fd4e11ac164c4e71dc80ad3a78d1f16 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 26 Jan 2026 21:14:34 +0100 Subject: [PATCH 29/37] Fix transaction errors and more --- src/duckdb_py/pyrelation.cpp | 3 +- src/duckdb_py/typing/pytype.cpp | 10 ++-- tests/fast/arrow/test_filter_pushdown.py | 60 ++++++------------- tests/fast/pandas/test_pandas_arrow.py | 2 - .../test_pyarrow_projection_pushdown.py | 2 - tests/fast/test_replacement_scan.py | 3 +- 6 files changed, 25 insertions(+), 55 deletions(-) diff --git a/src/duckdb_py/pyrelation.cpp b/src/duckdb_py/pyrelation.cpp index f265938c..62e9cd2d 100644 --- a/src/duckdb_py/pyrelation.cpp +++ b/src/duckdb_py/pyrelation.cpp @@ -128,7 +128,8 @@ unique_ptr DuckDBPyRelation::ProjectFromTypes(const py::object LogicalType type; if (py::isinstance(item)) { string type_str = py::str(item); - type = TransformStringToLogicalType(type_str, *rel->context->GetContext()); + rel->context->GetContext()->RunFunctionInTransaction( + [&]() { type = TransformStringToLogicalType(type_str, *rel->context->GetContext().get()); }); } else if (py::isinstance(item)) { auto *type_p = item.cast(); type = type_p->Type(); diff --git a/src/duckdb_py/typing/pytype.cpp b/src/duckdb_py/typing/pytype.cpp index 285a4838..58503cf3 100644 --- a/src/duckdb_py/typing/pytype.cpp +++ b/src/duckdb_py/typing/pytype.cpp @@ -122,12 +122,10 @@ static LogicalType FromString(const string &type_str, shared_ptrcon.GetConnection(); - if (connection.HasActiveTransaction()) { - return TransformStringToLogicalType(type_str, *connection.context); - } - connection.BeginTransaction(); - auto type = TransformStringToLogicalType(type_str, *connection.context); - connection.Commit(); + + LogicalType type; + connection.context->RunFunctionInTransaction( + [&]() { type = TransformStringToLogicalType(type_str, *connection.context); }); return type; } diff --git a/tests/fast/arrow/test_filter_pushdown.py b/tests/fast/arrow/test_filter_pushdown.py index 225f48c0..ad63f547 100644 --- a/tests/fast/arrow/test_filter_pushdown.py +++ b/tests/fast/arrow/test_filter_pushdown.py @@ -2,23 +2,25 @@ import sys import pytest -from conftest import pandas_supports_arrow_backend +from conftest import PANDAS_GE_3 from packaging.version import Version import duckdb pa = pytest.importorskip("pyarrow") -pd = pytest.importorskip("pyarrow.dataset") +pa_ds = pytest.importorskip("pyarrow.dataset") pa_lib = pytest.importorskip("pyarrow.lib") -pq = pytest.importorskip("pyarrow.parquet") +pa_parquet = pytest.importorskip("pyarrow.parquet") +pd = pytest.importorskip("pandas") np = 
pytest.importorskip("numpy") re = pytest.importorskip("re") def create_pyarrow_pandas(rel): - if not pandas_supports_arrow_backend(): - pytest.skip(reason="Pandas version doesn't support 'pyarrow' backend") - return rel.df().convert_dtypes(dtype_backend="pyarrow") + if PANDAS_GE_3: + return rel.df() + else: + return rel.df().convert_dtypes(dtype_backend="pyarrow") def create_pyarrow_table(rel): @@ -27,7 +29,7 @@ def create_pyarrow_table(rel): def create_pyarrow_dataset(rel): table = create_pyarrow_table(rel) - return pd.dataset(table) + return pa_ds.dataset(table) def test_decimal_filter_pushdown(duckdb_cursor): @@ -550,7 +552,7 @@ def test_9371(self, duckdb_cursor, tmp_path): df = df.set_index("ts") # SET INDEX! (It all works correctly when the index is not set) df.to_parquet(str(file_path)) - my_arrow_dataset = pd.dataset(str(file_path)) + my_arrow_dataset = pa_ds.dataset(str(file_path)) res = duckdb_cursor.execute("SELECT * FROM my_arrow_dataset WHERE ts = ?", parameters=[dt]).fetch_arrow_table() output = duckdb_cursor.sql("select * from res").fetchall() expected = [(1, dt), (2, dt), (3, dt)] @@ -708,34 +710,6 @@ def test_filter_pushdown_2145(self, duckdb_cursor, tmp_path, create_table): expected_df = duckdb.from_parquet(glob_pattern.as_posix()).filter("date > '2019-01-01'").df() pandas.testing.assert_frame_equal(expected_df, output_df) - # https://github.com/duckdb/duckdb/pull/4817/files#r1339973721 - @pytest.mark.parametrize("create_table", [create_pyarrow_pandas, create_pyarrow_table]) - def test_filter_column_removal(self, duckdb_cursor, create_table): - duckdb_cursor.execute( - """ - CREATE TABLE test AS SELECT - range a, - 100 - range b - FROM range(100) - """ - ) - duck_test_table = duckdb_cursor.table("test") - arrow_table = create_table(duck_test_table) - - # PR 4817 - remove filter columns that are unused in the remainder of the query plan from the table function - query_res = duckdb_cursor.execute( - """ - EXPLAIN SELECT count(*) FROM arrow_table WHERE - a > 25 AND b > 25 - """ - ).fetchall() - - # scanned columns that come out of the scan are displayed like this, so we shouldn't see them - match = re.search("│ +a +│", query_res[0][1]) - assert not match - match = re.search("│ +b +│", query_res[0][1]) - assert not match - @pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires python 3.9") @pytest.mark.parametrize("create_table", [create_pyarrow_pandas, create_pyarrow_table]) def test_struct_filter_pushdown(self, duckdb_cursor, create_table): @@ -1023,14 +997,14 @@ def test_dynamic_filter(self, duckdb_cursor): def test_binary_view_filter(self, duckdb_cursor): """Filters on a view column work (without pushdown because pyarrow does not support view filters yet).""" table = pa.table({"col": pa.array([b"abc", b"efg"], type=pa.binary_view())}) - dset = pd.dataset(table) + dset = pa_ds.dataset(table) res = duckdb_cursor.sql("select * from dset where col = 'abc'::binary") assert len(res) == 1 def test_string_view_filter(self, duckdb_cursor): """Filters on a view column work (without pushdown because pyarrow does not support view filters yet).""" table = pa.table({"col": pa.array(["abc", "efg"], type=pa.string_view())}) - dset = pd.dataset(table) + dset = pa_ds.dataset(table) res = duckdb_cursor.sql("select * from dset where col = 'abc'") assert len(res) == 1 @@ -1038,10 +1012,10 @@ def test_string_view_filter(self, duckdb_cursor): def test_canary_for_pyarrow_string_view_filter_support(self, duckdb_cursor): """This canary will xpass when pyarrow implements string view filter 
support.""" # predicate: field == "string value" - filter_expr = pd.field("col") == pd.scalar("val1") + filter_expr = pa_ds.field("col") == pa_ds.scalar("val1") # dataset with a string view column table = pa.table({"col": pa.array(["val1", "val2"], type=pa.string_view())}) - dset = pd.dataset(table) + dset = pa_ds.dataset(table) # creating the scanner fails dset.scanner(columns=["col"], filter=filter_expr) @@ -1049,10 +1023,10 @@ def test_canary_for_pyarrow_string_view_filter_support(self, duckdb_cursor): def test_canary_for_pyarrow_binary_view_filter_support(self, duckdb_cursor): """This canary will xpass when pyarrow implements binary view filter support.""" # predicate: field == const - const = pd.scalar(pa.scalar(b"bin1", pa.binary_view())) - filter_expr = pd.field("col") == const + const = pa_ds.scalar(pa.scalar(b"bin1", pa.binary_view())) + filter_expr = pa_ds.field("col") == const # dataset with a string view column table = pa.table({"col": pa.array([b"bin1", b"bin2"], type=pa.binary_view())}) - dset = pd.dataset(table) + dset = pa_ds.dataset(table) # creating the scanner fails dset.scanner(columns=["col"], filter=filter_expr) diff --git a/tests/fast/pandas/test_pandas_arrow.py b/tests/fast/pandas/test_pandas_arrow.py index 0cb1f00d..bab23eec 100644 --- a/tests/fast/pandas/test_pandas_arrow.py +++ b/tests/fast/pandas/test_pandas_arrow.py @@ -2,7 +2,6 @@ import numpy as np import pytest -from conftest import pandas_supports_arrow_backend import duckdb @@ -11,7 +10,6 @@ from pandas.api.types import is_integer_dtype # noqa: E402 -@pytest.mark.skipif(not pandas_supports_arrow_backend(), reason="pandas does not support the 'pyarrow' backend") class TestPandasArrow: def test_pandas_arrow(self, duckdb_cursor): pd = pytest.importorskip("pandas") diff --git a/tests/fast/pandas/test_pyarrow_projection_pushdown.py b/tests/fast/pandas/test_pyarrow_projection_pushdown.py index 87f49f04..ca7bc905 100644 --- a/tests/fast/pandas/test_pyarrow_projection_pushdown.py +++ b/tests/fast/pandas/test_pyarrow_projection_pushdown.py @@ -1,5 +1,4 @@ import pytest -from conftest import pandas_supports_arrow_backend import duckdb @@ -8,7 +7,6 @@ _ = pytest.importorskip("pandas", "2.0.0") -@pytest.mark.skipif(not pandas_supports_arrow_backend(), reason="pandas does not support the 'pyarrow' backend") class TestArrowDFProjectionPushdown: def test_projection_pushdown_no_filter(self, duckdb_cursor): duckdb_conn = duckdb.connect() diff --git a/tests/fast/test_replacement_scan.py b/tests/fast/test_replacement_scan.py index 1e76d1d5..ff1c423f 100644 --- a/tests/fast/test_replacement_scan.py +++ b/tests/fast/test_replacement_scan.py @@ -469,7 +469,8 @@ def test_replacement_disabled(self): with pytest.raises(duckdb.CatalogException, match="Table with name df does not exist!"): create_relation(con, "select * from df") with pytest.raises( - duckdb.InvalidInputException, match="Cannot change enable_external_access setting while database is running" + duckdb.InvalidInputException, + match="Invalid Input Error: Cannot enable external access while database is running", ): con.execute("set enable_external_access=true") From f4a82b6fd52fd5d920270d92b5d2c1ccc899abe3 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Mon, 26 Jan 2026 21:15:36 +0100 Subject: [PATCH 30/37] bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index cbb5d8b7..c65f4e48 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit 
cbb5d8b7388d034c83d7fd22467074c00b322254 +Subproject commit c65f4e48a300bfd49a5d799195e4100f30637cf0 From bbb418b9f8049f3999b934e4fb12762674008da9 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 10:18:03 +0100 Subject: [PATCH 31/37] Fix last failing tests and remove support for Python 3.9 --- .github/workflows/code_quality.yml | 2 +- .github/workflows/coverage.yml | 2 +- .github/workflows/packaging_sdist.yml | 2 +- .github/workflows/packaging_wheels.yml | 3 +-- .github/workflows/targeted_test.yml | 1 - LICENSE | 2 +- pyproject.toml | 31 +++++++++++++------------- tests/fast/test_filesystem.py | 4 +++- tests/fast/test_replacement_scan.py | 1 - 9 files changed, 23 insertions(+), 25 deletions(-) diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml index 99b7884c..b6f1d1c2 100644 --- a/.github/workflows/code_quality.yml +++ b/.github/workflows/code_quality.yml @@ -32,7 +32,7 @@ jobs: uses: astral-sh/setup-uv@v7 with: version: "0.9.0" - python-version: 3.9 + python-version: 3.10 - name: pre-commit (cache) uses: actions/cache@v4 diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index fd62b6c1..3187fd83 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -71,7 +71,7 @@ jobs: uses: astral-sh/setup-uv@v7 with: version: "0.9.0" - python-version: 3.9 + python-version: 3.12 enable-cache: true cache-suffix: -${{ github.workflow }} diff --git a/.github/workflows/packaging_sdist.yml b/.github/workflows/packaging_sdist.yml index b6558744..fb45b366 100644 --- a/.github/workflows/packaging_sdist.yml +++ b/.github/workflows/packaging_sdist.yml @@ -59,7 +59,7 @@ jobs: uses: astral-sh/setup-uv@v7 with: version: "0.9.0" - python-version: 3.11 + python-version: 3.12 - name: Build sdist run: uv build --sdist diff --git a/.github/workflows/packaging_wheels.yml b/.github/workflows/packaging_wheels.yml index 23a16af7..469e252e 100644 --- a/.github/workflows/packaging_wheels.yml +++ b/.github/workflows/packaging_wheels.yml @@ -30,7 +30,7 @@ jobs: strategy: fail-fast: false matrix: - python: [ cp39, cp310, cp311, cp312, cp313, cp314 ] + python: [ cp310, cp311, cp312, cp313, cp314 ] platform: - { os: windows-2025, arch: amd64, cibw_system: win } - { os: windows-11-arm, arch: ARM64, cibw_system: win } # cibw requires ARM64 to be uppercase @@ -47,7 +47,6 @@ jobs: - { minimal: true, python: cp312 } - { minimal: true, python: cp313 } - { minimal: true, platform: { arch: universal2 } } - - { python: cp39, platform: { os: windows-11-arm, arch: ARM64 } } # too many dependency problems for win arm64 - { python: cp310, platform: { os: windows-11-arm, arch: ARM64 } } # too many dependency problems for win arm64 runs-on: ${{ matrix.platform.os }} env: diff --git a/.github/workflows/targeted_test.yml b/.github/workflows/targeted_test.yml index 13ae9566..d1a828de 100644 --- a/.github/workflows/targeted_test.yml +++ b/.github/workflows/targeted_test.yml @@ -19,7 +19,6 @@ on: required: true type: choice options: - - '3.9' - '3.10' - '3.11' - '3.12' diff --git a/LICENSE b/LICENSE index 4e1fbb76..2719c9a2 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright 2018-2025 Stichting DuckDB Foundation +Copyright 2018-2026 Stichting DuckDB Foundation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: diff --git a/pyproject.toml b/pyproject.toml index 4de926d9..1a478f7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ dynamic = ["version"] description = "DuckDB in-process database" readme = "README.md" keywords = ["DuckDB", "Database", "SQL", "OLAP"] -requires-python = ">=3.9.0" +requires-python = ">=3.10.0" classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", @@ -25,7 +25,6 @@ classifiers = [ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -176,12 +175,12 @@ exclude = [ # # This section has dependency groups for testing and development. Tread carefully, the current setup makes sure that # test dependencies can be installed on as many platforms we build wheel for. Especially picky are: -# - tensorflow: we can only run tests on cp39-cp311, for osx there is no tensorflow-cpu, for windows we need +# - tensorflow: we can only run tests on cp310-cp311, for osx there is no tensorflow-cpu, for windows we need # tensorflow-cpu-aws and there is no distribution availalbe for Linux aarch64. # - torch: since we can't use gpu acceleration, we need to rely on torch-cpu, which isn't available on pypi. We use # `tool.uv.index` and `tool.uv.sources` to make sure the official pytorch index is used. Even there, we don't # have a wheel available for x86_64 OSX + cp313. -# - numpy: tensorflow doesn't play nice with numpy>2 so for every platform that can run tensorflow (cp39-cp311) we use +# - numpy: tensorflow doesn't play nice with numpy>2 so for every platform that can run tensorflow (cp310-cp311) we use # numpy<2. numpy<2 has no wheels for cp31[2|3], meaning an sdist will be used. However, on Windows amd64 + # cp313 this results in a segfault / access violation. To get around this, we install numpy>=2 on all >=cp312 # platforms. Then for windows arm64, for which there is no tensorflow, we only allow numpy>=2.3 because that @@ -195,20 +194,20 @@ default-groups = ["dev"] # build wheels for. # See https://docs.astral.sh/uv/concepts/resolution/#universal-resolution environments = [ # no need to resolve packages beyond these platforms with uv... 
- "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'arm64'", - "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'x86_64'", - "python_version >= '3.9' and sys_platform == 'win32' and platform_machine == 'AMD64'", + "python_version >= '3.10' and sys_platform == 'darwin' and platform_machine == 'arm64'", + "python_version >= '3.10' and sys_platform == 'darwin' and platform_machine == 'x86_64'", + "python_version >= '3.10' and sys_platform == 'win32' and platform_machine == 'AMD64'", "python_version >= '3.11' and sys_platform == 'win32' and platform_machine == 'ARM64'", - "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'x86_64'", - "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'aarch64'", + "python_version >= '3.10' and sys_platform == 'linux' and platform_machine == 'x86_64'", + "python_version >= '3.10' and sys_platform == 'linux' and platform_machine == 'aarch64'", ] required-environments = [ # ... but do always resolve for all of them - "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'arm64'", - "python_version >= '3.9' and sys_platform == 'darwin' and platform_machine == 'x86_64'", - "python_version >= '3.9' and sys_platform == 'win32' and platform_machine == 'AMD64'", + "python_version >= '3.10' and sys_platform == 'darwin' and platform_machine == 'arm64'", + "python_version >= '3.10' and sys_platform == 'darwin' and platform_machine == 'x86_64'", + "python_version >= '3.10' and sys_platform == 'win32' and platform_machine == 'AMD64'", "python_version >= '3.11' and sys_platform == 'win32' and platform_machine == 'ARM64'", - "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'x86_64'", - "python_version >= '3.9' and sys_platform == 'linux' and platform_machine == 'aarch64'", + "python_version >= '3.10' and sys_platform == 'linux' and platform_machine == 'x86_64'", + "python_version >= '3.10' and sys_platform == 'linux' and platform_machine == 'aarch64'", ] # We just need pytorch for tests, wihtout GPU acceleration. PyPI doesn't host a cpu-only version for Linux, so we have @@ -330,7 +329,7 @@ packages = ["duckdb", "_duckdb"] strict = true warn_unreachable = true pretty = true -python_version = "3.9" +python_version = "3.10" exclude = [ "duckdb/experimental/", # not checking the pyspark API "duckdb/query_graph/", # old and unmaintained (should probably remove) @@ -363,7 +362,7 @@ source = ["duckdb"] [tool.ruff] line-length = 120 indent-width = 4 -target-version = "py39" +target-version = "py310" fix = true exclude = ['external/duckdb', 'sqllogic'] diff --git a/tests/fast/test_filesystem.py b/tests/fast/test_filesystem.py index f9f08266..121bb441 100644 --- a/tests/fast/test_filesystem.py +++ b/tests/fast/test_filesystem.py @@ -1,8 +1,8 @@ import logging import sys +from collections.abc import Callable from pathlib import Path, PurePosixPath from shutil import copyfileobj -from typing import Callable import pytest @@ -56,10 +56,12 @@ def add_file(fs, filename=FILENAME): class TestPythonFilesystem: + @pytest.mark.xfail(reason="Unregister support was removed. Should be added back before release.") def test_unregister_non_existent_filesystem(self, duckdb_cursor: DuckDBPyConnection): with pytest.raises(InvalidInputException): duckdb_cursor.unregister_filesystem("fake") + @pytest.mark.xfail(reason="Unregister support was removed. 
Should be added back before release.") def test_memory_filesystem(self, duckdb_cursor: DuckDBPyConnection, memory: fsspec.AbstractFileSystem): duckdb_cursor.register_filesystem(memory) diff --git a/tests/fast/test_replacement_scan.py b/tests/fast/test_replacement_scan.py index ff1c423f..2b195ab9 100644 --- a/tests/fast/test_replacement_scan.py +++ b/tests/fast/test_replacement_scan.py @@ -314,7 +314,6 @@ def test_cte_with_joins(self, duckdb_cursor): res = rel.fetchall() assert res == [(2, 2, 2)] - @pytest.mark.xfail(reason="Bug in DuckDB core (MRE at #19154)") def test_same_name_cte(self, duckdb_cursor): query = """ WITH df AS ( From 05989c4e9bae40e56835b0f840b2edbc64487d4d Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 10:43:19 +0100 Subject: [PATCH 32/37] bump python version for code quality workflow --- .github/workflows/code_quality.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml index b6f1d1c2..575f6f5b 100644 --- a/.github/workflows/code_quality.yml +++ b/.github/workflows/code_quality.yml @@ -32,7 +32,7 @@ jobs: uses: astral-sh/setup-uv@v7 with: version: "0.9.0" - python-version: 3.10 + python-version: "3.12" - name: pre-commit (cache) uses: actions/cache@v4 From 1acece4666f52bb5ccc1822274e0c0b9791299cb Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 11:38:49 +0100 Subject: [PATCH 33/37] auto fixes --- adbc_driver_duckdb/__init__.py | 3 +- adbc_driver_duckdb/dbapi.py | 4 +- duckdb/bytes_io_wrapper.py | 6 +- duckdb/experimental/spark/_globals.py | 3 +- duckdb/experimental/spark/_typing.py | 8 +- duckdb/experimental/spark/conf.py | 6 +- duckdb/experimental/spark/context.py | 8 +- .../spark/errors/exceptions/base.py | 12 +- duckdb/experimental/spark/exception.py | 6 +- duckdb/experimental/spark/sql/_typing.py | 18 +- duckdb/experimental/spark/sql/catalog.py | 18 +- duckdb/experimental/spark/sql/column.py | 11 +- duckdb/experimental/spark/sql/conf.py | 4 +- duckdb/experimental/spark/sql/dataframe.py | 34 ++-- duckdb/experimental/spark/sql/functions.py | 39 ++-- duckdb/experimental/spark/sql/group.py | 7 +- duckdb/experimental/spark/sql/readwriter.py | 170 +++++++++--------- duckdb/experimental/spark/sql/session.py | 22 +-- duckdb/experimental/spark/sql/streaming.py | 12 +- duckdb/experimental/spark/sql/type_utils.py | 4 +- duckdb/experimental/spark/sql/types.py | 82 ++++----- duckdb/experimental/spark/sql/udf.py | 11 +- duckdb/query_graph/__main__.py | 5 +- duckdb_packaging/_versioning.py | 9 +- duckdb_packaging/build_backend.py | 9 +- duckdb_packaging/pypi_cleanup.py | 11 +- pyproject.toml | 2 + scripts/generate_import_cache_json.py | 4 +- scripts/get_cpp_methods.py | 2 +- tests/conftest.py | 3 +- tests/fast/adbc/test_statement_bind.py | 2 +- .../fast/arrow/test_arrow_run_end_encoding.py | 6 +- tests/fast/pandas/test_pandas_types.py | 4 +- .../relational_api/test_rapi_aggregations.py | 132 +++++++------- .../fast/relational_api/test_rapi_windows.py | 102 +++++------ .../spark/test_spark_functions_numeric.py | 2 +- tests/fast/test_map.py | 7 +- tests/fast/test_type.py | 19 +- tests/fast/udf/test_null_filtering.py | 6 +- 39 files changed, 405 insertions(+), 408 deletions(-) diff --git a/adbc_driver_duckdb/__init__.py b/adbc_driver_duckdb/__init__.py index f925ea9e..c2777c90 100644 --- a/adbc_driver_duckdb/__init__.py +++ b/adbc_driver_duckdb/__init__.py @@ -20,7 +20,6 @@ import enum import functools import importlib.util -import typing import 
adbc_driver_manager @@ -32,7 +31,7 @@ class StatementOptions(enum.Enum): BATCH_ROWS = "adbc.duckdb.query.batch_rows" -def connect(path: typing.Optional[str] = None) -> adbc_driver_manager.AdbcDatabase: +def connect(path: str | None = None) -> adbc_driver_manager.AdbcDatabase: """Create a low level ADBC connection to DuckDB.""" if path is None: return adbc_driver_manager.AdbcDatabase(driver=driver_path(), entrypoint="duckdb_adbc_init") diff --git a/adbc_driver_duckdb/dbapi.py b/adbc_driver_duckdb/dbapi.py index 5d0a8702..377f86a0 100644 --- a/adbc_driver_duckdb/dbapi.py +++ b/adbc_driver_duckdb/dbapi.py @@ -17,8 +17,6 @@ """DBAPI 2.0-compatible facade for the ADBC DuckDB driver.""" -import typing - import adbc_driver_manager import adbc_driver_manager.dbapi @@ -91,7 +89,7 @@ # Functions -def connect(path: typing.Optional[str] = None, **kwargs) -> "Connection": +def connect(path: str | None = None, **kwargs) -> "Connection": """Connect to DuckDB via ADBC.""" db = None conn = None diff --git a/duckdb/bytes_io_wrapper.py b/duckdb/bytes_io_wrapper.py index 722c7cb4..d0ef78bf 100644 --- a/duckdb/bytes_io_wrapper.py +++ b/duckdb/bytes_io_wrapper.py @@ -34,7 +34,7 @@ """ from io import StringIO, TextIOBase -from typing import Any, Union +from typing import Any class BytesIOWrapper: @@ -43,7 +43,7 @@ class BytesIOWrapper: Created for compat with pyarrow read_csv. """ - def __init__(self, buffer: Union[StringIO, TextIOBase], encoding: str = "utf-8") -> None: # noqa: D107 + def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8") -> None: # noqa: D107 self.buffer = buffer self.encoding = encoding # Because a character can be represented by more than 1 byte, @@ -55,7 +55,7 @@ def __init__(self, buffer: Union[StringIO, TextIOBase], encoding: str = "utf-8") def __getattr__(self, attr: str) -> Any: # noqa: D105, ANN401 return getattr(self.buffer, attr) - def read(self, n: Union[int, None] = -1) -> bytes: # noqa: D102 + def read(self, n: int | None = -1) -> bytes: # noqa: D102 assert self.buffer is not None bytestring = self.buffer.read(n).encode(self.encoding) # When n=-1/n greater than remaining bytes: Read entire file/rest of file diff --git a/duckdb/experimental/spark/_globals.py b/duckdb/experimental/spark/_globals.py index 0625a140..3dd7232f 100644 --- a/duckdb/experimental/spark/_globals.py +++ b/duckdb/experimental/spark/_globals.py @@ -33,6 +33,7 @@ def foo(arg=pyducdkb.spark._NoValue): __ALL__ = ["_NoValue"] +from typing import Self # Disallow reloading this module so as to preserve the identities of the # classes defined here. @@ -54,7 +55,7 @@ class _NoValueType: __instance = None - def __new__(cls) -> "_NoValueType": + def __new__(cls) -> Self: # ensure that only one instance exists if not cls.__instance: cls.__instance = super().__new__(cls) diff --git a/duckdb/experimental/spark/_typing.py b/duckdb/experimental/spark/_typing.py index 1ed78ea8..de7f2fff 100644 --- a/duckdb/experimental/spark/_typing.py +++ b/duckdb/experimental/spark/_typing.py @@ -16,16 +16,16 @@ # specific language governing permissions and limitations # under the License. 
-from collections.abc import Iterable, Sized -from typing import Callable, TypeVar, Union +from collections.abc import Callable, Iterable, Sized +from typing import Literal, TypeVar from numpy import float32, float64, int32, int64, ndarray -from typing_extensions import Literal, Protocol, Self +from typing_extensions import Protocol, Self F = TypeVar("F", bound=Callable) T_co = TypeVar("T_co", covariant=True) -PrimitiveType = Union[bool, float, int, str] +PrimitiveType = bool | float | int | str NonUDFType = Literal[0] diff --git a/duckdb/experimental/spark/conf.py b/duckdb/experimental/spark/conf.py index 974115d6..9b2cc0eb 100644 --- a/duckdb/experimental/spark/conf.py +++ b/duckdb/experimental/spark/conf.py @@ -1,5 +1,3 @@ -from typing import Optional # noqa: D100 - from duckdb.experimental.spark.exception import ContributionsAcceptedError @@ -10,7 +8,7 @@ def __init__(self) -> None: # noqa: D107 def contains(self, key: str) -> bool: # noqa: D102 raise ContributionsAcceptedError - def get(self, key: str, defaultValue: Optional[str] = None) -> Optional[str]: # noqa: D102 + def get(self, key: str, defaultValue: str | None = None) -> str | None: # noqa: D102 raise ContributionsAcceptedError def getAll(self) -> list[tuple[str, str]]: # noqa: D102 @@ -26,7 +24,7 @@ def setAppName(self, value: str) -> "SparkConf": # noqa: D102 raise ContributionsAcceptedError def setExecutorEnv( # noqa: D102 - self, key: Optional[str] = None, value: Optional[str] = None, pairs: Optional[list[tuple[str, str]]] = None + self, key: str | None = None, value: str | None = None, pairs: list[tuple[str, str]] | None = None ) -> "SparkConf": raise ContributionsAcceptedError diff --git a/duckdb/experimental/spark/context.py b/duckdb/experimental/spark/context.py index c78bde65..311153b2 100644 --- a/duckdb/experimental/spark/context.py +++ b/duckdb/experimental/spark/context.py @@ -1,5 +1,3 @@ -from typing import Optional # noqa: D100 - import duckdb from duckdb import DuckDBPyConnection from duckdb.experimental.spark.conf import SparkConf @@ -20,7 +18,7 @@ def stop(self) -> None: # noqa: D102 self._connection.close() @classmethod - def getOrCreate(cls, conf: Optional[SparkConf] = None) -> "SparkContext": # noqa: D102 + def getOrCreate(cls, conf: SparkConf | None = None) -> "SparkContext": # noqa: D102 raise ContributionsAcceptedError @classmethod @@ -93,13 +91,13 @@ def dump_profiles(self, path: str) -> None: # noqa: D102 # def emptyRDD(self) -> duckdb.experimental.spark.rdd.RDD[typing.Any]: # pass - def getCheckpointDir(self) -> Optional[str]: # noqa: D102 + def getCheckpointDir(self) -> str | None: # noqa: D102 raise ContributionsAcceptedError def getConf(self) -> SparkConf: # noqa: D102 raise ContributionsAcceptedError - def getLocalProperty(self, key: str) -> Optional[str]: # noqa: D102 + def getLocalProperty(self, key: str) -> str | None: # noqa: D102 raise ContributionsAcceptedError # def hadoopFile(self, path: str, inputFormatClass: str, keyClass: str, valueClass: str, diff --git a/duckdb/experimental/spark/errors/exceptions/base.py b/duckdb/experimental/spark/errors/exceptions/base.py index 2eae2a19..9a60512f 100644 --- a/duckdb/experimental/spark/errors/exceptions/base.py +++ b/duckdb/experimental/spark/errors/exceptions/base.py @@ -1,4 +1,4 @@ -from typing import Optional, cast # noqa: D100 +from typing import cast from ..utils import ErrorClassesReader @@ -8,11 +8,11 @@ class PySparkException(Exception): def __init__( # noqa: D107 self, - message: Optional[str] = None, + message: str | None = None, # The 
error class, decides the message format, must be one of the valid options listed in 'error_classes.py' - error_class: Optional[str] = None, + error_class: str | None = None, # The dictionary listing the arguments specified in the message (or the error_class) - message_parameters: Optional[dict[str, str]] = None, + message_parameters: dict[str, str] | None = None, ) -> None: # `message` vs `error_class` & `message_parameters` are mutually exclusive. assert (message is not None and (error_class is None and message_parameters is None)) or ( @@ -31,7 +31,7 @@ def __init__( # noqa: D107 self.error_class = error_class self.message_parameters = message_parameters - def getErrorClass(self) -> Optional[str]: + def getErrorClass(self) -> str | None: """Returns an error class as a string. .. versionadded:: 3.4.0 @@ -43,7 +43,7 @@ def getErrorClass(self) -> Optional[str]: """ return self.error_class - def getMessageParameters(self) -> Optional[dict[str, str]]: + def getMessageParameters(self) -> dict[str, str] | None: """Returns a message parameters as a dictionary. .. versionadded:: 3.4.0 diff --git a/duckdb/experimental/spark/exception.py b/duckdb/experimental/spark/exception.py index c3a7c1b6..440b7819 100644 --- a/duckdb/experimental/spark/exception.py +++ b/duckdb/experimental/spark/exception.py @@ -1,14 +1,10 @@ -# ruff: noqa: D100 -from typing import Optional - - class ContributionsAcceptedError(NotImplementedError): """This method is not planned to be implemented, if you would like to implement this method or show your interest in this method to other members of the community, feel free to open up a PR or a Discussion over on https://github.com/duckdb/duckdb. """ # noqa: D205 - def __init__(self, message: Optional[str] = None) -> None: # noqa: D107 + def __init__(self, message: str | None = None) -> None: # noqa: D107 doc = self.__class__.__doc__ if message: doc = message + "\n" + doc diff --git a/duckdb/experimental/spark/sql/_typing.py b/duckdb/experimental/spark/sql/_typing.py index caf0058c..cf0b15e1 100644 --- a/duckdb/experimental/spark/sql/_typing.py +++ b/duckdb/experimental/spark/sql/_typing.py @@ -16,18 +16,18 @@ # specific language governing permissions and limitations # under the License. +from collections.abc import Callable from typing import ( Any, - Callable, - Optional, TypeVar, - Union, ) try: from typing import Literal, Protocol except ImportError: - from typing_extensions import Literal, Protocol + from typing import Literal + + from typing_extensions import Protocol import datetime import decimal @@ -36,14 +36,14 @@ from . 
import types from .column import Column -ColumnOrName = Union[Column, str] +ColumnOrName = Column | str ColumnOrName_ = TypeVar("ColumnOrName_", bound=ColumnOrName) DecimalLiteral = decimal.Decimal -DateTimeLiteral = Union[datetime.datetime, datetime.date] +DateTimeLiteral = datetime.datetime | datetime.date LiteralType = PrimitiveType -AtomicDataTypeOrString = Union[types.AtomicType, str] -DataTypeOrString = Union[types.DataType, str] -OptionalPrimitiveType = Optional[PrimitiveType] +AtomicDataTypeOrString = types.AtomicType | str +DataTypeOrString = types.DataType | str +OptionalPrimitiveType = PrimitiveType | None AtomicValue = TypeVar( "AtomicValue", diff --git a/duckdb/experimental/spark/sql/catalog.py b/duckdb/experimental/spark/sql/catalog.py index 70fc7b18..f43bab59 100644 --- a/duckdb/experimental/spark/sql/catalog.py +++ b/duckdb/experimental/spark/sql/catalog.py @@ -1,25 +1,25 @@ -from typing import NamedTuple, Optional, Union # noqa: D100 +from typing import NamedTuple from .session import SparkSession class Database(NamedTuple): # noqa: D101 name: str - description: Optional[str] + description: str | None locationUri: str class Table(NamedTuple): # noqa: D101 name: str - database: Optional[str] - description: Optional[str] + database: str | None + description: str | None tableType: str isTemporary: bool class Column(NamedTuple): # noqa: D101 name: str - description: Optional[str] + description: str | None dataType: str nullable: bool isPartition: bool @@ -28,7 +28,7 @@ class Column(NamedTuple): # noqa: D101 class Function(NamedTuple): # noqa: D101 name: str - description: Optional[str] + description: str | None className: str isTemporary: bool @@ -55,7 +55,7 @@ def transform_to_table(x: list[str]) -> Table: tables = [transform_to_table(x) for x in res] return tables - def listColumns(self, tableName: str, dbName: Optional[str] = None) -> list[Column]: # noqa: D102 + def listColumns(self, tableName: str, dbName: str | None = None) -> list[Column]: # noqa: D102 query = f""" select column_name, data_type, is_nullable from duckdb_columns() where table_name = '{tableName}' """ @@ -63,13 +63,13 @@ def listColumns(self, tableName: str, dbName: Optional[str] = None) -> list[Colu query += f" and database_name = '{dbName}'" res = self._session.conn.sql(query).fetchall() - def transform_to_column(x: list[Union[str, bool]]) -> Column: + def transform_to_column(x: list[str | bool]) -> Column: return Column(name=x[0], description=None, dataType=x[1], nullable=x[2], isPartition=False, isBucket=False) columns = [transform_to_column(x) for x in res] return columns - def listFunctions(self, dbName: Optional[str] = None) -> list[Function]: # noqa: D102 + def listFunctions(self, dbName: str | None = None) -> list[Function]: # noqa: D102 raise NotImplementedError def setCurrentDatabase(self, dbName: str) -> None: # noqa: D102 diff --git a/duckdb/experimental/spark/sql/column.py b/duckdb/experimental/spark/sql/column.py index 661e4da7..e013a56d 100644 --- a/duckdb/experimental/spark/sql/column.py +++ b/duckdb/experimental/spark/sql/column.py @@ -1,5 +1,8 @@ -from collections.abc import Iterable # noqa: D100 -from typing import TYPE_CHECKING, Any, Callable, Union, cast +from collections.abc import ( + Callable, + Iterable, +) +from typing import TYPE_CHECKING, Any, Union, cast from ..exception import ContributionsAcceptedError from .types import DataType @@ -222,11 +225,11 @@ def otherwise(self, value: Union["Column", str]) -> "Column": # noqa: D102 expr = self.expr.otherwise(v) return Column(expr) 
- def cast(self, dataType: Union[DataType, str]) -> "Column": # noqa: D102 + def cast(self, dataType: DataType | str) -> "Column": # noqa: D102 internal_type = DuckDBPyType(dataType) if isinstance(dataType, str) else dataType.duckdb_type return Column(self.expr.cast(internal_type)) - def isin(self, *cols: Union[Iterable[Union["Column", str]], Union["Column", str]]) -> "Column": # noqa: D102 + def isin(self, *cols: Iterable[Union["Column", str]] | Union["Column", str]) -> "Column": # noqa: D102 if len(cols) == 1 and isinstance(cols[0], (list, set)): # Only one argument supplied, it's a list cols = cast("tuple", cols[0]) diff --git a/duckdb/experimental/spark/sql/conf.py b/duckdb/experimental/spark/sql/conf.py index e44f2566..75a77899 100644 --- a/duckdb/experimental/spark/sql/conf.py +++ b/duckdb/experimental/spark/sql/conf.py @@ -1,5 +1,3 @@ -from typing import Optional, Union # noqa: D100 - from duckdb import DuckDBPyConnection from duckdb.experimental.spark._globals import _NoValue, _NoValueType @@ -17,7 +15,7 @@ def isModifiable(self, key: str) -> bool: # noqa: D102 def unset(self, key: str) -> None: # noqa: D102 raise NotImplementedError - def get(self, key: str, default: Union[Optional[str], _NoValueType] = _NoValue) -> str: # noqa: D102 + def get(self, key: str, default: str | None | _NoValueType = _NoValue) -> str: # noqa: D102 raise NotImplementedError diff --git a/duckdb/experimental/spark/sql/dataframe.py b/duckdb/experimental/spark/sql/dataframe.py index 066cad09..83b2dd09 100644 --- a/duckdb/experimental/spark/sql/dataframe.py +++ b/duckdb/experimental/spark/sql/dataframe.py @@ -1,11 +1,10 @@ -import uuid # noqa: D100 +import uuid +from collections.abc import Callable from functools import reduce from keyword import iskeyword from typing import ( TYPE_CHECKING, Any, - Callable, - Optional, Union, cast, overload, @@ -206,7 +205,7 @@ def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame": # In case anything is remaining, these are new columns # that we need to add to the DataFrame - for col_name, col in zip(column_names, columns): + for col_name, col in zip(column_names, columns, strict=False): cols.append(col.expr.alias(col_name)) rel = self.relation.select(*cols) @@ -341,7 +340,7 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) ) return result - def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame": # noqa: ANN401 + def sort(self, *cols: str | Column | list[str | Column], **kwargs: Any) -> "DataFrame": # noqa: ANN401 """Returns a new :class:`DataFrame` sorted by the specified column(s). 
Parameters @@ -458,7 +457,7 @@ def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: An if not ascending: columns = [c.desc() for c in columns] elif isinstance(ascending, list): - columns = [c if asc else c.desc() for asc, c in zip(ascending, columns)] + columns = [c if asc else c.desc() for asc, c in zip(ascending, columns, strict=False)] else: raise PySparkTypeError( error_class="NOT_BOOL_OR_LIST", @@ -471,7 +470,7 @@ def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: An orderBy = sort - def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]: # noqa: D102 + def head(self, n: int | None = None) -> Row | None | list[Row]: # noqa: D102 if n is None: rs = self.head(1) return rs[0] if rs else None @@ -597,8 +596,8 @@ def __dir__(self) -> list[str]: # noqa: D105 def join( self, other: "DataFrame", - on: Optional[Union[str, list[str], Column, list[Column]]] = None, - how: Optional[str] = None, + on: str | list[str] | Column | list[Column] | None = None, + how: str | None = None, ) -> "DataFrame": """Joins with another :class:`DataFrame`, using the given join expression. @@ -871,12 +870,12 @@ def schema(self) -> StructType: return self._schema @overload - def __getitem__(self, item: Union[int, str]) -> Column: ... + def __getitem__(self, item: int | str) -> Column: ... @overload - def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ... + def __getitem__(self, item: Column | list | tuple) -> "DataFrame": ... - def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]: + def __getitem__(self, item: int | str | Column | list | tuple) -> Union[Column, "DataFrame"]: """Returns the column as a :class:`Column`. Examples: @@ -919,7 +918,7 @@ def __getattr__(self, name: str) -> Column: def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ... @overload - def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ... # noqa: PYI063 + def groupBy(self, __cols: list[Column] | list[str]) -> "GroupedData": ... # noqa: PYI063 def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] """Groups the :class:`DataFrame` using the specified columns, @@ -997,7 +996,7 @@ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc] def write(self) -> DataFrameWriter: # noqa: D102 return DataFrameWriter(self) - def printSchema(self, level: Optional[int] = None) -> None: + def printSchema(self, level: int | None = None) -> None: """Prints out the schema in the tree format. Parameters @@ -1262,7 +1261,7 @@ def exceptAll(self, other: "DataFrame") -> "DataFrame": """ # noqa: D205 return DataFrame(self.relation.except_(other.relation), self.session) - def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame": + def dropDuplicates(self, subset: list[str] | None = None) -> "DataFrame": """Return a new :class:`DataFrame` with duplicate rows removed, optionally only considering certain columns. 
@@ -1371,7 +1370,8 @@ def _cast_types(self, *types) -> "DataFrame": assert types_count == len(existing_columns) cast_expressions = [ - f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types) + f"{existing}::{target_type} as {existing}" + for existing, target_type in zip(existing_columns, types, strict=False) ] cast_expressions = ", ".join(cast_expressions) new_rel = self.relation.project(cast_expressions) @@ -1384,7 +1384,7 @@ def toDF(self, *cols) -> "DataFrame": # noqa: D102 raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match") existing_columns = [ColumnExpression(x) for x in existing_columns] - projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)] + projections = [existing.alias(new) for existing, new in zip(existing_columns, cols, strict=False)] new_rel = self.relation.project(*projections) return DataFrame(new_rel, self.session) diff --git a/duckdb/experimental/spark/sql/functions.py b/duckdb/experimental/spark/sql/functions.py index 49c475a4..71ff8c59 100644 --- a/duckdb/experimental/spark/sql/functions.py +++ b/duckdb/experimental/spark/sql/functions.py @@ -1,5 +1,6 @@ -import warnings # noqa: D100 -from typing import TYPE_CHECKING, Any, Callable, Optional, Union, overload +import warnings +from collections.abc import Callable +from typing import TYPE_CHECKING, Any, Optional, Union, overload from duckdb import ( CaseExpression, @@ -109,7 +110,7 @@ def ucase(str: "ColumnOrName") -> Column: return upper(str) -def when(condition: "Column", value: Union[Column, str]) -> Column: # noqa: D103 +def when(condition: "Column", value: Column | str) -> Column: # noqa: D103 if not isinstance(condition, Column): msg = "condition should be a Column" raise TypeError(msg) @@ -118,7 +119,7 @@ def when(condition: "Column", value: Union[Column, str]) -> Column: # noqa: D10 return Column(expr) -def _inner_expr_or_val(val: Union[Column, str]) -> Union[Column, str]: +def _inner_expr_or_val(val: Column | str) -> Column | str: return val.expr if isinstance(val, Column) else val @@ -126,7 +127,7 @@ def struct(*cols: Column) -> Column: # noqa: D103 return Column(FunctionExpression("struct_pack", *[_inner_expr_or_val(x) for x in cols])) -def array(*cols: Union["ColumnOrName", Union[list["ColumnOrName"], tuple["ColumnOrName", ...]]]) -> Column: +def array(*cols: Union["ColumnOrName", list["ColumnOrName"] | tuple["ColumnOrName", ...]]) -> Column: r"""Creates a new array column. .. versionadded:: 1.4.0 @@ -449,7 +450,7 @@ def right(str: "ColumnOrName", len: "ColumnOrName") -> Column: ) -def levenshtein(left: "ColumnOrName", right: "ColumnOrName", threshold: Optional[int] = None) -> Column: +def levenshtein(left: "ColumnOrName", right: "ColumnOrName", threshold: int | None = None) -> Column: """Computes the Levenshtein distance of the two given strings. .. versionadded:: 1.5.0 @@ -766,7 +767,7 @@ def collect_list(col: "ColumnOrName") -> Column: return array_agg(col) -def array_append(col: "ColumnOrName", value: Union[Column, str]) -> Column: +def array_append(col: "ColumnOrName", value: Column | str) -> Column: """Collection function: returns an array of the elements in col1 along with the added element in col2 at the last of the array. 
@@ -800,7 +801,7 @@ def array_append(col: "ColumnOrName", value: Union[Column, str]) -> Column: return _invoke_function("list_append", _to_column_expr(col), _get_expr(value)) -def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Union[Column, str]) -> Column: +def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Column | str) -> Column: """Collection function: adds an item into a given array at a specified array index. Array indices start at 1, or start from the end if index is negative. Index above array size appends the array, or prepends the array if index is negative, @@ -893,7 +894,7 @@ def array_insert(arr: "ColumnOrName", pos: Union["ColumnOrName", int], value: Un ) -def array_contains(col: "ColumnOrName", value: Union[Column, str]) -> Column: +def array_contains(col: "ColumnOrName", value: Column | str) -> Column: """Collection function: returns null if the array is null, true if the array contains the given value, and false otherwise. @@ -1373,7 +1374,7 @@ def count(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("count", col) -def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> Column: +def approx_count_distinct(col: "ColumnOrName", rsd: float | None = None) -> Column: """Aggregate function: returns a new :class:`~pyspark.sql.Column` for approximate distinct count of column `col`. @@ -1410,7 +1411,7 @@ def approx_count_distinct(col: "ColumnOrName", rsd: Optional[float] = None) -> C return _invoke_function_over_columns("approx_count_distinct", col) -def approxCountDistinct(col: "ColumnOrName", rsd: Optional[float] = None) -> Column: +def approxCountDistinct(col: "ColumnOrName", rsd: float | None = None) -> Column: """.. versionadded:: 1.3.0. .. versionchanged:: 3.4.0 @@ -1433,7 +1434,7 @@ def transform(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -> Col def transform( col: "ColumnOrName", - f: Union[Callable[[Column], Column], Callable[[Column, Column], Column]], + f: Callable[[Column], Column] | Callable[[Column, Column], Column], ) -> Column: """Returns an array of elements after applying a transformation to each element in the input array. @@ -2255,7 +2256,7 @@ def product(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("product", col) -def rand(seed: Optional[int] = None) -> Column: +def rand(seed: int | None = None) -> Column: """Generates a random column with independent and identically distributed (i.i.d.) samples uniformly distributed in [0.0, 1.0). @@ -2419,7 +2420,7 @@ def regexp_extract(str: "ColumnOrName", pattern: str, idx: int) -> Column: ) -def regexp_extract_all(str: "ColumnOrName", regexp: "ColumnOrName", idx: Optional[Union[int, Column]] = None) -> Column: +def regexp_extract_all(str: "ColumnOrName", regexp: "ColumnOrName", idx: int | Column | None = None) -> Column: r"""Extract all strings in the `str` that match the Java regex `regexp` and corresponding to the regex group index. @@ -4968,7 +4969,7 @@ def add_months(start: "ColumnOrName", months: Union["ColumnOrName", int]) -> Col return _invoke_function("date_add", _to_column_expr(start), FunctionExpression("to_months", months)).cast("date") -def array_join(col: "ColumnOrName", delimiter: str, null_replacement: Optional[str] = None) -> Column: +def array_join(col: "ColumnOrName", delimiter: str, null_replacement: str | None = None) -> Column: """Concatenates the elements of `column` using the `delimiter`. 
Null values are replaced with `null_replacement` if set, otherwise they are ignored. @@ -5136,7 +5137,7 @@ def array_size(col: "ColumnOrName") -> Column: return _invoke_function_over_columns("len", col) -def array_sort(col: "ColumnOrName", comparator: Optional[Callable[[Column, Column], Column]] = None) -> Column: +def array_sort(col: "ColumnOrName", comparator: Callable[[Column, Column], Column] | None = None) -> Column: """Collection function: sorts the input array in ascending order. The elements of the input array must be orderable. Null elements will be placed at the end of the returned array. @@ -5592,7 +5593,7 @@ def zeroifnull(col: "ColumnOrName") -> Column: return coalesce(col, lit(0)) -def _to_date_or_timestamp(col: "ColumnOrName", spark_datatype: _types.DataType, format: Optional[str] = None) -> Column: +def _to_date_or_timestamp(col: "ColumnOrName", spark_datatype: _types.DataType, format: str | None = None) -> Column: if format is not None: raise ContributionsAcceptedError( "format is not yet supported as DuckDB and PySpark use a different way of specifying them." @@ -5601,7 +5602,7 @@ def _to_date_or_timestamp(col: "ColumnOrName", spark_datatype: _types.DataType, return Column(_to_column_expr(col)).cast(spark_datatype) -def to_date(col: "ColumnOrName", format: Optional[str] = None) -> Column: +def to_date(col: "ColumnOrName", format: str | None = None) -> Column: """Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType` using the optionally specified format. Specify formats according to `datetime pattern`_. By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format @@ -5639,7 +5640,7 @@ def to_date(col: "ColumnOrName", format: Optional[str] = None) -> Column: return _to_date_or_timestamp(col, _types.DateType(), format) -def to_timestamp(col: "ColumnOrName", format: Optional[str] = None) -> Column: +def to_timestamp(col: "ColumnOrName", format: str | None = None) -> Column: """Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType` using the optionally specified format. Specify formats according to `datetime pattern`_. By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format diff --git a/duckdb/experimental/spark/sql/group.py b/duckdb/experimental/spark/sql/group.py index aa3e56d6..5f784453 100644 --- a/duckdb/experimental/spark/sql/group.py +++ b/duckdb/experimental/spark/sql/group.py @@ -1,4 +1,4 @@ -# # noqa: D100 +# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. @@ -15,7 +15,8 @@ # limitations under the License. # -from typing import TYPE_CHECKING, Callable, Union, overload +from collections.abc import Callable +from typing import TYPE_CHECKING, overload from ..exception import ContributionsAcceptedError from .column import Column @@ -319,7 +320,7 @@ def agg(self, *exprs: Column) -> DataFrame: ... @overload def agg(self, __exprs: dict[str, str]) -> DataFrame: ... # noqa: PYI063 - def agg(self, *exprs: Union[Column, dict[str, str]]) -> DataFrame: + def agg(self, *exprs: Column | dict[str, str]) -> DataFrame: """Compute aggregates and returns the result as a :class:`DataFrame`. 
The available aggregate functions can be: diff --git a/duckdb/experimental/spark/sql/readwriter.py b/duckdb/experimental/spark/sql/readwriter.py index eef99043..230d5d2a 100644 --- a/duckdb/experimental/spark/sql/readwriter.py +++ b/duckdb/experimental/spark/sql/readwriter.py @@ -1,11 +1,11 @@ -from typing import TYPE_CHECKING, Optional, Union, cast # noqa: D100 +from typing import TYPE_CHECKING, cast from ..errors import PySparkNotImplementedError, PySparkTypeError from ..exception import ContributionsAcceptedError from .types import StructType -PrimitiveType = Union[bool, float, int, str] -OptionalPrimitiveType = Optional[PrimitiveType] +PrimitiveType = bool | float | int | str +OptionalPrimitiveType = PrimitiveType | None if TYPE_CHECKING: from duckdb.experimental.spark.sql.dataframe import DataFrame @@ -23,9 +23,9 @@ def saveAsTable(self, table_name: str) -> None: # noqa: D102 def parquet( # noqa: D102 self, path: str, - mode: Optional[str] = None, - partitionBy: Union[str, list[str], None] = None, - compression: Optional[str] = None, + mode: str | None = None, + partitionBy: str | list[str] | None = None, + compression: str | None = None, ) -> None: relation = self.dataframe.relation if mode: @@ -38,23 +38,23 @@ def parquet( # noqa: D102 def csv( # noqa: D102 self, path: str, - mode: Optional[str] = None, - compression: Optional[str] = None, - sep: Optional[str] = None, - quote: Optional[str] = None, - escape: Optional[str] = None, - header: Optional[Union[bool, str]] = None, - nullValue: Optional[str] = None, - escapeQuotes: Optional[Union[bool, str]] = None, - quoteAll: Optional[Union[bool, str]] = None, - dateFormat: Optional[str] = None, - timestampFormat: Optional[str] = None, - ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None, - ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None, - charToEscapeQuoteEscaping: Optional[str] = None, - encoding: Optional[str] = None, - emptyValue: Optional[str] = None, - lineSep: Optional[str] = None, + mode: str | None = None, + compression: str | None = None, + sep: str | None = None, + quote: str | None = None, + escape: str | None = None, + header: bool | str | None = None, + nullValue: str | None = None, + escapeQuotes: bool | str | None = None, + quoteAll: bool | str | None = None, + dateFormat: str | None = None, + timestampFormat: str | None = None, + ignoreLeadingWhiteSpace: bool | str | None = None, + ignoreTrailingWhiteSpace: bool | str | None = None, + charToEscapeQuoteEscaping: str | None = None, + encoding: str | None = None, + emptyValue: str | None = None, + lineSep: str | None = None, ) -> None: if mode not in (None, "overwrite"): raise NotImplementedError @@ -92,9 +92,9 @@ def __init__(self, session: "SparkSession") -> None: # noqa: D107 def load( # noqa: D102 self, - path: Optional[Union[str, list[str]]] = None, - format: Optional[str] = None, - schema: Optional[Union[StructType, str]] = None, + path: str | list[str] | None = None, + format: str | None = None, + schema: StructType | str | None = None, **options: OptionalPrimitiveType, ) -> "DataFrame": from duckdb.experimental.spark.sql.dataframe import DataFrame @@ -129,40 +129,40 @@ def load( # noqa: D102 def csv( # noqa: D102 self, - path: Union[str, list[str]], - schema: Optional[Union[StructType, str]] = None, - sep: Optional[str] = None, - encoding: Optional[str] = None, - quote: Optional[str] = None, - escape: Optional[str] = None, - comment: Optional[str] = None, - header: Optional[Union[bool, str]] = None, - inferSchema: Optional[Union[bool, str]] = 
None, - ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None, - ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None, - nullValue: Optional[str] = None, - nanValue: Optional[str] = None, - positiveInf: Optional[str] = None, - negativeInf: Optional[str] = None, - dateFormat: Optional[str] = None, - timestampFormat: Optional[str] = None, - maxColumns: Optional[Union[int, str]] = None, - maxCharsPerColumn: Optional[Union[int, str]] = None, - maxMalformedLogPerPartition: Optional[Union[int, str]] = None, - mode: Optional[str] = None, - columnNameOfCorruptRecord: Optional[str] = None, - multiLine: Optional[Union[bool, str]] = None, - charToEscapeQuoteEscaping: Optional[str] = None, - samplingRatio: Optional[Union[float, str]] = None, - enforceSchema: Optional[Union[bool, str]] = None, - emptyValue: Optional[str] = None, - locale: Optional[str] = None, - lineSep: Optional[str] = None, - pathGlobFilter: Optional[Union[bool, str]] = None, - recursiveFileLookup: Optional[Union[bool, str]] = None, - modifiedBefore: Optional[Union[bool, str]] = None, - modifiedAfter: Optional[Union[bool, str]] = None, - unescapedQuoteHandling: Optional[str] = None, + path: str | list[str], + schema: StructType | str | None = None, + sep: str | None = None, + encoding: str | None = None, + quote: str | None = None, + escape: str | None = None, + comment: str | None = None, + header: bool | str | None = None, + inferSchema: bool | str | None = None, + ignoreLeadingWhiteSpace: bool | str | None = None, + ignoreTrailingWhiteSpace: bool | str | None = None, + nullValue: str | None = None, + nanValue: str | None = None, + positiveInf: str | None = None, + negativeInf: str | None = None, + dateFormat: str | None = None, + timestampFormat: str | None = None, + maxColumns: int | str | None = None, + maxCharsPerColumn: int | str | None = None, + maxMalformedLogPerPartition: int | str | None = None, + mode: str | None = None, + columnNameOfCorruptRecord: str | None = None, + multiLine: bool | str | None = None, + charToEscapeQuoteEscaping: str | None = None, + samplingRatio: float | str | None = None, + enforceSchema: bool | str | None = None, + emptyValue: str | None = None, + locale: str | None = None, + lineSep: str | None = None, + pathGlobFilter: bool | str | None = None, + recursiveFileLookup: bool | str | None = None, + modifiedBefore: bool | str | None = None, + modifiedAfter: bool | str | None = None, + unescapedQuoteHandling: str | None = None, ) -> "DataFrame": if not isinstance(path, str): raise NotImplementedError @@ -263,31 +263,31 @@ def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame def json( self, - path: Union[str, list[str]], - schema: Optional[Union[StructType, str]] = None, - primitivesAsString: Optional[Union[bool, str]] = None, - prefersDecimal: Optional[Union[bool, str]] = None, - allowComments: Optional[Union[bool, str]] = None, - allowUnquotedFieldNames: Optional[Union[bool, str]] = None, - allowSingleQuotes: Optional[Union[bool, str]] = None, - allowNumericLeadingZero: Optional[Union[bool, str]] = None, - allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None, - mode: Optional[str] = None, - columnNameOfCorruptRecord: Optional[str] = None, - dateFormat: Optional[str] = None, - timestampFormat: Optional[str] = None, - multiLine: Optional[Union[bool, str]] = None, - allowUnquotedControlChars: Optional[Union[bool, str]] = None, - lineSep: Optional[str] = None, - samplingRatio: Optional[Union[float, str]] = None, - dropFieldIfAllNull: 
Optional[Union[bool, str]] = None, - encoding: Optional[str] = None, - locale: Optional[str] = None, - pathGlobFilter: Optional[Union[bool, str]] = None, - recursiveFileLookup: Optional[Union[bool, str]] = None, - modifiedBefore: Optional[Union[bool, str]] = None, - modifiedAfter: Optional[Union[bool, str]] = None, - allowNonNumericNumbers: Optional[Union[bool, str]] = None, + path: str | list[str], + schema: StructType | str | None = None, + primitivesAsString: bool | str | None = None, + prefersDecimal: bool | str | None = None, + allowComments: bool | str | None = None, + allowUnquotedFieldNames: bool | str | None = None, + allowSingleQuotes: bool | str | None = None, + allowNumericLeadingZero: bool | str | None = None, + allowBackslashEscapingAnyCharacter: bool | str | None = None, + mode: str | None = None, + columnNameOfCorruptRecord: str | None = None, + dateFormat: str | None = None, + timestampFormat: str | None = None, + multiLine: bool | str | None = None, + allowUnquotedControlChars: bool | str | None = None, + lineSep: str | None = None, + samplingRatio: float | str | None = None, + dropFieldIfAllNull: bool | str | None = None, + encoding: str | None = None, + locale: str | None = None, + pathGlobFilter: bool | str | None = None, + recursiveFileLookup: bool | str | None = None, + modifiedBefore: bool | str | None = None, + modifiedAfter: bool | str | None = None, + allowNonNumericNumbers: bool | str | None = None, ) -> "DataFrame": """Loads JSON files and returns the results as a :class:`DataFrame`. diff --git a/duckdb/experimental/spark/sql/session.py b/duckdb/experimental/spark/sql/session.py index b05b9705..c407a9f1 100644 --- a/duckdb/experimental/spark/sql/session.py +++ b/duckdb/experimental/spark/sql/session.py @@ -1,6 +1,6 @@ -import uuid # noqa: D100 +import uuid from collections.abc import Iterable, Sized -from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union +from typing import TYPE_CHECKING, Any, NoReturn, Union import duckdb @@ -38,7 +38,7 @@ def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[du new_data = [] for row in data: - new_row = [Value(x, dtype.duckdb_type) for x, dtype in zip(row, [y.dataType for y in schema])] + new_row = [Value(x, dtype.duckdb_type) for x, dtype in zip(row, [y.dataType for y in schema], strict=False)] new_data.append(new_row) return new_data @@ -113,7 +113,7 @@ def construct_parameters(tuples: Iterable) -> list[list]: return DataFrame(rel, self) def _createDataFrameFromPandas( - self, data: "PandasDataFrame", types: Union[list[str], None], names: Union[list[str], None] + self, data: "PandasDataFrame", types: list[str] | None, names: list[str] | None ) -> DataFrame: df = self._create_dataframe(data) @@ -128,8 +128,8 @@ def _createDataFrameFromPandas( def createDataFrame( # noqa: D102 self, data: Union["PandasDataFrame", Iterable[Any]], - schema: Optional[Union[StructType, list[str]]] = None, - samplingRatio: Optional[float] = None, + schema: StructType | list[str] | None = None, + samplingRatio: float | None = None, verifySchema: bool = True, ) -> DataFrame: if samplingRatio: @@ -194,9 +194,9 @@ def newSession(self) -> "SparkSession": # noqa: D102 def range( # noqa: D102 self, start: int, - end: Optional[int] = None, + end: int | None = None, step: int = 1, - numPartitions: Optional[int] = None, + numPartitions: int | None = None, ) -> "DataFrame": if numPartitions: raise ContributionsAcceptedError @@ -281,9 +281,9 @@ def getOrCreate(self) -> "SparkSession": # noqa: D102 def config( # noqa: D102 
self, - key: Optional[str] = None, - value: Optional[Any] = None, # noqa: ANN401 - conf: Optional[SparkConf] = None, + key: str | None = None, + value: Any | None = None, # noqa: ANN401 + conf: SparkConf | None = None, ) -> "SparkSession.Builder": return self diff --git a/duckdb/experimental/spark/sql/streaming.py b/duckdb/experimental/spark/sql/streaming.py index 08b7cc30..e40bfbf4 100644 --- a/duckdb/experimental/spark/sql/streaming.py +++ b/duckdb/experimental/spark/sql/streaming.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional, Union # noqa: D100 +from typing import TYPE_CHECKING from .types import StructType @@ -6,8 +6,8 @@ from .dataframe import DataFrame from .session import SparkSession -PrimitiveType = Union[bool, float, int, str] -OptionalPrimitiveType = Optional[PrimitiveType] +PrimitiveType = bool | float | int | str +OptionalPrimitiveType = PrimitiveType | None class DataStreamWriter: # noqa: D101 @@ -25,9 +25,9 @@ def __init__(self, session: "SparkSession") -> None: # noqa: D107 def load( # noqa: D102 self, - path: Optional[str] = None, - format: Optional[str] = None, - schema: Union[StructType, str, None] = None, + path: str | None = None, + format: str | None = None, + schema: StructType | str | None = None, **options: OptionalPrimitiveType, ) -> "DataFrame": raise NotImplementedError diff --git a/duckdb/experimental/spark/sql/type_utils.py b/duckdb/experimental/spark/sql/type_utils.py index 7452a599..6f98eb15 100644 --- a/duckdb/experimental/spark/sql/type_utils.py +++ b/duckdb/experimental/spark/sql/type_utils.py @@ -1,4 +1,4 @@ -from typing import cast # noqa: D100 +from typing import cast from duckdb.sqltypes import DuckDBPyType @@ -111,5 +111,5 @@ def convert_type(dtype: DuckDBPyType) -> DataType: # noqa: D103 def duckdb_to_spark_schema(names: list[str], types: list[DuckDBPyType]) -> StructType: # noqa: D103 - fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])] + fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types], strict=False)] return StructType(fields) diff --git a/duckdb/experimental/spark/sql/types.py b/duckdb/experimental/spark/sql/types.py index 507e1041..34c9b57b 100644 --- a/duckdb/experimental/spark/sql/types.py +++ b/duckdb/experimental/spark/sql/types.py @@ -1,4 +1,3 @@ -# ruff: noqa: D100 # This code is based on code from Apache Spark under the license found in the LICENSE # file located in the 'spark' folder. @@ -14,7 +13,7 @@ Any, ClassVar, NoReturn, - Optional, + Self, TypeVar, Union, cast, @@ -93,7 +92,7 @@ def typeName(cls) -> str: # noqa: D102 def simpleString(self) -> str: # noqa: D102 return self.typeName() - def jsonValue(self) -> Union[str, dict[str, Any]]: # noqa: D102 + def jsonValue(self) -> str | dict[str, Any]: # noqa: D102 raise ContributionsAcceptedError def json(self) -> str: # noqa: D102 @@ -533,9 +532,9 @@ class DayTimeIntervalType(AtomicType): } ) - _inverted_fields: Mapping[int, str] = MappingProxyType(dict(zip(_fields.values(), _fields.keys()))) + _inverted_fields: Mapping[int, str] = MappingProxyType(dict(zip(_fields.values(), _fields.keys(), strict=False))) - def __init__(self, startField: Optional[int] = None, endField: Optional[int] = None) -> None: # noqa: D107 + def __init__(self, startField: int | None = None, endField: int | None = None) -> None: # noqa: D107 super().__init__(DuckDBPyType("INTERVAL")) if startField is None and endField is None: # Default matched to scala side. 
@@ -568,11 +567,11 @@ def __repr__(self) -> str: # noqa: D105 def needConversion(self) -> bool: # noqa: D102 return True - def toInternal(self, dt: datetime.timedelta) -> Optional[int]: # noqa: D102 + def toInternal(self, dt: datetime.timedelta) -> int | None: # noqa: D102 if dt is not None: return (math.floor(dt.total_seconds()) * 1000000) + dt.microseconds - def fromInternal(self, micros: int) -> Optional[datetime.timedelta]: # noqa: D102 + def fromInternal(self, micros: int) -> datetime.timedelta | None: # noqa: D102 if micros is not None: return datetime.timedelta(microseconds=micros) @@ -610,12 +609,12 @@ def __repr__(self) -> str: # noqa: D105 def needConversion(self) -> bool: # noqa: D102 return self.elementType.needConversion() - def toInternal(self, obj: list[Optional[T]]) -> list[Optional[T]]: # noqa: D102 + def toInternal(self, obj: list[T | None]) -> list[T | None]: # noqa: D102 if not self.needConversion(): return obj return obj and [self.elementType.toInternal(v) for v in obj] - def fromInternal(self, obj: list[Optional[T]]) -> list[Optional[T]]: # noqa: D102 + def fromInternal(self, obj: list[T | None]) -> list[T | None]: # noqa: D102 if not self.needConversion(): return obj return obj and [self.elementType.fromInternal(v) for v in obj] @@ -662,12 +661,12 @@ def __repr__(self) -> str: # noqa: D105 def needConversion(self) -> bool: # noqa: D102 return self.keyType.needConversion() or self.valueType.needConversion() - def toInternal(self, obj: dict[T, Optional[U]]) -> dict[T, Optional[U]]: # noqa: D102 + def toInternal(self, obj: dict[T, U | None]) -> dict[T, U | None]: # noqa: D102 if not self.needConversion(): return obj return obj and {self.keyType.toInternal(k): self.valueType.toInternal(v) for k, v in obj.items()} - def fromInternal(self, obj: dict[T, Optional[U]]) -> dict[T, Optional[U]]: # noqa: D102 + def fromInternal(self, obj: dict[T, U | None]) -> dict[T, U | None]: # noqa: D102 if not self.needConversion(): return obj return obj and {self.keyType.fromInternal(k): self.valueType.fromInternal(v) for k, v in obj.items()} @@ -700,7 +699,7 @@ def __init__( # noqa: D107 name: str, dataType: DataType, nullable: bool = True, - metadata: Optional[dict[str, Any]] = None, + metadata: dict[str, Any] | None = None, ) -> None: super().__init__(dataType.duckdb_type) assert isinstance(dataType, DataType), f"dataType {dataType} should be an instance of {DataType}" @@ -759,9 +758,9 @@ class StructType(DataType): """ def _update_internal_duckdb_type(self) -> None: - self.duckdb_type = duckdb.struct_type(dict(zip(self.names, [x.duckdb_type for x in self.fields]))) + self.duckdb_type = duckdb.struct_type(dict(zip(self.names, [x.duckdb_type for x in self.fields], strict=False))) - def __init__(self, fields: Optional[list[StructField]] = None) -> None: # noqa: D107 + def __init__(self, fields: list[StructField] | None = None) -> None: # noqa: D107 if not fields: self.fields = [] self.names = [] @@ -772,15 +771,15 @@ def __init__(self, fields: Optional[list[StructField]] = None) -> None: # noqa: # Precalculated list of fields that need conversion with fromInternal/toInternal functions self._needConversion = [f.needConversion() for f in self] self._needSerializeAnyField = any(self._needConversion) - super().__init__(duckdb.struct_type(dict(zip(self.names, [x.duckdb_type for x in self.fields])))) + super().__init__(duckdb.struct_type(dict(zip(self.names, [x.duckdb_type for x in self.fields], strict=False)))) @overload def add( self, field: str, - data_type: Union[str, DataType], + 
data_type: str | DataType, nullable: bool = True, - metadata: Optional[dict[str, Any]] = None, + metadata: dict[str, Any] | None = None, ) -> "StructType": ... @overload @@ -788,10 +787,10 @@ def add(self, field: StructField) -> "StructType": ... def add( self, - field: Union[str, StructField], - data_type: Optional[Union[str, DataType]] = None, + field: str | StructField, + data_type: str | DataType | None = None, nullable: bool = True, - metadata: Optional[dict[str, Any]] = None, + metadata: dict[str, Any] | None = None, ) -> "StructType": r"""Construct a :class:`StructType` by adding new elements to it, to define the schema. The method accepts either: @@ -857,7 +856,7 @@ def __len__(self) -> int: """Return the number of fields.""" return len(self.fields) - def __getitem__(self, key: Union[str, int]) -> StructField: + def __getitem__(self, key: str | int) -> StructField: """Access fields by name or slice.""" if isinstance(key, str): for field in self: @@ -905,7 +904,7 @@ def fieldNames(self) -> list[str]: """ return list(self.names) - def treeString(self, level: Optional[int] = None) -> str: + def treeString(self, level: int | None = None) -> str: """Returns a string representation of the schema in tree format. Parameters @@ -926,7 +925,7 @@ def treeString(self, level: Optional[int] = None) -> str: |-- age: integer (nullable = true) """ - def _tree_string(schema: "StructType", depth: int = 0, max_depth: Optional[int] = None) -> list[str]: + def _tree_string(schema: "StructType", depth: int = 0, max_depth: int | None = None) -> list[str]: """Recursively build tree string lines.""" lines = [] if depth == 0: @@ -989,15 +988,17 @@ def toInternal(self, obj: tuple) -> tuple: # noqa: D102 if isinstance(obj, dict): return tuple( f.toInternal(obj.get(n)) if c else obj.get(n) - for n, f, c in zip(self.names, self.fields, self._needConversion) + for n, f, c in zip(self.names, self.fields, self._needConversion, strict=False) ) elif isinstance(obj, (tuple, list)): - return tuple(f.toInternal(v) if c else v for f, v, c in zip(self.fields, obj, self._needConversion)) + return tuple( + f.toInternal(v) if c else v for f, v, c in zip(self.fields, obj, self._needConversion, strict=False) + ) elif hasattr(obj, "__dict__"): d = obj.__dict__ return tuple( f.toInternal(d.get(n)) if c else d.get(n) - for n, f, c in zip(self.names, self.fields, self._needConversion) + for n, f, c in zip(self.names, self.fields, self._needConversion, strict=False) ) else: msg = f"Unexpected tuple {obj!r} with StructType" @@ -1021,10 +1022,12 @@ def fromInternal(self, obj: tuple) -> "Row": # noqa: D102 # it's already converted by pickler return obj - values: Union[tuple, list] + values: tuple | list if self._needSerializeAnyField: # Only calling fromInternal function for fields that need conversion - values = [f.fromInternal(v) if c else v for f, v, c in zip(self.fields, obj, self._needConversion)] + values = [ + f.fromInternal(v) if c else v for f, v, c in zip(self.fields, obj, self._needConversion, strict=False) + ] else: values = obj return _create_row(self.names, values) @@ -1121,19 +1124,19 @@ def __eq__(self, other: object) -> bool: ] _all_atomic_types: dict[str, type[DataType]] = {t.typeName(): t for t in _atomic_types} -_complex_types: list[type[Union[ArrayType, MapType, StructType]]] = [ +_complex_types: list[type[ArrayType | MapType | StructType]] = [ ArrayType, MapType, StructType, ] -_all_complex_types: dict[str, type[Union[ArrayType, MapType, StructType]]] = {v.typeName(): v for v in _complex_types} 
+_all_complex_types: dict[str, type[ArrayType | MapType | StructType]] = {v.typeName(): v for v in _complex_types} _FIXED_DECIMAL = re.compile(r"decimal\(\s*(\d+)\s*,\s*(-?\d+)\s*\)") _INTERVAL_DAYTIME = re.compile(r"interval (day|hour|minute|second)( to (day|hour|minute|second))?") -def _create_row(fields: Union["Row", list[str]], values: Union[tuple[Any, ...], list[Any]]) -> "Row": +def _create_row(fields: Union["Row", list[str]], values: tuple[Any, ...] | list[Any]) -> "Row": row = Row(*values) row.__fields__ = fields return row @@ -1199,7 +1202,7 @@ def __new__(cls, *args: str) -> "Row": ... @overload def __new__(cls, **kwargs: Any) -> "Row": ... # noqa: ANN401 - def __new__(cls, *args: Optional[str], **kwargs: Optional[Any]) -> "Row": # noqa: D102 + def __new__(cls, *args: str | None, **kwargs: Any | None) -> Self: # noqa: D102 if args and kwargs: msg = "Can not use both args and kwargs to create Row" raise ValueError(msg) @@ -1208,9 +1211,8 @@ def __new__(cls, *args: Optional[str], **kwargs: Optional[Any]) -> "Row": # noq row = tuple.__new__(cls, list(kwargs.values())) row.__fields__ = list(kwargs.keys()) return row - else: - # create row class or objects - return tuple.__new__(cls, args) + # create row class or objects + return tuple.__new__(cls, args) def asDict(self, recursive: bool = False) -> dict[str, Any]: """Return as a dict. @@ -1244,7 +1246,7 @@ def asDict(self, recursive: bool = False) -> dict[str, Any]: if recursive: - def conv(obj: Union[Row, list, dict, object]) -> Union[list, dict, object]: + def conv(obj: Row | list | dict | object) -> list | dict | object: if isinstance(obj, Row): return obj.asDict(True) elif isinstance(obj, list): @@ -1254,9 +1256,9 @@ def conv(obj: Union[Row, list, dict, object]) -> Union[list, dict, object]: else: return obj - return dict(zip(self.__fields__, (conv(o) for o in self))) + return dict(zip(self.__fields__, (conv(o) for o in self), strict=False)) else: - return dict(zip(self.__fields__, self)) + return dict(zip(self.__fields__, self, strict=False)) def __contains__(self, item: Any) -> bool: # noqa: D105, ANN401 if hasattr(self, "__fields__"): @@ -1306,7 +1308,7 @@ def __setattr__(self, key: Any, value: Any) -> None: # noqa: D105, ANN401 def __reduce__( self, - ) -> Union[str, tuple[Any, ...]]: + ) -> str | tuple[Any, ...]: """Returns a tuple so Python knows how to pickle Row.""" if hasattr(self, "__fields__"): return (_create_row, (self.__fields__, tuple(self))) @@ -1316,6 +1318,6 @@ def __reduce__( def __repr__(self) -> str: """Printable representation of Row used in Python REPL.""" if hasattr(self, "__fields__"): - return "Row({})".format(", ".join(f"{k}={v!r}" for k, v in zip(self.__fields__, tuple(self)))) + return "Row({})".format(", ".join(f"{k}={v!r}" for k, v in zip(self.__fields__, tuple(self), strict=False))) else: return "".format(", ".join(f"{field!r}" for field in self)) diff --git a/duckdb/experimental/spark/sql/udf.py b/duckdb/experimental/spark/sql/udf.py index 7437ed6b..c22f6be9 100644 --- a/duckdb/experimental/spark/sql/udf.py +++ b/duckdb/experimental/spark/sql/udf.py @@ -1,12 +1,15 @@ -# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/ # noqa: D100 -from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union +# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/ +from typing import TYPE_CHECKING, Any, Optional, TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable from .types import DataType if TYPE_CHECKING: from .session import SparkSession 
-DataTypeOrString = Union[DataType, str] +DataTypeOrString = DataType | str UserDefinedFunctionLike = TypeVar("UserDefinedFunctionLike") @@ -17,7 +20,7 @@ def __init__(self, sparkSession: "SparkSession") -> None: # noqa: D107 def register( # noqa: D102 self, name: str, - f: Union[Callable[..., Any], "UserDefinedFunctionLike"], + f: "Callable[..., Any] | UserDefinedFunctionLike", returnType: Optional["DataTypeOrString"] = None, ) -> "UserDefinedFunctionLike": self.sparkSession.conn.create_function(name, f, return_type=returnType) diff --git a/duckdb/query_graph/__main__.py b/duckdb/query_graph/__main__.py index 5ffb942d..dd4ff959 100644 --- a/duckdb/query_graph/__main__.py +++ b/duckdb/query_graph/__main__.py @@ -4,7 +4,6 @@ import webbrowser from functools import reduce from pathlib import Path -from typing import Optional from duckdb import DuckDBPyConnection @@ -321,7 +320,7 @@ def open_utf8(fpath: str, flags: str) -> object: # noqa: D103 class ProfilingInfo: # noqa: D101 - def __init__(self, conn: Optional[DuckDBPyConnection] = None, from_file: Optional[str] = None) -> None: # noqa: D107 + def __init__(self, conn: DuckDBPyConnection | None = None, from_file: str | None = None) -> None: # noqa: D107 self.conn = conn self.from_file = from_file @@ -580,7 +579,7 @@ def _gather_timing_information(self, json: str, query_timings: object) -> None: self._get_child_timings(json["children"][0], query_timings) def _translate_json_to_html( - self, input_file: Optional[str] = None, input_text: Optional[str] = None, output_file: str = "profile.html" + self, input_file: str | None = None, input_text: str | None = None, output_file: str = "profile.html" ) -> None: query_timings = AllTimings() if input_text is not None: diff --git a/duckdb_packaging/_versioning.py b/duckdb_packaging/_versioning.py index 0a5eb66b..0ec984f3 100644 --- a/duckdb_packaging/_versioning.py +++ b/duckdb_packaging/_versioning.py @@ -9,7 +9,6 @@ import pathlib import re import subprocess -from typing import Optional VERSION_RE = re.compile( r"^(?P[0-9]+)\.(?P[0-9]+)\.(?P[0-9]+)(?:rc(?P[0-9]+)|\.post(?P[0-9]+))?$" @@ -100,7 +99,7 @@ def pep440_to_git_tag(version: str) -> str: return f"v{version}" -def get_current_version() -> Optional[str]: +def get_current_version() -> str | None: """Get the current version from git tags. Returns: @@ -115,7 +114,7 @@ def get_current_version() -> Optional[str]: return None -def create_git_tag(version: str, message: Optional[str] = None, repo_path: Optional[pathlib.Path] = None) -> None: +def create_git_tag(version: str, message: str | None = None, repo_path: pathlib.Path | None = None) -> None: """Create a git tag for the given version. Args: @@ -148,10 +147,10 @@ def strip_post_from_version(version: str) -> str: def get_git_describe( - repo_path: Optional[pathlib.Path] = None, + repo_path: pathlib.Path | None = None, since_major: bool = False, # noqa: FBT001 since_minor: bool = False, # noqa: FBT001 -) -> Optional[str]: +) -> str | None: """Get git describe output for version determination. 
Returns: diff --git a/duckdb_packaging/build_backend.py b/duckdb_packaging/build_backend.py index 799a43c9..114b81f3 100644 --- a/duckdb_packaging/build_backend.py +++ b/duckdb_packaging/build_backend.py @@ -16,7 +16,6 @@ import subprocess import sys from pathlib import Path -from typing import Optional, Union from scikit_build_core.build import ( build_editable, @@ -132,7 +131,7 @@ def _read_duckdb_long_version() -> str: return _version_file_path().read_text(encoding="utf-8").strip() -def _skbuild_config_add(key: str, value: Union[list, str], config_settings: dict[str, Union[list[str], str]]) -> None: +def _skbuild_config_add(key: str, value: list | str, config_settings: dict[str, list[str] | str]) -> None: """Add or modify a configuration setting for scikit-build-core. This function handles adding values to scikit-build-core configuration settings, @@ -179,7 +178,7 @@ def _skbuild_config_add(key: str, value: Union[list, str], config_settings: dict raise RuntimeError(msg) -def build_sdist(sdist_directory: str, config_settings: Optional[dict[str, Union[list[str], str]]] = None) -> str: +def build_sdist(sdist_directory: str, config_settings: dict[str, list[str] | str] | None = None) -> str: """Build a source distribution using the DuckDB submodule. This function extracts the DuckDB version from either the git submodule and saves it @@ -210,8 +209,8 @@ def build_sdist(sdist_directory: str, config_settings: Optional[dict[str, Union[ def build_wheel( wheel_directory: str, - config_settings: Optional[dict[str, Union[list[str], str]]] = None, - metadata_directory: Optional[str] = None, + config_settings: dict[str, list[str] | str] | None = None, + metadata_directory: str | None = None, ) -> str: """Build a wheel from either git submodule or extracted sdist sources. 
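One consequence of rewriting module-level aliases such as `PrimitiveType` and `OptionalPrimitiveType` in readwriter.py and streaming.py above: unlike annotations, these `X | Y` expressions are evaluated eagerly at import time and produce `types.UnionType` objects, which is why this style presumes a Python 3.10+ interpreter. A short sketch reusing those alias names (everything else here is illustrative):

    import types

    PrimitiveType = bool | float | int | str
    OptionalPrimitiveType = PrimitiveType | None

    assert isinstance(PrimitiveType, types.UnionType)
    # PEP 604 unions also work directly with isinstance().
    assert isinstance("overwrite", PrimitiveType)
    assert isinstance(None, OptionalPrimitiveType)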
diff --git a/duckdb_packaging/pypi_cleanup.py b/duckdb_packaging/pypi_cleanup.py index 094df741..53990cdb 100644 --- a/duckdb_packaging/pypi_cleanup.py +++ b/duckdb_packaging/pypi_cleanup.py @@ -19,7 +19,6 @@ from collections.abc import Generator from enum import Enum from html.parser import HTMLParser -from typing import Optional, Union from urllib.parse import urlparse import pyotp @@ -173,7 +172,7 @@ def session_with_retries() -> Generator[Session, None, None]: yield session -def load_credentials() -> tuple[Optional[str], Optional[str]]: +def load_credentials() -> tuple[str | None, str | None]: """Load credentials from environment variables.""" password = os.getenv("PYPI_CLEANUP_PASSWORD") otp = os.getenv("PYPI_CLEANUP_OTP") @@ -200,7 +199,7 @@ def __init__(self, target: str) -> None: # noqa: D107 self.csrf = None # Result value from all forms on page self._in_form = False # Currently parsing a form with an action we're interested in - def handle_starttag(self, tag: str, attrs: list[tuple[str, Union[str, None]]]) -> None: # noqa: D102 + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: # noqa: D102 if not self.csrf: if tag == "form": attrs = dict(attrs) @@ -232,9 +231,9 @@ def __init__( # noqa: D107 index_url: str, mode: CleanMode, max_dev_releases: int = _DEFAULT_MAX_NIGHTLIES, - username: Optional[str] = None, - password: Optional[str] = None, - otp: Optional[str] = None, + username: str | None = None, + password: str | None = None, + otp: str | None = None, ) -> None: parsed_url = urlparse(index_url) self._index_url = parsed_url.geturl().rstrip("/") diff --git a/pyproject.toml b/pyproject.toml index 1a478f7c..12c853dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -411,6 +411,8 @@ strict = true [tool.ruff.lint.per-file-ignores] "duckdb/experimental/spark/**.py" = [ + # No need for moduledocstrings for spark + 'D100', # Ignore boolean positional args in the Spark API 'FBT001' ] diff --git a/scripts/generate_import_cache_json.py b/scripts/generate_import_cache_json.py index dd8c3d5c..389866c5 100644 --- a/scripts/generate_import_cache_json.py +++ b/scripts/generate_import_cache_json.py @@ -46,7 +46,7 @@ def __init__(self, full_path) -> None: self.type = "module" self.name = parts[-1] self.full_path = full_path - self.items: dict[str, Union[ImportCacheAttribute, ImportCacheModule]] = {} + self.items: dict[str, ImportCacheAttribute | ImportCacheModule] = {} def add_item(self, item: Union[ImportCacheAttribute, "ImportCacheModule"]): assert self.full_path != item.full_path @@ -111,7 +111,7 @@ def get_module(self, module_name: str) -> ImportCacheModule: raise ValueError(msg) return self.modules[module_name] - def get_item(self, item_name: str) -> Union[ImportCacheModule, ImportCacheAttribute]: + def get_item(self, item_name: str) -> ImportCacheModule | ImportCacheAttribute: parts = item_name.split(".") if len(parts) == 1: return self.get_module(item_name) diff --git a/scripts/get_cpp_methods.py b/scripts/get_cpp_methods.py index a86b609e..0a77192a 100644 --- a/scripts/get_cpp_methods.py +++ b/scripts/get_cpp_methods.py @@ -1,7 +1,7 @@ # Requires `python3 -m pip install cxxheaderparser pcpp` +from collections.abc import Callable from enum import Enum from pathlib import Path -from typing import Callable import cxxheaderparser.parser import cxxheaderparser.preprocessor diff --git a/tests/conftest.py b/tests/conftest.py index ed7c359a..a5d0249f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,6 @@ import warnings from importlib import 
import_module from pathlib import Path -from typing import Union import pytest @@ -129,7 +128,7 @@ def pandas_supports_arrow_backend(): @pytest.fixture def require(): - def _require(extension_name, db_name="") -> Union[duckdb.DuckDBPyConnection, None]: + def _require(extension_name, db_name="") -> duckdb.DuckDBPyConnection | None: # Paths to search for extensions build = Path(__file__).parent.parent / "build" diff --git a/tests/fast/adbc/test_statement_bind.py b/tests/fast/adbc/test_statement_bind.py index e8df14c7..b6cff16c 100644 --- a/tests/fast/adbc/test_statement_bind.py +++ b/tests/fast/adbc/test_statement_bind.py @@ -118,7 +118,7 @@ def test_bind_composite_type(self): # Create the StructArray struct_array = pa.StructArray.from_arrays(arrays=data_dict.values(), names=data_dict.keys()) - schema = pa.schema([(name, array.type) for name, array in zip(["a"], [struct_array])]) + schema = pa.schema([(name, array.type) for name, array in zip(["a"], [struct_array], strict=False)]) # Create the RecordBatch record_batch = pa.RecordBatch.from_arrays([struct_array], schema=schema) diff --git a/tests/fast/arrow/test_arrow_run_end_encoding.py b/tests/fast/arrow/test_arrow_run_end_encoding.py index e04f9ea0..40d9131a 100644 --- a/tests/fast/arrow/test_arrow_run_end_encoding.py +++ b/tests/fast/arrow/test_arrow_run_end_encoding.py @@ -303,7 +303,7 @@ def test_arrow_ree_struct(self, duckdb_cursor): # Create a (chunked) StructArray from the chunked arrays (columns) of the ArrowTable names = unstructured.column_names iterables = [x.iterchunks() for x in columns] - zipped = zip(*iterables) + zipped = zip(*iterables, strict=False) structured_chunks = [pa.StructArray.from_arrays(list(x), names=names) for x in zipped] structured = pa.chunked_array(structured_chunks) @@ -345,7 +345,7 @@ def test_arrow_ree_union(self, duckdb_cursor): # Create a (chunked) UnionArray from the chunked arrays (columns) of the ArrowTable names = unstructured.column_names iterables = [x.iterchunks() for x in columns] - zipped = zip(*iterables) + zipped = zip(*iterables, strict=False) structured_chunks = [] for chunk in zipped: @@ -400,7 +400,7 @@ def test_arrow_ree_map(self, duckdb_cursor): # Create a (chunked) MapArray from the chunked arrays (columns) of the ArrowTable iterables = [x.iterchunks() for x in columns] - zipped = zip(*iterables) + zipped = zip(*iterables, strict=False) structured_chunks = [] for chunk in zipped: diff --git a/tests/fast/pandas/test_pandas_types.py b/tests/fast/pandas/test_pandas_types.py index 7510cb28..6335f2ee 100644 --- a/tests/fast/pandas/test_pandas_types.py +++ b/tests/fast/pandas/test_pandas_types.py @@ -56,7 +56,7 @@ def test_pandas_numeric(self): # c=type2 # .. 
data = {} - for letter, dtype in zip(string.ascii_lowercase, data_types): + for letter, dtype in zip(string.ascii_lowercase, data_types, strict=False): data[letter] = base_df.a.astype(dtype) df = pd.DataFrame.from_dict(data) # noqa: F841 @@ -65,7 +65,7 @@ def test_pandas_numeric(self): # Verify that the types in the out_df are correct # TODO: we don't support outputting pandas specific types (i.e UInt64) # noqa: TD002, TD003 - for letter, item in zip(string.ascii_lowercase, data_types): + for letter, item in zip(string.ascii_lowercase, data_types, strict=False): column_name = letter assert str(out_df[column_name].dtype) == item.lower() diff --git a/tests/fast/relational_api/test_rapi_aggregations.py b/tests/fast/relational_api/test_rapi_aggregations.py index 409972fc..9253d541 100644 --- a/tests/fast/relational_api/test_rapi_aggregations.py +++ b/tests/fast/relational_api/test_rapi_aggregations.py @@ -35,33 +35,33 @@ def test_any_value(self, table): result = table.order("id, t").any_value("v").execute().fetchall() expected = [(1,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.order("id, t").any_value("v", groups="id", projected_columns="id").order("id").execute().fetchall() ) expected = [(1, 1), (2, 11), (3, 5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_arg_max(self, table): result = table.arg_max("t", "v").execute().fetchall() expected = [(-1,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.arg_max("t", "v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 3), (2, -1), (3, -2)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_arg_min(self, table): result = table.arg_min("t", "v").execute().fetchall() expected = [(0,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.arg_min("t", "v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 2), (2, 4), (3, 0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_avg(self, table): result = table.avg("v").execute().fetchall() @@ -78,41 +78,41 @@ def test_bit_and(self, table): result = table.bit_and("v").execute().fetchall() expected = [(0,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.bit_and("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 0), (2, 10), (3, 5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bit_or(self, table): result = table.bit_or("v").execute().fetchall() expected = [(-1,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) 
result = table.bit_or("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 3), (2, 11), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bit_xor(self, table): result = table.bit_xor("v").execute().fetchall() expected = [(-7,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.bit_xor("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 2), (2, 1), (3, -6)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bitstring_agg(self, table): result = table.bitstring_agg("v").execute().fetchall() expected = [("1011001000011",)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.bitstring_agg("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, "0011000000000"), (2, "0000000000011"), (3, "1000001000000")] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) with pytest.raises(duckdb.InvalidInputException): table.bitstring_agg("v", min="1") with pytest.raises(duckdb.InvalidTypeException): @@ -122,156 +122,156 @@ def test_bool_and(self, table): result = table.bool_and("v::BOOL").execute().fetchall() expected = [(True,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.bool_and("t::BOOL", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, True), (2, True), (3, False)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bool_or(self, table): result = table.bool_or("v::BOOL").execute().fetchall() expected = [(True,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.bool_or("v::BOOL", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, True), (2, True), (3, True)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_count(self, table): result = table.count("*").execute().fetchall() expected = [(8,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.count("*", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 3), (2, 2), (3, 3)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_value_counts(self, table): result = table.value_counts("v").execute().fetchall() expected = [(None, 0), (-1, 1), (1, 2), (2, 1), (5, 1), (10, 1), (11, 1)] assert len(result) == len(expected) - assert all(r == e 
for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.value_counts("v", groups="v").order("v").execute().fetchall() expected = [(-1, 1), (1, 2), (2, 1), (5, 1), (10, 1), (11, 1), (None, 0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_favg(self, table): result = [round(r[0], 2) for r in table.favg("f").execute().fetchall()] expected = [5.12] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in table.favg("f", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.25), (2, 5.24), (3, 9.92)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_first(self, table): result = table.first("v").execute().fetchall() expected = [(1,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.first("v", "id", "id").order("id").execute().fetchall() expected = [(1, 1), (2, 10), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_last(self, table): result = table.last("v").execute().fetchall() expected = [(None,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.last("v", "id", "id").order("id").execute().fetchall() expected = [(1, 2), (2, 11), (3, None)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_fsum(self, table): result = [round(r[0], 2) for r in table.fsum("f").execute().fetchall()] expected = [40.99] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in table.fsum("f", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.75), (2, 10.49), (3, 29.75)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_geomean(self, table): result = [round(r[0], 2) for r in table.geomean("f").execute().fetchall()] expected = [0.67] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in table.geomean("f", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.05), (2, 0.65), (3, 9.52)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_histogram(self, table): result = table.histogram("v").execute().fetchall() expected = [({-1: 1, 1: 2, 2: 1, 5: 1, 10: 1, 11: 1},)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + 
assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.histogram("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, {1: 2, 2: 1}), (2, {10: 1, 11: 1}), (3, {-1: 1, 5: 1})] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_list(self, table): result = table.list("v").execute().fetchall() expected = [([1, 1, 2, 10, 11, -1, 5, None],)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.list("v", groups="id order by t asc", projected_columns="id").order("id").execute().fetchall() expected = [(1, [1, 1, 2]), (2, [10, 11]), (3, [-1, 5, None])] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_max(self, table): result = table.max("v").execute().fetchall() expected = [(11,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.max("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 2), (2, 11), (3, 5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_min(self, table): result = table.min("v").execute().fetchall() expected = [(-1,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.min("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 1), (2, 10), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_product(self, table): result = table.product("v").execute().fetchall() expected = [(-1100,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.product("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 2), (2, 110), (3, -5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_string_agg(self, table): result = table.string_agg("s", sep="/").execute().fetchall() expected = [("h/e/l/l/o/,/wor/ld",)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.string_agg("s", sep="/", groups="id order by t asc", projected_columns="id") .order("id") @@ -280,17 +280,17 @@ def test_string_agg(self, table): ) expected = [(1, "h/e/l"), (2, "l/o"), (3, ",/wor/ld")] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_sum(self, table): result = table.sum("v").execute().fetchall() expected = [(29,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, 
expected, strict=False)) result = table.sum("v", groups="id", projected_columns="id").execute().fetchall() expected = [(1, 4), (2, 21), (3, 4)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) # TODO: Approximate aggregate functions # noqa: TD002, TD003 @@ -299,35 +299,35 @@ def test_median(self, table): result = table.median("v").execute().fetchall() expected = [(2.0,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.median("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 1.0), (2, 10.5), (3, 2.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_mode(self, table): result = table.mode("v").execute().fetchall() expected = [(1,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.mode("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 1), (2, 10), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_quantile_cont(self, table): result = table.quantile_cont("v").execute().fetchall() expected = [(2.0,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [[round(x, 2) for x in r[0]] for r in table.quantile_cont("v", q=[0.1, 0.5]).execute().fetchall()] expected = [[0.2, 2.0]] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = table.quantile_cont("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 1.0), (2, 10.5), (3, 2.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], [round(x, 2) for x in r[1]]) for r in table.quantile_cont("v", q=[0.2, 0.5], groups="id", projected_columns="id") @@ -337,82 +337,82 @@ def test_quantile_cont(self, table): ] expected = [(1, [1.0, 1.0]), (2, [10.2, 10.5]), (3, [0.2, 2.0])] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["quantile_disc", "quantile"]) def test_quantile_disc(self, table, f): result = getattr(table, f)("v").execute().fetchall() expected = [(2,)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = getattr(table, f)("v", q=[0.2, 0.5]).execute().fetchall() expected = [([1, 2],)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = getattr(table, f)("v", groups="id", projected_columns="id").order("id").execute().fetchall() expected = [(1, 1), (2, 10), (3, -1)] assert len(result) == len(expected) - assert all(r == 
e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( getattr(table, f)("v", q=[0.2, 0.8], groups="id", projected_columns="id").order("id").execute().fetchall() ) expected = [(1, [1, 2]), (2, [10, 11]), (3, [-1, 5])] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_std_pop(self, table): result = [round(r[0], 2) for r in table.stddev_pop("v").execute().fetchall()] expected = [4.36] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in table.stddev_pop("v", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.47), (2, 0.5), (3, 3.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["stddev_samp", "stddev", "std"]) def test_std_samp(self, table, f): result = [round(r[0], 2) for r in getattr(table, f)("v").execute().fetchall()] expected = [4.71] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in getattr(table, f)("v", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.58), (2, 0.71), (3, 4.24)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_var_pop(self, table): result = [round(r[0], 2) for r in table.var_pop("v").execute().fetchall()] expected = [18.98] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in table.var_pop("v", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.22), (2, 0.25), (3, 9.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["var_samp", "variance", "var"]) def test_var_samp(self, table, f): result = [round(r[0], 2) for r in getattr(table, f)("v").execute().fetchall()] expected = [22.14] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], round(r[1], 2)) for r in getattr(table, f)("v", groups="id", projected_columns="id").order("id").execute().fetchall() ] expected = [(1, 0.33), (2, 0.5), (3, 18.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_describe(self, table): assert table.describe().fetchall() is not None diff --git a/tests/fast/relational_api/test_rapi_windows.py b/tests/fast/relational_api/test_rapi_windows.py index 28d533b7..093829a8 100644 --- a/tests/fast/relational_api/test_rapi_windows.py +++ b/tests/fast/relational_api/test_rapi_windows.py @@ -34,7 +34,7 @@ def test_row_number(self, table): result = table.row_number("over ()").execute().fetchall() expected = list(range(1, 9)) 
assert len(result) == len(expected) - assert all(r[0] == e for r, e in zip(result, expected)) + assert all(r[0] == e for r, e in zip(result, expected, strict=False)) result = table.row_number("over (partition by id order by t asc)", "id, v, t").order("id").execute().fetchall() expected = [ (1, 1, 1, 1), @@ -47,34 +47,34 @@ def test_row_number(self, table): (3, None, 10, 3), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_rank(self, table): result = table.rank("over ()").execute().fetchall() expected = [1] * 8 assert len(result) == len(expected) - assert all(r[0] == e for r, e in zip(result, expected)) + assert all(r[0] == e for r, e in zip(result, expected, strict=False)) result = table.rank("over (partition by id order by v asc)", "id, v").order("id").execute().fetchall() expected = [(1, 1, 1), (1, 1, 1), (1, 2, 3), (2, 10, 1), (2, 11, 2), (3, -1, 1), (3, 5, 2), (3, None, 3)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["dense_rank", "rank_dense"]) def test_dense_rank(self, table, f): result = getattr(table, f)("over ()").execute().fetchall() expected = [1] * 8 assert len(result) == len(expected) - assert all(r[0] == e for r, e in zip(result, expected)) + assert all(r[0] == e for r, e in zip(result, expected, strict=False)) result = getattr(table, f)("over (partition by id order by v asc)", "id, v").order("id").execute().fetchall() expected = [(1, 1, 1), (1, 1, 1), (1, 2, 2), (2, 10, 1), (2, 11, 2), (3, -1, 1), (3, 5, 2), (3, None, 3)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_percent_rank(self, table): result = table.percent_rank("over ()").execute().fetchall() expected = [0.0] * 8 assert len(result) == len(expected) - assert all(r[0] == e for r, e in zip(result, expected)) + assert all(r[0] == e for r, e in zip(result, expected, strict=False)) result = table.percent_rank("over (partition by id order by v asc)", "id, v").order("id").execute().fetchall() expected = [ (1, 1, 0.0), @@ -87,13 +87,13 @@ def test_percent_rank(self, table): (3, None, 1.0), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_cume_dist(self, table): result = table.cume_dist("over ()").execute().fetchall() expected = [1.0] * 8 assert len(result) == len(expected) - assert all(r[0] == e for r, e in zip(result, expected)) + assert all(r[0] == e for r, e in zip(result, expected, strict=False)) result = table.cume_dist("over (partition by id order by v asc)", "id, v").order("id").execute().fetchall() expected = [ (1, 1, 2 / 3), @@ -106,13 +106,13 @@ def test_cume_dist(self, table): (3, None, 1.0), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_ntile(self, table): result = table.n_tile("over (order by v)", 3, "v").execute().fetchall() expected = [(-1, 1), (1, 1), (1, 1), (2, 2), (5, 2), (10, 2), (11, 3), (None, 3)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_lag(self, 
table): result = ( @@ -132,7 +132,7 @@ def test_lag(self, table): (3, None, 10, -1), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.lag("v", "over (partition by id order by t asc)", default_value="-1", projected_columns="id, v, t") .order("id") @@ -150,7 +150,7 @@ def test_lag(self, table): (3, None, 10, -1), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.lag("v", "over (partition by id order by t asc)", offset=2, projected_columns="id, v, t") .order("id") @@ -168,7 +168,7 @@ def test_lag(self, table): (3, None, 10, 5), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_lead(self, table): result = ( @@ -188,7 +188,7 @@ def test_lead(self, table): (3, None, 10, None), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.lead("v", "over (partition by id order by t asc)", default_value="-1", projected_columns="id, v, t") .order("id") @@ -206,7 +206,7 @@ def test_lead(self, table): (3, None, 10, -1), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.lead("v", "over (partition by id order by t asc)", offset=2, projected_columns="id, v, t") .order("id") @@ -224,7 +224,7 @@ def test_lead(self, table): (3, None, 10, None), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_first_value(self, table): result = ( @@ -244,7 +244,7 @@ def test_first_value(self, table): (3, None, 10, 5), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_last_value(self, table): result = ( @@ -268,7 +268,7 @@ def test_last_value(self, table): (3, None, 10, None), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_nth_value(self, table): result = ( @@ -288,7 +288,7 @@ def test_nth_value(self, table): (3, None, 10, -1), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( table.nth_value("v", "over (partition by id order by t asc)", offset=4, projected_columns="id, v, t") .order("id") @@ -306,7 +306,7 @@ def test_nth_value(self, table): (3, None, 10, None), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) # agg functions within win def test_any_value(self, table): @@ -318,7 +318,7 @@ def test_any_value(self, table): ) expected = [(1, 1), (1, 1), (1, 1), (2, 11), (2, 11), (3, 5), (3, 5), (3, 5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_arg_max(self, table): result = ( @@ -329,7 +329,7 @@ 
def test_arg_max(self, table): ) expected = [(1, 3), (1, 3), (1, 3), (2, -1), (2, -1), (3, -2), (3, -2), (3, -2)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_arg_min(self, table): result = ( @@ -340,7 +340,7 @@ def test_arg_min(self, table): ) expected = [(1, 2), (1, 2), (1, 2), (2, 4), (2, 4), (3, 0), (3, 0), (3, 0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_avg(self, table): result = [ @@ -359,7 +359,7 @@ def test_avg(self, table): ] expected = [(1, 1.0), (1, 1.0), (1, 1.33), (2, 11.0), (2, 10.5), (3, 5.0), (3, 2.0), (3, 2.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bit_and(self, table): result = ( @@ -374,7 +374,7 @@ def test_bit_and(self, table): ) expected = [(1, 1), (1, 1), (1, 0), (2, 11), (2, 10), (3, 5), (3, 5), (3, 5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bit_or(self, table): result = ( @@ -389,7 +389,7 @@ def test_bit_or(self, table): ) expected = [(1, 1), (1, 1), (1, 3), (2, 11), (2, 11), (3, 5), (3, -1), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bit_xor(self, table): result = ( @@ -404,7 +404,7 @@ def test_bit_xor(self, table): ) expected = [(1, 1), (1, 0), (1, 2), (2, 11), (2, 1), (3, 5), (3, -6), (3, -6)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bitstring_agg(self, table): with pytest.raises(duckdb.BinderException, match="Could not retrieve required statistics"): @@ -436,7 +436,7 @@ def test_bitstring_agg(self, table): (3, "1000001000000"), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bool_and(self, table): result = ( @@ -447,7 +447,7 @@ def test_bool_and(self, table): ) expected = [(1, True), (1, True), (1, True), (2, True), (2, True), (3, False), (3, False), (3, False)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_bool_or(self, table): result = ( @@ -458,7 +458,7 @@ def test_bool_or(self, table): ) expected = [(1, True), (1, True), (1, True), (2, True), (2, True), (3, True), (3, True), (3, True)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_count(self, table): result = ( @@ -473,7 +473,7 @@ def test_count(self, table): ) expected = [(1, 1), (1, 2), (1, 3), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_favg(self, table): result = [ @@ -489,7 +489,7 @@ def test_favg(self, table): ] expected = [(1, 0.21), (1, 0.38), (1, 0.25), (2, 10.45), (2, 5.24), (3, 9.87), (3, 11.59), (3, 9.92)] 
assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_fsum(self, table): result = [ @@ -505,7 +505,7 @@ def test_fsum(self, table): ] expected = [(1, 0.21), (1, 0.75), (1, 0.75), (2, 10.45), (2, 10.49), (3, 9.87), (3, 23.19), (3, 29.75)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.skip(reason="geomean is not supported from a windowing context") def test_geomean(self, table): @@ -533,7 +533,7 @@ def test_histogram(self, table): (3, {-1: 1, 5: 1}), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_list(self, table): result = ( @@ -557,7 +557,7 @@ def test_list(self, table): (3, [5, -1, None]), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_max(self, table): result = ( @@ -572,7 +572,7 @@ def test_max(self, table): ) expected = [(1, 1), (1, 1), (1, 2), (2, 11), (2, 11), (3, 5), (3, 5), (3, 5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_min(self, table): result = ( @@ -587,7 +587,7 @@ def test_min(self, table): ) expected = [(1, 1), (1, 1), (1, 1), (2, 11), (2, 10), (3, 5), (3, -1), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_product(self, table): result = ( @@ -602,7 +602,7 @@ def test_product(self, table): ) expected = [(1, 1), (1, 1), (1, 2), (2, 11), (2, 110), (3, 5), (3, -5), (3, -5)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_string_agg(self, table): result = ( @@ -618,7 +618,7 @@ def test_string_agg(self, table): ) expected = [(1, "e"), (1, "e/h"), (1, "e/h/l"), (2, "o"), (2, "o/l"), (3, "wor"), (3, "wor/,"), (3, "wor/,/ld")] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_sum(self, table): result = ( @@ -633,7 +633,7 @@ def test_sum(self, table): ) expected = [(1, 1), (1, 2), (1, 4), (2, 11), (2, 21), (3, 5), (3, 4), (3, 4)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_median(self, table): result = ( @@ -648,7 +648,7 @@ def test_median(self, table): ) expected = [(1, 1.0), (1, 1.0), (1, 1.0), (2, 11.0), (2, 10.5), (3, 5.0), (3, 2.0), (3, 2.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_mode(self, table): result = ( @@ -663,7 +663,7 @@ def test_mode(self, table): ) expected = [(1, 2), (1, 2), (1, 1), (2, 10), (2, 10), (3, None), (3, -1), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_quantile_cont(self, table): result = ( @@ -678,7 
+678,7 @@ def test_quantile_cont(self, table): ) expected = [(1, 2.0), (1, 1.5), (1, 1.0), (2, 10.0), (2, 10.5), (3, None), (3, -1.0), (3, 2.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = [ (r[0], [round(x, 2) for x in r[1]] if r[1] is not None else None) for r in table.quantile_cont( @@ -702,7 +702,7 @@ def test_quantile_cont(self, table): (3, [0.2, 2.0]), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["quantile_disc", "quantile"]) def test_quantile_disc(self, table, f): @@ -718,7 +718,7 @@ def test_quantile_disc(self, table, f): ) expected = [(1, 2), (1, 1), (1, 1), (2, 10), (2, 10), (3, None), (3, -1), (3, -1)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) result = ( getattr(table, f)( "v", @@ -741,7 +741,7 @@ def test_quantile_disc(self, table, f): (3, [-1, 5]), ] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_stddev_pop(self, table): result = [ @@ -757,7 +757,7 @@ def test_stddev_pop(self, table): ] expected = [(1, 0.0), (1, 0.5), (1, 0.47), (2, 0.0), (2, 0.5), (3, None), (3, 0.0), (3, 3.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["stddev_samp", "stddev", "std"]) def test_stddev_samp(self, table, f): @@ -774,7 +774,7 @@ def test_stddev_samp(self, table, f): ] expected = [(1, None), (1, 0.71), (1, 0.58), (2, None), (2, 0.71), (3, None), (3, None), (3, 4.24)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) def test_var_pop(self, table): result = [ @@ -790,7 +790,7 @@ def test_var_pop(self, table): ] expected = [(1, 0.0), (1, 0.25), (1, 0.22), (2, 0.0), (2, 0.25), (3, None), (3, 0.0), (3, 9.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) @pytest.mark.parametrize("f", ["var_samp", "variance", "var"]) def test_var_samp(self, table, f): @@ -807,4 +807,4 @@ def test_var_samp(self, table, f): ] expected = [(1, None), (1, 0.5), (1, 0.33), (2, None), (2, 0.5), (3, None), (3, None), (3, 18.0)] assert len(result) == len(expected) - assert all(r == e for r, e in zip(result, expected)) + assert all(r == e for r, e in zip(result, expected, strict=False)) diff --git a/tests/fast/spark/test_spark_functions_numeric.py b/tests/fast/spark/test_spark_functions_numeric.py index 98966548..ef24c676 100644 --- a/tests/fast/spark/test_spark_functions_numeric.py +++ b/tests/fast/spark/test_spark_functions_numeric.py @@ -294,7 +294,7 @@ def test_corr(self, spark): a = range(N) b = [2 * x for x in range(N)] # Have to use a groupby to test this as agg is not yet implemented without - df = spark.createDataFrame(zip(a, b, ["group1"] * N), ["a", "b", "g"]) + df = spark.createDataFrame(zip(a, b, ["group1"] * N, strict=False), ["a", "b", "g"]) res = df.groupBy("g").agg(sf.corr("a", "b").alias("c")).collect() assert pytest.approx(res[0].c) == 1 diff 
--git a/tests/fast/test_map.py b/tests/fast/test_map.py index 622095c2..2209fe1b 100644 --- a/tests/fast/test_map.py +++ b/tests/fast/test_map.py @@ -122,11 +122,14 @@ def mapper(x): dates = x["date"].to_numpy("datetime64[us]") days = x["days_to_add"].to_numpy("int") x["result1"] = pd.Series( - [pd.to_datetime(y[0]).date() + timedelta(days=y[1].item()) for y in zip(dates, days)], + [pd.to_datetime(y[0]).date() + timedelta(days=y[1].item()) for y in zip(dates, days, strict=False)], dtype="datetime64[us]", ) x["result2"] = pd.Series( - [pd.to_datetime(y[0]).date() + timedelta(days=-y[1].item()) for y in zip(dates, days)], + [ + pd.to_datetime(y[0]).date() + timedelta(days=-y[1].item()) + for y in zip(dates, days, strict=False) + ], dtype="datetime64[us]", ) return x diff --git a/tests/fast/test_type.py b/tests/fast/test_type.py index d8145166..17cec9e6 100644 --- a/tests/fast/test_type.py +++ b/tests/fast/test_type.py @@ -1,5 +1,4 @@ import sys -from typing import Optional, Union import pytest @@ -138,7 +137,7 @@ def test_implicit_convert_from_builtin_type(self): res = duckdb.list_type(list[dict[str, dict[list[str], str]]]) assert str(res.child) == "MAP(VARCHAR, MAP(VARCHAR[], VARCHAR))[]" - res = duckdb.list_type(list[Union[str, int]]) + res = duckdb.list_type(list[str | int]) assert str(res.child) == "UNION(u1 VARCHAR, u2 BIGINT)[]" def test_implicit_convert_from_numpy(self, duckdb_cursor): @@ -227,21 +226,21 @@ def test_hash_method(self): # NOTE: we can support this, but I don't think going through hoops for an outdated version of python is worth it @pytest.mark.skipif(sys.version_info < (3, 9), reason="python3.7 does not store Optional[..] in a recognized way") def test_optional(self): - type = DuckDBPyType(Optional[str]) + type = DuckDBPyType(str | None) assert type == "VARCHAR" - type = DuckDBPyType(Optional[Union[int, bool]]) + type = DuckDBPyType(int | bool | None) assert type == "UNION(u1 BIGINT, u2 BOOLEAN)" - type = DuckDBPyType(Optional[list[int]]) + type = DuckDBPyType(list[int] | None) assert type == "BIGINT[]" - type = DuckDBPyType(Optional[dict[int, str]]) + type = DuckDBPyType(dict[int, str] | None) assert type == "MAP(BIGINT, VARCHAR)" - type = DuckDBPyType(Optional[dict[Optional[int], Optional[str]]]) + type = DuckDBPyType(dict[int | None, str | None] | None) assert type == "MAP(BIGINT, VARCHAR)" - type = DuckDBPyType(Optional[dict[Optional[int], Optional[str]]]) + type = DuckDBPyType(dict[int | None, str | None] | None) assert type == "MAP(BIGINT, VARCHAR)" - type = DuckDBPyType(Optional[Union[Optional[str], Optional[bool]]]) + type = DuckDBPyType(str | None | bool) assert type == "UNION(u1 VARCHAR, u2 BOOLEAN)" - type = DuckDBPyType(Union[str, None]) + type = DuckDBPyType(str | None) assert type == "VARCHAR" @pytest.mark.skipif(sys.version_info < (3, 10), reason="'str | None' syntax requires Python 3.10 or higher") diff --git a/tests/fast/udf/test_null_filtering.py b/tests/fast/udf/test_null_filtering.py index 8bf2ce73..33ae208c 100644 --- a/tests/fast/udf/test_null_filtering.py +++ b/tests/fast/udf/test_null_filtering.py @@ -180,14 +180,14 @@ class TestUDFNullFiltering: ) @pytest.mark.parametrize("udf_type", ["arrow", "native"]) def test_null_filtering(self, duckdb_cursor, table_data: dict, test_type: Candidate, udf_type): - null_count = sum([1 for x in list(zip(*table_data.values())) if any(y is None for y in x)]) + null_count = sum([1 for x in list(zip(*table_data.values(), strict=False)) if any(y is None for y in x)]) row_count = len(table_data) table_data = { key: 
[None if not x else test_type.variant_one if x == "x" else test_type.variant_two for x in value] for key, value in table_data.items() } - tuples = list(zip(*table_data.values())) + tuples = list(zip(*table_data.values(), strict=False)) query = construct_query(tuples) parameters = construct_parameters(tuples, test_type.type) rel = duckdb_cursor.sql(query + " t(a, b, c)", params=parameters) @@ -210,7 +210,7 @@ def create_parameters(table_data, dbtype): result = duckdb_cursor.sql(query).fetchall() expected_output = [ - (t[0],) if not any(x is None for x in t) else (None,) for t in list(zip(*table_data.values())) + (t[0],) if not any(x is None for x in t) else (None,) for t in list(zip(*table_data.values(), strict=False)) ] assert result == expected_output assert len(result) == row_count From 23d3cf971831a0e98badc83cf448ccbd1adfc535 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 12:16:26 +0100 Subject: [PATCH 34/37] Bump submodule --- external/duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/duckdb b/external/duckdb index c65f4e48..a1cac11e 160000 --- a/external/duckdb +++ b/external/duckdb @@ -1 +1 @@ -Subproject commit c65f4e48a300bfd49a5d799195e4100f30637cf0 +Subproject commit a1cac11ecf71f867f107aa8d13d9f1bec16183df From f15013424511fc7c4b866956b3e6ff27fc2d762a Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 12:17:27 +0100 Subject: [PATCH 35/37] auto fixes --- duckdb/experimental/spark/errors/error_classes.py | 2 +- duckdb/experimental/spark/errors/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/duckdb/experimental/spark/errors/error_classes.py b/duckdb/experimental/spark/errors/error_classes.py index 22055cbf..c43a5f18 100644 --- a/duckdb/experimental/spark/errors/error_classes.py +++ b/duckdb/experimental/spark/errors/error_classes.py @@ -1,4 +1,4 @@ -# ruff: noqa: D100, E501 +# ruff: noqa: E501 # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/duckdb/experimental/spark/errors/utils.py b/duckdb/experimental/spark/errors/utils.py index 8a71f3b0..f2962fc8 100644 --- a/duckdb/experimental/spark/errors/utils.py +++ b/duckdb/experimental/spark/errors/utils.py @@ -1,4 +1,4 @@ -# # noqa: D100 +# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. 
From ac97da340783d4a5422c0038787340138a35d367 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 13:03:44 +0100 Subject: [PATCH 36/37] skip pandas arrow tests if pyarrow not available --- tests/fast/pandas/test_pandas_arrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/fast/pandas/test_pandas_arrow.py b/tests/fast/pandas/test_pandas_arrow.py index bab23eec..ed387d52 100644 --- a/tests/fast/pandas/test_pandas_arrow.py +++ b/tests/fast/pandas/test_pandas_arrow.py @@ -6,6 +6,7 @@ import duckdb pd = pytest.importorskip("pandas", "2.0.0") +pytest.importorskip("pyarrow") from pandas.api.types import is_integer_dtype # noqa: E402 From 11b65f068ae67f4b66472e72a495d1dc69e40872 Mon Sep 17 00:00:00 2001 From: Evert Lammerts Date: Tue, 27 Jan 2026 14:09:52 +0100 Subject: [PATCH 37/37] move from typing.Self to typing_extensions.Self for py310 --- duckdb/experimental/spark/_globals.py | 2 +- duckdb/experimental/spark/sql/types.py | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/duckdb/experimental/spark/_globals.py b/duckdb/experimental/spark/_globals.py index 3dd7232f..23a6f171 100644 --- a/duckdb/experimental/spark/_globals.py +++ b/duckdb/experimental/spark/_globals.py @@ -33,7 +33,7 @@ def foo(arg=pyducdkb.spark._NoValue): __ALL__ = ["_NoValue"] -from typing import Self +from typing_extensions import Self # Disallow reloading this module so as to preserve the identities of the # classes defined here. diff --git a/duckdb/experimental/spark/sql/types.py b/duckdb/experimental/spark/sql/types.py index 34c9b57b..2a6a94b6 100644 --- a/duckdb/experimental/spark/sql/types.py +++ b/duckdb/experimental/spark/sql/types.py @@ -9,16 +9,9 @@ from builtins import tuple from collections.abc import Iterator, Mapping from types import MappingProxyType -from typing import ( - Any, - ClassVar, - NoReturn, - Self, - TypeVar, - Union, - cast, - overload, -) +from typing import Any, ClassVar, NoReturn, TypeVar, Union, cast, overload + +from typing_extensions import Self import duckdb from duckdb.sqltypes import DuckDBPyType
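For readers of PATCH 37/37 above: typing.Self was only added to the standard library in Python 3.11, while the typing_extensions package provides the same symbol on older interpreters, which is why the imports move to typing_extensions for Python 3.10 support. The snippet below is a minimal, hypothetical sketch of that pattern; the ExampleField class is illustrative only (it is not taken from the patched modules) and it assumes typing_extensions is installed.

# Hypothetical sketch of the typing_extensions.Self pattern adopted in PATCH 37/37.
# typing.Self requires Python >= 3.11; typing_extensions backports it, so this
# import keeps the annotation working on Python 3.10.
from typing_extensions import Self


class ExampleField:
    # Illustrative builder-style class, not one of the patched Spark classes.
    def __init__(self, name: str) -> None:
        self.name = name
        self.nullable = True

    def with_nullable(self, nullable: bool) -> Self:
        # Annotating the return type as Self keeps return types precise for subclasses.
        self.nullable = nullable
        return self


print(ExampleField("id").with_nullable(False).nullable)  # prints: False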