Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions python/pyarrow/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,32 @@ def __getattr__(name):
)


def _is_likely_uri(path):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mention on the issue:

Working on a fix that ports the C++ IsLikelyUri() heuristic to Python and uses it in _resolve_filesystem_and_path()

Could you check if it's possible to expose IsLikelyUri through Python so ultimately the logic is only implemented once and is always doing the same?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think calling the C++ function would be preferable. The change to expose the functionallity could look like:

diff --git i/cpp/src/arrow/filesystem/filesystem.h w/cpp/src/arrow/filesystem/filesystem.h
index 3a47eb62f5..85ee3ff114 100644
--- i/cpp/src/arrow/filesystem/filesystem.h
+++ w/cpp/src/arrow/filesystem/filesystem.h
@@ -23,6 +23,7 @@
 #include <iosfwd>
 #include <memory>
 #include <string>
+#include <string_view>
 #include <utility>
 #include <vector>

@@ -529,6 +530,12 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem {
 /// The user is responsible for synchronization of calls to this function.
 void EnsureFinalized();

+/// \brief Return whether a path string is likely a URI.
+///
+/// This heuristic is conservative and may return false for malformed URIs.
+ARROW_EXPORT
+bool IsLikelyUri(std::string_view path);
+
 /// \defgroup filesystem-factories Functions for creating FileSystem instances
 ///
 /// @{
diff --git i/cpp/src/arrow/filesystem/path_util.cc w/cpp/src/arrow/filesystem/path_util.cc
index dc82afd07e..cdc1807781 100644
--- i/cpp/src/arrow/filesystem/path_util.cc
+++ w/cpp/src/arrow/filesystem/path_util.cc
@@ -28,8 +28,28 @@
 #include "arrow/util/uri.h"

 namespace arrow {
-
 namespace fs {
+
+bool IsLikelyUri(std::string_view v) {
+  if (v.empty() || v[0] == '/') {
+    return false;
+  }
+  const auto pos = v.find_first_of(':');
+  if (pos == v.npos) {
+    return false;
+  }
+  if (pos < 2) {
+    // One-letter URI schemes don't officially exist, perhaps a Windows drive letter?
+    return false;
+  }
+  if (pos > 36) {
+    // The largest IANA-registered URI scheme is "microsoft.windows.camera.multipicker"
+    // with 36 characters.
+    return false;
+  }
+  return ::arrow::util::IsValidUriScheme(v.substr(0, pos));
+}
+
 namespace internal {

 // XXX How does this encode Windows UNC paths?
@@ -337,26 +357,6 @@ bool IsEmptyPath(std::string_view v) {
   return true;
 }

diff --git i/cpp/src/arrow/filesystem/path_util.h w/cpp/src/arrow/filesystem/path_util.h
index d49d9d2efa..2f767fc7dc 100644
--- i/cpp/src/arrow/filesystem/path_util.h
+++ w/cpp/src/arrow/filesystem/path_util.h
@@ -27,6 +27,10 @@

 namespace arrow {
 namespace fs {
+
+ARROW_EXPORT
+bool IsLikelyUri(std::string_view s);
+
 namespace internal {

 constexpr char kSep = '/';
@@ -159,8 +163,7 @@ std::string ToSlashes(std::string_view s);
 ARROW_EXPORT
 bool IsEmptyPath(std::string_view s);

-ARROW_EXPORT
-bool IsLikelyUri(std::string_view s);
+inline bool IsLikelyUri(std::string_view s) { return ::arrow::fs::IsLikelyUri(s); }

 class ARROW_EXPORT Globber {
  public:
diff --git i/python/pyarrow/_fs.pyx w/python/pyarrow/_fs.pyx
index 0739b6acba..0b8ab7480f 100644
--- i/python/pyarrow/_fs.pyx
+++ w/python/pyarrow/_fs.pyx
@@ -84,6 +84,14 @@ def _file_type_to_string(ty):
     return f"{ty.__class__.__name__}.{ty._name_}"


+def _is_likely_uri(path):
+    cdef c_string c_path
+    if not isinstance(path, str):
+        raise TypeError("Path must be a string")
+    c_path = tobytes(path)
+    return CIsLikelyUri(cpp_string_view(c_path))
+
+
 cdef class FileInfo(_Weakrefable):
     """
     FileSystem entry info.
diff --git i/python/pyarrow/fs.py w/python/pyarrow/fs.py
index 41308cd60c..b0acb97d94 100644
--- i/python/pyarrow/fs.py
+++ w/python/pyarrow/fs.py
@@ -31,6 +31,7 @@ from pyarrow._fs import (  # noqa
     _MockFileSystem,
     FileSystemHandler,
     PyFileSystem,
+    _is_likely_uri,
     _copy_files,
     _copy_files_selector,
 )
@@ -82,32 +83,6 @@ def __getattr__(name):
     )

diff --git i/python/pyarrow/includes/libarrow_fs.pxd w/python/pyarrow/includes/libarrow_fs.pxd
index af01c47c8c..48604972d2 100644
--- i/python/pyarrow/includes/libarrow_fs.pxd
+++ w/python/pyarrow/includes/libarrow_fs.pxd
@@ -92,6 +92,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil:
     CResult[shared_ptr[CFileSystem]] CFileSystemFromUriOrPath \
         "arrow::fs::FileSystemFromUriOrPath"(const c_string& uri,
                                              c_string* out_path)
+    c_bool CIsLikelyUri "arrow::fs::IsLikelyUri"(cpp_string_view path)

     cdef cppclass CFileSystemGlobalOptions \
             "arrow::fs::FileSystemGlobalOptions":
diff --git i/python/pyarrow/tests/test_fs.py w/python/pyarrow/tests/test_fs.py
index 537ceb9999..31f3f8a8ca 100644
--- i/python/pyarrow/tests/test_fs.py
+++ w/python/pyarrow/tests/test_fs.py
@@ -1715,6 +1715,8 @@ def test_is_likely_uri():
     assert not _is_likely_uri("C:/Users/foo")
     assert not _is_likely_uri("3bucket://key")       # scheme starts with digit
     assert not _is_likely_uri("-scheme://key")        # scheme starts with dash
+    assert not _is_likely_uri("schéme://bucket/key")  # non-ASCII in scheme
+    assert not _is_likely_uri("漢字://bucket/key")     # non-ASCII in scheme

Does that make sense?

"""
Check if a string looks like a URI (has a valid scheme followed by ':').

This is a Python port of the C++ ``IsLikelyUri()`` heuristic in
``cpp/src/arrow/filesystem/path_util.cc``. It is intentionally
conservative: single-letter schemes are excluded (could be a Windows
drive letter) and the scheme must conform to RFC 3986.
"""
if not path or path[0] == '/':
return False
colon_pos = path.find(':')
if colon_pos < 0:
return False
# One-letter URI schemes don't officially exist; may be a Windows drive
if colon_pos < 2:
return False
# The largest IANA-registered scheme is 36 characters
if colon_pos > 36:
return False
scheme = path[:colon_pos]
if not scheme[0].isalpha():
return False
return all(c.isalnum() or c in ('+', '-', '.') for c in scheme[1:])


def _ensure_filesystem(filesystem, *, use_mmap=False):
if isinstance(filesystem, FileSystem):
return filesystem
Expand Down Expand Up @@ -174,13 +200,17 @@ def _resolve_filesystem_and_path(path, filesystem=None, *, memory_map=False):
filesystem, path = FileSystem.from_uri(path)
except ValueError as e:
msg = str(e)
if "empty scheme" in msg or "Cannot parse URI" in msg:
# neither an URI nor a locally existing path, so assume that
# local path was given and propagate a nicer file not found
# error instead of a more confusing scheme parsing error
if "empty scheme" in msg:
# No scheme at all — treat as a local path and propagate
# a nicer "file not found" error later.
pass
elif "Cannot parse URI" in msg and not _is_likely_uri(path):
# Path doesn't look like a URI (no valid scheme prefix),
# so treat it as a local path rather than surfacing a
# confusing URI-parsing error.
pass
else:
raise e
raise
else:
path = filesystem.normalize_path(path)

Expand Down
61 changes: 61 additions & 0 deletions python/pyarrow/tests/test_fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1695,6 +1695,67 @@ def test_filesystem_from_path_object(path):
assert path == p.resolve().absolute().as_posix()


def test_is_likely_uri():
"""Unit tests for the _is_likely_uri() heuristic."""
from pyarrow.fs import _is_likely_uri

# Valid URI schemes
assert _is_likely_uri("s3://bucket/key")
assert _is_likely_uri("gs://bucket/key")
assert _is_likely_uri("hdfs://host/path")
assert _is_likely_uri("file:///local/path")
assert _is_likely_uri("abfss://container@account/path")
assert _is_likely_uri("grpc+https://host:443")

# Not URIs — local paths, Windows drives, empty, etc.
assert not _is_likely_uri("")
assert not _is_likely_uri("/absolute/path")
assert not _is_likely_uri("relative/path")
assert not _is_likely_uri("C:\\Users\\foo") # single-letter → drive
assert not _is_likely_uri("C:/Users/foo")
assert not _is_likely_uri("3bucket://key") # scheme starts with digit
assert not _is_likely_uri("-scheme://key") # scheme starts with dash


def test_resolve_filesystem_and_path_uri_with_spaces():
"""
URIs with a recognised scheme but un-encoded spaces must raise
ValueError — NOT silently fall back to LocalFileSystem.
(GH-41365)
"""
from pyarrow.fs import _resolve_filesystem_and_path

# S3 URI with spaces should raise, not return LocalFileSystem
with pytest.raises(ValueError, match="Cannot parse URI"):
_resolve_filesystem_and_path("s3://bucket/path with space/file.parquet")

# GCS URI with spaces should also raise
with pytest.raises(ValueError, match="Cannot parse URI"):
_resolve_filesystem_and_path("gs://bucket/path with space/file.csv")

# abfss URI with spaces
with pytest.raises(ValueError, match="Cannot parse URI"):
_resolve_filesystem_and_path(
"abfss://container@account/dir with space/file"
)


def test_resolve_filesystem_and_path_local_with_spaces():
"""
Local paths (no scheme) with spaces should still resolve to
LocalFileSystem — they must NOT be confused with malformed URIs.
"""
from pyarrow.fs import _resolve_filesystem_and_path

# Absolute local path with spaces → LocalFileSystem
fs, path = _resolve_filesystem_and_path("/tmp/path with spaces/data")
assert isinstance(fs, LocalFileSystem)

# Non-existent absolute path → still LocalFileSystem
fs, path = _resolve_filesystem_and_path("/nonexistent/path")
assert isinstance(fs, LocalFileSystem)


@pytest.mark.s3
def test_filesystem_from_uri_s3(s3_server):
from pyarrow.fs import S3FileSystem
Expand Down
Loading