From bc62f7becbb2ba3ae91aece13ae039eddb215dbd Mon Sep 17 00:00:00 2001
From: Yuming Wang
Date: Mon, 16 Feb 2026 09:09:51 +0800
Subject: [PATCH] HG-3394: Cache FileStatus in Footer to reduce redundant NameNode RPC calls

---
 .../org/apache/parquet/hadoop/Footer.java     |  31 ++
 .../parquet/hadoop/ParquetFileReader.java     |   4 +-
 .../parquet/hadoop/ParquetInputFormat.java    |   8 +-
 .../hadoop/TestFooterFileStatusCaching.java   | 280 ++++++++++++++++++
 4 files changed, 321 insertions(+), 2 deletions(-)
 create mode 100644 parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestFooterFileStatusCaching.java

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Footer.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Footer.java
index 86505229eb..63259a5d5f 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Footer.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/Footer.java
@@ -18,6 +18,7 @@
  */
 package org.apache.parquet.hadoop;
 
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 
@@ -30,10 +31,30 @@ public class Footer {
 
   private final ParquetMetadata parquetMetadata;
 
+  private final FileStatus fileStatus;
+
+  /**
+   * Constructor for backwards compatibility
+   *
+   * @param file the file path
+   * @param parquetMetadata the parquet metadata
+   */
   public Footer(Path file, ParquetMetadata parquetMetadata) {
+    this(file, parquetMetadata, null);
+  }
+
+  /**
+   * Constructor with FileStatus to avoid redundant getFileStatus RPC calls
+   *
+   * @param file the file path
+   * @param parquetMetadata the parquet metadata
+   * @param fileStatus the file status (may be null for backwards compatibility)
+   */
+  public Footer(Path file, ParquetMetadata parquetMetadata, FileStatus fileStatus) {
     super();
     this.file = file;
     this.parquetMetadata = parquetMetadata;
+    this.fileStatus = fileStatus;
   }
 
   public Path getFile() {
@@ -44,6 +65,16 @@ public ParquetMetadata getParquetMetadata() {
     return parquetMetadata;
   }
 
+  /**
+   * Get the FileStatus associated with this footer.
+   * This is used to avoid redundant getFileStatus RPC calls to the NameNode.
+   *
+   * @return the FileStatus, or null if not available
+   */
+  public FileStatus getFileStatus() {
+    return fileStatus;
+  }
+
   @Override
   public String toString() {
     return "Footer{" + file + ", " + parquetMetadata + "}";
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 551b1bf6c7..aa5676cff6 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -297,7 +297,9 @@ public static List