diff --git a/eng/pipelines/scripts/Get-Test-Logs.ps1 b/eng/pipelines/scripts/Get-Test-Logs.ps1
index d840a71ca2ae..d4afe231a76a 100644
--- a/eng/pipelines/scripts/Get-Test-Logs.ps1
+++ b/eng/pipelines/scripts/Get-Test-Logs.ps1
@@ -1,13 +1,19 @@
<#
.SYNOPSIS
-Captures any test.log files in the build directory and moves them to a staging directory for artifact publishing.
+Captures any test.log files, JVM crash logs, Surefire dumpstream files, and jstack dumps produced by the build
+and compresses them into a zip archive in a staging directory for artifact publishing.
.DESCRIPTION
-This script is used to capture any test.log files in the build directory and move them to a staging directory for
-artifact publishing. It also sets a pipeline variable to indicate whether any test.log files were found.
+This script collects diagnostic files from the build directory and compresses them into a zip archive in a staging
+directory for artifact publishing. It also sets the HAS_TROUBLESHOOTING pipeline variable when any diagnostic files are found.
+Collected files include:
+ - *test.log (test logs)
+ - hs_err_pid*.log (JVM crash reports)
+ - *.dumpstream (Surefire forked JVM crash/corruption reports)
+ - jstack-dumps.log (periodic jstack thread dumps from the Java process monitor)
.PARAMETER StagingDirectory
-The directory where the test.log files will be moved to.
+The directory where the diagnostic files will be moved to.
.PARAMETER TestLogsArtifactName
The name of the artifact to be created.
@@ -22,11 +28,21 @@ param(
)
$testLogs = Get-ChildItem -Path . -Recurse -Filter *test.log -File -Depth 4
+$jvmCrashLogs = Get-ChildItem -Path . -Recurse -Filter hs_err_pid*.log -File -Depth 6
+$dumpstreamFiles = Get-ChildItem -Path . -Recurse -Filter *.dumpstream -File -Depth 6
+$jstackDumps = Get-ChildItem -Path "$StagingDirectory/troubleshooting" -Filter jstack-dumps.log -File -ErrorAction SilentlyContinue
-if ($testLogs.Count -gt 0) {
+$allFiles = @()
+if ($testLogs) { $allFiles += $testLogs }
+if ($jvmCrashLogs) { $allFiles += $jvmCrashLogs }
+if ($dumpstreamFiles) { $allFiles += $dumpstreamFiles }
+if ($jstackDumps) { $allFiles += $jstackDumps }
+
+if ($allFiles.Count -gt 0) {
if (-not (Test-Path "$StagingDirectory/troubleshooting")) {
New-Item -ItemType Directory -Path "$StagingDirectory/troubleshooting" | Out-Null
}
Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true"
- Compress-Archive -Path $testLogs -DestinationPath "$StagingDirectory/troubleshooting/$TestLogsArtifactName.zip"
+ Write-Host "Found $($testLogs.Count) test log(s), $($jvmCrashLogs.Count) JVM crash log(s), $($dumpstreamFiles.Count) dumpstream file(s), $($jstackDumps.Count) jstack dump(s)"
+ Compress-Archive -Path $allFiles -DestinationPath "$StagingDirectory/troubleshooting/$TestLogsArtifactName.zip"
}
diff --git a/eng/pipelines/scripts/Monitor-Java-Processes.ps1 b/eng/pipelines/scripts/Monitor-Java-Processes.ps1
new file mode 100644
index 000000000000..283db7b9f678
--- /dev/null
+++ b/eng/pipelines/scripts/Monitor-Java-Processes.ps1
@@ -0,0 +1,105 @@
+<#
+.SYNOPSIS
+Monitors Java processes by taking periodic jstack thread dumps.
+
+.DESCRIPTION
+This script is meant to run in the background. Every IntervalSeconds it appends a snapshot to
+<StagingDirectory>/troubleshooting/jstack-dumps.log containing the Java process list ('ps' on Linux/macOS,
+Get-Process elsewhere), the output of 'jps -l', and, on Linux/macOS only, a 'jstack' thread dump of every
+Java process found. This is useful for diagnosing CI pipeline hangs caused by deadlocked or stuck Java processes.
+
+.PARAMETER StagingDirectory
+The directory under which the 'troubleshooting' folder and jstack-dumps.log are written.
+
+.PARAMETER IntervalSeconds
+The interval in seconds between captures. Default is 120 (2 minutes).
+
+.PARAMETER DurationMinutes
+The maximum duration in minutes to run the monitor. Default is 55 minutes.
+#>
+
+param(
+    [Parameter(Mandatory = $true)]
+    [string]$StagingDirectory,
+
+    [Parameter(Mandatory = $false)]
+    [int]$IntervalSeconds = 120,
+
+    [Parameter(Mandatory = $false)]
+    [int]$DurationMinutes = 55
+)
+
+$troubleshootingDir = "$StagingDirectory/troubleshooting"
+if (-not (Test-Path $troubleshootingDir)) {
+    New-Item -ItemType Directory -Path $troubleshootingDir | Out-Null
+}
+
+$outputFile = "$troubleshootingDir/jstack-dumps.log"
+$endTime = (Get-Date).AddMinutes($DurationMinutes)
+# Resolve the JDK tools once; fall back to a PATH lookup when JAVA_HOME is not set.
+$javaHome = $env:JAVA_HOME
+$jpsPath = if ($javaHome) { "$javaHome/bin/jps" } else { "jps" }
+$jstackPath = if ($javaHome) { "$javaHome/bin/jstack" } else { "jstack" }
+
+Add-Content -Path $outputFile -Value "Monitor started at $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
+Add-Content -Path $outputFile -Value "JAVA_HOME=$javaHome"
+
+while ((Get-Date) -lt $endTime) {
+    # Sleep first: the first snapshot is taken one interval after start, and the last may end past $endTime.
+    Start-Sleep -Seconds $IntervalSeconds
+    $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
+    Add-Content -Path $outputFile -Value "`n========== Snapshot at $timestamp =========="
+
+    # Use 'ps' to find Java processes (more reliable than jps on CI agents)
+    try {
+        if ($IsLinux -or $IsMacOS) {
+            $psOutput = bash -c "ps aux | grep '[j]ava'" 2>&1
+        } else {
+            $psOutput = Get-Process -Name java -ErrorAction SilentlyContinue | Format-Table Id, CPU, WorkingSet64, CommandLine -AutoSize | Out-String
+        }
+        Add-Content -Path $outputFile -Value "`n--- Java processes (ps) ---"
+        if ($psOutput) {
+            Add-Content -Path $outputFile -Value $psOutput
+        } else {
+            Add-Content -Path $outputFile -Value "(no Java processes found)"
+        }
+    } catch {
+        Add-Content -Path $outputFile -Value "Error listing processes: $_"
+    }
+
+    # Also try jps for comparison
+    try {
+        $jpsOutput = & $jpsPath -l 2>&1
+        Add-Content -Path $outputFile -Value "`n--- Java processes (jps -l) ---"
+        Add-Content -Path $outputFile -Value $jpsOutput
+    } catch {
+        Add-Content -Path $outputFile -Value "Error running jps: $_"
+    }
+
+    # Extract PIDs from ps output and take jstack dumps (Linux/macOS only: this relies on bash, ps and awk).
+    if ($IsLinux -or $IsMacOS) {
+        try {
+            # `$1 is backtick-escaped so that awk, not PowerShell, expands it.
+            $psPids = bash -c "ps -eo pid,comm | grep '[j]ava' | awk '{print `$1}'" 2>&1
+            # Keep numeric lines only (2>&1 may mix in error text). The loop variable is deliberately not
+            # named $pid: $PID is a read-only automatic variable and assigning to it throws.
+            foreach ($javaPid in @($psPids | ForEach-Object { "$_".Trim() } | Where-Object { $_ -match '^\d+$' })) {
+                Add-Content -Path $outputFile -Value "`n--- jstack for PID $javaPid ---"
+                try {
+                    $stackTrace = & $jstackPath $javaPid 2>&1
+                    Add-Content -Path $outputFile -Value $stackTrace
+                } catch {
+                    Add-Content -Path $outputFile -Value "Failed to get jstack for PID $javaPid : $_"
+                }
+            }
+        } catch {
+            Add-Content -Path $outputFile -Value "Error extracting PIDs: $_"
+        }
+    }
+}
+
+Add-Content -Path $outputFile -Value "`nMonitor finished at $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
+# Mark that we have troubleshooting artifacts
+if (Test-Path $outputFile) {
+    Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true"
+}
diff --git a/sdk/parents/azure-client-sdk-parent/pom.xml b/sdk/parents/azure-client-sdk-parent/pom.xml
index fb194a4906c6..c80d6d63c1b9 100644
--- a/sdk/parents/azure-client-sdk-parent/pom.xml
+++ b/sdk/parents/azure-client-sdk-parent/pom.xml
@@ -894,6 +894,7 @@
debug
1
+ 1800
false
${defaultSurefireArgLine}
@@ -944,6 +945,7 @@
debug
1
+ 1800
false
${defaultFailsafeArgLine}
diff --git a/sdk/spring/ci.yml b/sdk/spring/ci.yml
index b91fedb2a87a..b76556b50240 100644
--- a/sdk/spring/ci.yml
+++ b/sdk/spring/ci.yml
@@ -254,6 +254,17 @@ extends:
template: ../../eng/pipelines/templates/stages/archetype-sdk-client.yml
parameters:
ServiceDirectory: spring
+ PreBuildSteps:
+ - bash: |
+ nohup pwsh -File "$(Build.SourcesDirectory)/eng/pipelines/scripts/Monitor-Java-Processes.ps1" \
+ -StagingDirectory "$(System.DefaultWorkingDirectory)" \
+ -IntervalSeconds 180 \
+ -DurationMinutes 55 \
+ > /dev/null 2>&1 &
+ echo "Java process monitor started in background (PID: $!)"
+ displayName: 'Start Java process monitor (background)'
+ continueOnError: true
+ condition: always()
Artifacts:
- name: azure-spring-data-cosmos
groupId: com.azure
diff --git a/sdk/spring/pom.xml b/sdk/spring/pom.xml
index d85cf0f56bc8..be007e1921d0 100644
--- a/sdk/spring/pom.xml
+++ b/sdk/spring/pom.xml
@@ -136,6 +136,7 @@
azure-spring-data-cosmos
+
monitor