diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh index 21b6ff0b6f..285e58325c 100755 --- a/.github/scripts/run_case_optimization.sh +++ b/.github/scripts/run_case_optimization.sh @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then ngpus=1 fi -# Verify the venv Python interpreter exists (created by ./mfc.sh build) -if [ ! -x build/venv/bin/python3 ]; then - echo "ERROR: build/venv/bin/python3 not found." - echo "The MFC build venv may not have been created. Was the pre-build step successful?" - exit 1 -fi - benchmarks=( benchmarks/5eq_rk3_weno3_hllc/case.py benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -28,6 +21,30 @@ benchmarks=( benchmarks/igr/case.py ) +# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only; +# build case-optimized binaries here on the compute node before running. +# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job. +# +# Clean stale MFC target staging before building. On self-hosted CI runners, +# corrupted intermediate files from a prior failed build (e.g. CCE optcg crash) +# can persist and poison subsequent builds. Each case-opt config gets its own +# hash-named staging dir, but install dirs and other artifacts may be stale. +if [ "$job_cluster" != "phoenix" ]; then + # Clean stale MFC target dirs (hash-named) from prior builds, but + # preserve dependency dirs (hipfort, fftw, etc.) since the compute + # node has no internet to re-fetch them. 
+ echo "=== Cleaning stale MFC target staging/install ===" + find build/staging -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true + find build/install -maxdepth 1 -regex '.*/[0-9a-f]+' -type d -exec rm -rf {} + 2>/dev/null || true + + echo "=== Building case-optimized binaries on compute node ===" + for case in "${benchmarks[@]}"; do + echo "--- Building: $case ---" + ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8 + done + echo "=== All case-optimized binaries built ===" +fi + passed=0 failed=0 failed_cases="" diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7ce02c1e3f..87130dd116 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -68,7 +68,7 @@ jobs: flag: f device: gpu interface: acc - build_script: "bash .github/workflows/frontier/build.sh gpu acc bench" + build_script: "bash .github/workflows/frontier/build.sh gpu acc" - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix @@ -76,7 +76,7 @@ jobs: flag: f device: gpu interface: omp - build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" + build_script: "bash .github/workflows/frontier/build.sh gpu omp" - cluster: frontier_amd name: Oak Ridge | Frontier (AMD) group: phoenix @@ -84,17 +84,21 @@ jobs: flag: famd device: gpu interface: omp - build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" + build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp" continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }} runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} timeout-minutes: 480 steps: + - name: Clean stale output files + run: rm -f *.out + - name: Clone - PR uses: actions/checkout@v4 with: path: pr + clean: false - name: Clone - Master uses: actions/checkout@v4 @@ -102,8 +106,9 @@ jobs: repository: MFlowCode/MFC ref: master path: master + clean: false - - name: Setup & Build + - name: Fetch Dependencies if: 
matrix.build_script != '' timeout-minutes: 150 run: | diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 66d77cfd99..9522e3a043 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -21,19 +21,24 @@ if [ "$job_cluster" = "phoenix" ]; then trap 'rm -rf "$currentdir" || true' EXIT fi -# --- Build (if not pre-built on login node) --- -# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# --- Build --- +# Phoenix builds everything inside SLURM (no login-node build step). +# Frontier/Frontier AMD: deps already fetched on login node via --deps-only; +# source code is built here on the compute node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then source .github/scripts/clean-build.sh clean_build fi -if [ ! -d "build" ]; then - source .github/scripts/retry-build.sh - retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV +if [ "$job_cluster" = "frontier" ]; then + n_jobs=1 fi +source .github/scripts/retry-build.sh +retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 + # --- Bench cluster flag --- if [ "$job_cluster" = "phoenix" ]; then bench_cluster="phoenix-bench" diff --git a/.github/workflows/common/build.sh b/.github/workflows/common/build.sh new file mode 100755 index 0000000000..e15d93afd8 --- /dev/null +++ b/.github/workflows/common/build.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Build-only script for all clusters. +# Runs inside a SLURM job via submit-slurm-job.sh. +# Builds MFC without running tests (--dry-run). 
+# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster + +set -euo pipefail + +source .github/scripts/gpu-opts.sh +build_opts="$gpu_opts" + +# --- Phoenix TMPDIR setup --- +if [ "$job_cluster" = "phoenix" ]; then + tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build + currentdir=$tmpbuild/run-$(( RANDOM % 9000 )) + mkdir -p $tmpbuild + mkdir -p $currentdir + export TMPDIR=$currentdir + trap 'rm -rf "$currentdir" || true' EXIT +fi + +# --- Build --- +# Phoenix builds everything inside SLURM (no login-node build step). +# Frontier/Frontier AMD: deps already fetched on login node via --deps-only; +# source code is built here on the compute node. +# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled +# on a different microarchitecture. +if [ "$job_cluster" = "phoenix" ]; then + source .github/scripts/clean-build.sh + clean_build +fi + +source .github/scripts/retry-build.sh + +# Phoenix: smoke-test the syscheck binary to catch architecture mismatches +# (SIGILL from binaries compiled on a different compute node). +validate_cmd="" +if [ "$job_cluster" = "phoenix" ]; then + validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' +fi + +RETRY_VALIDATE_CMD="$validate_cmd" \ + retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index e155fd48f8..c1b951dfda 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Unified test script for all clusters. +# Test-only script for all clusters. # Runs inside a SLURM job via submit-slurm-job.sh. +# Assumes MFC is already built (by a prior build.sh SLURM job). 
# Expects env vars: $job_device, $job_interface, $job_shard, $job_cluster set -euo pipefail @@ -9,9 +10,6 @@ source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" # --- Phoenix TMPDIR setup --- -# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each -# spawning MPI processes, it fills up and ORTE session dir creation fails. -# Redirect TMPDIR to project storage, same as bench.sh. if [ "$job_cluster" = "phoenix" ]; then tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 9000 )) @@ -21,29 +19,6 @@ if [ "$job_cluster" = "phoenix" ]; then trap 'rm -rf "$currentdir" || true' EXIT fi -# --- Build (if not pre-built on login node) --- -# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. -# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh -# to avoid SIGILL from stale binaries compiled on a different microarchitecture. -if [ "$job_cluster" = "phoenix" ]; then - source .github/scripts/clean-build.sh - clean_build -fi - -if [ ! -d "build" ]; then - source .github/scripts/retry-build.sh - - # Phoenix: smoke-test the syscheck binary to catch architecture mismatches - # (SIGILL from binaries compiled on a different compute node). 
- validate_cmd="" - if [ "$job_cluster" = "phoenix" ]; then - validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' - fi - - RETRY_VALIDATE_CMD="$validate_cmd" \ - retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 -fi - # --- GPU detection and thread count --- device_opts="" rdma_opts="" @@ -88,4 +63,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then prune_flag="--only-changes" fi -./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster +./mfc.sh test -v --max-attempts 3 --no-build $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 5bd40999d7..cd289ef074 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -14,7 +14,6 @@ esac job_device=$1 job_interface=$2 -run_bench=$3 source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh clean_build source .github/scripts/retry-build.sh -if [ "$run_bench" == "bench" ]; then - retry_build ./mfc.sh build -j 8 $build_opts || exit 1 -else - retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1 -fi +retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 90ad965c52..f0d8128495 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -400,11 +400,14 @@ jobs: echo "Coverage cache: none available — full test suite will run" fi - - name: Build (login node) + - name: Fetch Dependencies if: matrix.cluster != 'phoenix' timeout-minutes: 60 run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ 
matrix.interface }} + - name: Build + run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/build.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }} + - name: Test run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }} @@ -421,23 +424,29 @@ jobs: if: always() id: log run: | - SLUG="test-${{ matrix.device }}-${{ matrix.interface }}" + SHARD_SUFFIX="" SHARD="${{ matrix.shard }}" if [ -n "$SHARD" ]; then - SLUG="${SLUG}-$(echo "$SHARD" | sed 's|/|-of-|')" + SHARD_SUFFIX="-$(echo "$SHARD" | sed 's|/|-of-|')" fi - echo "slug=${SLUG}" >> "$GITHUB_OUTPUT" + echo "build_slug=build-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT" + echo "test_slug=test-${{ matrix.device }}-${{ matrix.interface }}${SHARD_SUFFIX}" >> "$GITHUB_OUTPUT" - name: Print Logs if: always() - run: cat ${{ steps.log.outputs.slug }}.out + run: | + for f in ${{ steps.log.outputs.build_slug }}.out ${{ steps.log.outputs.test_slug }}.out; do + if [ -f "$f" ]; then echo "=== $f ==="; cat "$f"; fi # if-form: a missing log must not make this step exit non-zero + done - name: Archive Logs uses: actions/upload-artifact@v4 if: matrix.cluster != 'phoenix' with: - name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.slug }} - path: ${{ steps.log.outputs.slug }}.out + name: logs-${{ strategy.job-index }}-${{ steps.log.outputs.test_slug }} + path: | + ${{ steps.log.outputs.build_slug }}.out + ${{ steps.log.outputs.test_slug }}.out case-optimization: name: "Case Opt | ${{ matrix.cluster_name }} (${{ matrix.device }}-${{ matrix.interface }})" @@ -486,15 +495,20 @@ jobs: - name: Clean stale output files run: rm -f *.out + - name: Fetch Dependencies + if: matrix.cluster != 'phoenix' + run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} + - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' run: bash 
.github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }} - - name: Pre-Build (login node) + - name: Build & Run Case-Optimization Tests if: matrix.cluster != 'phoenix' - run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Run Case-Optimization Tests + if: matrix.cluster == 'phoenix' run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Cancel SLURM Jobs diff --git a/CMakeLists.txt b/CMakeLists.txt index fb77271a37..c41b323b0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,7 +176,7 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") add_compile_options( - "SHELL:-M 296,878,1391,1069,5025" + "SHELL:-M 296,878,1391,1069,990,5025,7208,7212,7242" "SHELL:-h static" "SHELL:-h keepfiles" "SHELL:-h acc_model=auto_async_none" "SHELL: -h acc_model=no_fast_addr" @@ -190,9 +190,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") add_compile_options( "SHELL:-h acc_model=auto_async_none" "SHELL: -h acc_model=no_fast_addr" - "SHELL: -K trap=fp" "SHELL: -G2" + "SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0" ) - add_link_options("SHELL: -K trap=fp" "SHELL: -G2") + add_link_options("SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0") endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang") diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index d6daf97bb6..9fed43c271 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil history.add(target.name) + # Dependencies are pinned to fixed versions. 
If already configured + # (built & installed by a prior --deps-only step), skip entirely + # to avoid re-entering the superbuild (which may access the network). + if target.isDependency and target.is_configured(case): + return + for dep in target.requires.compute(): # If we have already built and installed this target, # do not do so again. This can be inferred by whether @@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str case = case or input.load(ARG("input"), ARG("--"), {}) case.validate_params() + if ARG("deps_only", False) and len(history) == 0: + all_deps = set() + for t in targets: + resolved = get_target(t) + for dep in resolved.requires.compute(): + all_deps.add(dep) + + cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]") + cons.print(no_indent=True) + + if not all_deps: + cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]") + return + + for dep in all_deps: + __build_target(dep, case, history) + + return + if len(history) == 0: cons.print(__generate_header(case, targets)) cons.print(no_indent=True) diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 85aab95031..e98003aa74 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -134,6 +134,13 @@ default=False, dest="case_optimization", ), + Argument( + name="deps-only", + help="Only fetch and build dependencies, do not build MFC targets.", + action=ArgAction.STORE_TRUE, + default=False, + dest="deps_only", + ), ], examples=[ Example("./mfc.sh build", "Build all default targets (CPU)"),