Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,13 @@ case "$job_interface" in
*) echo "ERROR: prebuild requires gpu interface (acc or omp)"; exit 1 ;;
esac

# Frontier Cray: --debug for backtrace on build/runtime errors
debug_opts=""
if [ "$cluster" = "frontier" ]; then
debug_opts="--debug"
fi

for case in benchmarks/*/case.py; do
echo "=== Pre-building: $case ==="
./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8
./mfc.sh build -i "$case" --case-optimization $debug_opts $gpu_opts -j 8
done
30 changes: 22 additions & 8 deletions .github/scripts/run_case_optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then
ngpus=1
fi

# Verify the venv Python interpreter exists (created by ./mfc.sh build)
if [ ! -x build/venv/bin/python3 ]; then
echo "ERROR: build/venv/bin/python3 not found."
echo "The MFC build venv may not have been created. Was the pre-build step successful?"
exit 1
fi

benchmarks=(
benchmarks/5eq_rk3_weno3_hllc/case.py
benchmarks/viscous_weno5_sgb_acoustic/case.py
Expand All @@ -28,6 +21,27 @@ benchmarks=(
benchmarks/igr/case.py
)

# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only;
# build case-optimized binaries here on the compute node before running.
# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job.
# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV
# Frontier Cray: --debug for backtrace on build/runtime errors
build_jobs=8
debug_opts=""
if [ "$job_cluster" = "frontier" ]; then
build_jobs=1
debug_opts="--debug"
fi

if [ "$job_cluster" != "phoenix" ]; then
echo "=== Building case-optimized binaries on compute node ==="
for case in "${benchmarks[@]}"; do
echo "--- Building: $case ---"
./mfc.sh build -i "$case" --case-optimization $debug_opts $gpu_opts -j $build_jobs
done
echo "=== All case-optimized binaries built ==="
fi

passed=0
failed=0
failed_cases=""
Expand All @@ -44,7 +58,7 @@ for case in "${benchmarks[@]}"; do
rm -rf "$case_dir/D" "$case_dir/p_all" "$case_dir/restart_data"

# Build + run with --case-optimization, small grid, 10 timesteps
if ./mfc.sh run "$case" --case-optimization $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
if ./mfc.sh run "$case" --case-optimization $debug_opts $gpu_opts -n "$ngpus" -j 8 -- --gbpp 1 --steps 10; then
# Validate output
if build/venv/bin/python3 .github/scripts/check_case_optimization_output.py "$case_dir"; then
echo "PASS: $case_name"
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,23 +68,23 @@ jobs:
flag: f
device: gpu
interface: acc
build_script: "bash .github/workflows/frontier/build.sh gpu acc bench"
build_script: "bash .github/workflows/frontier/build.sh gpu acc"
- cluster: frontier
name: Oak Ridge | Frontier (CCE)
group: phoenix
labels: frontier
flag: f
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier/build.sh gpu omp"
- cluster: frontier_amd
name: Oak Ridge | Frontier (AMD)
group: phoenix
labels: frontier
flag: famd
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp"
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
runs-on:
group: ${{ matrix.group }}
Expand All @@ -103,7 +103,7 @@ jobs:
ref: master
path: master

- name: Setup & Build
- name: Fetch Dependencies
if: matrix.build_script != ''
timeout-minutes: 150
run: |
Expand Down
15 changes: 10 additions & 5 deletions .github/workflows/common/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,24 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV
if [ "$job_cluster" = "frontier" ]; then
n_jobs=1
fi

source .github/scripts/retry-build.sh
retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1

# --- Bench cluster flag ---
if [ "$job_cluster" = "phoenix" ]; then
bench_cluster="phoenix-bench"
Expand Down
39 changes: 24 additions & 15 deletions .github/workflows/common/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,38 @@ if [ "$job_cluster" = "phoenix" ]; then
trap 'rm -rf "$currentdir" || true' EXIT
fi

# --- Build (if not pre-built on login node) ---
# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
# to avoid SIGILL from stale binaries compiled on a different microarchitecture.
# --- Build ---
# Phoenix builds everything inside SLURM (no login-node build step).
# Frontier/Frontier AMD: deps already fetched on login node via --deps-only;
# source code is built here on the compute node.
# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled
# on a different microarchitecture.
if [ "$job_cluster" = "phoenix" ]; then
source .github/scripts/clean-build.sh
clean_build
fi

if [ ! -d "build" ]; then
source .github/scripts/retry-build.sh
source .github/scripts/retry-build.sh

# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi
# Phoenix: smoke-test the syscheck binary to catch architecture mismatches
# (SIGILL from binaries compiled on a different compute node).
validate_cmd=""
if [ "$job_cluster" = "phoenix" ]; then
validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1'
fi

RETRY_VALIDATE_CMD="$validate_cmd" \
retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1
# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV
# Frontier Cray: --debug for backtrace on build/runtime errors
build_jobs=8
debug_opts=""
if [ "$job_cluster" = "frontier" ]; then
build_jobs=1
debug_opts="--debug"
fi

RETRY_VALIDATE_CMD="$validate_cmd" \
retry_build ./mfc.sh test -v --dry-run -j $build_jobs $debug_opts $build_opts || exit 1

# --- GPU detection and thread count ---
device_opts=""
rdma_opts=""
Expand Down Expand Up @@ -88,4 +97,4 @@ if [ "${GITHUB_EVENT_NAME:-}" = "pull_request" ]; then
prune_flag="--only-changes"
fi

./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $build_opts $shard_opts -- -c $job_cluster
./mfc.sh test -v --max-attempts 3 $prune_flag -a -j $n_test_threads $rdma_opts $device_opts $debug_opts $build_opts $shard_opts -- -c $job_cluster
7 changes: 1 addition & 6 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ esac

job_device=$1
job_interface=$2
run_bench=$3
source .github/scripts/gpu-opts.sh
build_opts="$gpu_opts"

Expand All @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh
clean_build

source .github/scripts/retry-build.sh
if [ "$run_bench" == "bench" ]; then
retry_build ./mfc.sh build -j 8 $build_opts || exit 1
else
retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1
fi
retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1
13 changes: 9 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -400,12 +400,12 @@ jobs:
echo "Coverage cache: none available — full test suite will run"
fi

- name: Build (login node)
- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
timeout-minutes: 60
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Test
- name: Build & Test
run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }}

- name: Cancel SLURM Jobs
Expand Down Expand Up @@ -486,15 +486,20 @@ jobs:
- name: Clean stale output files
run: rm -f *.out

- name: Fetch Dependencies
if: matrix.cluster != 'phoenix'
run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }}

- name: Pre-Build (SLURM)
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Pre-Build (login node)
- name: Build & Run Case-Optimization Tests
if: matrix.cluster != 'phoenix'
run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }}
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Run Case-Optimization Tests
if: matrix.cluster == 'phoenix'
run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
Expand Down
7 changes: 4 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,13 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
endif()
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
add_compile_options(
"SHELL:-M 296,878,1391,1069,5025"
"SHELL:-M 296,878,1391,1069,990,5025,7208,7212,7242"
"SHELL:-h static" "SHELL:-h keepfiles"
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -h list=adm"
"SHELL: -munsafe-fp-atomics" # Not unsafe for operations we do
"SHELL: -h ipa0" # Work around CCE 19.0.0 IPA SIGSEGV in optcg
)

add_link_options("SHELL:-hkeepfiles")
Expand All @@ -190,9 +191,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
add_compile_options(
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -K trap=fp" "SHELL: -G2"
"SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0"
)
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
add_link_options("SHELL: -K trap=fp" "SHELL: -g" "SHELL: -O0")
endif()

elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
Expand Down
25 changes: 25 additions & 0 deletions toolchain/mfc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil

history.add(target.name)

# Dependencies are pinned to fixed versions. If already configured
# (built & installed by a prior --deps-only step), skip entirely
# to avoid re-entering the superbuild (which may access the network).
if target.isDependency and target.is_configured(case):
return

for dep in target.requires.compute():
# If we have already built and installed this target,
# do not do so again. This can be inferred by whether
Expand Down Expand Up @@ -594,6 +600,25 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str
case = case or input.load(ARG("input"), ARG("--"), {})
case.validate_params()

if ARG("deps_only", False) and len(history) == 0:
all_deps = set()
for t in targets:
resolved = get_target(t)
for dep in resolved.requires.compute():
all_deps.add(dep)

cons.print(f"[bold]Fetch Dependencies | {format_list_to_string([d.name for d in all_deps], 'magenta', 'None')}[/bold]")
cons.print(no_indent=True)

if not all_deps:
cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]")
return

for dep in all_deps:
__build_target(dep, case, history)

return

if len(history) == 0:
cons.print(__generate_header(case, targets))
cons.print(no_indent=True)
Expand Down
7 changes: 7 additions & 0 deletions toolchain/mfc/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@
default=False,
dest="case_optimization",
),
Argument(
name="deps-only",
help="Only fetch and build dependencies, do not build MFC targets.",
action=ArgAction.STORE_TRUE,
default=False,
dest="deps_only",
),
],
examples=[
Example("./mfc.sh build", "Build all default targets (CPU)"),
Expand Down
Loading