#!/bin/sh
# macOS hardware diagnostics runner ("diag-core"): collects an inventory and,
# depending on $PROFILE, exercises RAM, SSD and CPU while streaming JSONL
# results to an optional report server.
# POSIX sh; only `set -u` — individual command failures are logged, not fatal.
set -u

VERSION="0.3.5"
# Test depth: inventory | quick | standard | deep (sizing helpers key off this).
PROFILE="${PROFILE:-inventory}"
# Upload endpoint and optional bearer token; uploads are skipped when empty.
REPORT_URL="${REPORT_URL:-}"
REPORT_TOKEN="${REPORT_TOKEN:-}"
# Base URL for tool-bundle downloads; when empty it is derived from REPORT_URL.
DIAG_PUBLIC_BASE_URL="${DIAG_PUBLIC_BASE_URL:-}"
# Tool bundle policy: 0 = never download, auto = only when tools are missing,
# anything else = always attempt; REQUIRED=1 escalates bundle failures to FAIL.
TOOL_BUNDLE="${TOOL_BUNDLE:-auto}"
TOOL_BUNDLE_REQUIRED="${TOOL_BUNDLE_REQUIRED:-0}"
# Cadence knobs (seconds). NOTE(review): HEARTBEAT_SEC is not referenced in
# this section — presumably consumed further down; verify before removing.
HEARTBEAT_SEC="${HEARTBEAT_SEC:-15}"
MONITOR_SEC="${MONITOR_SEC:-15}"
UPLOAD_EVERY_SEC="${UPLOAD_EVERY_SEC:-15}"
FULL_LOG_UPLOAD_EVERY_SEC="${FULL_LOG_UPLOAD_EVERY_SEC:-60}"
# RAM test sizing; "auto" derives size/passes/reserve from installed memory
# and profile (see ram_test_mb / ram_test_passes / ram_reserve_mb).
RAM_TEST_MB="${RAM_TEST_MB:-auto}"
RAM_TEST_MAX_MB="${RAM_TEST_MAX_MB:-auto}"
RAM_TEST_RESERVE_MB="${RAM_TEST_RESERVE_MB:-auto}"
RAM_TEST_CHUNK_MB="${RAM_TEST_CHUNK_MB:-64}"
RAM_TEST_PASSES="${RAM_TEST_PASSES:-auto}"
# SSD test sizing; "auto" derives from free space and profile (ssd_* helpers).
SSD_TEST_MB="${SSD_TEST_MB:-auto}"
SSD_KEEP_FREE_MB="${SSD_KEEP_FREE_MB:-auto}"
SSD_MAX_MB="${SSD_MAX_MB:-auto}"
SSD_READ_TEST_MB="${SSD_READ_TEST_MB:-auto}"
SSD_READ_MAX_MB="${SSD_READ_MAX_MB:-auto}"
SSD_DEVICE_READ_MB="${SSD_DEVICE_READ_MB:-auto}"
SSD_DEVICE_READ_MAX_MB="${SSD_DEVICE_READ_MAX_MB:-8192}"
SSD_RAW_READ_REQUIRED="${SSD_RAW_READ_REQUIRED:-auto}"
SSD_PATTERN_TEST_MB="${SSD_PATTERN_TEST_MB:-auto}"
FIO_RUNTIME_SEC="${FIO_RUNTIME_SEC:-auto}"
# CPU and power-sampling knobs.
CPU_SMOKE_SEC="${CPU_SMOKE_SEC:-30}"
CPU_STRESS_SEC="${CPU_STRESS_SEC:-auto}"
CPU_STRESS_WORKERS="${CPU_STRESS_WORKERS:-auto}"
POWER_SAMPLE_SEC="${POWER_SAMPLE_SEC:-20}"
TELEMETRY_SEC="${TELEMETRY_SEC:-30}"
# 1 = keep the machine awake via caffeinate for the duration of the run.
PREVENT_SLEEP="${PREVENT_SLEEP:-1}"

# Put the standard system tool directories first so they shadow user PATH.
PATH="/usr/bin:/bin:/usr/sbin:/sbin:${PATH:-}"
export PATH

# UTC timestamp for log/JSON lines (e.g. 2024-01-02T03:04:05Z).
now_iso() { date -u '+%Y-%m-%dT%H:%M:%SZ'; }
# Seconds since the epoch; used for durations and upload rate limiting.
now_epoch() { date '+%s'; }
# True (exit 0) when the named command resolves in PATH.
have() { command -v "$1" >/dev/null 2>&1; }

# Whole seconds elapsed since the given epoch timestamp, floored at 1 so
# callers can safely divide by the result.
duration_since() {
  started="$1"
  elapsed=$(( $(now_epoch) - started ))
  if [ "$elapsed" -lt 1 ]; then
    elapsed=1
  fi
  printf '%s\n' "$elapsed"
}

# Throughput in MB/s with one decimal place; a non-positive duration is
# treated as one second.
mb_per_sec() {
  awk -v m="$1" -v s="$2" 'BEGIN { printf "%.1f", ((s > 0) ? m / s : m) }'
}

# Server base URL for auxiliary downloads: the explicit override wins
# (trailing slash stripped); otherwise it is derived by peeling the known
# upload suffixes off REPORT_URL. Prints an empty line when underivable.
base_url() {
  if [ -n "$DIAG_PUBLIC_BASE_URL" ]; then
    printf '%s\n' "${DIAG_PUBLIC_BASE_URL%/}"
  else
    case "$REPORT_URL" in
      */api/v1/upload) printf '%s\n' "${REPORT_URL%/api/v1/upload}" ;;
      */upload) printf '%s\n' "${REPORT_URL%/upload}" ;;
      *) printf '\n' ;;
    esac
  fi
}

# Escape a string for embedding inside a double-quoted JSON value.
# Handles backslash, double quote, tab and — unlike the previous sed
# one-liner — embedded newlines, which used to pass through literally and
# break the single-line JSONL records in $EVENTS/$MONITOR/$TELEMETRY.
json_escape() {
  printf '%s' "$1" | awk '
    BEGIN { ORS = "" }
    {
      gsub(/\\/, "&&")        # backslash first so later escapes survive
      gsub(/"/, "\\\\\"")
      gsub(/\t/, "\\\\t")
      if (NR > 1) printf "\\n" # rejoin input lines as literal \n
      print
    }'
}

# Print the value of the first SPHardwareDataType line whose key matches the
# given awk pattern; prints nothing when system_profiler is unavailable.
first_system_profiler_value() {
  have system_profiler || return 0
  system_profiler SPHardwareDataType 2>/dev/null |
    awk -F': ' -v want="$1" '$1 ~ want { print $2; exit }'
}

# --- Machine identity and per-run workspace ---------------------------------
# Serial number: system_profiler first, ioreg as fallback, then a sentinel so
# RUN_ID is always well-formed.
SERIAL="$(first_system_profiler_value 'Serial Number' || true)"
[ -n "$SERIAL" ] || SERIAL="$(ioreg -rd1 -c IOPlatformExpertDevice 2>/dev/null | awk -F'\"' '/IOPlatformSerialNumber/ {print $4; exit}')"
[ -n "$SERIAL" ] || SERIAL="unknownserial"
MODEL="$(first_system_profiler_value 'Model Identifier' || true)"
[ -n "$MODEL" ] || MODEL="$(sysctl -n hw.model 2>/dev/null || echo unknownmodel)"
START_TS="$(date -u '+%Y%m%d_%H%M%S')"
# Unique per invocation: serial + UTC start time + this shell's PID.
RUN_ID="${SERIAL}_${START_TS}_$$"
RUN_ROOT="${RUN_ROOT:-${TMPDIR:-/tmp}/elnote-diag-core-macos}"
RUN_DIR="$RUN_ROOT/$RUN_ID"
mkdir -p "$RUN_DIR"
# mtime reference used later to find crash reports created during this run.
touch "$RUN_DIR/start.marker"

# Artifact paths: human-readable log, machine-readable JSONL streams, final
# summary, and the current-stage marker written by stage() and read by
# monitor_loop.
LOG="$RUN_DIR/full.log"
EVENTS="$RUN_DIR/events.jsonl"
MONITOR="$RUN_DIR/monitor.jsonl"
TELEMETRY="$RUN_DIR/telemetry.jsonl"
SUMMARY="$RUN_DIR/summary.json"
STATE="$RUN_DIR/current_stage"
# Counters incremented by log(); upload/telemetry rate-limiter timestamps
# (epoch seconds, 0 = never); PIDs of background children (e.g. caffeinate).
FAIL_COUNT=0
WARN_COUNT=0
LAST_UPLOAD=0
LAST_FULL_UPLOAD=0
LAST_TELEMETRY=0
CHILD_PIDS=""

# Central logger: writes one formatted line to stdout + $LOG and one JSON
# record to $EVENTS, bumping FAIL_COUNT/WARN_COUNT for those levels.
# Usage: log LEVEL TYPE MESSAGE...
log() {
  lvl="$1"
  etype="$2"
  shift 2
  text="$*"
  case "$lvl" in
    FAIL) FAIL_COUNT=$((FAIL_COUNT + 1)) ;;
    WARN) WARN_COUNT=$((WARN_COUNT + 1)) ;;
  esac
  printf '%s %-5s %-28s %s\n' "$(now_iso)" "$lvl" "$etype" "$text" | tee -a "$LOG"
  printf '{"ts":"%s","level":"%s","type":"%s","message":"%s"}\n' "$(now_iso)" "$lvl" "$etype" "$(json_escape "$text")" >>"$EVENTS"
}

# Record the current stage name (read back by monitor_loop) and log it.
stage() {
  current="$1"
  printf '%s\n' "$current" >"$STATE"
  log INFO stage "$current"
}

# Run a command with combined stdout/stderr captured into a file, logging
# start and completion; returns the command's exit status.
run_capture() {
  cap_label="$1"
  cap_out="$2"
  shift 2
  log INFO command_start "label=$cap_label command=$*"
  "$@" >"$cap_out" 2>&1
  cap_rc=$?
  log INFO command_done "label=$cap_label rc=$cap_rc outfile=$cap_out"
  return "$cap_rc"
}

# Derive an upload artifact type from an arbitrary string: unsafe characters
# become underscores and the result is prefixed with "artifact_".
artifact_kind() {
  sanitized="$(printf '%s' "$1" | sed 's#[^A-Za-z0-9_.-]#_#g')"
  printf 'artifact_%s\n' "$sanitized"
}

# SHA-256 hex digest of a file via shasum, falling back to openssl; returns 1
# when neither tool is available.
sha256_file() {
  if have shasum; then
    shasum -a 256 "$1" | awk '{print $1}'
    return
  fi
  if have openssl; then
    openssl dgst -sha256 "$1" | awk '{print $NF}'
    return
  fi
  return 1
}

# POST a file to REPORT_URL with identifying headers. Silently a no-op when
# uploads are unconfigured, the file is missing, or curl is unavailable; a
# failed upload logs WARN and returns curl's exit status.
# The previous version duplicated the entire curl invocation for the
# with-token / without-token cases; build the argument list once instead and
# append the Authorization header only when REPORT_TOKEN is set.
upload_file() {
  path="$1"
  kind="$2"
  [ -n "$REPORT_URL" ] || return 0
  [ -f "$path" ] || return 0
  have curl || return 0
  # Reuse the positional parameters as a portable argv array; $path/$kind are
  # already saved above.
  set -- -fsS --connect-timeout 5 --max-time 30 \
    -H "X-Run-Id: $RUN_ID" \
    -H "X-Serial: $SERIAL" \
    -H "X-Profile: $PROFILE" \
    -H "X-Diag-Core: macos-$VERSION" \
    -H "X-Artifact-Type: $kind"
  if [ -n "$REPORT_TOKEN" ]; then
    set -- "$@" -H "Authorization: Bearer $REPORT_TOKEN"
  fi
  curl "$@" --data-binary "@$path" "$REPORT_URL" >/dev/null 2>>"$LOG" || {
    rc=$?
    log WARN upload_failed "kind=$kind rc=$rc"
    return "$rc"
  }
  return 0
}

# Throttled upload of streaming artifacts: events/monitor/telemetry at most
# every UPLOAD_EVERY_SEC, the full log at most every FULL_LOG_UPLOAD_EVERY_SEC.
upload_if_due() {
  now="$(now_epoch)"
  if [ "$LAST_UPLOAD" -eq 0 ] || [ $((now - LAST_UPLOAD)) -ge "$UPLOAD_EVERY_SEC" ]; then
    LAST_UPLOAD="$now"
    upload_file "$EVENTS" events || true
    upload_file "$MONITOR" monitor || true
    upload_file "$TELEMETRY" telemetry || true
  fi
  if [ "$LAST_FULL_UPLOAD" -eq 0 ] || [ $((now - LAST_FULL_UPLOAD)) -ge "$FULL_LOG_UPLOAD_EVERY_SEC" ]; then
    LAST_FULL_UPLOAD="$now"
    upload_file "$LOG" full_log || true
  fi
}

# Total CPU busy percentage (user + sys) from a single `top` sample.
# Prints nothing when `top` is unavailable or no "CPU usage" line is found.
# NOTE(review): assumes macOS `top -l 1` output where the busy fields look
# like "3.12% user, 4.0% sys" — a "NN%" token is summed only when the next
# token is exactly "user," or "sys,". Verify against other top variants.
cpu_busy_pct() {
  have top || return 0
  # LC_ALL=C pins the decimal separator and layout so the awk parse is stable.
  LC_ALL=C top -l 1 -n 0 2>/dev/null | awk '
    /CPU usage/ {
      sum = 0
      for (i = 1; i <= NF; i++) {
        if ($i ~ /%$/ && ($(i + 1) == "user," || $(i + 1) == "sys,")) {
          val = $i
          gsub("%", "", val)
          sum += val
        }
      }
      printf "%.1f\n", sum
      exit
    }'
}

# Extract a power reading in watts from a powermetrics dump: on the first
# line containing the label (case-insensitive), take the first purely numeric
# token; if the following token is "mW", convert to watts. Prints the value
# with two decimals, or nothing when the label is absent.
power_value_w() {
  awk -v needle="$1" '
    BEGIN { needle = tolower(needle) }
    index(tolower($0), needle) {
      for (f = 1; f <= NF; f++) {
        if ($f ~ /^[0-9]+(\.[0-9]+)?$/) {
          watts = $f
          if ($(f + 1) == "mW") watts = watts / 1000
          printf "%.2f\n", watts
          exit
        }
      }
    }' "$2" 2>/dev/null
}

# Append one JSON telemetry record for the given stage name: load averages,
# CPU busy %, pmset throttle state and — when powermetrics succeeds —
# temperature, CPU/GPU power and thermal pressure. Fields that cannot be
# sampled are emitted as empty strings.
# NOTE(review): powermetrics typically requires root; on failure only its rc
# is recorded and the power/temperature fields stay empty — confirm intended.
append_telemetry_sample() {
  st="$1"
  ts="$(now_iso)"
  # "load averages*:" tolerates both "load average:" and "load averages:".
  load="$(LC_ALL=C uptime 2>/dev/null | sed 's/^.*load averages*: //')"
  ncpu="$(sysctl -n hw.ncpu 2>/dev/null || echo "")"
  # First load figure, with a decimal comma normalized to a point.
  load1="$(printf '%s\n' "$load" | awk '{gsub(",", ".", $1); print $1; exit}')"
  load_pct=""
  if [ -n "$load1" ] && [ -n "$ncpu" ] && [ "$ncpu" -gt 0 ] 2>/dev/null; then
    load_pct="$(awk -v l="$load1" -v c="$ncpu" 'BEGIN { printf "%.1f", (l / c) * 100 }')"
  fi
  cpu_busy="$(cpu_busy_pct || true)"
  # Throttling indicators from pmset's thermal report.
  cpu_speed_limit=""
  thermal_level=""
  if have pmset; then
    therm="$(pmset -g therm 2>/dev/null || true)"
    cpu_speed_limit="$(printf '%s\n' "$therm" | awk -F'= *' '/CPU_Speed_Limit/ {gsub(/[^0-9.]/, "", $2); print $2; exit}')"
    thermal_level="$(printf '%s\n' "$therm" | awk -F'= *' '/Thermal_Level/ {print $2; exit}')"
  fi
  temp_c=""
  cpu_power_w=""
  gpu_power_w=""
  thermal_pressure=""
  powermetrics_rc=""
  if have powermetrics; then
    # One 1-second sample; the raw dump is kept (latest + cumulative file).
    pm_out="$RUN_DIR/powermetrics_latest.txt"
    powermetrics --samplers smc,cpu_power,gpu_power,thermal -i 1000 -n 1 >"$pm_out" 2>&1
    powermetrics_rc=$?
    if [ "$powermetrics_rc" -eq 0 ]; then
      cat "$pm_out" >>"$RUN_DIR/powermetrics_telemetry.txt" 2>/dev/null || true
      # First numeric token followed by a Celsius unit on a temperature line.
      temp_c="$(awk 'BEGIN{IGNORECASE=1} /temperature/ { for (i=1; i<=NF; i++) if ($i ~ /^[0-9]+(\.[0-9]+)?$/ && $(i+1) ~ /^C/) { printf "%.1f\n", $i; exit } }' "$pm_out" 2>/dev/null)"
      cpu_power_w="$(power_value_w "CPU Power" "$pm_out")"
      gpu_power_w="$(power_value_w "GPU Power" "$pm_out")"
      thermal_pressure="$(awk -F': *' 'BEGIN{IGNORECASE=1} /Thermal pressure/ {print $2; exit}' "$pm_out" 2>/dev/null)"
    fi
  fi
  # Single-line JSON record; every field passes through json_escape.
  printf '{"ts":"%s","stage":"%s","load":"%s","load_pct":"%s","ncpu":"%s","cpu_busy_pct":"%s","cpu_speed_limit_pct":"%s","thermal_level":"%s","thermal_pressure":"%s","temp_c":"%s","cpu_power_w":"%s","gpu_power_w":"%s","powermetrics_rc":"%s"}\n' \
    "$ts" "$(json_escape "$st")" "$(json_escape "$load")" "$(json_escape "$load_pct")" "$(json_escape "$ncpu")" "$(json_escape "$cpu_busy")" "$(json_escape "$cpu_speed_limit")" "$(json_escape "$thermal_level")" "$(json_escape "$thermal_pressure")" "$(json_escape "$temp_c")" "$(json_escape "$cpu_power_w")" "$(json_escape "$gpu_power_w")" "$(json_escape "$powermetrics_rc")" >>"$TELEMETRY"
}

# Endless sampler, intended to run concurrently with the tests: every
# MONITOR_SEC it appends one JSON line of system state to $MONITOR, emits a
# slower telemetry sample every TELEMETRY_SEC, and triggers throttled uploads.
# Never returns; the caller is expected to terminate it.
monitor_loop() {
  while :; do
    # Current stage as written by stage(); "starting" before the first stage.
    st="$(cat "$STATE" 2>/dev/null || echo starting)"
    # Free/speculative page counts (vm_stat prints them with a trailing dot).
    vm_free=""
    vm_spec=""
    if have vm_stat; then
      vm_free="$(vm_stat 2>/dev/null | awk '/Pages free/ {gsub("\\.","",$3); print $3; exit}')"
      vm_spec="$(vm_stat 2>/dev/null | awk '/Pages speculative/ {gsub("\\.","",$3); print $3; exit}')"
    fi
    load="$(LC_ALL=C uptime 2>/dev/null | sed 's/^.*load averages*: //')"
    # Free MB on the scratch filesystem used by the SSD tests.
    disk_free_mb="$(df -kP "${TMPDIR:-/tmp}" 2>/dev/null | awk 'NR==2 {print int($4/1024)}')"
    ncpu="$(sysctl -n hw.ncpu 2>/dev/null || echo "")"
    load1="$(printf '%s\n' "$load" | awk '{gsub(",", ".", $1); print $1; exit}')"
    load_pct=""
    if [ -n "$load1" ] && [ -n "$ncpu" ] && [ "$ncpu" -gt 0 ] 2>/dev/null; then
      load_pct="$(awk -v l="$load1" -v c="$ncpu" 'BEGIN { printf "%.1f", (l / c) * 100 }')"
    fi
    cpu_busy="$(cpu_busy_pct || true)"
    # Thermal throttling state from pmset.
    cpu_speed_limit=""
    thermal_level=""
    if have pmset; then
      therm="$(pmset -g therm 2>/dev/null || true)"
      cpu_speed_limit="$(printf '%s\n' "$therm" | awk -F'= *' '/CPU_Speed_Limit/ {gsub(/[^0-9.]/, "", $2); print $2; exit}')"
      thermal_level="$(printf '%s\n' "$therm" | awk -F'= *' '/Thermal_Level/ {print $2; exit}')"
    fi
    # Power source / battery state from `pmset -g batt`.
    power_source=""
    battery_pct=""
    charging_state=""
    ac_attached=""
    if have pmset; then
      batt="$(pmset -g batt 2>/dev/null || true)"
      power_source="$(printf '%s\n' "$batt" | awk -F"'" '/Now drawing from/ {print $2; exit}')"
      battery_pct="$(printf '%s\n' "$batt" | awk -F'%' '/%/ {gsub(/^[ \t]+/,"",$1); n=split($1,a," "); print a[n]; exit}')"
      charging_state="$(printf '%s\n' "$batt" | awk -F'; *' '/%/ {print $2; exit}')"
      case "$power_source" in *AC*) ac_attached="true" ;; *Battery*) ac_attached="false" ;; *) ac_attached="" ;; esac
    fi
    printf '{"ts":"%s","stage":"%s","load":"%s","load_pct":"%s","ncpu":"%s","cpu_busy_pct":"%s","cpu_speed_limit_pct":"%s","thermal_level":"%s","vm_pages_free":"%s","vm_pages_speculative":"%s","tmp_free_mb":"%s","power_source":"%s","ac_attached":"%s","battery_pct":"%s","charging_state":"%s"}\n' \
      "$(now_iso)" "$(json_escape "$st")" "$(json_escape "$load")" "$(json_escape "$load_pct")" "$(json_escape "$ncpu")" "$(json_escape "$cpu_busy")" "$(json_escape "$cpu_speed_limit")" "$(json_escape "$thermal_level")" "$(json_escape "$vm_free")" "$(json_escape "$vm_spec")" "$(json_escape "$disk_free_mb")" "$(json_escape "$power_source")" "$(json_escape "$ac_attached")" "$(json_escape "$battery_pct")" "$(json_escape "$charging_state")" >>"$MONITOR"
    # Telemetry is sampled on its own (slower) cadence.
    tnow="$(now_epoch)"
    if [ "$LAST_TELEMETRY" -eq 0 ] || [ $((tnow - LAST_TELEMETRY)) -ge "$TELEMETRY_SEC" ]; then
      LAST_TELEMETRY="$tnow"
      append_telemetry_sample "$st"
    fi
    upload_if_due
    sleep "$MONITOR_SEC"
  done
}

# Capacity in MB of a whole-disk node (e.g. "disk0"), parsed from the exact
# byte count `diskutil info` prints in parentheses on its "Disk Size" /
# "Total Size" line (e.g. "... (500277790720 Bytes) ..."). Prints nothing
# when no positive byte figure is found or diskutil fails.
whole_disk_size_mb() {
  whole="$1"
  diskutil info "/dev/$whole" 2>/dev/null | awk -F'[()]' '
    /Disk Size|Total Size/ {
      for (i = 1; i <= NF; i++) {
        if ($i ~ /Bytes/) {
          gsub(/[^0-9]/, "", $i)
          if ($i > 0) {
            printf "%d\n", $i / 1024 / 1024
            exit
          }
        }
      }
    }'
}

# When PREVENT_SLEEP=1, launch caffeinate in the background so the machine
# stays awake for the run; the child PID is tracked in CHILD_PIDS.
start_sleep_prevention() {
  [ "$PREVENT_SLEEP" = "1" ] || return 0
  if ! have caffeinate; then
    log WARN sleep_prevention_unavailable "missing caffeinate"
    return 0
  fi
  caffeinate -dimsu >>"$LOG" 2>&1 &
  keeper_pid=$!
  CHILD_PIDS="$CHILD_PIDS $keeper_pid"
  log INFO sleep_prevention_started "caffeinate pid=$keeper_pid"
}

# Optionally download a per-architecture bundle of diagnostic tools
# (smartctl/fio/memtester/stress-ng) from the report server, verify it
# against a published SHA-256, extract it, and prepend its bin/ directory to
# PATH. Policy: TOOL_BUNDLE=0 disables entirely; "auto" skips when all tools
# are already present. Every failure is a WARN (plus a FAIL marker when
# TOOL_BUNDLE_REQUIRED=1) and the function still returns 0 so the run
# continues with whatever tools exist locally.
ensure_tool_bundle() {
  [ "$TOOL_BUNDLE" != "0" ] || return 0
  # Is any bundled tool missing from PATH?
  missing=0
  for tool in smartctl fio memtester stress-ng; do
    have "$tool" || missing=1
  done
  if [ "$TOOL_BUNDLE" = "auto" ] && [ "$missing" = "0" ]; then
    log INFO tool_bundle_skipped "reason=tools_already_present"
    return 0
  fi
  # Preconditions: download/unpack tooling, a checksum tool, and a base URL.
  if ! have curl || ! have tar; then
    log WARN tool_bundle_skipped "missing curl_or_tar"
    return 0
  fi
  if ! have shasum && ! have openssl; then
    log WARN tool_bundle_skipped "missing_sha256_tool"
    return 0
  fi
  b="$(base_url)"
  if [ -z "$b" ]; then
    log WARN tool_bundle_skipped "missing_base_url"
    return 0
  fi
  # Only arm64 and x86_64 bundles are published.
  arch="$(uname -m 2>/dev/null || echo unknown)"
  case "$arch" in
    arm64) bundle_arch="arm64" ;;
    x86_64) bundle_arch="x86_64" ;;
    *) log WARN tool_bundle_skipped "unsupported_arch=$arch"; return 0 ;;
  esac
  tools_dir="$RUN_DIR/tools"
  mkdir -p "$tools_dir"
  bundle="$tools_dir/diag-tools-macos-${bundle_arch}.tar.gz"
  sha_file="$tools_dir/diag-tools-macos-${bundle_arch}.sha256"
  url="$b/tools/macos/diag-tools-macos-${bundle_arch}.tar.gz"
  sha_url="$b/tools/macos/diag-tools-macos-${bundle_arch}.sha256"
  log INFO tool_bundle_download_start "arch=$bundle_arch url=$url"
  if ! curl -fsSL "$url" -o "$bundle" >>"$LOG" 2>&1; then
    log WARN tool_bundle_download_failed "url=$url"
    [ "$TOOL_BUNDLE_REQUIRED" = "1" ] && log FAIL tool_bundle_required_failed "download"
    return 0
  fi
  if ! curl -fsSL "$sha_url" -o "$sha_file" >>"$LOG" 2>&1; then
    log WARN tool_bundle_sha_download_failed "url=$sha_url"
    [ "$TOOL_BUNDLE_REQUIRED" = "1" ] && log FAIL tool_bundle_required_failed "sha_download"
    return 0
  fi
  # Verify the archive digest before trusting its contents.
  expected="$(awk '{print $1}' "$sha_file" | tr -d '\r\n')"
  actual="$(sha256_file "$bundle" || true)"
  if [ -z "$expected" ] || [ "$expected" != "$actual" ]; then
    log WARN tool_bundle_sha_mismatch "expected=$expected actual=$actual"
    [ "$TOOL_BUNDLE_REQUIRED" = "1" ] && log FAIL tool_bundle_required_failed "sha_mismatch"
    return 0
  fi
  # Fresh extraction directory each run.
  extract_dir="$tools_dir/extracted"
  rm -rf "$extract_dir"
  mkdir -p "$extract_dir"
  if ! tar -xzf "$bundle" -C "$extract_dir" >>"$LOG" 2>&1; then
    log WARN tool_bundle_extract_failed "bundle=$bundle"
    [ "$TOOL_BUNDLE_REQUIRED" = "1" ] && log FAIL tool_bundle_required_failed "extract"
    return 0
  fi
  chmod +x "$extract_dir"/bin/* >/dev/null 2>&1 || true
  # Bundle binaries take precedence over system tools for the rest of the run.
  PATH="$extract_dir/bin:$PATH"
  export PATH
  log INFO tool_bundle_ready "arch=$bundle_arch path=$extract_dir/bin sha256=$actual"
}

# After optionally staging the tool bundle, log presence/absence of each
# optional diagnostic tool.
detect_tools() {
  stage "tool_detection"
  ensure_tool_bundle
  for t in smartctl fio memtester stress-ng powermetrics mma; do
    if have "$t"; then
      log INFO optional_tool_present "$t"
    else
      log INFO optional_tool_missing "$t"
    fi
  done
}

# Enumerate drives with `smartctl --scan` and save the full `smartctl -a`
# output for each under a label prefix (e.g. "pre"/"post"); a missing
# smartctl only warns.
collect_smart_snapshot() {
  snap_label="$1"
  if ! have smartctl; then
    log WARN smart_skipped "missing_smartctl label=$snap_label"
    return 0
  fi
  scan_file="$RUN_DIR/${snap_label}_smartctl_scan.txt"
  smartctl --scan >"$scan_file" 2>&1 || true
  devices=0
  # First field of each scan line is the device node; skip blanks/comments.
  while read -r dev _; do
    case "$dev" in
      "" | \#*) continue ;;
    esac
    devices=$((devices + 1))
    safe_dev="$(printf '%s' "$dev" | sed 's#[^A-Za-z0-9_.-]#_#g')"
    smartctl -a "$dev" >"$RUN_DIR/${snap_label}_smartctl_${safe_dev}.txt" 2>&1 || true
  done <"$scan_file"
  log INFO smart_snapshot "label=$snap_label devices=$devices"
}

# Grade each saved `smartctl -a` dump for the given label:
#   FAIL — overall health not PASS/OK, critical warning set, media errors > 0,
#          or Percentage Used >= 20.
#   WARN — error-log entries > 0, Percentage Used >= 10, Available Spare < 95.
# Warns smart_health_no_devices when no per-device dumps exist for the label.
evaluate_smart_health() {
  label="$1"
  files_found=0
  for f in "$RUN_DIR"/"${label}"_smartctl_*.txt; do
    [ -f "$f" ] || continue
    # The scan listing itself is not a device dump.
    case "$f" in *scan.txt) continue ;; esac
    files_found=$((files_found + 1))
    # Field names cover both ATA ("overall-health self-assessment") and
    # SCSI/NVMe ("SMART Health Status") report styles.
    health="$(awk -F': *' 'BEGIN{IGNORECASE=1} /SMART overall-health self-assessment test result|SMART Health Status/ {print $2; exit}' "$f" 2>/dev/null)"
    crit="$(awk -F': *' 'BEGIN{IGNORECASE=1} /^Critical Warning/ {print $2; exit}' "$f" 2>/dev/null | tr -d ' ')"
    media="$(awk -F': *' 'BEGIN{IGNORECASE=1} /^Media and Data Integrity Errors/ {gsub(/,/,"",$2); print $2; exit}' "$f" 2>/dev/null)"
    errlog="$(awk -F': *' 'BEGIN{IGNORECASE=1} /^Error Information Log Entries/ {gsub(/,/,"",$2); print $2; exit}' "$f" 2>/dev/null)"
    used="$(awk -F': *' 'BEGIN{IGNORECASE=1} /^Percentage Used/ {gsub(/%|,/,"",$2); print $2; exit}' "$f" 2>/dev/null)"
    spare="$(awk -F': *' 'BEGIN{IGNORECASE=1} /^Available Spare/ {gsub(/%|,/,"",$2); print $2; exit}' "$f" 2>/dev/null)"
    log INFO smart_health "label=$label file=$(basename "$f") health=${health:-unknown} critical=${crit:-unknown} media_errors=${media:-unknown} error_log=${errlog:-unknown} used_pct=${used:-unknown} spare_pct=${spare:-unknown}"
    # Empty (unparsed) health is tolerated; anything other than PASS*/OK fails.
    case "$(printf '%s' "$health" | tr '[:upper:]' '[:lower:]')" in
      ""|*pass*|*ok*) ;;
      *) log FAIL smart_health_bad "file=$(basename "$f") health=$health" ;;
    esac
    case "$crit" in ""|"0"|"0x00"|"0X00") ;; *) log FAIL smart_critical_warning "file=$(basename "$f") critical=$crit" ;; esac
    # `2>/dev/null` guards silence the non-numeric comparison errors when a
    # field could not be parsed.
    if [ "${media:-0}" -gt 0 ] 2>/dev/null; then log FAIL smart_media_errors "file=$(basename "$f") media_errors=$media"; fi
    if [ "${errlog:-0}" -gt 0 ] 2>/dev/null; then log WARN smart_error_log_entries "file=$(basename "$f") entries=$errlog"; fi
    if [ "${used:-0}" -ge 20 ] 2>/dev/null; then
      log FAIL smart_percentage_used_high "file=$(basename "$f") used_pct=$used"
    elif [ "${used:-0}" -ge 10 ] 2>/dev/null; then
      log WARN smart_percentage_used_warn "file=$(basename "$f") used_pct=$used"
    fi
    if [ -n "$spare" ] && [ "$spare" -lt 95 ] 2>/dev/null; then log WARN smart_available_spare_low "file=$(basename "$f") spare_pct=$spare"; fi
  done
  [ "$files_found" -gt 0 ] || log WARN smart_health_no_devices "label=$label"
}

# Dump the last 45 minutes of kernel log lines and extract those mentioning
# storage/IO trouble keywords, logging how many relevant lines were found.
collect_kernel_storage_logs() {
  label="$1"
  if [ ! -x /usr/bin/log ]; then
    log WARN kernel_storage_skipped "missing_/usr/bin/log"
    return 0
  fi
  raw_file="$RUN_DIR/${label}_kernel_last45m.txt"
  relevant_file="$RUN_DIR/${label}_kernel_storage_relevant.txt"
  # Absolute path avoids colliding with this script's own log() function.
  /usr/bin/log show --style compact --last 45m --predicate 'process == "kernel"' >"$raw_file" 2>&1 || true
  awk 'BEGIN{IGNORECASE=1} /disk|nvme|apfs|i\/o|error|timeout|media|storage|panic|watchdog|reset/ {print}' "$raw_file" >"$relevant_file" 2>/dev/null || true
  cnt="$(wc -l <"$relevant_file" 2>/dev/null | tr -d ' ')"
  log INFO kernel_storage_scan "label=$label relevant_lines=${cnt:-0}"
}

# Collect the first 140 lines of every crash/panic report created since this
# run started, then grade: FAIL on panic/watchdog indicators, WARN on any
# other new crash report.
collect_crash_reports() {
  out="$RUN_DIR/crash_reports_recent.txt"
  : >"$out"
  for dir in /Library/Logs/DiagnosticReports "$HOME/Library/Logs/DiagnosticReports"; do
    [ -d "$dir" ] || continue
    # -newer start.marker restricts to reports produced during this run.
    find "$dir" -type f \( -name '*.panic' -o -name '*.crash' -o -name '*.ips' \) -newer "$RUN_DIR/start.marker" 2>/dev/null | while read -r f; do
      echo "===== $f =====" >>"$out"
      sed -n '1,140p' "$f" >>"$out" 2>/dev/null || true
      echo >>"$out"
    done
  done
  # BUG FIX: grep -c prints "0" AND exits non-zero when nothing matches, so
  # the old `grep -c ... || echo 0` captured "0<newline>0" and broke the
  # numeric comparison below. Take grep's own count and only default to 0
  # when grep produced no output at all (e.g. unreadable file).
  cnt="$(grep -c '^===== ' "$out" 2>/dev/null || true)"
  [ -n "$cnt" ] || cnt=0
  log INFO crash_report_scan "recent_reports=$cnt"
  if grep -Eiq 'panicString|kernel panic|watchdog timeout|shutdown cause' "$out" 2>/dev/null; then
    log FAIL crash_panic_detected "new_panic_or_watchdog_report_found count=$cnt"
  elif [ "$cnt" -gt 0 ] 2>/dev/null; then
    log WARN crash_report_detected "new_crash_reports=$cnt"
  fi
}

# Take a POWER_SAMPLE_SEC-second powermetrics capture (skipped for the
# inventory profile or when powermetrics is unavailable).
collect_power_sample() {
  case "$PROFILE" in inventory) return 0 ;; esac
  if ! have powermetrics; then
    log WARN powermetrics_skipped "missing_powermetrics"
    return 0
  fi
  log INFO powermetrics_start "duration_sec=$POWER_SAMPLE_SEC"
  powermetrics --samplers smc,cpu_power,gpu_power,thermal -i 1000 -n "$POWER_SAMPLE_SEC" >"$RUN_DIR/powermetrics_sample.txt" 2>&1 || log WARN powermetrics_failed "rc=$?"
}

# Gather the hardware/software inventory: identity text file, system_profiler
# and disk/power/network snapshots, the sizing policy in effect, then the
# pre-test SMART/kernel-log/crash-report baselines.
# FIX: the old `have X && X ... || log WARN "missing X"` chains logged
# "missing X" even when the tool existed but merely failed; presence and
# failure are now reported separately. Also normalizes a stray tab-indented
# line (networksetup).
collect_inventory() {
  stage "inventory"
  {
    echo "===== identity ====="
    echo "version=$VERSION"
    echo "run_id=$RUN_ID"
    echo "serial=$SERIAL"
    echo "model=$MODEL"
    echo "profile=$PROFILE"
    echo "started=$(now_iso)"
    echo
    echo "===== sw_vers ====="
    sw_vers 2>&1 || true
    echo
    echo "===== uname ====="
    uname -a 2>&1 || true
    echo
    echo "===== sysctl ====="
    sysctl hw.memsize hw.ncpu hw.model kern.boottime machdep.cpu.brand_string 2>&1 || true
  } >"$RUN_DIR/inventory.txt"

  if have system_profiler; then
    system_profiler SPHardwareDataType SPStorageDataType SPNVMeDataType SPSerialATADataType SPPowerDataType SPDisplaysDataType SPNetworkDataType >"$RUN_DIR/system_profiler.txt" 2>&1 || log WARN inventory_partial "system_profiler rc=$?"
  else
    log WARN inventory_partial "missing system_profiler"
  fi
  if have diskutil; then
    diskutil list >"$RUN_DIR/diskutil_list.txt" 2>&1 || log WARN inventory_partial "diskutil rc=$?"
    diskutil apfs list >"$RUN_DIR/diskutil_apfs.txt" 2>&1 || true
  else
    log WARN inventory_partial "missing diskutil"
  fi
  # Best-effort snapshots; absence or failure of these is not noteworthy.
  have ioreg && ioreg -r -c AppleSmartBattery >"$RUN_DIR/ioreg_battery.txt" 2>&1 || true
  have pmset && pmset -g >"$RUN_DIR/pmset.txt" 2>&1 || true
  have pmset && pmset -g assertions >"$RUN_DIR/pmset_assertions.txt" 2>&1 || true
  have ifconfig && ifconfig >"$RUN_DIR/ifconfig.txt" 2>&1 || true
  have netstat && netstat -rn >"$RUN_DIR/netstat_routes.txt" 2>&1 || true
  have networksetup && networksetup -listallhardwareports >"$RUN_DIR/networksetup_ports.txt" 2>&1 || true
  # Record the computed test sizing so a report reader can see the policy.
  log INFO sizing_policy "ram_target_mb=$(ram_test_mb) ram_passes=$(ram_test_passes) ram_reserve_mb=$(ram_reserve_mb) ssd_write_mb=$(ssd_test_mb) ssd_pattern_mb=$(ssd_pattern_test_mb) ssd_read_mb=$(ssd_read_test_mb) ssd_raw_read_mb=$(ssd_device_read_mb) ssd_keep_free_mb=$(ssd_keep_free_mb) df_total_mb=$(df_total_mb) df_free_mb=$(df_free_mb)"
  detect_tools
  collect_smart_snapshot "pre"
  evaluate_smart_health "pre"
  collect_kernel_storage_logs "pre"
  collect_crash_reports
}

# Target memtester size in MB: explicit override wins; otherwise a
# profile-dependent percentage of installed RAM, capped (per profile or via
# RAM_TEST_MAX_MB) and floored at 256 MB.
ram_test_mb() {
  if [ "$RAM_TEST_MB" != "auto" ]; then
    echo "$RAM_TEST_MB"
    return
  fi
  total_mb="$(mem_total_mb)"
  # Percentage ladder; deep scales the fraction up with installed memory.
  pct=0
  case "$PROFILE" in
    deep)
      pct=80
      [ "$total_mb" -le 16384 ] 2>/dev/null && pct=70
      [ "$total_mb" -le 8192 ] 2>/dev/null && pct=62
      ;;
    standard) pct=60 ;;
    quick) pct=30 ;;
  esac
  mb=$((total_mb * pct / 100))
  if [ "$RAM_TEST_MAX_MB" != "auto" ]; then
    [ "$mb" -gt "$RAM_TEST_MAX_MB" ] && mb="$RAM_TEST_MAX_MB"
  else
    case "$PROFILE" in
      deep) [ "$mb" -gt 131072 ] && mb=131072 ;;
      standard) [ "$mb" -gt 24576 ] && mb=24576 ;;
      quick) [ "$mb" -gt 4096 ] && mb=4096 ;;
    esac
  fi
  [ "$mb" -lt 256 ] && mb=256
  echo "$mb"
}

# Number of memtester passes: explicit override wins; deep runs more passes
# on smaller machines, standard/quick run 2, anything else 1.
ram_test_passes() {
  if [ "$RAM_TEST_PASSES" != "auto" ]; then
    echo "$RAM_TEST_PASSES"
    return
  fi
  total_mb="$(mem_total_mb)"
  case "$PROFILE" in
    deep)
      n=4
      [ "$total_mb" -le 16384 ] 2>/dev/null && n=6
      [ "$total_mb" -le 8192 ] 2>/dev/null && n=8
      echo "$n"
      ;;
    standard | quick) echo 2 ;;
    *) echo 1 ;;
  esac
}

# Before the RAM test: sync, try to flush reclaimable memory with purge, and
# snapshot vm_stat for later comparison. Skipped for the inventory profile.
prepare_memory_for_ram_test() {
  case "$PROFILE" in inventory) return 0 ;; esac
  stage "memory_prepare"
  sync >/dev/null 2>&1 || true
  if ! have purge; then
    log INFO memory_purge_unavailable "missing_purge"
  else
    purge >>"$LOG" 2>&1 || log WARN memory_purge_failed "rc=$?"
  fi
  have vm_stat && { vm_stat >>"$RUN_DIR/vm_stat_before_ram.txt" 2>&1 || true; }
}

# Installed physical memory in MB (0 when hw.memsize cannot be read).
mem_total_mb() {
  bytes="$(sysctl -n hw.memsize 2>/dev/null || echo 0)"
  printf '%s\n' $((bytes / 1024 / 1024))
}

# MB of memory to leave untouched during the RAM test: explicit override
# wins; otherwise stepped up with installed RAM (1.5G/2G/3G/4G).
ram_reserve_mb() {
  if [ "$RAM_TEST_RESERVE_MB" != "auto" ]; then
    echo "$RAM_TEST_RESERVE_MB"
    return
  fi
  total_mb="$(mem_total_mb)"
  reserve=4096
  [ "$total_mb" -le 32768 ] 2>/dev/null && reserve=3072
  [ "$total_mb" -le 16384 ] 2>/dev/null && reserve=2048
  [ "$total_mb" -le 8192 ] 2>/dev/null && reserve=1536
  echo "$reserve"
}

# RAM verification via memtester: size the target from profile + vm_stat,
# then run N single-pass memtester invocations. If the very first pass of an
# attempt fails (typically an allocation/mlock failure rather than bad RAM),
# shrink the attempt by 25% and retry; a failure on any later pass is treated
# as a real memory error (FAIL). Skipped for the inventory profile.
run_ram_test() {
  case "$PROFILE" in inventory) return 0 ;; esac
  stage "ram_test"
  if ! have memtester; then
    log WARN ram_test_skipped "missing_memtester"
    return 0
  fi
  target_mb="$(ram_test_mb)"
  passes="$(ram_test_passes)"
  reserve_mb="$(ram_reserve_mb)"
  vm_free_mb=0
  vm_spec_mb=0
  vm_inactive_mb=0
  vm_purgeable_mb=0
  if have vm_stat; then
    # Clamp the target to what vm_stat reports as reclaimable (free +
    # speculative + inactive + purgeable pages, minus the safety reserve) so
    # memtester does not push the system into heavy swapping.
    page_size="$(vm_stat 2>/dev/null | awk '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/) {print $i; exit}}')"
    [ -n "$page_size" ] || page_size=4096
    vm_snapshot="$(vm_stat 2>/dev/null)"
    vm_free_pages="$(printf '%s\n' "$vm_snapshot" | awk '/Pages free/ {gsub("\\.","",$3); print $3; exit}')"
    vm_spec_pages="$(printf '%s\n' "$vm_snapshot" | awk '/Pages speculative/ {gsub("\\.","",$3); print $3; exit}')"
    vm_inactive_pages="$(printf '%s\n' "$vm_snapshot" | awk '/Pages inactive/ {gsub("\\.","",$3); print $3; exit}')"
    vm_purgeable_pages="$(printf '%s\n' "$vm_snapshot" | awk '/Pages purgeable/ {gsub("\\.","",$3); print $3; exit}')"
    [ -n "$vm_free_pages" ] || vm_free_pages=0
    [ -n "$vm_spec_pages" ] || vm_spec_pages=0
    [ -n "$vm_inactive_pages" ] || vm_inactive_pages=0
    [ -n "$vm_purgeable_pages" ] || vm_purgeable_pages=0
    vm_free_mb=$((vm_free_pages * page_size / 1024 / 1024))
    vm_spec_mb=$((vm_spec_pages * page_size / 1024 / 1024))
    vm_inactive_mb=$((vm_inactive_pages * page_size / 1024 / 1024))
    vm_purgeable_mb=$((vm_purgeable_pages * page_size / 1024 / 1024))
    avail_mb=$((vm_free_mb + vm_spec_mb + vm_inactive_mb + vm_purgeable_mb - reserve_mb))
    if [ "$avail_mb" -gt 128 ] 2>/dev/null && [ "$target_mb" -gt "$avail_mb" ] 2>/dev/null; then
      target_mb="$avail_mb"
    fi
  fi
  [ "$target_mb" -lt 128 ] && target_mb=128
  log INFO ram_memtester_start "target_mb=$target_mb passes=$passes vm_free_mb=$vm_free_mb vm_spec_mb=$vm_spec_mb vm_inactive_mb=$vm_inactive_mb vm_purgeable_mb=$vm_purgeable_mb reserve_mb=$reserve_mb"
  attempt_mb="$target_mb"
  # Outer loop: shrink-and-retry; inner loop: the configured pass count.
  while [ "$attempt_mb" -ge 128 ]; do
    pass=1
    total_start="$(now_epoch)"
    while [ "$pass" -le "$passes" ]; do
      log INFO ram_pass_start "pass=$pass passes=$passes mb=$attempt_mb"
      pass_start="$(now_epoch)"
      if memtester "${attempt_mb}M" 1 >>"$LOG" 2>&1; then
        dur="$(duration_since "$pass_start")"
        rate="$(mb_per_sec "$attempt_mb" "$dur")"
        log INFO ram_pass_ok "pass=$pass passes=$passes tested_mb=$attempt_mb duration_sec=$dur mb_s=$rate"
        log INFO speed_metric "kind=ram phase=memtester_pass sample=$pass mb=$attempt_mb duration_sec=$dur mb_s=$rate"
        upload_file "$EVENTS" events || true
        pass=$((pass + 1))
      else
        rc=$?
        if [ "$pass" -eq 1 ]; then
          # First pass of an attempt failing is treated as an over-large
          # allocation, not a memory fault: shrink by 25% and retry.
          log WARN ram_memtester_attempt_failed "rc=$rc attempted_mb=$attempt_mb pass=$pass"
          attempt_mb=$((attempt_mb * 75 / 100))
          break
        fi
        # A later pass failing at a size that already passed once is a
        # genuine memory error.
        log FAIL ram_memtester_failed "rc=$rc target_mb=$attempt_mb pass=$pass"
        return 0
      fi
    done
    # All passes completed at this size: report aggregate throughput.
    if [ "$pass" -gt "$passes" ]; then
      total_dur="$(duration_since "$total_start")"
      effective_mb=$((attempt_mb * passes))
      total_rate="$(mb_per_sec "$effective_mb" "$total_dur")"
      log INFO speed_metric "kind=ram phase=memtester_total samples=$passes mb=$effective_mb duration_sec=$total_dur mb_s=$total_rate"
      log INFO ram_verify_ok "method=memtester tested_mb=$attempt_mb passes=$passes duration_sec=$total_dur mb_s=$total_rate"
      return 0
    fi
  done
  # Shrunk below the 128 MB minimum without ever completing a first pass.
  log FAIL ram_memtester_failed "target_mb=$target_mb"
}

# Memory pressure test via stress-ng --vm with --verify; duration and memory
# fraction scale with the profile. A failure here is only a WARN.
run_memory_pressure_test() {
  case "$PROFILE" in inventory) return 0 ;; esac
  if ! have stress-ng; then
    log WARN memory_pressure_skipped "missing_stress-ng"
    return 0
  fi
  case "$PROFILE" in
    deep) limit_sec=600; mem_frac="75%" ;;
    standard) limit_sec=180; mem_frac="60%" ;;
    quick) limit_sec=45; mem_frac="35%" ;;
    *) limit_sec=30; mem_frac="25%" ;;
  esac
  log INFO memory_pressure_start "tool=stress-ng vm_bytes=$mem_frac timeout_sec=$limit_sec"
  if stress-ng --vm 1 --vm-bytes "$mem_frac" --verify --timeout "${limit_sec}s" >>"$LOG" 2>&1; then
    log INFO memory_pressure_ok "tool=stress-ng vm_bytes=$mem_frac timeout_sec=$limit_sec"
  else
    rc=$?
    log WARN memory_pressure_failed "tool=stress-ng rc=$rc"
  fi
}

# Free space in MB on the filesystem backing the scratch directory.
df_free_mb() {
  df -kP "${TMPDIR:-/tmp}" 2>/dev/null | awk 'NR == 2 { printf "%d\n", $4 / 1024 }'
}

# Total size in MB of the filesystem backing the scratch directory.
df_total_mb() {
  df -kP "${TMPDIR:-/tmp}" 2>/dev/null | awk 'NR == 2 { printf "%d\n", $2 / 1024 }'
}

# MB to keep free on disk during SSD tests: explicit override wins;
# deep keeps 5% of the disk (clamped to 8G..64G), standard ~8.3%
# (12G..64G), everything else a flat 20G.
ssd_keep_free_mb() {
  if [ "$SSD_KEEP_FREE_MB" != "auto" ]; then
    echo "$SSD_KEEP_FREE_MB"
    return
  fi
  disk_mb="$(df_total_mb)"
  [ -n "$disk_mb" ] || disk_mb=0
  case "$PROFILE" in
    deep) divisor=20; floor=8192 ;;
    standard) divisor=12; floor=12288 ;;
    *)
      echo 20480
      return
      ;;
  esac
  keep=$((disk_mb / divisor))
  [ "$keep" -lt "$floor" ] && keep="$floor"
  [ "$keep" -gt 65536 ] && keep=65536
  echo "$keep"
}

# Size in MB for the SSD write test: explicit override wins; otherwise a
# profile fraction of the space above the keep-free threshold, profile- or
# SSD_MAX_MB-capped, floored at 512. Echoes 0 when under 512 MB of headroom
# (test should be skipped).
ssd_test_mb() {
  if [ "$SSD_TEST_MB" != "auto" ]; then
    echo "$SSD_TEST_MB"
    return
  fi
  space_mb="$(df_free_mb)"
  [ -n "$space_mb" ] || space_mb=0
  headroom=$((space_mb - $(ssd_keep_free_mb)))
  [ "$headroom" -lt 0 ] && headroom=0
  if [ "$headroom" -lt 512 ]; then
    echo 0
    return
  fi
  case "$PROFILE" in
    deep) size=$((headroom * 90 / 100)) ;;
    standard) size=$((headroom * 35 / 100)) ;;
    quick) size=$((headroom * 10 / 100)) ;;
    *) size=0 ;;
  esac
  if [ "$SSD_MAX_MB" != "auto" ]; then
    [ "$size" -gt "$SSD_MAX_MB" ] && size="$SSD_MAX_MB"
  else
    case "$PROFILE" in
      standard) [ "$size" -gt 32768 ] && size=32768 ;;
      quick) [ "$size" -gt 8192 ] && size=8192 ;;
    esac
  fi
  [ "$size" -lt 512 ] && size=512
  echo "$size"
}

# Size in MB for the SSD read test: explicit override wins; deep uses the
# write-test size (capped at 256G) unless SSD_READ_MAX_MB is set, standard
# uses 8G or the override, quick 2G; everything is floored at 512.
ssd_read_test_mb() {
  if [ "$SSD_READ_TEST_MB" != "auto" ]; then
    echo "$SSD_READ_TEST_MB"
    return
  fi
  case "$PROFILE" in
    deep)
      if [ "$SSD_READ_MAX_MB" != "auto" ]; then
        size="$SSD_READ_MAX_MB"
      else
        size="$(ssd_test_mb)"
        [ "$size" -gt 262144 ] && size=262144
      fi
      ;;
    standard)
      size=8192
      [ "$SSD_READ_MAX_MB" != "auto" ] && size="$SSD_READ_MAX_MB"
      ;;
    quick) size=2048 ;;
    *) size=0 ;;
  esac
  [ "$size" -lt 512 ] && size=512
  echo "$size"
}

# Size in MB for the raw device read: explicit override wins; deep reads the
# whole device ("all"), standard uses SSD_DEVICE_READ_MAX_MB, quick 1G.
# The 256 MB floor applies to every non-deep auto value.
ssd_device_read_mb() {
  [ "$SSD_DEVICE_READ_MB" = "auto" ] || {
    echo "$SSD_DEVICE_READ_MB"
    return
  }
  if [ "$PROFILE" = "deep" ]; then
    echo "all"
    return
  fi
  case "$PROFILE" in
    standard) size="$SSD_DEVICE_READ_MAX_MB" ;;
    quick) size=1024 ;;
    *) size=0 ;;
  esac
  [ "$size" -lt 256 ] && size=256
  echo "$size"
}

# Size in MB for the fixed-pattern SSD test: explicit override wins;
# otherwise the write-test size capped per profile. Echoes 0 (test skipped)
# when the result would be under 512 MB.
# CLEANUP: the old trailing `[ "$base" -lt 512 ] && base=512` floor was
# unreachable after the early `echo 0; return` above it and has been removed.
ssd_pattern_test_mb() {
  if [ "$SSD_PATTERN_TEST_MB" != "auto" ]; then
    echo "$SSD_PATTERN_TEST_MB"
    return
  fi
  base="$(ssd_test_mb)"
  case "$PROFILE" in
    deep) [ "$base" -gt 262144 ] && base=262144 ;;
    standard) [ "$base" -gt 8192 ] && base=8192 ;;
    quick) [ "$base" -gt 1024 ] && base=1024 ;;
  esac
  if [ "$base" -lt 512 ]; then
    echo 0
    return
  fi
  echo "$base"
}

# Filesystem-level SSD integrity and throughput test: write random data with
# dd, hash the file twice with shasum, and FAIL if the two digests differ
# (data changed between write and re-read). Write and both read speeds are
# logged as speed_metric events; the scratch file is always removed.
# NOTE(review): `dd bs=16m` is BSD/macOS block-size syntax (GNU dd uses 16M).
run_ssd_file_test() {
  case "$PROFILE" in inventory) return 0 ;; esac
  stage "ssd_file_test"
  if ! have shasum; then
    log WARN ssd_file_test_skipped "missing shasum"
    return 0
  fi
  mb="$(ssd_test_mb)"
  keep_free_mb="$(ssd_keep_free_mb)"
  free_mb="$(df_free_mb)"
  # Skip when sized below the minimum or when we are already at/under the
  # keep-free threshold.
  if [ "$mb" -lt 512 ] 2>/dev/null || [ "$free_mb" -le "$keep_free_mb" ] 2>/dev/null; then
    log WARN ssd_file_test_skipped "mb=$mb free_mb=$free_mb keep_free_mb=$keep_free_mb"
    return 0
  fi
  file="${TMPDIR:-/tmp}/diag_core_macos_ssd_${RUN_ID}.bin"
  # Round down to whole 16 MB dd blocks (minimum one block = 16 MB).
  count=$((mb / 16))
  [ "$count" -lt 1 ] && count=1
  actual_mb=$((count * 16))
  log INFO ssd_write_start "mb=$actual_mb file=$file"
  write_start="$(now_epoch)"
  if dd if=/dev/urandom of="$file" bs=16m count="$count" >>"$LOG" 2>&1; then
    write_dur="$(duration_since "$write_start")"
    write_rate="$(mb_per_sec "$actual_mb" "$write_dur")"
    log INFO speed_metric "kind=ssd_write phase=random_write mb=$actual_mb duration_sec=$write_dur mb_s=$write_rate"
    # Flush before reading back so the hashes measure on-disk data.
    sync
    hash_start="$(now_epoch)"
    sum1="$(shasum -a 256 "$file" | awk '{print $1}')"
    hash_dur="$(duration_since "$hash_start")"
    hash_rate="$(mb_per_sec "$actual_mb" "$hash_dur")"
    log INFO speed_metric "kind=ssd_read phase=hash_read_1 mb=$actual_mb duration_sec=$hash_dur mb_s=$hash_rate"
    hash_start="$(now_epoch)"
    sum2="$(shasum -a 256 "$file" | awk '{print $1}')"
    hash_dur="$(duration_since "$hash_start")"
    hash_rate="$(mb_per_sec "$actual_mb" "$hash_dur")"
    log INFO speed_metric "kind=ssd_read phase=hash_read_2 mb=$actual_mb duration_sec=$hash_dur mb_s=$hash_rate"
    if [ "$sum1" = "$sum2" ]; then
      log INFO ssd_verify_ok "mb=$actual_mb sha256=$sum1 write_mb_s=$write_rate"
    else
      log FAIL ssd_verify_mismatch "sum1=$sum1 sum2=$sum2"
    fi
  else
    log FAIL ssd_write_failed "mb=$mb"
  fi
  rm -f "$file"
}

run_ssd_pattern_tests() {
  # Zero-pattern write plus double-hash verification (deep/standard only),
  # followed by a fio crc32c write-verify pass over the same path when fio
  # is installed. Skipped when shasum is missing or the size/free-space
  # guards fail.
  case "$PROFILE" in deep|standard) ;; *) return 0 ;; esac
  stage "ssd_pattern_tests"
  if ! have shasum; then
    log WARN ssd_pattern_skipped "missing_shasum"
    return 0
  fi
  mb="$(ssd_pattern_test_mb)"
  keep_free_mb="$(ssd_keep_free_mb)"
  free_mb="$(df_free_mb)"
  # 2>/dev/null suppresses test errors on non-numeric values; a failed
  # comparison falls through to running the test.
  if [ "$mb" -lt 512 ] 2>/dev/null || [ "$free_mb" -le "$keep_free_mb" ] 2>/dev/null; then
    log WARN ssd_pattern_skipped "mb=$mb free_mb=$free_mb keep_free_mb=$keep_free_mb"
    return 0
  fi
  file="${TMPDIR:-/tmp}/diag_core_pattern_${RUN_ID}.bin"
  # Round down to a multiple of the 16 MB dd block size (BSD suffix "16m").
  count=$((mb / 16))
  [ "$count" -lt 1 ] && count=1
  actual_mb=$((count * 16))
  log INFO ssd_pattern_zero_start "mb=$actual_mb file=$file"
  write_start="$(now_epoch)"
  if dd if=/dev/zero of="$file" bs=16m count="$count" >>"$LOG" 2>&1; then
    write_dur="$(duration_since "$write_start")"
    write_rate="$(mb_per_sec "$actual_mb" "$write_dur")"
    log INFO speed_metric "kind=ssd_write phase=zero_pattern_write mb=$actual_mb duration_sec=$write_dur mb_s=$write_rate"
    sync
    hash_start="$(now_epoch)"
    sum1="$(shasum -a 256 "$file" | awk '{print $1}')"
    hash_dur="$(duration_since "$hash_start")"
    hash_rate="$(mb_per_sec "$actual_mb" "$hash_dur")"
    log INFO speed_metric "kind=ssd_read phase=zero_pattern_hash_1 mb=$actual_mb duration_sec=$hash_dur mb_s=$hash_rate"
    hash_start="$(now_epoch)"
    sum2="$(shasum -a 256 "$file" | awk '{print $1}')"
    hash_dur="$(duration_since "$hash_start")"
    hash_rate="$(mb_per_sec "$actual_mb" "$hash_dur")"
    log INFO speed_metric "kind=ssd_read phase=zero_pattern_hash_2 mb=$actual_mb duration_sec=$hash_dur mb_s=$hash_rate"
    if [ "$sum1" = "$sum2" ]; then
      log INFO ssd_pattern_zero_ok "mb=$actual_mb sha256=$sum1 write_mb_s=$write_rate"
    else
      log FAIL ssd_pattern_zero_mismatch "sum1=$sum1 sum2=$sum2"
    fi
  else
    # Capture dd's exit status for the failure log (the fio branches below
    # already do this; the dd branch previously discarded it).
    rc=$?
    log FAIL ssd_pattern_zero_failed "rc=$rc mb=$actual_mb"
  fi
  # fio rewrites the file with its own crc32c-verified data, so this runs
  # even if the zero-pattern dd step failed.
  if have fio; then
    log INFO fio_verify_pattern_start "size_mb=$actual_mb file=$file"
    if fio --name=diag_verify_pattern --filename="$file" --rw=write --bs=1m --size="${actual_mb}m" --direct=0 --ioengine=sync --verify=crc32c --do_verify=1 --verify_fatal=1 --group_reporting --output-format=json --output="$RUN_DIR/fio_verify_pattern.json" >>"$LOG" 2>&1; then
      log INFO fio_verify_pattern_ok "size_mb=$actual_mb"
    else
      rc=$?
      log FAIL fio_verify_pattern_failed "rc=$rc size_mb=$actual_mb"
    fi
  fi
  rm -f "$file"
}

# Sequentially read the boot disk's raw character device (/dev/rdiskN) to
# exercise the medium directly. In deep profile a read failure is a FAIL;
# otherwise it is only a WARN, since raw-device access commonly fails
# without elevated privileges.
run_ssd_device_read_test() {
  case "$PROFILE" in inventory) return 0 ;; esac
  stage "ssd_device_read"
  if ! have diskutil; then
    log WARN ssd_device_read_skipped "missing_diskutil"
    return 0
  fi
  # Map the root filesystem's device node (e.g. disk3s1s1) to its whole
  # disk (e.g. disk3) via diskutil; fall back to the device node itself.
  root_dev="$(df -P / 2>/dev/null | awk 'NR==2 {print $1}')"
  whole="$(diskutil info "$root_dev" 2>/dev/null | awk -F': *' '/Part of Whole/ {print $2; exit}')"
  [ -n "$whole" ] || whole="$(diskutil info "$root_dev" 2>/dev/null | awk -F': *' '/Device Node/ {print $2; exit}')"
  whole="${whole#/dev/}"
  # Bail out unless detection produced a diskN-style identifier.
  case "$whole" in disk*) ;; *) log WARN ssd_device_read_skipped "could_not_detect_whole_disk root_dev=$root_dev whole=$whole"; return 0 ;; esac
  # The r-prefixed node is the raw character device (presumably chosen to
  # avoid the buffer cache — standard macOS behavior).
  raw="/dev/r$whole"
  mb="$(ssd_device_read_mb)"
  required="$SSD_RAW_READ_REQUIRED"
  if [ "$required" = "auto" ]; then
    case "$PROFILE" in deep) required=1 ;; *) required=0 ;; esac
  fi
  if [ "$mb" = "all" ]; then
    # Whole-disk read (deep profile). The size is queried separately so a
    # throughput metric can still be reported; if the size lookup fails,
    # the speed_metric line is simply omitted.
    log INFO ssd_device_read_start "device=$raw mb=all"
    read_mb="$(whole_disk_size_mb "$whole")"
    [ -n "$read_mb" ] || read_mb=0
    read_start="$(now_epoch)"
    dd if="$raw" of=/dev/null bs=16m >>"$LOG" 2>&1
    rc=$?
    read_dur="$(duration_since "$read_start")"
    if [ "$rc" -eq 0 ]; then
      if [ "$read_mb" -gt 0 ] 2>/dev/null; then
        read_rate="$(mb_per_sec "$read_mb" "$read_dur")"
        log INFO speed_metric "kind=ssd_read phase=raw_device_read mb=$read_mb duration_sec=$read_dur mb_s=$read_rate"
      else
        read_rate=""
      fi
      log INFO ssd_device_read_ok "device=$raw mb=all read_mb=$read_mb duration_sec=$read_dur mb_s=$read_rate"
      return 0
    fi
    if [ "$required" = "1" ]; then
      log FAIL ssd_device_read_failed "device=$raw rc=$rc required=1 maybe_permission_or_io"
    else
      log WARN ssd_device_read_failed "device=$raw rc=$rc maybe_permission_or_io"
    fi
    return 0
  fi
  # Fixed-size read: round down to a multiple of the 4 MB dd block size.
  count=$((mb / 4))
  [ "$count" -lt 1 ] && count=1
  actual_mb=$((count * 4))
  log INFO ssd_device_read_start "device=$raw mb=$actual_mb"
  read_start="$(now_epoch)"
  if dd if="$raw" of=/dev/null bs=4m count="$count" >>"$LOG" 2>&1; then
    read_dur="$(duration_since "$read_start")"
    read_rate="$(mb_per_sec "$actual_mb" "$read_dur")"
    log INFO speed_metric "kind=ssd_read phase=raw_device_read mb=$actual_mb duration_sec=$read_dur mb_s=$read_rate"
    log INFO ssd_device_read_ok "device=$raw mb=$actual_mb duration_sec=$read_dur mb_s=$read_rate"
  else
    rc=$?
    if [ "$required" = "1" ]; then
      log FAIL ssd_device_read_failed "device=$raw rc=$rc required=1 maybe_permission_or_io"
    else
      log WARN ssd_device_read_failed "device=$raw rc=$rc maybe_permission_or_io"
    fi
  fi
}


# fio throughput tests: sequential write, random read, and (deep profile
# only) a 70/30 mixed random workload. Per-job JSON results are written to
# $RUN_DIR/fio_*.json for later upload. Skipped when fio is missing or the
# size/free-space guards fail.
run_ssd_fio_tests() {
  case "$PROFILE" in inventory) return 0 ;; esac
  if ! have fio; then
    log WARN fio_skipped "missing_fio"
    return 0
  fi
  stage "ssd_fio_tests"
  mb="$(ssd_test_mb)"
  read_mb="$(ssd_read_test_mb)"
  # Resolve per-job runtime: explicit override, else per-profile default;
  # deep runs longer when the file size exceeds 256 GB.
  runtime_sec="$FIO_RUNTIME_SEC"
  if [ "$runtime_sec" = "auto" ]; then
    case "$PROFILE" in
      deep)
        runtime_sec=300
        [ "$mb" -gt 262144 ] 2>/dev/null && runtime_sec=600
        ;;
      standard) runtime_sec=120 ;;
      quick) runtime_sec=60 ;;
      *) runtime_sec=60 ;;
    esac
  fi
  keep_free_mb="$(ssd_keep_free_mb)"
  free_mb="$(df_free_mb)"
  # 2>/dev/null suppresses test errors on non-numeric values; a failed
  # comparison falls through to running the tests.
  if [ "$mb" -lt 512 ] 2>/dev/null || [ "$free_mb" -le "$keep_free_mb" ] 2>/dev/null; then
    log WARN fio_skipped "mb=$mb free_mb=$free_mb keep_free_mb=$keep_free_mb"
    return 0
  fi
  file="${TMPDIR:-/tmp}/diag_core_fio_${RUN_ID}.bin"
  # The read jobs reuse the file written by the sequential-write job, so
  # cap the read size at the write size.
  if [ "$read_mb" -gt "$mb" ] 2>/dev/null; then
    read_mb="$mb"
  fi
  log INFO fio_write_start "size_mb=$mb runtime_sec=$runtime_sec file=$file"
  if fio --name=diag_seq_write --filename="$file" --rw=write --bs=1m --size="${mb}m" --iodepth=8 --direct=0 --ioengine=sync --time_based --runtime="$runtime_sec" --group_reporting --output-format=json --output="$RUN_DIR/fio_seq_write.json" >>"$LOG" 2>&1; then
    log INFO fio_write_ok "size_mb=$mb"
  else
    rc=$?
    log FAIL fio_write_failed "rc=$rc size_mb=$mb"
  fi
  log INFO fio_read_start "size_mb=$read_mb runtime_sec=$runtime_sec file=$file"
  if fio --name=diag_rand_read --filename="$file" --rw=randread --bs=128k --size="${read_mb}m" --iodepth=16 --direct=0 --ioengine=sync --time_based --runtime="$runtime_sec" --group_reporting --output-format=json --output="$RUN_DIR/fio_rand_read.json" >>"$LOG" 2>&1; then
    log INFO fio_read_ok "size_mb=$read_mb"
  else
    rc=$?
    log FAIL fio_read_failed "rc=$rc size_mb=$read_mb"
  fi
  if [ "$PROFILE" = "deep" ]; then
    log INFO fio_mixed_start "size_mb=$read_mb runtime_sec=$runtime_sec file=$file"
    if fio --name=diag_randrw_mixed --filename="$file" --rw=randrw --rwmixread=70 --bs=128k --size="${read_mb}m" --iodepth=16 --direct=0 --ioengine=sync --time_based --runtime="$runtime_sec" --group_reporting --output-format=json --output="$RUN_DIR/fio_randrw_mixed.json" >>"$LOG" 2>&1; then
      log INFO fio_mixed_ok "size_mb=$read_mb"
    else
      rc=$?
      log FAIL fio_mixed_failed "rc=$rc size_mb=$read_mb"
    fi
  fi
  rm -f "$file"
}

# Count lines in $EVENTS matching a pattern, printing a single "0" when the
# file is missing or has no matches. NOTE: `grep -c` already prints 0 on
# no-match while exiting non-zero, so the previous `$(grep -c ... || echo 0)`
# idiom produced the two-line value "0\n0" and injected a raw newline into
# the JSON below, corrupting it.
_count_events() {
  n="$(grep -c "$1" "$EVENTS" 2>/dev/null)" || :
  [ -n "$n" ] || n=0
  printf '%s\n' "$n"
}

write_health_metrics() {
  # Aggregate test outcomes into a single-line JSON artifact for upload.
  metrics="$RUN_DIR/health_metrics.json"
  smart_files=0
  # Unmatched globs stay literal under plain sh, so the -f guard filters
  # the unexpanded pattern out.
  for f in "$RUN_DIR"/*smartctl_*.txt; do
    [ -f "$f" ] && smart_files=$((smart_files + 1))
  done
  kernel_hits="$(cat "$RUN_DIR"/*_kernel_storage_relevant.txt 2>/dev/null | wc -l | tr -d ' ')"
  # Same grep -c pitfall as _count_events, but against a different file.
  crash_hits="$(grep -c '^===== ' "$RUN_DIR/crash_reports_recent.txt" 2>/dev/null)" || :
  [ -n "$crash_hits" ] || crash_hits=0
  # Pull the most recent tested sizes out of the event log; empty when the
  # corresponding test never ran (defaulted to 0 below).
  ram_tested="$(grep -E 'ram_verify_ok' "$EVENTS" 2>/dev/null | tail -n 1 | sed -n 's/.*tested_mb=\([0-9][0-9]*\).*/\1/p')"
  ssd_hash_mb="$(grep -E 'ssd_verify_ok' "$EVENTS" 2>/dev/null | tail -n 1 | sed -n 's/.*mb=\([0-9][0-9]*\).*/\1/p')"
  fio_write_ok="$(_count_events 'fio_write_ok')"
  fio_read_ok="$(_count_events 'fio_read_ok')"
  fio_mixed_ok="$(_count_events 'fio_mixed_ok')"
  fio_verify_pattern_ok="$(_count_events 'fio_verify_pattern_ok')"
  raw_read_ok="$(_count_events 'ssd_device_read_ok')"
  [ -n "$ram_tested" ] || ram_tested=0
  [ -n "$ssd_hash_mb" ] || ssd_hash_mb=0
  cat >"$metrics" <<EOF
{"run_id":"$(json_escape "$RUN_ID")","serial":"$(json_escape "$SERIAL")","model":"$(json_escape "$MODEL")","version":"$VERSION","profile":"$(json_escape "$PROFILE")","smart_files":$smart_files,"kernel_storage_relevant_lines":${kernel_hits:-0},"recent_crash_reports":${crash_hits:-0},"ram_memtester_mb":$ram_tested,"ssd_hash_verify_mb":$ssd_hash_mb,"raw_device_read_ok":$raw_read_ok,"fio_write_ok":$fio_write_ok,"fio_read_ok":$fio_read_ok,"fio_mixed_ok":$fio_mixed_ok,"fio_verify_pattern_ok":$fio_verify_pattern_ok,"updated_at":"$(now_iso)"}
EOF
  log INFO health_metrics "file=$metrics smart_files=$smart_files kernel_hits=${kernel_hits:-0} crash_reports=${crash_hits:-0}"
}

upload_artifacts() {
  # Push every known report artifact that exists to the upload endpoint.
  # Globs that match nothing expand to their literal pattern under plain
  # sh, so the -f check filters those out; individual upload failures are
  # ignored (best effort).
  for artifact in "$RUN_DIR"/inventory.txt "$RUN_DIR"/system_profiler.txt "$RUN_DIR"/diskutil_list.txt "$RUN_DIR"/diskutil_apfs.txt "$RUN_DIR"/ioreg_battery.txt "$RUN_DIR"/pmset.txt "$RUN_DIR"/pmset_assertions.txt "$RUN_DIR"/networksetup_ports.txt "$RUN_DIR"/vm_stat_before_ram.txt "$RUN_DIR"/crash_reports_recent.txt "$RUN_DIR"/powermetrics_sample.txt "$RUN_DIR"/powermetrics_telemetry.txt "$RUN_DIR"/health_metrics.json "$RUN_DIR"/*smartctl*.txt "$RUN_DIR"/*kernel_storage_relevant.txt "$RUN_DIR"/fio_*.json; do
    if [ -f "$artifact" ]; then
      upload_file "$artifact" "$(artifact_kind "$(basename "$artifact")")" || true
    fi
  done
}

run_cpu_smoke() {
  # Light CPU warm-up: repeatedly hash 64 MB of zeros until the deadline
  # passes. Skipped for the inventory profile.
  case "$PROFILE" in inventory) return 0 ;; esac
  stage "cpu_smoke"
  deadline=$(( $(now_epoch) + CPU_SMOKE_SEC ))
  while :; do
    [ "$(now_epoch)" -lt "$deadline" ] || break
    dd if=/dev/zero bs=1m count=64 2>/dev/null | shasum >/dev/null 2>&1 || true
  done
  log INFO cpu_smoke_done "duration_sec=$CPU_SMOKE_SEC"
}

cpu_stress_sec() {
  # Resolve the CPU stress duration in seconds: an explicit env override
  # wins, otherwise a per-profile default. 0 disables run_cpu_stress.
  if [ "$CPU_STRESS_SEC" != "auto" ]; then
    echo "$CPU_STRESS_SEC"
    return
  fi
  if [ "$PROFILE" = "deep" ]; then
    echo 900
  elif [ "$PROFILE" = "standard" ]; then
    echo 300
  elif [ "$PROFILE" = "quick" ]; then
    echo 60
  else
    echo 0
  fi
}

cpu_stress_workers() {
  # Worker count for the CPU stress test: explicit override, otherwise the
  # detected core count (falling back to 2 when sysctl is unavailable),
  # never below 1.
  if [ "$CPU_STRESS_WORKERS" != "auto" ]; then
    echo "$CPU_STRESS_WORKERS"
    return
  fi
  detected="$(sysctl -n hw.ncpu 2>/dev/null || echo 2)"
  [ -n "$detected" ] || detected=2
  # 2>/dev/null hides the test error if the detected value is non-numeric.
  [ "$detected" -lt 1 ] 2>/dev/null && detected=1
  echo "$detected"
}

run_cpu_stress() {
  # Sustained multi-core CPU load: stress-ng with result verification when
  # available, otherwise one `yes > /dev/null` spinner per worker. A
  # stress-ng failure falls through to the spinner fallback.
  case "$PROFILE" in inventory) return 0 ;; esac
  sec="$(cpu_stress_sec)"
  [ "$sec" -gt 0 ] 2>/dev/null || return 0
  workers="$(cpu_stress_workers)"
  stage "cpu_stress"
  log INFO cpu_stress_start "duration_sec=$sec workers=$workers telemetry_sec=$TELEMETRY_SEC"
  start="$(now_epoch)"
  if have stress-ng; then
    if stress-ng --cpu "$workers" --cpu-method matrixprod --verify --timeout "${sec}s" --metrics-brief >>"$LOG" 2>&1; then
      dur="$(duration_since "$start")"
      log INFO cpu_stress_ok "tool=stress-ng duration_sec=$dur workers=$workers"
      return 0
    else
      # Capture the exit status INSIDE the else branch: after `fi` with a
      # failed condition and no else, $? reads as 0 and the real
      # stress-ng status would be lost (the previous code logged rc=0
      # for every failure).
      rc=$?
      log WARN cpu_stress_failed "tool=stress-ng rc=$rc fallback=yes"
    fi
  fi
  # Fallback busy-load: spawn the workers, poll the clock in 5 s steps,
  # then kill and reap each spinner.
  pids=""
  i=1
  while [ "$i" -le "$workers" ]; do
    yes >/dev/null 2>&1 &
    pids="$pids $!"
    i=$((i + 1))
  done
  end=$(( $(now_epoch) + sec ))
  while [ "$(now_epoch)" -lt "$end" ]; do
    sleep 5
  done
  for pid in $pids; do
    kill "$pid" >/dev/null 2>&1 || true
    wait "$pid" >/dev/null 2>&1 || true
  done
  dur="$(duration_since "$start")"
  log INFO cpu_stress_ok "tool=yes_fallback duration_sec=$dur workers=$workers"
}

write_summary() {
  # Write the final one-line JSON run summary to $SUMMARY. Overall status
  # is the worst level seen: any FAIL beats any WARN, which beats PASS.
  if [ "$FAIL_COUNT" -gt 0 ]; then
    status="FAIL"
  elif [ "$WARN_COUNT" -gt 0 ]; then
    status="WARN"
  else
    status="PASS"
  fi
  cat >"$SUMMARY" <<EOF
{"run_id":"$(json_escape "$RUN_ID")","serial":"$(json_escape "$SERIAL")","model":"$(json_escape "$MODEL")","profile":"$(json_escape "$PROFILE")","version":"$VERSION","status":"$status","fail_count":$FAIL_COUNT,"warn_count":$WARN_COUNT,"finished_at":"$(now_iso)"}
EOF
  log INFO summary "status=$status fails=$FAIL_COUNT warns=$WARN_COUNT summary=$SUMMARY"
}

# Best-effort teardown: stop background children (monitor loop etc.), then
# push every log and artifact to the report endpoint one final time. Each
# upload is `|| true` so a network failure cannot abort the trap handler.
cleanup() {
  for pid in $CHILD_PIDS; do
    # Killed children are not waited on here; the shell reaps them at exit.
    kill "$pid" >/dev/null 2>&1 || true
  done
  upload_file "$LOG" full_log || true
  upload_file "$EVENTS" events || true
  upload_file "$MONITOR" monitor || true
  upload_file "$TELEMETRY" telemetry || true
  upload_file "$SUMMARY" summary || true
  upload_artifacts || true
}

# NOTE(review): on INT/TERM/HUP the handler runs and the EXIT trap may then
# fire as well, so the uploads can run twice on a signal — presumably
# acceptable because re-uploading is harmless; confirm before changing.
trap cleanup EXIT INT TERM HUP

# Orchestrate the full diagnostic run in a fixed order: inventory is
# collected and uploaded first (so some data exists even if a later test
# hangs), then CPU, RAM, and SSD tests, then post-run health collection,
# metrics, and the final summary. The cleanup EXIT trap handles the last
# round of uploads.
main() {
  echo "Diag-Core macOS v$VERSION"
  echo "Run: $RUN_ID"
  echo "Local artifacts: $RUN_DIR"
  start_sleep_prevention
  # Background monitor; its PID is recorded so cleanup can kill it.
  monitor_loop &
  CHILD_PIDS="$CHILD_PIDS $!"
  stage "start"
  collect_inventory
  upload_artifacts || true
  run_cpu_smoke
  run_cpu_stress
  prepare_memory_for_ram_test
  run_ram_test
  run_memory_pressure_test
  run_ssd_file_test
  run_ssd_pattern_tests
  run_ssd_device_read_test
  run_ssd_fio_tests
  collect_power_sample
  collect_smart_snapshot "post"
  evaluate_smart_health "post"
  collect_kernel_storage_logs "post"
  collect_crash_reports
  write_health_metrics
  stage "done"
  write_summary
}

main "$@"
