修复nvidia-smi无法找到的问题
2026-01-12
深度学习
00
  1. 查找 nvidia-smi → 找到 /usr/local/nvidia/bin/nvidia-smi
  2. 添加到 PATH → export PATH="/usr/local/nvidia/bin:$PATH"
  3. 查找 CUDA 驱动库(libcuda.so.1)→ 找到 /usr/local/nvidia/lib64
  4. 设置 LD_LIBRARY_PATH → export LD_LIBRARY_PATH="/usr/local/nvidia/lib64:$LD_LIBRARY_PATH"
  5. 设置 CUDA_HOME → export CUDA_HOME="/usr/local/cuda"

fix_nvidia_smi.sh:该脚本会在整个系统范围内查找 nvidia-smi 和 CUDA 驱动库,并尝试自动修复环境变量(修复结果会写入 /tmp/setup_nvidia_env.sh,可在训练脚本中 source 使用):

bash
展开代码
#!/bin/bash
# ============================================================================
# nvidia-smi and CUDA Driver Library Fix Script
#
# Locates nvidia-smi and the CUDA driver library (libcuda.so.1), prepends
# their directories to PATH / LD_LIBRARY_PATH for this process, runs a few
# sanity checks, and writes /tmp/setup_nvidia_env.sh so that other scripts
# can `source` the same settings.
#
# Exports made here only affect this process and its children; to apply them
# to another shell, source the generated file.
# ============================================================================

set -e

# Start from a known state so a stale value inherited from the environment
# cannot leak into the summary or the generated setup script.
NVIDIA_SMI_PATH=""
NVIDIA_SMI_DIR=""
CUDA_LIB_PATH=""

#######################################
# Prepend one or more directories to LD_LIBRARY_PATH.
# Avoids leaving an empty entry (a trailing ':') when LD_LIBRARY_PATH was
# empty, because the dynamic loader treats an empty entry as the current
# working directory.
# Arguments: $1 - directory (or colon-separated directories) to prepend
#######################################
prepend_ld_library_path() {
  export LD_LIBRARY_PATH="$1${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
}

#######################################
# Print the first libcuda.so* entry inside a directory, or 'not found'.
# Arguments: $1 - directory to inspect
#######################################
first_libcuda_in() {
  local entry
  for entry in "$1"/libcuda.so*; do
    # An unmatched glob stays literal, so check that the entry really exists.
    if [ -e "$entry" ]; then
      echo "$entry"
      return 0
    fi
  done
  echo 'not found'
}

echo "=========================================="
echo "Starting nvidia-smi and CUDA Driver Library Fix"
echo "=========================================="
echo ""

# ============================================================================
# 1. Find nvidia-smi command
# ============================================================================

#######################################
# Locate the nvidia-smi executable.
# Search order: well-known directories, a `find` over /usr /opt /bin, and
# finally whatever the current PATH resolves.
# Outputs: full path of nvidia-smi on stdout
# Returns: 0 if found, 1 otherwise
#######################################
find_nvidia_smi() {
  local candidate_dirs=(
    "/usr/bin"
    "/usr/local/bin"
    "/usr/local/cuda/bin"
    "/usr/local/nvidia/bin"
    "/opt/nvidia/bin"
    "/bin"
  )
  local dir
  for dir in "${candidate_dirs[@]}"; do
    if [ -f "$dir/nvidia-smi" ] && [ -x "$dir/nvidia-smi" ]; then
      echo "$dir/nvidia-smi"
      return 0
    fi
  done

  # Try to find using find command across the system
  local found_path
  found_path=$(find /usr /opt /bin -name "nvidia-smi" -type f -executable 2>/dev/null | head -1 || true)
  if [ -n "$found_path" ] && [ -f "$found_path" ]; then
    echo "$found_path"
    return 0
  fi

  # Fall back to whatever PATH already resolves
  if command -v nvidia-smi >/dev/null 2>&1; then
    command -v nvidia-smi
    return 0
  fi

  return 1
}

echo "Step 1: Finding nvidia-smi command..."
NVIDIA_SMI_PATH=$(find_nvidia_smi || echo "")

if [ -n "$NVIDIA_SMI_PATH" ]; then
  echo "[OK] Found nvidia-smi: $NVIDIA_SMI_PATH"
  # Add nvidia-smi directory to PATH
  NVIDIA_SMI_DIR=$(dirname "$NVIDIA_SMI_PATH")
  export PATH="${NVIDIA_SMI_DIR}:${PATH}"
  echo " Added $NVIDIA_SMI_DIR to PATH"
else
  echo "[FAIL] nvidia-smi command not found"
  echo " Trying to find in common paths..."
  for path in /usr/bin /usr/local/bin /usr/local/cuda/bin /usr/local/nvidia/bin; do
    if [ -d "$path" ]; then
      echo " Checking $path: $(ls -la "$path/nvidia-smi" 2>/dev/null || echo 'not found')"
    fi
  done
fi
echo ""

# ============================================================================
# 2. Find CUDA Driver Library (libcuda.so.1)
# ============================================================================

#######################################
# Locate the directory that holds the CUDA driver library.
# Search order: well-known directories, a `find` over /usr /opt /lib, and
# finally the ldconfig cache.
# Outputs: directory path on stdout
# Returns: 0 if found, 1 otherwise
#######################################
find_libcuda() {
  local candidate_dirs=(
    "/usr/lib/x86_64-linux-gnu"
    "/usr/local/cuda/lib64"
    "/usr/lib64"
    "/usr/lib"
    "/lib/x86_64-linux-gnu"
    "/usr/local/nvidia/lib64"
    "/usr/local/nvidia/lib"
    "/opt/nvidia/lib64"
    "/opt/nvidia/lib"
  )
  local dir
  for dir in "${candidate_dirs[@]}"; do
    if [ -f "$dir/libcuda.so.1" ] || [ -f "$dir/libcuda.so" ]; then
      echo "$dir"
      return 0
    fi
  done

  # Try to find using find command across the system.
  # dirname is only called on a non-empty result, so an empty search never
  # produces a bogus "." directory and paths with spaces stay intact.
  local lib_file lib_dir
  lib_file=$(find /usr /opt /lib -name "libcuda.so.1" 2>/dev/null | head -1 || true)
  if [ -n "$lib_file" ]; then
    lib_dir=$(dirname "$lib_file")
    if [ -d "$lib_dir" ]; then
      echo "$lib_dir"
      return 0
    fi
  fi

  # Try to find using ldconfig; the library path is the last field of
  # each `ldconfig -p` entry ("name (arch) => /path/to/lib").
  lib_file=$(ldconfig -p 2>/dev/null | awk '/libcuda\.so\.1/ { print $NF; exit }' || true)
  if [ -n "$lib_file" ]; then
    lib_dir=$(dirname "$lib_file")
    if [ -d "$lib_dir" ]; then
      echo "$lib_dir"
      return 0
    fi
  fi

  return 1
}

echo "Step 2: Finding CUDA Driver Library (libcuda.so.1)..."
CUDA_LIB_PATH=$(find_libcuda || echo "")

if [ -n "$CUDA_LIB_PATH" ]; then
  echo "[OK] Found CUDA driver library path: $CUDA_LIB_PATH"
  prepend_ld_library_path "$CUDA_LIB_PATH"
  echo " Added $CUDA_LIB_PATH to LD_LIBRARY_PATH"
else
  echo "[WARN] libcuda.so.1 not found"
  echo " Diagnostic information:"
  echo " - Check /usr/lib/x86_64-linux-gnu/libcuda.so*: $(first_libcuda_in /usr/lib/x86_64-linux-gnu)"
  echo " - Check /usr/local/cuda/lib64/libcuda.so*: $(first_libcuda_in /usr/local/cuda/lib64)"
  echo " - Check /usr/local/nvidia/lib64/libcuda.so*: $(first_libcuda_in /usr/local/nvidia/lib64)"
  echo " - libcuda in ldconfig: $(ldconfig -p 2>/dev/null | grep libcuda || echo 'not found')"
  echo ""
  echo " Trying to use default paths and set environment variables..."
  # Try common paths
  prepend_ld_library_path "/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64:/usr/lib64:/usr/local/nvidia/lib64"
fi
echo ""

# ============================================================================
# 3. Check /dev/nvidia* devices
# ============================================================================
echo "Step 3: Checking /dev/nvidia* devices..."
if ls /dev/nvidia* >/dev/null 2>&1; then
  echo "[OK] Found /dev/nvidia* devices:"
  ls -la /dev/nvidia* 2>/dev/null | head -5 || true
else
  echo "[WARN] /dev/nvidia* devices not found"
  echo " This may indicate GPU devices are not properly mounted in container"
fi
echo ""

# ============================================================================
# 4. Set CUDA_HOME (if not set)
# ============================================================================
echo "Step 4: Setting CUDA_HOME..."
if [ -z "$CUDA_HOME" ]; then
  if [ -d "/usr/local/cuda" ]; then
    export CUDA_HOME="/usr/local/cuda"
    export PATH="${CUDA_HOME}/bin:${PATH}"
    prepend_ld_library_path "${CUDA_HOME}/lib64"
    echo "[OK] Set CUDA_HOME=$CUDA_HOME"
  elif [ -d "/usr/local/nvidia" ]; then
    export CUDA_HOME="/usr/local/nvidia"
    export PATH="${CUDA_HOME}/bin:${PATH}"
    prepend_ld_library_path "${CUDA_HOME}/lib64"
    echo "[OK] Set CUDA_HOME=$CUDA_HOME"
  else
    echo "[WARN] CUDA installation directory not found"
  fi
else
  echo "[OK] CUDA_HOME already set: $CUDA_HOME"
fi
echo ""

# ============================================================================
# 5. Verify CUDA library accessibility
# ============================================================================
echo "Step 5: Verifying CUDA driver library accessibility..."
CUDA_LIB_OK=false
# Ask the dynamic loader (through Python's ctypes) whether libcuda.so.1 can
# be resolved with the LD_LIBRARY_PATH assembled above.
if python3 << 'PYEOF' 2>/dev/null | grep -q "OK"; then
import ctypes
try:
    ctypes.CDLL("libcuda.so.1")
    print("OK")
except OSError:
    print("FAIL")
PYEOF
  CUDA_LIB_OK=true
fi

if [ "$CUDA_LIB_OK" = "true" ]; then
  echo "[OK] CUDA driver library is accessible"
else
  echo "[WARN] CUDA driver library may not be accessible, trying other fix methods..."
  # Try to detect CUDA through PyTorch (if PyTorch is available, CUDA runtime exists)
  if python3 << 'PYEOF' 2>/dev/null | grep -q "True"; then
import torch
print("PyTorch CUDA available:", torch.cuda.is_available())
PYEOF
    echo "[OK] PyTorch can access CUDA, setting vLLM environment variables to force GPU usage"
    # Set vLLM related environment variables to force GPU usage
    export VLLM_USE_CPU=0
    export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # Try to set device related environment variables
    export CUDA_DEVICE_ORDER=PCI_BUS_ID
  else
    echo "[FAIL] PyTorch cannot access CUDA either"
  fi
fi
echo ""

# ============================================================================
# 6. Test nvidia-smi command
# ============================================================================
echo "Step 6: Testing nvidia-smi command..."
if command -v nvidia-smi >/dev/null 2>&1; then
  echo "[OK] nvidia-smi command is available"
  echo ""
  echo "Running nvidia-smi test:"
  echo "----------------------------------------"
  if nvidia-smi >/dev/null 2>&1; then
    nvidia-smi
    echo "----------------------------------------"
    echo "[OK] nvidia-smi ran successfully!"
  else
    echo "[FAIL] nvidia-smi execution failed"
    echo " Error information:"
    nvidia-smi 2>&1 || true
  fi
else
  echo "[FAIL] nvidia-smi command is still not available"
  echo " Current PATH: $PATH"
  echo " Trying to use full path directly:"
  if [ -n "$NVIDIA_SMI_PATH" ] && [ -f "$NVIDIA_SMI_PATH" ]; then
    echo " Using: $NVIDIA_SMI_PATH"
    "$NVIDIA_SMI_PATH" || true
  fi
fi
echo ""

# ============================================================================
# 7. Output environment variables summary
# ============================================================================
echo "=========================================="
echo "Environment Variables Summary"
echo "=========================================="
echo "PATH: $PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
echo "CUDA_HOME: ${CUDA_HOME:-not set}"
echo "NVIDIA_SMI_PATH: ${NVIDIA_SMI_PATH:-not found}"
echo "CUDA_LIB_PATH: ${CUDA_LIB_PATH:-not found}"
echo "=========================================="
echo ""

# ============================================================================
# 8. Generate persistent environment variable setup script
# ============================================================================
ENV_SCRIPT="/tmp/setup_nvidia_env.sh"

# The detected directories are written as shell-quoted assignments at the top
# of the generated file. Patching the variable *names* with sed afterwards
# would turn "${NVIDIA_SMI_DIR}" into "${"/some/dir"}", which is a bad
# substitution when the file is sourced.
{
  printf '%s\n' \
    '#!/bin/bash' \
    '# Auto-generated environment variable setup script' \
    '# Source this file in training scripts to apply fixes' \
    ''
  printf 'NVIDIA_SMI_DIR=%q\n' "${NVIDIA_SMI_DIR:-}"
  printf 'CUDA_LIB_PATH=%q\n' "${CUDA_LIB_PATH:-}"
  cat << 'ENVEOF'

# Set PATH
if [ -n "$NVIDIA_SMI_DIR" ]; then
    export PATH="${NVIDIA_SMI_DIR}:${PATH}"
fi

# Set LD_LIBRARY_PATH
if [ -n "$CUDA_LIB_PATH" ]; then
    export LD_LIBRARY_PATH="${CUDA_LIB_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi

# Set CUDA_HOME
if [ -z "$CUDA_HOME" ] && [ -d "/usr/local/cuda" ]; then
    export CUDA_HOME="/usr/local/cuda"
    export PATH="${CUDA_HOME}/bin:${PATH}"
    export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi

# Set vLLM related environment variables
export VLLM_USE_CPU=0
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export CUDA_DEVICE_ORDER=PCI_BUS_ID
ENVEOF
} > "$ENV_SCRIPT"

chmod +x "$ENV_SCRIPT"
echo "[OK] Generated environment variable setup script: $ENV_SCRIPT"
echo " Use in training scripts: source $ENV_SCRIPT"
echo ""

echo "=========================================="
echo "Fix completed!"
echo "=========================================="
echo ""
echo "If nvidia-smi is still not available, please check:"
echo "1. Whether GPU devices (/dev/nvidia*) are properly mounted in container"
echo "2. Whether container image contains NVIDIA driver tools"
echo "3. Whether rjob configuration correctly sets GPU resources"
echo ""

diagnose_nvidia_env.sh:该脚本用于诊断当前环境,并在末尾汇总出可用于硬编码的路径:

bash
展开代码
#!/bin/bash
# ============================================================================
# NVIDIA/CUDA Environment Diagnostic Script
# Run this script and provide the output to get hardcoded environment variables
#
# Read-only: it only inspects the filesystem, the ldconfig cache and the
# current environment, then prints a summary of the detected locations.
# ============================================================================

# Start from a known state so the summary never reports a stale value
# inherited from the calling environment.
NVIDIA_SMI_PATH=""
NVIDIA_SMI_DIR=""
CUDA_LIB_PATH=""
NVIDIA_ML_LIB_PATH=""
DETECTED_CUDA_HOME=""

# Library directories probed (in order) for both libcuda and libnvidia-ml.
LIB_DIRS=(
  /usr/local/nvidia/lib64
  /usr/lib/x86_64-linux-gnu
  /usr/local/cuda/lib64
  /usr/lib64
  /usr/lib
  /lib/x86_64-linux-gnu
  /opt/nvidia/lib64
)

#######################################
# Print the directory of the first file matching a name under /usr /opt /lib.
# Prints nothing when there is no match; dirname is only applied to a
# non-empty result, so paths containing spaces stay intact.
# Arguments: $1 - file name pattern passed to `find -name`
#######################################
find_lib_dir() {
  local first_hit
  first_hit=$(find /usr /opt /lib -name "$1" 2>/dev/null | head -1)
  if [ -n "$first_hit" ]; then
    dirname "$first_hit"
  fi
}

echo "=========================================="
echo "NVIDIA/CUDA Environment Diagnostic"
echo "=========================================="
echo ""
echo "Please run this script and provide the complete output."
echo ""

# 1. Find nvidia-smi
echo "=== 1. NVIDIA-SMI LOCATION ==="
for path in /usr/local/nvidia/bin /usr/bin /usr/local/bin /usr/local/cuda/bin /opt/nvidia/bin /bin; do
  if [ -f "$path/nvidia-smi" ] && [ -x "$path/nvidia-smi" ]; then
    NVIDIA_SMI_PATH="$path/nvidia-smi"
    NVIDIA_SMI_DIR="$path"
    echo "FOUND: $NVIDIA_SMI_PATH"
    echo "DIR: $NVIDIA_SMI_DIR"
    break
  fi
done

if [ -z "$NVIDIA_SMI_PATH" ]; then
  echo "NOT FOUND: Searching with find..."
  NVIDIA_SMI_PATH=$(find /usr /opt /bin -name "nvidia-smi" -type f -executable 2>/dev/null | head -1)
  if [ -n "$NVIDIA_SMI_PATH" ]; then
    NVIDIA_SMI_DIR=$(dirname "$NVIDIA_SMI_PATH")
    echo "FOUND: $NVIDIA_SMI_PATH"
    echo "DIR: $NVIDIA_SMI_DIR"
  else
    echo "NOT FOUND: nvidia-smi not found in system"
  fi
fi
echo ""

# 2. Find CUDA driver library
echo "=== 2. CUDA DRIVER LIBRARY (libcuda.so.1) ==="
for path in "${LIB_DIRS[@]}"; do
  if [ -f "$path/libcuda.so.1" ] || [ -f "$path/libcuda.so" ]; then
    CUDA_LIB_PATH="$path"
    echo "FOUND: $CUDA_LIB_PATH"
    ls -la "$path/libcuda.so"* 2>/dev/null | head -3 || true
    break
  fi
done

if [ -z "$CUDA_LIB_PATH" ]; then
  echo "NOT FOUND: Searching with find..."
  CUDA_LIB_PATH=$(find_lib_dir "libcuda.so.1")
  if [ -n "$CUDA_LIB_PATH" ]; then
    echo "FOUND: $CUDA_LIB_PATH"
    ls -la "$CUDA_LIB_PATH/libcuda.so"* 2>/dev/null | head -3 || true
  else
    echo "NOT FOUND: libcuda.so.1 not found"
  fi
fi
echo ""

# 2.5. Find libnvidia-ml.so (required by nvidia-smi)
echo "=== 2.5. NVIDIA ML LIBRARY (libnvidia-ml.so) ==="
for path in "${LIB_DIRS[@]}"; do
  if [ -f "$path/libnvidia-ml.so" ] || [ -f "$path/libnvidia-ml.so.1" ]; then
    NVIDIA_ML_LIB_PATH="$path"
    echo "FOUND: $NVIDIA_ML_LIB_PATH"
    ls -la "$path/libnvidia-ml.so"* 2>/dev/null | head -3 || true
    break
  fi
done

if [ -z "$NVIDIA_ML_LIB_PATH" ]; then
  echo "NOT FOUND: Searching with find..."
  NVIDIA_ML_LIB_PATH=$(find_lib_dir "libnvidia-ml.so*")
  if [ -n "$NVIDIA_ML_LIB_PATH" ]; then
    echo "FOUND: $NVIDIA_ML_LIB_PATH"
    ls -la "$NVIDIA_ML_LIB_PATH/libnvidia-ml.so"* 2>/dev/null | head -3 || true
  else
    echo "NOT FOUND: libnvidia-ml.so not found (this may cause nvidia-smi to fail)"
  fi
fi
echo ""

# 3. Check CUDA_HOME
echo "=== 3. CUDA_HOME ==="
if [ -d "/usr/local/cuda" ]; then
  DETECTED_CUDA_HOME="/usr/local/cuda"
  echo "FOUND: /usr/local/cuda"
  echo "BIN: /usr/local/cuda/bin"
  echo "LIB64: /usr/local/cuda/lib64"
elif [ -d "/usr/local/nvidia" ]; then
  DETECTED_CUDA_HOME="/usr/local/nvidia"
  echo "FOUND: /usr/local/nvidia"
  echo "BIN: /usr/local/nvidia/bin"
  echo "LIB64: /usr/local/nvidia/lib64"
else
  echo "NOT FOUND: No CUDA installation directory found"
fi
echo ""

# 4. Check GPU devices
echo "=== 4. GPU DEVICES ==="
if ls /dev/nvidia* >/dev/null 2>&1; then
  echo "FOUND: /dev/nvidia* devices"
  ls -la /dev/nvidia* 2>/dev/null | head -5
else
  echo "NOT FOUND: No /dev/nvidia* devices"
fi
echo ""

# 5. Check ldconfig
echo "=== 5. LDCONFIG LIBRARY PATHS ==="
if command -v ldconfig >/dev/null 2>&1; then
  echo "libcuda libraries in ldconfig:"
  ldconfig -p 2>/dev/null | grep libcuda || echo " No libcuda found in ldconfig"
else
  echo "ldconfig command not available"
fi
echo ""

# 6. Current environment variables
echo "=== 6. CURRENT ENVIRONMENT VARIABLES ==="
echo "PATH: $PATH"
echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
echo "CUDA_HOME: ${CUDA_HOME:-not set}"
echo ""

# 7. Test nvidia-smi if found
echo "=== 7. NVIDIA-SMI TEST ==="
if [ -n "$NVIDIA_SMI_PATH" ] && [ -f "$NVIDIA_SMI_PATH" ]; then
  echo "Testing: $NVIDIA_SMI_PATH"
  if "$NVIDIA_SMI_PATH" --version >/dev/null 2>&1; then
    echo "SUCCESS: nvidia-smi works"
    "$NVIDIA_SMI_PATH" --version 2>&1 | head -1
  else
    echo "FAILED: nvidia-smi found but cannot execute"
    "$NVIDIA_SMI_PATH" --version 2>&1 || true
  fi
else
  echo "SKIPPED: nvidia-smi not found"
fi
echo ""

# 8. Summary
# CUDA_HOME reports the already-exported value if there is one, otherwise the
# directory detected in section 3, so the summary never suggests hardcoding a
# path that does not exist on this machine.
echo "=========================================="
echo "SUMMARY FOR HARDCODING"
echo "=========================================="
echo "NVIDIA_SMI_DIR=${NVIDIA_SMI_DIR:-NOT_FOUND}"
echo "CUDA_LIB_PATH=${CUDA_LIB_PATH:-NOT_FOUND}"
echo "NVIDIA_ML_LIB_PATH=${NVIDIA_ML_LIB_PATH:-NOT_FOUND}"
echo "CUDA_HOME=${CUDA_HOME:-${DETECTED_CUDA_HOME:-NOT_FOUND}}"
echo "=========================================="
echo ""
echo "Please provide the complete output above to get hardcoded environment variables."

根据诊断结果导出以下环境变量即可,可以把它们加入训练脚本的开头:

bash
展开代码
# Hard-coded NVIDIA/CUDA environment for this machine; source it (or paste it)
# at the top of a training script.

# nvidia-smi location
export PATH="/usr/local/nvidia/bin:${PATH}"

# CUDA driver library directory (contains libcuda.so.1 and libnvidia-ml.so).
# The ${VAR:+:...} form avoids a trailing ':' when LD_LIBRARY_PATH was empty;
# the dynamic loader treats an empty entry as the current working directory.
export LD_LIBRARY_PATH="/usr/local/nvidia/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# CUDA installation directory
export CUDA_HOME="/usr/local/cuda"
export PATH="${CUDA_HOME}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}"

# vLLM-related environment variables (optional)
export VLLM_USE_CPU=0
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export CUDA_DEVICE_ORDER=PCI_BUS_ID
如果对你有用的话,可以打赏哦
打赏
ali pay
wechat pay

本文作者:Dong

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 CC BY-NC 许可协议(《知识共享署名-非商业性使用 4.0 国际许可协议》)。您可以在非商业用途下自由转载和修改,但必须注明出处并提供原作者链接。转载请注明出处!