How to benchmark local LLMs

To benchmark LLM performance with Ollama, I run each model with the --verbose flag, which prints timing and throughput statistics, and use a script to parse and aggregate that data.

Sample:

ollama run "llama3.2:1b" --verbose <<< "Why is the sky blue? Provide 2-3 sentences"
The sky appears blue because of a phenomenon called Rayleigh scattering, which is named after the British physicist Lord Rayleigh. When sunlight enters Earth's atmosphere, it encounters tiny
molecules of gases such as nitrogen and oxygen. These molecules scatter the shorter, blue wavelengths of light more than the longer, red wavelengths, giving the sky its blue color.

total duration:       6.101543867s
load duration:        4.138974245s
prompt eval count:    38 token(s)
prompt eval duration: 178.974287ms
prompt eval rate:     212.32 tokens/s
eval count:           69 token(s)
eval duration:        1.780918057s
eval rate:            38.74 tokens/s
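
The stats block is easy to scrape with standard tools; the full script below does exactly that. As a minimal sketch, assuming the stats are printed on stderr (hence the 2>&1 redirect), this pulls out just the generation speed:

# Minimal sketch: extract the generation speed (tokens/s) from the verbose stats
ollama run "llama3.2:1b" --verbose <<< "Why is the sky blue?" 2>&1 \
  | awk -F: '/^eval rate/ {gsub(/[^0-9.]/, "", $2); print $2}'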

LLM benchmark script

Create a file llm-benchmark.sh with the following content:

#!/bin/bash
# llm-benchmark.sh - Enhanced Ollama model benchmarking tool
export LC_NUMERIC=C

set -uo pipefail

# --- Configuration ---
DEFAULT_PROMPT="Why is the sky blue? Provide 2-3 sentences."
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DEBUG_LOG="benchmark_debug_${TIMESTAMP}.log"
SHARED_VRAM_PERCENT=40
DEBUG_MODE=false

# --- Command-line arguments ---
usage() {
  echo "Usage: $0 [-p PROMPT_FILE] [-v VRAM_PERCENT] [-d]"
  echo "  -p  Custom prompt file (default: built-in prompt)"
  echo "  -v  Shared VRAM percentage (0-100, default: 40)"
  echo "  -d  Enable debug mode (save model outputs)"
  exit 1
}

while getopts "p:v:dh" flag; do
  case "$flag" in
    p) CUSTOM_PROMPT_FILE=$OPTARG ;;
    v) SHARED_VRAM_PERCENT=$OPTARG ;;
    d) DEBUG_MODE=true ;;
    h|*) usage ;;
  esac
done

# --- System Detection ---
detect_system() {
  CPU=$(lscpu | grep 'Model name' | awk -F: '{print $2}' | xargs | tr ' ' '_' | head -c 40 | tr -cd '[:alnum:]_')
  RAM=$(free -g | awk '/^Mem:/{print $2}')
  GPU=$(lspci | grep -i 'vga\|3d\|2d' | grep -i 'nvidia\|amd\|intel' | head -1 | awk -F: '{print $3}' | xargs | tr ' ' '_' | head -c 30 | tr -cd '[:alnum:]_')
  SYSTEM_ID=$(hostname | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9' | head -c 12)

  if command -v nvidia-smi &>/dev/null; then
    BACKEND="cuda"
    VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -1)
  elif command -v rocm-smi &>/dev/null; then
    BACKEND="rocm"
    VRAM=$(rocm-smi --showmeminfo vram | grep 'Total VRAM Memory' | awk '{print $5}')
  else
    BACKEND="cpu"
    VRAM=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)  # MemTotal is in kB; convert to MB
  fi

  HARDWARE_ID="${SYSTEM_ID}_${CPU}_${GPU}_${RAM}GB_${VRAM}MB"
  HARDWARE_ID=$(echo "$HARDWARE_ID" | tr -cd '[:alnum:]_' | head -c 100)
}

# --- CPU Usage Monitoring ---
get_cpu_usage() {
  # /proc/stat "cpu" line: user nice system idle iowait irq softirq (fields 1-7 after the label)
  local cpu=($(grep '^cpu ' /proc/stat))
  # Print "idle total" so callers can compute usage from deltas
  echo "${cpu[4]} $(( ${cpu[1]} + ${cpu[2]} + ${cpu[3]} + ${cpu[4]} + ${cpu[5]} + ${cpu[6]} + ${cpu[7]} ))"
}

# --- Main Benchmark ---
run_benchmark() {
  local model="$1"
  local size_mb="$2"

  local output_file
  output_file=$(mktemp)
  local cpu_stats
  cpu_stats=$(mktemp)
  echo ""
  echo "=== Benchmarking $model ==="
  echo "• Model size: $(printf "%.2fGB" "$(echo "$size_mb/1024" | bc -l)")"
  echo "• Backend: $BACKEND"

  # Load prompt
  if [ -n "${CUSTOM_PROMPT_FILE:-}" ]; then
    PROMPT=$(cat "$CUSTOM_PROMPT_FILE")
  else
    PROMPT="$DEFAULT_PROMPT"
  fi

  # Start CPU monitoring
  (
    while true; do
      read -r idle1 total1 < <(get_cpu_usage)
      sleep 0.5
      read -r idle2 total2 < <(get_cpu_usage)
      idle_delta=$((idle2 - idle1))
      total_delta=$((total2 - total1))
      if [ $total_delta -gt 0 ]; then
        usage=$((100 * (total_delta - idle_delta) / total_delta))
        echo "$usage" >> "$cpu_stats"
      fi
    done
  ) &
  MONITOR_PID=$!

  # Optional stop token for qwen models (exported only inside this run's subshell)
  if [[ "$model" == *"qwen"* ]]; then
    export OLLAMA_STOP="</answer>"
  fi

  echo -e "\033[2mCommand: ollama run \"$model\" --verbose <<< $(printf "%q" "$PROMPT")\033[0m"

  # Execute and capture output; the --verbose stats are printed on stderr,
  # so redirect both streams into the temp file
  local start_time
  start_time=$(date +%s.%N)
  ollama run "$model" --verbose <<< "$PROMPT" > "$output_file" 2>&1
  local exit_code=$?
  local end_time
  end_time=$(date +%s.%N)
  kill $MONITOR_PID 2>/dev/null
  wait $MONITOR_PID 2>/dev/null

  # Calculate duration
  local duration
  duration=$(printf "%.2f" "$(echo "$end_time - $start_time" | bc -l)")

  # Helper: convert Ollama's Go-style durations ("178.9ms", "1m2.3s") to seconds,
  # so the CSV doesn't mix milliseconds and seconds in the same column
  to_seconds() {
    awk -v d="${1:-}" 'BEGIN {
      s = 0
      while (match(d, /[0-9.]+(ms|s|m|h)/)) {
        tok = substr(d, RSTART, RLENGTH)
        d = substr(d, RSTART + RLENGTH)
        unit = tok; gsub(/[0-9.]/, "", unit)
        val = tok + 0
        if (unit == "h") s += val * 3600
        else if (unit == "m") s += val * 60
        else if (unit == "ms") s += val / 1000
        else s += val
      }
      printf "%.6f\n", s
    }'
  }
  # Helper: print the value of the last "<label>: <value>" stats line
  get_field() { grep "^$1:" "$output_file" | tail -1 | cut -d: -f2- | xargs; }

  # Extract all metrics from the Ollama output (anchored matches, so
  # "eval count" does not also match "prompt eval count")
  total_duration=$(to_seconds "$(get_field 'total duration')")
  load_duration=$(to_seconds "$(get_field 'load duration')")
  prompt_eval_count=$(get_field 'prompt eval count' | tr -cd '0-9')
  prompt_eval_duration=$(to_seconds "$(get_field 'prompt eval duration')")
  prompt_eval_rate=$(get_field 'prompt eval rate' | tr -cd '0-9.')
  eval_count=$(get_field 'eval count' | tr -cd '0-9')
  eval_duration=$(to_seconds "$(get_field 'eval duration')")
  eval_rate=$(get_field 'eval rate' | tr -cd '0-9.')

  # Set defaults for missing values
  prompt_eval_count=${prompt_eval_count:-0}
  prompt_eval_rate=${prompt_eval_rate:-0}
  eval_count=${eval_count:-0}
  eval_rate=${eval_rate:-0}

  # CPU stats
  local cpu_avg cpu_max
  if [ -s "$cpu_stats" ]; then
    cpu_avg=$(awk '{sum+=$1; n++} END {if (n>0) print int(sum/n); else print 0}' "$cpu_stats")
    cpu_max=$(awk 'BEGIN{max=0} {if($1>max) max=$1} END{print int(max)}' "$cpu_stats")
  else
    cpu_avg=0
    cpu_max=0
  fi

  # Output CSV line with all metrics
  echo "$model,$BACKEND,$size_mb,$total_duration,$load_duration,$prompt_eval_count,$prompt_eval_duration,$prompt_eval_rate,$eval_count,$eval_duration,$eval_rate,$cpu_avg,$cpu_max" >> "$OUTPUT_CSV"

  # Print summary and raw output
  echo "┌──────────────────────────────────────────────"
  echo "│ Model: $model"
  echo "│ Backend: $BACKEND"
  echo "│ Size: $(printf "%.2fGB" "$(echo "$size_mb/1024" | bc -l)")"
  echo "│ Wall time: ${duration}s"
  echo "└──────────────────────────────────────────────"
  [ "$exit_code" -ne 0 ] && echo "⚠️  $model exited with code $exit_code"
  cat "$output_file"

  # Keep the raw model output when debug mode is enabled (-d)
  if [ "$DEBUG_MODE" = true ]; then
    { echo "=== $model ==="; cat "$output_file"; echo ""; } >> "$DEBUG_LOG"
  fi

  rm -f "$output_file" "$cpu_stats"
  echo ""
}

# --- Execution ---
detect_system
echo "Starting benchmark at $(date)"
echo "System detected: ${CPU} | ${RAM}GB RAM | ${GPU} | VRAM: ${VRAM}MB"

OUTPUT_CSV="benchmark_${HARDWARE_ID}_${TIMESTAMP}.csv"
echo "model,backend,size_mb,total_duration,load_duration,prompt_eval_count,prompt_eval_duration,prompt_eval_rate,eval_count,eval_duration,eval_rate,cpu_avg,cpu_max" > "$OUTPUT_CSV"

if [ ! -f models.txt ]; then
  echo "Error: models.txt not found. Please provide a list of models and their sizes."
  exit 1
fi

echo "Processing models:"
cat models.txt

while read -r model size_mb; do
  [[ "$model" =~ ^#.*$ || -z "$model" ]] && continue

  if [ "$BACKEND" = "cuda" ]; then
    AVAILABLE_RAM=$((RAM * SHARED_VRAM_PERCENT / 100 * 1024))
    TOTAL_AVAILABLE=$((VRAM + AVAILABLE_RAM))

    if [ "$size_mb" -gt "$TOTAL_AVAILABLE" ]; then
      echo "Skipping $model (needs ${size_mb}MB, only ${TOTAL_AVAILABLE}MB total available)"
      echo "$model,$BACKEND,$size_mb,0,0,0,0,0,0,0,0,0,0" >> "$OUTPUT_CSV"
      continue
    elif [ "$size_mb" -gt "$VRAM" ]; then
      echo "⚠️  Running $model with partial GPU+CPU offloading (${VRAM}MB VRAM + ${AVAILABLE_RAM}MB RAM)"
    fi
  elif [ "$BACKEND" = "cpu" ] && [ "$size_mb" -gt $((RAM * SHARED_VRAM_PERCENT / 100 * 1024)) ]; then
    echo "Skipping $model (needs $size_mb MB, only $((RAM * SHARED_VRAM_PERCENT / 100 * 1024)) MB RAM available)"
    echo "$model,$BACKEND,$size_mb,0,0,0,0,0,0,0,0,0,0" >> "$OUTPUT_CSV"
    continue
  fi

  (run_benchmark "$model" "$size_mb") || true
done < models.txt

echo "Results saved to $OUTPUT_CSV"
[ "$DEBUG_MODE" = true ] && echo "Debug logs saved to $DEBUG_LOG"

Make the script executable:

chmod +x llm-benchmark.sh

Create a file listing the models to test; each line gives a model name and its approximate size in MB:

cat > models.txt <<EOL
gemma3:1b 815
llama3.2:1b 1300
qwen3:1.7b 1400
llama3.2:3b 2000
qwen3:4b 2600
gemma3:4b 3300
mistral:7b 4100
llama3.1:8b 4700
qwen3:8b 5200
gemma3:12b 8000
qwen2.5:14b 9000
qwen2.5-coder:14b 9000
deepseek-r1:14b 9000
phi4:14b 9100
mistral-small:22b 12000
devstral:24b 14000
mistral-small3.1:24b 15000
gemma3:27b 17000
qwen3:30b 18000
qwen2.5-coder:32b 20000
mixtral:8x7b 26000
EOL
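
Alternatively, if the models are already pulled locally, you can generate the list from ollama list. A rough sketch, assuming its usual NAME/ID/SIZE/MODIFIED columns with sizes printed like "4.7 GB":

# Build models.txt from locally installed models (sizes converted to MB)
ollama list | awk 'NR > 1 {mb = ($4 == "GB") ? $3 * 1024 : $3; printf "%s %d\n", $1, mb}' > models.txt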

Run the benchmark:

./llm-benchmark.sh
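
The flags defined in usage() can be combined as needed, for example:

./llm-benchmark.sh -d                  # also keep raw model outputs in the debug log
./llm-benchmark.sh -v 60               # allow 60% of system RAM as shared VRAM
./llm-benchmark.sh -p my_prompt.txt    # use a custom prompt file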

The results are written to a CSV file named benchmark_<hardware_id>_<timestamp>.csv.
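
For a quick comparison across models, you can rank the rows by generation speed. A sketch, assuming a single results file (eval_rate_tps is column 11):

# Rank benchmarked models by generation speed, highest tokens/s first
tail -n +2 benchmark_*.csv | sort -t, -k11 -rn | cut -d, -f1,11 | column -t -s,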

Published on 5/25/2025