
How to benchmark local LLMs
To test LLM performance with Ollama, I used the --verbose flag on the run command and wrote a script to parse and aggregate the resulting timing data.
Sample:
ollama run "llama3.2:1b" --verbose <<< "Why is the sky blue? Provide 2-3 sentences"
The sky appears blue because of a phenomenon called Rayleigh scattering, which is named after the British physicist Lord Rayleigh. When sunlight enters Earth's atmosphere, it encounters tiny
molecules of gases such as nitrogen and oxygen. These molecules scatter the shorter, blue wavelengths of light more than the longer, red wavelengths, giving the sky its blue color.
total duration:       6.101543867s
load duration:        4.138974245s
prompt eval count:    38 token(s)
prompt eval duration: 178.974287ms
prompt eval rate:     212.32 tokens/s
eval count:           69 token(s)
eval duration:        1.780918057s
eval rate:            38.74 tokens/s
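If you only need the generation speed, it can be pulled straight out of the verbose output. A minimal sketch (the timing block may be printed to stderr, hence the 2>&1 redirect):
ollama run "llama3.2:1b" --verbose <<< "Why is the sky blue? Provide 2-3 sentences" 2>&1 \
  | awk -F: '/^eval rate/ {gsub(/[^0-9.]/, "", $2); print $2 " tokens/s"}'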
LLM benchmark script
Create a file named llm-benchmark.sh with the following content:
#!/bin/bash
# llm-benchmark.sh - Enhanced Ollama model benchmarking tool
export LC_NUMERIC=C
set -uo pipefail
# --- Configuration ---
DEFAULT_PROMPT="Why is the sky blue? Provide 2-3 sentences."
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
DEBUG_LOG="benchmark_debug_${TIMESTAMP}.log"
SHARED_VRAM_PERCENT=40
DEBUG_MODE=false
# --- Command-line arguments ---
usage() {
    echo "Usage: $0 [-p PROMPT_FILE] [-v VRAM_PERCENT] [-d]"
    echo "  -p  Custom prompt file (default: built-in prompt)"
    echo "  -v  Shared VRAM percentage (0-100, default: 40)"
    echo "  -d  Enable debug mode (save model outputs)"
    exit 1
}

while getopts "p:v:dh" flag; do
    case "$flag" in
        p) CUSTOM_PROMPT_FILE=$OPTARG ;;
        v) SHARED_VRAM_PERCENT=$OPTARG ;;
        d) DEBUG_MODE=true ;;
        h|*) usage ;;
    esac
done
# --- System Detection ---
detect_system() {
    CPU=$(lscpu | grep 'Model name' | awk -F: '{print $2}' | xargs | tr ' ' '_' | head -c 40 | tr -cd '[:alnum:]_')
    RAM=$(free -g | awk '/^Mem:/{print $2}')
    GPU=$(lspci | grep -i 'vga\|3d\|2d' | grep -i 'nvidia\|amd\|intel' | head -1 | awk -F: '{print $3}' | xargs | tr ' ' '_' | head -c 30 | tr -cd '[:alnum:]_')
    SYSTEM_ID=$(hostname | tr '[:upper:]' '[:lower:]' | tr -cd 'a-z0-9' | head -c 12)
    if command -v nvidia-smi &>/dev/null; then
        BACKEND="cuda"
        VRAM=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -1)
    elif command -v rocm-smi &>/dev/null; then
        BACKEND="rocm"
        VRAM=$(rocm-smi --showmeminfo vram | grep 'Total VRAM Memory' | awk '{print $5}')
    else
        BACKEND="cpu"
        # No GPU detected: fall back to total system RAM, reported in MB
        VRAM=$(awk '/MemTotal/ {print int($2/1024)}' /proc/meminfo)
    fi
    HARDWARE_ID="${SYSTEM_ID}_${CPU}_${GPU}_${RAM}GB_${VRAM}MB"
    HARDWARE_ID=$(echo "$HARDWARE_ID" | tr -cd '[:alnum:]_' | head -c 100)
}
# --- CPU Usage Monitoring ---
get_cpu_usage() {
    # Returns "<idle> <total>" jiffy counters from the first line of /proc/stat
    local cpu=($(grep '^cpu ' /proc/stat))
    echo "${cpu[4]} $(( ${cpu[1]} + ${cpu[2]} + ${cpu[3]} + ${cpu[4]} + ${cpu[5]} + ${cpu[6]} + ${cpu[7]} ))"
}
# --- Main Benchmark ---
run_benchmark() {
    local model="$1"
    local size_mb="$2"
    local output_file
    output_file=$(mktemp)
    local cpu_stats
    cpu_stats=$(mktemp)

    echo ""
    echo "=== Benchmarking $model ==="
    echo "• Model size: $(printf "%.2fGB" "$(echo "$size_mb/1024" | bc -l)")"
    echo "• Backend: $BACKEND"

    # Load prompt
    if [ -n "${CUSTOM_PROMPT_FILE:-}" ]; then
        PROMPT=$(cat "$CUSTOM_PROMPT_FILE")
    else
        PROMPT="$DEFAULT_PROMPT"
    fi

    # Start CPU monitoring in the background: sample /proc/stat every 0.5s
    (
        while true; do
            read -r idle1 total1 < <(get_cpu_usage)
            sleep 0.5
            read -r idle2 total2 < <(get_cpu_usage)
            idle_delta=$((idle2 - idle1))
            total_delta=$((total2 - total1))
            if [ $total_delta -gt 0 ]; then
                usage=$((100 * (total_delta - idle_delta) / total_delta))
                echo "$usage" >> "$cpu_stats"
            fi
        done
    ) &
    MONITOR_PID=$!

    # Build Ollama command
    local ollama_cmd="ollama run \"$model\" --verbose"
    if [[ "$model" == *"qwen"* ]]; then
        export OLLAMA_STOP="</answer>"
    fi
    echo -e "\033[2mCommand: echo -e \"$(printf "%q" "$PROMPT")\" | $ollama_cmd\033[0m"

    # Execute and capture output
    local start_time
    start_time=$(date +%s.%N)
    eval "echo -e \"$PROMPT\" | $ollama_cmd" > "$output_file" 2>&1
    local exit_code=$?
    local end_time
    end_time=$(date +%s.%N)
    kill $MONITOR_PID 2>/dev/null

    # Calculate wall-clock duration
    local duration
    duration=$(printf "%.2f" "$(echo "$end_time - $start_time" | bc -l)")

    # Extract all metrics from Ollama's --verbose output
    total_duration=$(awk -F: '/total duration:/ {gsub(/[^0-9.]/,"",$2); print $2}' "$output_file" | tail -1)
    load_duration=$(awk -F: '/load duration:/ {gsub(/[^0-9.]/,"",$2); print $2}' "$output_file" | tail -1)
    prompt_eval_count=$(awk -F: '/prompt eval count:/ {gsub(/[^0-9]/,"",$2); print $2}' "$output_file" | tail -1)
    prompt_eval_duration=$(awk -F: '/prompt eval duration:/ {gsub(/[^0-9.]/,"",$2); print $2}' "$output_file" | tail -1)
    prompt_eval_rate=$(awk -F: '/prompt eval rate:/ {gsub(/[^0-9.]/,"",$2); print $2}' "$output_file" | tail -1)
    eval_count=$(awk -F: '/eval count:/ {gsub(/[^0-9]/,"",$2); print $2}' "$output_file" | tail -1)
    eval_duration=$(awk -F: '/eval duration:/ {gsub(/[^0-9.]/,"",$2); print $2}' "$output_file" | tail -1)
    eval_rate=$(awk -F: '/eval rate:/ {gsub(/[^0-9.]/,"",$2); print $2}' "$output_file" | tail -1)

    # Set defaults for missing values
    total_duration=${total_duration:-0}
    load_duration=${load_duration:-0}
    prompt_eval_count=${prompt_eval_count:-0}
    prompt_eval_duration=${prompt_eval_duration:-0}
    prompt_eval_rate=${prompt_eval_rate:-0}
    eval_count=${eval_count:-0}
    eval_duration=${eval_duration:-0}
    eval_rate=${eval_rate:-0}

    # CPU stats
    local cpu_avg cpu_max
    if [ -s "$cpu_stats" ]; then
        cpu_avg=$(awk '{sum+=$1; n++} END {if (n>0) print int(sum/n); else print 0}' "$cpu_stats")
        cpu_max=$(awk 'BEGIN{max=0} {if($1>max) max=$1} END{print int(max)}' "$cpu_stats")
    else
        cpu_avg=0
        cpu_max=0
    fi

    # Output CSV line with all metrics
    echo "$model,$BACKEND,$size_mb,$total_duration,$load_duration,$prompt_eval_count,$prompt_eval_duration,$prompt_eval_rate,$eval_count,$eval_duration,$eval_rate,$cpu_avg,$cpu_max" >> "$OUTPUT_CSV"

    # Print output
    echo "┌──────────────────────────────────────────────"
    echo "│ Model: $model"
    echo "│ Backend: $BACKEND"
    echo "│ Size: $(printf "%.2fGB" "$(echo "$size_mb/1024" | bc -l)")"
    echo "└──────────────────────────────────────────────"
    cat "$output_file"

    # Save the raw model output when debug mode (-d) is enabled
    if [ "$DEBUG_MODE" = true ]; then
        { echo "=== $model ==="; cat "$output_file"; } >> "$DEBUG_LOG"
    fi

    rm -f "$output_file" "$cpu_stats"
    echo ""
}
# --- Execution ---
detect_system
echo "Starting benchmark at $(date)"
echo "System detected: ${CPU} | ${RAM}GB RAM | ${GPU} | VRAM: ${VRAM}MB"
OUTPUT_CSV="benchmark_${HARDWARE_ID}_${TIMESTAMP}.csv"
echo "model,backend,size_mb,total_duration,load_duration,prompt_eval_count,prompt_eval_duration,prompt_eval_rate,eval_count,eval_duration,eval_rate,cpu_avg,cpu_max" > "$OUTPUT_CSV"
if [ ! -f models.txt ]; then
    echo "Error: models.txt not found. Please provide a list of models and their sizes."
    exit 1
fi
echo "Processing models:"
cat models.txt
while read -r model size_mb; do
    # Skip comments and blank lines
    [[ "$model" =~ ^#.*$ || -z "$model" ]] && continue

    if [ "$BACKEND" = "cuda" ]; then
        # Allow partial offloading: VRAM plus a configurable share of system RAM
        AVAILABLE_RAM=$((RAM * SHARED_VRAM_PERCENT / 100 * 1024))
        TOTAL_AVAILABLE=$((VRAM + AVAILABLE_RAM))
        if [ "$size_mb" -gt "$TOTAL_AVAILABLE" ]; then
            echo "Skipping $model (needs ${size_mb}MB, only ${TOTAL_AVAILABLE}MB total available)"
            echo "$model,$BACKEND,$size_mb,0,0,0,0,0,0,0,0,0,0" >> "$OUTPUT_CSV"
            continue
        elif [ "$size_mb" -gt "$VRAM" ]; then
            echo "⚠️ Running $model with partial GPU+CPU offloading (${VRAM}MB VRAM + ${AVAILABLE_RAM}MB RAM)"
        fi
    elif [ "$BACKEND" = "cpu" ] && [ "$size_mb" -gt $((RAM * SHARED_VRAM_PERCENT / 100 * 1024)) ]; then
        echo "Skipping $model (needs $size_mb MB, only $((RAM * SHARED_VRAM_PERCENT / 100 * 1024)) MB RAM available)"
        echo "$model,$BACKEND,$size_mb,0,0,0,0,0,0,0,0,0,0" >> "$OUTPUT_CSV"
        continue
    fi

    (run_benchmark "$model" "$size_mb") || true
done < models.txt
echo "Results saved to $OUTPUT_CSV"
[ "$DEBUG_MODE" = true ] && echo "Debug logs saved to $DEBUG_LOG"
Make the script executable:
chmod +x llm-benchmark.sh
Create a models.txt file listing the models to test and their approximate sizes in MB:
cat > models.txt <<EOL
gemma3:1b 815
llama3.2:1b 1300
qwen3:1.7b 1400
llama3.2:3b 2000
qwen3:4b 2600
gemma3:4b 3300
mistral:7b 4100
llama3.1:8b 4700
qwen3:8b 5200
gemma3:12b 8000
qwen2.5:14b 9000
qwen2.5-coder:14b 9000
deepseek-r1:14b 9000
phi4:14b 9100
mistral-small:22b 12000
devstral:24b 14000
mistral-small3.1:24b 15000
gemma3:27b 17000
qwen3:30b 18000
qwen2.5-coder:32b 20000
mixtral:8x7b 26000
EOL
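If the models are already pulled locally, models.txt can also be seeded from ollama list instead of typing it by hand. A rough sketch that assumes the usual NAME/ID/SIZE/MODIFIED column layout with sizes printed in GB or MB (adjust if your Ollama version formats it differently):
ollama list | awk 'NR>1 && NF>0 {mb = ($4 == "GB") ? $3 * 1024 : $3; printf "%s %d\n", $1, mb}' > models.txt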
Run the benchmark:
./llm-benchmark.sh
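The flags parsed at the top of the script can be combined as needed, for example a custom prompt file (my_prompt.txt is just a placeholder here), a 50% shared-VRAM budget, and debug mode:
./llm-benchmark.sh -p my_prompt.txt -v 50 -d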
The results are written to a CSV file named benchmark_<hardware_id>_<timestamp>.csv.
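To rank models by generation speed, sort the CSV on the eval_rate column (column 11). A minimal sketch, assuming a single results file and the column utility from util-linux:
{ head -n 1 benchmark_*.csv; tail -n +2 benchmark_*.csv | sort -t, -k11 -nr; } | column -t -s,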
Published on 5/25/2025