scripts/evaluate-embedding-models

#!/usr/bin/env bash

# evaluate-embedding-models (Issue 10-031)
#
# Build the side-by-side embedding-model comparison report.

# General description (for a CEO): we have several embedding models that each
# score "how similar are these two poems" differently. This tool picks a fixed
# line-up of poems once, then loads each model in turn, records its similarity
# opinions, and finally lays all the opinions side by side as a web page so a
# human can see what each model is really rewarding -- shared wording, shared
# meaning, shared theme, similar length.

# How it works: the GPU can hold one embedding model at a time, so we start the
# llama.cpp server on model A, embed the sample, stop it, start model B, and so
# on. The per-model results are cached, so re-running only the report is cheap.

# Usage:
#   scripts/evaluate-embedding-models [DIR]
#   scripts/evaluate-embedding-models --sample 500 --anchors 8 --seed 12345 --top-k 10
#   scripts/evaluate-embedding-models --models "nomic-embed-text-v1.5:local,mxbai-embed-large-v1:local-mxbai"
#   scripts/evaluate-embedding-models --report-only   # rebuild the page from cached embeddings
#   scripts/evaluate-embedding-models --open          # open the report in a browser when done

set -u

# {{{ DIR resolution + defaults

# Built-in defaults. Each value can be overridden on the command line:
#   DIR          project root               (positional argument)
#   SAMPLE       number of poems to sample  (--sample)
#   ANCHORS      number of anchor poems     (--anchors)
#   SEED         sampling seed              (--seed)
#   TOPK         neighbours shown per poem  (--top-k)
#   REPORT_ONLY  1 = skip embedding, only rebuild the report (--report-only)
#   OPEN_REPORT  1 = open the finished report in a browser   (--open)
DIR="/mnt/mtwo/programming/ai-stuff/neocities-modernization"
SAMPLE=500
ANCHORS=8
SEED=12345
TOPK=10
REPORT_ONLY=0
OPEN_REPORT=0

# One server that can serve several local models (config.lua's `local` entry, via
# its available_models). The harness loads each model in turn on this server.
SERVER="local"

# Comma-separated model names to compare; each must be in the server's
# available_models (config.lua) so its GGUF + prompt prefix resolve.
MODELS_CSV="nomic-embed-text-v1.5,mxbai-embed-large-v1,embeddinggemma-300m"

# }}}

# {{{ arg parsing

# parse_args ARGS...
#
# Fill the option globals (SAMPLE, ANCHORS, SEED, TOPK, SERVER, MODELS_CSV,
# REPORT_ONLY, OPEN_REPORT, DIR) from the command line.
#
# Exits 1 on an unknown flag or on a value-taking flag that has no value;
# exits 0 after printing the file header for --help/-h.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --sample|--anchors|--seed|--top-k|--server|--models)
        # Check explicitly: under `set -u` a bare "$2" would abort with an
        # opaque "unbound variable" message instead of a usage error.
        if [ $# -lt 2 ]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        case "$1" in
          --sample)  SAMPLE="$2" ;;
          --anchors) ANCHORS="$2" ;;
          --seed)    SEED="$2" ;;
          --top-k)   TOPK="$2" ;;
          --server)  SERVER="$2" ;;
          --models)  MODELS_CSV="$2" ;;
        esac
        shift 2 ;;
      --report-only) REPORT_ONLY=1; shift ;;
      --open) OPEN_REPORT=1; shift ;;
      --help|-h)
        # The usage text is the comment header at the top of this file.
        sed -n '2,28p' "$0"; exit 0 ;;
      -*) echo "Unknown flag: $1" >&2; exit 1 ;;
      # Anything that is not a flag is the project directory.
      *) DIR="$1"; shift ;;
    esac
  done
}
parse_args "$@"

# Paths derived from DIR; computed only after parsing so a positional DIR wins.
EVAL_DIR="${DIR}/output/model-evaluation"
PID_FILE="${DIR}/tmp/llamacpp-server.pid"
HEALTH_URL="" # resolved per model from the server entry's host/port

# }}}

# {{{ stop_server() -- kill any llama-server we (or a prior run) started

# stop_server
#
# Stop the process whose PID is recorded in $PID_FILE (if any) and remove the
# PID file. Sends SIGTERM first, polls up to 30 times one second apart for the
# process to go away, and escalates to SIGKILL only if it is still alive.
#
# Globals:  PID_FILE (read; the file is deleted)
# Returns:  0, including when there is no PID file or it is stale/corrupt.
stop_server() {
  [ -f "$PID_FILE" ] || return 0

  local pid tries=0
  pid="$(cat "$PID_FILE" 2>/dev/null)"

  case "$pid" in
    ''|*[!0-9]*)
      # Empty or non-numeric content: never hand it to kill (a value such as
      # "-1" would address every process). Treat the file as stale.
      ;;
    *)
      if kill -0 "$pid" 2>/dev/null; then
        kill "$pid" 2>/dev/null
        # Wait for it to actually exit so the next model has the GPU + port.
        while kill -0 "$pid" 2>/dev/null && [ "$tries" -lt 30 ]; do
          sleep 1
          tries=$((tries + 1))
        done
        # Escalate only when SIGTERM did not work; signalling a PID that has
        # already exited risks hitting an unrelated process that reused it.
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
        fi
      fi
      ;;
  esac

  rm -f "$PID_FILE"
}

# }}}

# {{{ embed_with_model(model) -- load this model on $SERVER, embed sample, stop
# The server can serve several models (one at a time); --model picks which GGUF
# to load. Errors are fatal: a half-embedded model would make its column
# meaningless, and we would rather stop than publish a misleading comparison.

# embed_with_model MODEL
#
# Start the llama.cpp server for MODEL on $SERVER, run the `embed` step of
# model-comparison.lua against it, and stop the server again.
#
# Globals:  DIR, SERVER (read)
# Outputs:  a banner on stdout; an ERROR message on stderr on failure
# Returns:  0 when both the server start and the embed step succeeded, else 1.
#           The server is stopped on every path.
embed_with_model() {
  local model="$1"
  local launcher="${DIR}/scripts/start-llamacpp-server.sh"
  local status=0

  printf '\n'
  printf '%s\n' "=== ${model} (server: ${SERVER}) ==="

  # Never leave a previous model loaded.
  stop_server

  if "$launcher" "--server=${SERVER}" "--model=${model}" >/dev/null 2>&1; then
    if ! luajit "${DIR}/src/model-comparison.lua" "${DIR}" embed \
        --server "${SERVER}" --model "${model}"; then
      echo "ERROR: embedding failed for model '${model}'" >&2
      status=1
    fi
  else
    echo "ERROR: failed to start '${model}' on server '${SERVER}'" >&2
    echo " (check the GGUF in assets/models/ and the llama.cpp binary)" >&2
    status=1
  fi

  stop_server
  return "$status"
}

# }}}

# {{{ main
# Pipeline: prepare directories -> (unless --report-only) select the sample and
# embed it with every model -> build the HTML report -> optionally open it.

# The PID file lives under ${DIR}/tmp, so that directory has to exist first.
"${DIR}/scripts/ensure-tmp-symlink" "${DIR}" >/dev/null || {
  echo "ERROR: could not materialize tmp/" >&2; exit 1
}
mkdir -p "${EVAL_DIR}" || {
  echo "ERROR: could not create ${EVAL_DIR}" >&2; exit 1
}

# Split the comma-separated list up front so an empty list fails before any
# expensive work is done.
IFS=',' read -ra MODELS <<< "${MODELS_CSV}"
if [ "${#MODELS[@]}" -eq 0 ]; then
  echo "ERROR: no models to compare (--models is empty)" >&2
  exit 1
fi

if [ "$REPORT_ONLY" -eq 0 ]; then
  # An interrupted or failed run must not leave a model loaded on the GPU.
  # Only armed in embedding mode: --report-only never starts a server and
  # should not stop one it does not own.
  trap stop_server EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  echo "== Selecting reproducible sample (=${SAMPLE} poems, ${ANCHORS} anchors, seed ${SEED}) =="
  luajit "${DIR}/src/model-comparison.lua" "${DIR}" select \
    --sample "${SAMPLE}" --anchors "${ANCHORS}" --seed "${SEED}" || exit 1

  # One model at a time; any failure aborts the whole comparison.
  for model in "${MODELS[@]}"; do
    embed_with_model "${model}" || {
      echo "Aborting: model '${model}' did not produce embeddings." >&2
      exit 1
    }
  done
fi

echo ""
echo "== Building comparison report (top-${TOPK}) =="
luajit "${DIR}/src/model-comparison.lua" "${DIR}" report \
  --models "${MODELS_CSV}" --top-k "${TOPK}" || exit 1

REPORT="${EVAL_DIR}/comparison-report.html"
echo ""
echo "Done. Report: ${REPORT}"

if [ "$OPEN_REPORT" -eq 1 ]; then
  # Launch in the background so the script can finish while the browser runs.
  if command -v xdg-open >/dev/null 2>&1; then
    xdg-open "${REPORT}" >/dev/null 2>&1 &
  elif command -v firefox >/dev/null 2>&1; then
    firefox "${REPORT}" >/dev/null 2>&1 &
  else
    echo "Note: neither xdg-open nor firefox found; open the report manually." >&2
  fi
fi