scripts/evaluate-embedding-models
#!/usr/bin/env bash
# evaluate-embedding-models (Issue 10-031)
# Build the side-by-side embedding-model comparison report.
#
# General description (for a CEO): we have several embedding models that each
# score "how similar are these two poems" differently. This tool picks a fixed
# line-up of poems once, then loads each model in turn, records its similarity
# opinions, and finally lays all the opinions side by side as a web page so a
# human can see what each model is really rewarding -- shared wording, shared
# meaning, shared theme, similar length.
#
# How it works: the GPU can hold one embedding model at a time, so we start the
# llama.cpp server on model A, embed the sample, stop it, start model B, and so
# on. The per-model results are cached, so re-running only the report is cheap.
#
# Usage:
#   scripts/evaluate-embedding-models [DIR]
#   scripts/evaluate-embedding-models --sample 500 --anchors 8 --seed 12345 --top-k 10
#   scripts/evaluate-embedding-models --models "nomic-embed-text-v1.5:local,mxbai-embed-large-v1:local-mxbai"
#   scripts/evaluate-embedding-models --report-only   # rebuild the page from cached embeddings
#   scripts/evaluate-embedding-models --open          # open the report in a browser when done
# Treat any reference to an unset variable as an error.
set -u
# {{{ DIR resolution + defaults
# Project root; a positional [DIR] argument replaces it during arg parsing.
DIR="/mnt/mtwo/programming/ai-stuff/neocities-modernization"
# Passed to the `select` step (--sample / --anchors / --seed).
SAMPLE=500
ANCHORS=8
SEED=12345
# Passed to the `report` step (--top-k).
TOPK=10
# 1 = skip selection and embedding, only rebuild the report (--report-only).
REPORT_ONLY=0
# 1 = open the finished report in a browser (--open).
OPEN_REPORT=0
# One server that can serve several local models (config.lua's local entry, via
# its available_models). The harness loads each model in turn on this server.
SERVER="local"
# Comma-separated model names to compare; each must be in the server's
# available_models (config.lua) so its GGUF + prompt prefix resolve.
MODELS_CSV="nomic-embed-text-v1.5,mxbai-embed-large-v1,embeddinggemma-300m"
# }}}
# {{{ arg parsing
# parse_args(args...) -- apply command-line flags to the script's globals.
#
# Value flags (--sample, --anchors, --seed, --top-k, --server, --models) set
# the matching global; --report-only and --open set their switch to 1; the
# first non-flag word becomes DIR. Unknown flags and value flags given without
# a value print a message on stderr and exit 1. --help/-h prints part of this
# file and exits 0.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --sample|--anchors|--seed|--top-k|--server|--models)
        # Fail with a readable message instead of tripping `set -u` on "$2".
        if [ $# -lt 2 ]; then
          echo "Missing value for $1" >&2
          exit 1
        fi
        case "$1" in
          --sample)  SAMPLE="$2" ;;
          --anchors) ANCHORS="$2" ;;
          --seed)    SEED="$2" ;;
          --top-k)   TOPK="$2" ;;
          --server)  SERVER="$2" ;;
          --models)  MODELS_CSV="$2" ;;
        esac
        shift 2
        ;;
      --report-only) REPORT_ONLY=1; shift ;;
      --open) OPEN_REPORT=1; shift ;;
      --help|-h)
        # NOTE(review): assumes the header comment occupies lines 2-28 of this
        # file -- confirm whenever the header changes length.
        sed -n '2,28p' "$0"; exit 0 ;;
      -*) echo "Unknown flag: $1" >&2; exit 1 ;;
      # Catch-all: anything that is not a flag is the project directory. This
      # arm must consume the word, otherwise the loop never terminates.
      *) DIR="$1"; shift ;;
    esac
  done
}
parse_args "$@"
# Derived paths; computed after parsing because DIR may have been overridden.
EVAL_DIR="${DIR}/output/model-evaluation"
PID_FILE="${DIR}/tmp/llamacpp-server.pid"
HEALTH_URL="" # resolved per model from the server entry's host/port
# }}}
# {{{ stop_server() -- kill any llama-server we (or a prior run) started
# Reads the PID stored in $PID_FILE. If that process is alive it is sent
# SIGTERM, given up to 30 seconds to exit, and sent SIGKILL only if it is
# still running after that. The PID file is removed in every case; a missing
# PID file makes this a no-op.
# Globals: PID_FILE (read; the file is deleted)
stop_server() {
  [ -f "$PID_FILE" ] || return 0

  local pid tries=0
  pid="$(cat "$PID_FILE" 2>/dev/null)"

  case "$pid" in
    '')
      # Empty or unreadable PID file: nothing to signal.
      ;;
    0*|*[!0-9]*)
      # Only a positive decimal integer is a process ID. Passing 0 or a
      # negative number to kill would signal a whole process group (or every
      # process we own), so a corrupt file must never reach kill.
      echo "WARNING: ignoring invalid PID in ${PID_FILE}" >&2
      ;;
    *)
      if kill -0 "$pid" 2>/dev/null; then
        kill "$pid" 2>/dev/null
        # Wait for it to actually exit so the next model has the GPU + port.
        while kill -0 "$pid" 2>/dev/null && [ "$tries" -lt 30 ]; do
          sleep 1
          tries=$((tries + 1))
        done
        # Escalate only when the polite request was not honoured in time.
        if kill -0 "$pid" 2>/dev/null; then
          kill -9 "$pid" 2>/dev/null || true
        fi
      fi
      ;;
  esac

  rm -f "$PID_FILE"
}
# }}}
# {{{ embed_with_model(model) -- load this model on $SERVER, embed sample, stop
# The server can serve several models (one at a time); --model picks which GGUF
# to load. Errors are fatal: a half-embedded model would make its column
# meaningless, and we would rather stop than publish a misleading comparison.
#
# Arguments: $1 - model name, passed to the start script and the embed step
# Globals:   DIR, SERVER (read)
# Returns:   0 when the server started and the embed step succeeded, else 1.
#            The server is stopped before returning on every path.
embed_with_model() {
  local model="$1"
  local status=0

  echo ""
  echo "=== ${model} (server: ${SERVER}) ==="

  stop_server # never leave a previous model loaded

  # The start script's own output is discarded; only its exit status is used.
  if ! "${DIR}/scripts/start-llamacpp-server.sh" \
      "--server=${SERVER}" "--model=${model}" >/dev/null 2>&1; then
    echo "ERROR: failed to start '${model}' on server '${SERVER}'" >&2
    echo " (check the GGUF in assets/models/ and the llama.cpp binary)" >&2
    status=1
  elif ! luajit "${DIR}/src/model-comparison.lua" "${DIR}" embed \
      --server "${SERVER}" --model "${model}"; then
    echo "ERROR: embedding failed for model '${model}'" >&2
    status=1
  fi

  # Single exit path: the server is always stopped, success or failure.
  stop_server
  return "$status"
}
# }}}
# {{{ main
# Pipeline: make sure tmp/ and the output directory exist, then (unless
# --report-only) select the sample and embed it with every model in
# MODELS_CSV, then build the HTML report and optionally open it.
"${DIR}/scripts/ensure-tmp-symlink" "${DIR}" >/dev/null || {
  echo "ERROR: could not materialize tmp/" >&2; exit 1
}
mkdir -p "${EVAL_DIR}" || {
  echo "ERROR: could not create ${EVAL_DIR}" >&2; exit 1
}

if [ "$REPORT_ONLY" -eq 0 ]; then
  echo "== Selecting reproducible sample (=${SAMPLE} poems, ${ANCHORS} anchors, seed ${SEED}) =="
  luajit "${DIR}/src/model-comparison.lua" "${DIR}" select \
    --sample "${SAMPLE}" --anchors "${ANCHORS}" --seed "${SEED}" || exit 1

  # From here on a llama-server may be running on our behalf. Make sure an
  # interrupt or any early exit still stops it, so the GPU and port are not
  # left occupied. INT/TERM are turned into a normal exit so the EXIT trap
  # runs; stop_server is a no-op once the PID file is gone.
  trap stop_server EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM

  # Split the comma-separated list into one array element per model.
  IFS=',' read -ra MODELS <<< "${MODELS_CSV}"
  for model in "${MODELS[@]}"; do
    embed_with_model "${model}" || {
      echo "Aborting: model '${model}' did not produce embeddings." >&2
      exit 1
    }
  done
fi

echo ""
echo "== Building comparison report (top-${TOPK}) =="
luajit "${DIR}/src/model-comparison.lua" "${DIR}" report \
  --models "${MODELS_CSV}" --top-k "${TOPK}" || exit 1

REPORT="${EVAL_DIR}/comparison-report.html"
echo ""
echo "Done. Report: ${REPORT}"

if [ "$OPEN_REPORT" -eq 1 ]; then
  # Launch the viewer in the background so the script can finish.
  if command -v xdg-open >/dev/null 2>&1; then
    xdg-open "${REPORT}" >/dev/null 2>&1 &
  elif command -v firefox >/dev/null 2>&1; then
    firefox "${REPORT}" >/dev/null 2>&1 &
  else
    echo "WARNING: --open requested but neither xdg-open nor firefox was found" >&2
  fi
fi