src/model-comparison.lua
1#!/usr/bin/env luajit
2-- {{{ model-comparison.lua
3-- Data + report layer for the embedding-model evaluation framework (Issue
4-- 10-031). Three subcommands, each a separate resumable step so the expensive
5-- middle one (which needs a model loaded on the GPU) is isolated from the cheap
6-- ends:
7--
8-- select : choose a reproducible sample of poems + a spread of anchor poems,
9-- write output/model-evaluation/sample.json
10-- embed : embed that sample with the CURRENTLY-RUNNING model server, write
11-- output/model-evaluation/<model>/sample-embeddings.json
12-- report : read the sample + every model's embeddings, rank each anchor per
13-- model, compute agreement/divergence + "personality" signals, and
14-- emit output/model-evaluation/comparison-report.html (+ metrics.json)
15--
16-- General description (for a CEO): step one picks the line-up of poems to judge;
17-- step two runs once per model (with that model loaded) to record its opinions;
18-- step three lays the opinions side by side as a web page a human can read.
19--
20-- Orchestration (which server to start for each model) lives in the bash driver
21-- scripts/evaluate-embedding-models; this file does the data, not the process
22-- management -- separation of concerns.
23-- }}}
24
25local DIR = arg[1] or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
26package.path = DIR .. "/?.lua;" .. DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
27
28local dkjson = require("dkjson")
29local utils = require("utils")
30local evaluator = require("model-evaluator")
31
32utils.init_assets_root({ DIR })
33local EVAL_DIR = DIR .. "/output/model-evaluation"
34
-- {{{ local function parse_flags(argv, from)
-- Minimal command-line option reader with no external dependency. Walks argv
-- from index `from` to #argv and records every token that begins with "--"
-- under its name (the text after the dashes). If the next token exists and is
-- not itself a "--" option it is consumed as the value (kept as a string);
-- otherwise the option is stored as boolean true. Tokens that do not begin
-- with "--" are skipped. Returns the table of collected options.
local function parse_flags(argv, from)
    local parsed = {}
    local last = #argv
    local pos = from
    while pos <= last do
        local token = argv[pos]
        local consumed = 1
        if token:sub(1, 2) == "--" then
            local name = token:sub(3)
            local following = argv[pos + 1]
            if following ~= nil and following:sub(1, 2) ~= "--" then
                -- "--name value": the value travels with the option.
                parsed[name] = following
                consumed = 2
            else
                -- "--name" at the end or followed by another option: a switch.
                parsed[name] = true
            end
        end
        pos = pos + consumed
    end
    return parsed
end
-- }}}
58
-- {{{ local function read_json(path) / write_json(path, t)
-- read_json: read the whole file at `path` and pass its text to dkjson.decode,
-- returning whatever decode returns. Returns nil (without raising) when the
-- file cannot be opened, so callers can use `read_json(p) or error(...)`.
local function read_json(path)
    local handle = io.open(path, "r")
    if handle == nil then
        return nil
    end
    local text = handle:read("*a")
    handle:close()
    return dkjson.decode(text)
end
66
-- write_json: encode table `t` with dkjson (indented) and write it to `path`.
-- The caller is responsible for creating the parent directory; if the file
-- cannot be opened for writing this raises "cannot write <path>" instead of
-- letting a run continue with a missing output.
local function write_json(path, t)
    local handle = io.open(path, "w")
    assert(handle, "cannot write " .. path)
    local encoded = dkjson.encode(t, { indent = true })
    handle:write(encoded)
    handle:close()
end
-- }}}
75
-- {{{ local function model_dir(model)
-- Output directory for one model: EVAL_DIR/<sanitized model name>. Every
-- character of `model` that is not alphanumeric, "-", "_" or "." is replaced
-- with "_" (so a name such as "qwen:4b" becomes "qwen_4b"), which keeps the
-- result usable as a single folder name.
local function model_dir(model)
    -- Assigning to a single local keeps only gsub's string result and drops
    -- its replacement count.
    local folder = model:gsub("[^%w%-_.]", "_")
    return EVAL_DIR .. "/" .. folder
end
-- }}}
84
-- {{{ local function lcg(seed)
-- Self-contained pseudo-random generator: a multiplicative linear congruential
-- generator with the Park-Miller constants (multiplier 16807, modulus 2^31-1).
-- It replaces math.random so the same seed yields the same sequence wherever
-- the script runs; every intermediate product stays far below 2^53, so the
-- arithmetic is exact in double-precision numbers.
-- `seed` is reduced modulo the modulus; a result of 0 (which would lock the
-- generator at 0) is replaced by modulus-1. Returns a function that advances
-- the state and returns (state - 1) / (modulus - 1), which for integer seeds
-- lies in [0, 1).
local function lcg(seed)
    local MODULUS = 2147483647
    local MULTIPLIER = 16807
    local current = seed % MODULUS
    if current <= 0 then
        current = current + (MODULUS - 1)
    end
    local function next_value()
        current = (current * MULTIPLIER) % MODULUS
        return (current - 1) / (MODULUS - 1)
    end
    return next_value
end
-- }}}
99
-- {{{ local function load_poems()
-- Read poems.json (located through utils.asset_path) and keep only entries
-- that have text to embed. The document may be either { poems = [...] } or a
-- bare array. An entry is dropped when it is flagged is_image_only or when
-- its text (content, falling back to text) contains nothing but whitespace.
-- Raises "cannot read assets/poems.json" when the file cannot be loaded.
-- Returns an array of { poem_index, id, content, length }, where poem_index
-- falls back to id and length falls back to the byte length of the text.
local function load_poems()
    local doc = read_json(utils.asset_path("poems.json"))
    if not doc then error("cannot read assets/poems.json") end

    local kept = {}
    for _, entry in ipairs(doc.poems or doc) do
        local body = entry.content or entry.text or ""
        -- Image-only entries are rejected before the text is inspected at all.
        local usable = not entry.is_image_only
        if usable then
            usable = body:find("%S") ~= nil
        end
        if usable then
            kept[#kept + 1] = {
                poem_index = entry.poem_index or entry.id,
                id = entry.id,
                content = body,
                length = entry.length or #body,
            }
        end
    end
    return kept
end
-- }}}
123
-- {{{ local function cmd_select(flags)
-- `select` subcommand: pick a reproducible sample of poems plus a
-- length-spread set of anchors, and write both to EVAL_DIR/sample.json.
--
-- Flags (all optional, all read through tonumber):
--   --sample N   number of poems in the sample (default 500, capped at the
--                number of embeddable poems; a fractional N is floored)
--   --anchors K  number of anchors (default 8, capped at the sample size;
--                a fractional K is floored)
--   --seed S     seed for the shuffle (default 12345)
--
-- Anchors come from sorting the sample by length and taking K evenly spaced
-- positions, so they run from the shortest poems to the longest with no
-- hand-curation. Raises when no embeddable poem is found.
local function cmd_select(flags)
    local n_sample = math.floor(tonumber(flags.sample) or 500)
    local n_anchor = math.floor(tonumber(flags.anchors) or 8)
    local seed = tonumber(flags.seed) or 12345

    local poems = load_poems()
    if #poems == 0 then error("no embeddable poems found") end

    -- Seeded Fisher-Yates over an index list, then take the first N. Shuffling
    -- (vs striding) removes any bias from how poems.json happens to be ordered.
    local rand = lcg(seed)
    local order = {}
    for i = 1, #poems do order[i] = i end
    for i = #order, 2, -1 do
        local j = math.floor(rand() * i) + 1
        order[i], order[j] = order[j], order[i]
    end
    n_sample = math.min(n_sample, #poems)
    local sample = {}
    for i = 1, n_sample do sample[i] = poems[order[i]] end

    -- Sort a copy by length. table.sort is not stable, so poems of equal
    -- length are tie-broken by their position in the (deterministic) shuffled
    -- sample; without that the anchor picked among equal-length poems would
    -- depend on the sort implementation rather than on the seed alone.
    local by_len, position = {}, {}
    for i = 1, #sample do
        by_len[i] = sample[i]
        position[sample[i]] = i
    end
    table.sort(by_len, function(a, b)
        if a.length ~= b.length then return a.length < b.length end
        return position[a] < position[b]
    end)

    -- K evenly spaced picks: the midpoint of each of K equal slices.
    local anchors = {}
    n_anchor = math.min(n_anchor, #by_len)
    for k = 1, n_anchor do
        local pos = math.floor((k - 0.5) / n_anchor * #by_len) + 1
        if pos > #by_len then pos = #by_len end
        anchors[k] = by_len[pos].poem_index
    end

    utils.ensure_directory(EVAL_DIR)
    write_json(EVAL_DIR .. "/sample.json", {
        seed = seed, sample_size = #sample, anchor_count = #anchors,
        anchors = anchors, sample = sample,
    })
    print(string.format("[select] sample=%d anchors=%d seed=%d -> %s/sample.json",
        #sample, #anchors, seed, EVAL_DIR))
end
-- }}}
170
-- {{{ local function cmd_embed(flags)
-- `embed` subcommand: embed every poem of EVAL_DIR/sample.json with the model
-- the inference server is currently serving, and write the vectors to
-- <model_dir>/sample-embeddings.json.
--
-- Flags:
--   --server NAME    (required) config entry to select, so the right prompt
--                    prefix is applied
--   --model NAME     (required) model identifier sent with each request
--   --chunk N        poems per request, positive integer (default 16)
--   --max-tokens N   override for the per-chunk token budget
--
-- Requests are batched so one over-large request cannot blow the server's
-- batch limits. A missing vector is a hard error (a partial space would
-- silently corrupt every ranking), not a skipped poem. Invalid flag values are
-- rejected before any request is made rather than producing an empty or
-- partial embeddings file.
local function cmd_embed(flags)
    -- A bare "--server" / "--model" parses to boolean true, so check the type
    -- rather than mere presence.
    local server = flags.server
    local model = flags.model
    if type(server) ~= "string" then error("--server NAME required") end
    if type(model) ~= "string" then error("--model NAME required") end

    -- A zero, negative or fractional step would make the batching loop below
    -- run zero times, never terminate, or index the sample at non-integer
    -- positions, so reject it up front.
    local chunk = tonumber(flags.chunk) or 16
    if chunk < 1 or chunk % 1 ~= 0 then
        error("--chunk must be a positive integer, got " .. tostring(flags.chunk))
    end

    -- Late requires: only this step needs the inference/embedding stack, so
    -- select/report stay light and runnable without a server.
    local inference = require("inference-server-config")
    inference.set_project_root(DIR)
    inference.set_selected_server(server)
    -- Select the model too, so format_embedding_prompt resolves THIS model's
    -- prefix from the server's available_models (nomic clusters, gemma has its
    -- own clustering prompt, mxbai none) -- not the server default's.
    inference.set_selected_model(model)
    local fuzzy = require("fuzzy-computing")
    local endpoint = inference.build_host_url()
    local format_fn = inference.format_embedding_prompt
    -- Chunk to the LOADED model's context budget. mxbai-embed-large caps at 512
    -- tokens (BERT-large), while nomic and gemma allow ~2048; a poem longer than
    -- the cap is split and its chunk vectors averaged -- exactly how the real
    -- pipeline embeds long poems (Issue 10-050). Computed once per model here so
    -- we don't re-query /tokenize's budget on every batch.
    local count_fn = fuzzy.make_token_counter(endpoint)
    local max_tokens = fuzzy.embedding_chunk_budget(endpoint, format_fn)
    -- embedding_chunk_budget derives from a fixed MODEL_CONTEXT_TOKENS constant
    -- (sized for nomic/gemma's ~2048). Some models have a SMALLER trained context
    -- than that and than the server's launch --ctx-size, so their real limit must
    -- be capped explicitly or the chunker emits chunks the server rejects
    -- ("exceed_context_size_error"). mxbai-embed-large is BERT-large: 512 tokens.
    local model_ctx_cap = {
        ["mxbai-embed-large-v1"] = 500, -- 512 trained ctx, minus specials, headroom
    }
    if flags["max-tokens"] then
        -- A non-numeric override must not silently replace the computed budget
        -- with nil.
        local override = tonumber(flags["max-tokens"])
        if not override or override < 1 then
            error("--max-tokens must be a positive number, got "
                .. tostring(flags["max-tokens"]))
        end
        max_tokens = override
    end
    if model_ctx_cap[model] then max_tokens = math.min(max_tokens, model_ctx_cap[model]) end

    local sample_doc = read_json(EVAL_DIR .. "/sample.json")
        or error("run `select` first: missing " .. EVAL_DIR .. "/sample.json")
    local sample = sample_doc.sample
    if type(sample) ~= "table" then
        error("malformed " .. EVAL_DIR .. "/sample.json: no `sample` array")
    end

    local embeddings = {}
    local total = #sample
    for start = 1, total, chunk do
        local stop = math.min(start + chunk - 1, total)
        local texts, idxs = {}, {}
        for i = start, stop do
            texts[#texts + 1] = sample[i].content
            idxs[#idxs + 1] = sample[i].poem_index
        end
        local vecs, err = fuzzy.embed_texts_with_chunking(texts, model, {
            endpoint = endpoint, format_fn = format_fn,
            count_fn = count_fn, max_tokens = max_tokens,
        })
        if not vecs then error("embedding batch failed: " .. tostring(err)) end
        for j = 1, #texts do
            if not vecs[j] then
                error(string.format("missing vector for poem_index %s (model %s)",
                    tostring(idxs[j]), model))
            end
            -- Keys are stringified because they become JSON object keys.
            embeddings[tostring(idxs[j])] = vecs[j]
        end
        -- "\r" rewrites the same terminal line as a progress counter.
        io.write(string.format("\r[embed %s] %d/%d", model, stop, total)); io.flush()
    end
    io.write("\n")

    -- Dimensionality is read off whichever vector pairs() yields first.
    local dim = 0
    for _, v in pairs(embeddings) do dim = #v; break end
    utils.ensure_directory(model_dir(model))
    write_json(model_dir(model) .. "/sample-embeddings.json", {
        model = model, server = server, dimensions = dim, embeddings = embeddings,
    })
    print(string.format("[embed] %s: %d vectors x %d dims", model, total, dim))
end
-- }}}
250
-- {{{ local function esc(s)
-- HTML-escape a value for insertion into the report. `s` is converted with
-- tostring first, so numbers are accepted. The five characters & < > " ' are
-- replaced by entities; the single quote is included because the report writes
-- its attributes in single quotes. Returns exactly one value (the escaped
-- string); the outer parentheses drop gsub's replacement count.
local HTML_ENTITIES = {
    ["&"] = "&amp;",
    ["<"] = "&lt;",
    [">"] = "&gt;",
    ["\""] = "&quot;",
    ["'"] = "&#39;",
}

local function esc(s)
    return (tostring(s):gsub("[&<>\"']", HTML_ENTITIES))
end
-- }}}
259
-- {{{ local function snippet(text, n)
-- One-line preview of a poem: every run of whitespace becomes a single space,
-- leading whitespace is removed, and anything longer than `n` bytes is clipped
-- and given a trailing UTF-8 ellipsis. Keeps the side-by-side columns
-- scannable.
-- The clip never lands inside a multi-byte UTF-8 character: if the byte just
-- past the cut is a continuation byte (10xxxxxx) the cut moves back to the
-- start of that character, so the preview stays valid UTF-8 and may therefore
-- be up to 3 bytes shorter than `n`. Returns the preview string.
local function snippet(text, n)
    local s = tostring(text):gsub("%s+", " "):gsub("^%s+", "")
    if #s > n then
        local cut = n
        -- A UTF-8 character has at most 3 continuation bytes, so back up at
        -- most 3 positions; if that is not enough the text is not valid UTF-8
        -- and the plain byte cut is kept.
        for _ = 1, 3 do
            if cut <= 0 then break end
            local following = s:byte(cut + 1)
            if following and following >= 128 and following < 192 then
                cut = cut - 1
            else
                break
            end
        end
        local following = s:byte(cut + 1)
        if cut > 0 and following and following >= 128 and following < 192 then
            cut = n
        end
        s = s:sub(1, cut) .. "\226\128\166" -- UTF-8 ellipsis
    end
    return s
end
-- }}}
269
-- {{{ local function cmd_report(flags)
-- `report` subcommand: read the sample plus each model's embeddings and render
-- the side-by-side comparison to EVAL_DIR/comparison-report.html, with the
-- pairwise numbers also written to EVAL_DIR/metrics.json.
--
-- Flags:
--   --models a,b,c  (required) ordered, comma-separated column list; the names
--                   double as labels and select each model's embeddings file.
--                   Surrounding whitespace is trimmed, empty names are dropped.
--   --top-k N       neighbours shown per anchor and model (default 10)
--
-- For each anchor the pool is ranked per model (evaluator.rank_anchor), the
-- top entries are listed, and each model pair gets an agreement line
-- (evaluator.topk_agreement and evaluator.kendall_tau). Per model the
-- evaluator.personality results are averaged over anchors so a human can see
-- WHAT each model rewards.
-- Raises when --models is missing or empty, or when sample.json or a model's
-- embeddings file cannot be read.
local function cmd_report(flags)
    local top_k = tonumber(flags["top-k"]) or 10
    -- A bare "--models" parses to boolean true, so check the type.
    local models_csv = flags.models
    if type(models_csv) ~= "string" then error("--models a,b,c required") end
    local models = {}
    for m in models_csv:gmatch("[^,]+") do
        local name = m:gsub("^%s+", ""):gsub("%s+$", "")
        if name ~= "" then models[#models + 1] = name end
    end
    if #models == 0 then error("--models a,b,c required") end

    local sample_doc = read_json(EVAL_DIR .. "/sample.json")
        or error("missing sample.json -- run select + embed first")

    -- poem_index -> content/length, for snippets, lexical overlap, length bias.
    local text_of, len_of = {}, {}
    for _, p in ipairs(sample_doc.sample) do
        text_of[p.poem_index] = p.content
        len_of[p.poem_index] = p.length
        text_of[tostring(p.poem_index)] = p.content -- tolerate string/number keys
        len_of[tostring(p.poem_index)] = p.length
    end

    -- Load each model's pool (poem_index -> vector). Keys are strings in JSON;
    -- anchors are stringified below so lookups use the same key type.
    local pools, dims = {}, {}
    for _, m in ipairs(models) do
        local doc = read_json(model_dir(m) .. "/sample-embeddings.json")
            or error("missing embeddings for model '" .. m .. "' -- run embed for it")
        local pool = {}
        for k, v in pairs(doc.embeddings) do pool[k] = v end
        pools[m] = pool
        dims[m] = doc.dimensions
    end

    -- Per-anchor rankings, and per-model personality accumulators.
    local anchors = sample_doc.anchors
    local rankings = {} -- rankings[anchor][model] = sorted list
    local pers = {}     -- pers[model] = { jacc={}, lenr={}, score={} }
    for _, m in ipairs(models) do pers[m] = { jacc = {}, lenr = {}, score = {} } end

    for _, anchor in ipairs(anchors) do
        local akey = tostring(anchor)
        rankings[akey] = {}
        for _, m in ipairs(models) do
            local avec = pools[m][akey]
            -- A model without a vector for this anchor gets no ranking; its
            -- column later shows "no embedding".
            if avec then
                local r = evaluator.rank_anchor(avec, pools[m], akey, top_k)
                rankings[akey][m] = r
                local pp = evaluator.personality(text_of[akey] or "", len_of[akey] or 0,
                    r, text_of, len_of, top_k)
                pers[m].jacc[#pers[m].jacc + 1] = pp.mean_jaccard
                pers[m].lenr[#pers[m].lenr + 1] = pp.mean_len_ratio
                pers[m].score[#pers[m].score + 1] = pp.mean_score
            end
        end
    end

    -- ---- render HTML ----
    -- Fragments are collected in `h` and joined once at the end.
    local h = {}
    local function w(s) h[#h + 1] = s end
    w([[<!DOCTYPE html><html lang="en"><head><meta charset="utf-8">]])
    w([[<meta name="viewport" content="width=device-width, initial-scale=1">]])
    w("<title>Embedding model comparison</title>")
    w([[<style>
 :root{--bg:#0f1117;--card:#181b24;--ink:#e6e8ee;--mut:#9aa3b2;--line:#2a2f3a;--hi:#7cd}
 *{box-sizing:border-box} body{margin:0;background:var(--bg);color:var(--ink);
 font:15px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;padding:24px}
 h1{font-size:22px;margin:0 0 4px} .sub{color:var(--mut);margin:0 0 20px}
 .legend{background:var(--card);border:1px solid var(--line);border-radius:10px;
 padding:14px 16px;margin:0 0 24px;max-width:1100px}
 table{border-collapse:collapse;width:100%} td,th{border:1px solid var(--line);
 padding:8px 10px;vertical-align:top;text-align:left}
 th{color:var(--mut);font-weight:600;font-size:13px}
 .pers td{font-variant-numeric:tabular-nums}
 .anchor{background:var(--card);border:1px solid var(--line);border-radius:10px;
 padding:16px;margin:26px 0 10px;max-width:1100px}
 .anchor .meta{color:var(--mut);font-size:13px;margin-bottom:6px}
 .anchor .text{white-space:pre-wrap;font-size:15px}
 .cols{display:grid;gap:14px;grid-template-columns:repeat(var(--n),1fr);max-width:1100px}
 .col{background:var(--card);border:1px solid var(--line);border-radius:10px;overflow:hidden}
 .col h3{margin:0;padding:10px 12px;background:#1f2430;font-size:14px;border-bottom:1px solid var(--line)}
 .col ol{margin:0;padding:8px 8px 10px 30px} .col li{margin:0 0 8px;font-size:13px}
 .sc{color:var(--hi);font-variant-numeric:tabular-nums}
 .jac{color:var(--mut);font-size:11px} .agree{color:var(--mut);font-size:13px;
 margin:8px 0 0;max-width:1100px}
 .shared{outline:2px solid #3b6;outline-offset:-2px;border-radius:4px;padding:1px 3px}
 code{color:#cdb}
 </style>]])
    w("</head><body>")
    w("<h1>What does each model think \"similar\" means?</h1>")
    w(string.format([[<p class="sub">Sample of %d poems, %d anchors, seed %s. Each model embedded the SAME poems; for each anchor we show its nearest neighbours per model.</p>]],
        sample_doc.sample_size, sample_doc.anchor_count, tostring(sample_doc.seed)))

    -- legend / how to read
    -- Long-bracket strings do not interpret backslash escapes, so the em dash
    -- is spliced in from an ordinary quoted string; written inside [[...]] it
    -- would reach the page as the literal text "\226\128\148".
    w([[<div class="legend"><b>How to read this.</b> Cosine scores are only
 comparable <i>within</i> a column (each model has its own space). The
 interesting thing is <b>which poems</b> each model picks and where they
 <b>disagree</b>. Green outline = a poem two or more models both chose for
 this anchor. The personality table below is data, not a verdict:
 <code>lexical overlap</code> = average shared-word fraction between an
 anchor and its top matches (high = the model rewards surface wording;
 low = it rewards something deeper ]] .. "\226\128\148" .. [[ meaning/theme/tone);
 <code>length ratio</code> = how close in length its picks are (near 1 =
 a length bias).</div>]])

    -- personality summary
    w("<h2 style='max-width:1100px'>Model personalities (averaged over anchors)</h2>")
    w("<table class='pers' style='max-width:1100px'><tr><th>model</th><th>dims</th>"
        .. "<th>lexical overlap</th><th>length ratio</th><th>mean top-K cosine</th></tr>")
    for _, m in ipairs(models) do
        w(string.format("<tr><td>%s</td><td>%s</td><td>%.3f</td><td>%.3f</td><td>%.3f</td></tr>",
            esc(m), tostring(dims[m]),
            evaluator.mean(pers[m].jacc) or 0,
            evaluator.mean(pers[m].lenr) or 0,
            evaluator.mean(pers[m].score) or 0))
    end
    w("</table>")

    -- per anchor
    local metrics = { models = models, top_k = top_k, anchors = {} }
    for _, anchor in ipairs(anchors) do
        local akey = tostring(anchor)
        w("<div class='anchor'>")
        w(string.format("<div class='meta'>Anchor — poem #%s, %s chars</div>",
            esc(akey), tostring(len_of[akey] or "?")))
        w("<div class='text'>" .. esc(snippet(text_of[akey] or "", 600)) .. "</div></div>")

        -- which poems are shared across >=2 models for this anchor (for green outline)
        local count_in = {}
        for _, m in ipairs(models) do
            local r = rankings[akey][m]
            if r then
                for _, e in ipairs(r) do
                    count_in[e.poem_index] = (count_in[e.poem_index] or 0) + 1
                end
            end
        end

        -- One grid column per model; --n feeds the CSS repeat() count.
        w(string.format("<div class='cols' style='--n:%d'>", #models))
        for _, m in ipairs(models) do
            w("<div class='col'><h3>" .. esc(m) .. "</h3><ol>")
            local r = rankings[akey][m]
            if r then
                for _, e in ipairs(r) do
                    local cls = (count_in[e.poem_index] or 0) >= 2 and " class='shared'" or ""
                    local jac = evaluator.lexical_jaccard(text_of[akey] or "", text_of[e.poem_index] or "")
                    w(string.format("<li><span class='sc'>%.3f</span> "
                        .. "<span class='jac'>(words %.0f%%)</span><br><span%s>%s</span></li>",
                        e.score, jac * 100, cls, esc(snippet(text_of[e.poem_index] or "", 140))))
                end
            else
                w("<li><i>no embedding</i></li>")
            end
            w("</ol></div>")
        end
        w("</div>")

        -- pairwise agreement line + collect metrics
        local amx = { anchor = anchor, pairs = {} }
        local parts = {}
        for i = 1, #models - 1 do
            for j = i + 1, #models do
                local ra, rb = rankings[akey][models[i]], rankings[akey][models[j]]
                if ra and rb then
                    local shared = evaluator.topk_agreement(ra, rb, top_k)
                    local tau = evaluator.kendall_tau(ra, rb)
                    -- "\207\132" is the UTF-8 encoding of the Greek letter tau;
                    -- the tau part is omitted when kendall_tau returns nil.
                    parts[#parts + 1] = string.format("%s vs %s: %d/%d shared%s",
                        models[i], models[j], shared, top_k,
                        tau and string.format(", \207\132=%.2f", tau) or "")
                    amx.pairs[#amx.pairs + 1] = {
                        a = models[i], b = models[j], shared = shared, kendall_tau = tau,
                    }
                end
            end
        end
        w("<p class='agree'>Agreement — " .. esc(table.concat(parts, " | ")) .. "</p>")
        metrics.anchors[#metrics.anchors + 1] = amx
    end

    w("</body></html>")

    utils.ensure_directory(EVAL_DIR)
    local html_path = EVAL_DIR .. "/comparison-report.html"
    local fh = assert(io.open(html_path, "w"), "cannot write " .. html_path)
    fh:write(table.concat(h)); fh:close()
    write_json(EVAL_DIR .. "/metrics.json", metrics)
    print("[report] wrote " .. html_path)
end
-- }}}
459
-- {{{ dispatch
-- Entry point. arg[1] is the project directory (consumed at the top of the
-- file), arg[2] names the subcommand and everything from arg[3] onward is
-- parsed as --flags. An unknown or missing subcommand prints the usage line
-- to stderr and exits with status 1.
local sub = arg[2]
local flags = parse_flags(arg, 3)
local commands = {
    select = cmd_select,
    embed = cmd_embed,
    report = cmd_report,
}
local run = commands[sub]
if run then
    run(flags)
else
    io.stderr:write("usage: model-comparison.lua DIR {select|embed|report} [--flags]\n")
    os.exit(1)
end
-- }}}
471