src/model-comparison.lua

471 lines

1#!/usr/bin/env luajit

2-- {{{ model-comparison.lua

3-- Data + report layer for the embedding-model evaluation framework (Issue

4-- 10-031). Three subcommands, each a separate resumable step so the expensive

5-- middle one (which needs a model loaded on the GPU) is isolated from the cheap

6-- ends:

7--

8-- select : choose a reproducible sample of poems + a spread of anchor poems,

9-- write output/model-evaluation/sample.json

10-- embed : embed that sample with the CURRENTLY-RUNNING model server, write

11-- output/model-evaluation/<model>/sample-embeddings.json

12-- report : read the sample + every model's embeddings, rank each anchor per

13-- model, compute agreement/divergence + "personality" signals, and

14-- emit output/model-evaluation/comparison-report.html (+ metrics.json)

15--

16-- General description (for a CEO): step one picks the line-up of poems to judge;

17-- step two runs once per model (with that model loaded) to record its opinions;

18-- step three lays the opinions side by side as a web page a human can read.

19--

20-- Orchestration (which server to start for each model) lives in the bash driver

21-- scripts/evaluate-embedding-models; this file does the data, not the process

22-- management -- separation of concerns.

23-- }}}

-- Project root: first CLI argument, or a hard-coded default path when absent.
local DIR = arg[1] or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"

-- Put the project root, its libs/ and its src/ ahead of the stock search path
-- so the requires below resolve to the project's own modules.
package.path = table.concat({
  DIR .. "/?.lua",
  DIR .. "/libs/?.lua",
  DIR .. "/src/?.lua",
  package.path,
}, ";")

local dkjson = require("dkjson")
local utils = require("utils")
local evaluator = require("model-evaluator")

utils.init_assets_root({ DIR })

-- Root of the evaluation outputs: sample.json, one subdirectory per model,
-- the HTML report and metrics.json all live under here.
local EVAL_DIR = DIR .. "/output/model-evaluation"

-- {{{ local function parse_flags(argv, from)
-- Minimal command-line flag reader. Scans argv from index `from` onward and
-- returns a table keyed by flag name (without the leading "--"):
--   --key value   -> result.key = "value"  (value is kept as a string)
--   --flag        -> result.flag = true    (no value, or next token is a flag)
-- Tokens that do not start with "--" and were not consumed as a value are
-- ignored. A value can therefore never itself begin with "--".
local function parse_flags(argv, from)
  local parsed = {}
  local count = #argv
  local idx = from
  while idx <= count do
    local token = argv[idx]
    local step = 1
    if token:sub(1, 2) == "--" then
      local name = token:sub(3)
      local following = argv[idx + 1]
      if following ~= nil and following:sub(1, 2) ~= "--" then
        -- The next token is this flag's value; consume both.
        parsed[name] = following
        step = 2
      else
        parsed[name] = true
      end
    end
    idx = idx + step
  end
  return parsed
end
-- }}}

59-- {{{ local function read_json(path) / write_json(path, t)

-- Read the file at `path` and decode it as JSON.
-- Returns nil when the file cannot be opened. Otherwise returns whatever
-- dkjson.decode returns for the file's contents, unmodified (including any
-- extra return values it produces).
local function read_json(path)
  local handle = io.open(path, "r")
  if not handle then
    return nil
  end
  local raw = handle:read("*a")
  handle:close()
  return dkjson.decode(raw)
end

-- Serialize `t` as indented JSON and write it to `path`.
-- The parent directory must already exist (the caller creates it). Every
-- failure raises instead of leaving a silent half-run:
--   * encoding happens BEFORE the file is opened, so a value dkjson cannot
--     encode does not truncate an existing good file;
--   * a failed open, write or close (e.g. missing directory, full disk) is
--     raised as an error naming the path.
local function write_json(path, t)
  local encoded = dkjson.encode(t, { indent = true })
  local fh = assert(io.open(path, "w"), "cannot write " .. path)
  local wrote, write_err = fh:write(encoded)
  -- Close unconditionally so the handle is not leaked when the write failed.
  local closed, close_err = fh:close()
  if not wrote then
    error("cannot write " .. path .. ": " .. tostring(write_err))
  end
  if not closed then
    error("cannot write " .. path .. ": " .. tostring(close_err))
  end
end

74-- }}}

-- {{{ local function model_dir(model)
-- Output subdirectory for one model, under EVAL_DIR. Every character of the
-- model name that is not alphanumeric, "-", "_" or "." is replaced with "_",
-- so a name such as "qwen:4b" becomes the folder "qwen_4b".
local function model_dir(model)
  -- Assigning to a single local keeps only the substituted string and drops
  -- gsub's second result (the replacement count).
  local safe_name = model:gsub("[^%w%-_.]", "_")
  return EVAL_DIR .. "/" .. safe_name
end
-- }}}

-- {{{ local function lcg(seed)
-- Self-contained linear congruential generator (multiplier 16807, modulus
-- 2^31 - 1). Used instead of math.random so the sequence depends only on the
-- seed, not on the platform's random implementation.
--
-- seed: any finite number. It is floored to an integer first: the recurrence
--       only stays inside 1 .. modulus-1 for integer states, and a fractional
--       state can drop below 1, which would make the generator return
--       negative values. Integer seeds behave exactly as before.
-- Returns a function that yields a float in [0, 1) on every call.
local function lcg(seed)
  local MODULUS = 2147483647
  local state = math.floor(seed) % MODULUS
  -- A seed that is a multiple of the modulus would pin the state at 0 forever.
  if state <= 0 then state = state + (MODULUS - 1) end
  return function()
    -- state * 16807 stays below 2^53, so the product is exact in a double.
    state = (state * 16807) % MODULUS
    return (state - 1) / (MODULUS - 1)
  end
end
-- }}}

-- {{{ local function load_poems()
-- Read poems.json (located via utils.asset_path) and keep only entries that
-- carry embeddable text: entries flagged is_image_only, and entries whose
-- text is empty or whitespace-only, are dropped.
-- The poem list is taken from the document's `poems` field when present,
-- otherwise the document itself is treated as the list.
-- Returns an array of { poem_index, id, content, length }; raises when the
-- file cannot be read or decoded.
local function load_poems()
  local data = read_json(utils.asset_path("poems.json"))
  if not data then error("cannot read assets/poems.json") end

  local source = data.poems or data
  local kept = {}
  for _, entry in ipairs(source) do
    if not entry.is_image_only then
      local body = entry.content or entry.text or ""
      -- Any non-whitespace character means there is something to embed.
      if body:find("%S") then
        local record = {}
        record.poem_index = entry.poem_index or entry.id
        record.id = entry.id
        record.content = body
        record.length = entry.length or #body
        kept[#kept + 1] = record
      end
    end
  end
  return kept
end
-- }}}

123

-- {{{ local function cmd_select(flags)
-- Pick a reproducible sample of poems and a length-spread set of anchors, and
-- write both to EVAL_DIR/sample.json.
--
-- flags.sample  : sample size (default 500, capped at the number of poems)
-- flags.anchors : number of anchors (default 8, capped at the sample size)
-- flags.seed    : seed for the shuffle (default 12345)
--
-- Anchors come from sorting the sample by length and taking evenly spaced
-- positions, so they span short poems through long ones with no
-- hand-curation. Raises when no embeddable poems are available.
local function cmd_select(flags)
  local n_sample = tonumber(flags.sample) or 500
  local n_anchor = tonumber(flags.anchors) or 8
  local seed = tonumber(flags.seed) or 12345

  local poems = load_poems()
  if #poems == 0 then error("no embeddable poems found") end

  -- Seeded Fisher-Yates over an index list, then take the first N. Shuffling
  -- (vs striding) removes any bias from how poems.json happens to be ordered.
  local rand = lcg(seed)
  local order = {}
  for i = 1, #poems do order[i] = i end
  for i = #order, 2, -1 do
    local j = math.floor(rand() * i) + 1
    order[i], order[j] = order[j], order[i]
  end
  n_sample = math.min(n_sample, #poems)
  local sample = {}
  for i = 1, n_sample do sample[i] = poems[order[i]] end

  -- Sort a copy by length. table.sort is not stable, so poems of equal length
  -- would otherwise land in an implementation-defined order and the anchors
  -- could differ between Lua builds. Ties are broken by position in the
  -- (seeded, hence reproducible) sample to make the ordering total.
  local by_len, sample_pos = {}, {}
  for i = 1, #sample do
    by_len[i] = sample[i]
    sample_pos[sample[i]] = i
  end
  table.sort(by_len, function(a, b)
    if a.length ~= b.length then return a.length < b.length end
    return sample_pos[a] < sample_pos[b]
  end)

  local anchors = {}
  n_anchor = math.min(n_anchor, #by_len)
  for k = 1, n_anchor do
    -- Centre of the k-th of n_anchor equal-width slices of the sorted list.
    local pos = math.floor((k - 0.5) / n_anchor * #by_len) + 1
    if pos > #by_len then pos = #by_len end
    anchors[k] = by_len[pos].poem_index
  end

  utils.ensure_directory(EVAL_DIR)
  write_json(EVAL_DIR .. "/sample.json", {
    seed = seed, sample_size = #sample, anchor_count = #anchors,
    anchors = anchors, sample = sample,
  })
  print(string.format("[select] sample=%d anchors=%d seed=%d -> %s/sample.json",
    #sample, #anchors, seed, EVAL_DIR))
end
-- }}}

170

-- {{{ local function cmd_embed(flags)
-- Embed the sample with the model currently served by llama.cpp and write
-- <model dir>/sample-embeddings.json.
--
-- flags.server        : config entry name (required; selects the prompt prefix)
-- flags.model         : model identifier sent in the request (required)
-- flags.chunk         : poems per request (default 16; must be >= 1, floored)
-- flags["max-tokens"] : optional override of the per-chunk token budget
--
-- Requests are batched so one over-large request cannot blow the server's
-- batch limits; a missing vector is a hard error (a partial space would
-- silently corrupt every ranking), not a skipped poem. Invalid --chunk or
-- --max-tokens values are rejected up front for the same reason: a zero or
-- negative chunk would make the batch loop run zero times and write an empty
-- embeddings file, and an unparsable --max-tokens would erase the budget.
local function cmd_embed(flags)
  local server = flags.server or error("--server NAME required")
  local model = flags.model or error("--model NAME required")
  local chunk = tonumber(flags.chunk) or 16
  if chunk < 1 then
    error("--chunk must be a number >= 1, got " .. tostring(flags.chunk))
  end
  -- A fractional step would index the sample with non-integers.
  chunk = math.floor(chunk)

  -- Late require: fuzzy-computing pulls in the embedding stack; only this step
  -- needs it, so select/report stay light and runnable without a server.
  local inference = require("inference-server-config")
  inference.set_project_root(DIR)
  inference.set_selected_server(server)
  -- Select the model too, so format_embedding_prompt resolves THIS model's
  -- prefix from the server's available_models (nomic clusters, gemma has its
  -- own clustering prompt, mxbai none) -- not the server default's.
  inference.set_selected_model(model)
  local fuzzy = require("fuzzy-computing")
  local endpoint = inference.build_host_url()
  local format_fn = inference.format_embedding_prompt
  -- Chunk to the LOADED model's context budget. mxbai-embed-large caps at 512
  -- tokens (BERT-large), while nomic and gemma allow ~2048; a poem longer than
  -- the cap is split and its chunk vectors averaged -- exactly how the real
  -- pipeline embeds long poems (Issue 10-050). Computed once per model here so
  -- we don't re-query /tokenize's budget on every batch.
  local count_fn = fuzzy.make_token_counter(endpoint)
  local max_tokens = fuzzy.embedding_chunk_budget(endpoint, format_fn)
  -- embedding_chunk_budget derives from a fixed MODEL_CONTEXT_TOKENS constant
  -- (sized for nomic/gemma's ~2048). Some models have a SMALLER trained context
  -- than that and than the server's launch --ctx-size, so their real limit must
  -- be capped explicitly or the chunker emits chunks the server rejects
  -- ("exceed_context_size_error"). mxbai-embed-large is BERT-large: 512 tokens.
  local model_ctx_cap = {
    ["mxbai-embed-large-v1"] = 500, -- 512 trained ctx, minus specials, headroom
  }
  if flags["max-tokens"] then
    -- Reject a bare or non-numeric flag rather than silently passing nil on.
    max_tokens = tonumber(flags["max-tokens"])
      or error("--max-tokens must be a number, got " .. tostring(flags["max-tokens"]))
  end
  if model_ctx_cap[model] then max_tokens = math.min(max_tokens, model_ctx_cap[model]) end

  local sample_doc = read_json(EVAL_DIR .. "/sample.json")
    or error("run `select` first: missing " .. EVAL_DIR .. "/sample.json")
  local sample = sample_doc.sample

  local embeddings = {}
  local total = #sample
  for start = 1, total, chunk do
    local stop = math.min(start + chunk - 1, total)
    local texts, idxs = {}, {}
    for i = start, stop do
      texts[#texts + 1] = sample[i].content
      idxs[#idxs + 1] = sample[i].poem_index
    end
    local vecs, err = fuzzy.embed_texts_with_chunking(texts, model, {
      endpoint = endpoint, format_fn = format_fn,
      count_fn = count_fn, max_tokens = max_tokens,
    })
    if not vecs then error("embedding batch failed: " .. tostring(err)) end
    for j = 1, #texts do
      if not vecs[j] then
        error(string.format("missing vector for poem_index %s (model %s)",
          tostring(idxs[j]), model))
      end
      -- Keyed by string so the table round-trips through JSON unchanged.
      embeddings[tostring(idxs[j])] = vecs[j]
    end
    io.write(string.format("\r[embed %s] %d/%d", model, stop, total)); io.flush()
  end
  io.write("\n")

  -- Dimensionality is read off any one vector (0 when nothing was embedded).
  local dim = 0
  for _, v in pairs(embeddings) do dim = #v; break end
  utils.ensure_directory(model_dir(model))
  write_json(model_dir(model) .. "/sample-embeddings.json", {
    model = model, server = server, dimensions = dim, embeddings = embeddings,
  })
  print(string.format("[embed] %s: %d vectors x %d dims", model, total, dim))
end
-- }}}

250

-- {{{ local function esc(s)
-- HTML-escape a value for insertion into the report. The argument is passed
-- through tostring first, so numbers are accepted. The five characters that
-- are significant in HTML text and attribute values are replaced by entities:
--   &  ->  &amp;    <  ->  &lt;    >  ->  &gt;    "  ->  &quot;    '  ->  &#39;
-- The single quote is included because the report's own markup uses
-- single-quoted attributes. Returns exactly one value (the escaped string).
local function esc(s)
  -- Outer parentheses drop gsub's second result (the substitution count).
  return (tostring(s):gsub("[&<>\"']", {
    ["&"] = "&amp;",
    ["<"] = "&lt;",
    [">"] = "&gt;",
    ["\""] = "&quot;",
    ["'"] = "&#39;",
  }))
end
-- }}}

259

-- {{{ local function snippet(text, n)
-- One-line preview of a poem: every run of whitespace collapses to a single
-- space, leading whitespace is removed, and the result is clipped to at most
-- n bytes followed by a UTF-8 ellipsis. Keeps the side-by-side columns
-- scannable.
--
-- The clip never splits a multi-byte UTF-8 character: if the cut would land
-- inside one, it backs up to the start of that character, so the preview may
-- be a few bytes shorter than n but is always valid UTF-8 (the report page
-- declares charset=utf-8).
local function snippet(text, n)
  local s = tostring(text):gsub("%s+", " "):gsub("^%s+", "")
  if #s > n then
    local cut = n
    -- While the first dropped byte is a continuation byte (10xxxxxx), the
    -- cut sits in the middle of a character; move it left.
    while cut > 0 do
      local b = s:byte(cut + 1)
      if b >= 0x80 and b < 0xC0 then
        cut = cut - 1
      else
        break
      end
    end
    s = s:sub(1, cut) .. "\226\128\166" -- UTF-8 ellipsis
  end
  return s
end
-- }}}

269

-- {{{ local function cmd_report(flags)
-- Read the sample plus each model's embeddings and render the side-by-side
-- report (EVAL_DIR/comparison-report.html) together with metrics.json.
--
-- flags.models   : required, ordered comma-separated column list (also the
--                  column labels); surrounding whitespace is trimmed
-- flags["top-k"] : neighbours shown per anchor and model (default 10)
--
-- For each anchor the pool is ranked per model, the top-K shown, and pairwise
-- agreement computed (Kendall's tau + top-K overlap). Per model the
-- "personality" signals are averaged over anchors so a human can see WHAT each
-- model rewards. Raises when sample.json or any model's embeddings are missing.
local function cmd_report(flags)
  local top_k = tonumber(flags["top-k"]) or 10
  local models_csv = flags.models or error("--models a,b,c required")
  local models = {}
  for m in models_csv:gmatch("[^,]+") do
    -- Parenthesised so only the trimmed name (not gsub's count) is stored.
    models[#models + 1] = (m:gsub("^%s+", ""):gsub("%s+$", ""))
  end

  local sample_doc = read_json(EVAL_DIR .. "/sample.json")
    or error("missing sample.json -- run select + embed first")

  -- poem_index -> content/length, for snippets, lexical overlap, length bias.
  local text_of, len_of = {}, {}
  for _, p in ipairs(sample_doc.sample) do
    text_of[p.poem_index] = p.content
    len_of[p.poem_index] = p.length
    text_of[tostring(p.poem_index)] = p.content -- tolerate string/number keys
    len_of[tostring(p.poem_index)] = p.length
  end

  -- Load each model's pool (poem_index -> vector). Keys are strings in JSON
  -- and are kept as strings; anchors are converted with tostring to match.
  local pools, dims = {}, {}
  for _, m in ipairs(models) do
    local doc = read_json(model_dir(m) .. "/sample-embeddings.json")
      or error("missing embeddings for model '" .. m .. "' -- run embed for it")
    local pool = {}
    for k, v in pairs(doc.embeddings) do pool[k] = v end
    pools[m] = pool
    dims[m] = doc.dimensions
  end

  -- Per-anchor rankings, and per-model personality accumulators.
  local anchors = sample_doc.anchors
  local rankings = {} -- rankings[anchor][model] = sorted list
  local pers = {}     -- pers[model] = { jacc={}, lenr={}, score={} }
  for _, m in ipairs(models) do pers[m] = { jacc = {}, lenr = {}, score = {} } end

  for _, anchor in ipairs(anchors) do
    local akey = tostring(anchor)
    rankings[akey] = {}
    for _, m in ipairs(models) do
      local avec = pools[m][akey]
      -- A model with no vector for this anchor simply gets no ranking; the
      -- renderer below shows "no embedding" for that column.
      if avec then
        local r = evaluator.rank_anchor(avec, pools[m], akey, top_k)
        rankings[akey][m] = r
        local pp = evaluator.personality(text_of[akey] or "", len_of[akey] or 0,
          r, text_of, len_of, top_k)
        pers[m].jacc[#pers[m].jacc + 1] = pp.mean_jaccard
        pers[m].lenr[#pers[m].lenr + 1] = pp.mean_len_ratio
        pers[m].score[#pers[m].score + 1] = pp.mean_score
      end
    end
  end

  -- ---- render HTML ----
  -- Fragments are collected in a table and joined once at the end.
  local h = {}
  local function w(s) h[#h + 1] = s end
  w([[<!DOCTYPE html><html lang="en"><head><meta charset="utf-8">]])
  w([[<meta name="viewport" content="width=device-width, initial-scale=1">]])
  w("<title>Embedding model comparison</title>")
  w([[<style>
  :root{--bg:#0f1117;--card:#181b24;--ink:#e6e8ee;--mut:#9aa3b2;--line:#2a2f3a;--hi:#7cd}
  *{box-sizing:border-box} body{margin:0;background:var(--bg);color:var(--ink);
  font:15px/1.5 -apple-system,Segoe UI,Roboto,sans-serif;padding:24px}
  h1{font-size:22px;margin:0 0 4px} .sub{color:var(--mut);margin:0 0 20px}
  .legend{background:var(--card);border:1px solid var(--line);border-radius:10px;
  padding:14px 16px;margin:0 0 24px;max-width:1100px}
  table{border-collapse:collapse;width:100%} td,th{border:1px solid var(--line);
  padding:8px 10px;vertical-align:top;text-align:left}
  th{color:var(--mut);font-weight:600;font-size:13px}
  .pers td{font-variant-numeric:tabular-nums}
  .anchor{background:var(--card);border:1px solid var(--line);border-radius:10px;
  padding:16px;margin:26px 0 10px;max-width:1100px}
  .anchor .meta{color:var(--mut);font-size:13px;margin-bottom:6px}
  .anchor .text{white-space:pre-wrap;font-size:15px}
  .cols{display:grid;gap:14px;grid-template-columns:repeat(var(--n),1fr);max-width:1100px}
  .col{background:var(--card);border:1px solid var(--line);border-radius:10px;overflow:hidden}
  .col h3{margin:0;padding:10px 12px;background:#1f2430;font-size:14px;border-bottom:1px solid var(--line)}
  .col ol{margin:0;padding:8px 8px 10px 30px} .col li{margin:0 0 8px;font-size:13px}
  .sc{color:var(--hi);font-variant-numeric:tabular-nums}
  .jac{color:var(--mut);font-size:11px} .agree{color:var(--mut);font-size:13px;
  margin:8px 0 0;max-width:1100px}
  .shared{outline:2px solid #3b6;outline-offset:-2px;border-radius:4px;padding:1px 3px}
  code{color:#cdb}
  </style>]])
  w("</head><body>")
  w("<h1>What does each model think \"similar\" means?</h1>")
  w(string.format([[<p class="sub">Sample of %d poems, %d anchors, seed %s. Each model embedded the SAME poems; for each anchor we show its nearest neighbours per model.</p>]],
    sample_doc.sample_size, sample_doc.anchor_count, tostring(sample_doc.seed)))

  -- legend / how to read
  -- NOTE: this is a long-bracket string, in which backslash escapes are NOT
  -- interpreted, so the em dash is written as the HTML entity &mdash; rather
  -- than as a \ddd byte escape (which would appear verbatim on the page).
  w([[<div class="legend"><b>How to read this.</b> Cosine scores are only
  comparable <i>within</i> a column (each model has its own space). The
  interesting thing is <b>which poems</b> each model picks and where they
  <b>disagree</b>. Green outline = a poem two or more models both chose for
  this anchor. The personality table below is data, not a verdict:
  <code>lexical overlap</code> = average shared-word fraction between an
  anchor and its top matches (high = the model rewards surface wording;
  low = it rewards something deeper &mdash; meaning/theme/tone);
  <code>length ratio</code> = how close in length its picks are (near 1 =
  a length bias).</div>]])

  -- personality summary
  w("<h2 style='max-width:1100px'>Model personalities (averaged over anchors)</h2>")
  w("<table class='pers' style='max-width:1100px'><tr><th>model</th><th>dims</th>"
    .. "<th>lexical overlap</th><th>length ratio</th><th>mean top-K cosine</th></tr>")
  for _, m in ipairs(models) do
    w(string.format("<tr><td>%s</td><td>%s</td><td>%.3f</td><td>%.3f</td><td>%.3f</td></tr>",
      esc(m), tostring(dims[m]),
      evaluator.mean(pers[m].jacc) or 0,
      evaluator.mean(pers[m].lenr) or 0,
      evaluator.mean(pers[m].score) or 0))
  end
  w("</table>")

  -- per anchor
  local metrics = { models = models, top_k = top_k, anchors = {} }
  for _, anchor in ipairs(anchors) do
    local akey = tostring(anchor)
    w("<div class='anchor'>")
    w(string.format("<div class='meta'>Anchor — poem #%s, %s chars</div>",
      esc(akey), tostring(len_of[akey] or "?")))
    w("<div class='text'>" .. esc(snippet(text_of[akey] or "", 600)) .. "</div></div>")

    -- which poems are shared across >=2 models for this anchor (for green outline)
    local count_in = {}
    for _, m in ipairs(models) do
      local r = rankings[akey][m]
      if r then
        for _, e in ipairs(r) do
          count_in[e.poem_index] = (count_in[e.poem_index] or 0) + 1
        end
      end
    end

    -- One grid column per model; --n feeds the CSS grid-template-columns.
    w(string.format("<div class='cols' style='--n:%d'>", #models))
    for _, m in ipairs(models) do
      w("<div class='col'><h3>" .. esc(m) .. "</h3><ol>")
      local r = rankings[akey][m]
      if r then
        for _, e in ipairs(r) do
          local cls = (count_in[e.poem_index] or 0) >= 2 and " class='shared'" or ""
          local jac = evaluator.lexical_jaccard(text_of[akey] or "", text_of[e.poem_index] or "")
          w(string.format("<li><span class='sc'>%.3f</span> "
            .. "<span class='jac'>(words %.0f%%)</span><br><span%s>%s</span></li>",
            e.score, jac * 100, cls, esc(snippet(text_of[e.poem_index] or "", 140))))
        end
      else
        w("<li><i>no embedding</i></li>")
      end
      w("</ol></div>")
    end
    w("</div>")

    -- pairwise agreement line + collect metrics
    local amx = { anchor = anchor, pairs = {} }
    local parts = {}
    for i = 1, #models - 1 do
      for j = i + 1, #models do
        local ra, rb = rankings[akey][models[i]], rankings[akey][models[j]]
        if ra and rb then
          local shared = evaluator.topk_agreement(ra, rb, top_k)
          local tau = evaluator.kendall_tau(ra, rb)
          -- tau may be nil; in that case the tau part is left out of the line.
          parts[#parts + 1] = string.format("%s vs %s: %d/%d shared%s",
            models[i], models[j], shared, top_k,
            tau and string.format(", \207\132=%.2f", tau) or "")
          amx.pairs[#amx.pairs + 1] = {
            a = models[i], b = models[j], shared = shared, kendall_tau = tau,
          }
        end
      end
    end
    w("<p class='agree'>Agreement — " .. esc(table.concat(parts, "  |  ")) .. "</p>")
    metrics.anchors[#metrics.anchors + 1] = amx
  end

  w("</body></html>")

  utils.ensure_directory(EVAL_DIR)
  local html_path = EVAL_DIR .. "/comparison-report.html"
  local fh = assert(io.open(html_path, "w"), "cannot write " .. html_path)
  fh:write(table.concat(h)); fh:close()
  write_json(EVAL_DIR .. "/metrics.json", metrics)
  print("[report] wrote " .. html_path)
end
-- }}}

459

-- {{{ dispatch
-- arg[1] is the project DIR (consumed at the top of the file), arg[2] names
-- the subcommand and everything from arg[3] on is parsed as flags. An unknown
-- or missing subcommand prints the usage line to stderr and exits with 1.
local sub = arg[2]
local flags = parse_flags(arg, 3)

local commands = {
  select = cmd_select,
  embed = cmd_embed,
  report = cmd_report,
}

local run = commands[sub]
if not run then
  io.stderr:write("usage: model-comparison.lua DIR {select|embed|report} [--flags]\n")
  os.exit(1)
end
run(flags)
-- }}}

471