libs/model-evaluator.lua
1-- {{{ model-evaluator.lua
2-- Pure comparison + statistics for the embedding-model evaluation framework
3-- (Issue 10-031). Given several models' embeddings of the SAME pool of poems,
4-- it answers: for a given anchor poem, what does each model call "most similar",
5-- how much do the models agree, and -- the interesting part -- what KIND of
6-- similarity does each model seem to reward (surface word overlap vs something
7-- deeper, and does it favor poems of similar length)?
8--
9-- General description (for a CEO): three judges each rank the same line-up of
10-- poems by "how like this one is it". This module measures how often the judges
11-- agree, where they sharply disagree, and what each judge seems to care about --
12-- so a human can read three columns side by side and SEE that one judge rewards
13-- shared wording while another rewards shared meaning.
14--
15-- Deliberately has NO IO and NO model/server knowledge: it takes plain Lua
16-- tables (vectors and text) and returns plain Lua tables. That keeps the math
17-- testable in isolation and lets the generation/orchestration layer own all the
18-- messy parts (servers, files). Data generation and data viewing stay apart.
19-- }}}
20
21local M = {}
22
-- {{{ local function cosine(a, b)
-- Cosine similarity between two vectors, iterating over the indices 1..#a.
--
-- Both vectors are expected to have the same length: b is read at every index
-- of a, so a shorter b raises an arithmetic-on-nil error and any extra
-- components of a longer b are simply never looked at.
--
-- Different models may embed into spaces of different dimensionality; that is
-- harmless because a score is only ever produced from two vectors of the SAME
-- model. Scores are therefore per-model numbers -- only the rankings derived
-- from them are meant to be compared between models.
--
-- Returns 0.0 when either vector has zero squared norm (all zeros, or empty),
-- otherwise dot(a, b) / (|a| * |b|).
local function cosine(a, b)
  local inner, sq_norm_a, sq_norm_b = 0.0, 0.0, 0.0
  local dims = #a
  for d = 1, dims do
    local ad, bd = a[d], b[d]
    inner = inner + ad * bd
    sq_norm_a = sq_norm_a + ad * ad
    sq_norm_b = sq_norm_b + bd * bd
  end
  -- A vector with no magnitude has no direction, so similarity is defined as
  -- 0.0 here instead of dividing by zero.
  if sq_norm_a ~= 0 and sq_norm_b ~= 0 then
    local denominator = math.sqrt(sq_norm_a) * math.sqrt(sq_norm_b)
    return inner / denominator
  end
  return 0.0
end
39M.cosine = cosine
40-- }}}
41
-- {{{ function M.rank_anchor(anchor_vec, pool, exclude_index, top_k)
-- Score every entry of `pool` against `anchor_vec` and return them nearest
-- first.
--
--   anchor_vec    : the anchor poem's embedding vector.
--   pool          : map of poem_index -> vector (walked with pairs, so it does
--                   not have to be a sequence).
--   exclude_index : key to leave out of the result (the anchor itself).
--   top_k         : optional cap on the result length; nil/false keeps all.
--
-- Returns a fresh array of { poem_index = n, score = s } ordered by score
-- descending. Equal scores are ordered by poem_index ascending, which makes the
-- output independent of pairs() traversal order and therefore reproducible
-- between runs. The poem_index keys must be comparable with `<` for that
-- tiebreak to work.
function M.rank_anchor(anchor_vec, pool, exclude_index, top_k)
  local candidates = {}
  for poem_index, vector in pairs(pool) do
    if poem_index ~= exclude_index then
      candidates[#candidates + 1] = {
        poem_index = poem_index,
        score = cosine(anchor_vec, vector),
      }
    end
  end

  -- Strict ordering: higher score wins; on a tie the smaller index wins.
  local function nearer_first(lhs, rhs)
    if lhs.score == rhs.score then
      return lhs.poem_index < rhs.poem_index
    end
    return lhs.score > rhs.score
  end
  table.sort(candidates, nearer_first)

  -- Trim from the tail so the surviving entries keep their sorted positions.
  if top_k and top_k < #candidates then
    for position = #candidates, top_k + 1, -1 do
      candidates[position] = nil
    end
  end
  return candidates
end
-- }}}
65
-- {{{ function M.topk_agreement(rank_a, rank_b, k)
-- Count the poems that appear in BOTH rankings' top-k for the same anchor
-- (set overlap, order ignored). The headline "do they even pick the same
-- poems" number.
--
--   rank_a, rank_b : arrays of { poem_index = ..., ... }, nearest first.
--   k              : how many leading entries of each ranking to consider.
--                    Optional: when nil, each ranking is used in full (the same
--                    convention M.personality uses for its own k). A k larger
--                    than a ranking is clamped to that ranking's length.
--
-- Returns an integer count. Each poem is counted at most once, so the result is
-- a true set-overlap size even if a ranking happens to repeat a poem_index.
function M.topk_agreement(rank_a, rank_b, k)
  local limit_a = math.min(k or #rank_a, #rank_a)
  local limit_b = math.min(k or #rank_b, #rank_b)

  -- Poems from rank_a's slice that have not yet been matched by rank_b.
  local unmatched = {}
  for i = 1, limit_a do
    unmatched[rank_a[i].poem_index] = true
  end

  local shared = 0
  for i = 1, limit_b do
    local idx = rank_b[i].poem_index
    if unmatched[idx] then
      shared = shared + 1
      unmatched[idx] = nil -- consume it so a repeated index is not counted twice
    end
  end
  return shared
end
-- }}}
79
-- {{{ function M.kendall_tau(rank_a, rank_b)
-- Kendall rank correlation between two rankings, computed over the poems that
-- occur in both of them: +1 = same order, -1 = exactly reversed, values near 0
-- = no consistent relationship.
--
-- Each ranking is an array of { poem_index = ... } entries; an entry's array
-- position is its rank. Only poems present in both arrays take part (callers
-- typically pass top-N slices, which need not contain the same poems). Every
-- pair of shared poems is classified as concordant (both rankings order the
-- pair the same way) or discordant (they order it oppositely), and the result
-- is (concordant - discordant) / (concordant + discordant). Pairs that tie in
-- either ranking are left out of both counts; with distinct positions no such
-- pair arises, so this matches tau-b in that case.
--
-- Returns tau, n where n is the number of shared poems. tau is nil when n < 2,
-- or when no pair could be classified, because the statistic is undefined then.
-- Cost is quadratic in n, which is acceptable for small top-N slices.
function M.kendall_tau(rank_a, rank_b)
  local where_a, where_b = {}, {}
  for position, entry in ipairs(rank_a) do
    where_a[entry.poem_index] = position
  end
  for position, entry in ipairs(rank_b) do
    where_b[entry.poem_index] = position
  end

  -- Parallel arrays: xs[i] / ys[i] are the positions of the i-th shared poem in
  -- rank_a / rank_b. Their order is irrelevant because every unordered pair is
  -- visited exactly once below.
  local xs, ys = {}, {}
  local n = 0
  for poem, at_a in pairs(where_a) do
    local at_b = where_b[poem]
    if at_b then
      n = n + 1
      xs[n], ys[n] = at_a, at_b
    end
  end
  if n < 2 then
    return nil, n
  end

  local agree, disagree = 0, 0
  for i = 1, n - 1 do
    local xi, yi = xs[i], ys[i]
    for j = i + 1, n do
      -- The sign of the product tells whether both rankings order the pair
      -- the same way (> 0) or oppositely (< 0); zero means a tie.
      local product = (xi - xs[j]) * (yi - ys[j])
      if product > 0 then
        agree = agree + 1
      elseif product < 0 then
        disagree = disagree + 1
      end
    end
  end

  local decided = agree + disagree
  if decided == 0 then
    return nil, n
  end
  return (agree - disagree) / decided, n
end
-- }}}
112
-- {{{ local function word_set(text)
-- Build the set of lowercased word tokens found in `text`. Used for the
-- lexical-overlap signal: the crudest, most "surface" notion of similarity,
-- which is exactly why it is a useful contrast to what the neural models do.
--
--   text : the poem text. nil yields an empty set (instead of being stringified
--          into the literal word "nil"); any other non-string value goes
--          through tostring first.
--
-- Returns a table whose keys are the words and whose values are all true.
--
-- A token is a run of %w characters and apostrophes, so contractions such as
-- "isn't" stay whole. Apostrophes at the very start or end of a token are
-- stripped, which makes a quoted 'rose' the same word as rose and discards
-- tokens that consist of nothing but apostrophes.
local function word_set(text)
  local set = {}
  if text == nil then
    return set
  end
  for token in tostring(text):lower():gmatch("[%w']+") do
    -- Keep interior apostrophes, drop leading/trailing ones.
    local word = token:match("^'*(.-)'*$")
    if word ~= "" then
      set[word] = true
    end
  end
  return set
end
-- }}}
125
-- {{{ function M.lexical_jaccard(text_a, text_b)
-- Jaccard similarity of the two texts' word sets: |A n B| / |A u B|, a value
-- in [0,1].
--
-- Reading the number: high means the poems literally share much of their
-- vocabulary (surface/structural kinship); low means they share little, so a
-- model that still ranked them as similar is rewarding something other than
-- shared wording -- meaning, theme, tone.
--
-- Returns 0.0 when both word sets are empty, since the ratio is undefined
-- there.
function M.lexical_jaccard(text_a, text_b)
  local words_a, words_b = word_set(text_a), word_set(text_b)

  local size_a, shared = 0, 0
  for word in pairs(words_a) do
    size_a = size_a + 1
    if words_b[word] then
      shared = shared + 1
    end
  end

  local size_b = 0
  for _ in pairs(words_b) do
    size_b = size_b + 1
  end

  -- Inclusion-exclusion: |A u B| = |A| + |B| - |A n B|.
  local union = size_a + size_b - shared
  if union == 0 then
    return 0.0
  end
  return shared / union
end
-- }}}
147
-- {{{ function M.personality(anchor_text, anchor_len, ranked, texts, lengths, k)
-- Turn a model's top-k matches for one anchor into interpretable signals.
--
--   anchor_text : text of the anchor poem.
--   anchor_len  : length of the anchor (nil is treated as unknown).
--   ranked      : array of { poem_index = ..., score = ... }, nearest first.
--   texts       : map poem_index -> text. Matches with no text are skipped
--                 entirely and do not count towards n.
--   lengths     : map poem_index -> length; may be nil or incomplete.
--   k           : how many leading matches to use; defaults to, and is clamped
--                 to, #ranked.
--
-- Returns a table:
--   mean_jaccard   : average word-overlap between the anchor and its matches.
--                    Higher => this model leans on shared wording.
--   mean_len_ratio : average of min(len)/max(len) over the matches for which
--                    BOTH lengths are known and positive, so it lies in (0,1]
--                    whenever n_len > 0. Near 1 => the model's favourites are
--                    close in length to the anchor. A match with a missing or
--                    zero length is left out of this average rather than being
--                    counted as a ratio of 0, which would drag the mean down
--                    and look like a real cross-length preference. It is 0
--                    when no match had usable lengths.
--   mean_score     : average cosine score of the matches -- only comparable to
--                    the same model's other anchors, not across models.
--   n              : number of matches that had text and were averaged.
--   n_len          : number of matches that contributed to mean_len_ratio.
-- These are descriptive, not verdicts: the report shows them so a human can
-- judge.
function M.personality(anchor_text, anchor_len, ranked, texts, lengths, k)
  k = math.min(k or #ranked, #ranked)
  local la = anchor_len or 0
  local sum_j, sum_lr, sum_s = 0.0, 0.0, 0.0
  local n, n_len = 0, 0
  for i = 1, k do
    local entry = ranked[i]
    local idx = entry.poem_index
    local t = texts[idx]
    if t then
      n = n + 1
      sum_j = sum_j + M.lexical_jaccard(anchor_text, t)
      sum_s = sum_s + entry.score
      local lb = (lengths and lengths[idx]) or 0
      -- Only pairs with two known, positive lengths have a meaningful ratio.
      if la > 0 and lb > 0 then
        n_len = n_len + 1
        sum_lr = sum_lr + (math.min(la, lb) / math.max(la, lb))
      end
    end
  end
  if n == 0 then
    return { mean_jaccard = 0, mean_len_ratio = 0, mean_score = 0, n = 0, n_len = 0 }
  end
  local mean_len_ratio = 0.0
  if n_len > 0 then
    mean_len_ratio = sum_lr / n_len
  end
  return {
    mean_jaccard = sum_j / n,
    mean_len_ratio = mean_len_ratio,
    mean_score = sum_s / n,
    n = n,
    n_len = n_len,
  }
end
-- }}}
184
-- {{{ function M.mean(list)
-- Arithmetic mean of the numeric entries of an array, or nil when there is
-- nothing to average (so callers can render "n/a" rather than divide by zero
-- -- nil-as-error, not nil-as-zero).
--
--   list : array of numbers. Entries that are nil or false are skipped and do
--          not count towards the divisor. If the table carries a numeric `n`
--          field (the table.pack convention) it is used as the length, which
--          lets a list with nil gaps be averaged in full; otherwise #list is
--          used.
--
-- The scan is by index rather than ipairs, so a nil entry is skipped instead of
-- silently ending the loop and dropping every value after it.
function M.mean(list)
  local last = #list
  if type(list.n) == "number" then
    last = list.n
  end
  local sum, count = 0.0, 0
  for i = 1, last do
    local v = list[i]
    if v then
      sum = sum + v
      count = count + 1
    end
  end
  if count == 0 then
    return nil
  end
  return sum / count
end
-- }}}
197
198return M
199