libs/model-evaluator.lua

199 lines

-- {{{ model-evaluator.lua
-- Pure comparison + statistics for the embedding-model evaluation framework
-- (Issue 10-031). Given several models' embeddings of the SAME pool of poems,
-- it answers: for a given anchor poem, what does each model call "most similar",
-- how much do the models agree, and -- the interesting part -- what KIND of
-- similarity does each model seem to reward (surface word overlap vs something
-- deeper, and does it favor poems of similar length)?
--
-- General description (for a CEO): three judges each rank the same line-up of
-- poems by "how like this one is it". This module measures how often the judges
-- agree, where they sharply disagree, and what each judge seems to care about --
-- so a human can read three columns side by side and SEE that one judge rewards
-- shared wording while another rewards shared meaning.
--
-- Deliberately has NO IO and NO model/server knowledge: it takes plain Lua
-- tables (vectors and text) and returns plain Lua tables. That keeps the math
-- testable in isolation and lets the generation/orchestration layer own all the
-- messy parts (servers, files). Data generation and data viewing stay apart.
-- }}}

local M = {}

-- {{{ local function cosine(a, b)
-- Cosine similarity of two equal-length numeric vectors (nominally in [-1, 1]).
--
-- The embedding spaces differ in dimensionality across models (768 vs 1024),
-- which is fine: cosine is only ever compared WITHIN a single model's space,
-- never across, so the absolute numbers are per-model and only their RANKINGS
-- are compared between models.
--
-- Parameters:
--   a, b : array-like tables of numbers with the same length.
-- Returns:
--   dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm (the
--   similarity is undefined there; 0.0 keeps such a vector out of the top ranks).
-- Raises:
--   an error (reported at the caller's position) when the lengths differ.
--   Without this check a longer `b` would be silently truncated to #a
--   components -- yielding a plausible but wrong number, e.g. when a 768-dim
--   vector is accidentally paired with a 1024-dim one -- and a shorter `b`
--   would fail with an opaque "arithmetic on a nil value".
local function cosine(a, b)
  local len = #a
  if #b ~= len then
    error(string.format("cosine: vector length mismatch (%d vs %d)", len, #b), 2)
  end

  local dot, na, nb = 0.0, 0.0, 0.0
  for i = 1, len do
    local x, y = a[i], b[i]
    dot = dot + x * y
    na = na + x * x
    nb = nb + y * y
  end

  -- A zero vector has no direction; avoid dividing by zero.
  if na == 0 or nb == 0 then return 0.0 end
  return dot / (math.sqrt(na) * math.sqrt(nb))
end
M.cosine = cosine
-- }}}

-- {{{ function M.rank_anchor(anchor_vec, pool, exclude_index, top_k)
-- Score every poem in `pool` against the anchor and return them nearest first.
--
-- Parameters:
--   anchor_vec    : the anchor poem's embedding vector.
--   pool          : map of poem_index -> vector (iterated with pairs, so the
--                   keys need not be contiguous).
--   exclude_index : key to leave out of the result (normally the anchor itself).
--   top_k         : optional cap on the number of results; nil keeps them all.
-- Returns:
--   a fresh array of { poem_index = n, score = s } sorted by descending cosine
--   score. Equal scores are ordered by ascending poem_index, which makes the
--   ranking deterministic so that two runs -- and the rank-correlation math
--   that consumes these rankings -- are reproducible. (The tiebreak compares
--   keys with `<`, so keys must be mutually comparable.)
function M.rank_anchor(anchor_vec, pool, exclude_index, top_k)
  local candidates, count = {}, 0
  for poem_index, vector in pairs(pool) do
    if poem_index ~= exclude_index then
      count = count + 1
      candidates[count] = {
        poem_index = poem_index,
        score = cosine(anchor_vec, vector),
      }
    end
  end

  -- Higher score wins; poem_index breaks exact ties.
  local function nearer_first(lhs, rhs)
    if lhs.score == rhs.score then
      return lhs.poem_index < rhs.poem_index
    end
    return lhs.score > rhs.score
  end
  table.sort(candidates, nearer_first)

  -- Drop everything past the requested cut-off, walking from the tail.
  if top_k and count > top_k then
    for position = count, top_k + 1, -1 do
      candidates[position] = nil
    end
  end
  return candidates
end
-- }}}

-- {{{ function M.topk_agreement(rank_a, rank_b, k)
-- How many poems appear in BOTH models' top-k for the same anchor (set overlap,
-- order ignored). The headline "do they even pick the same poems" number.
--
-- Parameters:
--   rank_a, rank_b : arrays of entries carrying a `poem_index` field, best
--                    match first.
--   k              : optional cut-off. Each ranking contributes at most its
--                    first k entries; when k is nil the full rankings are
--                    compared (mirroring how M.personality treats a missing k,
--                    instead of failing inside math.min).
-- Returns:
--   the count of rank_b's first-k entries whose poem_index also occurs among
--   rank_a's first-k entries.
function M.topk_agreement(rank_a, rank_b, k)
  -- math.min always yields a number (truthy, even 0), so and/or is safe here.
  local limit_a = k and math.min(k, #rank_a) or #rank_a
  local limit_b = k and math.min(k, #rank_b) or #rank_b

  local in_a = {}
  for i = 1, limit_a do in_a[rank_a[i].poem_index] = true end

  local shared = 0
  for i = 1, limit_b do
    if in_a[rank_b[i].poem_index] then shared = shared + 1 end
  end
  return shared
end
-- }}}

-- {{{ function M.kendall_tau(rank_a, rank_b)
-- Kendall rank correlation between two rankings, restricted to the poems that
-- occur in both: +1 = identical order, 0 = unrelated, -1 = reversed.
--
-- Only the intersection is considered because each model ranks just the pool it
-- embedded and callers usually pass top-N slices. Every pair of shared poems is
-- classified as concordant (same relative order in both rankings) or discordant
-- (opposite order); a pair tied in either ranking counts as neither. The pair
-- scan is O(n^2) in the shared set, which is fine for small top-N slices.
--
-- Parameters:
--   rank_a, rank_b : arrays of entries carrying a `poem_index` field; an
--                    entry's array position is its rank.
-- Returns two values:
--   tau : (concordant - discordant) / (concordant + discordant), or nil when
--         fewer than two poems are shared or no pair could be classified.
--   n   : the number of shared poems.
function M.kendall_tau(rank_a, rank_b)
  -- poem_index -> rank position, one lookup table per ranking
  local where_a = {}
  for position, entry in ipairs(rank_a) do
    where_a[entry.poem_index] = position
  end
  local where_b = {}
  for position, entry in ipairs(rank_b) do
    where_b[entry.poem_index] = position
  end

  -- Collect each shared poem as a { position in a, position in b } pair.
  -- pairs() order is unspecified, but the pair counts below do not depend on it.
  local shared, count = {}, 0
  for poem_index, in_a in pairs(where_a) do
    local in_b = where_b[poem_index]
    if in_b then
      count = count + 1
      shared[count] = { in_a, in_b }
    end
  end
  if count < 2 then
    return nil, count -- undefined with fewer than two shared items
  end

  local agree, disagree = 0, 0
  for first = 1, count - 1 do
    local p = shared[first]
    for second = first + 1, count do
      local q = shared[second]
      -- The product is positive when both rankings order the pair the same way.
      local sign = (p[1] - q[1]) * (p[2] - q[2])
      if sign > 0 then
        agree = agree + 1
      elseif sign < 0 then
        disagree = disagree + 1
      end
      -- sign == 0 (a tie) contributes to neither count
    end
  end

  local decided = agree + disagree
  if decided == 0 then
    return nil, count
  end
  return (agree - disagree) / decided, count
end
-- }}}

112

-- {{{ local function word_set(text)
-- Lowercased set of word tokens found in `text`. Used for the lexical-overlap
-- signal: it is the crudest, most "surface" notion of similarity there is, which
-- is exactly why it is useful as a contrast to what the neural models do.
--
-- A token is a run of alphanumerics and apostrophes, so contractions and
-- possessives such as "don't" or "rose's" stay whole. Apostrophes at the EDGES
-- of a token are stripped: there they act as quotation marks (or a trailing
-- plural possessive), and keeping them would make 'rose' and rose count as
-- different words -- and a lone ' count as a word -- which skews the overlap.
--
-- NOTE: %w and string.lower follow the C locale; in the default locale bytes of
-- non-ASCII (e.g. UTF-8 accented) letters are not matched by %w and therefore
-- act as separators.
--
-- Parameters:
--   text : any value; it is passed through tostring() first.
-- Returns:
--   a table mapping each distinct word to true.
local function word_set(text)
  local set = {}
  for token in tostring(text):lower():gmatch("[%w']+") do
    -- Trim leading/trailing apostrophes, keep interior ones.
    local word = token:match("^'*(.-)'*$")
    if word ~= "" then
      set[word] = true
    end
  end
  return set
end
-- }}}

125

-- {{{ function M.lexical_jaccard(text_a, text_b)
-- Jaccard overlap of the two poems' word sets: |A n B| / |A u B|, in [0,1].
--
-- Reading the number:
--   high => the poems literally share many words (surface/structural kinship);
--   low  => they share few words, so if a model still called them similar it
--           is rewarding something OTHER than shared vocabulary -- meaning,
--           theme, tone.
--
-- Parameters:
--   text_a, text_b : the two texts; tokenised by word_set.
-- Returns:
--   the overlap ratio, or 0.0 when neither text yields any word.
function M.lexical_jaccard(text_a, text_b)
  local words_a = word_set(text_a)
  local words_b = word_set(text_b)

  -- Split A into the part shared with B and the part unique to A ...
  local shared, only_a = 0, 0
  for word in pairs(words_a) do
    if words_b[word] then
      shared = shared + 1
    else
      only_a = only_a + 1
    end
  end

  -- ... then count what B adds on top of A.
  local only_b = 0
  for word in pairs(words_b) do
    if not words_a[word] then
      only_b = only_b + 1
    end
  end

  local union_size = shared + only_a + only_b
  if union_size == 0 then
    return 0.0
  end
  return shared / union_size
end
-- }}}

147

-- {{{ function M.personality(anchor_text, anchor_len, ranked, texts, lengths, k)
-- Turn a model's top-k matches for one anchor into interpretable signals:
--   mean_jaccard  : average word-overlap between the anchor and its top matches.
--                   Higher => this model leans on shared wording (surface/structure).
--   mean_len_ratio: average length similarity, min/max of the word counts, in (0,1].
--                   Near 1 => the model's favourites are close in length to the
--                   anchor (a length bias); lower => it pairs across lengths freely.
--                   Averaged ONLY over matches whose length (and the anchor's) is
--                   known and positive; 0 when there is no such match.
--   mean_score    : average cosine of the top matches (how "confident" / tight the
--                   neighbourhood is in this model's space -- only comparable to the
--                   same model's other anchors, not across models).
--   n             : number of matches that had a text and were therefore counted.
--   n_len         : number of those matches that contributed to mean_len_ratio.
-- These are descriptive, not verdicts: the report shows them so a human can judge.
--
-- Parameters:
--   anchor_text : text of the anchor poem.
--   anchor_len  : word count of the anchor (nil is treated as unknown).
--   ranked      : array of { poem_index = ..., score = ... }, best match first.
--   texts       : map poem_index -> text; matches without a text are skipped.
--   lengths     : map poem_index -> word count; missing entries are unknown.
--   k           : optional number of leading matches to inspect (default: all).
-- Returns:
--   a table with the fields above; every mean is 0 when no match was counted.
function M.personality(anchor_text, anchor_len, ranked, texts, lengths, k)
  k = math.min(k or #ranked, #ranked)
  local la = anchor_len or 0
  local sum_j, sum_lr, sum_s = 0.0, 0.0, 0.0
  local n, n_len = 0, 0

  for i = 1, k do
    local entry = ranked[i]
    local idx = entry.poem_index
    local t = texts[idx]
    if t then
      n = n + 1
      sum_j = sum_j + M.lexical_jaccard(anchor_text, t)
      sum_s = sum_s + entry.score

      local lb = lengths[idx] or 0
      if la > 0 and lb > 0 then
        sum_lr = sum_lr + (math.min(la, lb) / math.max(la, lb))
        -- Counted separately: dividing by `n` would treat every match with an
        -- unknown length as ratio 0 and drag the mean down.
        n_len = n_len + 1
      end
    end
  end

  if n == 0 then
    return { mean_jaccard = 0, mean_len_ratio = 0, mean_score = 0, n = 0, n_len = 0 }
  end

  local mean_len_ratio = 0.0
  if n_len > 0 then
    mean_len_ratio = sum_lr / n_len
  end
  return {
    mean_jaccard = sum_j / n,
    mean_len_ratio = mean_len_ratio,
    mean_score = sum_s / n,
    n = n,
    n_len = n_len,
  }
end
-- }}}

184

-- {{{ function M.mean(list)
-- Small helper: arithmetic mean of a numeric array, or nil if there is nothing
-- to average (so callers can render "n/a" rather than divide by zero --
-- nil-as-error, not nil-as-zero).
--
-- Missing (nil) and false entries are skipped rather than ending the scan.
-- ipairs stops at the first nil, so a list with holes -- e.g. per-anchor
-- M.kendall_tau results stored by position, where undefined taus are nil --
-- would otherwise silently lose every value after the first hole. Values are
-- summed in ascending key order so the floating-point result is deterministic
-- and identical to a plain left-to-right sum for hole-free arrays.
--
-- Parameters:
--   list : table whose positive-integer keys hold the numbers to average;
--          other keys are ignored.
-- Returns:
--   the mean of the present values, or nil when there are none.
function M.mean(list)
  -- Gather the positive integer keys that actually hold a value.
  local keys = {}
  for key, v in pairs(list) do
    if v and type(key) == "number" and key >= 1 and key % 1 == 0 then
      keys[#keys + 1] = key
    end
  end
  if #keys == 0 then return nil end
  table.sort(keys)

  local sum = 0.0
  for i = 1, #keys do
    sum = sum + list[keys[i]]
  end
  return sum / #keys
end
-- }}}

197

-- Module export: the table that carries cosine, rank_anchor, topk_agreement,
-- kendall_tau, lexical_jaccard, personality and mean.
return M

199