src/hubness-experiment.lua

182 lines

1#!/usr/bin/env luajit
-- Image-embedding measurement tool (Issue 9-013). Turn the seam knob and watch
-- three different things move at once, for an image synthesized between two
-- chronological neighbours (a = BEFORE poem, b = AFTER poem). Note: every
-- section below draws its (a, b) pairs at random from the embeddings file via
-- math.random, rather than walking actual chronological neighbours.
--
--   1. CENTRALITY -- cosine to the corpus centroid (the centre of mass). High =
--      a HUB that floods similar lists. Compares the dimensional cross-cut
--      against a weighted average across the whole 0..100% blend.
--   2. STRUCTURE/TEXTURE -- cosine of the cross-cut to the BEFORE poem (its
--      "structure", the front dims) and the AFTER poem (its "texture", the back
--      dims). Shows how the structure-vs-texture balance tracks the seam.
--   3. NEIGHBOURHOOD -- Jaccard overlap of the cross-cut image's top-K similar
--      set with the BEFORE poem's top-K vs the AFTER poem's top-K. The
--      user-facing question: whose *similar page* does the image land in? Top-K
--      is a cliff, so this can flip more sharply than the smooth cosine.
--
-- A fourth section, FREQUENCY, is not a seam sweep: it fixes the seam at 50% and
-- counts how often real poems vs injected images land in other items' top-K
-- (k-occurrence), for the cross-cut and for a plain midpoint average.
--
-- Prefer running this over trusting a number written in a doc. Knobs below.
18local DIR = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
19package.path = DIR .. "/libs/?.lua;" .. package.path
20local dk = require("dkjson")
21local utils = require("utils") -- Issue 10-054: read embeddings from wherever they live (RAM or disk)
22
-- {{{ knobs
-- Tunables for the four sections below; edit here, not inline.
local PAIR_SAMPLES = 2000   -- pairs drawn per seam step in the cheap sweeps (1, 2)
local NBR_PAIRS    = 30     -- pairs used by the costly neighbourhood sweep (3)
local CORPUS       = 1500   -- size of the corpus sample that top-K rankings use
local TOPK         = 20     -- number of entries on a "similar page"
local SEED         = 12345  -- base value handed to math.randomseed
-- }}}
30
io.write("loading embeddings... "); io.flush()
-- Read and decode the embeddings file. Each step is checked so a missing file,
-- malformed JSON, or an empty array fails with a descriptive message instead
-- of an opaque "attempt to index a nil value".
local E
do
  local path = utils.embeddings_dir() .. "/embeddings.json"
  local fh = assert(io.open(path, "r"))   -- assert surfaces io.open's own error text
  local raw = fh:read("*a")
  fh:close()                              -- release the handle before the decode
  -- dkjson's decode returns (object, position, error message).
  local doc, _, err = dk.decode(raw)
  if type(doc) ~= "table" then
    error(string.format("could not decode %s: %s", path, tostring(err)))
  end
  E = doc.embeddings
  if type(E) ~= "table" or #E == 0
     or type(E[1]) ~= "table" or type(E[1].embedding) ~= "table" then
    error(path .. ": expected a non-empty 'embeddings' array of records with an 'embedding' array")
  end
end
-- N = number of records, D = dimensionality taken from the first record.
local N, D = #E, #E[1].embedding
print(string.format("%d vectors, %d dims", N, D))
35
36-- {{{ vector helpers
--- Scale a vector to unit Euclidean length.
-- An all-zero (or empty) vector cannot be normalised and is handed back as the
-- very same table; any other input yields a freshly built table.
local function l2(v)
  local len = #v
  local sumsq = 0
  for i = 1, len do
    sumsq = sumsq + v[i] * v[i]
  end
  if sumsq == 0 then
    return v
  end
  local scale = 1 / math.sqrt(sumsq)
  local unit = {}
  for i = 1, len do
    unit[i] = v[i] * scale
  end
  return unit
end
--- Inner product of two vectors; iterates over the length of `a`.
local function dot(a, b)
  local acc = 0
  for i = 1, #a do
    acc = acc + a[i] * b[i]
  end
  return acc
end
--- Dimensional cross-cut of two vectors.
-- Dims 1..floor(D * fr) are taken from `a`, the remaining dims from `b`, and
-- the result is normalised with l2.
local function seam_blend(a, b, fr)
  local cut = math.floor(D * fr)
  local mixed = {}
  for i = 1, D do
    if i <= cut then
      -- front part comes from `a`; a missing entry falls back to `b`
      mixed[i] = a[i] or b[i]
    else
      mixed[i] = b[i]
    end
  end
  return l2(mixed)
end
--- Weighted average of two vectors: fr parts of `a` to (1 - fr) parts of `b`,
-- computed over D dims and normalised with l2.
local function weight_blend(a, b, fr)
  local rest = 1 - fr
  local mixed = {}
  for i = 1, D do
    mixed[i] = fr * a[i] + rest * b[i]
  end
  return l2(mixed)
end
51-- }}}
52
-- Corpus centroid (per-dimension mean of every embedding, then unit length),
-- followed by a fixed, seeded corpus sample that the top-K rankings draw on.
local cen
do
  local sum = {}
  for i = 1, D do sum[i] = 0 end
  for _, rec in ipairs(E) do
    local vec = rec.embedding
    for i = 1, D do
      sum[i] = sum[i] + vec[i]
    end
  end
  for i = 1, D do
    sum[i] = sum[i] / N
  end
  cen = l2(sum)
end

math.randomseed(SEED)
-- Each slot is an independent draw, so the same embedding may occur more than once.
local corpus = {}
for slot = 1, CORPUS do
  local pick = math.random(N)
  corpus[slot] = E[pick].embedding
end
61
62-- {{{ topk_set(q) -> set of corpus indices most similar to q
--- Rank the fixed corpus sample against a query and return the best entries as a set.
-- Scores are plain dot products (a cosine only if the vectors are unit length --
-- presumably true of the stored embeddings; confirm against the producer).
-- @param q  query vector
-- @return   table mapping corpus index -> true for the min(TOPK, CORPUS) best matches
local function topk_set(q)
  local scored = {}
  for i = 1, CORPUS do scored[i] = { i, dot(q, corpus[i]) } end
  table.sort(scored, function(x, y) return x[2] > y[2] end)
  local set = {}
  -- Clamp to what is actually available: with TOPK > CORPUS the old fixed bound
  -- indexed past the end of `scored` and raised a nil-index error.
  for r = 1, math.min(TOPK, #scored) do set[scored[r][1]] = true end
  return set
end
--- Jaccard overlap of two sets stored as key -> truthy tables.
-- Returns |x ∩ y| / |x ∪ y|, or 0 when both sets are empty.
local function jaccard(x, y)
  local shared, total = 0, 0
  -- every key of x is in the union; those also flagged in y are shared
  for key in pairs(x) do
    total = total + 1
    if y[key] then shared = shared + 1 end
  end
  -- keys that only y has complete the union
  for key in pairs(y) do
    if x[key] == nil then total = total + 1 end
  end
  if total == 0 then
    return 0
  end
  return shared / total
end
78-- }}}
79
print("\n========================================================================")
print(" 1. CENTRALITY (cosine to corpus centroid; higher = more of a hub)")
print("========================================================================")
do
  math.randomseed(SEED)

  -- Reference point: mean centroid score over PAIR_SAMPLES randomly drawn real embeddings.
  local baseline = 0
  for _ = 1, PAIR_SAMPLES do
    baseline = baseline + dot(E[math.random(N)].embedding, cen)
  end
  baseline = baseline / PAIR_SAMPLES
  print(string.format(" baseline (real poem) = %.4f\n", baseline))

  -- Relative difference to the baseline, in percent.
  local function vs_baseline(x)
    return (x - baseline) / baseline * 100
  end

  print(" before% crooked vs base weighted vs base")
  for step = 0, 10 do
    local fr = step / 10
    local seam_sum, avg_sum = 0, 0
    -- A fresh batch of random pairs is drawn for every seam position.
    for _ = 1, PAIR_SAMPLES do
      local before = E[math.random(N)].embedding
      local after = E[math.random(N)].embedding
      seam_sum = seam_sum + dot(seam_blend(before, after, fr), cen)
      avg_sum = avg_sum + dot(weight_blend(before, after, fr), cen)
    end
    local seam_mean = seam_sum / PAIR_SAMPLES
    local avg_mean = avg_sum / PAIR_SAMPLES
    print(string.format(" %4d%% %.4f %+5.1f%% %.4f %+5.1f%%",
      step * 10, seam_mean, vs_baseline(seam_mean), avg_mean, vs_baseline(avg_mean)))
  end
end
103
print("\n========================================================================")
print(" 2. STRUCTURE / TEXTURE (cross-cut's cosine to BEFORE vs AFTER poem)")
print("========================================================================")
print(" before% cos->BEFORE cos->AFTER (they cross where structure=texture)")
do
  math.randomseed(SEED)
  for step = 0, 10 do
    local fr = step / 10
    -- running sums of the cross-cut's score against each parent
    local to_before, to_after = 0, 0
    for _ = 1, PAIR_SAMPLES do
      local before = E[math.random(N)].embedding
      local after = E[math.random(N)].embedding
      local img = seam_blend(before, after, fr)
      to_before = to_before + dot(img, before)
      to_after = to_after + dot(img, after)
    end
    local mean_before = to_before / PAIR_SAMPLES
    local mean_after = to_after / PAIR_SAMPLES
    print(string.format(" %4d%% %.3f %.3f", step * 10, mean_before, mean_after))
  end
end
121
print("\n========================================================================")
print(string.format(" 3. NEIGHBOURHOOD (top-%d similar-set Jaccard overlap; %d pairs)", TOPK, NBR_PAIRS))
print("========================================================================")
print(" before% J(img, BEFORE-page) J(img, AFTER-page)")
do
  math.randomseed(SEED + 1)

  -- The parents' own neighbour sets do not depend on the seam, so build them
  -- once per pair and reuse them at every seam position.
  local samples = {}
  for p = 1, NBR_PAIRS do
    local before = E[math.random(N)].embedding
    local after = E[math.random(N)].embedding
    samples[p] = {
      a = before,
      b = after,
      aset = topk_set(before),
      bset = topk_set(after),
    }
  end

  for step = 0, 10 do
    local fr = step / 10
    local with_before, with_after = 0, 0
    for p = 1, NBR_PAIRS do
      local sample = samples[p]
      local img_set = topk_set(seam_blend(sample.a, sample.b, fr))
      with_before = with_before + jaccard(img_set, sample.aset)
      with_after = with_after + jaccard(img_set, sample.bset)
    end
    print(string.format(" %4d%% %.3f %.3f", step * 10, with_before / NBR_PAIRS, with_after / NBR_PAIRS))
  end
end
146
print("\n========================================================================")
print(string.format(" 4. FREQUENCY (k-occurrence: times each item lands in others' top-%d)", TOPK))
print("========================================================================")
print(" the real question: do images show up MORE than poems? (goal: no). This")
print(" builds a corpus of real poems + injected images and counts appearances.")
do
  math.randomseed(SEED + 2)
  local KP, KI = 600, 90 -- poems + images in the test corpus (kept small: O(n^2))

  -- Draw the poems and the image parent pairs once, so both methods below are
  -- measured on exactly the same material.
  local base_poems, pair_list = {}, {}
  for i = 1, KP do base_poems[i] = E[math.random(N)].embedding end
  for i = 1, KI do pair_list[i] = { E[math.random(N)].embedding, E[math.random(N)].embedding } end

  -- Comparator hoisted out of the query loop: highest score first.
  local function by_score_desc(x, y) return x[2] > y[2] end

  --- Mean k-occurrence of poems and of images in a mixed corpus.
  -- @param mkimg  function(a, b) -> vector; builds an image from a parent pair
  -- @return mean top-K appearance count of poems, then the same for images
  local function kocc(mkimg)
    local items = {}
    for i = 1, KP do items[i] = { v = base_poems[i], img = false } end
    for i = 1, KI do
      items[KP + i] = { v = mkimg(pair_list[i][1], pair_list[i][2]), img = true }
    end
    local M = #items
    local occ = {}
    for i = 1, M do occ[i] = 0 end

    for q = 1, M do
      -- score every other item against the query (the query itself is skipped)
      local sc = {}
      local qv = items[q].v
      for j = 1, M do
        if j ~= q then sc[#sc + 1] = { j, dot(qv, items[j].v) } end
      end
      table.sort(sc, by_score_desc)
      -- Clamp to the candidates that exist: with TOPK >= M the old fixed bound
      -- indexed past the end of `sc` and raised a nil-index error.
      for r = 1, math.min(TOPK, #sc) do
        local hit = sc[r][1]
        occ[hit] = occ[hit] + 1
      end
    end

    local sp, np, si, ni = 0, 0, 0, 0
    for i = 1, M do
      if items[i].img then
        si = si + occ[i]; ni = ni + 1
      else
        sp = sp + occ[i]; np = np + 1
      end
    end
    return sp / np, si / ni
  end

  print(" method poems images images vs poems")
  -- Midpoint average: the shared weighted-average helper at fr = 0.5, which is
  -- the same (a + b) / 2 then normalise that this section used to spell out.
  local pp, im = kocc(function(a, b) return weight_blend(a, b, 0.5) end)
  print(string.format(" midpoint average %5.1f %5.2f %+5.0f%% (the old flooding)", pp, im, (im / pp - 1) * 100))
  local pc, ic = kocc(function(a, b) return seam_blend(a, b, 0.5) end)
  print(string.format(" crooked 50%% (live) %5.1f %5.2f %+5.0f%% (current setting)", pc, ic, (ic / pc - 1) * 100))
end
print("")
182