src/augment-embeddings-with-images.test.lua
1-- Tests for augment-embeddings-with-images.lua (Issue 9-013 redesign).
2-- Run: luajit src/augment-embeddings-with-images.test.lua [DIR]
3-- Part 1: pure logic on fixtures. Part 2: a read-only sanity pass over the real
4-- data (counts + idempotency) WITHOUT writing anything.
5local DIR = arg[1] or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
6package.path = DIR .. "/src/?.lua;" .. DIR .. "/libs/?.lua;" .. package.path
7_G.AUGMENT_NO_MAIN = true -- suppress the script's main() on require
8local A = require("augment-embeddings-with-images")
10
-- Running tally of assertions; check() updates it and the end-of-file summary
-- reads it.
local passed, failed = 0, 0

-- Record the outcome of one named assertion.
-- name: label printed when the assertion fails.
-- cond: any Lua value; nil/false count as a failure, everything else passes.
local function check(name, cond)
  if not cond then
    failed = failed + 1
    print(" FAIL: " .. name)
    return
  end
  passed = passed + 1
end
-- True when two numbers differ by strictly less than the 1e-6 tolerance.
local function approx(a, b)
  local delta = a - b
  return math.abs(delta) < 1e-6
end
-- Element-wise approximate equality of two numeric arrays.
-- v: actual vector; e: expected vector.
-- Returns true only when both arguments are tables of equal length and every
-- pair of elements passes approx(). A non-table argument (for example a
-- missing embedding) yields false instead of raising on `#v`, so the calling
-- check is reported as a FAIL and the remaining checks still run.
local function vapprox(v, e)
  if type(v) ~= "table" or type(e) ~= "table" then return false end
  if #v ~= #e then return false end
  for i = 1, #v do
    if not approx(v[i], e[i]) then return false end
  end
  return true
end
-- 1/sqrt(2): the component value used in the expected neighbor-average
-- vectors of the Part 1 checks, e.g. (0, R2, R2).
local SQRT_TWO = math.sqrt(2)
local R2 = 1 / SQRT_TWO
22
23-- {{{ Part 1: fixtures
-- Build a fresh set of fixture inputs for A.augment.
-- Returns three tables: embeddings ({ embeddings = {...} }),
-- poems ({ poems = {...} }) and the image catalog ({ images = {...} }).
local function make_inputs()
  -- One fediverse post record; media_url (optional) attaches a single PNG.
  local function post(index, text, date, media_url)
    local record = {
      poem_index = index,
      id = index,
      category = "fediverse",
      content = text,
      creation_date = date,
    }
    if media_url then
      record.attachments = { { url = media_url, media_type = "image/png" } }
    end
    return record
  end

  -- Vector i belongs to poem i. The fourth is the image-only post, whose
  -- stored vector is junk derived from the 🖼 placeholder text.
  local vectors = {
    {1,0,0},
    {0,1,0},
    {0,0,1},
    {0.9,0.1,0},
  }
  local entries = {}
  for i, vec in ipairs(vectors) do
    entries[i] = { embedding = vec, poem_index = i, id = i }
  end

  local posts = {
    post(1, "a real thought about the world", "2024-01-01T00:00:00Z"),
    post(2, "another genuine sentence here", "2024-01-03T00:00:00Z"),
    post(3, "text with an attached picture below", "2024-01-05T00:00:00Z", "/m/x.png"),
    post(4, "🖼", "2024-01-04T00:00:00Z", "/m/y.png"),
  }

  -- A single standalone catalog image, dated between poem 1 and poem 2.
  local image = {
    source_name = "my-art",
    hash = "abc123",
    filename = "factory-cube.png",
    source_directory = "/p/input/images/my-art",
    relative_path = "/p/input/images/my-art/factory-cube.png",
    modification_date = "2024-01-02T00:00:00Z",
    width = 100,
    height = 100,
  }

  return { embeddings = entries }, { poems = posts }, { images = { image } }
end
45
-- Run the augmentation once over the fixtures; the checks below all inspect
-- this one result (augmented entries, image manifest, per-class report).
local emb, poems, catalog = make_inputs()
local out, manifest, report = A.augment(emb, poems, catalog)

check("class1 (text+image) counted", report.class1 == 1)
check("class2 (image-only) counted", report.class2 == 1)
check("class3 (standalone) appended", report.class3 == 1)
check("no skips", report.skipped == 0)
check("output has 5 entries (4 poems + 1 image)", #out == 5)

-- Entry and manifest lookups below fall back to an empty table, so a missing
-- record is reported as a FAIL by check() instead of aborting the whole run
-- with an "attempt to index a nil value" error.

-- p4 (image-only @ Jan-4) sits between p2 (Jan-3) and p3 (Jan-5): avg -> (0,R2,R2).
local p4 = out[4] or {}
check("class-2 embedding replaced with neighbor average",
  p4.embedding ~= nil and vapprox(p4.embedding, {0, R2, R2}))
-- catalog image @ Jan-2 sits between p1 (Jan-1) and p2 (Jan-3): avg -> (R2,R2,0).
local appended = out[5] or {}
check("class-3 appended embedding is the midpoint",
  appended.embedding ~= nil and vapprox(appended.embedding, {R2, R2, 0}))
check("appended entry flagged is_image", appended.is_image == true)
check("appended entry id prefixed", appended.id == "img-abc123")
check("appended entry got a fresh poem_index", appended.poem_index == 5)

-- The checks look manifest records up under string keys ("4", "5").
local m4 = manifest["4"] or {}
local m5 = manifest["5"] or {}
check("manifest marks poem 4 as class-2 image", m4.class == 2)
check("manifest marks poem 5 as class-3 image", m5.class == 3)
check("manifest carries the qualified title", m5.display_title == "my-art: factory-cube.png")

-- Idempotency: feed the output back in (as if re-running on an augmented file).
local emb2 = { embeddings = out, metadata = {} }
local out2, _, report2 = A.augment(emb2, poems, catalog)
check("idempotent: same entry count on re-run", #out2 == #out)
check("idempotent: same class counts", report2.class2 == 1 and report2.class3 == 1)
local p4_rerun = out2[4] or {}
check("idempotent: class-2 embedding stable",
  p4_rerun.embedding ~= nil and vapprox(p4_rerun.embedding, {0, R2, R2}))
75-- }}}
76
77-- {{{ Part 2: real-data sanity (read-only)
-- Read a whole file and decode it as JSON.
-- p: path of the file to load.
-- Returns nil when the file cannot be opened; otherwise returns whatever
-- dkjson's decode() returns for the file contents.
local function read_json(p)
  local handle = io.open(p, "r")
  if not handle then
    return nil
  end
  local text = handle:read("*a")
  handle:close()
  return require("dkjson").decode(text)
end
-- Real-data pass: load the project's embeddings, poems and image catalog and
-- run the same augmentation on them in memory. This block itself only reads
-- files. When any of the three inputs cannot be loaded, the pass is skipped
-- rather than counted as a failure.
--
-- `utils` is normally expected to be in scope already; the fallback only
-- fires when it is not, and never shadows an existing binding with a
-- different value.
-- NOTE(review): the module name "utils" is assumed from the identifier — confirm.
local utils = utils or require("utils")
local real_emb = read_json(utils.embeddings_dir() .. "/embeddings.json")
local real_poems = read_json(DIR .. "/assets/poems.json")
local real_cat = read_json(DIR .. "/assets/image-catalog.json")
if real_emb and real_poems and real_cat then
  local rout, rmanifest, rreport = A.augment(real_emb, real_poems, real_cat)
  print(string.format("\n[real data] class1=%d class2=%d class3=%d skipped=%d (%d -> %d entries)",
    rreport.class1, rreport.class2, rreport.class3, rreport.skipped, #real_emb.embeddings, #rout))
  -- Range checks rather than exact counts: the expected values (~52, ~692)
  -- are approximate, so each bound leaves some slack.
  check("real: image-only posts found (~52)", rreport.class2 >= 40 and rreport.class2 <= 70)
  check("real: standalone images appended (~692)", rreport.class3 >= 600 and rreport.class3 <= 720)
  -- Compared against the poem count, not the input embedding count.
  check("real: output grew by class3", #rout == #real_poems.poems + rreport.class3)
  -- idempotency on real data (re-run on the augmented set)
  local rout2 = A.augment({ embeddings = rout, metadata = {} }, real_poems, real_cat)
  check("real: idempotent entry count", #rout2 == #rout)
else
  print("\n[real data] skipped (data files not all present)")
end
99-- }}}
100
-- Print the pass/fail tally, then exit with status 0 only when every check
-- passed (status 1 otherwise).
local summary = string.format("\naugment-embeddings: %d passed, %d failed", passed, failed)
print(summary)
if failed == 0 then
  os.exit(0)
else
  os.exit(1)
end
103