src/augment-embeddings-with-images.lua
1#!/usr/bin/env luajit
2-- augment-embeddings-with-images.lua
3--
4-- Issue 9-013 (redesign) pipeline hook. Runs AFTER poem embeddings exist
5-- (Stage 6) and BEFORE the GPU similarity stage (Stage 7). It gives every
6-- TEXT-LESS image a "pseudo-embedding" (the normalized average of the poem
7-- before and after it in time) so images rank on similar/different pages like
8-- poems. See src/image-pseudo-embeddings.lua for the pure math.
9--
-- Three image classes (decided from the data, see issue 9-013):
--   1. text + image post                      -> keep the post's real text embedding (left alone)
--   2. image-only post                        -> REPLACE its useless 🖼 embedding with a pseudo
--   3. standalone catalog image (my-art, ...) -> APPEND a new pseudo entry
14--
-- Outputs (idempotent — safe to re-run):
--   embeddings.json     : augmented in place (class-2 replaced, class-3
--                         appended with is_image=true). The similarity stage
--                         reads this unchanged.
--   image-manifest.json : poem_index -> render data for every image entry,
--                         so the HTML renderer can draw an image box instead
--                         of looking the index up in poems.json (where the
--                         class-3 pseudo-poems do not exist).
23--
24-- Run from any directory: luajit src/augment-embeddings-with-images.lua [DIR]
25
-- {{{ Setup directory + module path
-- Project root: the first command-line argument when given, otherwise the
-- developer's default checkout. The global `arg` is only provided by the
-- standalone interpreter, so it is guarded to keep this file loadable from
-- hosts that do not define it.
local DIR = (type(arg) == "table" and arg[1])
  or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
-- Put the project's bundled libs and sources ahead of the default search path
-- so the require() calls below resolve against DIR.
package.path = DIR .. "/libs/?.lua;" .. DIR .. "/src/?.lua;" .. package.path
29
30local dkjson = require("dkjson")
31local pseudo = require("image-pseudo-embeddings")
33-- }}}
34
35local M = {}
36
-- {{{ local function read_json()
--- Read a file and decode it as JSON.
-- Failures are reported uniformly as `nil, message`, so callers can
-- distinguish "cannot open" from "invalid JSON" by the message text.
-- @param path  path of the JSON file
-- @return the decoded value on success
-- @return nil, string  on failure (cannot open / cannot read / invalid JSON)
local function read_json(path)
  local f = io.open(path, "r")
  if not f then return nil, "cannot open " .. path end

  local text = f:read("*a")
  f:close()
  if not text then return nil, "cannot read " .. path end

  -- dkjson.decode yields (value, pos) on success and (nil, pos, message) on
  -- failure; collapse that into the nil-plus-message convention used above
  -- instead of leaking the numeric position as the second return value.
  local data, _, decode_err = dkjson.decode(text)
  if data == nil then
    return nil, "invalid JSON in " .. path .. ": "
      .. tostring(decode_err or "document is null")
  end
  return data
end
-- }}}
45
-- {{{ local function write_json()
--- Serialize `data` as compact JSON and write it to `path`, replacing the file.
-- Raises an error when encoding, opening, writing or closing fails.
-- @param path  destination file path
-- @param data  value handed to dkjson.encode
local function write_json(path, data)
  -- Encode BEFORE opening: io.open(path, "w") truncates the target, so an
  -- error raised by the encoder after opening would leave an empty file.
  local encoded = dkjson.encode(data, { indent = false })

  local f = assert(io.open(path, "w"))
  local wrote, write_err = f:write(encoded)
  -- Always close, even if the write failed, then report the first failure.
  local closed, close_err = f:close()
  assert(wrote, write_err)
  assert(closed, close_err)
end
-- }}}
53
-- {{{ function M.parse_iso8601()
--- Minimal ISO-8601 -> Unix epoch conversion.
-- Used to put text poems and image-only posts on the same numeric timeline as
-- catalog images (which already carry a unix modification_time). Per the
-- original note this is meant to match poem-extractor's parser, so the
-- matching rules are deliberately kept as they were.
--
-- Accepted shapes: "YYYY-MM-DDTHH:MM:SS" (anything after the seconds is
-- ignored) or a bare "YYYY-MM-DD", which is taken as 12:00:00. The fields are
-- handed to os.time(), so they are interpreted in the local time zone.
--
-- @param ts  timestamp string
-- @return number  epoch seconds; 0 for nil, empty, non-string or unparseable
--                 input, or when os.time() cannot represent the date
function M.parse_iso8601(ts)
  -- Non-strings have no :match method; treat them like any unparseable value.
  if type(ts) ~= "string" or ts == "" then return 0 end

  local y, mo, d, h, mi, s =
    ts:match("(%d%d%d%d)%-(%d%d)%-(%d%d)T(%d%d):(%d%d):(%d%d)")
  if not y then
    y, mo, d = ts:match("(%d%d%d%d)%-(%d%d)%-(%d%d)")
    if not y then return 0 end
    h, mi, s = 12, 0, 0
  end

  -- Convert the captures explicitly rather than relying on os.time coercing
  -- numeric strings.
  local epoch = os.time({
    year = tonumber(y), month = tonumber(mo), day = tonumber(d),
    hour = tonumber(h), min = tonumber(mi), sec = tonumber(s),
  })
  -- os.time may return nil for a date it cannot represent; keep the contract
  -- numeric so timestamp comparisons downstream never see nil.
  return epoch or 0
end
-- }}}
71
72-- {{{ function M.is_image_only()
73-- Class 2 detector. Strips whitespace and the image-placeholder emojis the
74-- collection uses, then treats <10 remaining characters as "no usable text".
75-- Mirrors poem-extractor's is_image_only_post so classification is consistent.
76function M.is_image_only(content)
-- Returns true when fewer than 10 bytes of `content` remain after removing
-- whitespace runs and every byte matched by the bracket class below. A nil
-- `content` is treated as the empty string.
-- NOTE(review): in this copy the bracket class reads as mis-encoded text and is
-- split across two physical lines, which leaves the string literal
-- unterminated as shown. Presumably it held the image-placeholder emoji
-- mentioned in the header comment -- restore it from the original file.
-- NOTE(review): Lua patterns operate on bytes, so a class made of multi-byte
-- UTF-8 characters matches each of their bytes individually rather than whole
-- characters, and `#stripped` is a byte count, not a character count.
77 local stripped = (content or ""):gsub("%s+", ""):gsub("[πΌπ·πΈπ¨π
πππποΈ]", "")
78 return #stripped < 10
79end
80-- }}}
81
-- {{{ function M.rel_below_source()
--- The image path below its source directory, for the
-- "source: sub: name.png" title.
-- Catalog `relative_path` is actually absolute, so the `source_directory`
-- prefix is stripped from it.
--
-- @param img  catalog record; reads relative_path, file_path,
--             source_directory and filename
-- @return string  the path below source_directory when the image lies inside
--                 it; otherwise img.filename, or the full path as a last resort
function M.rel_below_source(img)
  local full = img.relative_path or img.file_path or ""
  local base = img.source_directory or ""

  if base ~= "" and full:sub(1, #base) == base then
    local rest = full:sub(#base + 1)
    -- Accept the prefix only at a directory boundary, so that "/a/art" does
    -- not claim "/a/art-old/x.png" and produce "-old/x.png".
    local at_boundary = base:sub(-1) == "/"
      or rest == ""
      or rest:sub(1, 1) == "/"
    if at_boundary then
      rest = rest:gsub("^/+", "")
      -- An empty remainder (path equals the directory) is useless as a title;
      -- fall through to the filename instead.
      if rest ~= "" then return rest end
    end
  end

  return img.filename or full
end
-- }}}
95
-- {{{ function M.augment()
--- Pure-ish core: operates on already-loaded tables and returns the augmented
-- embeddings list and the image manifest. No file I/O (so it is testable).
-- Not fully pure: class-2 entries inside `embeddings_data` have their
-- `embedding` field overwritten in place.
--
-- The pseudo-embeddings themselves come from
-- pseudo.compute_image_pseudo_embeddings(spine, imgs), which is called once
-- for class 2 and once for class 3. This function reads `.image` (the record
-- it passed in), `.embedding` and, for class 3, `.display_title` from each
-- result, and counts the second return value as skipped images.
--
-- @param embeddings_data { embeddings = [ {embedding, poem_index, id, ...}, ... ] }
-- @param poems_data      { poems = [ {poem_index, id, category, content,
--                                     creation_date, attachments}, ... ] }
-- @param catalog_data    { images = [ {source_name, modification_time, ...}, ... ] }
-- @return out       array: kept poem entries followed by the appended class-3 entries
-- @return manifest  table keyed by tostring(poem_index) -> render record
-- @return report    counters { class1, class2, class3, skipped }
function M.augment(embeddings_data, poems_data, catalog_data)
  local report = { class1 = 0, class2 = 0, class3 = 0, skipped = 0 }

  -- Poem metadata by poem_index (content, date, attachments).
  -- max_index is taken over EVERY poem in poems.json, not only those that
  -- have an embedding, so an index handed to an appended image can never
  -- collide with a real poem that happens to lack an embedding entry.
  local poem_by_index, max_index = {}, 0
  for _, p in ipairs(poems_data.poems or {}) do
    local idx = p.poem_index
    -- A poem without an index cannot be looked up (and a nil key would raise).
    if idx ~= nil then
      poem_by_index[idx] = p
      if type(idx) == "number" and idx > max_index then max_index = idx end
    end
  end

  -- Drop any previously-appended image entries so re-runs are idempotent;
  -- keep only the genuine poem entries (poem_index present in poems.json).
  local poem_entries = {}
  for _, e in ipairs(embeddings_data.embeddings or {}) do
    if not e.is_image and poem_by_index[e.poem_index] then
      poem_entries[#poem_entries + 1] = e
    end
  end

  -- Classify poems and build the chronological SPINE (text poems only:
  -- ordinary poems + class-1 text+image posts; class-2 image-only posts are
  -- excluded so an image never averages over another image).
  local spine, class2_entries = {}, {}
  for _, e in ipairs(poem_entries) do
    local p = poem_by_index[e.poem_index]
    local has_att = p.attachments and #p.attachments > 0
    if has_att and M.is_image_only(p.content) then
      class2_entries[#class2_entries + 1] = { entry = e, poem = p }
      report.class2 = report.class2 + 1
    else
      if has_att then report.class1 = report.class1 + 1 end
      spine[#spine + 1] = {
        poem_index = e.poem_index,
        timestamp = M.parse_iso8601(p.creation_date),
        embedding = e.embedding,
      }
    end
  end

  local manifest = {} -- tostring(poem_index) -> render record

  -- Class 2: replace each image-only post's embedding with its pseudo, in
  -- place (its entry keeps its poem_index, so it stays ranked).
  do
    local imgs = {}
    for _, c in ipairs(class2_entries) do
      imgs[#imgs + 1] = {
        id = c.entry.poem_index,
        source_name = c.poem.category or "fediverse",
        rel_below_source = nil,
        timestamp = M.parse_iso8601(c.poem.creation_date),
        -- Back-references so the result loop can reach the entry and poem.
        _entry = c.entry, _poem = c.poem,
      }
    end
    local results, skipped = pseudo.compute_image_pseudo_embeddings(spine, imgs)
    report.skipped = report.skipped + #skipped
    for _, r in ipairs(results) do
      local entry, poem = r.image._entry, r.image._poem
      entry.embedding = r.embedding -- replace in place
      manifest[tostring(entry.poem_index)] = {
        is_image = true, class = 2,
        poem_index = entry.poem_index,
        category = poem.category,
        source_id = poem.id,
        attachments = poem.attachments,
        creation_date = poem.creation_date,
      }
    end
  end

  -- Class 3: standalone catalog images become NEW appended entries. Skip
  -- fediverse-media (those are poem attachments, already handled as class 1/2).
  local appended = {}
  do
    local imgs = {}
    for _, im in ipairs(catalog_data.images or {}) do
      if im.source_name ~= "fediverse-media" then
        imgs[#imgs + 1] = {
          id = im.hash,
          source_name = im.source_name,
          rel_below_source = M.rel_below_source(im),
          -- Prefer the numeric mtime; fall back to parsing the date string.
          timestamp = tonumber(im.modification_time)
            or M.parse_iso8601(im.modification_date),
          _img = im,
        }
      end
    end
    local results, skipped = pseudo.compute_image_pseudo_embeddings(spine, imgs)
    report.skipped = report.skipped + #skipped
    for _, r in ipairs(results) do
      local src = r.image._img
      max_index = max_index + 1
      appended[#appended + 1] = {
        embedding = r.embedding,
        poem_index = max_index,
        id = "img-" .. tostring(r.image.id),
        is_image = true,
        content_length = 0,
      }
      report.class3 = report.class3 + 1
      manifest[tostring(max_index)] = {
        is_image = true, class = 3,
        poem_index = max_index,
        source_name = r.image.source_name,
        display_title = r.display_title,
        relative_path = src.relative_path,
        source_directory = src.source_directory,
        width = src.width, height = src.height,
        creation_date = src.modification_date,
        -- Deep-link target on the chronological gallery page. MUST match
        -- generate-gallery-pages.lua's image_anchor() = "img-" .. hash:sub(1,12).
        gallery_anchor = "img-" .. tostring(r.image.id):sub(1, 12),
      }
    end
  end

  -- Final augmented list: poems (with class-2 replaced) + appended class-3.
  local out = {}
  for _, e in ipairs(poem_entries) do out[#out + 1] = e end
  for _, e in ipairs(appended) do out[#out + 1] = e end
  return out, manifest, report
end
-- }}}
224
225-- {{{ local function main()
-- main(): script entry point. Reads embeddings.json (emb_path),
-- assets/poems.json and, when readable, assets/image-catalog.json; runs
-- M.augment() on them; then overwrites embeddings.json with the augmented list
-- and writes image-manifest.json next to it, printing a summary of the report.
226local function main()
227 -- Resolve the model through the shared resolver instead of a hardcoded
228 -- default: it reads this run's overrides notepad (tmp/run-overrides.lua,
229 -- written by run.sh from --model) and falls back to config.lua -- so the CLI
230 -- override is honored and there is no source-code default to drift out of
231 -- sync with config. Works standalone too (no notepad => config default).
232 local inference_config = require("inference-server-config")
233 inference_config.set_project_root(DIR)
234 local model = inference_config.get_selected_model()
-- NOTE(review): `edir` is used below but never assigned in the visible source,
-- and `model` is not used anywhere else in this function. The embedded line
-- numbering jumps from 234 to 236, so the line that presumably derives `edir`
-- from DIR and `model` appears to be missing from this copy -- restore it
-- before running.
236 local emb_path = edir .. "/embeddings.json"
237 local manifest_path = edir .. "/image-manifest.json"
238
-- Both required inputs abort via assert when read_json returns nil.
-- NOTE(review): the character before "run Stage 6 first" looks like a
-- mis-encoded dash in this copy; confirm against the original file.
239 local embeddings_data = assert(read_json(emb_path), "missing embeddings.json β run Stage 6 first")
240 local poems_data = assert(read_json(DIR .. "/assets/poems.json"), "missing poems.json")
-- The catalog is optional: without it only class-2 (image-only post) entries
-- can be produced.
-- NOTE(review): read_json returns nil when the file cannot be opened and also
-- whenever dkjson.decode yields nil, so a malformed catalog presumably takes
-- this same "No image-catalog.json" branch -- confirm that is intended.
241 local catalog_data = read_json(DIR .. "/assets/image-catalog.json")
242 if not catalog_data then
243 print("[augment] No image-catalog.json; only image-only posts will be pseudo-embedded.")
244 catalog_data = { images = {} }
245 end
246
247 local out, manifest, report = M.augment(embeddings_data, poems_data, catalog_data)
-- Keep every other top-level field of embeddings.json; swap in the augmented
-- list and record this run's counters under metadata.
248 embeddings_data.embeddings = out
249 embeddings_data.metadata = embeddings_data.metadata or {}
250 embeddings_data.metadata.image_pseudo_embeddings = report
251
252 write_json(emb_path, embeddings_data)
253 write_json(manifest_path, { metadata = report, images = manifest })
254
255 print(string.format(
256 "[augment] class1(text+img) kept=%d class2(image-only) replaced=%d class3(standalone) appended=%d skipped=%d",
257 report.class1, report.class2, report.class3, report.skipped))
258 print(string.format("[augment] embeddings: %d entries -> %s", #out, emb_path))
259 print(string.format("[augment] manifest: %s", manifest_path))
260end
261-- }}}
262
-- Script entry point: main() is executed on load unless the global
-- AUGMENT_NO_MAIN has been set beforehand (the test does this so it can
-- require the module without touching any files).
local skip_main = _G.AUGMENT_NO_MAIN
if not skip_main then
  main()
end

return M