src/image-pseudo-embeddings.lua
1-- image-pseudo-embeddings.lua
2--
3-- Issue 9-013 (redesign): give every image a "pseudo-embedding" so it can be
4-- ranked on similar/different pages like a poem. An image carries no usable
5-- semantic vector of its own, so we synthesize one from WHERE it sits in time:
-- a cross-cut of the embeddings of the poem immediately before it and the poem
-- immediately after it chronologically: the leading dimensions come from the
-- earlier poem and the trailing dimensions from the later one (see
-- crooked_embedding below). That keeps the image "between two moments" without
-- averaging its two temporal neighbours toward the corpus centre.
10--
11-- This module is deliberately PURE: it takes poems-with-embeddings and image
12-- records that already carry a numeric `timestamp`, and returns image
13-- pseudo-poems. No file I/O, no date parsing, no GPU -- so it is unit-testable
14-- on tiny fixtures. The pipeline caller does the loading, the ISO-date ->
15-- timestamp conversion, and the join of embeddings.json to poems.json.
16
17local M = {}
18
19-- {{{ local function l2_normalize()
-- Rescale `vec` to unit Euclidean length.
--   vec : array of numbers
-- Cosine similarity ignores magnitude, but unit length keeps the
-- pseudo-embeddings on the same footing as the poem embeddings the downstream
-- cosine code expects.
-- Returns a NEW table and leaves `vec` untouched -- except when the squared
-- norm is zero (all-zero or empty input, which only degenerate data produces):
-- then the SAME table is handed back as-is, so nothing is ever divided by zero.
local function l2_normalize(vec)
    local dims = #vec

    -- Squared length first; the square root is only taken once it is known to
    -- be non-zero.
    local norm_sq = 0
    for k = 1, dims do
        norm_sq = norm_sq + vec[k] * vec[k]
    end
    if norm_sq == 0 then
        return vec
    end

    local scale = 1 / math.sqrt(norm_sq)
    local unit = {}
    for k = 1, dims do
        unit[k] = vec[k] * scale
    end
    return unit
end
33-- }}}
34
35-- {{{ local function crooked_embedding()
-- Synthesize an image's pseudo-embedding by SPLICING its two chronological
-- neighbours rather than averaging them: dimensions 1..seam are copied from the
-- poem BEFORE the image, the remaining dimensions from the poem AFTER it, and
-- the spliced vector is then L2-normalized.
--
-- Why a splice and not the midpoint? Averaging two unit vectors pulls the
-- result toward the corpus centre (measured: +12% closer to the centroid, with
-- the spread collapsing). That turns images into "hubs" that flood every poem's
-- similar list and never show up in the diversity-spread different lists. A
-- splice keeps each dimension at its full, real-poem magnitude, so the result
-- sits at the normal baseline centrality (measured: -0.1%) and ranks like an
-- ordinary poem.
--
-- nomic-embed-text-v1.5 is a Matryoshka model whose leading dimensions carry
-- the coarse meaning, so the splice reads poetically as "the image takes its
-- SUBJECT from the poem before it and its TEXTURE from the poem after it." The
-- seam position is a FLAVOUR knob: it shifts which poems the image resembles
-- but was measured NOT to affect hubness, so it is safe to tune for feel.
--
-- Parameters:
--   before : embedding (array of numbers) of the preceding poem, or nil
--   after  : embedding (array of numbers) of the following poem, or nil
-- At least one must be non-nil. With only one side present (an image before
-- the first poem or after the last) the result is simply that poem's direction.
-- The output length always follows `before` when both sides are given.
local SEAM_FRACTION = 0.5 -- 0.5 = half subject / half texture; lower leans toward the 'after' poem
local function crooked_embedding(before, after)
    if not (before and after) then
        -- Timeline end: exactly one neighbour. Copy it so the caller never
        -- receives a reference to the poem's own table, then normalize.
        local single = before or after
        local copy = {}
        for k = 1, #single do
            copy[k] = single[k]
        end
        return l2_normalize(copy)
    end

    local dims = #before
    local seam = math.floor(dims * SEAM_FRACTION)
    local spliced = {}
    for k = 1, dims do
        if k <= seam and before[k] then
            spliced[k] = before[k] -- "subject" half
        else
            spliced[k] = after[k]  -- "texture" half
        end
    end
    return l2_normalize(spliced)
end
69-- }}}
70
71-- {{{ function M.qualified_image_title()
-- Produce the colon-joined "full path" title shared with Issue 10-042d, e.g.
--   my-art: air-defence-drones-5.png
--   my-art: game-design: camera-idea.png
-- Parameters:
--   source_name      : name of the gallery source
--   rel_below_source : the image's path BELOW that source dir (subdirs +
--                      filename); may be nil
-- Leading slashes are stripped, then every remaining "/" becomes ": " so that
-- nesting reads as a breadcrumb instead of a URL. When nothing is left (nil,
-- empty, or slashes only) the bare source_name is returned.
function M.qualified_image_title(source_name, rel_below_source)
    local rel = rel_below_source or ""
    -- gsub also returns a match count; each single-target assignment keeps
    -- only the string.
    local trimmed = rel:gsub("^/+", "")
    local crumbs = trimmed:gsub("/", ": ")
    if crumbs ~= "" then
        return source_name .. ": " .. crumbs
    end
    return source_name
end
83-- }}}
84
85-- {{{ function M.find_chrono_neighbors()
-- Locate the poems that bracket time `t` in a timeline.
-- Parameters:
--   sorted_poems : array of records with a `.timestamp` field, ALREADY sorted
--                  ascending by that field (not re-checked here)
--   t            : the target time, comparable with the timestamps
-- Returns `before, after`:
--   * no poems at all              -> nil, nil
--   * a poem sits exactly at `t`   -> that poem for BOTH values, so a caller
--                                     combining the two sides gets that exact
--                                     moment instead of straddling it
--   * otherwise                    -> the last poem strictly before `t` and the
--                                     first poem after it; either is nil when
--                                     `t` falls off that end of the timeline
-- Uses a binary search, so each lookup costs O(log n).
function M.find_chrono_neighbors(sorted_poems, t)
    local count = #sorted_poems
    if count == 0 then
        return nil, nil
    end

    -- Lower-bound search over the closed range [lo, hi]. When the range
    -- empties, `lo` is the first index whose timestamp >= t, or count + 1 if
    -- every poem is earlier than t.
    local lo, hi = 1, count
    while lo <= hi do
        local mid = math.floor((lo + hi) / 2)
        if sorted_poems[mid].timestamp >= t then
            hi = mid - 1
        else
            lo = mid + 1
        end
    end

    local at_or_after = sorted_poems[lo] -- nil when lo == count + 1
    if at_or_after and at_or_after.timestamp == t then
        return at_or_after, at_or_after
    end
    -- Index lo - 1 is 0 when t precedes every poem, which reads back as nil.
    return sorted_poems[lo - 1], at_or_after
end
113-- }}}
114
115-- {{{ function M.compute_image_pseudo_embeddings()
-- Core entry point. Inputs:
--   poems  : array of { poem_index, timestamp (number), embedding (array) }
--   images : array of { id, source_name, rel_below_source, timestamp (number), ... }
-- Returns two arrays:
--   1. image pseudo-poems, one per placeable image, each carrying `is_image`,
--      `id`, `source_name`, `rel_below_source`, `timestamp`, the synthesized
--      `embedding`, a `display_title`, and the original record under `image`;
--   2. the images that could NOT be given a pseudo-embedding, so the caller can
--      warn -- a missing pseudo-embedding is an error condition (no silent
--      fallback), per project rules.
--
-- Only poems that actually carry a non-empty embedding table take part in the
-- neighbour search. A poem without one cannot donate a vector, and leaving it
-- in the timeline would shadow the real neighbours: an image next to it would
-- quietly degrade to a one-sided embedding, and an image sharing its exact
-- timestamp would be skipped outright. With such poems filtered out, an image
-- is skipped only when no embedded poem exists at all (empty timeline).
--
-- Neither input array is modified.
function M.compute_image_pseudo_embeddings(poems, images)
    -- Collect the usable poems into a private list and sort THAT by timestamp,
    -- so the caller's array and its order are untouched.
    local sorted = {}
    for i = 1, #poems do
        local poem = poems[i]
        local emb = poem.embedding
        if type(emb) == "table" and #emb > 0 then
            sorted[#sorted + 1] = poem
        end
    end
    table.sort(sorted, function(a, b) return a.timestamp < b.timestamp end)

    local pseudo = {}
    local skipped = {}
    for _, img in ipairs(images) do
        local before, after = M.find_chrono_neighbors(sorted, img.timestamp)
        local be = before and before.embedding
        local ae = after and after.embedding
        if be or ae then
            pseudo[#pseudo + 1] = {
                is_image = true,
                id = img.id,
                source_name = img.source_name,
                rel_below_source = img.rel_below_source,
                display_title = M.qualified_image_title(img.source_name, img.rel_below_source),
                timestamp = img.timestamp,
                embedding = crooked_embedding(be, ae),
                image = img, -- keep the raw record for rendering
            }
        else
            -- No embedded poem on either side: report it, never guess.
            skipped[#skipped + 1] = img
        end
    end
    return pseudo, skipped
end
153-- }}}
154
155return M
156