src/triangular-similarity-matrix.lua
1#!/usr/bin/env luajit
2
3-- Triangular Similarity Matrix Generator (Issue 5-025)
4-- Generates space-efficient triangular similarity matrix
5-- Only stores upper triangle: for i < j, store matrix[i][j]
6-- Exploits symmetry: similarity(A,B) = similarity(B,A)
7-- Storage reduction: ~50% (30.4M entries instead of 60.8M)
8
9local DIR = DIR or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
10package.path = DIR .. '/libs/?.lua;' .. package.path
11
12local utils = require('utils')
13local dkjson = require('dkjson')
14
15local M = {}
16
-- {{{ local function cosine_similarity
-- Cosine similarity of two numeric vectors.
-- @param vec1: first vector (array of numbers)
-- @param vec2: second vector (array of numbers)
-- @return dot(vec1, vec2) / (|vec1| * |vec2|). Returns 0.0 when either
--         vector is missing or empty, when the lengths differ (the mismatch
--         is also logged as an error), or when the combined magnitude is zero.
local function cosine_similarity(vec1, vec2)
    local usable = vec1 and vec2 and #vec1 ~= 0 and #vec2 ~= 0
    if not usable then
        return 0.0
    end

    local len1, len2 = #vec1, #vec2
    if len1 ~= len2 then
        utils.log_error(string.format("Vector dimension mismatch: %d vs %d", len1, len2))
        return 0.0
    end

    -- Single pass accumulating the dot product and both squared norms
    local dot, sq1, sq2 = 0.0, 0.0, 0.0
    for idx = 1, len1 do
        local a, b = vec1[idx], vec2[idx]
        dot = dot + (a * b)
        sq1 = sq1 + (a * a)
        sq2 = sq2 + (b * b)
    end

    local denom = math.sqrt(sq1) * math.sqrt(sq2)
    if denom == 0 then
        return 0.0
    end

    return dot / denom
end
-- }}}
46
-- {{{ function M.generate_triangular_matrix
-- Compute pairwise cosine similarities for every valid embedding and write
-- them to output_file as an upper-triangular JSON matrix.
--
-- Output layout: similarities[tostring(id_i)][tostring(id_j)] = score for
-- every pair where id_i sorts before id_j (ids are sorted numerically).
-- Scores are truncated (math.floor) to 4 decimal places.
--
-- A checkpoint is written every 100 poems. Checkpoints carry
-- metadata.is_complete = false; only the final save sets it to true, so an
-- interrupted run is regenerated on the next call instead of being mistaken
-- for a finished matrix.
--
-- @param embeddings_file: path to embeddings JSON file
-- @param output_file: path to write triangular matrix JSON
-- @param force_regenerate: if true, overwrite existing file
-- @param progress_callback: optional function(current, total), called at each checkpoint
-- @return success boolean, stats table
--         ({exists = true} when an existing complete matrix is reused,
--          {error = "..."} on failure)
function M.generate_triangular_matrix(embeddings_file, output_file, force_regenerate, progress_callback)
    force_regenerate = force_regenerate or false

    -- Reuse an existing matrix only when it was marked complete
    if not force_regenerate and utils.file_exists(output_file) then
        local existing_data = utils.read_json_file(output_file)
        if existing_data and existing_data.metadata and existing_data.metadata.is_complete then
            utils.log_info("✅ Triangular similarity matrix already exists and is complete")
            return true, {exists = true}
        end
    end

    utils.log_info("🔺 Generating triangular similarity matrix...")
    utils.log_info(" Algorithm: Upper triangle only (i < j)")
    utils.log_info(" Storage optimization: ~50% size reduction via symmetry")

    -- Load embeddings
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        utils.log_error("Failed to load embeddings from " .. embeddings_file)
        return false, {error = "embeddings_load_failed"}
    end

    local embeddings = embeddings_data.embeddings
    local valid_embeddings = {}

    -- Filter and index embeddings by numeric ID. Entries whose id does not
    -- convert to a number are skipped: using nil as a table key would raise.
    for _, embedding in ipairs(embeddings) do
        local numeric_id = embedding.id and tonumber(embedding.id)
        if numeric_id and embedding.embedding and #embedding.embedding > 0 then
            valid_embeddings[numeric_id] = embedding
        end
    end

    if next(valid_embeddings) == nil then
        utils.log_error("No valid embeddings found")
        return false, {error = "no_valid_embeddings"}
    end

    -- Get sorted poem IDs for consistent ordering
    local poem_ids = {}
    for id, _ in pairs(valid_embeddings) do
        poem_ids[#poem_ids + 1] = id
    end
    table.sort(poem_ids)

    local num_poems = #poem_ids

    -- String keys are computed once per poem instead of once per comparison
    local id_keys = {}
    for i = 1, num_poems do
        id_keys[i] = tostring(poem_ids[i])
    end

    local total_comparisons = (num_poems * (num_poems - 1)) / 2 -- Upper triangle only
    local completed_comparisons = 0
    local start_time = os.time()

    utils.log_info(string.format("Processing %d poems for triangular matrix", num_poems))
    utils.log_info(string.format("Total comparisons: %.1fM (vs %.1fM for full matrix)",
        total_comparisons / 1000000, (num_poems * num_poems) / 1000000))

    -- The embeddings file may not carry a metadata section
    local source_metadata = embeddings_data.metadata or {}

    -- Initialize triangular matrix
    local triangular_matrix = {
        metadata = {
            -- Stays false for every checkpoint; flipped just before the final save
            is_complete = false,
            total_poems = num_poems,
            matrix_type = "upper_triangular",
            total_comparisons = total_comparisons,
            algorithm = "cosine_similarity",
            model_name = source_metadata.embedding_model or "unknown",
            generated_at = os.date("%Y-%m-%d %H:%M:%S"),
            storage_optimization = "50% reduction via symmetry"
        },
        similarities = {}
    }

    -- Generate ONLY upper triangle (i < j)
    for i = 1, num_poems do
        local poem_i_id = poem_ids[i]
        local poem_i = valid_embeddings[poem_i_id]
        local row = {}
        triangular_matrix.similarities[id_keys[i]] = row

        -- Progress indicator (carriage return overwrites)
        io.write(string.format("\r[INFO] Processing poem %d/%d (ID: %d) ",
            i, num_poems, poem_i_id))
        io.flush()

        -- Only calculate for j > i (upper triangle)
        for j = i + 1, num_poems do
            local poem_j = valid_embeddings[poem_ids[j]]
            local similarity = cosine_similarity(poem_i.embedding, poem_j.embedding)

            -- Truncate to 4 decimal places for storage efficiency
            row[id_keys[j]] = math.floor(similarity * 10000) / 10000
            completed_comparisons = completed_comparisons + 1
        end

        -- Progressive saving every 100 poems to prevent data loss
        if i % 100 == 0 then
            local elapsed = os.time() - start_time
            -- os.time() has one-second resolution, so elapsed can still be 0
            -- at the first checkpoint; avoid dividing by zero in that case.
            local rate = elapsed > 0 and (completed_comparisons / elapsed) or 0
            local remaining = rate > 0 and ((total_comparisons - completed_comparisons) / rate) or 0
            local progress_pct = (completed_comparisons / total_comparisons) * 100

            print() -- Newline after the carriage-return line
            utils.log_info(string.format("Progress: %.2f%% (%.1fM/%.1fM comparisons)",
                progress_pct, completed_comparisons / 1000000, total_comparisons / 1000000))
            utils.log_info(string.format("Rate: %d comparisons/sec, Est. remaining: %d minutes",
                math.floor(rate), math.floor(remaining / 60)))

            -- Write intermediate checkpoint (still flagged incomplete)
            if utils.write_json_file(output_file, triangular_matrix) then
                utils.log_info("✅ Progress saved to disk")
            else
                utils.log_error("Failed to write checkpoint to " .. output_file)
            end

            -- Call progress callback if provided
            if progress_callback then
                progress_callback(completed_comparisons, total_comparisons)
            end
        end
    end

    -- Final save: every pair has been computed, so the matrix is now complete
    triangular_matrix.metadata.is_complete = true
    print() -- Newline after last carriage-return
    local success = utils.write_json_file(output_file, triangular_matrix)

    if success then
        local elapsed = os.time() - start_time
        utils.log_info("✅ Triangular similarity matrix generated successfully!")
        utils.log_info(string.format("Total comparisons: %.1fM", completed_comparisons / 1000000))
        utils.log_info(string.format("Time elapsed: %d minutes", math.floor(elapsed / 60)))
        utils.log_info(string.format("Output: %s", output_file))

        return true, {
            comparisons = completed_comparisons,
            poems = num_poems,
            elapsed_seconds = elapsed,
            output_file = output_file
        }
    else
        utils.log_error("Failed to write triangular matrix file")
        return false, {error = "write_failed"}
    end
end
-- }}}
194
-- {{{ function M.lookup_similarity
-- Lookup similarity from triangular matrix (handles symmetry).
-- Both ids are normalised with tostring() before the lookup, so numbers and
-- numeric strings are interchangeable.
-- @param matrix: triangular matrix data structure (table with a `similarities` field)
-- @param id1: first poem ID
-- @param id2: second poem ID
-- @return the stored similarity score (a cosine similarity, so it may be
--         negative); 1.0 when both ids are the same; 0.0 when the pair is
--         not present or the matrix is missing
function M.lookup_similarity(matrix, id1, id2)
    id1 = tostring(id1)
    id2 = tostring(id2)

    -- Handle self-similarity
    if id1 == id2 then
        return 1.0
    end

    local rows = matrix and matrix.similarities
    if not rows then
        return 0.0
    end

    -- The smaller numeric id is the row key in the upper triangle
    local row_key, col_key = id1, id2
    local n1, n2 = tonumber(id1), tonumber(id2)
    local both_numeric = n1 ~= nil and n2 ~= nil
    if both_numeric and n1 > n2 then
        row_key, col_key = id2, id1
    end

    local row = rows[row_key]
    local score = row and row[col_key]

    -- Non-numeric ids have no defined ordering (comparing them numerically
    -- would raise), so try the opposite orientation before giving up.
    if not score and not both_numeric then
        row = rows[col_key]
        score = row and row[row_key]
    end

    if score then
        return score
    end

    -- Fallback (should not happen with complete matrix)
    return 0.0
end
-- }}}
227
-- {{{ function M.get_all_similarities_for_poem
-- Get all similarities for a specific poem from triangular matrix.
-- @param matrix: triangular matrix data structure
-- @param poem_id: the poem ID to get similarities for
-- @param all_poem_ids: list of all poem IDs in the matrix (nil is treated as empty)
-- @return array of {id, similarity} sorted by similarity (descending);
--         the poem itself is never included
function M.get_all_similarities_for_poem(matrix, poem_id, all_poem_ids)
    local similarities = {}

    -- Compare ids in string form, the same normalisation lookup_similarity
    -- applies; otherwise 5 and "5" would count as different poems and the
    -- poem would be listed as its own best match with similarity 1.0.
    local self_key = tostring(poem_id)

    for _, other_id in ipairs(all_poem_ids or {}) do
        if tostring(other_id) ~= self_key then
            similarities[#similarities + 1] = {
                id = other_id,
                similarity = M.lookup_similarity(matrix, poem_id, other_id)
            }
        end
    end

    -- Sort by similarity (descending)
    table.sort(similarities, function(a, b)
        return a.similarity > b.similarity
    end)

    return similarities
end
-- }}}
252
-- Command line execution
-- The global `arg` table is set by the standalone interpreter and is also
-- visible when this file is loaded through require()/dofile() from another
-- script. Run the generator only when this file is the script that was
-- launched, i.e. when this chunk's source path equals arg[0]; otherwise
-- merely loading the module would start a generation run and call os.exit().
-- Usage: <script> [embeddings_file] [output_file] [--force]
local chunk_source = debug.getinfo(1, "S").source
if arg and type(arg[0]) == "string" and chunk_source == "@" .. arg[0] then
    local embeddings_file = arg[1] or "assets/embeddings/embeddinggemma_latest/embeddings.json"
    local output_file = arg[2] or "assets/embeddings/embeddinggemma_latest/similarity_matrix_triangular.json"
    local force = arg[3] == "--force"

    print("Triangular Similarity Matrix Generator")
    print("Input: " .. embeddings_file)
    print("Output: " .. output_file)
    print("Force: " .. tostring(force))
    print()

    -- Exit status mirrors the generator result: 0 on success, 1 on failure
    local success = M.generate_triangular_matrix(embeddings_file, output_file, force)
    os.exit(success and 0 or 1)
end
268
269return M
270