libs/vulkan-compute/lua/vk_similarity.lua
1-- Lua FFI bindings for Vulkan similarity computation
2-- Provides GPU-accelerated cosine similarity calculation for triangular individual files
3-- (Issue 10-057: the parallel CPU-sort path was removed; this module is GPU-only now,
4-- so it no longer depends on effil.)
5
6local ffi = require("ffi")
7local utils = require("utils")
8local dkjson = require("dkjson")
9
-- {{{ FFI definitions
-- C declarations handed to the LuaJIT FFI so the functions of the shared
-- library loaded further down in this file can be called directly from Lua.
-- The section comments inside the block name the headers these declarations
-- come from (vk_compute.h and vk_similarity.h).
-- NOTE(review): the declarations are assumed to mirror those headers exactly;
-- confirm against the C sources whenever a signature changes.
ffi.cdef[[
// Error codes
typedef enum {
 VKC_SUCCESS = 0,
 VKC_ERROR_INIT_FAILED = -1,
 VKC_ERROR_NO_SUITABLE_DEVICE = -2,
 VKC_ERROR_BUFFER_CREATION_FAILED = -3,
 VKC_ERROR_SHADER_LOAD_FAILED = -4,
 VKC_ERROR_PIPELINE_CREATION_FAILED = -5,
 VKC_ERROR_COMMAND_EXECUTION_FAILED = -6,
 VKC_ERROR_OUT_OF_MEMORY = -7,
} VkComputeResult;

// Opaque types
typedef struct VkComputeContext VkComputeContext;
typedef struct VkSimilarityContext VkSimilarityContext;

// vk_compute.h functions
VkComputeContext* vkc_init(bool enable_validation);
void vkc_destroy(VkComputeContext* ctx);
const char* vkc_get_error_string(VkComputeResult result);
const char* vkc_get_device_name(VkComputeContext* ctx);

// vk_similarity.h functions
VkSimilarityContext* vks_init(VkComputeContext* ctx,
 const float* embeddings,
 uint32_t num_poems,
 uint32_t embedding_dim);

void vks_destroy(VkSimilarityContext* sim_ctx);

// Parallel full-matrix computation (Issue 9-002 original design)
VkComputeResult vks_compute_all_similarities_parallel(
 VkSimilarityContext* sim_ctx,
 float* output_triangular);

// Parallel file I/O with pthreads (avoids Lua serialization overhead)
VkComputeResult vks_write_similarity_files_parallel(
 const float* triangular_buffer,
 uint32_t num_poems,
 const uint32_t* poem_indices,
 const char** poem_ids,
 const char* output_dir,
 uint32_t num_threads);

// Parallel cache generation with pthreads. top_k caps how many nearest neighbours
// are stored per poem (0 = all); the cap shrinks the on-disk JSON and the RAM table
// the HTML stage parses it into (Issue 10-057).
VkComputeResult vks_write_rankings_cache_parallel(
 const float* triangular_buffer,
 uint32_t num_poems,
 const uint32_t* poem_indices,
 const char* cache_file,
 uint32_t num_threads,
 uint32_t top_k);
]]
-- }}}
68
-- Load the Vulkan compute shared library by absolute path.
-- The project root is taken from the DIR environment variable. An unset OR
-- empty DIR falls back to the default checkout location: an empty string is
-- truthy in Lua, so a plain `os.getenv("DIR") or default` would otherwise
-- build the root-relative path "/libs/vulkan-compute/build/libvkcompute.so".
-- ffi.load raises an error at require time when the library cannot be opened.
local DIR = os.getenv("DIR")
if DIR == nil or DIR == "" then
    DIR = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
local lib_path = DIR .. "/libs/vulkan-compute/build/libvkcompute.so"
local vklib = ffi.load(lib_path)

-- Module table returned at the end of the file.
local M = {}
76
-- {{{ function triangular_size
-- Number of unordered pairs among num_poems items, i.e. how many entries the
-- strict upper triangle of a num_poems x num_poems matrix holds.
-- @param num_poems: item count
-- @return n * (n - 1) / 2
local function triangular_size(num_poems)
    -- Every item pairs with every other item; halve to ignore ordering.
    local ordered_pairs = num_poems * (num_poems - 1)
    return ordered_pairs / 2
end
-- }}}
83
-- {{{ function triangular_index
-- Linear position of the pair (i, j), i < j, in a row-major strict upper
-- triangle. With 0-based i and j the pair (0, 1) maps to 0 and the pair
-- (num_poems - 2, num_poems - 1) maps to triangular_size(num_poems) - 1.
-- @param i: row of the pair (the smaller index)
-- @param j: column of the pair (the larger index)
-- @param num_poems: matrix dimension
-- @return linear index into the triangular buffer
local function triangular_index(i, j, num_poems)
    -- Entries stored in the rows before row i.
    local row_start = i * num_poems - (i * (i + 1)) / 2
    -- Position of column j among the columns right of the diagonal.
    local column_offset = j - i - 1
    return row_start + column_offset
end
-- }}}
90
-- {{{ function M.generate_similarity_matrix_gpu_parallel
-- Wrap a string in single quotes so a POSIX shell treats it as one literal
-- word, even when it contains spaces, quotes, or other metacharacters.
local function shell_quote(text)
    -- The extra parentheses drop gsub's second return value (the match count).
    return "'" .. (text:gsub("'", "'\\''")) .. "'"
end

-- Translate a VkComputeResult into a printable message. A NULL string from the
-- C side is reported as an unknown code instead of being passed to ffi.string.
local function result_message(result)
    local message = vklib.vkc_get_error_string(result)
    if message == nil then
        return "unknown error code " .. tostring(tonumber(result))
    end
    return ffi.string(message)
end

-- Generate the similarity matrix with one call to
-- vks_compute_all_similarities_parallel, then let the C library write the
-- per-poem similarity files and the rankings cache.
-- (Issue 9-002 original design: all pairs in a single GPU dispatch.)
--
-- Steps: load embeddings -> skip if files and cache already exist (unless
-- force) -> flatten embeddings into a float array -> compute on the GPU ->
-- write similarity files -> write rankings cache.
--
-- @param embeddings_file: Path to embeddings.json
-- @param model_name: Model name for output directory
-- @param force: Force regeneration even if files exist
-- @param num_threads: Number of CPU threads for parallel file writing (default 8)
-- @param top_k: keep only the top-K nearest neighbours per poem in the rankings
--               cache (nil/0 = keep all). Caps disk + HTML-stage RAM (Issue 10-057).
-- @return success: true when generation finished or was skipped; every failure
--                  is raised with error()
function M.generate_similarity_matrix_gpu_parallel(embeddings_file, model_name, force, num_threads, top_k)
    -- tostring() keeps a nil path from failing here on concatenation; the load
    -- check below then reports the real problem.
    print("[GPU SIMILARITY] Embeddings file: " .. tostring(embeddings_file))
    print(string.format("[GPU SIMILARITY] Force regeneration: %s", tostring(force)))

    -- Load embeddings
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        error("[GPU SIMILARITY ERROR] Failed to load embeddings")
    end

    local embeddings = embeddings_data.embeddings
    local num_poems = #embeddings
    -- Validate embeddings array is non-empty before accessing first element.
    -- Empty arrays occur when embedding generation failed (network errors, etc.)
    if num_poems == 0 then
        local metadata = embeddings_data.metadata
        local reason = metadata and metadata.termination_reason or "unknown"
        local mode = metadata and metadata.processing_mode or "unknown"
        error(string.format(
            "[GPU SIMILARITY ERROR] Embeddings array is empty (0 poems).\n" ..
            "  Processing mode: %s\n" ..
            "  Termination reason: %s\n" ..
            "  Remedy: Regenerate embeddings with: ./run.sh --generate-embeddings --force\n" ..
            "  Ensure the inference server is running: ./scripts/start-llamacpp-server.sh",
            tostring(mode), tostring(reason)
        ))
    end

    -- The first vector defines the dimension every row of the flat buffer gets.
    local first_vector = embeddings[1].embedding
    if type(first_vector) ~= "table" or #first_vector == 0 then
        error("[GPU SIMILARITY ERROR] First embedding has no vector data")
    end
    local embedding_dim = #first_vector
    print(string.format("[GPU SIMILARITY] Loaded %d poems × %d dimensions", num_poems, embedding_dim))

    -- Prepare output directory.
    -- Issue 10-054: similarities are a movable cache -> embeddings_dir() (RAM).
    -- This was the last writer still hardcoding a (relative!) disk path: it wrote
    -- to assets/ and freshness-checked assets/, so with caches in RAM it skipped
    -- to the stale disk copy and never populated RAM -- the broken-site bug.
    -- NOTE(review): output_dir is built from model_name while the cache path is
    -- built from the ":"-sanitised model_dir; confirm utils.embeddings_dir maps
    -- both to the same directory.
    local model_dir = model_name:gsub(":", "_")
    local output_dir = utils.embeddings_dir(model_name) .. "/similarities"
    local cache_file = utils.embeddings_dir(model_dir) .. "/similarity_rankings_cache.json"

    -- Quote the path so spaces or shell metacharacters cannot split or alter the
    -- command. os.execute returns 0 (Lua 5.1 / LuaJIT) or true (5.2+) on success.
    local mkdir_status = os.execute("mkdir -p -- " .. shell_quote(output_dir))
    if mkdir_status ~= 0 and mkdir_status ~= true then
        print("[GPU SIMILARITY] Warning: could not create directory " .. output_dir)
    end

    -- Check if we can skip (files already exist and not forcing)
    local first_file = string.format("%s/poem_index_1.json", output_dir)
    local last_file = string.format("%s/poem_index_%d.json", output_dir, num_poems)
    if not force and utils.file_exists(first_file) and utils.file_exists(last_file) then
        print("[GPU SIMILARITY] Similarity files already exist, checking cache...")
        if utils.file_exists(cache_file) then
            local cache_data = utils.read_json_file(cache_file)
            if cache_data and cache_data.rankings then
                local cache_count = 0
                for _ in pairs(cache_data.rankings) do cache_count = cache_count + 1 end
                if cache_count > 0 then
                    print(string.format("[GPU SIMILARITY] ⏭️ All files and cache exist (%d poems), skipping", cache_count))
                    return true
                end
            end
        end
        -- Cache missing or empty: fall through to the GPU regeneration below. The old
        -- Lua effil rebuild was removed (Issue 10-057) -- it sorted on the CPU AND
        -- bypassed the top-K cap, so it would silently rewrite a full-size cache. The
        -- GPU path rebuilds the cache capped, which is the only route we keep.
        print("[GPU SIMILARITY] Cache missing or empty, regenerating on the GPU...")
    end

    -- Convert embeddings to flat C array (row i-1 holds poem i).
    print("[GPU SIMILARITY] Preparing embeddings for GPU...")
    local flat_embeddings = ffi.new("float[?]", num_poems * embedding_dim)
    local short_vectors = 0
    for i, poem in ipairs(embeddings) do
        local vector = poem.embedding
        if type(vector) ~= "table" then
            error(string.format("[GPU SIMILARITY ERROR] Embedding at index %d has no vector data", i))
        end
        local base = (i - 1) * embedding_dim
        local written = 0
        for j, val in ipairs(vector) do
            -- A longer vector would spill into the next row, and for the last
            -- poem past the end of the buffer; FFI arrays are not bounds-checked.
            if j > embedding_dim then
                error(string.format(
                    "[GPU SIMILARITY ERROR] Embedding at index %d has more than %d dimensions",
                    i, embedding_dim))
            end
            flat_embeddings[base + j - 1] = val
            written = j
        end
        if written < embedding_dim then
            short_vectors = short_vectors + 1
        end
    end
    if short_vectors > 0 then
        -- ffi.new zero-fills, so the unwritten tail of a short row stays 0.
        print(string.format(
            "[GPU SIMILARITY] Warning: %d embeddings have fewer than %d values; missing values are zero",
            short_vectors, embedding_dim))
    end

    -- Initialize Vulkan context
    print("[GPU SIMILARITY] Initializing Vulkan context...")
    -- vkc_init prints the chosen GPU once ("[VKC] Selected device: ..."), so
    -- we no longer echo a second "GPU device" line from here.
    local vk_ctx = vklib.vkc_init(false)  -- Disable validation for performance
    if vk_ctx == nil then
        error("[GPU SIMILARITY ERROR] Failed to initialize Vulkan context")
    end

    -- Initialize similarity context
    local sim_ctx = vklib.vks_init(vk_ctx, flat_embeddings, num_poems, embedding_dim)
    if sim_ctx == nil then
        vklib.vkc_destroy(vk_ctx)
        error("[GPU SIMILARITY ERROR] Failed to initialize similarity context")
    end

    -- Allocate output buffer for triangular matrix. ffi.new raises when the
    -- allocation fails, so it runs under pcall to release both contexts first.
    local tri_size = triangular_size(num_poems)
    print(string.format("[GPU SIMILARITY] Allocating triangular buffer: %d pairs (%.1f MB)",
        tri_size, tri_size * 4 / 1024 / 1024))
    local alloc_ok, triangular_output = pcall(ffi.new, "float[?]", tri_size)
    if not alloc_ok then
        vklib.vks_destroy(sim_ctx)
        vklib.vkc_destroy(vk_ctx)
        error("[GPU SIMILARITY ERROR] Failed to allocate triangular buffer: " .. tostring(triangular_output))
    end

    -- SINGLE DISPATCH - compute ALL similarities at once!
    local start_time = os.time()

    local compute_result = vklib.vks_compute_all_similarities_parallel(sim_ctx, triangular_output)
    if compute_result ~= 0 then
        local error_str = result_message(compute_result)
        vklib.vks_destroy(sim_ctx)
        vklib.vkc_destroy(vk_ctx)
        error("[GPU SIMILARITY ERROR] Parallel computation failed: " .. error_str)
    end

    local gpu_time = os.time() - start_time
    print(string.format("[GPU SIMILARITY] ✅ GPU computation complete in %d seconds!", gpu_time))

    -- Cleanup GPU resources (we have all data in RAM now)
    vklib.vks_destroy(sim_ctx)
    vklib.vkc_destroy(vk_ctx)

    -- Prepare C arrays for parallel file writing
    -- The C function handles all file I/O with pthreads (no Lua serialization
    -- overhead) and prints its own "[VKS FILE] Wrote N files ..." timing line.
    local writer_threads = num_threads or 8

    print("[GPU SIMILARITY] Preparing C arrays for parallel file writing...")

    -- Build poem_index array (0-based array index -> poem_index)
    local poem_indices_c = ffi.new("uint32_t[?]", num_poems)
    for idx = 1, num_poems do
        local emb = embeddings[idx]
        if not emb.poem_index then
            error(string.format("[GPU SIMILARITY ERROR] Embedding at index %d missing poem_index", idx))
        end
        poem_indices_c[idx - 1] = emb.poem_index
    end

    -- Build poem_ids array (strings for metadata)
    -- Note: the C array stores raw pointers only, so poem_ids_lua must keep the
    -- Lua strings referenced for as long as the C call may read them.
    local poem_ids_lua = {}
    local poem_ids_c = ffi.new("const char*[?]", num_poems)
    for idx = 1, num_poems do
        local emb = embeddings[idx]
        poem_ids_lua[idx] = tostring(emb.id)
        poem_ids_c[idx - 1] = poem_ids_lua[idx]
    end

    -- Call C function to write files in parallel using pthreads
    -- This keeps all data in C memory - no Lua serialization needed
    print(string.format("[GPU SIMILARITY] Writing %d files with %d pthreads (C parallel I/O)...", num_poems, writer_threads))

    local write_result = vklib.vks_write_similarity_files_parallel(
        triangular_output,
        num_poems,
        poem_indices_c,
        poem_ids_c,
        output_dir,
        writer_threads
    )

    if write_result ~= 0 then
        error("[GPU SIMILARITY ERROR] Parallel file writing failed: " .. result_message(write_result))
    end

    -- Generate rankings cache using C parallel implementation
    -- This avoids the O(n²) Lua extraction and effil serialization overhead.
    -- The C side prints its own "[VKS CACHE] Generating rankings cache ..." line.
    local cache_result = vklib.vks_write_rankings_cache_parallel(
        triangular_output,
        num_poems,
        poem_indices_c,
        cache_file,
        writer_threads,
        top_k or 0  -- 0 = keep all (backward compatible)
    )

    if cache_result ~= 0 then
        error("[GPU SIMILARITY ERROR] Cache generation failed: " .. result_message(cache_result))
    end

    print("[GPU SIMILARITY] ✅ All similarity generation complete!")
    return true
end
-- }}}
282
283return M
284