libs/vulkan-compute/lua/vk_similarity.lua

284 lines

1-- Lua FFI bindings for Vulkan similarity computation

2-- Provides GPU-accelerated cosine similarity calculation for triangular individual files

3-- (Issue 10-057: the parallel CPU-sort path was removed; this module is GPU-only now,

4-- so it no longer depends on effil.)

6local ffi = require("ffi")

7local utils = require("utils")

8local dkjson = require("dkjson")

-- {{{ FFI definitions
-- C declarations for the entry points this module calls in libvkcompute.so.
-- The text must stay in step with the library's vk_compute.h / vk_similarity.h.
ffi.cdef([[
// Error codes
typedef enum {
    VKC_SUCCESS = 0,
    VKC_ERROR_INIT_FAILED = -1,
    VKC_ERROR_NO_SUITABLE_DEVICE = -2,
    VKC_ERROR_BUFFER_CREATION_FAILED = -3,
    VKC_ERROR_SHADER_LOAD_FAILED = -4,
    VKC_ERROR_PIPELINE_CREATION_FAILED = -5,
    VKC_ERROR_COMMAND_EXECUTION_FAILED = -6,
    VKC_ERROR_OUT_OF_MEMORY = -7,
} VkComputeResult;

// Opaque types
typedef struct VkComputeContext VkComputeContext;
typedef struct VkSimilarityContext VkSimilarityContext;

// vk_compute.h functions
VkComputeContext* vkc_init(bool enable_validation);
void vkc_destroy(VkComputeContext* ctx);
const char* vkc_get_error_string(VkComputeResult result);
const char* vkc_get_device_name(VkComputeContext* ctx);

// vk_similarity.h functions
VkSimilarityContext* vks_init(VkComputeContext* ctx,
                              const float* embeddings,
                              uint32_t num_poems,
                              uint32_t embedding_dim);

void vks_destroy(VkSimilarityContext* sim_ctx);

// Parallel full-matrix computation (Issue 9-002 original design)
VkComputeResult vks_compute_all_similarities_parallel(
    VkSimilarityContext* sim_ctx,
    float* output_triangular);

// Parallel file I/O with pthreads (avoids Lua serialization overhead)
VkComputeResult vks_write_similarity_files_parallel(
    const float* triangular_buffer,
    uint32_t num_poems,
    const uint32_t* poem_indices,
    const char** poem_ids,
    const char* output_dir,
    uint32_t num_threads);

// Parallel cache generation with pthreads. top_k caps how many nearest neighbours
// are stored per poem (0 = all); the cap shrinks the on-disk JSON and the RAM table
// the HTML stage parses it into (Issue 10-057).
VkComputeResult vks_write_rankings_cache_parallel(
    const float* triangular_buffer,
    uint32_t num_poems,
    const uint32_t* poem_indices,
    const char* cache_file,
    uint32_t num_threads,
    uint32_t top_k);
]])
-- }}}

-- Load the Vulkan compute shared library.
-- The project root comes from the DIR environment variable; when DIR is unset
-- the hard-coded checkout path below is used. The library is then opened by its
-- full path under that root (no search of the current directory takes place).
local DIR = os.getenv("DIR")
if not DIR then
    DIR = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end
local lib_path = table.concat({ DIR, "/libs/vulkan-compute/build/libvkcompute.so" })
local vklib = ffi.load(lib_path)

-- Module table returned to `require` callers.
local M = {}

-- {{{ function triangular_size
-- Count the unordered pairs among `num_poems` items, i.e. the number of entries
-- in the strictly upper-triangular part of the similarity matrix:
-- n * (n - 1) / 2.
-- @param num_poems: number of poems (n)
-- @return number of pairs
local function triangular_size(num_poems)
    local ordered_pairs = num_poems * (num_poems - 1)
    return ordered_pairs / 2
end
-- }}}

-- {{{ function triangular_index
-- Map a pair (i, j) with i < j to its position in the flattened upper-triangular
-- layout. With 0-based i and j the result is 0-based as well, e.g. for
-- num_poems = 4: (0,1) -> 0, (0,3) -> 2, (1,2) -> 3, (2,3) -> 5.
-- @param i: smaller index of the pair
-- @param j: larger index of the pair
-- @param num_poems: total number of poems (row length of the full matrix)
-- @return linear index of the pair
local function triangular_index(i, j, num_poems)
    -- Entries stored before row i, then the offset of column j within that row.
    local row_start = i * num_poems - (i * (i + 1)) / 2
    local offset_in_row = j - i - 1
    return row_start + offset_in_row
end
-- }}}

-- {{{ function M.generate_similarity_matrix_gpu_parallel

-- Wrap a value in single quotes so a POSIX shell treats it as one literal word.
-- Embedded single quotes are emitted as '\'' (close quote, escaped quote, reopen).
local function shell_quote(text)
    local escaped = tostring(text):gsub("'", "'\\''")
    return "'" .. escaped .. "'"
end

-- Validate every embedding entry and copy the vectors into one contiguous
-- float array, one row of `embedding_dim` values per poem.
-- Raises if an entry has no embedding table, if its length differs from
-- `embedding_dim` (a longer vector would otherwise be written past its row and,
-- for the last poem, past the end of the buffer), or if poem_index is missing.
local function flatten_embeddings(entries, num_poems, embedding_dim)
    local flat = ffi.new("float[?]", num_poems * embedding_dim)
    for i = 1, num_poems do
        local entry = entries[i]
        local vector = type(entry) == "table" and entry.embedding or nil
        if type(vector) ~= "table" or #vector ~= embedding_dim then
            error(string.format(
                "[GPU SIMILARITY ERROR] Embedding at index %d has %s values, expected %d",
                i, type(vector) == "table" and tostring(#vector) or "no", embedding_dim))
        end
        -- Checked here so a bad entry fails before any GPU work is started.
        if not entry.poem_index then
            error(string.format("[GPU SIMILARITY ERROR] Embedding at index %d missing poem_index", i))
        end
        local base = (i - 1) * embedding_dim
        for j, val in ipairs(vector) do
            flat[base + j - 1] = val
        end
    end
    return flat
end

-- Generate similarity matrix using TRUE parallel GPU computation
-- This is the correct implementation per Issue 9-002 original design.
-- Computes ALL ~30M pairs in a SINGLE GPU dispatch (seconds, not hours)
--
-- Steps: load and validate the embeddings JSON, skip if the similarity files and
-- a non-empty rankings cache already exist (unless `force`), run one GPU
-- dispatch via libvkcompute, then let the C library write the per-poem
-- similarity files and the rankings cache.
--
-- @param embeddings_file: Path to embeddings.json
-- @param model_name: Model name for output directory
-- @param force: Force regeneration even if files exist
-- @param num_threads: Number of CPU threads for parallel file writing (default 8)
-- @param top_k: keep only the top-K nearest neighbours per poem in the rankings
--        cache (nil/0 = keep all). Caps disk + HTML-stage RAM (Issue 10-057).
-- @return success: boolean (true on success or skip; every failure raises error())
function M.generate_similarity_matrix_gpu_parallel(embeddings_file, model_name, force, num_threads, top_k)
    print("[GPU SIMILARITY] Embeddings file: " .. embeddings_file)
    print(string.format("[GPU SIMILARITY] Force regeneration: %s", tostring(force)))

    -- Load embeddings
    local embeddings_data = utils.read_json_file(embeddings_file)
    if not embeddings_data or not embeddings_data.embeddings then
        error("[GPU SIMILARITY ERROR] Failed to load embeddings")
    end

    local num_poems = #embeddings_data.embeddings
    -- Validate embeddings array is non-empty before accessing first element
    -- Empty arrays occur when embedding generation failed (network errors, etc.)
    if num_poems == 0 then
        local reason = embeddings_data.metadata and embeddings_data.metadata.termination_reason or "unknown"
        local mode = embeddings_data.metadata and embeddings_data.metadata.processing_mode or "unknown"
        error(string.format(
            "[GPU SIMILARITY ERROR] Embeddings array is empty (0 poems).\n" ..
            " Processing mode: %s\n" ..
            " Termination reason: %s\n" ..
            " Remedy: Regenerate embeddings with: ./run.sh --generate-embeddings --force\n" ..
            " Ensure the inference server is running: ./scripts/start-llamacpp-server.sh",
            mode, reason
        ))
    end

    -- The first entry defines the expected dimension; reject it explicitly when
    -- it carries no usable vector instead of failing on `#nil`.
    local first_entry = embeddings_data.embeddings[1]
    local first_vector = type(first_entry) == "table" and first_entry.embedding or nil
    if type(first_vector) ~= "table" or #first_vector == 0 then
        error("[GPU SIMILARITY ERROR] First embedding entry has no embedding vector")
    end
    local embedding_dim = #first_vector
    print(string.format("[GPU SIMILARITY] Loaded %d poems × %d dimensions", num_poems, embedding_dim))

    -- Prepare output directory.
    -- Issue 10-054: similarities are a movable cache -> embeddings_dir() (RAM).
    -- This was the last writer still hardcoding a (relative!) disk path: it wrote
    -- to assets/ and freshness-checked assets/, so with caches in RAM it skipped
    -- to the stale disk copy and never populated RAM -- the broken-site bug.
    -- NOTE(review): output_dir is built from the raw model_name while the cache
    -- path uses the ':' -> '_' sanitized model_dir; confirm utils.embeddings_dir
    -- normalizes both to the same directory.
    local model_dir = (model_name:gsub(":", "_"))
    local output_dir = utils.embeddings_dir(model_name) .. "/similarities"
    local cache_file = utils.embeddings_dir(model_dir) .. "/similarity_rankings_cache.json"
    -- Quote the path: it is interpolated into a shell command line.
    os.execute("mkdir -p " .. shell_quote(output_dir))

    -- Check if we can skip (files already exist and not forcing)
    local first_file = string.format("%s/poem_index_1.json", output_dir)
    local last_file = string.format("%s/poem_index_%d.json", output_dir, num_poems)
    if not force and utils.file_exists(first_file) and utils.file_exists(last_file) then
        print("[GPU SIMILARITY] Similarity files already exist, checking cache...")
        if utils.file_exists(cache_file) then
            local cache_data = utils.read_json_file(cache_file)
            if cache_data and cache_data.rankings then
                local cache_count = 0
                for _ in pairs(cache_data.rankings) do cache_count = cache_count + 1 end
                if cache_count > 0 then
                    print(string.format("[GPU SIMILARITY] ⏭️ All files and cache exist (%d poems), skipping", cache_count))
                    return true
                end
            end
        end
        -- Cache missing or empty: fall through to the GPU regeneration below. The old
        -- Lua effil rebuild was removed (Issue 10-057) -- it sorted on the CPU AND
        -- bypassed the top-K cap, so it would silently rewrite a full-size cache. The
        -- GPU path rebuilds the cache capped, which is the only route we keep.
        print("[GPU SIMILARITY] Cache missing or empty, regenerating on the GPU...")
    end

    -- Convert embeddings to flat C array (validates dimensions and poem_index)
    print("[GPU SIMILARITY] Preparing embeddings for GPU...")
    local flat_embeddings = flatten_embeddings(embeddings_data.embeddings, num_poems, embedding_dim)

    -- Initialize Vulkan context
    print("[GPU SIMILARITY] Initializing Vulkan context...")
    -- vkc_init prints the chosen GPU once ("[VKC] Selected device: ..."), so
    -- we no longer echo a second "GPU device" line from here.
    local vk_ctx = vklib.vkc_init(false) -- Disable validation for performance
    if vk_ctx == nil then
        error("[GPU SIMILARITY ERROR] Failed to initialize Vulkan context")
    end

    -- Initialize similarity context
    local sim_ctx = vklib.vks_init(vk_ctx, flat_embeddings, num_poems, embedding_dim)
    if sim_ctx == nil then
        vklib.vkc_destroy(vk_ctx)
        error("[GPU SIMILARITY ERROR] Failed to initialize similarity context")
    end

    -- Allocate output buffer for triangular matrix
    local tri_size = triangular_size(num_poems)
    print(string.format("[GPU SIMILARITY] Allocating triangular buffer: %d pairs (%.1f MB)",
        tri_size, tri_size * 4 / 1024 / 1024))
    -- The buffer can be large; release the GPU contexts if the allocation fails
    -- rather than leaking them through the raised error.
    local alloc_ok, triangular_output = pcall(ffi.new, "float[?]", tri_size)
    if not alloc_ok then
        vklib.vks_destroy(sim_ctx)
        vklib.vkc_destroy(vk_ctx)
        error("[GPU SIMILARITY ERROR] Failed to allocate triangular buffer: " .. tostring(triangular_output))
    end

    -- SINGLE DISPATCH - compute ALL similarities at once!
    local start_time = os.time()

    local compute_result = vklib.vks_compute_all_similarities_parallel(sim_ctx, triangular_output)
    if compute_result ~= 0 then
        local error_str = ffi.string(vklib.vkc_get_error_string(compute_result))
        vklib.vks_destroy(sim_ctx)
        vklib.vkc_destroy(vk_ctx)
        error("[GPU SIMILARITY ERROR] Parallel computation failed: " .. error_str)
    end

    local gpu_time = os.time() - start_time
    print(string.format("[GPU SIMILARITY] ✅ GPU computation complete in %d seconds!", gpu_time))

    -- Cleanup GPU resources (we have all data in RAM now)
    vklib.vks_destroy(sim_ctx)
    vklib.vkc_destroy(vk_ctx)

    -- Prepare C arrays for parallel file writing
    -- The C function handles all file I/O with pthreads (no Lua serialization
    -- overhead) and prints its own "[VKS FILE] Wrote N files ..." timing line.
    local max_sort_threads = num_threads or 8

    print("[GPU SIMILARITY] Preparing C arrays for parallel file writing...")

    -- Build poem_index array (0-based array index -> poem_index) and the
    -- poem_ids array (strings for metadata) in one pass. poem_index presence was
    -- already verified while flattening the embeddings.
    -- Note: poem_ids_lua keeps the Lua strings referenced so the char pointers
    -- stored in poem_ids_c stay valid during the C call.
    local poem_indices_c = ffi.new("uint32_t[?]", num_poems)
    local poem_ids_lua = {}
    local poem_ids_c = ffi.new("const char*[?]", num_poems)
    for idx = 1, num_poems do
        local emb = embeddings_data.embeddings[idx]
        poem_indices_c[idx - 1] = emb.poem_index
        poem_ids_lua[idx] = tostring(emb.id)
        poem_ids_c[idx - 1] = poem_ids_lua[idx]
    end

    -- Call C function to write files in parallel using pthreads
    -- This keeps all data in C memory - no Lua serialization needed
    print(string.format("[GPU SIMILARITY] Writing %d files with %d pthreads (C parallel I/O)...", num_poems, max_sort_threads))

    local write_result = vklib.vks_write_similarity_files_parallel(
        triangular_output,
        num_poems,
        poem_indices_c,
        poem_ids_c,
        output_dir,
        max_sort_threads
    )

    if write_result ~= 0 then
        local error_str = ffi.string(vklib.vkc_get_error_string(write_result))
        error("[GPU SIMILARITY ERROR] Parallel file writing failed: " .. error_str)
    end

    -- Generate rankings cache using C parallel implementation
    -- This avoids the O(n²) Lua extraction and effil serialization overhead.
    -- The C side prints its own "[VKS CACHE] Generating rankings cache ..." line.
    local cache_result = vklib.vks_write_rankings_cache_parallel(
        triangular_output,
        num_poems,
        poem_indices_c,
        cache_file,
        max_sort_threads,
        top_k or 0 -- 0 = keep all (backward compatible)
    )

    if cache_result ~= 0 then
        local error_str = ffi.string(vklib.vkc_get_error_string(cache_result))
        error("[GPU SIMILARITY ERROR] Cache generation failed: " .. error_str)
    end

    print("[GPU SIMILARITY] ✅ All similarity generation complete!")
    return true
end
-- }}}

282

-- Export the module table (M.generate_similarity_matrix_gpu_parallel) to `require` callers.
return M

284