libs/vulkan-compute/lua/vk

23-- {{{ local M = {}

-- Module table: the public functions defined below are attached to M.
local M = {}

25-- }}}

27-- {{{ FFI definitions

-- C declarations for the Vulkan compute shared library. Everything between
-- [[ and ]] is C source handed to the LuaJIT FFI parser; the functions are
-- resolved later through the library object returned by ffi.load.
ffi.cdef[[
    /* Opaque handles */
    typedef struct VkComputeContext VkComputeContext;
    typedef struct VkDiversityContext VkDiversityContext;

    /* Error codes */
    typedef enum {
        VKC_SUCCESS = 0,
        VKC_ERROR_INIT_FAILED = -1,
        VKC_ERROR_NO_SUITABLE_DEVICE = -2,
        VKC_ERROR_BUFFER_CREATION_FAILED = -3,
        VKC_ERROR_SHADER_LOAD_FAILED = -4,
        VKC_ERROR_PIPELINE_CREATION_FAILED = -5,
        VKC_ERROR_COMMAND_EXECUTION_FAILED = -6,
        VKC_ERROR_OUT_OF_MEMORY = -7,
    } VkComputeResult;

    /* Core Vulkan compute functions */
    VkComputeContext* vkc_init(bool enable_validation);
    void vkc_destroy(VkComputeContext* ctx);
    const char* vkc_get_error_string(VkComputeResult result);
    const char* vkc_get_device_name(VkComputeContext* ctx);
    uint64_t vkc_get_device_memory(VkComputeContext* ctx);

    /* Diversity sequence functions */
    VkDiversityContext* vkd_init(VkComputeContext* ctx, const float* embeddings,
                                 uint32_t num_poems, uint32_t embedding_dim);
    VkComputeResult vkd_compute_sequence(VkDiversityContext* div_ctx,
                                         uint32_t start_poem,
                                         uint32_t* output_sequence);
    void vkd_destroy(VkDiversityContext* div_ctx);

    /* Batch processing functions */
    typedef struct VkDiversityBatchContext VkDiversityBatchContext;

    VkDiversityBatchContext* vkd_batch_init(VkComputeContext* ctx,
                                            const uint16_t* embeddings_fp16,
                                            uint32_t num_poems,
                                            uint32_t embedding_dim,
                                            uint32_t batch_size,
                                            const uint32_t* start_indices);
    VkComputeResult vkd_batch_compute_chunk(VkDiversityBatchContext* batch_ctx,
                                            uint32_t start_slot,
                                            uint32_t slot_count,
                                            uint32_t tile_size);
    /* 9-014 dispatch-per-tile + pipelined variant. Same parameters as
       vkd_batch_compute_chunk; differs only in synchronization granularity. */
    VkComputeResult vkd_batch_compute_chunk_pipelined(VkDiversityBatchContext* batch_ctx,
                                                      uint32_t start_slot,
                                                      uint32_t slot_count,
                                                      uint32_t tile_size);
    VkComputeResult vkd_batch_download_sequences(VkDiversityBatchContext* batch_ctx,
                                                 uint32_t* output_sequences);
    void vkd_batch_destroy(VkDiversityBatchContext* batch_ctx);

    /* FP16 conversion helpers. The bulk FP32 -> FP16 routine is used to
       produce the on-disk embeddings_fp16.bin cache file from the FP32
       embeddings.json. */
    void vkc_fp32_to_fp16(const float* src, uint16_t* dst, uint32_t count);
    float vkc_fp16_to_fp32(uint16_t bits);

    /* Shared progress renderer (same bar + TTY/--debug rules as the C stages).
       Used by the diversity chunk loop so its display matches stage 7. */
    void vkc_progress_update_ex(const char* label, uint64_t current, uint64_t total,
                                const char* suffix);
    void vkc_progress_finish(void);
    int vkc_progress_mode(void);
]]

98-- }}}

100-- {{{ Load shared library

-- Issue 10-057: resolve the shared library through an absolute, DIR-based path
-- (the same scheme vk_similarity.lua uses) so loading works from any working
-- directory. The shader paths in vk_diversity.c are project-root-relative as
-- well, so diversity no longer has to be launched from inside
-- libs/vulkan-compute/ and the cd-wrapper is not needed.
--
-- Lookup order for the library path:
--   1. the Lua global VK_COMPUTE_LIB
--   2. the environment variable VK_COMPUTE_LIB
--   3. <DIR>/libs/vulkan-compute/build/libvkcompute.so
local _vkc_dir = os.getenv("DIR")
if not _vkc_dir then
    _vkc_dir = "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
end

local lib_path = _G.VK_COMPUTE_LIB
if not lib_path then
    lib_path = os.getenv("VK_COMPUTE_LIB")
end
if not lib_path then
    lib_path = _vkc_dir .. "/libs/vulkan-compute/build/libvkcompute.so"
end

local vk = ffi.load(lib_path)

111-- }}}

113-- {{{ Error handling helper

-- Raise a Lua error when a VkComputeResult is anything other than 0.
--
-- Parameters:
--   result    - Return value of a vkc_*/vkd_* call
--   operation - Human-readable name of the step, used as the message prefix
--
-- The message text comes from the library's vkc_get_error_string().
local function check_result(result, operation)
    if result == 0 then
        return
    end

    local code = tonumber(result)
    local reason = ffi.string(vk.vkc_get_error_string(result))
    error(string.format("%s failed: %s (code %d)", operation, reason, code))
end

120-- }}}

122-- {{{ local function format_duration

-- Pretty-print a wall-clock duration in seconds as "Hh MMm", "Mm SSs" or "Ss"
-- depending on magnitude. Used for ETAs in long progress loops where a bare
-- seconds count is hard to grasp.
--
-- The value is rounded to whole seconds BEFORE the unit is chosen, so 59.6
-- renders as "1m 00s" rather than "60s", and every branch rounds the same way.
-- Negative inputs are clamped to zero. NaN and +/-infinity render as "?"
-- because they cannot be passed to %d.
--
-- Parameters:
--   seconds - Duration in seconds (number)
--
-- Returns: formatted string
local function format_duration(seconds)
    if seconds ~= seconds or seconds == math.huge or seconds == -math.huge then
        return "?"
    end

    local total = math.floor(math.max(seconds, 0) + 0.5)
    if total < 60 then
        return string.format("%ds", total)
    elseif total < 3600 then
        return string.format("%dm %02ds", math.floor(total / 60), total % 60)
    end

    return string.format("%dh %02dm",
        math.floor(total / 3600),
        math.floor((total % 3600) / 60))
end

137-- }}}

139-- {{{ local function init()

-- Create the Vulkan compute context and report the selected device.
--
-- Parameters:
--   enable_validation - Optional flag forwarded to vkc_init (default: false)
--
-- Returns: context handle; hand it to M.shutdown() when finished.
-- Raises an error if vkc_init returns a null handle.
function M.init(enable_validation)
    local ctx = vk.vkc_init(enable_validation or false)
    if ctx == nil then
        error("Failed to initialize Vulkan context")
    end

    -- Report which device was picked and how much memory it advertises.
    local name = ffi.string(vk.vkc_get_device_name(ctx))
    local memory_bytes = tonumber(vk.vkc_get_device_memory(ctx))
    local banner = string.format("[Vulkan] Device: %s (%.2f GB)",
        name, memory_bytes / 1024^3)
    print(banner)

    return ctx
end

157-- }}}

159-- {{{ local function shutdown()

-- Release the Vulkan context created by M.init().
-- A nil (or null-pointer) ctx is accepted and ignored.
function M.shutdown(ctx)
    if ctx == nil then
        return
    end
    vk.vkc_destroy(ctx)
end

166-- }}}

168-- {{{ local function compute_diversity_sequence()

-- Compute a single diversity sequence starting from a given poem.
--
-- Parameters:
--   ctx           - Vulkan compute context from M.init()
--   embeddings    - Flat Lua table of floats (num_poems * embedding_dim), 1-based
--   num_poems     - Number of poems
--   embedding_dim - Dimension of embeddings (e.g., 768)
--   start_poem    - Index of starting poem (0-indexed)
--
-- Returns: 1-based Lua table of num_poems poem indices (the diversity sequence)
--
-- Raises an error if the diversity context cannot be created or the compute
-- call reports a non-zero result. The diversity context is destroyed on both
-- the success and the failure path.
function M.compute_diversity_sequence(ctx, embeddings, num_poems, embedding_dim, start_poem)
    -- Copy the Lua table into a C float array (Lua index i -> C index i - 1).
    local value_count = num_poems * embedding_dim
    local embeddings_arr = ffi.new("float[?]", value_count)
    for i = 1, value_count do
        embeddings_arr[i - 1] = embeddings[i]
    end

    local div_ctx = vk.vkd_init(ctx, embeddings_arr, num_poems, embedding_dim)
    if div_ctx == nil then
        error("Failed to initialize diversity context")
    end

    -- The output buffer is owned by Lua, so it stays valid after the
    -- diversity context is destroyed.
    local sequence_arr = ffi.new("uint32_t[?]", num_poems)
    local result = vk.vkd_compute_sequence(div_ctx, start_poem, sequence_arr)

    -- Destroy before checking the result: check_result raises on failure and
    -- would otherwise skip the cleanup, leaking the context.
    vk.vkd_destroy(div_ctx)
    check_result(result, "Diversity sequence computation")

    -- Convert the C array into a 1-based Lua table.
    local sequence = {}
    for i = 0, num_poems - 1 do
        sequence[i + 1] = tonumber(sequence_arr[i])
    end

    return sequence
end

210-- }}}

212-- {{{ local function compute_all_diversity_sequences()

-- Compute diversity sequences for all poems, one GPU call per starting poem.
--
-- Parameters:
--   ctx           - Vulkan compute context
--   embeddings    - Flat Lua table of floats (num_poems * embedding_dim), 1-based
--   num_poems     - Number of poems
--   embedding_dim - Dimension of embeddings
--   output_file   - Optional file path; sequences are written incrementally
--   start_from    - Optional poem index to resume from (default: 0)
--
-- Returns: Table mapping poem_id (0-based) -> sequence table (1-based)
--
-- Output file layout: a 4-byte uint32 header holding num_poems, followed by
-- one row of num_poems uint32 values per computed poem (native byte order).
-- When the file already exists and start_from is non-zero the file is opened
-- in append mode and no header is written.
--
-- On any failure the diversity context is destroyed and the output file is
-- closed (which flushes already-written rows) before the error is raised.
--
-- NOTE(review): timing uses os.clock(), which reports CPU time; the printed
-- rates/ETAs may not match wall-clock time while the process waits on the GPU.
function M.compute_all_diversity_sequences(ctx, embeddings, num_poems, embedding_dim, output_file, start_from)
    start_from = start_from or 0
    print(string.format("[Diversity] Computing sequences for %d poems (starting from %d)...",
        num_poems, start_from))

    local sequences = {}
    local start_time = os.clock()

    -- Open the output file for incremental writing if one was requested.
    local out_file = nil
    if output_file then
        local existing = io.open(output_file, "rb")
        local resuming = existing ~= nil and start_from ~= 0
        if existing then
            existing:close()
            if resuming then
                print(string.format("[Diversity] Resuming: appending to existing file"))
            else
                print("[Diversity] Warning: Output file exists, will overwrite")
            end
        end

        out_file = io.open(output_file, resuming and "ab" or "wb")
        -- Check the handle BEFORE the first write so a failed open produces
        -- this message instead of a nil-index error.
        if not out_file then
            error("Failed to open output file: " .. output_file)
        end

        if not resuming then
            -- Write header: num_poems (4 bytes)
            local header = ffi.new("uint32_t[1]", num_poems)
            out_file:write(ffi.string(header, 4))
        end
    end

    -- Convert embeddings once.
    local value_count = num_poems * embedding_dim
    local embeddings_arr = ffi.new("float[?]", value_count)
    for i = 1, value_count do
        embeddings_arr[i - 1] = embeddings[i]
    end

    -- Initialize the diversity context once and reuse it for every poem.
    local div_ctx = vk.vkd_init(ctx, embeddings_arr, num_poems, embedding_dim)
    if div_ctx == nil then
        if out_file then
            out_file:close()
        end
        error("Failed to initialize diversity context")
    end

    local sequence_arr = ffi.new("uint32_t[?]", num_poems)

    for start_poem = start_from, num_poems - 1 do
        local result = vk.vkd_compute_sequence(div_ctx, start_poem, sequence_arr)
        if result ~= 0 then
            -- check_result raises below; release resources first so neither
            -- the GPU context nor the file handle leaks.
            vk.vkd_destroy(div_ctx)
            if out_file then
                out_file:close()
            end
        end
        check_result(result, string.format("Diversity sequence for poem %d", start_poem))

        -- Convert to a 1-based Lua table.
        local sequence = {}
        for i = 0, num_poems - 1 do
            sequence[i + 1] = tonumber(sequence_arr[i])
        end
        sequences[start_poem] = sequence

        -- Write the raw row immediately so progress survives an interruption.
        if out_file then
            out_file:write(ffi.string(sequence_arr, num_poems * 4))
            -- Flush every 10 sequences to ensure progress is saved
            if (start_poem + 1) % 10 == 0 then
                out_file:flush()
            end
        end

        -- Progress update every 100 poems and on the last poem.
        if (start_poem + 1 - start_from) % 100 == 0 or start_poem == num_poems - 1 then
            local elapsed = os.clock() - start_time
            local computed = start_poem + 1 - start_from
            local rate = computed / elapsed
            local remaining = (num_poems - start_poem - 1) / rate
            local eta_hours = remaining / 3600
            print(string.format(" [%d/%d] %.2f seq/sec, ETA: %.1fh (%.0fs)",
                start_poem + 1, num_poems, rate, eta_hours, remaining))
        end
    end

    vk.vkd_destroy(div_ctx)

    if out_file then
        out_file:close()
        print(string.format("[Diversity] Wrote sequences to: %s", output_file))
    end

    local elapsed = os.clock() - start_time
    local computed = num_poems - start_from
    print(string.format("[Diversity] Completed %d sequences in %.2fs (%.2f seq/s)",
        computed, elapsed, computed / elapsed))

    return sequences
end

322-- }}}

324-- {{{ local function write_sequences_to_file()

-- Write diversity sequences to a file in binary format.
--
-- Parameters:
--   sequences   - Table mapping poem_id (0-based, contiguous) -> sequence
--                 table (1-based list of poem indices)
--   output_file - Destination path (overwritten)
--
-- File layout: 4-byte uint32 header with the number of sequences, then each
-- sequence as consecutive uint32 values (native byte order).
--
-- The sequence count is obtained by walking keys 0, 1, 2, ... because the
-- length operator `#` only considers keys from 1 upward: on a 0-based table it
-- reports one less than the real count, which would make the header too small
-- and silently drop the last sequence.
--
-- Raises an error if the table is non-empty but has no entry at key 0, or if
-- the file cannot be opened.
function M.write_sequences_to_file(sequences, output_file)
    local num_poems = 0
    while sequences[num_poems] ~= nil do
        num_poems = num_poems + 1
    end
    if num_poems == 0 and next(sequences) ~= nil then
        error("sequences must be keyed by 0-based poem_id (no entry at index 0)")
    end

    local f = io.open(output_file, "wb")
    if not f then
        error("Failed to open output file: " .. output_file)
    end

    -- Write header: num_poems (4 bytes) using FFI
    local header = ffi.new("uint32_t[1]", num_poems)
    f:write(ffi.string(header, 4))

    -- Pack each sequence into one buffer and write it in a single call
    -- instead of allocating and writing one 4-byte cdata per element.
    for poem_id = 0, num_poems - 1 do
        local sequence = sequences[poem_id]
        local length = #sequence
        if length > 0 then
            local row = ffi.new("uint32_t[?]", length)
            for i = 1, length do
                row[i - 1] = sequence[i]
            end
            f:write(ffi.string(row, length * 4))
        end
    end

    f:close()
    print(string.format("[Diversity] Wrote sequences to %s", output_file))
end

350-- }}}

352-- {{{ local function load_sequences_from_file()

-- Load diversity sequences from a binary file.
--
-- Expected layout: 4-byte uint32 header holding num_poems, followed by
-- num_poems rows of num_poems uint32 values each (native byte order).
--
-- Parameters:
--   input_file - Path of the file to read
--
-- Returns: Table mapping poem_id (0-based) -> sequence table (1-based)
--
-- Raises an error if the file cannot be opened, or if the header or any row
-- is shorter than expected (truncated file). The file handle is closed before
-- any error is raised.
function M.load_sequences_from_file(input_file)
    local f = io.open(input_file, "rb")
    if not f then
        error("Failed to open input file: " .. input_file)
    end

    -- Read header using FFI. f:read returns nil at EOF or a short string on a
    -- truncated file; neither may be dereferenced as a uint32.
    local header_data = f:read(4)
    if not header_data or #header_data < 4 then
        f:close()
        error("Truncated header in sequence file: " .. input_file)
    end
    local num_poems = tonumber(ffi.cast("const uint32_t*", header_data)[0])

    -- Read one whole row per call rather than one 4-byte value per call.
    local row_bytes = num_poems * 4
    local sequences = {}
    for poem_id = 0, num_poems - 1 do
        local row_data = f:read(row_bytes)
        if not row_data or #row_data < row_bytes then
            f:close()
            error(string.format("Truncated sequence file %s: row for poem %d is incomplete",
                input_file, poem_id))
        end

        -- row_data must stay referenced while `values` points into it.
        local values = ffi.cast("const uint32_t*", row_data)
        local sequence = {}
        for i = 1, num_poems do
            sequence[i] = tonumber(values[i - 1])
        end
        sequences[poem_id] = sequence
    end

    f:close()
    print(string.format("[Diversity] Loaded %d sequences from %s", num_poems, input_file))
    return sequences
end

381-- }}}

383-- {{{ local function compute_all_diversity_sequences_batched()

-- Compute diversity sequences for all poems using batched GPU dispatch.
--
-- Parameters:
--   ctx             - Vulkan compute context
--   embeddings_fp16 - uint16_t FFI buffer of length num_poems * embedding_dim
--                     holding FP16-packed values in row-major order
--   num_poems       - Number of poems
--   embedding_dim   - Dimension of embeddings
--   output_file     - Optional file path; when given, the result is written
--                     with M.write_sequences_to_file
--   batch_size      - Optional batch size (default: 3584)
--
-- Returns: Table mapping poem_id (0-based) -> sequence table (1-based)
--
-- The wrapper script is responsible for producing embeddings_fp16 (typically
-- by reading a cached embeddings_fp16.bin file). A Lua-table flat array is no
-- longer accepted because the per-element copy loop into an FFI float[?] was a
-- measurable bottleneck on ~20 million floats; the current path uses ffi.copy
-- from a binary file straight into the target buffer instead.
--
-- Raises an error if a batch context cannot be created or any dispatch or
-- download reports a non-zero result; the batch context is destroyed first.
function M.compute_all_diversity_sequences_batched(ctx, embeddings_fp16, num_poems, embedding_dim, output_file, batch_size)
    batch_size = batch_size or 3584

    print(string.format("[Diversity Batch] Computing sequences for %d poems (batch size: %d, FP16 storage)...",
        num_poems, batch_size))

    local start_time = wall_clock()
    local all_sequences = {}

    -- Chunked GPU dispatch with adaptive chunk sizing.
    --
    -- Why chunks: a single dispatch covering all num_poems-1 iterations
    -- runs long enough to trip the kernel GPU watchdog (typically
    -- 2 s on Wayland / 10 s on Xorg), at which point Vulkan reports
    -- VK_ERROR_DEVICE_LOST and the rest of the run is dead.
    --
    -- Why adaptive: per-iteration cost depends on the dataset size,
    -- GPU model, residency, and how much display work is competing
    -- for the GPU right now. Hardcoding a chunk size that's safe
    -- everywhere makes the common case much slower than it needs to
    -- be. Instead, a small probe dispatch measures the actual iter-time
    -- on this run, and all subsequent chunks are sized to fit
    -- comfortably under a target wall-clock budget per chunk.
    --
    -- The probe is one extra dispatch per batch, rounding error in
    -- a ~100-chunk batch. The first chunk is intentionally tiny so a
    -- pathologically slow GPU still survives it.
    local PROBE_ITERS = 10            -- upper bound on the warm-up probe size
    local TARGET_CHUNK_SECONDS = 1.5  -- aim for this much GPU work per dispatch
    local SAFETY_FACTOR = 0.6         -- pad below the measured ceiling so jitter doesn't trip the watchdog

    -- Progress is rendered by the shared C bar (vkc_progress_*), so it
    -- looks like and obeys the same TTY/--debug rules as stage 7. On a TTY
    -- the bar is updated every chunk; otherwise updates are sent for chunk 1,
    -- every PRINT_EVERY-th chunk and the last chunk so a multi-thousand-chunk
    -- run does not flood the durable log.
    local PRINT_EVERY = 10
    local PROGRESS_BAR_MODE = 1  -- mirrors VKC_PROGRESS_BAR in vk_compute.c
    local EMA_ALPHA = 0.2        -- new sample weight; 1 = no smoothing

    local total_iters = num_poems - 1

    -- 9-014: tile size for the inner candidate scan. Target tile working
    -- set <= L2 cache with margin. With FP16-packed storage each
    -- candidate occupies embedding_dim x 2 bytes. The 0.85 factor
    -- leaves room for the centroid, the mask, and shader code in L2.
    -- L2_BYTES is the published spec for the GTX 1080 Ti (5.5 MiB);
    -- a future port to a different card should query this at runtime.
    local L2_BYTES = 5 * 1024 * 1024 + 512 * 1024 -- 5.5 MiB
    local bytes_per_candidate = embedding_dim * 2
    local tile_size = math.max(1, math.floor(L2_BYTES * 0.85 / bytes_per_candidate))
    if tile_size > num_poems then tile_size = num_poems end

    -- Process in batches
    local num_batches = math.ceil(num_poems / batch_size)

    for batch_num = 1, num_batches do
        local batch_start = (batch_num - 1) * batch_size
        local batch_end = math.min(batch_start + batch_size - 1, num_poems - 1)
        local current_batch_size = batch_end - batch_start + 1

        print(string.format("\n[Batch %d/%d] Processing poems %d-%d (%d sequences)",
            batch_num, num_batches, batch_start, batch_end, current_batch_size))

        -- Create start indices for this batch
        local start_indices = ffi.new("uint32_t[?]", current_batch_size)
        for i = 0, current_batch_size - 1 do
            start_indices[i] = batch_start + i
        end

        -- Initialize batch context with the FP16 embedding buffer directly.
        local batch_ctx = vk.vkd_batch_init(ctx, embeddings_fp16, num_poems, embedding_dim,
            current_batch_size, start_indices)
        if batch_ctx == nil then
            error("Failed to initialize batch context")
        end

        -- Run one pipelined dispatch. On failure the batch context is
        -- destroyed before check_result raises, so it does not leak.
        local function dispatch(start_slot, slot_count, description)
            local rc = vk.vkd_batch_compute_chunk_pipelined(batch_ctx, start_slot, slot_count, tile_size)
            if rc ~= 0 then
                vk.vkd_batch_destroy(batch_ctx)
            end
            check_result(rc, description)
        end

        local batch_start_time = wall_clock()

        -- Probe: small dispatch, time it. The probe uses the same tile_size
        -- and the same pipelined entry point as the production chunks, so the
        -- timing reflects the per-iter cost of the path actually used.
        -- The probe never covers more slots than exist: with fewer than
        -- PROBE_ITERS + 1 poems a fixed-size probe would dispatch slots past
        -- total_iters.
        local probe_iters = math.min(PROBE_ITERS, total_iters)
        if probe_iters > 0 then
            local probe_start = wall_clock()
            dispatch(1, probe_iters, string.format(
                "Batch compute-chunk probe dispatch (slots [1, %d))", 1 + probe_iters))
            local probe_elapsed = wall_clock() - probe_start
            local iter_seconds = probe_elapsed / probe_iters
            local remaining = total_iters - probe_iters

            local chunk_size = math.max(1, math.floor(
                (TARGET_CHUNK_SECONDS * SAFETY_FACTOR) / iter_seconds))
            -- A chunk can never usefully exceed the remaining work. The cap
            -- also keeps chunk_size finite when the probe time rounds to 0.
            if chunk_size > remaining then
                chunk_size = math.max(1, remaining)
            end

            local num_chunks_est = math.ceil(remaining / chunk_size) + 1 -- +1 for the probe
            print(string.format(
                " Probe: %d iters in %.3fs (%.1f iter/sec) -> chunk_size = %d, tile_size = %d (~%d more chunks)",
                probe_iters, probe_elapsed, probe_iters / probe_elapsed,
                chunk_size, tile_size, num_chunks_est - 1))
            io.stdout:flush()

            local progress_mode = vk.vkc_progress_mode()
            -- Rolling EMA of the iteration rate, so the ETA adapts as
            -- conditions change.
            local ema_iter_rate = nil

            local slot = 1 + probe_iters
            local chunk_idx = 1
            while slot <= total_iters do
                local this_chunk = math.min(chunk_size, total_iters - slot + 1)
                local chunk_start = wall_clock()

                dispatch(slot, this_chunk, string.format(
                    "Batch compute-chunk dispatch (chunk %d, slots [%d, %d))",
                    chunk_idx, slot, slot + this_chunk))

                local chunk_elapsed = wall_clock() - chunk_start
                local total_done = slot + this_chunk - 1
                local sample_rate = this_chunk / chunk_elapsed

                if ema_iter_rate == nil then
                    ema_iter_rate = sample_rate
                else
                    ema_iter_rate = EMA_ALPHA * sample_rate + (1 - EMA_ALPHA) * ema_iter_rate
                end

                local is_last = total_done >= total_iters
                if progress_mode == PROGRESS_BAR_MODE or chunk_idx == 1
                    or chunk_idx % PRINT_EVERY == 0 or is_last then
                    local eta_seconds = (total_iters - total_done) / ema_iter_rate
                    -- Rolling-average rate + ETA ride along as the bar suffix.
                    local suffix = string.format("%.1f iter/sec, ETA %s",
                        ema_iter_rate, format_duration(eta_seconds))
                    vk.vkc_progress_update_ex("[VKD] sequences", total_done, total_iters, suffix)
                end

                slot = slot + this_chunk
                chunk_idx = chunk_idx + 1
            end
        end
        vk.vkc_progress_finish()

        local batch_compute_elapsed = wall_clock() - batch_start_time
        print(string.format(" GPU finished %d iterations in %.2fs (%.2f iter/sec average)",
            total_iters, batch_compute_elapsed,
            total_iters / batch_compute_elapsed))
        io.stdout:flush()

        -- Download sequences for this batch. The destination buffer is owned
        -- by Lua, so the batch context can be destroyed right after the
        -- download, before the result is checked.
        local batch_sequences_arr = ffi.new("uint32_t[?]", current_batch_size * num_poems)
        local download_result = vk.vkd_batch_download_sequences(batch_ctx, batch_sequences_arr)
        vk.vkd_batch_destroy(batch_ctx)
        check_result(download_result, "Download batch sequences")

        -- Convert to Lua tables: row i of the buffer is the sequence for
        -- poem batch_start + i.
        for i = 0, current_batch_size - 1 do
            local poem_id = batch_start + i
            local row_offset = i * num_poems
            local sequence = {}
            for j = 0, num_poems - 1 do
                sequence[j + 1] = tonumber(batch_sequences_arr[row_offset + j])
            end
            all_sequences[poem_id] = sequence
        end

        local batch_elapsed = wall_clock() - batch_start_time
        print(string.format("[Batch %d/%d] Completed in %.2fs (%.2f seq/s)",
            batch_num, num_batches, batch_elapsed, current_batch_size / batch_elapsed))
    end

    local total_elapsed = wall_clock() - start_time
    print(string.format("\n[Diversity Batch] Completed ALL %d sequences in %.2fs (%.2f seq/s)",
        num_poems, total_elapsed, num_poems / total_elapsed))

    -- Write to file if requested
    if output_file then
        M.write_sequences_to_file(all_sequences, output_file)
    end

    return all_sequences
end

585-- }}}

587-- {{{ FP16 conversion: thin wrappers around the C helpers

588-- Exposed on M so the wrapper script can convert FP32 -> FP16 without

589-- needing its own ffi.load. The C helpers are private to this module

590-- otherwise (the FFI library object `vk` is module-local).

-- Bulk FP32 -> FP16 conversion; forwards to the C helper vkc_fp32_to_fp16.
--   src   - const float* source buffer
--   dst   - uint16_t* destination buffer
--   count - number of elements to convert
function M.fp32_to_fp16(src, dst, count)
    local convert = vk.vkc_fp32_to_fp16
    return convert(src, dst, count)
end

594

-- Single-value FP16 -> FP32 conversion; forwards to the C helper
-- vkc_fp16_to_fp32.
--   bits - FP16 value packed in a uint16_t
-- Returns: the converted value as a Lua number
function M.fp16_to_fp32(bits)
    local convert = vk.vkc_fp16_to_fp32
    return convert(bits)
end

598-- }}}

libs/vulkan-compute/lua/vk_compute.lua