libs/vulkan-compute/lua/vk_compute.lua
1-- vk_compute.lua - LuaJIT FFI bindings for Vulkan Compute Library
2--
3-- This module provides a Lua-friendly interface to GPU-accelerated
4-- diversity sequence computation and similarity operations.
5--
6-- Usage:
7-- local vk = require("vk_compute")
8-- local ctx = vk.init()
9-- local embeddings = {...} -- Flat array of floats
10-- local sequence = vk.compute_diversity_sequence(ctx, embeddings, 7797, 768, 0)
11-- vk.shutdown(ctx)
12
13local ffi = require("ffi")
14-- socket.gettime() is sub-second wall-clock time. We use it instead of
15-- os.clock() because the diversity loop spends most of its CPU thread
16-- blocked in vkWaitForFences (the GPU is busy, the CPU is sleeping),
17-- and os.clock() only counts time the process was actually scheduled —
18-- so it under-reports the elapsed time by orders of magnitude and makes
19-- the iter/sec line claim impossible speeds.
20local socket = require("socket")
21local wall_clock = socket.gettime
22
23-- {{{ local M = {}
24local M = {}
25-- }}}
26
27-- {{{ FFI definitions
-- C declarations for libvkcompute, registered once with the LuaJIT FFI.
-- Only the names used by this module are declared; the context types are
-- opaque and are only ever handled through pointers.
ffi.cdef([[
    // Opaque handles
    typedef struct VkComputeContext VkComputeContext;
    typedef struct VkDiversityContext VkDiversityContext;

    // Error codes
    typedef enum {
        VKC_SUCCESS = 0,
        VKC_ERROR_INIT_FAILED = -1,
        VKC_ERROR_NO_SUITABLE_DEVICE = -2,
        VKC_ERROR_BUFFER_CREATION_FAILED = -3,
        VKC_ERROR_SHADER_LOAD_FAILED = -4,
        VKC_ERROR_PIPELINE_CREATION_FAILED = -5,
        VKC_ERROR_COMMAND_EXECUTION_FAILED = -6,
        VKC_ERROR_OUT_OF_MEMORY = -7,
    } VkComputeResult;

    // Core Vulkan compute functions
    VkComputeContext* vkc_init(bool enable_validation);
    void vkc_destroy(VkComputeContext* ctx);
    const char* vkc_get_error_string(VkComputeResult result);
    const char* vkc_get_device_name(VkComputeContext* ctx);
    uint64_t vkc_get_device_memory(VkComputeContext* ctx);

    // Diversity sequence functions
    VkDiversityContext* vkd_init(VkComputeContext* ctx,
                                 const float* embeddings,
                                 uint32_t num_poems,
                                 uint32_t embedding_dim);
    VkComputeResult vkd_compute_sequence(VkDiversityContext* div_ctx,
                                         uint32_t start_poem,
                                         uint32_t* output_sequence);
    void vkd_destroy(VkDiversityContext* div_ctx);

    // Batch processing functions
    typedef struct VkDiversityBatchContext VkDiversityBatchContext;

    VkDiversityBatchContext* vkd_batch_init(VkComputeContext* ctx,
                                            const uint16_t* embeddings_fp16,
                                            uint32_t num_poems,
                                            uint32_t embedding_dim,
                                            uint32_t batch_size,
                                            const uint32_t* start_indices);
    VkComputeResult vkd_batch_compute_chunk(VkDiversityBatchContext* batch_ctx,
                                            uint32_t start_slot,
                                            uint32_t slot_count,
                                            uint32_t tile_size);
    // 9-014 dispatch-per-tile + pipelined variant. Same parameters as
    // vkd_batch_compute_chunk; differs only in synchronization granularity.
    VkComputeResult vkd_batch_compute_chunk_pipelined(VkDiversityBatchContext* batch_ctx,
                                                      uint32_t start_slot,
                                                      uint32_t slot_count,
                                                      uint32_t tile_size);
    VkComputeResult vkd_batch_download_sequences(VkDiversityBatchContext* batch_ctx,
                                                 uint32_t* output_sequences);
    void vkd_batch_destroy(VkDiversityBatchContext* batch_ctx);

    // FP16 conversion helpers. The bulk FP32 -> FP16 routine is used to
    // produce the on-disk embeddings_fp16.bin cache file from the FP32
    // embeddings.json.
    void vkc_fp32_to_fp16(const float* src, uint16_t* dst, uint32_t count);
    float vkc_fp16_to_fp32(uint16_t bits);

    // Shared progress renderer (same bar + TTY/--debug rules as the C stages).
    // Used by the diversity chunk loop so its display matches stage 7.
    void vkc_progress_update_ex(const char* label, uint64_t current, uint64_t total,
                                const char* suffix);
    void vkc_progress_finish(void);
    int vkc_progress_mode(void);
]])
98-- }}}
99
100-- {{{ Load shared library
101-- Issue 10-057: DIR-based absolute path (matching vk_similarity.lua) so the library
102-- loads from ANY working directory. Together with the shader paths in vk_diversity.c
103-- now being project-root-relative, diversity no longer needs to be run from inside
104-- libs/vulkan-compute/ -- the cd-wrapper requirement is gone. An explicit
105-- VK_COMPUTE_LIB still overrides for unusual setups.
-- Resolve the shared-library path and load it.
--
-- Precedence: global VK_COMPUTE_LIB, then the VK_COMPUTE_LIB environment
-- variable, then <DIR>/libs/vulkan-compute/build/libvkcompute.so, where DIR
-- falls back to the hard-coded project root below.
--
-- An empty string is truthy in Lua, so `os.getenv("DIR") or default` would
-- accept DIR="" and build the bogus path "/libs/...". Empty values are
-- therefore treated as unset.
local function _vkc_non_empty(value)
    if value ~= nil and value ~= false and value ~= "" then
        return value
    end
    return nil
end

local _vkc_dir = _vkc_non_empty(os.getenv("DIR"))
    or "/mnt/mtwo/programming/ai-stuff/neocities-modernization"
-- rawget avoids tripping strict-mode global checkers when the override
-- global has never been assigned.
local lib_path = _vkc_non_empty(rawget(_G, "VK_COMPUTE_LIB"))
    or _vkc_non_empty(os.getenv("VK_COMPUTE_LIB"))
    or (_vkc_dir .. "/libs/vulkan-compute/build/libvkcompute.so")

local vk
do
    local ok, lib_or_err = pcall(ffi.load, lib_path)
    if not ok then
        -- Keep the loader's own message but tell the user how to redirect it.
        error(string.format(
            "Failed to load Vulkan compute library '%s': %s (set DIR or VK_COMPUTE_LIB to override)",
            tostring(lib_path), tostring(lib_or_err)), 0)
    end
    vk = lib_or_err
end
111-- }}}
112
113-- {{{ Error handling helper
-- Raise a Lua error unless `result` is the success code (0).
--
-- result    - VkComputeResult returned by a library call
-- operation - human-readable label used as the prefix of the error message
local function check_result(result, operation)
    if result == 0 then
        return
    end

    local description = ffi.string(vk.vkc_get_error_string(result))
    local code = tonumber(result)
    error(string.format("%s failed: %s (code %d)", operation, description, code))
end
120-- }}}
121
122-- {{{ local function format_duration
123-- Pretty-print a wall-clock duration in seconds as either "Hh Mm", "Mm Ss",
124-- or "Ss" depending on magnitude. Used for ETAs in long progress loops where
125-- the bare-seconds number is hard to grasp.
-- Format a duration given in seconds as "Ss", "Mm SSs" or "Hh MMm".
--
-- The value is rounded to the nearest whole second once, up front, and the
-- unit is then chosen from that rounded value. Choosing the unit from the raw
-- value let 59.6 fall into the "seconds" branch and print as "60s", and made
-- the seconds branch round while the other branches floored.
--
-- seconds - duration in seconds (may be fractional)
-- Returns: formatted string
local function format_duration(seconds)
    local total = math.floor(seconds + 0.5)

    if total < 60 then
        return string.format("%ds", total)
    end

    if total < 3600 then
        return string.format("%dm %02ds", math.floor(total / 60), total % 60)
    end

    return string.format("%dh %02dm",
        math.floor(total / 3600),
        math.floor((total % 3600) / 60))
end
137-- }}}
138
139-- {{{ local function init()
140-- Initialize Vulkan compute context
141-- Returns: Context handle (must be passed to shutdown when done)
-- Create the Vulkan compute context and print the selected device.
--
-- enable_validation - optional flag forwarded to vkc_init (default: false)
-- Returns: context handle; pass it to M.shutdown when done
-- Raises an error if vkc_init returns a null handle.
function M.init(enable_validation)
    local ctx = vk.vkc_init(enable_validation or false)
    if ctx == nil then
        error("Failed to initialize Vulkan context")
    end

    -- Report the device name and its memory size (bytes converted to 1024^3 units).
    local name = ffi.string(vk.vkc_get_device_name(ctx))
    local memory_bytes = tonumber(vk.vkc_get_device_memory(ctx))
    local banner = string.format("[Vulkan] Device: %s (%.2f GB)",
        name, memory_bytes / 1024^3)
    print(banner)

    return ctx
end
157-- }}}
158
159-- {{{ local function shutdown()
160-- Cleanup Vulkan resources
-- Destroy a context created by M.init. A nil (or null) context is ignored.
function M.shutdown(ctx)
    if ctx == nil then
        return
    end
    vk.vkc_destroy(ctx)
end
166-- }}}
167
168-- {{{ local function compute_diversity_sequence()
169-- Compute a diversity sequence starting from a given poem
170--
171-- Parameters:
172-- ctx - Vulkan compute context from init()
173-- embeddings - Flat Lua table of floats (num_poems * embedding_dim)
174-- num_poems - Number of poems
175-- embedding_dim - Dimension of embeddings (e.g., 768)
176-- start_poem - Index of starting poem (0-indexed)
177--
178-- Returns: Lua table of poem indices representing the diversity sequence
-- Compute one diversity sequence starting from `start_poem`.
--
-- Parameters:
--   ctx           - Vulkan compute context from M.init()
--   embeddings    - flat, 1-based Lua table of num_poems * embedding_dim floats
--   num_poems     - number of poems
--   embedding_dim - embedding dimension (e.g. 768)
--   start_poem    - index of the starting poem (0-indexed)
--
-- Returns: 1-based Lua table of num_poems poem indices.
-- Raises an error if the diversity context cannot be created or the
-- computation reports a non-zero result code.
function M.compute_diversity_sequence(ctx, embeddings, num_poems, embedding_dim, start_poem)
    -- Copy the Lua table into a C float array (1-based -> 0-based).
    local total_floats = num_poems * embedding_dim
    local embeddings_arr = ffi.new("float[?]", total_floats)
    for i = 1, total_floats do
        embeddings_arr[i - 1] = embeddings[i]
    end

    local div_ctx = vk.vkd_init(ctx, embeddings_arr, num_poems, embedding_dim)
    if div_ctx == nil then
        error("Failed to initialize diversity context")
    end

    -- The output buffer is owned by Lua, so it remains valid after the
    -- diversity context has been destroyed.
    local sequence_arr = ffi.new("uint32_t[?]", num_poems)
    local result = vk.vkd_compute_sequence(div_ctx, start_poem, sequence_arr)

    -- Release the context BEFORE checking the result: check_result raises on
    -- failure, and raising first would leak the context.
    vk.vkd_destroy(div_ctx)
    check_result(result, "Diversity sequence computation")

    -- Convert the C array into a 1-based Lua table.
    local sequence = {}
    for i = 0, num_poems - 1 do
        sequence[i + 1] = tonumber(sequence_arr[i])
    end

    return sequence
end
210-- }}}
211
212-- {{{ local function compute_all_diversity_sequences()
213-- Compute diversity sequences for all poems
214--
215-- Parameters:
216-- ctx - Vulkan compute context
217-- embeddings - Flat Lua table of floats
218-- num_poems - Number of poems
219-- embedding_dim - Dimension of embeddings
220-- output_file - Optional file path to write sequences
221-- start_from - Optional poem index to resume from (default: 0)
222--
223-- Returns: Table mapping poem_id -> sequence table
-- Compute a diversity sequence for every poem, one start poem at a time.
--
-- Parameters:
--   ctx           - Vulkan compute context
--   embeddings    - flat, 1-based Lua table of num_poems * embedding_dim floats
--   num_poems     - number of poems
--   embedding_dim - embedding dimension
--   output_file   - optional path; each sequence is written as soon as it is
--                   computed (4-byte num_poems header, then uint32 rows)
--   start_from    - optional poem index to resume from (default: 0)
--
-- Returns: table mapping 0-based poem index -> 1-based sequence table. Only
-- poems in [start_from, num_poems) are present.
--
-- File handling: an existing file is overwritten when start_from == 0 and
-- appended to (no new header) when start_from > 0; a missing file is always
-- created with a header.
--
-- Timing uses wall_clock rather than os.clock(): the loop spends most of its
-- time waiting on the GPU, and CPU time badly under-reports that.
function M.compute_all_diversity_sequences(ctx, embeddings, num_poems, embedding_dim, output_file, start_from)
    start_from = start_from or 0
    print(string.format("[Diversity] Computing sequences for %d poems (starting from %d)...",
        num_poems, start_from))

    local sequences = {}
    local start_time = wall_clock()

    -- Open the output file for incremental writing if requested.
    local out_file = nil
    if output_file then
        local existing = io.open(output_file, "rb")
        if existing then
            existing:close()
        end

        local resuming = existing ~= nil and start_from ~= 0
        if resuming then
            print(string.format("[Diversity] Resuming: appending to existing file"))
            out_file = io.open(output_file, "ab")
        else
            if existing then
                print("[Diversity] Warning: Output file exists, will overwrite")
            end
            out_file = io.open(output_file, "wb")
        end

        -- Check the handle BEFORE the first write; otherwise a failed open
        -- surfaces as "attempt to index a nil value" instead of this message.
        if not out_file then
            error("Failed to open output file: " .. output_file)
        end

        if not resuming then
            -- Header: num_poems as a 4-byte unsigned integer.
            local header = ffi.new("uint32_t[1]", num_poems)
            out_file:write(ffi.string(header, 4))
        end
    end

    -- Allocated outside the protected call so the buffer stays referenced for
    -- the whole lifetime of the diversity context.
    local total_floats = num_poems * embedding_dim
    local embeddings_arr = ffi.new("float[?]", total_floats)
    local sequence_arr = ffi.new("uint32_t[?]", num_poems)
    local div_ctx = nil

    -- Everything that can raise runs under pcall so the diversity context and
    -- the output file are released on failure as well as on success.
    local ok, err = pcall(function()
        for i = 1, total_floats do
            embeddings_arr[i - 1] = embeddings[i]
        end

        div_ctx = vk.vkd_init(ctx, embeddings_arr, num_poems, embedding_dim)
        if div_ctx == nil then
            error("Failed to initialize diversity context")
        end

        for start_poem = start_from, num_poems - 1 do
            local result = vk.vkd_compute_sequence(div_ctx, start_poem, sequence_arr)
            check_result(result, string.format("Diversity sequence for poem %d", start_poem))

            -- Convert to a 1-based Lua table.
            local sequence = {}
            for i = 0, num_poems - 1 do
                sequence[i + 1] = tonumber(sequence_arr[i])
            end
            sequences[start_poem] = sequence

            if out_file then
                out_file:write(ffi.string(sequence_arr, num_poems * 4))
                -- Flush every 10 sequences so progress survives an interruption.
                if (start_poem + 1) % 10 == 0 then
                    out_file:flush()
                end
            end

            -- Progress line every 100 poems and after the last one.
            if (start_poem + 1 - start_from) % 100 == 0 or start_poem == num_poems - 1 then
                local elapsed = wall_clock() - start_time
                local computed = start_poem + 1 - start_from
                local rate = computed / elapsed
                local remaining = (num_poems - start_poem - 1) / rate
                local eta_hours = remaining / 3600
                print(string.format(" [%d/%d] %.2f seq/sec, ETA: %.1fh (%.0fs)",
                    start_poem + 1, num_poems, rate, eta_hours, remaining))
            end
        end
    end)

    if div_ctx ~= nil then
        vk.vkd_destroy(div_ctx)
    end
    if out_file then
        out_file:close()
    end
    if not ok then
        -- Level 0: the message already carries its original position.
        error(err, 0)
    end

    if out_file then
        print(string.format("[Diversity] Wrote sequences to: %s", output_file))
    end

    local elapsed = wall_clock() - start_time
    local computed = num_poems - start_from
    print(string.format("[Diversity] Completed %d sequences in %.2fs (%.2f seq/s)",
        computed, elapsed, computed / elapsed))

    return sequences
end
322-- }}}
323
324-- {{{ local function write_sequences_to_file()
325-- Write diversity sequences to file in binary format
-- Write diversity sequences to a binary file.
--
-- Layout: a 4-byte unsigned poem count, followed by each sequence as
-- consecutive 4-byte unsigned integers in poem order (native byte order).
--
-- Parameters:
--   sequences   - table keyed 0 .. n-1 by poem index (the shape produced by
--                 the compute_* and load functions in this module); each
--                 value is a 1-based table of poem indices
--   output_file - destination path
--
-- The poem count is found by walking keys from 0. `#sequences` cannot be
-- used: the length operator only counts keys from 1, so on a table keyed
-- 0 .. n-1 it yields n-1, which wrote a wrong header and dropped the last
-- poem's sequence.
function M.write_sequences_to_file(sequences, output_file)
    local num_poems = 0
    while sequences[num_poems] ~= nil do
        num_poems = num_poems + 1
    end
    if num_poems == 0 and next(sequences) ~= nil then
        -- Non-empty but nothing at key 0: refuse rather than write an empty file.
        error("write_sequences_to_file: sequences must be keyed from 0")
    end

    local f = io.open(output_file, "wb")
    if not f then
        error("Failed to open output file: " .. output_file)
    end

    local header = ffi.new("uint32_t[1]", num_poems)
    f:write(ffi.string(header, 4))

    -- Pack each sequence into one reusable uint32 buffer and write it with a
    -- single call instead of one allocation and one write per value.
    local buf, buf_len = nil, -1
    for poem_id = 0, num_poems - 1 do
        local sequence = sequences[poem_id]
        local len = #sequence
        if len > 0 then
            if len ~= buf_len then
                buf = ffi.new("uint32_t[?]", len)
                buf_len = len
            end
            for i = 1, len do
                buf[i - 1] = sequence[i]
            end
            f:write(ffi.string(buf, len * 4))
        end
    end

    f:close()
    print(string.format("[Diversity] Wrote sequences to %s", output_file))
end
350-- }}}
351
352-- {{{ local function load_sequences_from_file()
353-- Load diversity sequences from binary file
-- Load diversity sequences from a binary file.
--
-- Expected layout: a 4-byte unsigned poem count n, followed by n sequences
-- of n 4-byte unsigned integers each (native byte order).
--
-- Parameters:
--   input_file - path of the file to read
--
-- Returns: table mapping 0-based poem index -> 1-based sequence table.
-- Raises an error if the file cannot be opened or is shorter than its header
-- implies.
--
-- A short or failed read is detected explicitly. Previously the nil returned
-- by f:read at end of file was cast to a pointer and dereferenced, which
-- crashes the process instead of raising a Lua error.
function M.load_sequences_from_file(input_file)
    local f = io.open(input_file, "rb")
    if not f then
        error("Failed to open input file: " .. input_file)
    end

    local header_data = f:read(4)
    if not header_data or #header_data < 4 then
        f:close()
        error("Truncated sequence file (missing header): " .. input_file)
    end
    -- Copy into an owned buffer rather than pointing into the Lua string.
    local header = ffi.new("uint32_t[1]")
    ffi.copy(header, header_data, 4)
    local num_poems = tonumber(header[0])

    -- Validate the size up front so a corrupt header cannot trigger a huge
    -- allocation or a long run of failing reads.
    local row_bytes = num_poems * 4
    local expected_size = 4 + num_poems * row_bytes
    local file_size = f:seek("end")
    f:seek("set", 4)
    if file_size < expected_size then
        f:close()
        error(string.format(
            "Truncated sequence file: %s (header says %d poems, expected %.0f bytes, found %.0f)",
            input_file, num_poems, expected_size, file_size))
    end

    -- Read one whole sequence per call instead of 4 bytes at a time.
    local sequences = {}
    local row = ffi.new("uint32_t[?]", math.max(num_poems, 1))
    for poem_id = 0, num_poems - 1 do
        local data = f:read(row_bytes)
        if not data or #data < row_bytes then
            f:close()
            error(string.format("Short read in %s at sequence %d", input_file, poem_id))
        end
        ffi.copy(row, data, row_bytes)

        local sequence = {}
        for i = 1, num_poems do
            sequence[i] = tonumber(row[i - 1])
        end
        sequences[poem_id] = sequence
    end

    f:close()
    print(string.format("[Diversity] Loaded %d sequences from %s", num_poems, input_file))
    return sequences
end
381-- }}}
382
383-- {{{ local function compute_all_diversity_sequences_batched()
-- Compute diversity sequences using batch parallel processing (2,600× faster)
--
-- Parameters:
--   ctx             - Vulkan compute context
--   embeddings_fp16 - uint16_t FFI buffer of length num_poems * embedding_dim
--                     holding FP16-packed values in row-major order. The
--                     caller produces it (typically by reading a cached
--                     embeddings_fp16.bin file). A Lua table is not accepted:
--                     the per-element copy into an FFI array was a measurable
--                     bottleneck on ~20 million floats.
--   num_poems       - number of poems
--   embedding_dim   - embedding dimension
--   output_file     - optional path; when given, all sequences are written
--                     with M.write_sequences_to_file once every batch is done
--   batch_size      - optional number of start poems per batch (default: 3584)
--
-- Returns: table mapping 0-based poem index -> 1-based sequence table.
-- Raises an error if a batch context cannot be created or any dispatch or
-- download reports a non-zero result code; the batch context is destroyed
-- before the error propagates.

-- Run every GPU dispatch for one batch: a small timed probe followed by
-- adaptively sized chunks, with progress reporting.
--
-- Why chunks: a single dispatch covering all num_poems-1 iterations runs long
-- enough to trip the kernel GPU watchdog (typically 2 s on Wayland / 10 s on
-- Xorg), at which point Vulkan reports VK_ERROR_DEVICE_LOST and the rest of
-- the run is dead.
--
-- Why adaptive: per-iteration cost depends on dataset size, GPU model,
-- residency and competing display work, so a chunk size that is safe
-- everywhere would make the common case needlessly slow. A small probe
-- dispatch measures the real per-iteration time, and later chunks are sized
-- to fit under a wall-clock budget.
local function run_batch_dispatches(batch_ctx, num_poems, embedding_dim, batch_start_time)
    local PROBE_ITERS = 10            -- size of the warm-up probe
    local TARGET_CHUNK_SECONDS = 1.5  -- aim for this much GPU work per dispatch
    local SAFETY_FACTOR = 0.6         -- stay below the target so jitter doesn't trip the watchdog
    local PRINT_EVERY = 10
    local PROGRESS_BAR_MODE = 1       -- mirrors VKC_PROGRESS_BAR in vk_compute.c
    local EMA_ALPHA = 0.2             -- new sample weight; 1 = no smoothing

    local total_iters = num_poems - 1

    -- 9-014: tile size for the inner candidate scan. Target a tile working
    -- set <= L2 cache with margin. With FP16-packed storage each candidate
    -- occupies embedding_dim * 2 bytes; the 0.85 factor leaves room for the
    -- centroid, the mask and shader code. L2_BYTES is the published spec for
    -- the GTX 1080 Ti (5.5 MiB); a port to another card should query it.
    local L2_BYTES = 5 * 1024 * 1024 + 512 * 1024
    local bytes_per_candidate = embedding_dim * 2
    local tile_size = math.max(1, math.floor(L2_BYTES * 0.85 / bytes_per_candidate))
    if tile_size > num_poems then
        tile_size = num_poems
    end

    -- Probe. It uses the production tile size and the pipelined entry point so
    -- the timing reflects what the real chunks will cost. It is clamped to the
    -- iterations that actually exist: an unconditional 10-slot probe would
    -- request slots past the end whenever num_poems - 1 < 10.
    local probe_iters = math.min(PROBE_ITERS, total_iters)
    local chunk_size = 1
    if probe_iters > 0 then
        local probe_start = wall_clock()
        local probe_result = vk.vkd_batch_compute_chunk_pipelined(batch_ctx, 1, probe_iters, tile_size)
        check_result(probe_result, string.format(
            "Batch compute-chunk probe dispatch (slots [1, %d))", 1 + probe_iters))
        local probe_elapsed = wall_clock() - probe_start

        local remaining = total_iters - probe_iters
        if probe_elapsed > 0 then
            local iter_seconds = probe_elapsed / probe_iters
            chunk_size = math.max(1, math.floor(
                (TARGET_CHUNK_SECONDS * SAFETY_FACTOR) / iter_seconds))
        else
            -- Timer resolution too coarse to measure the probe: avoid an
            -- infinite chunk size and take the rest in one chunk.
            chunk_size = math.max(1, remaining)
        end

        print(string.format(
            " Probe: %d iters in %.3fs (%.1f iter/sec) -> chunk_size = %d, tile_size = %d (~%d more chunks)",
            probe_iters, probe_elapsed, probe_iters / probe_elapsed,
            chunk_size, tile_size, math.ceil(remaining / chunk_size)))
        io.stdout:flush()
    end

    -- Progress is rendered by the shared C bar (vkc_progress_*), so it obeys
    -- the same TTY/--debug rules as stage 7. In bar mode every chunk updates
    -- the bar; otherwise only chunk 1, every PRINT_EVERY-th chunk and the last
    -- chunk are reported so long runs do not flood the log. The rate is a
    -- rolling EMA so the ETA adapts as conditions change.
    local progress_mode = vk.vkc_progress_mode()
    local ema_iter_rate = nil

    local slot = 1 + probe_iters
    local chunk_idx = 1
    while slot <= total_iters do
        local this_chunk = math.min(chunk_size, total_iters - slot + 1)
        local chunk_start = wall_clock()

        local result = vk.vkd_batch_compute_chunk_pipelined(batch_ctx, slot, this_chunk, tile_size)
        check_result(result, string.format(
            "Batch compute-chunk dispatch (chunk %d, slots [%d, %d))",
            chunk_idx, slot, slot + this_chunk))

        local chunk_elapsed = wall_clock() - chunk_start
        local total_done = slot + this_chunk - 1
        local sample_rate = this_chunk / chunk_elapsed

        if ema_iter_rate == nil then
            ema_iter_rate = sample_rate
        else
            ema_iter_rate = EMA_ALPHA * sample_rate + (1 - EMA_ALPHA) * ema_iter_rate
        end

        local is_last = total_done >= total_iters
        if progress_mode == PROGRESS_BAR_MODE or chunk_idx == 1
            or chunk_idx % PRINT_EVERY == 0 or is_last then
            local eta_seconds = (total_iters - total_done) / ema_iter_rate
            -- Rolling-average rate + ETA ride along as the bar suffix.
            local suffix = string.format("%.1f iter/sec, ETA %s",
                ema_iter_rate, format_duration(eta_seconds))
            vk.vkc_progress_update_ex("[VKD] sequences", total_done, total_iters, suffix)
        end

        slot = slot + this_chunk
        chunk_idx = chunk_idx + 1
    end
    vk.vkc_progress_finish()

    local batch_compute_elapsed = wall_clock() - batch_start_time
    print(string.format(" GPU finished %d iterations in %.2fs (%.2f iter/sec average)",
        total_iters, batch_compute_elapsed,
        total_iters / batch_compute_elapsed))
    io.stdout:flush()
end

function M.compute_all_diversity_sequences_batched(ctx, embeddings_fp16, num_poems, embedding_dim, output_file, batch_size)
    batch_size = batch_size or 3584

    print(string.format("[Diversity Batch] Computing sequences for %d poems (batch size: %d, FP16 storage)...",
        num_poems, batch_size))

    local start_time = wall_clock()
    local all_sequences = {}

    local num_batches = math.ceil(num_poems / batch_size)

    for batch_num = 1, num_batches do
        local batch_start = (batch_num - 1) * batch_size
        local batch_end = math.min(batch_start + batch_size - 1, num_poems - 1)
        local current_batch_size = batch_end - batch_start + 1

        print(string.format("\n[Batch %d/%d] Processing poems %d-%d (%d sequences)",
            batch_num, num_batches, batch_start, batch_end, current_batch_size))

        -- One start index per sequence in this batch. Kept as a local of the
        -- loop body so it stays referenced while the batch context exists.
        local start_indices = ffi.new("uint32_t[?]", current_batch_size)
        for i = 0, current_batch_size - 1 do
            start_indices[i] = batch_start + i
        end

        -- Initialize the batch context with the FP16 embedding buffer directly.
        local batch_ctx = vk.vkd_batch_init(ctx, embeddings_fp16, num_poems, embedding_dim,
            current_batch_size, start_indices)
        if batch_ctx == nil then
            error("Failed to initialize batch context")
        end

        local batch_start_time = wall_clock()

        -- Dispatch, download and conversion run under pcall so the batch
        -- context is destroyed even when a GPU call fails.
        local ok, err = pcall(function()
            run_batch_dispatches(batch_ctx, num_poems, embedding_dim, batch_start_time)

            -- Download every sequence of this batch in one call.
            local batch_sequences_arr = ffi.new("uint32_t[?]", current_batch_size * num_poems)
            local result = vk.vkd_batch_download_sequences(batch_ctx, batch_sequences_arr)
            check_result(result, "Download batch sequences")

            -- Row i of the download belongs to poem batch_start + i.
            for i = 0, current_batch_size - 1 do
                local row_offset = i * num_poems
                local sequence = {}
                for j = 0, num_poems - 1 do
                    sequence[j + 1] = tonumber(batch_sequences_arr[row_offset + j])
                end
                all_sequences[batch_start + i] = sequence
            end
        end)

        vk.vkd_batch_destroy(batch_ctx)
        if not ok then
            -- Level 0: the message already carries its original position.
            error(err, 0)
        end

        local batch_elapsed = wall_clock() - batch_start_time
        print(string.format("[Batch %d/%d] Completed in %.2fs (%.2f seq/s)",
            batch_num, num_batches, batch_elapsed, current_batch_size / batch_elapsed))
    end

    local total_elapsed = wall_clock() - start_time
    print(string.format("\n[Diversity Batch] Completed ALL %d sequences in %.2fs (%.2f seq/s)",
        num_poems, total_elapsed, num_poems / total_elapsed))

    -- Write to file if requested.
    if output_file then
        M.write_sequences_to_file(all_sequences, output_file)
    end

    return all_sequences
end
585-- }}}
586
587-- {{{ FP16 conversion: thin wrappers around the C helpers
588-- Exposed on M so the wrapper script can convert FP32 -> FP16 without
589-- needing its own ffi.load. The C helpers are private to this module
590-- otherwise (the FFI library object `vk` is module-local).
-- Bulk conversion: forwards (src, dst, count) to the C helper
-- vkc_fp32_to_fp16, declared as (const float*, uint16_t*, uint32_t).
function M.fp32_to_fp16(src, dst, count)
    local convert_bulk = vk.vkc_fp32_to_fp16
    return convert_bulk(src, dst, count)
end

-- Single-value conversion: forwards one FP16 bit pattern to the C helper
-- vkc_fp16_to_fp32 and returns its float result.
function M.fp16_to_fp32(bits)
    local convert_one = vk.vkc_fp16_to_fp32
    return convert_one(bits)
end
598-- }}}
599
600return M
601