libs/vulkan-compute/src/vk_compute.c

1301 lines

1/* vk_compute.c - Vulkan Compute Wrapper Implementation
2 *
3 * This file implements a simplified Vulkan compute interface for
4 * accelerating vector operations on poem embeddings (768-dimensional).
5 *
6 * Architecture:
7 * - Single compute queue for all operations
8 * - Explicit synchronization with fences
9 * - Descriptor sets per pipeline
10 * - Push constants for per-dispatch parameters
11 */
12
13#include "vk_compute.h"
14#include <vulkan/vulkan.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <unistd.h> /* isatty for progress-bar mode selection */
19
20#define MAX_BUFFERS 32
21#define MAX_DESCRIPTOR_SETS 16
22
23/* {{{ Internal structures
24 */
25
26typedef struct {
27 VkBuffer buffer;
28 VkDeviceMemory memory;
29 VkDeviceSize size;
30 VkComputeBufferType type;
31} BufferInternal;
32
33typedef struct {
34 VkShaderModule shader;
35 VkPipelineLayout layout;
36 VkPipeline pipeline;
37 VkDescriptorSetLayout desc_set_layout;
38 VkDescriptorPool desc_pool;
39 VkDescriptorSet desc_set;
40 uint32_t push_constant_size;
41 uint32_t num_bindings;
42} PipelineInternal;
43
44/* 9-014 pipelining: depth of the async dispatch pipeline. Two is enough
45 * for "CPU records the next dispatch while the GPU runs the current
46 * one" overlap; deeper pipelines don't help because the workload is
47 * GPU-bound, not CPU-bound, and deeper queues just defer the wait. */
48#define VKC_ASYNC_PIPELINE_DEPTH 2
49
50struct VkComputeContext {
51 /* Vulkan core objects */
52 VkInstance instance;
53 VkPhysicalDevice physical_device;
54 VkDevice device;
55 VkQueue compute_queue;
56 uint32_t compute_queue_family;
57
58 /* Command execution — synchronous path: one command buffer, one fence,
59 * each dispatch waits inline. Used by the similarity engine and any
60 * caller that wants "do one thing, wait for it." */
61 VkCommandPool command_pool;
62 VkCommandBuffer command_buffer;
63 VkFence fence;
64
65 /* Command execution — async / pipelined path: N command buffers and
66 * N fences cycling round-robin. Each dispatch waits for the OLDEST
67 * slot's previous use to finish (which is generally already done by
68 * the time we record the next slot's commands), then submits without
69 * waiting for THIS slot to finish. The CPU records dispatch N+1
70 * concurrently with the GPU running dispatch N. */
71 VkCommandBuffer async_cmd_buffers[VKC_ASYNC_PIPELINE_DEPTH];
72 VkFence async_fences[VKC_ASYNC_PIPELINE_DEPTH];
73 uint32_t next_async_slot;
74
75 /* Device properties */
76 VkPhysicalDeviceProperties device_properties;
77 VkPhysicalDeviceMemoryProperties memory_properties;
78
79 /* Validation */
80 bool validation_enabled;
81 VkDebugUtilsMessengerEXT debug_messenger;
82};
83
84struct VkComputeBuffer {
85 BufferInternal internal;
86};
87
88struct VkComputePipeline {
89 PipelineInternal internal;
90};
91
92/* }}} */
93
94/* {{{ Error handling
95 */
96
97const char* vkc_get_error_string(VkComputeResult result) {
98 switch (result) {
99 case VKC_SUCCESS: return "Success";
100 case VKC_ERROR_INIT_FAILED: return "Initialization failed";
101 case VKC_ERROR_NO_SUITABLE_DEVICE: return "No suitable Vulkan device found";
102 case VKC_ERROR_BUFFER_CREATION_FAILED: return "Buffer creation failed";
103 case VKC_ERROR_SHADER_LOAD_FAILED: return "Shader loading failed";
104 case VKC_ERROR_PIPELINE_CREATION_FAILED: return "Pipeline creation failed";
105 case VKC_ERROR_COMMAND_EXECUTION_FAILED: return "Command execution failed";
106 case VKC_ERROR_OUT_OF_MEMORY: return "Out of memory";
107 default: return "Unknown error";
108 }
109}
110
111static void check_vk_result(VkResult result, const char* operation) {
112 if (result != VK_SUCCESS) {
113 fprintf(stderr, "[VKC ERROR] %s failed with code %d\n", operation, result);
114 }
115}
116
117/* }}} */
118
119/* {{{ Debug messenger callback
120 */
121
122static VKAPI_ATTR VkBool32 VKAPI_CALL debug_callback(
123 VkDebugUtilsMessageSeverityFlagBitsEXT severity,
124 VkDebugUtilsMessageTypeFlagsEXT type,
125 const VkDebugUtilsMessengerCallbackDataEXT* callback_data,
126 void* user_data)
127{
128 (void)type;
129 (void)user_data;
130
131 if (severity >= VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) {
132 fprintf(stderr, "[VK VALIDATION] %s\n", callback_data->pMessage);
133 }
134
135 return VK_FALSE;
136}
137
138/* }}} */
139
140/* {{{ Helper: Find memory type
141 */
142
143static uint32_t find_memory_type(VkComputeContext* ctx,
144 uint32_t type_filter,
145 VkMemoryPropertyFlags properties)
146{
147 for (uint32_t i = 0; i < ctx->memory_properties.memoryTypeCount; i++) {
148 if ((type_filter & (1 << i)) &&
149 (ctx->memory_properties.memoryTypes[i].propertyFlags & properties) == properties) {
150 return i;
151 }
152 }
153
154 fprintf(stderr, "[VKC ERROR] Failed to find suitable memory type\n");
155 return UINT32_MAX;
156}
157
158/* }}} */
159
160/* {{{ Initialization: vkc_init
161 */
162
163VkComputeContext* vkc_init(bool enable_validation) {
164 VkComputeContext* ctx = calloc(1, sizeof(VkComputeContext));
165 if (!ctx) {
166 return NULL;
167 }
168
169 ctx->validation_enabled = enable_validation;
170
171 /* Create Vulkan instance */
172 VkApplicationInfo app_info = {
173 .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
174 .pApplicationName = "Vulkan Compute Poetry Embeddings",
175 .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
176 .pEngineName = "vk_compute",
177 .engineVersion = VK_MAKE_VERSION(1, 0, 0),
178 .apiVersion = VK_API_VERSION_1_2,
179 };
180
181 const char* validation_layers[] = {
182 "VK_LAYER_KHRONOS_validation"
183 };
184
185 VkInstanceCreateInfo instance_info = {
186 .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
187 .pApplicationInfo = &app_info,
188 .enabledLayerCount = enable_validation ? 1 : 0,
189 .ppEnabledLayerNames = enable_validation ? validation_layers : NULL,
190 };
191
192 VkResult result = vkCreateInstance(&instance_info, NULL, &ctx->instance);
193 if (result != VK_SUCCESS) {
194 fprintf(stderr, "[VKC ERROR] Failed to create Vulkan instance: %d\n", result);
195 free(ctx);
196 return NULL;
197 }
198
199 /* Select physical device with compute support */
200 uint32_t device_count = 0;
201 vkEnumeratePhysicalDevices(ctx->instance, &device_count, NULL);
202
203 if (device_count == 0) {
204 fprintf(stderr, "[VKC ERROR] No Vulkan-capable devices found\n");
205 vkDestroyInstance(ctx->instance, NULL);
206 free(ctx);
207 return NULL;
208 }
209
210 VkPhysicalDevice* devices = malloc(sizeof(VkPhysicalDevice) * device_count);
211 vkEnumeratePhysicalDevices(ctx->instance, &device_count, devices);
212
213 /* Find device with compute queue */
214 ctx->physical_device = VK_NULL_HANDLE;
215 ctx->compute_queue_family = UINT32_MAX;
216
217 for (uint32_t i = 0; i < device_count; i++) {
218 uint32_t queue_family_count = 0;
219 vkGetPhysicalDeviceQueueFamilyProperties(devices[i], &queue_family_count, NULL);
220
221 VkQueueFamilyProperties* queue_families =
222 malloc(sizeof(VkQueueFamilyProperties) * queue_family_count);
223 vkGetPhysicalDeviceQueueFamilyProperties(devices[i], &queue_family_count, queue_families);
224
225 for (uint32_t j = 0; j < queue_family_count; j++) {
226 if (queue_families[j].queueFlags & VK_QUEUE_COMPUTE_BIT) {
227 ctx->physical_device = devices[i];
228 ctx->compute_queue_family = j;
229 break;
230 }
231 }
232
233 free(queue_families);
234
235 if (ctx->physical_device != VK_NULL_HANDLE) {
236 break;
237 }
238 }
239
240 free(devices);
241
242 if (ctx->physical_device == VK_NULL_HANDLE) {
243 fprintf(stderr, "[VKC ERROR] No device with compute queue found\n");
244 vkDestroyInstance(ctx->instance, NULL);
245 free(ctx);
246 return NULL;
247 }
248
249 /* Get device properties */
250 vkGetPhysicalDeviceProperties(ctx->physical_device, &ctx->device_properties);
251 vkGetPhysicalDeviceMemoryProperties(ctx->physical_device, &ctx->memory_properties);
252
253 printf("[VKC] Selected device: %s\n", ctx->device_properties.deviceName);
254 printf("[VKC] Compute queue family: %u\n", ctx->compute_queue_family);
255
256 /* Create logical device */
257 float queue_priority = 1.0f;
258 VkDeviceQueueCreateInfo queue_info = {
259 .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
260 .queueFamilyIndex = ctx->compute_queue_family,
261 .queueCount = 1,
262 .pQueuePriorities = &queue_priority,
263 };
264
265 VkDeviceCreateInfo device_info = {
266 .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
267 .queueCreateInfoCount = 1,
268 .pQueueCreateInfos = &queue_info,
269 .enabledLayerCount = enable_validation ? 1 : 0,
270 .ppEnabledLayerNames = enable_validation ? validation_layers : NULL,
271 };
272
273 result = vkCreateDevice(ctx->physical_device, &device_info, NULL, &ctx->device);
274 if (result != VK_SUCCESS) {
275 fprintf(stderr, "[VKC ERROR] Failed to create logical device: %d\n", result);
276 vkDestroyInstance(ctx->instance, NULL);
277 free(ctx);
278 return NULL;
279 }
280
281 /* Get compute queue */
282 vkGetDeviceQueue(ctx->device, ctx->compute_queue_family, 0, &ctx->compute_queue);
283
284 /* Create command pool */
285 VkCommandPoolCreateInfo pool_info = {
286 .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
287 .queueFamilyIndex = ctx->compute_queue_family,
288 .flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
289 };
290
291 result = vkCreateCommandPool(ctx->device, &pool_info, NULL, &ctx->command_pool);
292 if (result != VK_SUCCESS) {
293 fprintf(stderr, "[VKC ERROR] Failed to create command pool: %d\n", result);
294 vkDestroyDevice(ctx->device, NULL);
295 vkDestroyInstance(ctx->instance, NULL);
296 free(ctx);
297 return NULL;
298 }
299
300 /* Allocate command buffer */
301 VkCommandBufferAllocateInfo alloc_info = {
302 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
303 .commandPool = ctx->command_pool,
304 .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
305 .commandBufferCount = 1,
306 };
307
308 result = vkAllocateCommandBuffers(ctx->device, &alloc_info, &ctx->command_buffer);
309 if (result != VK_SUCCESS) {
310 fprintf(stderr, "[VKC ERROR] Failed to allocate command buffer: %d\n", result);
311 vkDestroyCommandPool(ctx->device, ctx->command_pool, NULL);
312 vkDestroyDevice(ctx->device, NULL);
313 vkDestroyInstance(ctx->instance, NULL);
314 free(ctx);
315 return NULL;
316 }
317
318 /* Create fence for synchronization */
319 VkFenceCreateInfo fence_info = {
320 .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
321 .flags = VK_FENCE_CREATE_SIGNALED_BIT,
322 };
323
324 result = vkCreateFence(ctx->device, &fence_info, NULL, &ctx->fence);
325 if (result != VK_SUCCESS) {
326 fprintf(stderr, "[VKC ERROR] Failed to create fence: %d\n", result);
327 vkDestroyCommandPool(ctx->device, ctx->command_pool, NULL);
328 vkDestroyDevice(ctx->device, NULL);
329 vkDestroyInstance(ctx->instance, NULL);
330 free(ctx);
331 return NULL;
332 }
333
334 /* 9-014 pipelining: allocate the async command-buffer pool (N buffers
335 * and N fences, fences pre-signaled so the first N dispatches do not
336 * block waiting on an unused slot's previous use). */
337 VkCommandBufferAllocateInfo async_alloc_info = {
338 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
339 .commandPool = ctx->command_pool,
340 .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
341 .commandBufferCount = VKC_ASYNC_PIPELINE_DEPTH,
342 };
343 result = vkAllocateCommandBuffers(ctx->device, &async_alloc_info, ctx->async_cmd_buffers);
344 if (result != VK_SUCCESS) {
345 fprintf(stderr, "[VKC ERROR] Failed to allocate async command buffers: %d\n", result);
346 vkDestroyFence(ctx->device, ctx->fence, NULL);
347 vkDestroyCommandPool(ctx->device, ctx->command_pool, NULL);
348 vkDestroyDevice(ctx->device, NULL);
349 vkDestroyInstance(ctx->instance, NULL);
350 free(ctx);
351 return NULL;
352 }
353 for (int i = 0; i < VKC_ASYNC_PIPELINE_DEPTH; i++) {
354 result = vkCreateFence(ctx->device, &fence_info, NULL, &ctx->async_fences[i]);
355 if (result != VK_SUCCESS) {
356 fprintf(stderr, "[VKC ERROR] Failed to create async fence %d: %d\n", i, result);
357 /* Clean up any fences already created. */
358 for (int j = 0; j < i; j++) {
359 vkDestroyFence(ctx->device, ctx->async_fences[j], NULL);
360 }
361 vkDestroyFence(ctx->device, ctx->fence, NULL);
362 vkDestroyCommandPool(ctx->device, ctx->command_pool, NULL);
363 vkDestroyDevice(ctx->device, NULL);
364 vkDestroyInstance(ctx->instance, NULL);
365 free(ctx);
366 return NULL;
367 }
368 }
369 ctx->next_async_slot = 0;
370
371 printf("[VKC] Initialization complete (async pipeline depth: %d)\n", VKC_ASYNC_PIPELINE_DEPTH);
372 return ctx;
373}
374
375/* }}} */
376
377/* {{{ Cleanup: vkc_destroy
378 */
379
380void vkc_destroy(VkComputeContext* ctx) {
381 if (!ctx) return;
382
383 vkDeviceWaitIdle(ctx->device);
384
385 /* 9-014: destroy async pipeline fences. The command buffers are freed
386 * implicitly when the command pool is destroyed below. */
387 for (int i = 0; i < VKC_ASYNC_PIPELINE_DEPTH; i++) {
388 if (ctx->async_fences[i] != VK_NULL_HANDLE) {
389 vkDestroyFence(ctx->device, ctx->async_fences[i], NULL);
390 }
391 }
392
393 if (ctx->fence != VK_NULL_HANDLE) {
394 vkDestroyFence(ctx->device, ctx->fence, NULL);
395 }
396
397 if (ctx->command_pool != VK_NULL_HANDLE) {
398 vkDestroyCommandPool(ctx->device, ctx->command_pool, NULL);
399 }
400
401 if (ctx->device != VK_NULL_HANDLE) {
402 vkDestroyDevice(ctx->device, NULL);
403 }
404
405 if (ctx->instance != VK_NULL_HANDLE) {
406 vkDestroyInstance(ctx->instance, NULL);
407 }
408
409 free(ctx);
410}
411
412/* }}} */
413
414/* {{{ Progress rendering
415 *
416 * Mode is resolved once and cached: neither the TTY-ness of stdout nor the
417 * VKC_DEBUG environment flag changes mid-run. See vk_compute.h for the three
418 * behaviours (verbose / animated / quiet).
419 */
420
421enum { VKC_PROGRESS_QUIET = 0, VKC_PROGRESS_BAR = 1, VKC_PROGRESS_VERBOSE = 2 };
422
423/* Public so Lua callers (e.g. the diversity chunk loop) can throttle updates
424 * by mode: animate every step on a TTY, but only emit occasional plain lines
425 * when verbose, to keep a thousand-chunk run from flooding the log. */
426int vkc_progress_mode(void) {
427 static int mode = -1;
428 if (mode == -1) {
429 /* --debug wins: a frozen run is exactly when you want every line on
430 * durable disk, not a single overwriting bar that loses its history. */
431 if (getenv("VKC_DEBUG") != NULL) {
432 mode = VKC_PROGRESS_VERBOSE;
433 } else if (isatty(STDOUT_FILENO)) {
434 mode = VKC_PROGRESS_BAR;
435 } else {
436 mode = VKC_PROGRESS_QUIET;
437 }
438 }
439 return mode;
440}
441
442void vkc_progress_update_ex(const char* label, uint64_t current, uint64_t total,
443 const char* suffix) {
444 int mode = vkc_progress_mode();
445 if (mode == VKC_PROGRESS_QUIET) return;
446
447 double frac = total > 0 ? (double)current / (double)total : 1.0;
448 if (frac > 1.0) frac = 1.0; /* callers may overshoot (e.g. claimed-task counters) */
449
450 if (mode == VKC_PROGRESS_VERBOSE) {
451 /* Plain, newline-terminated: log-friendly, no carriage returns. */
452 printf("%s %llu/%llu (%.0f%%)%s%s\n", label,
453 (unsigned long long)current, (unsigned long long)total, frac * 100.0,
454 suffix ? " " : "", suffix ? suffix : "");
455 fflush(stdout);
456 return;
457 }
458
459 /* Animated bar: overwrite one line. █ = done, ░ = pending. The trailing
460 * spaces clear any leftover tail from a previous, longer suffix (ETA
461 * strings shrink as a run finishes). */
462 const int bar_width = 40;
463 int filled = (int)(frac * bar_width);
464 printf("\r%s [", label);
465 for (int i = 0; i < bar_width; i++) {
466 fputs(i < filled ? "█" : "░", stdout);
467 }
468 printf("] %llu/%llu (%3.0f%%)%s%s ",
469 (unsigned long long)current, (unsigned long long)total, frac * 100.0,
470 suffix ? " " : "", suffix ? suffix : "");
471 fflush(stdout);
472}
473
474void vkc_progress_update(const char* label, uint64_t current, uint64_t total) {
475 vkc_progress_update_ex(label, current, total, NULL);
476}
477
478void vkc_progress_finish(void) {
479 /* Only the animated bar leaves the cursor mid-line; close it. Verbose and
480 * quiet modes already ended their output with (or without) a newline. */
481 if (vkc_progress_mode() == VKC_PROGRESS_BAR) {
482 putchar('\n');
483 fflush(stdout);
484 }
485}
486
487/* }}} */
488
489/* {{{ Device info functions
490 */
491
492const char* vkc_get_device_name(VkComputeContext* ctx) {
493 return ctx ? ctx->device_properties.deviceName : "Unknown";
494}
495
496uint32_t vkc_get_max_workgroup_size(VkComputeContext* ctx) {
497 return ctx ? ctx->device_properties.limits.maxComputeWorkGroupSize[0] : 0;
498}
499
500uint64_t vkc_get_device_memory(VkComputeContext* ctx) {
501 if (!ctx) return 0;
502
503 uint64_t total = 0;
504 for (uint32_t i = 0; i < ctx->memory_properties.memoryHeapCount; i++) {
505 if (ctx->memory_properties.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
506 total += ctx->memory_properties.memoryHeaps[i].size;
507 }
508 }
509 return total;
510}
511
512/* }}} */
513
514/* NOTE: Buffer management, pipeline management, and command execution
515 * functions will be implemented in the next iteration to keep file manageable.
516 * For now, these return placeholder values.
517 */
518
519/* {{{ Buffer management
520 */
521
522VkComputeBuffer* vkc_create_buffer(VkComputeContext* ctx, size_t size, VkComputeBufferType type) {
523 if (!ctx || size == 0) return NULL;
524
525 VkComputeBuffer* buffer = calloc(1, sizeof(VkComputeBuffer));
526 if (!buffer) return NULL;
527
528 buffer->internal.size = size;
529 buffer->internal.type = type;
530
531 /* Determine buffer usage and memory properties */
532 VkBufferUsageFlags usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
533 VkMemoryPropertyFlags memory_props;
534
535 switch (type) {
536 case VKC_BUFFER_DEVICE_LOCAL:
537 usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
538 memory_props = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
539 break;
540
541 case VKC_BUFFER_HOST_VISIBLE:
542 memory_props = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
543 VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
544 break;
545
546 case VKC_BUFFER_STAGING:
547 usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
548 memory_props = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
549 VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
550 break;
551
552 default:
553 free(buffer);
554 return NULL;
555 }
556
557 /* Create buffer */
558 VkBufferCreateInfo buffer_info = {
559 .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
560 .size = size,
561 .usage = usage,
562 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
563 };
564
565 VkResult result = vkCreateBuffer(ctx->device, &buffer_info, NULL, &buffer->internal.buffer);
566 if (result != VK_SUCCESS) {
567 fprintf(stderr, "[VKC ERROR] Failed to create buffer: %d\n", result);
568 free(buffer);
569 return NULL;
570 }
571
572 /* Allocate memory */
573 VkMemoryRequirements mem_reqs;
574 vkGetBufferMemoryRequirements(ctx->device, buffer->internal.buffer, &mem_reqs);
575
576 uint32_t memory_type = find_memory_type(ctx, mem_reqs.memoryTypeBits, memory_props);
577 if (memory_type == UINT32_MAX) {
578 vkDestroyBuffer(ctx->device, buffer->internal.buffer, NULL);
579 free(buffer);
580 return NULL;
581 }
582
583 VkMemoryAllocateInfo alloc_info = {
584 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
585 .allocationSize = mem_reqs.size,
586 .memoryTypeIndex = memory_type,
587 };
588
589 result = vkAllocateMemory(ctx->device, &alloc_info, NULL, &buffer->internal.memory);
590 if (result != VK_SUCCESS) {
591 fprintf(stderr, "[VKC ERROR] Failed to allocate buffer memory: %d\n", result);
592 vkDestroyBuffer(ctx->device, buffer->internal.buffer, NULL);
593 free(buffer);
594 return NULL;
595 }
596
597 /* Bind memory to buffer */
598 vkBindBufferMemory(ctx->device, buffer->internal.buffer, buffer->internal.memory, 0);
599
600 return buffer;
601}
602
603void vkc_destroy_buffer(VkComputeContext* ctx, VkComputeBuffer* buffer) {
604 if (!ctx || !buffer) return;
605
606 vkDestroyBuffer(ctx->device, buffer->internal.buffer, NULL);
607 vkFreeMemory(ctx->device, buffer->internal.memory, NULL);
608 free(buffer);
609}
610
611VkComputeResult vkc_upload_buffer(VkComputeContext* ctx, VkComputeBuffer* buffer,
612 const void* data, size_t size) {
613 if (!ctx || !buffer || !data || size == 0) {
614 return VKC_ERROR_BUFFER_CREATION_FAILED;
615 }
616
617 if (size > buffer->internal.size) {
618 fprintf(stderr, "[VKC ERROR] Upload size exceeds buffer size\n");
619 return VKC_ERROR_BUFFER_CREATION_FAILED;
620 }
621
622 /* For host-visible buffers, map and copy directly */
623 if (buffer->internal.type == VKC_BUFFER_HOST_VISIBLE ||
624 buffer->internal.type == VKC_BUFFER_STAGING) {
625 void* mapped;
626 VkResult result = vkMapMemory(ctx->device, buffer->internal.memory, 0, size, 0, &mapped);
627 if (result != VK_SUCCESS) {
628 fprintf(stderr, "[VKC ERROR] Failed to map buffer memory: %d\n", result);
629 return VKC_ERROR_BUFFER_CREATION_FAILED;
630 }
631
632 memcpy(mapped, data, size);
633 vkUnmapMemory(ctx->device, buffer->internal.memory);
634 return VKC_SUCCESS;
635 }
636
637 /* For device-local buffers, use staging buffer */
638 VkComputeBuffer* staging = vkc_create_buffer(ctx, size, VKC_BUFFER_STAGING);
639 if (!staging) {
640 return VKC_ERROR_BUFFER_CREATION_FAILED;
641 }
642
643 /* Upload to staging buffer */
644 VkComputeResult upload_result = vkc_upload_buffer(ctx, staging, data, size);
645 if (upload_result != VKC_SUCCESS) {
646 vkc_destroy_buffer(ctx, staging);
647 return upload_result;
648 }
649
650 /* Copy from staging to device buffer */
651 VkCommandBufferBeginInfo begin_info = {
652 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
653 .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
654 };
655
656 vkBeginCommandBuffer(ctx->command_buffer, &begin_info);
657
658 VkBufferCopy copy_region = {
659 .srcOffset = 0,
660 .dstOffset = 0,
661 .size = size,
662 };
663
664 vkCmdCopyBuffer(ctx->command_buffer, staging->internal.buffer,
665 buffer->internal.buffer, 1, &copy_region);
666
667 vkEndCommandBuffer(ctx->command_buffer);
668
669 /* Submit and wait */
670 VkSubmitInfo submit_info = {
671 .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
672 .commandBufferCount = 1,
673 .pCommandBuffers = &ctx->command_buffer,
674 };
675
676 vkResetFences(ctx->device, 1, &ctx->fence);
677 vkQueueSubmit(ctx->compute_queue, 1, &submit_info, ctx->fence);
678 vkWaitForFences(ctx->device, 1, &ctx->fence, VK_TRUE, UINT64_MAX);
679
680 vkc_destroy_buffer(ctx, staging);
681 return VKC_SUCCESS;
682}
683
684VkComputeResult vkc_download_buffer(VkComputeContext* ctx, VkComputeBuffer* buffer,
685 void* data, size_t size) {
686 if (!ctx || !buffer || !data || size == 0) {
687 return VKC_ERROR_BUFFER_CREATION_FAILED;
688 }
689
690 if (size > buffer->internal.size) {
691 fprintf(stderr, "[VKC ERROR] Download size exceeds buffer size\n");
692 return VKC_ERROR_BUFFER_CREATION_FAILED;
693 }
694
695 /* For host-visible buffers, map and copy directly */
696 if (buffer->internal.type == VKC_BUFFER_HOST_VISIBLE ||
697 buffer->internal.type == VKC_BUFFER_STAGING) {
698 void* mapped;
699 VkResult result = vkMapMemory(ctx->device, buffer->internal.memory, 0, size, 0, &mapped);
700 if (result != VK_SUCCESS) {
701 fprintf(stderr, "[VKC ERROR] Failed to map buffer memory: %d\n", result);
702 return VKC_ERROR_BUFFER_CREATION_FAILED;
703 }
704
705 memcpy(data, mapped, size);
706 vkUnmapMemory(ctx->device, buffer->internal.memory);
707 return VKC_SUCCESS;
708 }
709
710 /* For device-local buffers, use staging buffer */
711 VkComputeBuffer* staging = vkc_create_buffer(ctx, size, VKC_BUFFER_STAGING);
712 if (!staging) {
713 return VKC_ERROR_BUFFER_CREATION_FAILED;
714 }
715
716 /* Copy from device buffer to staging */
717 VkCommandBufferBeginInfo begin_info = {
718 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
719 .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
720 };
721
722 vkBeginCommandBuffer(ctx->command_buffer, &begin_info);
723
724 VkBufferCopy copy_region = {
725 .srcOffset = 0,
726 .dstOffset = 0,
727 .size = size,
728 };
729
730 vkCmdCopyBuffer(ctx->command_buffer, buffer->internal.buffer,
731 staging->internal.buffer, 1, &copy_region);
732
733 vkEndCommandBuffer(ctx->command_buffer);
734
735 /* Submit and wait */
736 VkSubmitInfo submit_info = {
737 .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
738 .commandBufferCount = 1,
739 .pCommandBuffers = &ctx->command_buffer,
740 };
741
742 vkResetFences(ctx->device, 1, &ctx->fence);
743 vkQueueSubmit(ctx->compute_queue, 1, &submit_info, ctx->fence);
744 vkWaitForFences(ctx->device, 1, &ctx->fence, VK_TRUE, UINT64_MAX);
745
746 /* Download from staging buffer */
747 VkComputeResult download_result = vkc_download_buffer(ctx, staging, data, size);
748 vkc_destroy_buffer(ctx, staging);
749
750 return download_result;
751}
752
753/* }}} */
754
755/* {{{ Helper: Load SPIR-V shader
756 */
757
758static VkShaderModule load_shader_module(VkComputeContext* ctx, const char* path) {
759 FILE* file = fopen(path, "rb");
760 if (!file) {
761 fprintf(stderr, "[VKC ERROR] Failed to open shader file: %s\n", path);
762 return VK_NULL_HANDLE;
763 }
764
765 /* Get file size */
766 fseek(file, 0, SEEK_END);
767 long file_size = ftell(file);
768 fseek(file, 0, SEEK_SET);
769
770 if (file_size <= 0 || file_size % 4 != 0) {
771 fprintf(stderr, "[VKC ERROR] Invalid SPIR-V file size: %ld\n", file_size);
772 fclose(file);
773 return VK_NULL_HANDLE;
774 }
775
776 /* Read shader code */
777 uint32_t* code = malloc(file_size);
778 if (!code) {
779 fclose(file);
780 return VK_NULL_HANDLE;
781 }
782
783 size_t bytes_read = fread(code, 1, file_size, file);
784 fclose(file);
785
786 if (bytes_read != (size_t)file_size) {
787 fprintf(stderr, "[VKC ERROR] Failed to read shader file\n");
788 free(code);
789 return VK_NULL_HANDLE;
790 }
791
792 /* Create shader module */
793 VkShaderModuleCreateInfo module_info = {
794 .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
795 .codeSize = file_size,
796 .pCode = code,
797 };
798
799 VkShaderModule shader_module;
800 VkResult result = vkCreateShaderModule(ctx->device, &module_info, NULL, &shader_module);
801
802 free(code);
803
804 if (result != VK_SUCCESS) {
805 fprintf(stderr, "[VKC ERROR] Failed to create shader module: %d\n", result);
806 return VK_NULL_HANDLE;
807 }
808
809 return shader_module;
810}
811
812/* }}} */
813
814/* {{{ Pipeline management
815 */
816
817VkComputePipeline* vkc_create_pipeline(VkComputeContext* ctx, const char* shader_path,
818 uint32_t push_constant_size) {
819 if (!ctx || !shader_path) return NULL;
820
821 VkComputePipeline* pipeline = calloc(1, sizeof(VkComputePipeline));
822 if (!pipeline) return NULL;
823
824 pipeline->internal.push_constant_size = push_constant_size;
825 pipeline->internal.num_bindings = MAX_DESCRIPTOR_SETS;
826
827 /* Load shader module */
828 pipeline->internal.shader = load_shader_module(ctx, shader_path);
829 if (pipeline->internal.shader == VK_NULL_HANDLE) {
830 free(pipeline);
831 return NULL;
832 }
833
834 /* Create descriptor set layout (supports up to MAX_DESCRIPTOR_SETS storage buffers) */
835 VkDescriptorSetLayoutBinding bindings[MAX_DESCRIPTOR_SETS];
836 for (uint32_t i = 0; i < MAX_DESCRIPTOR_SETS; i++) {
837 bindings[i].binding = i;
838 bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
839 bindings[i].descriptorCount = 1;
840 bindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
841 bindings[i].pImmutableSamplers = NULL;
842 }
843
844 VkDescriptorSetLayoutCreateInfo layout_info = {
845 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
846 .bindingCount = MAX_DESCRIPTOR_SETS,
847 .pBindings = bindings,
848 };
849
850 VkResult result = vkCreateDescriptorSetLayout(ctx->device, &layout_info, NULL,
851 &pipeline->internal.desc_set_layout);
852 if (result != VK_SUCCESS) {
853 vkDestroyShaderModule(ctx->device, pipeline->internal.shader, NULL);
854 free(pipeline);
855 return NULL;
856 }
857
858 /* Create pipeline layout with push constants */
859 VkPushConstantRange push_constant_range = {
860 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
861 .offset = 0,
862 .size = push_constant_size,
863 };
864
865 VkPipelineLayoutCreateInfo pipeline_layout_info = {
866 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
867 .setLayoutCount = 1,
868 .pSetLayouts = &pipeline->internal.desc_set_layout,
869 .pushConstantRangeCount = push_constant_size > 0 ? 1 : 0,
870 .pPushConstantRanges = push_constant_size > 0 ? &push_constant_range : NULL,
871 };
872
873 result = vkCreatePipelineLayout(ctx->device, &pipeline_layout_info, NULL,
874 &pipeline->internal.layout);
875 if (result != VK_SUCCESS) {
876 vkDestroyDescriptorSetLayout(ctx->device, pipeline->internal.desc_set_layout, NULL);
877 vkDestroyShaderModule(ctx->device, pipeline->internal.shader, NULL);
878 free(pipeline);
879 return NULL;
880 }
881
882 /* Create compute pipeline */
883 VkPipelineShaderStageCreateInfo shader_stage_info = {
884 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
885 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
886 .module = pipeline->internal.shader,
887 .pName = "main",
888 };
889
890 VkComputePipelineCreateInfo pipeline_info = {
891 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
892 .stage = shader_stage_info,
893 .layout = pipeline->internal.layout,
894 };
895
896 result = vkCreateComputePipelines(ctx->device, VK_NULL_HANDLE, 1, &pipeline_info,
897 NULL, &pipeline->internal.pipeline);
898 if (result != VK_SUCCESS) {
899 vkDestroyPipelineLayout(ctx->device, pipeline->internal.layout, NULL);
900 vkDestroyDescriptorSetLayout(ctx->device, pipeline->internal.desc_set_layout, NULL);
901 vkDestroyShaderModule(ctx->device, pipeline->internal.shader, NULL);
902 free(pipeline);
903 return NULL;
904 }
905
906 /* Create descriptor pool */
907 VkDescriptorPoolSize pool_size = {
908 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
909 .descriptorCount = MAX_DESCRIPTOR_SETS,
910 };
911
912 VkDescriptorPoolCreateInfo pool_info = {
913 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
914 .poolSizeCount = 1,
915 .pPoolSizes = &pool_size,
916 .maxSets = 1,
917 };
918
919 result = vkCreateDescriptorPool(ctx->device, &pool_info, NULL,
920 &pipeline->internal.desc_pool);
921 if (result != VK_SUCCESS) {
922 vkDestroyPipeline(ctx->device, pipeline->internal.pipeline, NULL);
923 vkDestroyPipelineLayout(ctx->device, pipeline->internal.layout, NULL);
924 vkDestroyDescriptorSetLayout(ctx->device, pipeline->internal.desc_set_layout, NULL);
925 vkDestroyShaderModule(ctx->device, pipeline->internal.shader, NULL);
926 free(pipeline);
927 return NULL;
928 }
929
930 /* Allocate descriptor set */
931 VkDescriptorSetAllocateInfo alloc_info = {
932 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
933 .descriptorPool = pipeline->internal.desc_pool,
934 .descriptorSetCount = 1,
935 .pSetLayouts = &pipeline->internal.desc_set_layout,
936 };
937
938 result = vkAllocateDescriptorSets(ctx->device, &alloc_info,
939 &pipeline->internal.desc_set);
940 if (result != VK_SUCCESS) {
941 vkDestroyDescriptorPool(ctx->device, pipeline->internal.desc_pool, NULL);
942 vkDestroyPipeline(ctx->device, pipeline->internal.pipeline, NULL);
943 vkDestroyPipelineLayout(ctx->device, pipeline->internal.layout, NULL);
944 vkDestroyDescriptorSetLayout(ctx->device, pipeline->internal.desc_set_layout, NULL);
945 vkDestroyShaderModule(ctx->device, pipeline->internal.shader, NULL);
946 free(pipeline);
947 return NULL;
948 }
949
950 return pipeline;
951}
952
953void vkc_destroy_pipeline(VkComputeContext* ctx, VkComputePipeline* pipeline) {
954 if (!ctx || !pipeline) return;
955
956 vkDestroyDescriptorPool(ctx->device, pipeline->internal.desc_pool, NULL);
957 vkDestroyPipeline(ctx->device, pipeline->internal.pipeline, NULL);
958 vkDestroyPipelineLayout(ctx->device, pipeline->internal.layout, NULL);
959 vkDestroyDescriptorSetLayout(ctx->device, pipeline->internal.desc_set_layout, NULL);
960 vkDestroyShaderModule(ctx->device, pipeline->internal.shader, NULL);
961 free(pipeline);
962}
963
964VkComputeResult vkc_bind_buffer(VkComputeContext* ctx, VkComputePipeline* pipeline,
965 uint32_t binding, VkComputeBuffer* buffer) {
966 if (!ctx || !pipeline || !buffer) {
967 return VKC_ERROR_PIPELINE_CREATION_FAILED;
968 }
969
970 if (binding >= MAX_DESCRIPTOR_SETS) {
971 fprintf(stderr, "[VKC ERROR] Binding index %u exceeds maximum %d\n",
972 binding, MAX_DESCRIPTOR_SETS);
973 return VKC_ERROR_PIPELINE_CREATION_FAILED;
974 }
975
976 VkDescriptorBufferInfo buffer_info = {
977 .buffer = buffer->internal.buffer,
978 .offset = 0,
979 .range = VK_WHOLE_SIZE,
980 };
981
982 VkWriteDescriptorSet descriptor_write = {
983 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
984 .dstSet = pipeline->internal.desc_set,
985 .dstBinding = binding,
986 .dstArrayElement = 0,
987 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
988 .descriptorCount = 1,
989 .pBufferInfo = &buffer_info,
990 };
991
992 vkUpdateDescriptorSets(ctx->device, 1, &descriptor_write, 0, NULL);
993
994 return VKC_SUCCESS;
995}
996
997VkComputeResult vkc_dispatch(VkComputeContext* ctx, VkComputePipeline* pipeline,
998 uint32_t x, uint32_t y, uint32_t z,
999 const void* push_constants) {
1000 if (!ctx || !pipeline) {
1001 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1002 }
1003
1004 /* Begin command buffer */
1005 VkCommandBufferBeginInfo begin_info = {
1006 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
1007 .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
1008 };
1009
1010 VkResult result = vkBeginCommandBuffer(ctx->command_buffer, &begin_info);
1011 if (result != VK_SUCCESS) {
1012 fprintf(stderr, "[VKC ERROR] Failed to begin command buffer: %d\n", result);
1013 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1014 }
1015
1016 /* Bind pipeline */
1017 vkCmdBindPipeline(ctx->command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
1018 pipeline->internal.pipeline);
1019
1020 /* Bind descriptor sets */
1021 vkCmdBindDescriptorSets(ctx->command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE,
1022 pipeline->internal.layout, 0, 1,
1023 &pipeline->internal.desc_set, 0, NULL);
1024
1025 /* Push constants if provided */
1026 if (push_constants && pipeline->internal.push_constant_size > 0) {
1027 vkCmdPushConstants(ctx->command_buffer, pipeline->internal.layout,
1028 VK_SHADER_STAGE_COMPUTE_BIT, 0,
1029 pipeline->internal.push_constant_size, push_constants);
1030 }
1031
1032 /* Dispatch compute shader */
1033 vkCmdDispatch(ctx->command_buffer, x, y, z);
1034
1035 /* End command buffer */
1036 result = vkEndCommandBuffer(ctx->command_buffer);
1037 if (result != VK_SUCCESS) {
1038 fprintf(stderr, "[VKC ERROR] Failed to end command buffer: %d\n", result);
1039 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1040 }
1041
1042 /* Submit command buffer */
1043 VkSubmitInfo submit_info = {
1044 .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
1045 .commandBufferCount = 1,
1046 .pCommandBuffers = &ctx->command_buffer,
1047 };
1048
1049 vkResetFences(ctx->device, 1, &ctx->fence);
1050 result = vkQueueSubmit(ctx->compute_queue, 1, &submit_info, ctx->fence);
1051 if (result != VK_SUCCESS) {
1052 fprintf(stderr, "[VKC ERROR] Failed to submit command buffer: %d\n", result);
1053 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1054 }
1055
1056 /* Wait for completion */
1057 result = vkWaitForFences(ctx->device, 1, &ctx->fence, VK_TRUE, UINT64_MAX);
1058 if (result != VK_SUCCESS) {
1059 fprintf(stderr, "[VKC ERROR] Failed to wait for fence: %d\n", result);
1060 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1061 }
1062
1063 return VKC_SUCCESS;
1064}
1065
1066VkComputeResult vkc_wait_idle(VkComputeContext* ctx) {
1067 if (!ctx) return VKC_ERROR_INIT_FAILED;
1068 vkDeviceWaitIdle(ctx->device);
1069 return VKC_SUCCESS;
1070}
1071
1072/* {{{ vkc_dispatch_async
1073 *
1074 * 9-014 pipelining: same shape as vkc_dispatch but does not wait for the
1075 * dispatched work to finish before returning. Instead, it waits for the
1076 * NEXT slot's previous submission to finish (which is generally already
1077 * done by the time the CPU has finished recording this one), so the CPU
1078 * can return immediately to record dispatch N+2 while the GPU is still
1079 * working on dispatch N+1.
1080 *
1081 * Memory dependencies between successive dispatches are expressed via
1082 * vkCmdPipelineBarrier inside the recorded command buffer (see callers
1083 * in vk_diversity.c). Vulkan does not give us implicit storage-buffer
1084 * synchronization between dispatches; the barrier is mandatory if
1085 * dispatch K+1 reads what dispatch K wrote.
1086 */
1087VkComputeResult vkc_dispatch_async(VkComputeContext* ctx, VkComputePipeline* pipeline,
1088 uint32_t x, uint32_t y, uint32_t z,
1089 const void* push_constants) {
1090 if (!ctx || !pipeline) {
1091 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1092 }
1093
1094 uint32_t slot = ctx->next_async_slot;
1095 VkCommandBuffer cmd_buf = ctx->async_cmd_buffers[slot];
1096 VkFence fence = ctx->async_fences[slot];
1097
1098 /* Wait for this slot's previous use to finish. Initially the fences are
1099 * created in the signaled state so the first VKC_ASYNC_PIPELINE_DEPTH
1100 * dispatches return without blocking. After that, this wait IS the
1101 * pipelining sync: it blocks the CPU only if the GPU is more than N
1102 * dispatches behind, which is the depth-limit of the pipeline. */
1103 VkResult result = vkWaitForFences(ctx->device, 1, &fence, VK_TRUE, UINT64_MAX);
1104 if (result != VK_SUCCESS) {
1105 fprintf(stderr, "[VKC ERROR] Failed to wait for async fence %u: %d\n", slot, result);
1106 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1107 }
1108 vkResetFences(ctx->device, 1, &fence);
1109
1110 /* Reset and re-record this slot's command buffer. */
1111 vkResetCommandBuffer(cmd_buf, 0);
1112
1113 VkCommandBufferBeginInfo begin_info = {
1114 .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
1115 .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
1116 };
1117 result = vkBeginCommandBuffer(cmd_buf, &begin_info);
1118 if (result != VK_SUCCESS) {
1119 fprintf(stderr, "[VKC ERROR] Failed to begin async command buffer: %d\n", result);
1120 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1121 }
1122
1123 /* Insert a compute-to-compute memory barrier at the START of this
1124 * dispatch's command buffer. Any storage-buffer writes from previously-
1125 * submitted dispatches on the same queue must be visible to this
1126 * dispatch's reads. Without this barrier the GPU is free to overlap
1127 * the two dispatches, which would race on the running_max buffer. */
1128 VkMemoryBarrier mem_barrier = {
1129 .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
1130 .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
1131 .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
1132 };
1133 vkCmdPipelineBarrier(cmd_buf,
1134 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1135 VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
1136 0,
1137 1, &mem_barrier,
1138 0, NULL,
1139 0, NULL);
1140
1141 vkCmdBindPipeline(cmd_buf, VK_PIPELINE_BIND_POINT_COMPUTE,
1142 pipeline->internal.pipeline);
1143
1144 vkCmdBindDescriptorSets(cmd_buf, VK_PIPELINE_BIND_POINT_COMPUTE,
1145 pipeline->internal.layout, 0, 1,
1146 &pipeline->internal.desc_set, 0, NULL);
1147
1148 if (push_constants && pipeline->internal.push_constant_size > 0) {
1149 vkCmdPushConstants(cmd_buf, pipeline->internal.layout,
1150 VK_SHADER_STAGE_COMPUTE_BIT, 0,
1151 pipeline->internal.push_constant_size, push_constants);
1152 }
1153
1154 vkCmdDispatch(cmd_buf, x, y, z);
1155
1156 result = vkEndCommandBuffer(cmd_buf);
1157 if (result != VK_SUCCESS) {
1158 fprintf(stderr, "[VKC ERROR] Failed to end async command buffer: %d\n", result);
1159 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1160 }
1161
1162 VkSubmitInfo submit_info = {
1163 .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
1164 .commandBufferCount = 1,
1165 .pCommandBuffers = &cmd_buf,
1166 };
1167
1168 result = vkQueueSubmit(ctx->compute_queue, 1, &submit_info, fence);
1169 if (result != VK_SUCCESS) {
1170 fprintf(stderr, "[VKC ERROR] Failed to submit async command buffer: %d\n", result);
1171 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1172 }
1173
1174 /* Advance to the next slot. The OLD slot's fence will be signaled by the
1175 * GPU when its work finishes, so the next time we cycle back to it the
1176 * wait at the top of this function will be very fast (probably zero). */
1177 ctx->next_async_slot = (slot + 1) % VKC_ASYNC_PIPELINE_DEPTH;
1178
1179 return VKC_SUCCESS;
1180}
1181/* }}} */
1182
1183/* {{{ vkc_wait_async_all
1184 *
1185 * Drains the async pipeline by waiting on every slot's fence. Called at
1186 * the end of a batch / chunk to ensure all submitted work has completed
1187 * before the caller reads the output buffers.
1188 */
1189VkComputeResult vkc_wait_async_all(VkComputeContext* ctx) {
1190 if (!ctx) return VKC_ERROR_INIT_FAILED;
1191
1192 VkResult result = vkWaitForFences(ctx->device,
1193 VKC_ASYNC_PIPELINE_DEPTH,
1194 ctx->async_fences,
1195 VK_TRUE,
1196 UINT64_MAX);
1197 if (result != VK_SUCCESS) {
1198 fprintf(stderr, "[VKC ERROR] Failed to wait for async fences: %d\n", result);
1199 return VKC_ERROR_COMMAND_EXECUTION_FAILED;
1200 }
1201
1202 return VKC_SUCCESS;
1203}
1204/* }}} */
1205
1206/* }}} */
1207
1208/* {{{ FP16 / FP32 conversion helpers
1209 *
1210 * IEEE 754 binary16 has 1 sign bit, 5 exponent bits (bias 15), and 10
1211 * fraction bits. The conversion to/from binary32 (FP32, bias 127) is
1212 * straightforward bit manipulation. We do not bother with FP16 subnormal
1213 * support: subnormals round to zero. For cosine-distance ranking of
1214 * embedding vectors, the dynamic range of |x| is essentially always
1215 * in [1e-4, 1] and the few values that would underflow to subnormal in
1216 * FP16 do not change the ranking of candidates.
1217 */
1218
1219void vkc_fp32_to_fp16(const float* src, uint16_t* dst, uint32_t count) {
1220 /* The bit-manipulation algorithm is the standard "fast half" used in
1221 * cuBLAS, FlashAttention, and most ML inference runtimes. Round-to-
1222 * nearest-even is implemented by adding the rounding bias (0x1000)
1223 * before truncating the mantissa to 10 bits, then handling the rare
1224 * mantissa-overflow case by bumping the exponent. */
1225 for (uint32_t i = 0; i < count; i++) {
1226 uint32_t bits;
1227 memcpy(&bits, &src[i], sizeof(bits));
1228
1229 uint16_t sign = (uint16_t)((bits >> 16) & 0x8000u);
1230 int32_t exp = (int32_t)((bits >> 23) & 0xffu) - 127;
1231 uint32_t frac = bits & 0x7fffffu;
1232
1233 uint16_t out;
1234 if (exp == 128) {
1235 /* Inf or NaN: preserve sign, force exponent to all-ones, and
1236 * keep at least one mantissa bit if the input was NaN. */
1237 out = (uint16_t)(sign | 0x7c00u | (frac ? 1u : 0u));
1238 } else if (exp >= 16) {
1239 /* Overflow to inf. */
1240 out = (uint16_t)(sign | 0x7c00u);
1241 } else if (exp >= -14) {
1242 /* Normal range. Round mantissa to 10 bits, handle overflow. */
1243 uint32_t rounded = frac + 0x1000u;
1244 uint32_t new_frac = rounded >> 13;
1245 int32_t new_exp = exp;
1246 if (new_frac & 0x400u) {
1247 /* Mantissa overflow rolled into the implicit bit; bump exp. */
1248 new_frac = 0;
1249 new_exp += 1;
1250 }
1251 if (new_exp >= 16) {
1252 out = (uint16_t)(sign | 0x7c00u);
1253 } else {
1254 out = (uint16_t)(sign | (uint16_t)((new_exp + 15) << 10) | (uint16_t)(new_frac & 0x3ffu));
1255 }
1256 } else {
1257 /* Underflow: round to zero (subnormal support skipped). */
1258 out = sign;
1259 }
1260 dst[i] = out;
1261 }
1262}
1263
1264float vkc_fp16_to_fp32(uint16_t bits) {
1265 /* Inverse of the above, with subnormal renormalization. Used per
1266 * embedding slot when materializing the FP32 initial centroid; not
1267 * on a hot path so we can afford to be careful. */
1268 uint32_t sign = ((uint32_t)bits & 0x8000u) << 16;
1269 uint32_t exp = (bits >> 10) & 0x1fu;
1270 uint32_t frac = bits & 0x3ffu;
1271
1272 uint32_t result;
1273 if (exp == 0) {
1274 if (frac == 0) {
1275 result = sign; /* signed zero */
1276 } else {
1277 /* Subnormal: renormalize by shifting until the implicit bit
1278 * appears, decrementing the exponent each time. */
1279 int32_t e = 1;
1280 while ((frac & 0x400u) == 0) {
1281 frac <<= 1;
1282 e -= 1;
1283 }
1284 frac &= 0x3ffu;
1285 result = sign | ((uint32_t)(e + 127 - 15) << 23) | (frac << 13);
1286 }
1287 } else if (exp == 0x1f) {
1288 /* Inf or NaN. */
1289 result = sign | 0x7f800000u | (frac << 13);
1290 } else {
1291 /* Normal range: re-bias the exponent and left-pad the mantissa. */
1292 result = sign | ((exp + 127u - 15u) << 23) | (frac << 13);
1293 }
1294
1295 float f;
1296 memcpy(&f, &result, sizeof(f));
1297 return f;
1298}
1299
1300/* }}} */
1301