libs/vulkan-compute/include/vk_compute.h
1/* vk_compute.h - Vulkan Compute Wrapper for Poetry Embedding Operations
2 *
3 * This library provides a simplified interface to Vulkan compute shaders
4 * for accelerating vector operations on poem embeddings.
5 *
6 * Design Goals:
7 * - Minimal boilerplate for compute-only operations
8 * - LuaJIT FFI-friendly C interface
9 * - Automatic resource management
10 * - Validation layer support for debugging
11 */
12
13#ifndef VK_COMPUTE_H
14#define VK_COMPUTE_H
15
16#include <stdint.h>
17#include <stddef.h>
18#include <stdbool.h>
19
20#ifdef __cplusplus
21extern "C" {
22#endif
23
24/* Opaque handle types */
25typedef struct VkComputeContext VkComputeContext;
26typedef struct VkComputeBuffer VkComputeBuffer;
27typedef struct VkComputePipeline VkComputePipeline;
28
29/* Error codes */
30typedef enum {
31 VKC_SUCCESS = 0,
32 VKC_ERROR_INIT_FAILED = -1,
33 VKC_ERROR_NO_SUITABLE_DEVICE = -2,
34 VKC_ERROR_BUFFER_CREATION_FAILED = -3,
35 VKC_ERROR_SHADER_LOAD_FAILED = -4,
36 VKC_ERROR_PIPELINE_CREATION_FAILED = -5,
37 VKC_ERROR_COMMAND_EXECUTION_FAILED = -6,
38 VKC_ERROR_OUT_OF_MEMORY = -7,
39} VkComputeResult;
40
41/* Buffer types */
42typedef enum {
43 VKC_BUFFER_DEVICE_LOCAL, /* GPU-only memory (fastest) */
44 VKC_BUFFER_HOST_VISIBLE, /* CPU-GPU shared memory */
45 VKC_BUFFER_STAGING, /* Temporary transfer buffer */
46} VkComputeBufferType;
47
48/* Initialization and cleanup */
49VkComputeContext* vkc_init(bool enable_validation);
50void vkc_destroy(VkComputeContext* ctx);
51const char* vkc_get_error_string(VkComputeResult result);
52
53/* Device information */
54const char* vkc_get_device_name(VkComputeContext* ctx);
55uint32_t vkc_get_max_workgroup_size(VkComputeContext* ctx);
56uint64_t vkc_get_device_memory(VkComputeContext* ctx);
57
58/* Buffer management */
59VkComputeBuffer* vkc_create_buffer(VkComputeContext* ctx,
60 size_t size,
61 VkComputeBufferType type);
62void vkc_destroy_buffer(VkComputeContext* ctx, VkComputeBuffer* buffer);
63
64/* Data transfer */
65VkComputeResult vkc_upload_buffer(VkComputeContext* ctx,
66 VkComputeBuffer* buffer,
67 const void* data,
68 size_t size);
69VkComputeResult vkc_download_buffer(VkComputeContext* ctx,
70 VkComputeBuffer* buffer,
71 void* data,
72 size_t size);
73
74/* Shader and pipeline management */
75VkComputePipeline* vkc_create_pipeline(VkComputeContext* ctx,
76 const char* shader_path,
77 uint32_t push_constant_size);
78void vkc_destroy_pipeline(VkComputeContext* ctx, VkComputePipeline* pipeline);
79
80/* Descriptor binding */
81VkComputeResult vkc_bind_buffer(VkComputeContext* ctx,
82 VkComputePipeline* pipeline,
83 uint32_t binding,
84 VkComputeBuffer* buffer);
85
86/* Command execution */
87VkComputeResult vkc_dispatch(VkComputeContext* ctx,
88 VkComputePipeline* pipeline,
89 uint32_t group_count_x,
90 uint32_t group_count_y,
91 uint32_t group_count_z,
92 const void* push_constants);
93
94/* Synchronization */
95VkComputeResult vkc_wait_idle(VkComputeContext* ctx);
96
97/* 9-014 pipelining: async dispatch + drain. vkc_dispatch_async submits
98 * to a round-robin pool of command buffers, blocking only when the pool
99 * fills up (which lets the CPU stay ahead of the GPU by N submissions).
100 * Inserts an implicit compute-to-compute memory barrier so the next
101 * dispatch sees writes from the previous one. Drains via vkc_wait_async_all.
102 */
103VkComputeResult vkc_dispatch_async(VkComputeContext* ctx, VkComputePipeline* pipeline,
104 uint32_t x, uint32_t y, uint32_t z,
105 const void* push_constants);
106VkComputeResult vkc_wait_async_all(VkComputeContext* ctx);
107
108/* Bulk float -> half-precision conversion.
109 *
110 * Used to prepare the embeddings buffer for diversity_full.spv, which
111 * reads embeddings as packed FP16 (two values per uint via the
112 * unpackHalf2x16 builtin) to halve memory bandwidth.
113 *
114 * Caller provides src and dst arrays with `count` elements each. The
115 * conversion is IEEE 754 binary16 with round-to-nearest-even, but
116 * subnormals round to zero (acceptable for cosine-distance ranking).
117 */
118void vkc_fp32_to_fp16(const float* src, uint16_t* dst, uint32_t count);
119
120/* Single FP16 -> FP32 conversion. Used to materialize the per-batch
121 * initial centroids in FP32 from a FP16-encoded embedding seed.
122 */
123float vkc_fp16_to_fp32(uint16_t bits);
124
125/* {{{ Progress rendering (shared by the similarity + diversity stages)
126 *
127 * One in-place progress bar, three behaviours, picked once per run:
128 * - VKC_DEBUG set in the environment (run.sh --debug): verbose mode. Each
129 * update prints a plain, newline-terminated line so a redirected log keeps
130 * the full history of a (possibly frozen) run.
131 * - else stdout is a TTY: animated mode. Updates overwrite one line with a
132 * "\r" Unicode bar -- clean for an interactive watcher.
133 * - else (piped to a file, cron, no debug): quiet. Nothing is drawn, so logs
134 * do not fill with thousands of overwrite characters on one giant line.
135 *
136 * Callers update as often as they like (throttle large loops to ~100 calls);
137 * call vkc_progress_finish() once after the loop to close an animated line.
138 *
139 * vkc_progress_update_ex appends an optional suffix after the percentage
140 * (e.g. "30.3 iter/sec, ETA 15s") -- used by loops that have rate/ETA stats
141 * worth keeping when their scrolling lines collapse into one bar.
142 *
143 * vkc_progress_mode returns 0 = quiet, 1 = animated bar, 2 = verbose, so a
144 * caller can update every step in bar mode but throttle in verbose mode.
145 */
146void vkc_progress_update(const char* label, uint64_t current, uint64_t total);
147void vkc_progress_update_ex(const char* label, uint64_t current, uint64_t total,
148 const char* suffix);
149void vkc_progress_finish(void);
150int vkc_progress_mode(void);
151/* }}} */
152
153#ifdef __cplusplus
154}
155#endif
156
157#endif /* VK_COMPUTE_H */
158