libs/vulkan-compute/include/vk_compute.h

158 lines

1/* vk_compute.h - Vulkan Compute Wrapper for Poetry Embedding Operations

2 *

3 * This library provides a simplified interface to Vulkan compute shaders

4 * for accelerating vector operations on poem embeddings.

5 *

6 * Design Goals:

7 * - Minimal boilerplate for compute-only operations

8 * - LuaJIT FFI-friendly C interface

9 * - Automatic resource management

10 * - Validation layer support for debugging

11 */

13#ifndef VK_COMPUTE_H

14#define VK_COMPUTE_H

16#include <stdint.h>

17#include <stddef.h>

18#include <stdbool.h>

20#ifdef __cplusplus

21extern "C" {

22#endif

24/* Opaque handle types */

25typedef struct VkComputeContext VkComputeContext;

26typedef struct VkComputeBuffer VkComputeBuffer;

27typedef struct VkComputePipeline VkComputePipeline;

29/* Error codes */

30typedef enum {

31 VKC_SUCCESS = 0,

32 VKC_ERROR_INIT_FAILED = -1,

33 VKC_ERROR_NO_SUITABLE_DEVICE = -2,

34 VKC_ERROR_BUFFER_CREATION_FAILED = -3,

35 VKC_ERROR_SHADER_LOAD_FAILED = -4,

36 VKC_ERROR_PIPELINE_CREATION_FAILED = -5,

37 VKC_ERROR_COMMAND_EXECUTION_FAILED = -6,

38 VKC_ERROR_OUT_OF_MEMORY = -7,

39} VkComputeResult;

41/* Buffer types */

42typedef enum {

43 VKC_BUFFER_DEVICE_LOCAL, /* GPU-only memory (fastest) */

44 VKC_BUFFER_HOST_VISIBLE, /* CPU-GPU shared memory */

45 VKC_BUFFER_STAGING, /* Temporary transfer buffer */

46} VkComputeBufferType;

48/* Initialization and cleanup */

49VkComputeContext* vkc_init(bool enable_validation);

50void vkc_destroy(VkComputeContext* ctx);

51const char* vkc_get_error_string(VkComputeResult result);

53/* Device information */

54const char* vkc_get_device_name(VkComputeContext* ctx);

55uint32_t vkc_get_max_workgroup_size(VkComputeContext* ctx);

56uint64_t vkc_get_device_memory(VkComputeContext* ctx);

58/* Buffer management */

59VkComputeBuffer* vkc_create_buffer(VkComputeContext* ctx,

60 size_t size,

61 VkComputeBufferType type);

62void vkc_destroy_buffer(VkComputeContext* ctx, VkComputeBuffer* buffer);

64/* Data transfer */

65VkComputeResult vkc_upload_buffer(VkComputeContext* ctx,

66 VkComputeBuffer* buffer,

67 const void* data,

68 size_t size);

69VkComputeResult vkc_download_buffer(VkComputeContext* ctx,

70 VkComputeBuffer* buffer,

71 void* data,

72 size_t size);

74/* Shader and pipeline management */

75VkComputePipeline* vkc_create_pipeline(VkComputeContext* ctx,

76 const char* shader_path,

77 uint32_t push_constant_size);

78void vkc_destroy_pipeline(VkComputeContext* ctx, VkComputePipeline* pipeline);

80/* Descriptor binding */

81VkComputeResult vkc_bind_buffer(VkComputeContext* ctx,

82 VkComputePipeline* pipeline,

83 uint32_t binding,

84 VkComputeBuffer* buffer);

86/* Command execution */

87VkComputeResult vkc_dispatch(VkComputeContext* ctx,

88 VkComputePipeline* pipeline,

89 uint32_t group_count_x,

90 uint32_t group_count_y,

91 uint32_t group_count_z,

92 const void* push_constants);

94/* Synchronization */

95VkComputeResult vkc_wait_idle(VkComputeContext* ctx);

97/* 9-014 pipelining: async dispatch + drain. vkc_dispatch_async submits

98 * to a round-robin pool of command buffers, blocking only when the pool

99 * fills up (which lets the CPU stay ahead of the GPU by N submissions).

100 * Inserts an implicit compute-to-compute memory barrier so the next

101 * dispatch sees writes from the previous one. Drains via vkc_wait_async_all.

102 */

103VkComputeResult vkc_dispatch_async(VkComputeContext* ctx, VkComputePipeline* pipeline,

104 uint32_t x, uint32_t y, uint32_t z,

105 const void* push_constants);

106VkComputeResult vkc_wait_async_all(VkComputeContext* ctx);

107

108/* Bulk float -> half-precision conversion.

109 *

110 * Used to prepare the embeddings buffer for diversity_full.spv, which

111 * reads embeddings as packed FP16 (two values per uint via the

112 * unpackHalf2x16 builtin) to halve memory bandwidth.

113 *

114 * Caller provides src and dst arrays with `count` elements each. The

115 * conversion is IEEE 754 binary16 with round-to-nearest-even, but

116 * subnormals round to zero (acceptable for cosine-distance ranking).

117 */

118void vkc_fp32_to_fp16(const float* src, uint16_t* dst, uint32_t count);

119

120/* Single FP16 -> FP32 conversion. Used to materialize the per-batch

121 * initial centroids in FP32 from a FP16-encoded embedding seed.

122 */

123float vkc_fp16_to_fp32(uint16_t bits);

124

125/* {{{ Progress rendering (shared by the similarity + diversity stages)

126 *

127 * One in-place progress bar, three behaviours, picked once per run:

128 * - VKC_DEBUG set in the environment (run.sh --debug): verbose mode. Each

129 * update prints a plain, newline-terminated line so a redirected log keeps

130 * the full history of a (possibly frozen) run.

131 * - else stdout is a TTY: animated mode. Updates overwrite one line with a

132 * "\r" Unicode bar -- clean for an interactive watcher.

133 * - else (piped to a file, cron, no debug): quiet. Nothing is drawn, so logs

134 * do not fill with thousands of overwrite characters on one giant line.

135 *

136 * Callers update as often as they like (throttle large loops to ~100 calls);

137 * call vkc_progress_finish() once after the loop to close an animated line.

138 *

139 * vkc_progress_update_ex appends an optional suffix after the percentage

140 * (e.g. "30.3 iter/sec, ETA 15s") -- used by loops that have rate/ETA stats

141 * worth keeping when their scrolling lines collapse into one bar.

142 *

143 * vkc_progress_mode returns 0 = quiet, 1 = animated bar, 2 = verbose, so a

144 * caller can update every step in bar mode but throttle in verbose mode.

145 */

146void vkc_progress_update(const char* label, uint64_t current, uint64_t total);

147void vkc_progress_update_ex(const char* label, uint64_t current, uint64_t total,

148 const char* suffix);

149void vkc_progress_finish(void);

150int vkc_progress_mode(void);

151/* }}} */

152

153#ifdef __cplusplus

154}

155#endif

156

157#endif /* VK_COMPUTE_H */

158